1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
21 #include "common/errno.h"
22 #include "common/config.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
71 #define tracepoint(...)
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
79 #define dout_prefix _prefix(_dout, this)
// prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
// easily skip them
// (omap keys under the pgmeta object holding per-PG metadata blobs)
const string infover_key("_infover");   // on-disk info struct version
const string info_key("_info");         // pg_info_t (fast-changing part)
const string biginfo_key("_biginfo");   // large, rarely-changing info (snaps, past intervals)
const string epoch_key("_epoch");       // last epoch persisted
const string fastinfo_key("_fastinfo"); // delta-encoded fast info
// Debug-log prefix generator used by the dout macros in this file; the
// actual prefix text comes from the object's gen_prefix().
template <class T>
static ostream& _prefix(std::ostream *_dout, T *t)
{
  return *_dout << t->gen_prefix();
}
// Account CephPeeringEvt allocations to the osd mempool so peering-event
// memory shows up in mempool stats.
MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
// Record entry into a peering state-machine state at time `entime`.
// While the PG lock may not be held, the entry is staged in `tmppi` and
// only moved into the history buffer on exit() (where the lock is taken).
void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
{
  // Ignore trimming state machine for now
  if (::strstr(state, "Trimming") != NULL) {
    return;
  } else if (pi != nullptr) {
    // already have a current in-buffer instance; just append the state
    pi->enter_state(entime, state);
  } else {
    // Store current state since we can't reliably take the PG lock here
    if ( tmppi == nullptr) {
      tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
    }

    thispg = pg;
    tmppi->enter_state(entime, state);
  }
}
// Record exit from state `state`.  Takes the PG lock if the caller does
// not already hold it, commits the staged PGStateInstance (if any) into
// the ring buffer, and resets the history on "Reset".
void PGStateHistory::exit(const char* state) {
  // Ignore trimming state machine for now
  // Do nothing if PG is being destroyed!
  if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
    tmppi = nullptr;
    return;
  } else {
    bool ilocked = false;
    if(!thispg->is_locked()) {
      thispg->lock();
      ilocked = true;
    }
    if (pi == nullptr) {
      // first exit since enter(): move the staged instance into the buffer
      buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
      pi = buffer.back().get();
      pi->setepoch(thispg->get_osdmap()->get_epoch());
    }

    pi->exit_state(ceph_clock_now());
    if (::strcmp(state, "Reset") == 0) {
      // "Reset" terminates the whole machine instance
      this->reset();
    }

    if(ilocked) {
      thispg->unlock();
    }
  }
}
142 void PGStateHistory::dump(Formatter
* f
) const {
143 f
->open_array_section("history");
144 for (auto pi
= buffer
.begin(); pi
!= buffer
.end(); ++pi
) {
145 f
->open_object_section("states");
146 f
->dump_stream("epoch") << (*pi
)->this_epoch
;
147 for (auto she
: (*pi
)->state_history
) {
148 f
->dump_string("state", std::get
<2>(she
));
149 f
->dump_stream("enter") << std::get
<0>(she
);
150 f
->dump_stream("exit") << std::get
<1>(she
);
// Take a reference on the PG.  With PG_DEBUG_REFS the per-tag count is
// tracked under _ref_id_lock so leaks can be attributed to a call site.
void PG::get(const char* tag)
{
  ref++;
#ifdef PG_DEBUG_REFS
  Mutex::Locker l(_ref_id_lock);
  _tag_counts[tag]++;
#endif
}
// Drop a reference taken with get(tag); deletes the PG when the count
// reaches zero.  The tag must match a prior get() (asserted under
// PG_DEBUG_REFS).
void PG::put(const char* tag)
{
#ifdef PG_DEBUG_REFS
  {
    Mutex::Locker l(_ref_id_lock);
    auto tag_counts_entry = _tag_counts.find(tag);
    assert(tag_counts_entry != _tag_counts.end());
    --tag_counts_entry->second;
    if (tag_counts_entry->second == 0) {
      _tag_counts.erase(tag_counts_entry);
    }
  }
#endif
  if (--ref == 0)
    delete this;
}
// Debug-refs variant of get(): takes a reference and returns a unique id,
// recording the caller's backtrace in _live_ids so dump_live_ids() can
// attribute outstanding references.
uint64_t PG::get_with_id()
{
  ref++;
  Mutex::Locker l(_ref_id_lock);
  uint64_t id = ++_ref_id;
  // capture the caller's stack so a leaked ref can be traced
  BackTrace bt(0);
  stringstream ss;
  bt.print(ss);
  dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
  assert(!_live_ids.count(id));
  _live_ids.insert(make_pair(id, ss.str()));
  return id;
}
// Release a reference obtained via get_with_id(); `id` must be live.
// Deletes the PG when the refcount hits zero (after releasing the lock).
void PG::put_with_id(uint64_t id)
{
  dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
  {
    Mutex::Locker l(_ref_id_lock);
    assert(_live_ids.count(id));
    _live_ids.erase(id);
  }
  if (--ref == 0)
    delete this;
}
// Debug helper: log every outstanding reference id (with its recorded
// backtrace) and every outstanding tag count for this PG.
void PG::dump_live_ids()
{
  Mutex::Locker l(_ref_id_lock);
  dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
  for (map<uint64_t, string>::iterator i = _live_ids.begin();
       i != _live_ids.end();
       ++i) {
    dout(0) << "\t\tid: " << *i << dendl;
  }
  dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
  for (map<string, uint64_t>::iterator i = _tag_counts.begin();
       i != _tag_counts.end();
       ++i) {
    // NOTE(review): label says "id:" but this prints tag->count pairs;
    // consider "tag:" for clarity (debug-only output).
    dout(0) << "\t\tid: " << *i << dendl;
  }
}
// Refresh cached pool state (info, name, snap context, removed-snap set)
// from the given OSDMap.  Maintains cached_removed_snaps incrementally and
// computes newly_removed_snaps as the delta since the previously cached
// epoch.
void PGPool::update(OSDMapRef map)
{
  const pg_pool_t *pi = map->get_pg_pool(id);
  assert(pi);
  info = *pi;
  auid = pi->auid;
  name = map->get_pool_name(id);
  bool updated = false;
  // Recompute the removed-snap delta if we skipped a map epoch, or this
  // map actually changed the pool's snap state.
  if ((map->get_epoch() != cached_epoch + 1) ||
      (pi->get_snap_epoch() == map->get_epoch())) {
    updated = true;
    pi->build_removed_snaps(newly_removed_snaps);
    interval_set<snapid_t> intersection;
    intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
    if (intersection == cached_removed_snaps) {
      // normal case: removed set only grew; keep just the new additions
      newly_removed_snaps.subtract(cached_removed_snaps);
      cached_removed_snaps.union_of(newly_removed_snaps);
    } else {
      // removed-snap set shrank (should not normally happen) — resync
      lgeneric_subdout(cct, osd, 0) << __func__
        << " cached_removed_snaps shrank from " << cached_removed_snaps
        << " to " << newly_removed_snaps << dendl;
      cached_removed_snaps = newly_removed_snaps;
      newly_removed_snaps.clear();
    }
    snapc = pi->get_snap_context();
  } else {
    /* 1) map->get_epoch() == cached_epoch + 1 &&
     * 2) pi->get_snap_epoch() != map->get_epoch()
     *
     * From the if branch, 1 && 2 must be true.  From 2, we know that
     * this map didn't change the set of removed snaps.  From 1, we
     * know that our cached_removed_snaps matches the previous map.
     * Thus, from 1 && 2, cached_removed snaps matches the current
     * set of removed snaps and all we have to do is clear
     * newly_removed_snaps.
     */
    newly_removed_snaps.clear();
  }
  cached_epoch = map->get_epoch();
  lgeneric_subdout(cct, osd, 20)
    << "PGPool::update cached_removed_snaps "
    << cached_removed_snaps
    << " newly_removed_snaps "
    << newly_removed_snaps
    << " snapc " << snapc
    << (updated ? " (updated)":" (no change)")
    << dendl;
}
278 PG::PG(OSDService
*o
, OSDMapRef curmap
,
279 const PGPool
&_pool
, spg_t p
) :
282 osdriver(osd
->store
, coll_t(), OSD::make_snapmapper_oid()),
287 p
.get_split_bits(curmap
->get_pg_num(_pool
.id
)),
290 osdmap_ref(curmap
), last_persisted_osdmap_ref(curmap
), pool(_pool
),
293 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
296 trace_endpoint("0.0.0.0", 0, "PG"),
297 dirty_info(false), dirty_big_info(false),
302 pgmeta_oid(p
.make_pgmeta_oid()),
305 curmap
->get_pools().at(p
.pgid
.pool()).ec_pool(),
307 stat_queue_item(this),
309 recovery_queued(false),
310 recovery_ops_active(0),
314 pg_whoami(osd
->whoami
, p
.shard
),
316 last_peering_reset(0),
317 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318 backfill_reserved(false),
319 backfill_reserving(false),
320 flushes_in_progress(0),
321 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322 pg_stats_publish_valid(false),
323 osr(osd
->osr_registry
.lookup_or_create(p
, (stringify(p
)))),
324 finish_sync_event(NULL
),
325 backoff_lock("PG::backoff_lock"),
326 scrub_after_recovery(false),
328 recovery_state(this),
330 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
331 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
332 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
336 osd
->add_pgid(p
, this);
339 std::stringstream ss
;
340 ss
<< "PG " << info
.pgid
;
341 trace_endpoint
.copy_name(ss
.str());
// Destructor: flag the state-history tracker so late exit() calls are
// ignored, and (debug-refs builds) deregister from the OSD's live-pgid map.
PG::~PG()
{
  pgstate_history.set_pg_in_destructor();
#ifdef PG_DEBUG_REFS
  osd->remove_pgid(info.pgid, this);
#endif
}
// Take the PG lock without tripping the thread-pool heartbeat timeout:
// the timeout is suspended for the (possibly long) lock wait and re-armed
// once the lock is held.
void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
{
  handle.suspend_tp_timeout();
  lock();
  handle.reset_tp_timeout();
}
// Acquire the PG mutex.  no_lockdep skips lockdep cycle checking for
// callers with known-safe ordering.
void PG::lock(bool no_lockdep) const
{
  _lock.Lock(no_lockdep);
  // if we have unrecorded dirty state with the lock dropped, there is a bug
  assert(!dirty_info);
  assert(!dirty_big_info);

  dout(30) << "lock" << dendl;
}
// Build the dout log prefix for this PG.  Only streams the full PG state
// (*this) when the caller holds the PG lock; otherwise prints just the
// pgid marked "(unlocked)" to avoid racing readers of mutable state.
std::string PG::gen_prefix() const
{
  stringstream out;
  OSDMapRef mapref = osdmap_ref;
  if (_lock.is_locked_by_me()) {
    out << "osd." << osd->whoami
        << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
        << " " << *this << " ";
  } else {
    out << "osd." << osd->whoami
        << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
        << " pg[" << info.pgid << "(unlocked)] ";
  }
  return out.str();
}
387 /********* PG **********/
// Primary-side processing of the authoritative peer's log during peering:
// merge it into our own log (it is taken as authoritative), record the
// peer's info/missing, and advance last_epoch_started /
// last_interval_started from the peer where newer.
void PG::proc_master_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo,
  pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
{
  dout(10) << "proc_master_log for osd." << from << ": "
	   << olog << " " << omissing << dendl;
  assert(!is_peered() && is_primary());

  // merge log into our own log to build master log.  no need to
  // make any adjustments to their missing map; we are taking their
  // log to be authoritative (i.e., their entries are by definitely
  // non-divergent).
  merge_log(t, oinfo, olog, from);
  peer_info[from] = oinfo;
  dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
  might_have_unfound.insert(from);

  // See doc/dev/osd_internals/last_epoch_started
  if (oinfo.last_epoch_started > info.last_epoch_started) {
    info.last_epoch_started = oinfo.last_epoch_started;
    dirty_info = true;
  }
  if (oinfo.last_interval_started > info.last_interval_started) {
    info.last_interval_started = oinfo.last_interval_started;
    dirty_info = true;
  }
  update_history(oinfo.history);
  assert(cct->_conf->osd_find_best_info_ignore_history_les ||
	 info.last_epoch_started >= info.history.last_epoch_started);

  // claim() moves the peer's missing set into our map without a copy
  peer_missing[from].claim(omissing);
}
// Process a (non-authoritative) replica's log: let PGLog trim/adjust the
// replica's info and missing set against our authoritative log, then
// record the peer's info and missing set for recovery planning.
void PG::proc_replica_log(
  pg_info_t &oinfo,
  const pg_log_t &olog,
  pg_missing_t& omissing,
  pg_shard_t from)
{
  dout(10) << "proc_replica_log for osd." << from << ": "
	   << oinfo << " " << olog << " " << omissing << dendl;

  pg_log.proc_replica_log(oinfo, olog, omissing, from);

  peer_info[from] = oinfo;
  dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
  might_have_unfound.insert(from);

  for (map<hobject_t, pg_missing_item>::const_iterator i =
	 omissing.get_items().begin();
       i != omissing.get_items().end();
       ++i) {
    dout(20) << " after missing " << i->first << " need " << i->second.need
	     << " have " << i->second.have << dendl;
  }
  // claim() moves the adjusted missing set into our map without a copy
  peer_missing[from].claim(omissing);
}
// Record a peer's pg_info_t received during peering.
// Returns false (info ignored) if it duplicates what we already have or
// the sender has not been up since send_epoch; true if the info was
// recorded.  Also flags stray peers and refreshes heartbeat peers when a
// previously unknown peer reports in.
bool PG::proc_replica_info(
  pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
{
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
  if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
    dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
    return false;
  }

  if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
    dout(10) << " got info " << oinfo << " from down osd." << from
	     << " discarding" << dendl;
    return false;
  }

  dout(10) << " got osd." << from << " " << oinfo << dendl;
  assert(is_primary());
  peer_info[from] = oinfo;
  might_have_unfound.insert(from);

  update_history(oinfo.history);

  // stray: the peer holds data for this PG but is not in up or acting
  if (!is_up(from) && !is_acting(from)) {
    dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
    stray_set.insert(from);
    if (is_clean()) {
      purge_strays();
    }
  }

  // was this a new info?  if so, update peers!
  if (p == peer_info.end())
    update_heartbeat_peers();

  return true;
}
// Queue removal of an object and clear its snap-mapper entries in the
// same transaction so the SnapMapper never references a deleted object.
void PG::remove_snap_mapped_object(
  ObjectStore::Transaction &t, const hobject_t &soid)
{
  t.remove(
    coll,
    ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
  clear_object_snap_mapping(&t, soid);
}
// Remove any SnapMapper entries for soid (only snapshot clones — snap <
// CEPH_MAXSNAP — are mapped).  -ENOENT is fine (no mapping existed); any
// other error is fatal.
void PG::clear_object_snap_mapping(
  ObjectStore::Transaction *t, const hobject_t &soid)
{
  OSDriver::OSTransaction _t(osdriver.get_transaction(t));
  if (soid.snap < CEPH_MAXSNAP) {
    int r = snap_mapper.remove_oid(
      soid,
      &_t);
    if (!(r == 0 || r == -ENOENT)) {
      derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
      ceph_abort();
    }
  }
}
// Replace the SnapMapper snap set for a clone: drop any existing mapping
// (tolerating -ENOENT) and re-add with the new `snaps`.  Only valid for
// snapshot clones (snap < CEPH_MAXSNAP, asserted).
void PG::update_object_snap_mapping(
  ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
{
  OSDriver::OSTransaction _t(osdriver.get_transaction(t));
  assert(soid.snap < CEPH_MAXSNAP);
  int r = snap_mapper.remove_oid(
    soid,
    &_t);
  if (!(r == 0 || r == -ENOENT)) {
    derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
    ceph_abort();
  }
  snap_mapper.add_oid(
    soid,
    snaps,
    &_t);
}
// Merge a peer's log into ours via PGLog, with a rollback handler so
// divergent entries can be undone in transaction t.  Sets the dirty flags
// when info / big info need re-persisting.
void PG::merge_log(
  ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
{
  PGLogEntryHandler rollbacker{this, &t};
  pg_log.merge_log(
    oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
}
// Rewind our log to `newhead`, rolling back now-divergent entries through
// the handler into transaction t.  Dirty flags are set as needed.
void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
{
  PGLogEntryHandler rollbacker{this, &t};
  pg_log.rewind_divergent_log(
    newhead, info, &rollbacker, dirty_info, dirty_big_info);
}
/*
 * Process information from a replica to determine if it could have any
 * objects that i need.
 *
 * TODO: if the missing set becomes very large, this could get expensive.
 * Instead, we probably want to just iterate over our unfound set.
 */
bool PG::search_for_missing(
  const pg_info_t &oinfo, const pg_missing_t &omissing,
  pg_shard_t from,
  RecoveryCtx *ctx)
{
  uint64_t num_unfound_before = missing_loc.num_unfound();
  bool found_missing = missing_loc.add_source_info(
    from, oinfo, omissing, ctx->handle);
  if (found_missing && num_unfound_before != missing_loc.num_unfound())
    publish_stats_to_osd();
  // On EC pools, echo our info back (with our shard id) so the peer
  // learns we noticed it as a source.
  if (found_missing &&
      (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
       CEPH_FEATURE_OSD_ERASURE_CODES)) {
    pg_info_t tinfo(oinfo);
    tinfo.pgid.shard = pg_whoami.shard;
    (*(ctx->info_map))[from.osd].push_back(
      make_pair(
	pg_notify_t(
	  from.shard, pg_whoami.shard,
	  get_osdmap()->get_epoch(),
	  get_osdmap()->get_epoch(),
	  tinfo),
	PastIntervals()));
  }
  return found_missing;
}
// Can `hoid` be read using only the shards in `acting`?  True if the
// object needs no recovery; false if it is deleted or has no known
// locations.  Otherwise intersect its known locations with `acting` and
// ask the backend's readability predicate.
bool PG::MissingLoc::readable_with_acting(
  const hobject_t &hoid,
  const set<pg_shard_t> &acting) const {
  if (!needs_recovery(hoid))
    return true;
  if (is_deleted(hoid))
    return false;
  auto missing_loc_entry = missing_loc.find(hoid);
  if (missing_loc_entry == missing_loc.end())
    return false;
  const set<pg_shard_t> &locs = missing_loc_entry->second;
  ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
  set<pg_shard_t> have_acting;
  for (set<pg_shard_t>::const_iterator i = locs.begin();
       i != locs.end();
       ++i) {
    if (acting.count(*i))
      have_acting.insert(*i);
  }
  // is_readable is backend-specific (replicated vs EC k-of-n)
  return (*is_readable)(have_acting);
}
// Register `sources` as holding every object in needs_recovery_map
// (except deletes).  Periodically pokes the thread-pool handle so the
// heartbeat timeout doesn't fire on large missing sets.
void PG::MissingLoc::add_batch_sources_info(
  const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
{
  ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
		     << sources.size() << dendl;
  unsigned loop = 0;
  for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
      i != needs_recovery_map.end();
      ++i) {
    if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
      handle->reset_tp_timeout();
      loop = 0;
    }
    if (i->second.is_delete())
      continue;
    missing_loc[i->first].insert(sources.begin(), sources.end());
    missing_loc_sources.insert(sources.begin(), sources.end());
  }
}
// Examine one peer (fromosd, with its info and missing set) and record it
// as a location for any object we need that the peer demonstrably has.
// Returns true if at least one missing object gained a source (or is a
// delete, which needs no source).
bool PG::MissingLoc::add_source_info(
  pg_shard_t fromosd,
  const pg_info_t &oinfo,
  const pg_missing_t &omissing,
  ThreadPool::TPHandle* handle)
{
  bool found_missing = false;
  unsigned loop = 0;
  // found items?
  for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
       p != needs_recovery_map.end();
       ++p) {
    const hobject_t &soid(p->first);
    eversion_t need = p->second.need;
    if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
      handle->reset_tp_timeout();
      loop = 0;
    }
    if (p->second.is_delete()) {
      ldout(pg->cct, 10) << __func__ << " " << soid
			 << " delete, ignoring source" << dendl;
      found_missing = true;
      continue;
    }
    // peer's log doesn't reach the needed version: it can't have it
    if (oinfo.last_update < need) {
      ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
			 << " also missing on osd." << fromosd
			 << " (last_update " << oinfo.last_update
			 << " < needed " << need << ")" << dendl;
      continue;
    }
    // mid-backfill peer with legacy (non-bitwise) sort order: its
    // last_backfill boundary can't be compared meaningfully
    if (!oinfo.last_backfill.is_max() &&
	!oinfo.last_backfill_bitwise) {
      ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
			 << " also missing on osd." << fromosd
			 << " (last_backfill " << oinfo.last_backfill
			 << " but with wrong sort order)"
			 << dendl;
      continue;
    }
    if (p->first >= oinfo.last_backfill) {
      // FIXME: this is _probably_ true, although it could conceivably
      // be in the undefined region!  Hmm!
      ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
			 << " also missing on osd." << fromosd
			 << " (past last_backfill " << oinfo.last_backfill
			 << ")" << dendl;
      continue;
    }
    if (oinfo.last_complete < need) {
      if (omissing.is_missing(soid)) {
	ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
			   << " also missing on osd." << fromosd << dendl;
	continue;
      }
    }

    ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
		       << " is on osd." << fromosd << dendl;

    missing_loc[soid].insert(fromosd);
    missing_loc_sources.insert(fromosd);
    found_missing = true;
  }

  ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
		     << dendl;
  return found_missing;
}
// Queue FULLLOG queries to every up, non-empty peer in might_have_unfound
// that we haven't already asked (via log or missing request), to locate
// sources for unfound objects.  Only meaningful while there are unfound
// objects (asserted).
void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
{
  auto &missing = pg_log.get_missing();
  uint64_t unfound = get_num_unfound();
  assert(unfound > 0);

  dout(10) << __func__ << " "
	   << missing.num_missing() << " missing, "
	   << unfound << " unfound"
	   << dendl;

  std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
  std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; m != mend; ++m) {
    pg_shard_t peer(*m);

    if (!get_osdmap()->is_up(peer.osd)) {
      dout(20) << __func__ << " skipping down osd." << peer << dendl;
      continue;
    }

    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne())) {
      // ignore empty peers
      continue;
    }

    // If we've requested any of this stuff, the pg_missing_t information
    // should be on its way.
    // TODO: coalsce requested_* into a single data structure
    if (peer_missing.find(peer) != peer_missing.end()) {
      dout(20) << __func__ << ": osd." << peer
	       << ": we already have pg_missing_t" << dendl;
      continue;
    }
    if (peer_log_requested.find(peer) != peer_log_requested.end()) {
      dout(20) << __func__ << ": osd." << peer
	       << ": in peer_log_requested" << dendl;
      continue;
    }
    if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
      dout(20) << __func__ << ": osd." << peer
	       << ": in peer_missing_requested" << dendl;
      continue;
    }

    // Request missing
    dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
	     << dendl;
    peer_missing_requested.insert(peer);
    query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
      pg_query_t(
	pg_query_t::FULLLOG,
	peer.shard, pg_whoami.shard,
	info.history, get_osdmap()->get_epoch());
  }
}
/******* PG ***********/
// True if this PG (primary view) still has objects to recover: either the
// primary's own missing set is non-empty, or any acting/backfill peer
// with a known missing set has missing objects.  Peers whose missing set
// we have not received yet are skipped (peering will supply them).
bool PG::needs_recovery() const
{
  assert(is_primary());

  auto &missing = pg_log.get_missing();

  if (missing.num_missing()) {
    dout(10) << __func__ << " primary has " << missing.num_missing()
      << " missing" << dendl;
    return true;
  }

  assert(!actingbackfill.empty());
  set<pg_shard_t>::const_iterator end = actingbackfill.end();
  set<pg_shard_t>::const_iterator a = actingbackfill.begin();
  for (; a != end; ++a) {
    if (*a == get_primary()) continue;
    pg_shard_t peer = *a;
    map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
    if (pm == peer_missing.end()) {
      dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
        << dendl;
      continue;
    }
    if (pm->second.num_missing()) {
      dout(10) << __func__ << " osd." << peer << " has "
        << pm->second.num_missing() << " missing" << dendl;
      return true;
    }
  }

  dout(10) << __func__ << " is recovered" << dendl;
  return false;
}
783 bool PG::needs_backfill() const
785 assert(is_primary());
787 // We can assume that only possible osds that need backfill
788 // are on the backfill_targets vector nodes.
789 set
<pg_shard_t
>::const_iterator end
= backfill_targets
.end();
790 set
<pg_shard_t
>::const_iterator a
= backfill_targets
.begin();
791 for (; a
!= end
; ++a
) {
792 pg_shard_t peer
= *a
;
793 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
794 if (!pi
->second
.last_backfill
.is_max()) {
795 dout(10) << __func__
<< " osd." << peer
<< " has last_backfill " << pi
->second
.last_backfill
<< dendl
;
800 dout(10) << __func__
<< " does not need backfill" << dendl
;
805 void PG::check_past_interval_bounds() const
807 auto rpib
= get_required_past_interval_bounds(
809 osd
->get_superblock().oldest_map
);
810 if (rpib
.first
>= rpib
.second
) {
811 if (!past_intervals
.empty()) {
812 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
813 << " empty [" << rpib
<< ") but past_intervals is not: "
815 derr
<< info
.pgid
<< " required past_interval bounds are"
816 << " empty [" << rpib
<< ") but past_intervals is not: "
817 << past_intervals
<< dendl
;
820 if (past_intervals
.empty()) {
821 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
822 << " not empty [" << rpib
<< ") but past_intervals "
823 << past_intervals
<< " is empty";
824 derr
<< info
.pgid
<< " required past_interval bounds are"
825 << " not empty [" << rpib
<< ") but past_intervals "
826 << past_intervals
<< " is empty" << dendl
;
827 assert(!past_intervals
.empty());
830 auto apib
= past_intervals
.get_bounds();
831 if (apib
.first
> rpib
.first
) {
832 osd
->clog
->error() << info
.pgid
<< " past_intervals [" << apib
833 << ") start interval does not contain the required"
834 << " bound [" << rpib
<< ") start";
835 derr
<< info
.pgid
<< " past_intervals [" << apib
836 << ") start interval does not contain the required"
837 << " bound [" << rpib
<< ") start" << dendl
;
838 assert(0 == "past_interval start interval mismatch");
840 if (apib
.second
!= rpib
.second
) {
841 osd
->clog
->error() << info
.pgid
<< " past_interal bound [" << apib
842 << ") end does not match required [" << rpib
844 derr
<< info
.pgid
<< " past_interal bound [" << apib
845 << ") end does not match required [" << rpib
847 assert(0 == "past_interval end mismatch");
// Clear need_up_thru once the monitor has recorded an up_thru for us at
// or beyond the start of the current interval.  Returns true if the flag
// was cleared.
bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
{
  epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
  if (need_up_thru &&
      up_thru >= info.history.same_interval_since) {
    dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
    need_up_thru = false;
    return true;
  }
  return false;
}
// Drop peering state (info, missing, outstanding requests) for any peer
// that is down in the given map, then refresh heartbeat peers if anything
// changed and re-validate recovery sources.
void PG::remove_down_peer_info(const OSDMapRef osdmap)
{
  // Remove any downed osds from peer_info
  bool removed = false;
  map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
  while (p != peer_info.end()) {
    if (!osdmap->is_up(p->first.osd)) {
      dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
      peer_missing.erase(p->first);
      peer_log_requested.erase(p->first);
      peer_missing_requested.erase(p->first);
      // post-increment keeps the iterator valid across erase
      peer_info.erase(p++);
      removed = true;
    } else
      ++p;
  }

  // if we removed anyone, update peers (which include peer_info)
  if (removed)
    update_heartbeat_peers();
  check_recovery_sources(osdmap);
}
/*
 * Returns true unless there is a non-lost OSD in might_have_unfound.
 */
// i.e. every candidate source for unfound objects has either already been
// queried (we hold its missing set), is empty/nonexistent, no longer
// exists in the map, or has been marked lost since it last came up.
bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
{
  assert(is_primary());

  set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
  set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
  for (; peer != mend; ++peer) {
    if (peer_missing.count(*peer))
      continue;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
    if (iter != peer_info.end() &&
        (iter->second.is_empty() || iter->second.dne()))
      continue;
    if (!osdmap->exists(peer->osd))
      continue;
    const osd_info_t &osd_info(osdmap->get_info(peer->osd));
    if (osd_info.lost_at <= osd_info.up_from) {
      // If there is even one OSD in might_have_unfound that isn't lost, we
      // still might retrieve our unfound.
      return false;
    }
  }
  dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
	   << " have been queried or are marked lost" << dendl;
  return true;
}
// Build the prior set: the OSDs we must probe (or know the fate of)
// before peering can proceed, derived from past_intervals.  Also sets
// PG_STATE_DOWN when the prior set indicates the PG cannot peer, and
// decides whether we must wait for the monitor to record our up_thru.
PastIntervals::PriorSet PG::build_prior()
{
  if (1) {
    // sanity check
    for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
	 it != peer_info.end();
	 ++it) {
      assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
    }
  }

  const OSDMap &osdmap = *get_osdmap();
  PastIntervals::PriorSet prior = past_intervals.get_prior_set(
    pool.info.ec_pool(),
    info.history.last_epoch_started,
    get_pgbackend()->get_is_recoverable_predicate(),
    // classify each historical OSD as UP / DNE / LOST / DOWN for the
    // prior-set computation
    [&](epoch_t start, int osd, epoch_t *lost_at) {
      const osd_info_t *pinfo = 0;
      if (osdmap.exists(osd)) {
	pinfo = &osdmap.get_info(osd);
	if (lost_at)
	  *lost_at = pinfo->lost_at;
      }

      if (osdmap.is_up(osd)) {
	return PastIntervals::UP;
      } else if (!pinfo) {
	return PastIntervals::DNE;
      } else if (pinfo->lost_at > start) {
	// administratively marked lost after this interval started
	return PastIntervals::LOST;
      } else {
	return PastIntervals::DOWN;
      }
    },
    up,
    acting,
    this);

  if (prior.pg_down) {
    state_set(PG_STATE_DOWN);
  }

  if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
    dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
	     << " < same_since " << info.history.same_interval_since
	     << ", must notify monitor" << dendl;
    need_up_thru = true;
  } else {
    dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
	     << " >= same_since " << info.history.same_interval_since
	     << ", all is well" << dendl;
    need_up_thru = false;
  }
  set_probe_targets(prior.probe);
  return prior;
}
// Discard all primary-only peering/recovery state.  Called when we stop
// being primary (or restart peering) so stale peer data cannot leak into
// the next interval.
void PG::clear_primary_state()
{
  dout(10) << "clear_primary_state" << dendl;

  // clear peering state
  stray_set.clear();
  peer_log_requested.clear();
  peer_missing_requested.clear();
  peer_info.clear();
  peer_missing.clear();
  need_up_thru = false;
  peer_last_complete_ondisk.clear();
  peer_activated.clear();
  min_last_complete_ondisk = eversion_t();
  pg_trim_to = eversion_t();
  might_have_unfound.clear();
  projected_log = PGLog::IndexedLog();

  last_update_ondisk = eversion_t();

  snap_trimq.clear();

  finish_sync_event = 0;  // so that _finish_recovery doesn't go off in another thread

  missing_loc.clear();

  release_pg_backoffs();

  pg_log.reset_recovery_pointers();

  scrubber.reserved_peers.clear();
  scrub_after_recovery = false;

  agent_clear();
}
1010 PG::Scrubber::Scrubber()
1011 : reserved(false), reserve_failed(false),
1014 shallow_errors(0), deep_errors(0), fixed(0),
1015 must_scrub(false), must_deep_scrub(false), must_repair(false),
1017 num_digest_updates_pending(0),
// Scrubber members are all automatic; nothing to release explicitly.
PG::Scrubber::~Scrubber() {}
/**
 * find_best_info
 *
 * Returns an iterator to the best info in infos sorted by:
 *  1) Prefer newer last_update
 *  2) Prefer longer tail if it brings another info into contiguity
 *  3) Prefer current primary
 */
map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
  const map<pg_shard_t, pg_info_t> &infos,
  bool restrict_to_up_acting,
  bool *history_les_bound) const
{
  assert(history_les_bound);
  /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
   * to make changes to this process.  Also, make sure to update it
   * when you find bugs! */
  eversion_t min_last_update_acceptable = eversion_t::max();
  epoch_t max_last_epoch_started_found = 0;
  // Pass 1: find the highest last_epoch_started (from history unless
  // explicitly ignored by config, and from actual les on complete infos).
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (!cct->_conf->osd_find_best_info_ignore_history_les &&
	max_last_epoch_started_found < i->second.history.last_epoch_started) {
      *history_les_bound = true;
      max_last_epoch_started_found = i->second.history.last_epoch_started;
    }
    if (!i->second.is_incomplete() &&
	max_last_epoch_started_found < i->second.last_epoch_started) {
      max_last_epoch_started_found = i->second.last_epoch_started;
    }
  }
  // Pass 2: min acceptable last_update among infos at that les
  for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
       i != infos.end();
       ++i) {
    if (max_last_epoch_started_found <= i->second.last_epoch_started) {
      if (min_last_update_acceptable > i->second.last_update)
	min_last_update_acceptable = i->second.last_update;
    }
  }
  if (min_last_update_acceptable == eversion_t::max())
    return infos.end();

  map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
  // find osd with newest last_update (oldest for ec_pool).
  // if there are multiples, prefer
  //  - a longer tail, if it brings another peer into log contiguity
  //  - the current primary
  for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
       p != infos.end();
       ++p) {
    if (restrict_to_up_acting && !is_up(p->first) &&
	!is_acting(p->first))
      continue;
    // Only consider peers with last_update >= min_last_update_acceptable
    if (p->second.last_update < min_last_update_acceptable)
      continue;
    // Disqualify anyone with a too old last_epoch_started
    if (p->second.last_epoch_started < max_last_epoch_started_found)
      continue;
    // Disqualify anyone who is incomplete (not fully backfilled)
    if (p->second.is_incomplete())
      continue;
    if (best == infos.end()) {
      best = p;
      continue;
    }
    // Prefer newer last_update
    if (pool.info.require_rollback()) {
      // EC pools prefer the OLDEST last_update (divergent entries are
      // rolled back, not forward)
      if (p->second.last_update > best->second.last_update)
	continue;
      if (p->second.last_update < best->second.last_update) {
	best = p;
	continue;
      }
    } else {
      if (p->second.last_update < best->second.last_update)
	continue;
      if (p->second.last_update > best->second.last_update) {
	best = p;
	continue;
      }
    }

    // Prefer longer tail
    if (p->second.log_tail > best->second.log_tail) {
      continue;
    } else if (p->second.log_tail < best->second.log_tail) {
      best = p;
      continue;
    }

    // prefer current primary (usually the caller), all things being equal
    if (p->first == pg_whoami) {
      dout(10) << "calc_acting prefer osd." << p->first
	       << " because it is current primary" << dendl;
      best = p;
      continue;
    }
  }
  return best;
}
// Choose the desired acting set for an EC pool.  Each shard position is
// filled independently: prefer up[i] if complete and log-contiguous with
// the authoritative shard, else mark up[i] for backfill and try acting[i],
// else (unless restricted to up/acting) any stray holding that shard.
// Outputs the want vector, backfill/acting_backfill sets, and the first
// filled position as want_primary; decision trace goes to ss.
void PG::calc_ec_acting(
  map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
  unsigned size,
  const vector<int> &acting,
  pg_shard_t acting_primary,
  const vector<int> &up,
  pg_shard_t up_primary,
  const map<pg_shard_t, pg_info_t> &all_info,
  bool restrict_to_up_acting,
  vector<int> *_want,
  set<pg_shard_t> *backfill,
  set<pg_shard_t> *acting_backfill,
  pg_shard_t *want_primary,
  ostream &ss)
{
  vector<int> want(size, CRUSH_ITEM_NONE);
  map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
  unsigned usable = 0;
  for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
       i != all_info.end();
       ++i) {
    all_info_by_shard[i->first.shard].insert(i->first);
  }
  for (uint8_t i = 0; i < want.size(); ++i) {
    ss << "For position " << (unsigned)i << ": ";
    // usable iff complete and its log overlaps the authoritative tail
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
	!all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
	all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
	auth_log_shard->second.log_tail) {
      ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
      want[i] = up[i];
      ++usable;
      continue;
    }
    if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
      // up shard exists but can't serve the log: it must be backfilled
      ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
	 << " and ";
      backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
    }

    if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
	!all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
	all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
	auth_log_shard->second.log_tail) {
      ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
      want[i] = acting[i];
      ++usable;
    } else if (!restrict_to_up_acting) {
      // fall back to any stray replica holding this shard
      for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
	   j != all_info_by_shard[shard_id_t(i)].end();
	   ++j) {
	assert(j->shard == i);
	if (!all_info.find(*j)->second.is_incomplete() &&
	    all_info.find(*j)->second.last_update >=
	    auth_log_shard->second.log_tail) {
	  ss << " selecting stray: " << *j << std::endl;
	  want[i] = j->osd;
	  ++usable;
	  break;
	}
      }
      if (want[i] == CRUSH_ITEM_NONE)
	ss << " failed to fill position " << (int)i << std::endl;
    }
  }

  // first filled position becomes the want_primary
  bool found_primary = false;
  for (uint8_t i = 0; i < want.size(); ++i) {
    if (want[i] != CRUSH_ITEM_NONE) {
      acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
      if (!found_primary) {
	*want_primary = pg_shard_t(want[i], shard_id_t(i));
	found_primary = true;
      }
    }
  }
  acting_backfill->insert(backfill->begin(), backfill->end());
  _want->swap(want);
}
1208 * calculate the desired acting set.
1210 * Choose an appropriate acting set. Prefer up[0], unless it is
1211 * incomplete, or another osd has a longer tail that allows us to
1212 * bring other up nodes up to date.
1214 void PG::calc_replicated_acting(
1215 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1217 const vector
<int> &acting
,
1218 pg_shard_t acting_primary
,
1219 const vector
<int> &up
,
1220 pg_shard_t up_primary
,
1221 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1222 bool restrict_to_up_acting
,
1224 set
<pg_shard_t
> *backfill
,
1225 set
<pg_shard_t
> *acting_backfill
,
1226 pg_shard_t
*want_primary
,
1229 ss
<< "calc_acting newest update on osd." << auth_log_shard
->first
1230 << " with " << auth_log_shard
->second
1231 << (restrict_to_up_acting
? " restrict_to_up_acting" : "") << std::endl
;
1232 pg_shard_t auth_log_shard_id
= auth_log_shard
->first
;
1235 map
<pg_shard_t
,pg_info_t
>::const_iterator primary
;
1237 !all_info
.find(up_primary
)->second
.is_incomplete() &&
1238 all_info
.find(up_primary
)->second
.last_update
>=
1239 auth_log_shard
->second
.log_tail
) {
1240 ss
<< "up_primary: " << up_primary
<< ") selected as primary" << std::endl
;
1241 primary
= all_info
.find(up_primary
); // prefer up[0], all thing being equal
1243 assert(!auth_log_shard
->second
.is_incomplete());
1244 ss
<< "up[0] needs backfill, osd." << auth_log_shard_id
1245 << " selected as primary instead" << std::endl
;
1246 primary
= auth_log_shard
;
1249 ss
<< "calc_acting primary is osd." << primary
->first
1250 << " with " << primary
->second
<< std::endl
;
1251 *want_primary
= primary
->first
;
1252 want
->push_back(primary
->first
.osd
);
1253 acting_backfill
->insert(primary
->first
);
1254 unsigned usable
= 1;
1256 // select replicas that have log contiguity with primary.
1257 // prefer up, then acting, then any peer_info osds
1258 for (vector
<int>::const_iterator i
= up
.begin();
1261 pg_shard_t up_cand
= pg_shard_t(*i
, shard_id_t::NO_SHARD
);
1262 if (up_cand
== primary
->first
)
1264 const pg_info_t
&cur_info
= all_info
.find(up_cand
)->second
;
1265 if (cur_info
.is_incomplete() ||
1266 cur_info
.last_update
< MIN(
1267 primary
->second
.log_tail
,
1268 auth_log_shard
->second
.log_tail
)) {
1269 /* We include auth_log_shard->second.log_tail because in GetLog,
1270 * we will request logs back to the min last_update over our
1271 * acting_backfill set, which will result in our log being extended
1272 * as far backwards as necessary to pick up any peers which can
1273 * be log recovered by auth_log_shard's log */
1274 ss
<< " shard " << up_cand
<< " (up) backfill " << cur_info
<< std::endl
;
1275 backfill
->insert(up_cand
);
1276 acting_backfill
->insert(up_cand
);
1278 want
->push_back(*i
);
1279 acting_backfill
->insert(up_cand
);
1281 ss
<< " osd." << *i
<< " (up) accepted " << cur_info
<< std::endl
;
1285 // This no longer has backfill OSDs, but they are covered above.
1286 for (vector
<int>::const_iterator i
= acting
.begin();
1289 pg_shard_t
acting_cand(*i
, shard_id_t::NO_SHARD
);
1293 // skip up osds we already considered above
1294 if (acting_cand
== primary
->first
)
1296 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), acting_cand
.osd
);
1297 if (up_it
!= up
.end())
1300 const pg_info_t
&cur_info
= all_info
.find(acting_cand
)->second
;
1301 if (cur_info
.is_incomplete() ||
1302 cur_info
.last_update
< primary
->second
.log_tail
) {
1303 ss
<< " shard " << acting_cand
<< " (stray) REJECTED "
1304 << cur_info
<< std::endl
;
1306 want
->push_back(*i
);
1307 acting_backfill
->insert(acting_cand
);
1308 ss
<< " shard " << acting_cand
<< " (stray) accepted "
1309 << cur_info
<< std::endl
;
1314 if (restrict_to_up_acting
) {
1317 for (map
<pg_shard_t
,pg_info_t
>::const_iterator i
= all_info
.begin();
1318 i
!= all_info
.end();
1323 // skip up osds we already considered above
1324 if (i
->first
== primary
->first
)
1326 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
->first
.osd
);
1327 if (up_it
!= up
.end())
1329 vector
<int>::const_iterator acting_it
= find(
1330 acting
.begin(), acting
.end(), i
->first
.osd
);
1331 if (acting_it
!= acting
.end())
1334 if (i
->second
.is_incomplete() ||
1335 i
->second
.last_update
< primary
->second
.log_tail
) {
1336 ss
<< " shard " << i
->first
<< " (stray) REJECTED "
1337 << i
->second
<< std::endl
;
1339 want
->push_back(i
->first
.osd
);
1340 acting_backfill
->insert(i
->first
);
1341 ss
<< " shard " << i
->first
<< " (stray) accepted "
1342 << i
->second
<< std::endl
;
1351 * calculate the desired acting, and request a change with the monitor
1352 * if it differs from the current acting.
1354 * if restrict_to_up_acting=true, we filter out anything that's not in
1355 * up/acting. in order to lift this restriction, we need to
1356 * 1) check whether it's worth switching the acting set any time we get
1357 * a new pg info (not just here, when recovery finishes)
1358 * 2) check whether anything in want_acting went down on each new map
1359 * (and, if so, calculate a new want_acting)
1360 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1363 bool PG::choose_acting(pg_shard_t
&auth_log_shard_id
,
1364 bool restrict_to_up_acting
,
1365 bool *history_les_bound
)
1367 map
<pg_shard_t
, pg_info_t
> all_info(peer_info
.begin(), peer_info
.end());
1368 all_info
[pg_whoami
] = info
;
1370 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= all_info
.begin();
1371 p
!= all_info
.end();
1373 dout(10) << "calc_acting osd." << p
->first
<< " " << p
->second
<< dendl
;
1376 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
=
1377 find_best_info(all_info
, restrict_to_up_acting
, history_les_bound
);
1379 if (auth_log_shard
== all_info
.end()) {
1381 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1382 << " reverting to up" << dendl
;
1385 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1387 dout(10) << "choose_acting failed" << dendl
;
1388 assert(want_acting
.empty());
1393 assert(!auth_log_shard
->second
.is_incomplete());
1394 auth_log_shard_id
= auth_log_shard
->first
;
1396 set
<pg_shard_t
> want_backfill
, want_acting_backfill
;
1398 pg_shard_t want_primary
;
1400 if (!pool
.info
.ec_pool())
1401 calc_replicated_acting(
1403 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1409 restrict_to_up_acting
,
1412 &want_acting_backfill
,
1418 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1424 restrict_to_up_acting
,
1427 &want_acting_backfill
,
1430 dout(10) << ss
.str() << dendl
;
1432 unsigned num_want_acting
= 0;
1433 set
<pg_shard_t
> have
;
1434 for (int i
= 0; i
< (int)want
.size(); ++i
) {
1435 if (want
[i
] != CRUSH_ITEM_NONE
) {
1440 pool
.info
.ec_pool() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1444 // We go incomplete if below min_size for ec_pools since backfill
1445 // does not currently maintain rollbackability
1446 // Otherwise, we will go "peered", but not "active"
1447 if (num_want_acting
< pool
.info
.min_size
&&
1448 (pool
.info
.ec_pool() ||
1449 !cct
->_conf
->osd_allow_recovery_below_min_size
)) {
1450 want_acting
.clear();
1451 dout(10) << "choose_acting failed, below min size" << dendl
;
1455 /* Check whether we have enough acting shards to later perform recovery */
1456 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable_predicate(
1457 get_pgbackend()->get_is_recoverable_predicate());
1458 if (!(*recoverable_predicate
)(have
)) {
1459 want_acting
.clear();
1460 dout(10) << "choose_acting failed, not recoverable" << dendl
;
1464 if (want
!= acting
) {
1465 dout(10) << "choose_acting want " << want
<< " != acting " << acting
1466 << ", requesting pg_temp change" << dendl
;
1469 if (want_acting
== up
) {
1470 // There can't be any pending backfill if
1471 // want is the same as crush map up OSDs.
1472 assert(want_backfill
.empty());
1474 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1476 osd
->queue_want_pg_temp(info
.pgid
.pgid
, want
);
1479 want_acting
.clear();
1480 actingbackfill
= want_acting_backfill
;
1481 dout(10) << "actingbackfill is " << actingbackfill
<< dendl
;
1482 assert(backfill_targets
.empty() || backfill_targets
== want_backfill
);
1483 if (backfill_targets
.empty()) {
1484 // Caller is GetInfo
1485 backfill_targets
= want_backfill
;
1487 // Will not change if already set because up would have had to change
1488 // Verify that nothing in backfill is in stray_set
1489 for (set
<pg_shard_t
>::iterator i
= want_backfill
.begin();
1490 i
!= want_backfill
.end();
1492 assert(stray_set
.find(*i
) == stray_set
.end());
1494 dout(10) << "choose_acting want " << want
<< " (== acting) backfill_targets "
1495 << want_backfill
<< dendl
;
1499 /* Build the might_have_unfound set.
1501 * This is used by the primary OSD during recovery.
1503 * This set tracks the OSDs which might have unfound objects that the primary
1504 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1505 * will remove the OSD from the set.
1507 void PG::build_might_have_unfound()
1509 assert(might_have_unfound
.empty());
1510 assert(is_primary());
1512 dout(10) << __func__
<< dendl
;
1514 check_past_interval_bounds();
1516 might_have_unfound
= past_intervals
.get_might_have_unfound(
1518 pool
.info
.ec_pool());
1520 // include any (stray) peers
1521 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
1522 p
!= peer_info
.end();
1524 might_have_unfound
.insert(p
->first
);
1526 dout(15) << __func__
<< ": built " << might_have_unfound
<< dendl
;
1529 struct C_PG_ActivateCommitted
: public Context
{
1532 epoch_t activation_epoch
;
1533 C_PG_ActivateCommitted(PG
*p
, epoch_t e
, epoch_t ae
)
1534 : pg(p
), epoch(e
), activation_epoch(ae
) {}
1535 void finish(int r
) override
{
1536 pg
->_activate_committed(epoch
, activation_epoch
);
1540 void PG::activate(ObjectStore::Transaction
& t
,
1541 epoch_t activation_epoch
,
1542 list
<Context
*>& tfin
,
1543 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1547 PastIntervals
> > > *activator_map
,
1550 assert(!is_peered());
1551 assert(scrubber
.callbacks
.empty());
1552 assert(callbacks_for_degraded_object
.empty());
1555 state_clear(PG_STATE_DOWN
);
1557 send_notify
= false;
1560 // only update primary last_epoch_started if we will go active
1561 if (acting
.size() >= pool
.info
.min_size
) {
1562 assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
1563 info
.last_epoch_started
<= activation_epoch
);
1564 info
.last_epoch_started
= activation_epoch
;
1565 info
.last_interval_started
= info
.history
.same_interval_since
;
1567 } else if (is_acting(pg_whoami
)) {
1568 /* update last_epoch_started on acting replica to whatever the primary sent
1569 * unless it's smaller (could happen if we are going peered rather than
1570 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1571 if (info
.last_epoch_started
< activation_epoch
) {
1572 info
.last_epoch_started
= activation_epoch
;
1573 info
.last_interval_started
= info
.history
.same_interval_since
;
1577 auto &missing
= pg_log
.get_missing();
1580 last_update_ondisk
= info
.last_update
;
1581 min_last_complete_ondisk
= eversion_t(0,0); // we don't know (yet)!
1583 last_update_applied
= info
.last_update
;
1584 last_rollback_info_trimmed_to_applied
= pg_log
.get_can_rollback_to();
1586 need_up_thru
= false;
1588 // write pg info, log
1590 dirty_big_info
= true; // maybe
1592 // find out when we commit
1593 t
.register_on_complete(
1594 new C_PG_ActivateCommitted(
1596 get_osdmap()->get_epoch(),
1599 // initialize snap_trimq
1601 dout(20) << "activate - purged_snaps " << info
.purged_snaps
1602 << " cached_removed_snaps " << pool
.cached_removed_snaps
<< dendl
;
1603 snap_trimq
= pool
.cached_removed_snaps
;
1604 interval_set
<snapid_t
> intersection
;
1605 intersection
.intersection_of(snap_trimq
, info
.purged_snaps
);
1606 if (intersection
== info
.purged_snaps
) {
1607 snap_trimq
.subtract(info
.purged_snaps
);
1609 dout(0) << "warning: info.purged_snaps (" << info
.purged_snaps
1610 << ") is not a subset of pool.cached_removed_snaps ("
1611 << pool
.cached_removed_snaps
<< ")" << dendl
;
1612 snap_trimq
.subtract(intersection
);
1616 // init complete pointer
1617 if (missing
.num_missing() == 0) {
1618 dout(10) << "activate - no missing, moving last_complete " << info
.last_complete
1619 << " -> " << info
.last_update
<< dendl
;
1620 info
.last_complete
= info
.last_update
;
1621 pg_log
.reset_recovery_pointers();
1623 dout(10) << "activate - not complete, " << missing
<< dendl
;
1624 pg_log
.activate_not_complete(info
);
1632 // start up replicas
1634 assert(!actingbackfill
.empty());
1635 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1636 i
!= actingbackfill
.end();
1638 if (*i
== pg_whoami
) continue;
1639 pg_shard_t peer
= *i
;
1640 assert(peer_info
.count(peer
));
1641 pg_info_t
& pi
= peer_info
[peer
];
1643 dout(10) << "activate peer osd." << peer
<< " " << pi
<< dendl
;
1646 assert(peer_missing
.count(peer
));
1647 pg_missing_t
& pm
= peer_missing
[peer
];
1649 bool needs_past_intervals
= pi
.dne();
1652 * cover case where peer sort order was different and
1653 * last_backfill cannot be interpreted
1655 bool force_restart_backfill
=
1656 !pi
.last_backfill
.is_max() &&
1657 !pi
.last_backfill_bitwise
;
1659 if (pi
.last_update
== info
.last_update
&& !force_restart_backfill
) {
1661 if (!pi
.last_backfill
.is_max())
1662 osd
->clog
->info() << info
.pgid
<< " continuing backfill to osd."
1664 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1665 << "] " << pi
.last_backfill
1666 << " to " << info
.last_update
;
1667 if (!pi
.is_empty() && activator_map
) {
1668 dout(10) << "activate peer osd." << peer
<< " is up to date, queueing in pending_activators" << dendl
;
1669 (*activator_map
)[peer
.osd
].push_back(
1672 peer
.shard
, pg_whoami
.shard
,
1673 get_osdmap()->get_epoch(),
1674 get_osdmap()->get_epoch(),
1678 dout(10) << "activate peer osd." << peer
<< " is up to date, but sending pg_log anyway" << dendl
;
1680 i
->shard
, pg_whoami
.shard
,
1681 get_osdmap()->get_epoch(), info
);
1684 pg_log
.get_tail() > pi
.last_update
||
1685 pi
.last_backfill
== hobject_t() ||
1686 force_restart_backfill
||
1687 (backfill_targets
.count(*i
) && pi
.last_backfill
.is_max())) {
1688 /* ^ This last case covers a situation where a replica is not contiguous
1689 * with the auth_log, but is contiguous with this replica. Reshuffling
1690 * the active set to handle this would be tricky, so instead we just go
1691 * ahead and backfill it anyway. This is probably preferrable in any
1692 * case since the replica in question would have to be significantly
1696 osd
->clog
->debug() << info
.pgid
<< " starting backfill to osd." << peer
1697 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1698 << "] " << pi
.last_backfill
1699 << " to " << info
.last_update
;
1701 pi
.last_update
= info
.last_update
;
1702 pi
.last_complete
= info
.last_update
;
1703 pi
.set_last_backfill(hobject_t());
1704 pi
.last_epoch_started
= info
.last_epoch_started
;
1705 pi
.last_interval_started
= info
.last_interval_started
;
1706 pi
.history
= info
.history
;
1707 pi
.hit_set
= info
.hit_set
;
1708 pi
.stats
.stats
.clear();
1710 // initialize peer with our purged_snaps.
1711 pi
.purged_snaps
= info
.purged_snaps
;
1714 i
->shard
, pg_whoami
.shard
,
1715 get_osdmap()->get_epoch(), pi
);
1717 // send some recent log, so that op dup detection works well.
1718 m
->log
.copy_up_to(pg_log
.get_log(), cct
->_conf
->osd_min_pg_log_entries
);
1719 m
->info
.log_tail
= m
->log
.tail
;
1720 pi
.log_tail
= m
->log
.tail
; // sigh...
1725 assert(pg_log
.get_tail() <= pi
.last_update
);
1727 i
->shard
, pg_whoami
.shard
,
1728 get_osdmap()->get_epoch(), info
);
1729 // send new stuff to append to replicas log
1730 m
->log
.copy_after(pg_log
.get_log(), pi
.last_update
);
1733 // share past_intervals if we are creating the pg on the replica
1734 // based on whether our info for that peer was dne() *before*
1735 // updating pi.history in the backfill block above.
1736 if (m
&& needs_past_intervals
)
1737 m
->past_intervals
= past_intervals
;
1739 // update local version of peer's missing list!
1740 if (m
&& pi
.last_backfill
!= hobject_t()) {
1741 for (list
<pg_log_entry_t
>::iterator p
= m
->log
.log
.begin();
1742 p
!= m
->log
.log
.end();
1744 if (p
->soid
<= pi
.last_backfill
&&
1746 if (perform_deletes_during_peering() && p
->is_delete()) {
1747 pm
.rm(p
->soid
, p
->version
);
1749 pm
.add_next_event(*p
);
1756 dout(10) << "activate peer osd." << peer
<< " sending " << m
->log
<< dendl
;
1757 //m->log.print(cout);
1758 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
1762 pi
.last_update
= info
.last_update
;
1764 // update our missing
1765 if (pm
.num_missing() == 0) {
1766 pi
.last_complete
= pi
.last_update
;
1767 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " uptodate" << dendl
;
1769 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " missing " << pm
<< dendl
;
1773 // Set up missing_loc
1774 set
<pg_shard_t
> complete_shards
;
1775 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1776 i
!= actingbackfill
.end();
1778 dout(20) << __func__
<< " setting up missing_loc from shard " << *i
<< " " << dendl
;
1779 if (*i
== get_primary()) {
1780 missing_loc
.add_active_missing(missing
);
1781 if (!missing
.have_missing())
1782 complete_shards
.insert(*i
);
1784 auto peer_missing_entry
= peer_missing
.find(*i
);
1785 assert(peer_missing_entry
!= peer_missing
.end());
1786 missing_loc
.add_active_missing(peer_missing_entry
->second
);
1787 if (!peer_missing_entry
->second
.have_missing() &&
1788 peer_info
[*i
].last_backfill
.is_max())
1789 complete_shards
.insert(*i
);
1792 // If necessary, create might_have_unfound to help us find our unfound objects.
1793 // NOTE: It's important that we build might_have_unfound before trimming the
1795 might_have_unfound
.clear();
1796 if (needs_recovery()) {
1797 // If only one shard has missing, we do a trick to add all others as recovery
1798 // source, this is considered safe since the PGLogs have been merged locally,
1799 // and covers vast majority of the use cases, like one OSD/host is down for
1800 // a while for hardware repairing
1801 if (complete_shards
.size() + 1 == actingbackfill
.size()) {
1802 missing_loc
.add_batch_sources_info(complete_shards
, ctx
->handle
);
1804 missing_loc
.add_source_info(pg_whoami
, info
, pg_log
.get_missing(),
1806 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1807 i
!= actingbackfill
.end();
1809 if (*i
== pg_whoami
) continue;
1810 dout(10) << __func__
<< ": adding " << *i
<< " as a source" << dendl
;
1811 assert(peer_missing
.count(*i
));
1812 assert(peer_info
.count(*i
));
1813 missing_loc
.add_source_info(
1820 for (map
<pg_shard_t
, pg_missing_t
>::iterator i
= peer_missing
.begin();
1821 i
!= peer_missing
.end();
1823 if (is_actingbackfill(i
->first
))
1825 assert(peer_info
.count(i
->first
));
1827 peer_info
[i
->first
],
1833 build_might_have_unfound();
1836 discover_all_missing(query_map
);
1839 // num_objects_degraded if calculated should reflect this too, unless no
1840 // missing and we are about to go clean.
1841 if (get_osdmap()->get_pg_size(info
.pgid
.pgid
) > actingset
.size()) {
1842 state_set(PG_STATE_UNDERSIZED
);
1845 state_set(PG_STATE_ACTIVATING
);
1846 release_pg_backoffs();
1847 projected_last_update
= info
.last_update
;
1849 if (acting
.size() >= pool
.info
.min_size
) {
1850 PGLogEntryHandler handler
{this, &t
};
1851 pg_log
.roll_forward(&handler
);
1855 bool PG::op_has_sufficient_caps(OpRequestRef
& op
)
1857 // only check MOSDOp
1858 if (op
->get_req()->get_type() != CEPH_MSG_OSD_OP
)
1861 const MOSDOp
*req
= static_cast<const MOSDOp
*>(op
->get_req());
1863 Session
*session
= static_cast<Session
*>(req
->get_connection()->get_priv());
1865 dout(0) << "op_has_sufficient_caps: no session for op " << *req
<< dendl
;
1868 OSDCap
& caps
= session
->caps
;
1871 const string
&key
= req
->get_hobj().get_key().empty() ?
1872 req
->get_oid().name
:
1873 req
->get_hobj().get_key();
1875 bool cap
= caps
.is_capable(pool
.name
, req
->get_hobj().nspace
,
1877 op
->need_read_cap(),
1878 op
->need_write_cap(),
1881 dout(20) << "op_has_sufficient_caps "
1882 << "session=" << session
1883 << " pool=" << pool
.id
<< " (" << pool
.name
1884 << " " << req
->get_hobj().nspace
1885 << ") owner=" << pool
.auid
1886 << " need_read_cap=" << op
->need_read_cap()
1887 << " need_write_cap=" << op
->need_write_cap()
1888 << " classes=" << op
->classes()
1889 << " -> " << (cap
? "yes" : "NO")
1894 void PG::_activate_committed(epoch_t epoch
, epoch_t activation_epoch
)
1897 if (pg_has_reset_since(epoch
)) {
1898 dout(10) << "_activate_committed " << epoch
1899 << ", that was an old interval" << dendl
;
1900 } else if (is_primary()) {
1901 peer_activated
.insert(pg_whoami
);
1902 dout(10) << "_activate_committed " << epoch
1903 << " peer_activated now " << peer_activated
1904 << " last_interval_started " << info
.history
.last_interval_started
1905 << " last_epoch_started " << info
.history
.last_epoch_started
1906 << " same_interval_since " << info
.history
.same_interval_since
<< dendl
;
1907 assert(!actingbackfill
.empty());
1908 if (peer_activated
.size() == actingbackfill
.size())
1909 all_activated_and_committed();
1911 dout(10) << "_activate_committed " << epoch
<< " telling primary" << dendl
;
1912 MOSDPGInfo
*m
= new MOSDPGInfo(epoch
);
1913 pg_notify_t i
= pg_notify_t(
1914 get_primary().shard
, pg_whoami
.shard
,
1915 get_osdmap()->get_epoch(),
1916 get_osdmap()->get_epoch(),
1919 i
.info
.history
.last_epoch_started
= activation_epoch
;
1920 i
.info
.history
.last_interval_started
= i
.info
.history
.same_interval_since
;
1921 if (acting
.size() >= pool
.info
.min_size
) {
1922 state_set(PG_STATE_ACTIVE
);
1924 state_set(PG_STATE_PEERED
);
1927 m
->pg_list
.push_back(make_pair(i
, PastIntervals()));
1928 osd
->send_message_osd_cluster(get_primary().osd
, m
, get_osdmap()->get_epoch());
1931 if (flushes_in_progress
== 0) {
1932 requeue_ops(waiting_for_peered
);
1933 } else if (!waiting_for_peered
.empty()) {
1934 dout(10) << __func__
<< " flushes in progress, moving "
1935 << waiting_for_peered
.size() << " items to waiting_for_flush"
1937 assert(waiting_for_flush
.empty());
1938 waiting_for_flush
.swap(waiting_for_peered
);
1942 assert(!dirty_info
);
1948 * update info.history.last_epoch_started ONLY after we and all
1949 * replicas have activated AND committed the activate transaction
1950 * (i.e. the peering results are stable on disk).
1952 void PG::all_activated_and_committed()
1954 dout(10) << "all_activated_and_committed" << dendl
;
1955 assert(is_primary());
1956 assert(peer_activated
.size() == actingbackfill
.size());
1957 assert(!actingbackfill
.empty());
1958 assert(blocked_by
.empty());
1961 _update_calc_stats();
1962 if (info
.stats
.stats
.sum
.num_objects_degraded
) {
1963 state_set(PG_STATE_DEGRADED
);
1965 state_clear(PG_STATE_DEGRADED
);
1968 queue_peering_event(
1970 std::make_shared
<CephPeeringEvt
>(
1971 get_osdmap()->get_epoch(),
1972 get_osdmap()->get_epoch(),
1973 AllReplicasActivated())));
1976 bool PG::requeue_scrub(bool high_priority
)
1978 assert(is_locked());
1980 dout(10) << __func__
<< ": already queued" << dendl
;
1983 dout(10) << __func__
<< ": queueing" << dendl
;
1984 scrub_queued
= true;
1985 osd
->queue_for_scrub(this, high_priority
);
1990 void PG::queue_recovery()
1992 if (!is_primary() || !is_peered()) {
1993 dout(10) << "queue_recovery -- not primary or not peered " << dendl
;
1994 assert(!recovery_queued
);
1995 } else if (recovery_queued
) {
1996 dout(10) << "queue_recovery -- already queued" << dendl
;
1998 dout(10) << "queue_recovery -- queuing" << dendl
;
1999 recovery_queued
= true;
2000 osd
->queue_for_recovery(this);
2004 bool PG::queue_scrub()
2006 assert(is_locked());
2007 if (is_scrubbing()) {
2010 scrubber
.priority
= scrubber
.must_scrub
?
2011 cct
->_conf
->osd_requested_scrub_priority
: get_scrub_priority();
2012 scrubber
.must_scrub
= false;
2013 state_set(PG_STATE_SCRUBBING
);
2014 if (scrubber
.must_deep_scrub
) {
2015 state_set(PG_STATE_DEEP_SCRUB
);
2016 scrubber
.must_deep_scrub
= false;
2018 if (scrubber
.must_repair
|| scrubber
.auto_repair
) {
2019 state_set(PG_STATE_REPAIR
);
2020 scrubber
.must_repair
= false;
2026 unsigned PG::get_scrub_priority()
2028 // a higher value -> a higher priority
2029 int pool_scrub_priority
= 0;
2030 pool
.info
.opts
.get(pool_opts_t::SCRUB_PRIORITY
, &pool_scrub_priority
);
2031 return pool_scrub_priority
> 0 ? pool_scrub_priority
: cct
->_conf
->osd_scrub_priority
;
2034 struct C_PG_FinishRecovery
: public Context
{
2036 explicit C_PG_FinishRecovery(PG
*p
) : pg(p
) {}
2037 void finish(int r
) override
{
2038 pg
->_finish_recovery(this);
2042 void PG::mark_clean()
2044 if (actingset
.size() == get_osdmap()->get_pg_size(info
.pgid
.pgid
)) {
2045 state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
2046 state_set(PG_STATE_CLEAN
);
2047 info
.history
.last_epoch_clean
= get_osdmap()->get_epoch();
2048 info
.history
.last_interval_clean
= info
.history
.same_interval_since
;
2049 past_intervals
.clear();
2050 dirty_big_info
= true;
2057 void PG::_change_recovery_force_mode(int new_mode
, bool clear
)
2060 // we can't and shouldn't do anything if the PG is being deleted locally
2062 state_clear(new_mode
);
2064 state_set(new_mode
);
2066 publish_stats_to_osd();
2070 inline int PG::clamp_recovery_priority(int priority
)
2072 static_assert(OSD_RECOVERY_PRIORITY_MIN
< OSD_RECOVERY_PRIORITY_MAX
, "Invalid priority range");
2073 static_assert(OSD_RECOVERY_PRIORITY_MIN
>= 0, "Priority range must match unsigned type");
2075 // Clamp to valid range
2076 if (priority
> OSD_RECOVERY_PRIORITY_MAX
) {
2077 return OSD_RECOVERY_PRIORITY_MAX
;
2078 } else if (priority
< OSD_RECOVERY_PRIORITY_MIN
) {
2079 return OSD_RECOVERY_PRIORITY_MIN
;
2085 unsigned PG::get_recovery_priority()
2087 // a higher value -> a higher priority
2090 if (state
& PG_STATE_FORCED_RECOVERY
) {
2091 ret
= OSD_RECOVERY_PRIORITY_FORCED
;
2093 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &ret
);
2094 ret
= clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE
+ ret
);
2096 dout(20) << __func__
<< " recovery priority for " << *this << " is " << ret
<< ", state is " << state
<< dendl
;
2097 return static_cast<unsigned>(ret
);
2100 unsigned PG::get_backfill_priority()
2102 // a higher value -> a higher priority
2103 int ret
= OSD_BACKFILL_PRIORITY_BASE
;
2104 if (state
& PG_STATE_FORCED_BACKFILL
) {
2105 ret
= OSD_RECOVERY_PRIORITY_FORCED
;
2107 if (acting
.size() < pool
.info
.min_size
) {
2108 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2109 ret
= OSD_BACKFILL_INACTIVE_PRIORITY_BASE
+ (pool
.info
.min_size
- acting
.size());
2111 } else if (is_undersized()) {
2112 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2113 assert(pool
.info
.size
> actingset
.size());
2114 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
+ (pool
.info
.size
- actingset
.size());
2116 } else if (is_degraded()) {
2117 // degraded: baseline degraded
2118 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
2121 // Adjust with pool's recovery priority
2122 int pool_recovery_priority
= 0;
2123 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2125 ret
= clamp_recovery_priority(pool_recovery_priority
+ ret
);
2128 return static_cast<unsigned>(ret
);
2131 void PG::finish_recovery(list
<Context
*>& tfin
)
2133 dout(10) << "finish_recovery" << dendl
;
2134 assert(info
.last_complete
== info
.last_update
);
2136 clear_recovery_state();
2139 * sync all this before purging strays. but don't block!
2141 finish_sync_event
= new C_PG_FinishRecovery(this);
2142 tfin
.push_back(finish_sync_event
);
2145 void PG::_finish_recovery(Context
*c
)
2152 if (c
== finish_sync_event
) {
2153 dout(10) << "_finish_recovery" << dendl
;
2154 finish_sync_event
= 0;
2157 publish_stats_to_osd();
2159 if (scrub_after_recovery
) {
2160 dout(10) << "_finish_recovery requeueing for scrub" << dendl
;
2161 scrub_after_recovery
= false;
2162 scrubber
.must_deep_scrub
= true;
2166 dout(10) << "_finish_recovery -- stale" << dendl
;
2171 void PG::start_recovery_op(const hobject_t
& soid
)
2173 dout(10) << "start_recovery_op " << soid
2174 #ifdef DEBUG_RECOVERY_OIDS
2175 << " (" << recovering_oids
<< ")"
2178 assert(recovery_ops_active
>= 0);
2179 recovery_ops_active
++;
2180 #ifdef DEBUG_RECOVERY_OIDS
2181 assert(recovering_oids
.count(soid
) == 0);
2182 recovering_oids
.insert(soid
);
2184 osd
->start_recovery_op(this, soid
);
2187 void PG::finish_recovery_op(const hobject_t
& soid
, bool dequeue
)
2189 dout(10) << "finish_recovery_op " << soid
2190 #ifdef DEBUG_RECOVERY_OIDS
2191 << " (" << recovering_oids
<< ")"
2194 assert(recovery_ops_active
> 0);
2195 recovery_ops_active
--;
2196 #ifdef DEBUG_RECOVERY_OIDS
2197 assert(recovering_oids
.count(soid
));
2198 recovering_oids
.erase(soid
);
2200 osd
->finish_recovery_op(this, soid
, dequeue
);
// Split state from this (parent) PG into a newly created child PG.
// Copies/derives log, info, stats, snap state and mappings for the child.
// NOTE(review): this extracted fragment is missing some original lines
// (braces, a few statements); only the surviving text is reproduced here.
void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
  // child hashes objects with the new (post-split) bit count
  child->update_snap_mapper_bits(split_bits);
  child->update_osdmap_ref(get_osdmap());
  // divide the log entries between parent and child by hash
  pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
  child->info.last_complete = info.last_complete;
  info.last_update = pg_log.get_head();
  child->info.last_update = child->pg_log.get_head();
  child->info.last_user_version = info.last_user_version;
  info.log_tail = pg_log.get_tail();
  child->info.log_tail = child->pg_log.get_tail();
  // last_complete can never be older than the log tail
  if (info.last_complete < pg_log.get_tail())
    info.last_complete = pg_log.get_tail();
  if (child->info.last_complete < child->pg_log.get_tail())
    child->info.last_complete = child->pg_log.get_tail();
  child->info.history = info.history;
  child->info.history.epoch_created = get_osdmap()->get_epoch();
  child->info.purged_snaps = info.purged_snaps;
  if (info.last_backfill.is_max()) {
    child->info.set_last_backfill(hobject_t::get_max());
    // restart backfill on parent and child to be safe.  we could
    // probably do better in the bitwise sort case, but it's more
    // fragile (there may be special work to do on backfill completion
    info.set_last_backfill(hobject_t());
    child->info.set_last_backfill(hobject_t());
    // restarting backfill implies that the missing set is empty,
    // since it is only used for objects prior to last_backfill
    pg_log.reset_backfill();
    child->pg_log.reset_backfill();
  // stats: child inherits a copy; both copies are now unreliable
  child->info.stats = info.stats;
  child->info.stats.parent_split_bits = split_bits;
  info.stats.stats_invalid = true;
  child->info.stats.stats_invalid = true;
  child->info.last_epoch_started = info.last_epoch_started;
  child->info.last_interval_started = info.last_interval_started;
  child->snap_trimq = snap_trimq;
  // There can't be recovery/backfill going on now
  int primary, up_primary;
  vector<int> newup, newacting;
  get_osdmap()->pg_to_up_acting_osds(
    child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
  child->init_primary_up_acting(
  child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
  // this comparison includes primary rank via pg_shard_t
  if (get_primary() != child->get_primary())
    child->info.history.same_primary_since = get_osdmap()->get_epoch();
  child->info.stats.up = up;
  child->info.stats.up_primary = up_primary;
  child->info.stats.acting = acting;
  child->info.stats.acting_primary = primary;
  child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
  child->past_intervals = past_intervals;
  // subclass-specific split hook
  _split_into(child_pgid, child, split_bits);
  // release all backoffs for simplicity
  release_backoffs(hobject_t(), hobject_t::get_max());
  child->on_new_interval();
  // both PGs must be persisted after the split
  child->dirty_info = true;
  child->dirty_big_info = true;
  dirty_big_info = true;
// Register a client backoff for [begin,end) on session s and (per the
// CEPH_OSD_BACKOFF_OP_BLOCK fragment below) notify the client to block.
// NOTE(review): this extracted fragment is missing some original lines
// (early return, message construction); only the surviving text remains.
void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
  ConnectionRef con = s->con;
  if (!con)   // OSD::ms_handle_reset clears s->con without a lock
  BackoffRef b(s->have_backoff(info.pgid, begin));
    // duplicate registration for the same (pgid, begin) is unexpected
    derr << __func__ << " already have backoff for " << s << " begin " << begin
         << " " << *b << dendl;
  Mutex::Locker l(backoff_lock);
  // new backoff gets the next per-session sequence number
  b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
  backoffs[begin].insert(b);
  dout(10) << __func__ << " session " << s << " added " << *b << dendl;
    get_osdmap()->get_epoch(),
    CEPH_OSD_BACKOFF_OP_BLOCK,
// Release all backoffs fully contained in [begin,end): collect them under
// backoff_lock, then (outside the map walk) mark each DELETING, notify the
// client with CEPH_OSD_BACKOFF_OP_UNBLOCK, and detach it from its session.
// NOTE(review): fragment is missing some original lines (loop increments,
// collection into bv, message send); only the surviving text remains.
void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
  dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
  vector<BackoffRef> bv;
    Mutex::Locker l(backoff_lock);
    auto p = backoffs.lower_bound(begin);
    while (p != backoffs.end()) {
      int r = cmp(p->first, end);
      dout(20) << __func__ << " ? " << r << " " << p->first
               << " " << p->second << dendl;
      // note: must still examine begin=end=p->first case
      if (r > 0 || (r == 0 && begin < end)) {
	dout(20) << __func__ << " checking " << p->first
		 << " " << p->second << dendl;
	auto q = p->second.begin();
	while (q != p->second.end()) {
	  dout(20) << __func__ << " checking  " << *q << dendl;
	  int r = cmp((*q)->begin, begin);
	  // only drop backoffs fully inside the released range
	  if (r == 0 || (r > 0 && (*q)->end < end)) {
	    q = p->second.erase(q);
	if (p->second.empty()) {
	  p = backoffs.erase(p);
    // per-backoff lock taken outside backoff_lock map walk
    Mutex::Locker l(b->lock);
    dout(10) << __func__ << " " << *b << dendl;
      assert(b->pg == this);
      ConnectionRef con = b->session->con;
      if (con) {   // OSD::ms_handle_reset clears s->con without a lock
	    get_osdmap()->get_epoch(),
	    CEPH_OSD_BACKOFF_OP_UNBLOCK,
      b->state = Backoff::STATE_DELETING;
      b->session->rm_backoff(b);
// Drop every backoff this PG holds: snapshot the map under backoff_lock
// into ls, then mark each backoff DELETING and detach it from its session.
// NOTE(review): fragment is missing the line that moves `backoffs` into
// `ls` (and some braces); only the surviving text is reproduced.
void PG::clear_backoffs()
  dout(10) << __func__ << " " << dendl;
  map<hobject_t,set<BackoffRef>> ls;
    Mutex::Locker l(backoff_lock);
  for (auto& p : ls) {
    for (auto& b : p.second) {
      Mutex::Locker l(b->lock);
      dout(10) << __func__ << " " << *b << dendl;
	assert(b->pg == this);
	b->state = Backoff::STATE_DELETING;
	b->session->rm_backoff(b);
// called by Session::clear_backoffs()
// Remove a single backoff from this PG's index.  Caller must already hold
// b->lock (asserted); we take backoff_lock to mutate the map.
// NOTE(review): fragment is missing some original lines (the erase calls);
// only the surviving text is reproduced.
void PG::rm_backoff(BackoffRef b)
  dout(10) << __func__ << " " << *b << dendl;
  Mutex::Locker l(backoff_lock);
  assert(b->lock.is_locked_by_me());
  assert(b->pg == this);
  auto p = backoffs.find(b->begin);
  // may race with release_backoffs()
  if (p != backoffs.end()) {
    auto q = p->second.find(b);
    if (q != p->second.end()) {
      if (p->second.empty()) {
// Abort any in-flight recovery bookkeeping: reset log recovery pointers,
// drain recovery_ops_active via finish_recovery_op(..., dequeue=true),
// clear backfill tracking, then run the implementation-specific hook.
// NOTE(review): fragment is missing some original lines (soid declaration,
// #endif); only the surviving text is reproduced.
void PG::clear_recovery_state()
  dout(10) << "clear_recovery_state" << dendl;
  pg_log.reset_recovery_pointers();
  finish_sync_event = 0;
  while (recovery_ops_active > 0) {
#ifdef DEBUG_RECOVERY_OIDS
    soid = *recovering_oids.begin();
    finish_recovery_op(soid, true);
  backfill_targets.clear();
  backfill_info.clear();
  peer_backfill_info.clear();
  waiting_on_backfill.clear();
  _clear_recovery_state();  // pg impl specific hook
// Cancel recovery for this PG; just logs and delegates to
// clear_recovery_state().
// NOTE(review): fragment is missing the surrounding braces.
void PG::cancel_recovery()
  dout(10) << "cancel_recovery" << dendl;
  clear_recovery_state();
// Tell every stray (non-acting/backfill) OSD in stray_set to remove its
// copy of this PG, then forget the stray peers locally and, if anything
// was removed, refresh heartbeat peers.
// NOTE(review): fragment is missing some original lines (loop increment,
// removed-flag updates, stray_set.clear()); surviving text only.
void PG::purge_strays()
  dout(10) << "purge_strays " << stray_set << dendl;
  bool removed = false;
  for (set<pg_shard_t>::iterator p = stray_set.begin();
       p != stray_set.end();
    assert(!is_actingbackfill(*p));
    if (get_osdmap()->is_up(p->osd)) {
      dout(10) << "sending PGRemove to osd." << *p << dendl;
      vector<spg_t> to_remove;
      to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
      MOSDPGRemove *m = new MOSDPGRemove(
	get_osdmap()->get_epoch(),
      osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
      // down OSDs will be purged when they come back up
      dout(10) << "not sending PGRemove to down osd." << *p << dendl;
    peer_missing.erase(*p);
    peer_info.erase(*p);
    peer_purged.insert(*p);
  // if we removed anyone, update peers (which include peer_info)
    update_heartbeat_peers();
  // clear _requested maps; we may have to peer() again if we discover
  // (more) stray content
  peer_log_requested.clear();
  peer_missing_requested.clear();
// Replace probe_targets with the OSD ids from probe_set, under
// heartbeat_peer_lock.
// NOTE(review): fragment is missing the loop increment and braces.
void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
  Mutex::Locker l(heartbeat_peer_lock);
  probe_targets.clear();
  for (set<pg_shard_t>::iterator i = probe_set.begin();
       i != probe_set.end();
    probe_targets.insert(i->osd);
// Empty probe_targets under heartbeat_peer_lock.
void PG::clear_probe_targets()
  Mutex::Locker l(heartbeat_peer_lock);
  probe_targets.clear();
// Rebuild the heartbeat peer set from acting, up and peer_info; swap it in
// under heartbeat_peer_lock only when it changed, and notify the OSD
// (outside the lock) so it can refresh its heartbeat connections.
// NOTE(review): fragment is missing some original lines (new_peers
// declaration, need_update assignments, loop increments); surviving text only.
void PG::update_heartbeat_peers()
  assert(is_locked());
  for (unsigned i=0; i<acting.size(); i++) {
    if (acting[i] != CRUSH_ITEM_NONE)
      new_peers.insert(acting[i]);
  for (unsigned i=0; i<up.size(); i++) {
    if (up[i] != CRUSH_ITEM_NONE)
      new_peers.insert(up[i]);
  // also heartbeat any peer we have info for
  for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
       p != peer_info.end();
    new_peers.insert(p->first.osd);
  bool need_update = false;
  heartbeat_peer_lock.Lock();
  if (new_peers == heartbeat_peers) {
    dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
    dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
    heartbeat_peers.swap(new_peers);
  heartbeat_peer_lock.Unlock();
    osd->need_heartbeat_peer_update();
// Look up request r first in the projected (in-flight) log, then in the
// persisted pg log; fills version/user_version/return_code on a hit.
// NOTE(review): fragment is missing the `return (` line and braces.
bool PG::check_in_progress_op(
  const osd_reqid_t &r,
  eversion_t *version,
  version_t *user_version,
  int *return_code) const
    projected_log.get_request(r, version, user_version, return_code) ||
    pg_log.get_log().get_request(r, version, user_version, return_code));
// Recompute derived fields of info.stats (versions, scrub stamps, log
// sizes, copies) and — when the PG is not clean/fully mapped — the
// degraded/misplaced/unfound object counts from per-peer missing info.
// NOTE(review): this extracted fragment is missing some original lines
// (braces, `continue`s, `int64_t missing` declaration, a `break`);
// only the surviving text is reproduced and re-joined here.
void PG::_update_calc_stats()
  info.stats.version = info.last_update;
  info.stats.created = info.history.epoch_created;
  info.stats.last_scrub = info.history.last_scrub;
  info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
  info.stats.last_deep_scrub = info.history.last_deep_scrub;
  info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
  info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
  info.stats.last_epoch_clean = info.history.last_epoch_clean;
  info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
  info.stats.ondisk_log_size = info.stats.log_size;
  info.stats.log_start = pg_log.get_tail();
  info.stats.ondisk_log_start = pg_log.get_tail();
  info.stats.snaptrimq_len = snap_trimq.size();
  unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
  // In rare case that upset is too large (usually transient), use as target
  // for calculations below.
  unsigned target = std::max(num_shards, (unsigned)upset.size());
  // Not sure this could ever happen, that actingset > upset
  // which only matters if actingset > num_shards.
  unsigned nrep = std::max(actingset.size(), upset.size());
  // calc num_object_copies
  info.stats.stats.calc_copies(MAX(target, nrep));
  info.stats.stats.sum.num_objects_degraded = 0;
  info.stats.stats.sum.num_objects_unfound = 0;
  info.stats.stats.sum.num_objects_misplaced = 0;
  if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
    dout(20) << __func__ << " actingset " << actingset << " upset "
	     << upset << " actingbackfill " << actingbackfill << dendl;
    dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
    assert(!actingbackfill.empty());
    // NOTE: we only generate degraded, misplaced and unfound
    // values for the summation, not individual stat categories.
    int64_t num_objects = info.stats.stats.sum.num_objects;
    // Objects missing from up nodes, sorted by # objects.
    boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
    // Objects missing from nodes not in up, sort by # objects
    boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
    // primary's own missing count comes from its pg log
    missing = pg_log.get_missing().num_missing();
    assert(actingbackfill.count(pg_whoami));
    if (upset.count(pg_whoami)) {
      missing_target_objects.insert(make_pair(missing, pg_whoami));
      acting_source_objects.insert(make_pair(missing, pg_whoami));
    info.stats.stats.sum.num_objects_missing_on_primary = missing;
    for (auto& peer : peer_info) {
      // Ignore other peers until we add code to look at detailed missing
      // information. (recovery)
      if (!actingbackfill.count(peer.first)) {
      // Backfill targets always track num_objects accurately
      // all other peers track missing accurately.
      if (is_backfill_targets(peer.first)) {
	missing = std::max((int64_t)0, num_objects - peer.second.stats.stats.sum.num_objects);
	if (peer_missing.count(peer.first)) {
	  missing = peer_missing[peer.first].num_missing();
	  dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
      if (upset.count(peer.first)) {
	missing_target_objects.insert(make_pair(missing, peer.first));
	acting_source_objects.insert(make_pair(missing, peer.first));
      peer.second.stats.stats.sum.num_objects_missing = missing;
    if (pool.info.is_replicated()) {
      // Add to missing_target_objects up to target elements (num_objects missing)
      assert(target >= missing_target_objects.size());
      unsigned needed = target - missing_target_objects.size();
      for (; needed; --needed)
	missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD)));
      // Erasure coded: one placeholder per absent shard id
      for (unsigned i = 0 ; i < num_shards; ++i) {
	shard_id_t shard(i);
	for (const auto& t : missing_target_objects) {
	  if (std::get<1>(t).shard == shard) {
	  missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
    for (const auto& item : missing_target_objects)
      dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
    for (const auto& item : acting_source_objects)
      dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
    // A misplaced object is not stored on the correct OSD
    int64_t misplaced = 0;
    // a degraded objects has fewer replicas or EC shards than the pool specifies.
    int64_t degraded = 0;
    // pair targets (largest missing first) with acting sources
    for (auto m = missing_target_objects.rbegin();
         m != missing_target_objects.rend(); ++m) {
      int64_t extra_missing = -1;
      if (pool.info.is_replicated()) {
	if (!acting_source_objects.empty()) {
	  auto extra_copy = acting_source_objects.begin();
	  extra_missing = std::get<0>(*extra_copy);
          acting_source_objects.erase(extra_copy);
      } else {	// Erasure coded
	// Use corresponding shard
	for (const auto& a : acting_source_objects) {
	  if (std::get<1>(a).shard == std::get<1>(*m).shard) {
	    extra_missing = std::get<0>(a);
	    acting_source_objects.erase(a);
      if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
	// We don't know which of the objects on the target
	// are part of extra_missing so assume are all degraded.
	misplaced += std::get<0>(*m) - extra_missing;
	degraded += extra_missing;
	// 1. extra_missing == -1, more targets than sources so degraded
	// 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
	//    previously degraded are now present on the target.
	degraded += std::get<0>(*m);
    // If there are still acting that haven't been accounted for
    // then they are misplaced
    for (const auto& a : acting_source_objects) {
      int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
      dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
      misplaced += extra_misplaced;
    dout(20) << __func__ << " degraded " << degraded << dendl;
    dout(20) << __func__ << " misplaced " << misplaced << dendl;
    info.stats.stats.sum.num_objects_degraded = degraded;
    info.stats.stats.sum.num_objects_unfound = get_num_unfound();
    info.stats.stats.sum.num_objects_misplaced = misplaced;
// Publish up to osd_max_pg_blocked_by blocking-peer ids into
// info.stats.blocked_by, sampling a random subset when there are more.
// NOTE(review): fragment is missing some original lines (pos declaration,
// loop increment, continue); only the surviving text is reproduced.
void PG::_update_blocked_by()
  // set a max on the number of blocking peers we report.  if we go
  // over, report a random subset.  keep the result sorted.
  unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
  unsigned skip = blocked_by.size() - keep;
  info.stats.blocked_by.clear();
  info.stats.blocked_by.resize(keep);
  for (set<int>::iterator p = blocked_by.begin();
       p != blocked_by.end() && keep > 0;
    // reservoir-style sampling: randomly drop `skip` of the entries
    if (skip > 0 && (rand() % (skip + keep) < skip)) {
      info.stats.blocked_by[pos++] = *p;
// Refresh info.stats (state flags, timestamps, derived counts) and, when
// the stats changed or are stale past the report interval, snapshot them
// into pg_stats_publish and enqueue this PG for stat reporting.
// NOTE(review): this extracted fragment is missing some original lines
// (is_primary guard, `publish =` assignment, braces); surviving text only.
void PG::publish_stats_to_osd()
  pg_stats_publish_lock.Lock();
  if (info.stats.stats.sum.num_scrub_errors)
    state_set(PG_STATE_INCONSISTENT);
    state_clear(PG_STATE_INCONSISTENT);
  utime_t now = ceph_clock_now();
  if (info.stats.state != state) {
    info.stats.last_change = now;
    // Optimistic estimation, if we just find out an inactive PG,
    // assume it is active till now.
    if (!(state & PG_STATE_ACTIVE) &&
	(info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_active = now;
    if ((state & PG_STATE_ACTIVE) &&
	!(info.stats.state & PG_STATE_ACTIVE))
      info.stats.last_became_active = now;
    if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
	!(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
      info.stats.last_became_peered = now;
    if (!(state & PG_STATE_CREATING) &&
	(info.stats.state & PG_STATE_CREATING)) {
      osd->send_pg_created(get_pgid().pgid);
    info.stats.state = state;
  _update_calc_stats();
  if (info.stats.stats.sum.num_objects_degraded) {
    state_set(PG_STATE_DEGRADED);
    state_clear(PG_STATE_DEGRADED);
  _update_blocked_by();
  bool publish = false;
  pg_stat_t pre_publish = info.stats;
  pre_publish.stats.add(unstable_stats);
  utime_t cutoff = now;
  cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
  // skip publication when nothing changed and the last report is fresh
  if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
      info.stats.last_fresh > cutoff) {
    dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
	     << ": no change since " << info.stats.last_fresh << dendl;
    // update our stat summary and timestamps
    info.stats.reported_epoch = get_osdmap()->get_epoch();
    ++info.stats.reported_seq;
    info.stats.last_fresh = now;
    if (info.stats.state & PG_STATE_CLEAN)
      info.stats.last_clean = now;
    if (info.stats.state & PG_STATE_ACTIVE)
      info.stats.last_active = now;
    if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
      info.stats.last_peered = now;
    info.stats.last_unstale = now;
    if ((info.stats.state & PG_STATE_DEGRADED) == 0)
      info.stats.last_undegraded = now;
    if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
      info.stats.last_fullsized = now;
    // do not send pgstat to mon anymore once we are luminous, since mgr takes
    // care of this by sending MMonMgrReport to mon.
      osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
    pg_stats_publish_valid = true;
    pg_stats_publish = pre_publish;
    dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
	     << ":" << pg_stats_publish.reported_seq << dendl;
  pg_stats_publish_lock.Unlock();
    osd->pg_stat_queue_enqueue(this);
// Invalidate the published stat snapshot and dequeue this PG from stat
// reporting.
void PG::clear_publish_stats()
  dout(15) << "clear_stats" << dendl;
  pg_stats_publish_lock.Lock();
  pg_stats_publish_valid = false;
  pg_stats_publish_lock.Unlock();
  osd->pg_stat_queue_dequeue(this);
2850 * initialize a newly instantiated pg
2852 * Initialize PG state, as when a PG is initially created, or when it
2853 * is first instantiated on the current node.
2855 * @param role our role/rank
2856 * @param newup up set
2857 * @param newacting acting set
2858 * @param history pg history
2859 * @param pi past_intervals
2860 * @param backfill true if info should be marked as backfill
2861 * @param t transaction to write out our new state in
// Body of PG::init (see the doxygen block above).  NOTE(review): the
// extraction dropped the first signature line (`void PG::init(int role,`
// — inferred from the `role` use below; confirm against upstream) and
// several other lines (backfill flag test, dirty_info, write_if_dirty).
	      const vector<int>& newup, int new_up_primary,
	      const vector<int>& newacting, int new_acting_primary,
	      const pg_history_t& history,
	      const PastIntervals& pi,
	      ObjectStore::Transaction *t)
  dout(10) << "init role " << role << " up " << newup << " acting " << newacting
	   << " history " << history
	   << " past_intervals " << pi
  init_primary_up_acting(
    new_acting_primary);
  info.history = history;
  past_intervals = pi;
  info.stats.up_primary = new_up_primary;
  info.stats.acting = acting;
  info.stats.acting_primary = new_acting_primary;
  info.stats.mapping_epoch = info.history.same_interval_since;
    // mark the PG as needing backfill from scratch
    dout(10) << __func__ << ": Setting backfill" << dendl;
    info.set_last_backfill(hobject_t());
    info.last_complete = info.last_update;
    pg_log.mark_log_for_rewrite();
  dirty_big_info = true;
2909 #pragma GCC diagnostic ignored "-Wpragmas"
2910 #pragma GCC diagnostic push
2911 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
// Upgrade on-disk PG metadata from struct_v 7..10 to the current format:
// rewrite the log into the pgmeta object, clear stale past_intervals,
// bump infover_key, then persist and flush the transaction.
// NOTE(review): fragment is missing some original lines (assert(r == 0)
// after apply_transaction, waiter declaration, braces); surviving text only.
void PG::upgrade(ObjectStore *store)
  assert(info_struct_v <= 10);
  ObjectStore::Transaction t;
  // we can only upgrade from per-pg-meta formats (v7+)
  assert(info_struct_v >= 7);
  // 7 -> 8: move log/biginfo out of the meta collection into pgmeta omap
  if (info_struct_v <= 7) {
    pg_log.mark_log_for_rewrite();
    ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
    ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
    t.remove(coll_t::meta(), log_oid);
    t.remove(coll_t::meta(), biginfo_oid);
    t.touch(coll, pgmeta_oid);
  // 8 -> 9
  if (info_struct_v <= 8) {
    // no special action needed.
  // 9 -> 10
  if (info_struct_v <= 9) {
    // previous versions weren't (as) aggressively clearing past_intervals
    if (info.history.last_epoch_clean >= info.history.same_interval_since) {
      dout(20) << __func__ << " clearing past_intervals" << dendl;
      past_intervals.clear();
  // update infover_key
  if (info_struct_v < cur_struct_v) {
    map<string,bufferlist> v;
    __u8 ver = cur_struct_v;
    ::encode(ver, v[infover_key]);
    t.omap_setkeys(coll, pgmeta_oid, v);
  dirty_big_info = true;
  ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
				      ObjectStore::Sequencer>("upgrade"));
  int r = store->apply_transaction(osr.get(), std::move(t));
    derr << __func__ << ": apply_transaction returned "
	 << cpp_strerror(r) << dendl;
  // ensure the transaction is durable before proceeding
  if (!osr->flush_commit(&waiter)) {
2972 #pragma GCC diagnostic pop
2973 #pragma GCC diagnostic warning "-Wpragmas"
// Encode PG info into the key/value map *km for persistence.  When only
// "fast" fields changed since last_written_info, writes the compact
// fastinfo_key; otherwise writes the full info_key (purged_snaps held out)
// and, if dirty_big_info, the biginfo_key (past_intervals + purged_snaps).
// NOTE(review): fragment is missing some original lines (epoch/try_fast_info
// parameters, returns, braces); only the surviving text is reproduced.
int PG::_prepare_write_info(CephContext* cct,
			    map<string,bufferlist> *km,
			    pg_info_t &info, pg_info_t &last_written_info,
			    PastIntervals &past_intervals,
			    bool dirty_big_info,
			    PerfCounters *logger)
    ::encode(epoch, (*km)[epoch_key]);
    logger->inc(l_osd_pg_info);
  // try to do info efficiently?
  if (!dirty_big_info && try_fast_info &&
      info.last_update > last_written_info.last_update) {
    pg_fast_info_t fast;
    fast.populate_from(info);
    bool did = fast.try_apply_to(&last_written_info);
    assert(did);  // we verified last_update increased above
    if (info == last_written_info) {
      ::encode(fast, (*km)[fastinfo_key]);
	logger->inc(l_osd_pg_fastinfo);
    // debug aid: dump both infos when the fast path disagrees
    generic_dout(30) << __func__ << " fastinfo failed, info:\n";
      JSONFormatter jf(true);
      jf.dump_object("info", info);
    *_dout << "\nlast_written_info:\n";
      JSONFormatter jf(true);
      jf.dump_object("last_written_info", last_written_info);
  last_written_info = info;
  // info.  store purged_snaps separately.
  interval_set<snapid_t> purged_snaps;
  purged_snaps.swap(info.purged_snaps);
  ::encode(info, (*km)[info_key]);
  purged_snaps.swap(info.purged_snaps);
  if (dirty_big_info) {
    // potentially big stuff
    bufferlist& bigbl = (*km)[biginfo_key];
    ::encode(past_intervals, bigbl);
    ::encode(info.purged_snaps, bigbl);
    //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
      logger->inc(l_osd_pg_biginfo);
// Queue creation of this PG's object collection (with the split bit count)
// in transaction t.
void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
  t.create_collection(coll, bits);
// Post-creation initialization of the PG collection: give the object store
// an expected-objects hint and create the pgmeta object with the current
// struct version in its omap.
// NOTE(review): fragment is missing some original lines (the `hint`
// bufferlist declaration, the expected_num_objects guard); surviving text only.
void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
    // Give a hint to the PG collection
    uint32_t pg_num = pool->get_pg_num();
    // expected objects per PG = pool-wide expectation / pg_num
    uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
    ::encode(pg_num, hint);
    ::encode(expected_num_objects_pg, hint);
    uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
    t.collection_hint(coll, hint_type, hint);
  ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
  t.touch(coll, pgmeta_oid);
  map<string,bufferlist> values;
  __u8 struct_v = cur_struct_v;
  ::encode(struct_v, values[infover_key]);
  t.omap_setkeys(coll, pgmeta_oid, values);
// Fold unstable_stats into info and delegate serialization to
// _prepare_write_info; tracks last persisted epoch/osdmap and clears the
// dirty flags afterwards.
// NOTE(review): fragment is missing some original lines (several
// _prepare_write_info arguments, assert(ret == 0), dirty_info = false);
// only the surviving text is reproduced.
void PG::prepare_write_info(map<string,bufferlist> *km)
  info.stats.stats.add(unstable_stats);
  unstable_stats.clear();
  bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
  int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
				dirty_big_info, need_update_epoch,
				cct->_conf->osd_fast_info,
  if (need_update_epoch)
    last_epoch = get_osdmap()->get_epoch();
  last_persisted_osdmap_ref = osdmap_ref;
  dirty_big_info = false;
3091 #pragma GCC diagnostic ignored "-Wpragmas"
3092 #pragma GCC diagnostic push
3093 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
// Check whether the PG has been flagged for removal by looking for the
// "_remove" key in the pgmeta omap.
// NOTE(review): fragment is missing the remaining parameters, the keys
// declaration, and the return statements; only the surviving text remains.
bool PG::_has_removal_flag(ObjectStore *store,
  ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
  // first try new way
  keys.insert("_remove");
  map<string,bufferlist> values;
  if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
// Read the osdmap epoch stored in the pgmeta omap (epoch_key) without
// fully loading the PG; sanity-checks the struct version first.
// NOTE(review): fragment is missing some original lines (remaining
// parameters, keys/struct_v declarations, error handling, return);
// only the surviving text is reproduced.
int PG::peek_map_epoch(ObjectStore *store,
  ghobject_t legacy_infos_oid(OSD::make_infos_oid());
  ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
  epoch_t cur_epoch = 0;
  // validate collection name
  assert(coll.is_pg());
  keys.insert(infover_key);
  keys.insert(epoch_key);
  map<string,bufferlist> values;
  int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
    assert(values.size() == 2);
    // sanity check version
    bufferlist::iterator bp = values[infover_key].begin();
    ::decode(struct_v, bp);
    assert(struct_v >= 8);
    // get epoch
    bp = values[epoch_key].begin();
    ::decode(cur_epoch, bp);
    // probably bug 10617; see OSD::load_pgs()
  *pepoch = cur_epoch;
3155 #pragma GCC diagnostic pop
3156 #pragma GCC diagnostic warning "-Wpragmas"
// Persist dirty PG state into transaction t: serialize info when dirty,
// always flush pending log/missing writes, and stage everything as omap
// keys on the pgmeta object.
// NOTE(review): fragment is missing the km-empty guard and braces.
void PG::write_if_dirty(ObjectStore::Transaction& t)
  map<string,bufferlist> km;
  if (dirty_big_info || dirty_info)
    prepare_write_info(&km);
  pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
    t.omap_setkeys(coll, pgmeta_oid, km);
  // Primary-only log trim (function header dropped by extraction —
  // presumably PG::trim_log(); confirm against upstream): tell every
  // acting/backfill peer to trim to pg_trim_to, then trim locally.
  assert(is_primary());
  dout(10) << __func__ << " to " << pg_trim_to << dendl;
  if (pg_trim_to != eversion_t()) {
    // inform peers to trim log
    assert(!actingbackfill.empty());
    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
	 i != actingbackfill.end();
      if (*i == pg_whoami) continue;   // don't message ourselves
      osd->send_message_osd_cluster(
	  get_osdmap()->get_epoch(),
	  spg_t(info.pgid.pgid, i->shard),
	get_osdmap()->get_epoch());
    // trim primary as well
    pg_log.trim(pg_trim_to, info);
// Append one entry to the pg log, advancing last_update (and
// last_complete / last_user_version where appropriate).
// NOTE(review): fragment is missing the braces and a dirty_info line.
void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
  // raise last_complete only if we were previously up to date
  if (info.last_complete == info.last_update)
    info.last_complete = e.version;
  // raise last_update.
  assert(e.version > info.last_update);
  info.last_update = e.version;
  // raise user_version, if it increased (it may have not get bumped
  // by all logged updates)
  if (e.user_version > info.last_user_version)
    info.last_user_version = e.user_version;
  pg_log.add(e, applied);
  dout(10) << "add_log_entry " << e << dendl;
// Append a batch of log entries: update the snap map, sync history's
// last_epoch/interval_started, add each entry, roll forward where needed,
// maintain the primary's projected log, and finally trim to trim_to.
// NOTE(review): fragment is missing some original lines (trim_to parameter,
// loop increments, roll_forward_to arguments, write_if_dirty); the dangling
// /* comments below have been closed — their closing lines were dropped.
void PG::append_log(
  const vector<pg_log_entry_t>& logv,
  eversion_t roll_forward_to,
  ObjectStore::Transaction &t,
  bool transaction_applied)
  if (transaction_applied)
    update_snap_map(logv, t);
  /* The primary has sent an info updating the history, but it may not
   * have arrived yet.  We want to make sure that we cannot remember this
   * write without remembering that it happened in an interval which went
   * active in epoch history.last_epoch_started. */
  if (info.last_epoch_started != info.history.last_epoch_started) {
    info.history.last_epoch_started = info.last_epoch_started;
  if (info.last_interval_started != info.history.last_interval_started) {
    info.history.last_interval_started = info.last_interval_started;
  dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
  PGLogEntryHandler handler{this, &t};
  if (!transaction_applied) {
    /* We must be a backfill peer, so it's ok if we apply
     * out-of-turn since we won't be considered when
     * determining a min possible last_update. */
    pg_log.roll_forward(&handler);
  for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
    add_log_entry(*p, transaction_applied);
    /* We don't want to leave the rollforward artifacts around
     * here past last_backfill.  It's ok for the same reason as
     * above. */
    if (transaction_applied &&
	p->soid > info.last_backfill) {
      pg_log.roll_forward(&handler);
  auto last = logv.rbegin();
  if (is_primary() && last != logv.rend()) {
    projected_log.skip_can_rollback_to_to_head();
    projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
  if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
    pg_log.roll_forward_to(
    t.register_on_applied(
      new C_UpdateLastRollbackInfoTrimmedToApplied(
	get_osdmap()->get_epoch(),
  pg_log.trim(trim_to, info);
  // update the local pg, pg log
// Stubbed-out log corruption check (the return line was dropped by
// extraction).
bool PG::check_log_for_corruption(ObjectStore *store)
  /// TODO: this method needs to work with the omap log
//! Get the name we're going to save our corrupt page log as
// Builds "corrupt_log_<timestamp>_<pgid>"; falls back to a fixed name if
// strftime fails.
// NOTE(review): fragment is missing some original lines (buf/tm_buf/out
// declarations, the ret guard, final return); only the surviving text remains.
std::string PG::get_corrupt_pg_log_name() const
  const int MAX_BUF = 512;
  time_t my_time(time(NULL));
  const struct tm *t = localtime_r(&my_time, &tm_buf);
  int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
    dout(0) << "strftime failed" << dendl;
    return "corrupt_log_unknown_time";
  out += stringify(info.pgid);
// Body of PG::read_info (the `int PG::read_info(` signature line was
// dropped by the extraction — confirm against upstream).  Loads PG info,
// past_intervals and purged_snaps from the pgmeta omap (struct_v >= 8),
// applying the optional fastinfo overlay; falls back to the legacy v7
// layout in the meta collection's infos object otherwise.
// NOTE(review): several original lines (decode of info, return paths,
// braces) are missing; only the surviving text is reproduced.
  ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
  pg_info_t &info, PastIntervals &past_intervals,
  // try for v8 or later
  keys.insert(infover_key);
  keys.insert(info_key);
  keys.insert(biginfo_key);
  keys.insert(fastinfo_key);
  ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
  map<string,bufferlist> values;
  int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
    // fastinfo_key is optional, so 3 or 4 keys
    assert(values.size() == 3 ||
	   values.size() == 4);
    bufferlist::iterator p = values[infover_key].begin();
    ::decode(struct_v, p);
    assert(struct_v >= 8);
    p = values[info_key].begin();
    p = values[biginfo_key].begin();
    if (struct_v >= 10) {
      ::decode(past_intervals, p);
      past_intervals.decode_classic(p);
    ::decode(info.purged_snaps, p);
    // overlay the compact fast-path fields, if present
      p = values[fastinfo_key].begin();
      pg_fast_info_t fast;
      fast.try_apply_to(&info);
  // legacy (struct_v == 7) path: info lives in the meta collection
  ghobject_t infos_oid(OSD::make_infos_oid());
  bufferlist::iterator p = bl.begin();
  ::decode(struct_v, p);
  assert(struct_v == 7);
  // get info out of leveldb
  string k = get_info_key(info.pgid);
  string bk = get_biginfo_key(info.pgid);
  store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
  assert(values.size() == 2);
  p = values[k].begin();
  p = values[bk].begin();
  ::decode(past_intervals, p);
  interval_set<snapid_t> snap_collections;  // obsolete
  ::decode(snap_collections, p);
  ::decode(info.purged_snaps, p);
// Load full PG state from disk: info/past_intervals via read_info, then
// the log and missing set (forcing a missing-set rebuild for pre-v9,
// i.e. jewel-era, metadata); finally reports any inconsistencies.
// NOTE(review): fragment is missing some original lines (assert(r >= 0),
// oss declaration, log_weirdness call, braces); surviving text only.
void PG::read_state(ObjectStore *store, bufferlist &bl)
  int r = read_info(store, pg_id, coll, bl, info, past_intervals,
  last_written_info = info;
  // if we are upgrading from jewel, we need to force rebuild of
  // missing set.  v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
  // (before kraken).  persisted missing set was circa
  // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
  // v8 was pre-jewel (per-pg meta object).
  bool force_rebuild_missing = info_struct_v < 9;
  if (force_rebuild_missing) {
    dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
  pg_log.read_log_and_missing(
    info_struct_v < 8 ? coll_t::meta() : coll,
    ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
    force_rebuild_missing,
    cct->_conf->osd_ignore_stale_divergent_priors,
    cct->_conf->osd_debug_verify_missing_on_start);
    osd->clog->error() << oss.str();
  if (force_rebuild_missing) {
    dout(10) << __func__ << " forced rebuild of missing got "
	     << pg_log.get_missing()
  // log any weirdness
// Report inconsistencies between the pg log and info to the cluster log:
// tail/head mismatches, log bounds outside (tail,head], and a caller_ops
// index larger than the log itself.
// NOTE(review): fragment is missing some original lines (the "actual ["
// stream piece, braces); only the surviving text is reproduced.
void PG::log_weirdness()
  if (pg_log.get_tail() != info.log_tail)
    osd->clog->error() << info.pgid
		       << " info mismatch, log.tail " << pg_log.get_tail()
		       << " != info.log_tail " << info.log_tail;
  if (pg_log.get_head() != info.last_update)
    osd->clog->error() << info.pgid
		       << " info mismatch, log.head " << pg_log.get_head()
		       << " != info.last_update " << info.last_update;
  if (!pg_log.get_log().empty()) {
    // sloppy check: the first entry should be newer than the tail
    if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
      osd->clog->error() << info.pgid
			 << " log bound mismatch, info (tail,head] ("
			 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
			 << pg_log.get_log().log.begin()->version << ","
			 << pg_log.get_log().log.rbegin()->version << "]";
  if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
    osd->clog->error() << info.pgid
		       << " caller_ops.size " << pg_log.get_log().caller_ops.size()
		       << " > log size " << pg_log.get_log().log.size();
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Keeps the SnapMapper in sync with a batch of new log entries: deletes
// remove the oid mapping, clones/promotes add one (from the entry's encoded
// snaps bufferlist), modifies update it. Only clone objects
// (snap < CEPH_MAXSNAP) are tracked.
3450 void PG::update_snap_map(
3451 const vector
<pg_log_entry_t
> &log_entries
,
3452 ObjectStore::Transaction
&t
)
3454 for (vector
<pg_log_entry_t
>::const_iterator i
= log_entries
.begin();
3455 i
!= log_entries
.end();
3457 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
3458 if (i
->soid
.snap
< CEPH_MAXSNAP
) {
3459 if (i
->is_delete()) {
3460 int r
= snap_mapper
.remove_oid(
3464 } else if (i
->is_update()) {
3465 assert(i
->snaps
.length() > 0);
3466 vector
<snapid_t
> snaps
;
3467 bufferlist snapbl
= i
->snaps
;
3468 bufferlist::iterator p
= snapbl
.begin();
// Decode failure is logged but (per visible code) not fatal here.
3472 derr
<< __func__
<< " decode snaps failure on " << *i
<< dendl
;
3475 set
<snapid_t
> _snaps(snaps
.begin(), snaps
.end());
3477 if (i
->is_clone() || i
->is_promote()) {
3478 snap_mapper
.add_oid(
3482 } else if (i
->is_modify()) {
3483 assert(i
->is_modify());
3484 int r
= snap_mapper
.update_snaps(
3491 assert(i
->is_clean());
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Removes snaps that are being trimmed (snap_trimq) or already purged
// (info.purged_snaps) from a client-supplied snap context, in place.
// The replacement vector is only materialized lazily, on the first snap
// that actually needs filtering; if nothing matches, `snaps` is untouched.
3499 * filter trimming|trimmed snaps out of snapcontext
3501 void PG::filter_snapc(vector
<snapid_t
> &snaps
)
3503 //nothing needs to trim, we can return immediately
3504 if(snap_trimq
.empty() && info
.purged_snaps
.empty())
3507 bool filtering
= false;
3508 vector
<snapid_t
> newsnaps
;
3509 for (vector
<snapid_t
>::iterator p
= snaps
.begin();
3512 if (snap_trimq
.contains(*p
) || info
.purged_snaps
.contains(*p
)) {
3514 // start building a new vector with what we've seen so far
3515 dout(10) << "filter_snapc filtering " << snaps
<< dendl
;
3516 newsnaps
.insert(newsnaps
.begin(), snaps
.begin(), p
);
3519 dout(20) << "filter_snapc removing trimq|purged snap " << *p
<< dendl
;
3522 newsnaps
.push_back(*p
); // continue building new vector
3526 snaps
.swap(newsnaps
);
3527 dout(10) << "filter_snapc result " << snaps
<< dendl
;
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Requeues every per-object list of blocked ops in `m` via requeue_ops().
// (The map is presumably cleared afterwards — line missing from extract;
// TODO confirm against upstream.)
3531 void PG::requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
)
3533 for (map
<hobject_t
, list
<OpRequestRef
>>::iterator it
= m
.begin();
3536 requeue_ops(it
->second
);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Requeues a single op: if its source is already waiting for a newer map,
// put it at the FRONT of that wait list (to preserve per-source order);
// otherwise push it onto the front of the OSD op queue at the current epoch.
3540 void PG::requeue_op(OpRequestRef op
)
3542 auto p
= waiting_for_map
.find(op
->get_source());
3543 if (p
!= waiting_for_map
.end()) {
3544 dout(20) << __func__
<< " " << op
<< " (waiting_for_map " << p
->first
<< ")"
3546 p
->second
.push_front(op
);
3548 dout(20) << __func__
<< " " << op
<< dendl
;
3549 osd
->enqueue_front(info
.pgid
, PGQueueable(op
, get_osdmap()->get_epoch()));
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Requeues a whole list of ops. Iterates in REVERSE and pushes each to the
// front (of the map-wait list or the op queue), which preserves the
// original relative ordering of the ops.
3553 void PG::requeue_ops(list
<OpRequestRef
> &ls
)
3555 for (list
<OpRequestRef
>::reverse_iterator i
= ls
.rbegin();
3558 auto p
= waiting_for_map
.find((*i
)->get_source());
3559 if (p
!= waiting_for_map
.end()) {
3560 dout(20) << __func__
<< " " << *i
<< " (waiting_for_map " << p
->first
3562 p
->second
.push_front(*i
);
3564 dout(20) << __func__
<< " " << *i
<< dendl
;
3565 osd
->enqueue_front(info
.pgid
, PGQueueable(*i
, get_osdmap()->get_epoch()));
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Walks waiting_for_map and, for every source whose front op's min_epoch is
// now satisfied by the current osdmap epoch, requeues that source's ops
// (reverse order + enqueue_front preserves ordering) and erases the entry.
// Entries that must still wait are left in place.
3571 void PG::requeue_map_waiters()
3573 epoch_t epoch
= get_osdmap()->get_epoch();
3574 auto p
= waiting_for_map
.begin();
3575 while (p
!= waiting_for_map
.end()) {
3576 if (epoch
< p
->second
.front()->min_epoch
) {
3577 dout(20) << __func__
<< " " << p
->first
<< " front op "
3578 << p
->second
.front() << " must still wait, doing nothing"
3582 dout(20) << __func__
<< " " << p
->first
<< " " << p
->second
<< dendl
;
3583 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
3584 osd
->enqueue_front(info
.pgid
, PGQueueable(*q
, epoch
));
3586 p
= waiting_for_map
.erase(p
);
3592 // ==========================================================================================
3596 * when holding pg and sched_scrub_lock, then the states are:
3598 * scrubber.reserved = true
3599 * scrubber.reserved_peers includes whoami
3600 * osd->scrub_pending++
3601 * scheduling, replica declined:
3602 * scrubber.reserved = true
3603 * scrubber.reserved_peers includes -1
3604 * osd->scrub_pending++
3606 * scrubber.reserved = true
3607 * scrubber.reserved_peers.size() == acting.size();
3609 * osd->scrub_pending++
3611 * scrubber.reserved = false;
3612 * scrubber.reserved_peers empty
3613 * osd->scrubber.active++
3616 // returns true if a scrub has been newly kicked off
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Drives scrub scheduling on the primary: decides scrub vs deep-scrub
// (interval expiry, random coin flip, NODEEP_SCRUB/NOSCRUB flags), reserves
// the local scrub slot and the replicas, and once all peers have granted,
// sets the scrub state flags. Returns whether a scrub was newly kicked off.
3617 bool PG::sched_scrub()
3619 bool nodeep_scrub
= false;
3620 assert(is_locked());
3621 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
// Pool-level deep-scrub interval overrides the global config when set.
3625 double deep_scrub_interval
= 0;
3626 pool
.info
.opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &deep_scrub_interval
);
3627 if (deep_scrub_interval
<= 0) {
3628 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
3630 bool time_for_deep
= ceph_clock_now() >=
3631 info
.history
.last_deep_scrub_stamp
+ deep_scrub_interval
;
3633 bool deep_coin_flip
= false;
3634 // Only add random deep scrubs when NOT user initiated scrub
3635 if (!scrubber
.must_scrub
)
3636 deep_coin_flip
= (rand() % 100) < cct
->_conf
->osd_deep_scrub_randomize_ratio
* 100;
3637 dout(20) << __func__
<< ": time_for_deep=" << time_for_deep
<< " deep_coin_flip=" << deep_coin_flip
<< dendl
;
3639 time_for_deep
= (time_for_deep
|| deep_coin_flip
);
3641 //NODEEP_SCRUB so ignore time initiated deep-scrub
3642 if (osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB
) ||
3643 pool
.info
.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB
)) {
3644 time_for_deep
= false;
3645 nodeep_scrub
= true;
3648 if (!scrubber
.must_scrub
) {
3649 assert(!scrubber
.must_deep_scrub
);
3651 //NOSCRUB so skip regular scrubs
3652 if ((osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB
) ||
3653 pool
.info
.has_flag(pg_pool_t::FLAG_NOSCRUB
)) && !time_for_deep
) {
3654 if (scrubber
.reserved
) {
3655 // cancel scrub if it is still in scheduling,
3656 // so pgs from other pools where scrub are still legal
3657 // have a chance to go ahead with scrubbing.
3658 clear_scrub_reserved();
3659 scrub_unreserve_replicas();
// Auto-repair is only enabled for scheduled (non-user-initiated) scrubs
// on backends that support it.
3665 if (cct
->_conf
->osd_scrub_auto_repair
3666 && get_pgbackend()->auto_repair_supported()
3668 // respect the command from user, and not do auto-repair
3669 && !scrubber
.must_repair
3670 && !scrubber
.must_scrub
3671 && !scrubber
.must_deep_scrub
) {
3672 dout(20) << __func__
<< ": auto repair with deep scrubbing" << dendl
;
3673 scrubber
.auto_repair
= true;
3675 // this happens when user issue the scrub/repair command during
3676 // the scheduling of the scrub/repair (e.g. request reservation)
3677 scrubber
.auto_repair
= false;
// Phase 1: reserve the local slot, then ask every replica to reserve.
3681 if (!scrubber
.reserved
) {
3682 assert(scrubber
.reserved_peers
.empty());
3683 if ((cct
->_conf
->osd_scrub_during_recovery
|| !osd
->is_recovery_active()) &&
3684 osd
->inc_scrubs_pending()) {
3685 dout(20) << __func__
<< ": reserved locally, reserving replicas" << dendl
;
3686 scrubber
.reserved
= true;
3687 scrubber
.reserved_peers
.insert(pg_whoami
);
3688 scrub_reserve_replicas();
3690 dout(20) << __func__
<< ": failed to reserve locally" << dendl
;
// Phase 2: once reservations resolve, either abort (a peer declined) or,
// when all acting peers granted, finalize the scrub type and queue it.
3694 if (scrubber
.reserved
) {
3695 if (scrubber
.reserve_failed
) {
3696 dout(20) << "sched_scrub: failed, a peer declined" << dendl
;
3697 clear_scrub_reserved();
3698 scrub_unreserve_replicas();
3700 } else if (scrubber
.reserved_peers
.size() == acting
.size()) {
3701 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl
;
3702 if (time_for_deep
) {
3703 dout(10) << "sched_scrub: scrub will be deep" << dendl
;
3704 state_set(PG_STATE_DEEP_SCRUB
);
3705 } else if (!scrubber
.must_deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
) {
3706 if (!nodeep_scrub
) {
3707 osd
->clog
->info() << "osd." << osd
->whoami
3708 << " pg " << info
.pgid
3709 << " Deep scrub errors, upgrading scrub to deep-scrub";
3710 state_set(PG_STATE_DEEP_SCRUB
);
3711 } else if (!scrubber
.must_scrub
) {
3712 osd
->clog
->error() << "osd." << osd
->whoami
3713 << " pg " << info
.pgid
3714 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3715 clear_scrub_reserved();
3716 scrub_unreserve_replicas();
3719 osd
->clog
->error() << "osd." << osd
->whoami
3720 << " pg " << info
.pgid
3721 << " Regular scrub request, deep-scrub details will be lost";
3726 // none declined, since scrubber.reserved is set
3727 dout(20) << "sched_scrub: reserved " << scrubber
.reserved_peers
<< ", waiting for replicas" << dendl
;
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Registers this PG in the OSD's scrub schedule. Must-scrub requests (and
// PGs with invalid stats, when configured) use "now" as the base stamp so
// they sort to the front; otherwise the last scrub stamp is used. The
// returned sched_time is remembered so the entry can be removed later.
3734 void PG::reg_next_scrub()
3740 if (scrubber
.must_scrub
||
3741 (info
.stats
.stats_invalid
&& cct
->_conf
->osd_scrub_invalid_stats
)) {
3742 reg_stamp
= ceph_clock_now();
3744 reg_stamp
= info
.history
.last_scrub_stamp
;
3746 // note down the sched_time, so we can locate this scrub, and remove it
3748 double scrub_min_interval
= 0, scrub_max_interval
= 0;
3749 pool
.info
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &scrub_min_interval
);
3750 pool
.info
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &scrub_max_interval
);
3751 assert(scrubber
.scrub_reg_stamp
== utime_t());
3752 scrubber
.scrub_reg_stamp
= osd
->reg_pg_scrub(info
.pgid
,
3756 scrubber
.must_scrub
);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Removes this PG from the OSD scrub schedule and clears the remembered
// registration stamp (utime_t() doubles as the "not registered" sentinel,
// matching the assert in reg_next_scrub()).
3759 void PG::unreg_next_scrub()
3762 osd
->unreg_pg_scrub(info
.pgid
, scrubber
.scrub_reg_stamp
);
3763 scrubber
.scrub_reg_stamp
= utime_t();
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Primary-side handler for a replica's MOSDRepScrubMap: drops stale
// messages from an older interval, decodes the replica's scrub map into
// scrubber.received_maps, marks the replica as no longer waited-on (or
// records a preemption), and requeues the scrub once all maps are in.
3767 void PG::do_replica_scrub_map(OpRequestRef op
)
3769 const MOSDRepScrubMap
*m
= static_cast<const MOSDRepScrubMap
*>(op
->get_req());
3770 dout(7) << __func__
<< " " << *m
<< dendl
;
3771 if (m
->map_epoch
< info
.history
.same_interval_since
) {
3772 dout(10) << __func__
<< " discarding old from "
3773 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
3777 if (!scrubber
.is_chunky_scrub_active()) {
3778 dout(10) << __func__
<< " scrub isn't active" << dendl
;
// const_cast: decode needs a mutable iterator over the message payload.
3784 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3785 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
3786 dout(10) << "map version is "
3787 << scrubber
.received_maps
[m
->from
].valid_through
3790 dout(10) << __func__
<< " waiting_on_whom was " << scrubber
.waiting_on_whom
3792 assert(scrubber
.waiting_on_whom
.count(m
->from
));
3793 scrubber
.waiting_on_whom
.erase(m
->from
);
3795 dout(10) << __func__
<< " replica was preempted, setting flag" << dendl
;
3796 scrub_preempted
= true;
3798 if (scrubber
.waiting_on_whom
.empty()) {
3799 if (ops_blocked_by_scrub()) {
3800 requeue_scrub(true);
3802 requeue_scrub(false);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Legacy (jewel-compat) variant of do_replica_scrub_map(): same flow, but
// the replica's map arrives wrapped in an MOSDSubOp instead of
// MOSDRepScrubMap. No preemption handling in this path.
3807 void PG::sub_op_scrub_map(OpRequestRef op
)
3809 // for legacy jewel compatibility only
3810 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
3811 assert(m
->get_type() == MSG_OSD_SUBOP
);
3812 dout(7) << "sub_op_scrub_map" << dendl
;
3814 if (m
->map_epoch
< info
.history
.same_interval_since
) {
3815 dout(10) << "sub_op_scrub discarding old sub_op from "
3816 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
<< dendl
;
3820 if (!scrubber
.is_chunky_scrub_active()) {
3821 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl
;
3827 dout(10) << " got " << m
->from
<< " scrub map" << dendl
;
3828 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3830 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
3831 dout(10) << "map version is "
3832 << scrubber
.received_maps
[m
->from
].valid_through
3835 scrubber
.waiting_on_whom
.erase(m
->from
);
3837 if (scrubber
.waiting_on_whom
.empty()) {
3838 if (ops_blocked_by_scrub()) {
3839 requeue_scrub(true);
3841 requeue_scrub(false);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Sends an MOSDRepScrub to one replica asking it to build a scrub map for
// [start, end) at/after `version`, optionally allowing preemption.
3846 // send scrub v3 messages (chunky scrub)
3847 void PG::_request_scrub_map(
3848 pg_shard_t replica
, eversion_t version
,
3849 hobject_t start
, hobject_t end
,
3851 bool allow_preemption
)
3853 assert(replica
!= pg_whoami
);
3854 dout(10) << "scrub requesting scrubmap from osd." << replica
3855 << " deep " << (int)deep
<< dendl
;
3856 MOSDRepScrub
*repscrubop
= new MOSDRepScrub(
3857 spg_t(info
.pgid
.pgid
, replica
.shard
), version
,
3858 get_osdmap()->get_epoch(),
3859 get_last_peering_reset(),
3863 ops_blocked_by_scrub());
3864 // default priority, we want the rep scrub processed prior to any recovery
3865 // or client io messages (we are holding a lock!)
3866 osd
->send_message_osd_cluster(
3867 replica
.osd
, repscrubop
, get_osdmap()->get_epoch());
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Replica-side handler for a scrub reservation request from the primary:
// tries to take a local scrub-pending slot (unless recovery is active and
// scrub-during-recovery is off) and replies GRANT or REJECT. Two reply
// paths: MOSDScrubReserve (current) or MOSDSubOpReply (jewel compat).
3870 void PG::handle_scrub_reserve_request(OpRequestRef op
)
3872 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3874 if (scrubber
.reserved
) {
3875 dout(10) << __func__
<< " ignoring reserve request: Already reserved"
3879 if ((cct
->_conf
->osd_scrub_during_recovery
|| !osd
->is_recovery_active()) &&
3880 osd
->inc_scrubs_pending()) {
3881 scrubber
.reserved
= true;
3883 dout(20) << __func__
<< ": failed to reserve remotely" << dendl
;
3884 scrubber
.reserved
= false;
3886 if (op
->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE
) {
3887 const MOSDScrubReserve
*m
=
3888 static_cast<const MOSDScrubReserve
*>(op
->get_req());
3889 Message
*reply
= new MOSDScrubReserve(
3890 spg_t(info
.pgid
.pgid
, primary
.shard
),
3892 scrubber
.reserved
? MOSDScrubReserve::GRANT
: MOSDScrubReserve::REJECT
,
3894 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
3896 // for jewel compat only
3897 const MOSDSubOp
*req
= static_cast<const MOSDSubOp
*>(op
->get_req());
3898 assert(req
->get_type() == MSG_OSD_SUBOP
);
3899 MOSDSubOpReply
*reply
= new MOSDSubOpReply(
3900 req
, pg_whoami
, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK
);
3901 ::encode(scrubber
.reserved
, reply
->get_data());
3902 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Primary-side handler for a replica's GRANT: records `from` in
// scrubber.reserved_peers. Ignores the reply if we no longer hold a
// reservation, and tolerates duplicate grants.
3906 void PG::handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
)
3908 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3910 if (!scrubber
.reserved
) {
3911 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
3914 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
3915 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
3917 dout(10) << " osd." << from
<< " scrub reserve = success" << dendl
;
3918 scrubber
.reserved_peers
.insert(from
);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Primary-side handler for a replica's REJECT: sets reserve_failed, which
// sched_scrub() later treats as "a peer declined" and aborts scheduling.
3923 void PG::handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
)
3925 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3927 if (!scrubber
.reserved
) {
3928 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
3931 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
3932 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
3934 /* One decline stops this pg from being scheduled for scrubbing. */
3935 dout(10) << " osd." << from
<< " scrub reserve = fail" << dendl
;
3936 scrubber
.reserve_failed
= true;
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Replica-side handler for a reservation RELEASE: drops local reservation
// state via clear_scrub_reserved().
3941 void PG::handle_scrub_reserve_release(OpRequestRef op
)
3943 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3945 clear_scrub_reserved();
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Sends a backfill-reservation REJECT back to the primary shard.
3948 void PG::reject_reservation()
3950 osd
->send_message_osd_cluster(
3952 new MBackfillReserve(
3953 MBackfillReserve::REJECT
,
3954 spg_t(info
.pgid
.pgid
, primary
.shard
),
3955 get_osdmap()->get_epoch()),
3956 get_osdmap()->get_epoch());
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Schedules a delayed RequestBackfill peering event on the OSD's recovery
// request timer (under recovery_request_lock); the event is tagged with the
// current epoch so it is discarded if the map has moved on.
3959 void PG::schedule_backfill_retry(float delay
)
3961 Mutex::Locker
lock(osd
->recovery_request_lock
);
3962 osd
->recovery_request_timer
.add_event_after(
3964 new QueuePeeringEvt
<RequestBackfill
>(
3965 this, get_osdmap()->get_epoch(),
3966 RequestBackfill()));
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Same mechanism as schedule_backfill_retry(), but queues a delayed
// DoRecovery peering event instead.
3969 void PG::schedule_recovery_retry(float delay
)
3971 Mutex::Locker
lock(osd
->recovery_request_lock
);
3972 osd
->recovery_request_timer
.add_event_after(
3974 new QueuePeeringEvt
<DoRecovery
>(
3975 this, get_osdmap()->get_epoch(),
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Drops all local scrub-reservation state and, if we actually held a
// reservation, returns the OSD-wide scrubs_pending slot.
3979 void PG::clear_scrub_reserved()
3981 scrubber
.reserved_peers
.clear();
3982 scrubber
.reserve_failed
= false;
3984 if (scrubber
.reserved
) {
3985 scrubber
.reserved
= false;
3986 osd
->dec_scrubs_pending();
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Asks every peer in actingbackfill (except ourselves) to reserve a scrub
// slot: MOSDScrubReserve REQUEST for luminous+ peers, or a
// CEPH_OSD_OP_SCRUB_RESERVE sub-op for jewel-era peers.
3990 void PG::scrub_reserve_replicas()
3992 assert(backfill_targets
.empty());
3993 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
3994 i
!= actingbackfill
.end();
3996 if (*i
== pg_whoami
) continue;
3997 dout(10) << "scrub requesting reserve from osd." << *i
<< dendl
;
3998 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
3999 osd
->send_message_osd_cluster(
4001 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
4002 get_osdmap()->get_epoch(),
4003 MOSDScrubReserve::REQUEST
, pg_whoami
),
4004 get_osdmap()->get_epoch());
4006 // for jewel compat only
4007 vector
<OSDOp
> scrub(1);
4008 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_RESERVE
;
4012 MOSDSubOp
*subop
= new MOSDSubOp(
4013 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
4014 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
4016 osd
->send_message_osd_cluster(
4017 i
->osd
, subop
, get_osdmap()->get_epoch());
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Mirror of scrub_reserve_replicas(): tells every peer to release its scrub
// reservation (MOSDScrubReserve RELEASE, or CEPH_OSD_OP_SCRUB_UNRESERVE
// sub-op for jewel-era peers).
4022 void PG::scrub_unreserve_replicas()
4024 assert(backfill_targets
.empty());
4025 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4026 i
!= actingbackfill
.end();
4028 if (*i
== pg_whoami
) continue;
4029 dout(10) << "scrub requesting unreserve from osd." << *i
<< dendl
;
4030 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
4031 osd
->send_message_osd_cluster(
4033 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
4034 get_osdmap()->get_epoch(),
4035 MOSDScrubReserve::RELEASE
, pg_whoami
),
4036 get_osdmap()->get_epoch());
4038 // for jewel compat only
4039 vector
<OSDOp
> scrub(1);
4040 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_UNRESERVE
;
4044 MOSDSubOp
*subop
= new MOSDSubOp(
4045 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
4046 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
4048 osd
->send_message_osd_cluster(i
->osd
, subop
, get_osdmap()->get_epoch());
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Scans rollback objects found during a scrub chunk: any whose generation
// is older than last_rollback_info_trimmed_to_applied is obsolete — it is
// reported to the cluster log and a cleanup transaction is queued.
4053 void PG::_scan_rollback_obs(
4054 const vector
<ghobject_t
> &rollback_obs
,
4055 ThreadPool::TPHandle
&handle
)
4057 ObjectStore::Transaction t
;
4058 eversion_t trimmed_to
= last_rollback_info_trimmed_to_applied
;
4059 for (vector
<ghobject_t
>::const_iterator i
= rollback_obs
.begin();
4060 i
!= rollback_obs
.end();
4062 if (i
->generation
< trimmed_to
.version
) {
4063 osd
->clog
->error() << "osd." << osd
->whoami
4064 << " pg " << info
.pgid
4065 << " found obsolete rollback obj "
4066 << *i
<< " generation < trimmed_to "
4073 derr
<< __func__
<< ": queueing trans to clean up obsolete rollback objs"
4075 osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Walks a scrub map (newest-to-oldest so each head/snapdir's SnapSet is
// seen before its clones), computes each clone's expected snap set (from
// SnapSet::clone_snaps, or legacy oi.legacy_snaps), compares it with what
// the SnapMapper has, and repairs the mapper synchronously on mismatch,
// reporting every inconsistency to the cluster log.
4079 void PG::_scan_snaps(ScrubMap
&smap
)
4084 // Test qa/standalone/scrub/osd-scrub-snaps.sh uses this message to verify
4085 // caller using clean_meta_map(), and it works properly.
4086 dout(20) << __func__
<< " start" << dendl
;
4088 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4089 i
!= smap
.objects
.rend();
4091 const hobject_t
&hoid
= i
->first
;
4092 ScrubMap::object
&o
= i
->second
;
4094 dout(20) << __func__
<< " " << hoid
<< dendl
;
4096 if (hoid
.is_head() || hoid
.is_snapdir()) {
4097 // parse the SnapSet
4099 if (o
.attrs
.find(SS_ATTR
) == o
.attrs
.end()) {
4102 bl
.push_back(o
.attrs
[SS_ATTR
]);
4103 auto p
= bl
.begin();
4105 ::decode(snapset
, p
);
4109 head
= hoid
.get_head();
4110 // Make sure head_exists is correct for is_legacy() check
4112 snapset
.head_exists
= true;
4115 if (hoid
.snap
< CEPH_MAXSNAP
) {
4116 // check and if necessary fix snap_mapper
4117 if (hoid
.get_head() != head
) {
// Clone encountered without having seen its head/snapdir first.
4118 derr
<< __func__
<< " no head for " << hoid
<< " (have " << head
<< ")"
4122 set
<snapid_t
> obj_snaps
;
4123 if (!snapset
.is_legacy()) {
4124 auto p
= snapset
.clone_snaps
.find(hoid
.snap
);
4125 if (p
== snapset
.clone_snaps
.end()) {
4126 derr
<< __func__
<< " no clone_snaps for " << hoid
<< " in " << snapset
4130 obj_snaps
.insert(p
->second
.begin(), p
->second
.end());
// Legacy SnapSet: snaps live in the clone's own object_info_t.
4133 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
4136 bl
.push_back(o
.attrs
[OI_ATTR
]);
4143 obj_snaps
.insert(oi
.legacy_snaps
.begin(), oi
.legacy_snaps
.end());
4145 set
<snapid_t
> cur_snaps
;
4146 int r
= snap_mapper
.get_snaps(hoid
, &cur_snaps
);
4147 if (r
!= 0 && r
!= -ENOENT
) {
4148 derr
<< __func__
<< ": get_snaps returned " << cpp_strerror(r
) << dendl
;
4151 if (r
== -ENOENT
|| cur_snaps
!= obj_snaps
) {
4152 ObjectStore::Transaction t
;
4153 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4155 r
= snap_mapper
.remove_oid(hoid
, &_t
);
4157 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
)
4161 osd
->clog
->error() << "osd." << osd
->whoami
4162 << " found snap mapper error on pg "
4164 << " oid " << hoid
<< " snaps in mapper: "
4165 << cur_snaps
<< ", oi: "
4169 osd
->clog
->error() << "osd." << osd
->whoami
4170 << " found snap mapper error on pg "
4172 << " oid " << hoid
<< " snaps missing in mapper"
4177 snap_mapper
.add_oid(hoid
, obj_snaps
, &_t
);
4179 // wait for repair to apply to avoid confusing other bits of the system.
4182 Mutex
my_lock("PG::_scan_snaps my_lock");
4185 t
.register_on_applied_sync(
4186 new C_SafeCond(&my_lock
, &my_cond
, &done
, &r
));
4187 r
= osd
->store
->apply_transaction(osr
.get(), std::move(t
));
4189 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
4194 my_cond
.Wait(my_lock
);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Fixes objects whose stored object_info_t records a different oid than the
// object's actual name (oi.soid != hoid): rewrites the OI attribute both in
// the in-memory scrub map and on disk, logging the error to the cluster log.
4203 void PG::_repair_oinfo_oid(ScrubMap
&smap
)
4205 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4206 i
!= smap
.objects
.rend();
4208 const hobject_t
&hoid
= i
->first
;
4209 ScrubMap::object
&o
= i
->second
;
4212 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
4215 bl
.push_back(o
.attrs
[OI_ATTR
]);
4222 if (oi
.soid
!= hoid
) {
4223 ObjectStore::Transaction t
;
4224 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4225 osd
->clog
->error() << "osd." << osd
->whoami
4226 << " found object info error on pg "
4228 << " oid " << hoid
<< " oid in object info: "
// Re-encode the corrected object_info_t and patch both the scrub map's
// attr copy and the on-disk attr.
4234 ::encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4236 bufferptr
bp(bl
.c_str(), bl
.length());
4237 o
.attrs
[OI_ATTR
] = bp
;
4239 t
.setattr(coll
, ghobject_t(hoid
), OI_ATTR
, bl
);
4240 int r
= osd
->store
->apply_transaction(osr
.get(), std::move(t
));
4242 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Builds the scrub map for one chunk [start, end): lists the objects in
// range (restartable via ScrubMapBuilder `pos`, returning -EINPROGRESS to
// yield), scans them with be_scan_list, then post-processes (oinfo-oid
// repair; on replicas also the snap scan via the cleaned meta map).
4248 int PG::build_scrub_map_chunk(
4250 ScrubMapBuilder
&pos
,
4254 ThreadPool::TPHandle
&handle
)
4256 dout(10) << __func__
<< " [" << start
<< "," << end
<< ") "
// Phase 1: enumerate objects for this chunk (only once; pos caches them).
4261 while (pos
.empty()) {
4263 map
.valid_through
= info
.last_update
;
4267 vector
<ghobject_t
> rollback_obs
;
4268 pos
.ret
= get_pgbackend()->objects_list_range(
4275 dout(5) << "objects_list_range error: " << pos
.ret
<< dendl
;
4278 if (pos
.ls
.empty()) {
4281 _scan_rollback_obs(rollback_obs
, handle
);
4283 return -EINPROGRESS
;
// Phase 2: scan the listed objects; may yield with -EINPROGRESS.
4287 while (!pos
.done()) {
4288 int r
= get_pgbackend()->be_scan_list(map
, pos
);
4289 if (r
== -EINPROGRESS
) {
// Phase 3: finalize the chunk.
4295 dout(20) << __func__
<< " finishing" << dendl
;
4297 _repair_oinfo_oid(map
);
4298 if (!is_primary()) {
4299 ScrubMap for_meta_scrub
;
4300 // In case we restarted smaller chunk, clear old data
4301 scrubber
.cleaned_meta_map
.clear_from(scrubber
.start
);
4302 scrubber
.cleaned_meta_map
.insert(map
);
4303 scrubber
.clean_meta_map(for_meta_scrub
);
4304 _scan_snaps(for_meta_scrub
);
4307 dout(20) << __func__
<< " done, got " << map
.objects
.size() << " items"
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Hands ownership of the Scrub::Store to a completion context so it is
// destroyed only after the cleanup transaction completes (keeping the
// store alive while the transaction still references it).
4312 void PG::Scrubber::cleanup_store(ObjectStore::Transaction
*t
) {
4315 struct OnComplete
: Context
{
4316 std::unique_ptr
<Scrub::Store
> store
;
4318 std::unique_ptr
<Scrub::Store
> &&store
)
4319 : store(std::move(store
)) {}
4320 void finish(int) override
{}
4323 t
->register_on_complete(new OnComplete(std::move(store
)));
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Marks `soid` missing on the bad peer (or on the primary's own log if the
// primary itself is bad) so that ordinary recovery will repair it. The
// authoritative version is decoded from the last ok peer's OI attribute;
// for EC pools (or a bad primary) the ok peers are registered in
// missing_loc as pull sources.
4327 void PG::repair_object(
4328 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
4329 pg_shard_t bad_peer
)
4331 list
<pg_shard_t
> op_shards
;
4332 for (auto i
: *ok_peers
) {
4333 op_shards
.push_back(i
.second
);
4335 dout(10) << "repair_object " << soid
<< " bad_peer osd."
4336 << bad_peer
<< " ok_peers osd.{" << op_shards
<< "}" << dendl
;
4337 ScrubMap::object
&po
= ok_peers
->back().first
;
4340 bv
.push_back(po
.attrs
[OI_ATTR
]);
4343 bufferlist::iterator bliter
= bv
.begin();
4344 ::decode(oi
, bliter
);
4346 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
4349 if (bad_peer
!= primary
) {
4350 peer_missing
[bad_peer
].add(soid
, oi
.version
, eversion_t(), false);
4352 // We should only be scrubbing if the PG is clean.
4353 assert(waiting_for_unreadable_object
.empty());
4355 pg_log
.missing_add(soid
, oi
.version
, eversion_t());
4357 pg_log
.set_last_requested(0);
4358 dout(10) << __func__
<< ": primary = " << primary
<< dendl
;
4361 if (is_ec_pg() || bad_peer
== primary
) {
4362 // we'd better collect all shard for EC pg, and prepare good peers as the
4363 // source of pull in the case of replicated pg.
4364 missing_loc
.add_missing(soid
, oi
.version
, eversion_t());
4365 list
<pair
<ScrubMap::object
, pg_shard_t
> >::iterator i
;
4366 for (i
= ok_peers
->begin();
4367 i
!= ok_peers
->end();
4369 missing_loc
.add_location(soid
, i
->second
);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Replica-side entry for an MOSDRepScrub request: defers (parking the op in
// scrubber.active_rep_scrub) until last_update_applied reaches msg->scrub_to
// and active pushes drain, then primes the Scrubber for BUILD_MAP_REPLICA
// and requeues itself to build the requested chunk's scrub map.
4375 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4376 * for pushes to complete in case of recent recovery. Build a single
4377 * scrubmap of objects that are in the range [msg->start, msg->end).
4379 void PG::replica_scrub(
4381 ThreadPool::TPHandle
&handle
)
4383 const MOSDRepScrub
*msg
= static_cast<const MOSDRepScrub
*>(op
->get_req());
4384 assert(!scrubber
.active_rep_scrub
);
4385 dout(7) << "replica_scrub" << dendl
;
4387 if (msg
->map_epoch
< info
.history
.same_interval_since
) {
4388 dout(10) << "replica_scrub discarding old replica_scrub from "
4389 << msg
->map_epoch
<< " < " << info
.history
.same_interval_since
4394 assert(msg
->chunky
);
4395 if (last_update_applied
< msg
->scrub_to
) {
4396 dout(10) << "waiting for last_update_applied to catch up" << dendl
;
4397 scrubber
.active_rep_scrub
= op
;
4401 if (active_pushes
> 0) {
4402 dout(10) << "waiting for active pushes to finish" << dendl
;
4403 scrubber
.active_rep_scrub
= op
;
// Prime scrubber state from the request and queue the map build.
4407 scrubber
.state
= Scrubber::BUILD_MAP_REPLICA
;
4408 scrubber
.replica_scrub_start
= msg
->min_epoch
;
4409 scrubber
.start
= msg
->start
;
4410 scrubber
.end
= msg
->end
;
4411 scrubber
.max_end
= msg
->end
;
4412 scrubber
.deep
= msg
->deep
;
4413 scrubber
.epoch_start
= info
.history
.same_interval_since
;
4414 if (msg
->priority
) {
4415 scrubber
.priority
= msg
->priority
;
4417 scrubber
.priority
= get_scrub_priority();
4420 scrub_can_preempt
= msg
->allow_preemption
;
4421 scrub_preempted
= false;
4422 scrubber
.replica_scrubmap_pos
.reset();
4424 requeue_scrub(msg
->high_priority
);
// [review] Garbled extract (embedded original line numbers, missing lines);
// code kept byte-identical, comments only added.
// Top-level scrub work-queue entry: optionally performs an async
// osd_scrub_sleep (via a timer callback that re-queues the scrub, so the op
// queue is never blocked), bails out on PG reset or loss of
// primary/active/clean state, then drives the chunky-scrub state machine.
4428 * PG_STATE_SCRUBBING is set when the scrub is queued
4430 * scrub will be chunky if all OSDs in PG support chunky scrub
4431 * scrub will fail if OSDs are too old.
4433 void PG::scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
)
4435 if (cct
->_conf
->osd_scrub_sleep
> 0 &&
4436 (scrubber
.state
== PG::Scrubber::NEW_CHUNK
||
4437 scrubber
.state
== PG::Scrubber::INACTIVE
) &&
4438 scrubber
.needs_sleep
) {
4439 ceph_assert(!scrubber
.sleeping
);
4440 dout(20) << __func__
<< " state is INACTIVE|NEW_CHUNK, sleeping" << dendl
;
4442 // Do an async sleep so we don't block the op queue
// The callback captures raw values (osds/pgid/state), not `this`, because
// the PG may go away while the timer is pending; it re-looks-up the PG.
4443 OSDService
*osds
= osd
;
4444 spg_t pgid
= get_pgid();
4445 int state
= scrubber
.state
;
4446 auto scrub_requeue_callback
=
4447 new FunctionContext([osds
, pgid
, state
](int r
) {
4448 PG
*pg
= osds
->osd
->lookup_lock_pg(pgid
);
4449 if (pg
== nullptr) {
4450 lgeneric_dout(osds
->osd
->cct
, 20)
4451 << "scrub_requeue_callback: Could not find "
4452 << "PG " << pgid
<< " can't complete scrub requeue after sleep"
4456 pg
->scrubber
.sleeping
= false;
4457 pg
->scrubber
.needs_sleep
= false;
4458 lgeneric_dout(pg
->cct
, 20)
4459 << "scrub_requeue_callback: slept for "
4460 << ceph_clock_now() - pg
->scrubber
.sleep_start
4461 << ", re-queuing scrub with state " << state
<< dendl
;
4462 pg
->scrub_queued
= false;
4463 pg
->requeue_scrub();
4464 pg
->scrubber
.sleep_start
= utime_t();
4467 Mutex::Locker
l(osd
->scrub_sleep_lock
);
4468 osd
->scrub_sleep_timer
.add_event_after(cct
->_conf
->osd_scrub_sleep
,
4469 scrub_requeue_callback
);
4470 scrubber
.sleeping
= true;
4471 scrubber
.sleep_start
= ceph_clock_now();
4474 if (pg_has_reset_since(queued
)) {
4477 assert(scrub_queued
);
4478 scrub_queued
= false;
4479 scrubber
.needs_sleep
= true;
// Replicas only participate while building their chunk's map.
4482 if (!is_primary() &&
4483 scrubber
.state
== PG::Scrubber::BUILD_MAP_REPLICA
) {
4484 chunky_scrub(handle
);
4488 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4489 dout(10) << "scrub -- not primary or active or not clean" << dendl
;
4490 state_clear(PG_STATE_SCRUBBING
);
4491 state_clear(PG_STATE_REPAIR
);
4492 state_clear(PG_STATE_DEEP_SCRUB
);
4493 publish_stats_to_osd();
4497 if (!scrubber
.active
) {
4498 assert(backfill_targets
.empty());
4500 scrubber
.deep
= state_test(PG_STATE_DEEP_SCRUB
);
4502 dout(10) << "starting a new chunky scrub" << dendl
;
4505 chunky_scrub(handle
);
4509 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4512 * The object store is partitioned into chunks which end on hash boundaries. For
4513 * each chunk, the following logic is performed:
4515 * (1) Block writes on the chunk
4516 * (2) Request maps from replicas
4517 * (3) Wait for pushes to be applied (after recovery)
4518 * (4) Wait for writes to flush on the chunk
4519 * (5) Wait for maps from replicas
4520 * (6) Compare / repair all scrub maps
4521 * (7) Wait for digest updates to apply
4523 * This logic is encoded in the mostly linear state machine:
4525 * +------------------+
4526 * _________v__________ |
4529 * |____________________| |
4532 * _________v___v______ | |
4535 * |____________________| | |
4537 * _________v__________ | |
4539 * | WAIT_PUSHES | | |
4540 * |____________________| | |
4542 * _________v__________ | |
4544 * | WAIT_LAST_UPDATE | | |
4545 * |____________________| | |
4547 * _________v__________ | |
4550 * |____________________| | |
4552 * _________v__________ | |
4554 * | WAIT_REPLICAS | | |
4555 * |____________________| | |
4557 * _________v__________ | |
4559 * | COMPARE_MAPS | | |
4560 * |____________________| | |
4563 * _________v__________ | |
4565 * |WAIT_DIGEST_UPDATES | | |
4566 * |____________________| | |
4569 * _________v__________ |
4572 * |____________________| |
4574 * +------------------+
4576 * The primary determines the last update from the subset by walking the log. If
4577 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4578 * to wait until that update is applied before building a scrub map. Both the
4579 * primary and replicas will wait for any active pushes to be applied.
4581 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4583 * scrubber.state encodes the current state of the scrub (refer to state diagram
4586 void PG::chunky_scrub(ThreadPool::TPHandle
&handle
)
4588 // check for map changes
4589 if (scrubber
.is_chunky_scrub_active()) {
4590 if (scrubber
.epoch_start
!= info
.history
.same_interval_since
) {
4591 dout(10) << "scrub pg changed, aborting" << dendl
;
4592 scrub_clear_state();
4593 scrub_unreserve_replicas();
4602 dout(20) << "scrub state " << Scrubber::state_string(scrubber
.state
)
4603 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")"
4604 << " max_end " << scrubber
.max_end
<< dendl
;
4606 switch (scrubber
.state
) {
4607 case PG::Scrubber::INACTIVE
:
4608 dout(10) << "scrub start" << dendl
;
4609 assert(is_primary());
4611 publish_stats_to_osd();
4612 scrubber
.epoch_start
= info
.history
.same_interval_since
;
4613 scrubber
.active
= true;
4615 osd
->inc_scrubs_active(scrubber
.reserved
);
4616 if (scrubber
.reserved
) {
4617 scrubber
.reserved
= false;
4618 scrubber
.reserved_peers
.clear();
4622 ObjectStore::Transaction t
;
4623 scrubber
.cleanup_store(&t
);
4624 scrubber
.store
.reset(Scrub::Store::create(osd
->store
, &t
,
4626 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
4629 // Don't include temporary objects when scrubbing
4630 scrubber
.start
= info
.pgid
.pgid
.get_hobj_start();
4631 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4634 bool repair
= state_test(PG_STATE_REPAIR
);
4635 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4636 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4638 oss
<< info
.pgid
.pgid
<< " " << mode
<< " starts" << std::endl
;
4639 osd
->clog
->debug(oss
);
4642 scrubber
.preempt_left
= cct
->_conf
->get_val
<uint64_t>(
4643 "osd_scrub_max_preemptions");
4644 scrubber
.preempt_divisor
= 1;
4647 case PG::Scrubber::NEW_CHUNK
:
4648 scrubber
.primary_scrubmap
= ScrubMap();
4649 scrubber
.received_maps
.clear();
4651 // begin (possible) preemption window
4652 if (scrub_preempted
) {
4653 scrubber
.preempt_left
--;
4654 scrubber
.preempt_divisor
*= 2;
4655 dout(10) << __func__
<< " preempted, " << scrubber
.preempt_left
4656 << " left" << dendl
;
4657 scrub_preempted
= false;
4659 scrub_can_preempt
= scrubber
.preempt_left
> 0;
4662 /* get the start and end of our scrub chunk
4664 * Our scrub chunk has an important restriction we're going to need to
4665 * respect. We can't let head or snapdir be start or end.
4666 * Using a half-open interval means that if end == head|snapdir,
4667 * we'd scrub/lock head and the clone right next to head in different
4668 * chunks which would allow us to miss clones created between
4669 * scrubbing that chunk and scrubbing the chunk including head.
4670 * This isn't true for any of the other clones since clones can
4671 * only be created "just to the left of" head. There is one exception
4672 * to this: promotion of clones which always happens to the left of the
4673 * left-most clone, but promote_object checks the scrubber in that
4674 * case, so it should be ok. Also, it's ok to "miss" clones at the
4675 * left end of the range if we are a tier because they may legitimately
4676 * not exist (see _scrub).
4678 int min
= std::max
<int64_t>(3, cct
->_conf
->osd_scrub_chunk_min
/
4679 scrubber
.preempt_divisor
);
4680 int max
= std::max
<int64_t>(min
, cct
->_conf
->osd_scrub_chunk_max
/
4681 scrubber
.preempt_divisor
);
4682 hobject_t start
= scrubber
.start
;
4683 hobject_t candidate_end
;
4684 vector
<hobject_t
> objects
;
4686 ret
= get_pgbackend()->objects_list_partial(
4694 if (!objects
.empty()) {
4695 hobject_t back
= objects
.back();
4696 while (candidate_end
.has_snapset() &&
4697 candidate_end
.get_head() == back
.get_head()) {
4698 candidate_end
= back
;
4700 if (objects
.empty()) {
4702 "Somehow we got more than 2 objects which"
4703 "have the same head but are not clones");
4705 back
= objects
.back();
4707 if (candidate_end
.has_snapset()) {
4708 assert(candidate_end
.get_head() != back
.get_head());
4709 candidate_end
= candidate_end
.get_object_boundary();
4712 assert(candidate_end
.is_max());
4715 if (!_range_available_for_scrub(scrubber
.start
, candidate_end
)) {
4716 // we'll be requeued by whatever made us unavailable for scrub
4717 dout(10) << __func__
<< ": scrub blocked somewhere in range "
4718 << "[" << scrubber
.start
<< ", " << candidate_end
<< ")"
4723 scrubber
.end
= candidate_end
;
4724 if (scrubber
.end
> scrubber
.max_end
)
4725 scrubber
.max_end
= scrubber
.end
;
4728 // walk the log to find the latest update that affects our chunk
4729 scrubber
.subset_last_update
= eversion_t();
4730 for (auto p
= projected_log
.log
.rbegin();
4731 p
!= projected_log
.log
.rend();
4733 if (p
->soid
>= scrubber
.start
&&
4734 p
->soid
< scrubber
.end
) {
4735 scrubber
.subset_last_update
= p
->version
;
4739 if (scrubber
.subset_last_update
== eversion_t()) {
4740 for (list
<pg_log_entry_t
>::const_reverse_iterator p
=
4741 pg_log
.get_log().log
.rbegin();
4742 p
!= pg_log
.get_log().log
.rend();
4744 if (p
->soid
>= scrubber
.start
&&
4745 p
->soid
< scrubber
.end
) {
4746 scrubber
.subset_last_update
= p
->version
;
4752 // ask replicas to wait until
4753 // last_update_applied >= scrubber.subset_last_update and then scan
4754 scrubber
.waiting_on_whom
.insert(pg_whoami
);
4756 // request maps from replicas
4757 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4758 i
!= actingbackfill
.end();
4760 if (*i
== pg_whoami
) continue;
4761 _request_scrub_map(*i
, scrubber
.subset_last_update
,
4762 scrubber
.start
, scrubber
.end
, scrubber
.deep
,
4763 scrubber
.preempt_left
> 0);
4764 scrubber
.waiting_on_whom
.insert(*i
);
4766 dout(10) << __func__
<< " waiting_on_whom " << scrubber
.waiting_on_whom
4769 scrubber
.state
= PG::Scrubber::WAIT_PUSHES
;
4772 case PG::Scrubber::WAIT_PUSHES
:
4773 if (active_pushes
== 0) {
4774 scrubber
.state
= PG::Scrubber::WAIT_LAST_UPDATE
;
4776 dout(15) << "wait for pushes to apply" << dendl
;
4781 case PG::Scrubber::WAIT_LAST_UPDATE
:
4782 if (last_update_applied
< scrubber
.subset_last_update
) {
4783 // will be requeued by op_applied
4784 dout(15) << "wait for writes to flush" << dendl
;
4789 scrubber
.state
= PG::Scrubber::BUILD_MAP
;
4790 scrubber
.primary_scrubmap_pos
.reset();
4793 case PG::Scrubber::BUILD_MAP
:
4794 assert(last_update_applied
>= scrubber
.subset_last_update
);
4796 // build my own scrub map
4797 if (scrub_preempted
) {
4798 dout(10) << __func__
<< " preempted" << dendl
;
4799 scrubber
.state
= PG::Scrubber::BUILD_MAP_DONE
;
4802 ret
= build_scrub_map_chunk(
4803 scrubber
.primary_scrubmap
,
4804 scrubber
.primary_scrubmap_pos
,
4805 scrubber
.start
, scrubber
.end
,
4808 if (ret
== -EINPROGRESS
) {
4813 scrubber
.state
= PG::Scrubber::BUILD_MAP_DONE
;
4816 case PG::Scrubber::BUILD_MAP_DONE
:
4817 if (scrubber
.primary_scrubmap_pos
.ret
< 0) {
4818 dout(5) << "error: " << scrubber
.primary_scrubmap_pos
.ret
4819 << ", aborting" << dendl
;
4820 scrub_clear_state();
4821 scrub_unreserve_replicas();
4824 dout(10) << __func__
<< " waiting_on_whom was "
4825 << scrubber
.waiting_on_whom
<< dendl
;
4826 assert(scrubber
.waiting_on_whom
.count(pg_whoami
));
4827 scrubber
.waiting_on_whom
.erase(pg_whoami
);
4829 scrubber
.state
= PG::Scrubber::WAIT_REPLICAS
;
4832 case PG::Scrubber::WAIT_REPLICAS
:
4833 if (!scrubber
.waiting_on_whom
.empty()) {
4834 // will be requeued by sub_op_scrub_map
4835 dout(10) << "wait for replicas to build scrub map" << dendl
;
4839 // end (possible) preemption window
4840 scrub_can_preempt
= false;
4841 if (scrub_preempted
) {
4842 dout(10) << __func__
<< " preempted, restarting chunk" << dendl
;
4843 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4845 scrubber
.state
= PG::Scrubber::COMPARE_MAPS
;
4849 case PG::Scrubber::COMPARE_MAPS
:
4850 assert(last_update_applied
>= scrubber
.subset_last_update
);
4851 assert(scrubber
.waiting_on_whom
.empty());
4853 scrub_compare_maps();
4854 scrubber
.start
= scrubber
.end
;
4855 scrubber
.run_callbacks();
4857 // requeue the writes from the chunk that just finished
4858 requeue_ops(waiting_for_scrub
);
4860 scrubber
.state
= PG::Scrubber::WAIT_DIGEST_UPDATES
;
4864 case PG::Scrubber::WAIT_DIGEST_UPDATES
:
4865 if (scrubber
.num_digest_updates_pending
) {
4866 dout(10) << __func__
<< " waiting on "
4867 << scrubber
.num_digest_updates_pending
4868 << " digest updates" << dendl
;
4873 scrubber
.preempt_left
= cct
->_conf
->get_val
<uint64_t>(
4874 "osd_scrub_max_preemptions");
4875 scrubber
.preempt_divisor
= 1;
4877 if (!(scrubber
.end
.is_max())) {
4878 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4882 scrubber
.state
= PG::Scrubber::FINISH
;
4887 case PG::Scrubber::FINISH
:
4889 scrubber
.state
= PG::Scrubber::INACTIVE
;
4892 if (!snap_trimq
.empty()) {
4893 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl
;
4894 snap_trimmer_scrub_complete();
4899 case PG::Scrubber::BUILD_MAP_REPLICA
:
4900 // build my own scrub map
4901 if (scrub_preempted
) {
4902 dout(10) << __func__
<< " preempted" << dendl
;
4905 ret
= build_scrub_map_chunk(
4906 scrubber
.replica_scrubmap
,
4907 scrubber
.replica_scrubmap_pos
,
4908 scrubber
.start
, scrubber
.end
,
4912 if (ret
== -EINPROGRESS
) {
4918 if (HAVE_FEATURE(acting_features
, SERVER_LUMINOUS
)) {
4919 MOSDRepScrubMap
*reply
= new MOSDRepScrubMap(
4920 spg_t(info
.pgid
.pgid
, get_primary().shard
),
4921 scrubber
.replica_scrub_start
,
4923 reply
->preempted
= scrub_preempted
;
4924 ::encode(scrubber
.replica_scrubmap
, reply
->get_data());
4925 osd
->send_message_osd_cluster(
4926 get_primary().osd
, reply
,
4927 scrubber
.replica_scrub_start
);
4929 // for jewel compatibility
4930 vector
<OSDOp
> scrub(1);
4931 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_MAP
;
4935 MOSDSubOp
*subop
= new MOSDSubOp(
4938 spg_t(info
.pgid
.pgid
, get_primary().shard
),
4941 scrubber
.replica_scrub_start
,
4944 ::encode(scrubber
.replica_scrubmap
, subop
->get_data());
4946 osd
->send_message_osd_cluster(
4947 get_primary().osd
, subop
,
4948 scrubber
.replica_scrub_start
);
4950 scrub_preempted
= false;
4951 scrub_can_preempt
= false;
4952 scrubber
.state
= PG::Scrubber::INACTIVE
;
4953 scrubber
.replica_scrubmap
= ScrubMap();
4954 scrubber
.replica_scrubmap_pos
= ScrubMapBuilder();
4955 scrubber
.start
= hobject_t();
4956 scrubber
.end
= hobject_t();
4957 scrubber
.max_end
= hobject_t();
4965 dout(20) << "scrub final state " << Scrubber::state_string(scrubber
.state
)
4966 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")"
4967 << " max_end " << scrubber
.max_end
<< dendl
;
4970 bool PG::write_blocked_by_scrub(const hobject_t
& soid
)
4972 if (soid
< scrubber
.start
|| soid
>= scrubber
.end
) {
4975 if (scrub_can_preempt
) {
4976 if (!scrub_preempted
) {
4977 dout(10) << __func__
<< " " << soid
<< " preempted" << dendl
;
4978 scrub_preempted
= true;
4980 dout(10) << __func__
<< " " << soid
<< " already preempted" << dendl
;
4987 bool PG::range_intersects_scrub(const hobject_t
&start
, const hobject_t
& end
)
4989 // does [start, end] intersect [scrubber.start, scrubber.max_end)
4990 return (start
< scrubber
.max_end
&&
4991 end
>= scrubber
.start
);
// Tear down all scrub-related PG state: clear the scrub/repair state
// flags, release the OSD-wide active-scrub slot, and requeue any client
// ops that were blocked on the scrubbed range.  Caller must hold the
// PG lock (asserted below).
4994 void PG::scrub_clear_state()
4996 assert(is_locked());
4997 state_clear(PG_STATE_SCRUBBING
);
4998 state_clear(PG_STATE_REPAIR
);
4999 state_clear(PG_STATE_DEEP_SCRUB
);
// flags changed above — push fresh stats so `ceph pg stat` stops
// reporting this PG as scrubbing
5000 publish_stats_to_osd();
5002 // active -> nothing.
// release the per-OSD scrub slot we took in chunky_scrub (INACTIVE state)
5003 if (scrubber
.active
)
5004 osd
->dec_scrubs_active();
// let writes that were blocked by write_blocked_by_scrub() proceed
5006 requeue_ops(waiting_for_scrub
);
5010 // type-specific state clear
5011 _scrub_clear_state();
5014 void PG::scrub_compare_maps()
5016 dout(10) << __func__
<< " has maps, analyzing" << dendl
;
5018 // construct authoritative scrub map for type specific scrubbing
5019 scrubber
.cleaned_meta_map
.insert(scrubber
.primary_scrubmap
);
5021 pair
<boost::optional
<uint32_t>,
5022 boost::optional
<uint32_t>>> missing_digest
;
5024 map
<pg_shard_t
, ScrubMap
*> maps
;
5025 maps
[pg_whoami
] = &scrubber
.primary_scrubmap
;
5027 for (const auto& i
: actingbackfill
) {
5028 if (i
== pg_whoami
) continue;
5029 dout(2) << __func__
<< " replica " << i
<< " has "
5030 << scrubber
.received_maps
[i
].objects
.size()
5031 << " items" << dendl
;
5032 maps
[i
] = &scrubber
.received_maps
[i
];
5035 set
<hobject_t
> master_set
;
5037 // Construct master set
5038 for (const auto map
: maps
) {
5039 for (const auto i
: map
.second
->objects
) {
5040 master_set
.insert(i
.first
);
5045 get_pgbackend()->be_large_omap_check(maps
, master_set
,
5046 scrubber
.large_omap_objects
, ss
);
5047 if (!ss
.str().empty()) {
5048 osd
->clog
->warn(ss
);
5051 if (acting
.size() > 1) {
5052 dout(10) << __func__
<< " comparing replica scrub maps" << dendl
;
5054 // Map from object with errors to good peer
5055 map
<hobject_t
, list
<pg_shard_t
>> authoritative
;
5057 dout(2) << __func__
<< " osd." << acting
[0] << " has "
5058 << scrubber
.primary_scrubmap
.objects
.size() << " items" << dendl
;
5063 get_pgbackend()->be_compare_scrubmaps(
5066 state_test(PG_STATE_REPAIR
),
5068 scrubber
.inconsistent
,
5071 scrubber
.shallow_errors
,
5072 scrubber
.deep_errors
,
5073 scrubber
.store
.get(),
5076 dout(2) << ss
.str() << dendl
;
5078 if (!ss
.str().empty()) {
5079 osd
->clog
->error(ss
);
5082 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
5083 i
!= authoritative
.end();
5085 list
<pair
<ScrubMap::object
, pg_shard_t
> > good_peers
;
5086 for (list
<pg_shard_t
>::const_iterator j
= i
->second
.begin();
5087 j
!= i
->second
.end();
5089 good_peers
.push_back(make_pair(maps
[*j
]->objects
[i
->first
], *j
));
5091 scrubber
.authoritative
.insert(
5097 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
5098 i
!= authoritative
.end();
5100 scrubber
.cleaned_meta_map
.objects
.erase(i
->first
);
5101 scrubber
.cleaned_meta_map
.objects
.insert(
5102 *(maps
[i
->second
.back()]->objects
.find(i
->first
))
5107 ScrubMap for_meta_scrub
;
5108 scrubber
.clean_meta_map(for_meta_scrub
);
5110 // ok, do the pg-type specific scrubbing
5111 scrub_snapshot_metadata(for_meta_scrub
, missing_digest
);
5112 // Called here on the primary can use an authoritative map if it isn't the primary
5113 _scan_snaps(for_meta_scrub
);
5114 if (!scrubber
.store
->empty()) {
5115 if (state_test(PG_STATE_REPAIR
)) {
5116 dout(10) << __func__
<< ": discarding scrub results" << dendl
;
5117 scrubber
.store
->flush(nullptr);
5119 dout(10) << __func__
<< ": updating scrub object" << dendl
;
5120 ObjectStore::Transaction t
;
5121 scrubber
.store
->flush(&t
);
5122 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
5127 bool PG::scrub_process_inconsistent()
5129 dout(10) << __func__
<< ": checking authoritative" << dendl
;
5130 bool repair
= state_test(PG_STATE_REPAIR
);
5131 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
5132 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
5134 // authoritative only stores objects which are missing or inconsistent.
5135 if (!scrubber
.authoritative
.empty()) {
5137 ss
<< info
.pgid
<< " " << mode
<< " "
5138 << scrubber
.missing
.size() << " missing, "
5139 << scrubber
.inconsistent
.size() << " inconsistent objects";
5140 dout(2) << ss
.str() << dendl
;
5141 osd
->clog
->error(ss
);
5143 state_clear(PG_STATE_CLEAN
);
5144 for (map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >>::iterator i
=
5145 scrubber
.authoritative
.begin();
5146 i
!= scrubber
.authoritative
.end();
5148 set
<pg_shard_t
>::iterator j
;
5150 auto missing_entry
= scrubber
.missing
.find(i
->first
);
5151 if (missing_entry
!= scrubber
.missing
.end()) {
5152 for (j
= missing_entry
->second
.begin();
5153 j
!= missing_entry
->second
.end();
5162 if (scrubber
.inconsistent
.count(i
->first
)) {
5163 for (j
= scrubber
.inconsistent
[i
->first
].begin();
5164 j
!= scrubber
.inconsistent
[i
->first
].end();
5166 repair_object(i
->first
,
5175 return (!scrubber
.authoritative
.empty() && repair
);
5178 bool PG::ops_blocked_by_scrub() const {
5179 return (waiting_for_scrub
.size() != 0);
5182 // the part that actually finalizes a scrub
5183 void PG::scrub_finish()
5185 bool repair
= state_test(PG_STATE_REPAIR
);
5186 // if the repair request comes from auto-repair and large number of errors,
5187 // we would like to cancel auto-repair
5188 if (repair
&& scrubber
.auto_repair
5189 && scrubber
.authoritative
.size() > cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
5190 state_clear(PG_STATE_REPAIR
);
5193 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
5194 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
5196 // type-specific finish (can tally more errors)
5199 bool has_error
= scrub_process_inconsistent();
5203 oss
<< info
.pgid
.pgid
<< " " << mode
<< " ";
5204 int total_errors
= scrubber
.shallow_errors
+ scrubber
.deep_errors
;
5206 oss
<< total_errors
<< " errors";
5209 if (!deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
)
5210 oss
<< " ( " << info
.stats
.stats
.sum
.num_deep_scrub_errors
5211 << " remaining deep scrub error details lost)";
5213 oss
<< ", " << scrubber
.fixed
<< " fixed";
5215 osd
->clog
->error(oss
);
5217 osd
->clog
->debug(oss
);
5222 utime_t now
= ceph_clock_now();
5223 info
.history
.last_scrub
= info
.last_update
;
5224 info
.history
.last_scrub_stamp
= now
;
5225 if (scrubber
.deep
) {
5226 info
.history
.last_deep_scrub
= info
.last_update
;
5227 info
.history
.last_deep_scrub_stamp
= now
;
5229 // Since we don't know which errors were fixed, we can only clear them
5230 // when every one has been fixed.
5232 if (scrubber
.fixed
== scrubber
.shallow_errors
+ scrubber
.deep_errors
) {
5234 scrubber
.shallow_errors
= scrubber
.deep_errors
= 0;
5236 // Deep scrub in order to get corrected error counts
5237 scrub_after_recovery
= true;
5241 if ((scrubber
.shallow_errors
== 0) && (scrubber
.deep_errors
== 0))
5242 info
.history
.last_clean_scrub_stamp
= now
;
5243 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
5244 info
.stats
.stats
.sum
.num_deep_scrub_errors
= scrubber
.deep_errors
;
5245 info
.stats
.stats
.sum
.num_large_omap_objects
= scrubber
.large_omap_objects
;
5247 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
5248 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
5249 // because of deep-scrub errors
5250 if (scrubber
.shallow_errors
== 0)
5251 info
.history
.last_clean_scrub_stamp
= now
;
5253 info
.stats
.stats
.sum
.num_scrub_errors
=
5254 info
.stats
.stats
.sum
.num_shallow_scrub_errors
+
5255 info
.stats
.stats
.sum
.num_deep_scrub_errors
;
5259 ObjectStore::Transaction t
;
5262 int tr
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
5268 queue_peering_event(
5270 std::make_shared
<CephPeeringEvt
>(
5271 get_osdmap()->get_epoch(),
5272 get_osdmap()->get_epoch(),
5276 scrub_clear_state();
5277 scrub_unreserve_replicas();
5279 if (is_active() && is_primary()) {
5284 void PG::share_pg_info()
5286 dout(10) << "share_pg_info" << dendl
;
5288 // share new pg_info_t with replicas
5289 assert(!actingbackfill
.empty());
5290 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
5291 i
!= actingbackfill
.end();
5293 if (*i
== pg_whoami
) continue;
5294 pg_shard_t peer
= *i
;
5295 if (peer_info
.count(peer
)) {
5296 peer_info
[peer
].last_epoch_started
= info
.last_epoch_started
;
5297 peer_info
[peer
].last_interval_started
= info
.last_interval_started
;
5298 peer_info
[peer
].history
.merge(info
.history
);
5300 MOSDPGInfo
*m
= new MOSDPGInfo(get_osdmap()->get_epoch());
5301 m
->pg_list
.push_back(
5304 peer
.shard
, pg_whoami
.shard
,
5305 get_osdmap()->get_epoch(),
5306 get_osdmap()->get_epoch(),
5309 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
5313 bool PG::append_log_entries_update_missing(
5314 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
5315 ObjectStore::Transaction
&t
, boost::optional
<eversion_t
> trim_to
,
5316 boost::optional
<eversion_t
> roll_forward_to
)
5318 assert(!entries
.empty());
5319 assert(entries
.begin()->version
> info
.last_update
);
5321 PGLogEntryHandler rollbacker
{this, &t
};
5322 bool invalidate_stats
=
5323 pg_log
.append_new_log_entries(info
.last_backfill
,
5324 info
.last_backfill_bitwise
,
5328 if (roll_forward_to
&& entries
.rbegin()->soid
> info
.last_backfill
) {
5329 pg_log
.roll_forward(&rollbacker
);
5331 if (roll_forward_to
&& *roll_forward_to
> pg_log
.get_can_rollback_to()) {
5332 pg_log
.roll_forward_to(*roll_forward_to
, &rollbacker
);
5333 last_rollback_info_trimmed_to_applied
= *roll_forward_to
;
5336 info
.last_update
= pg_log
.get_head();
5338 if (pg_log
.get_missing().num_missing() == 0) {
5339 // advance last_complete since nothing else is missing!
5340 info
.last_complete
= info
.last_update
;
5342 info
.stats
.stats_invalid
= info
.stats
.stats_invalid
|| invalidate_stats
;
5344 dout(20) << __func__
<< "trim_to bool = " << bool(trim_to
) << " trim_to = " << (trim_to
? *trim_to
: eversion_t()) << dendl
;
5346 pg_log
.trim(*trim_to
, info
);
5349 return invalidate_stats
;
5353 void PG::merge_new_log_entries(
5354 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
5355 ObjectStore::Transaction
&t
,
5356 boost::optional
<eversion_t
> trim_to
,
5357 boost::optional
<eversion_t
> roll_forward_to
)
5359 dout(10) << __func__
<< " " << entries
<< dendl
;
5360 assert(is_primary());
5362 bool rebuild_missing
= append_log_entries_update_missing(entries
, t
, trim_to
, roll_forward_to
);
5363 for (set
<pg_shard_t
>::const_iterator i
= actingbackfill
.begin();
5364 i
!= actingbackfill
.end();
5366 pg_shard_t
peer(*i
);
5367 if (peer
== pg_whoami
) continue;
5368 assert(peer_missing
.count(peer
));
5369 assert(peer_info
.count(peer
));
5370 pg_missing_t
& pmissing(peer_missing
[peer
]);
5371 dout(20) << __func__
<< " peer_missing for " << peer
<< " = " << pmissing
<< dendl
;
5372 pg_info_t
& pinfo(peer_info
[peer
]);
5373 bool invalidate_stats
= PGLog::append_log_entries_update_missing(
5374 pinfo
.last_backfill
,
5375 info
.last_backfill_bitwise
,
5382 pinfo
.last_update
= info
.last_update
;
5383 pinfo
.stats
.stats_invalid
= pinfo
.stats
.stats_invalid
|| invalidate_stats
;
5384 rebuild_missing
= rebuild_missing
|| invalidate_stats
;
5387 if (!rebuild_missing
) {
5391 for (auto &&i
: entries
) {
5392 missing_loc
.rebuild(
5397 pg_log
.get_missing(),
5403 void PG::update_history(const pg_history_t
& new_history
)
5406 if (info
.history
.merge(new_history
)) {
5407 dout(20) << __func__
<< " advanced history from " << new_history
<< dendl
;
5409 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
5410 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
5411 past_intervals
.clear();
5412 dirty_big_info
= true;
5418 void PG::fulfill_info(
5419 pg_shard_t from
, const pg_query_t
&query
,
5420 pair
<pg_shard_t
, pg_info_t
> ¬ify_info
)
5422 assert(from
== primary
);
5423 assert(query
.type
== pg_query_t::INFO
);
5426 dout(10) << "sending info" << dendl
;
5427 notify_info
= make_pair(from
, info
);
5430 void PG::fulfill_log(
5431 pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
)
5433 dout(10) << "log request from " << from
<< dendl
;
5434 assert(from
== primary
);
5435 assert(query
.type
!= pg_query_t::INFO
);
5436 ConnectionRef con
= osd
->get_con_osd_cluster(
5437 from
.osd
, get_osdmap()->get_epoch());
5440 MOSDPGLog
*mlog
= new MOSDPGLog(
5441 from
.shard
, pg_whoami
.shard
,
5442 get_osdmap()->get_epoch(),
5444 mlog
->missing
= pg_log
.get_missing();
5446 // primary -> other, when building master log
5447 if (query
.type
== pg_query_t::LOG
) {
5448 dout(10) << " sending info+missing+log since " << query
.since
5450 if (query
.since
!= eversion_t() && query
.since
< pg_log
.get_tail()) {
5451 osd
->clog
->error() << info
.pgid
<< " got broken pg_query_t::LOG since " << query
.since
5452 << " when my log.tail is " << pg_log
.get_tail()
5453 << ", sending full log instead";
5454 mlog
->log
= pg_log
.get_log(); // primary should not have requested this!!
5456 mlog
->log
.copy_after(pg_log
.get_log(), query
.since
);
5458 else if (query
.type
== pg_query_t::FULLLOG
) {
5459 dout(10) << " sending info+missing+full log" << dendl
;
5460 mlog
->log
= pg_log
.get_log();
5463 dout(10) << " sending " << mlog
->log
<< " " << mlog
->missing
<< dendl
;
5465 osd
->share_map_peer(from
.osd
, con
.get(), get_osdmap());
5466 osd
->send_message_osd_cluster(mlog
, con
.get());
5469 void PG::check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
)
5471 bool changed
= false;
5472 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
) &&
5473 !lastmap
->test_flag(CEPH_OSDMAP_FULL
)) {
5474 dout(10) << " cluster was marked full in " << osdmap
->get_epoch() << dendl
;
5477 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
5479 if (pi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5480 const pg_pool_t
*opi
= lastmap
->get_pg_pool(info
.pgid
.pool());
5481 if (!opi
|| !opi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5482 dout(10) << " pool was marked full in " << osdmap
->get_epoch() << dendl
;
5487 info
.history
.last_epoch_marked_full
= osdmap
->get_epoch();
5492 bool PG::should_restart_peering(
5494 int newactingprimary
,
5495 const vector
<int>& newup
,
5496 const vector
<int>& newacting
,
5500 if (PastIntervals::is_new_interval(
5512 dout(20) << "new interval newup " << newup
5513 << " newacting " << newacting
<< dendl
;
// Decide whether a peering message is stale: if either the epoch the
// reply was generated in or the epoch of the original query predates
// our last peering reset (see set_last_peering_reset()), the message
// belongs to an earlier interval and must be ignored.
5520 bool PG::old_peering_msg(epoch_t reply_epoch
, epoch_t query_epoch
)
5522 if (last_peering_reset
> reply_epoch
||
5523 last_peering_reset
> query_epoch
) {
5524 dout(10) << "old_peering_msg reply_epoch " << reply_epoch
<< " query_epoch " << query_epoch
5525 << " last_peering_reset " << last_peering_reset
5532 void PG::set_last_peering_reset()
5534 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl
;
5535 if (last_peering_reset
!= get_osdmap()->get_epoch()) {
5536 last_peering_reset
= get_osdmap()->get_epoch();
5537 reset_interval_flush();
5544 FlushState(PG
*pg
, epoch_t epoch
) : pg(pg
), epoch(epoch
) {}
5547 if (!pg
->pg_has_reset_since(epoch
))
5548 pg
->queue_flushed(epoch
);
5552 typedef ceph::shared_ptr
<FlushState
> FlushStateRef
;
5554 void PG::start_flush(ObjectStore::Transaction
*t
,
5555 list
<Context
*> *on_applied
,
5556 list
<Context
*> *on_safe
)
5558 // flush in progress ops
5559 FlushStateRef
flush_trigger (std::make_shared
<FlushState
>(
5560 this, get_osdmap()->get_epoch()));
5562 flushes_in_progress
++;
5563 on_applied
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
5564 on_safe
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
5567 void PG::reset_interval_flush()
5569 dout(10) << "Clearing blocked outgoing recovery messages" << dendl
;
5570 recovery_state
.clear_blocked_outgoing();
5572 Context
*c
= new QueuePeeringEvt
<IntervalFlush
>(
5573 this, get_osdmap()->get_epoch(), IntervalFlush());
5574 if (!osr
->flush_commit(c
)) {
5575 dout(10) << "Beginning to block outgoing recovery messages" << dendl
;
5576 recovery_state
.begin_block_outgoing();
5578 dout(10) << "Not blocking outgoing recovery messages" << dendl
;
5583 /* Called before initializing peering during advance_map */
5584 void PG::start_peering_interval(
5585 const OSDMapRef lastmap
,
5586 const vector
<int>& newup
, int new_up_primary
,
5587 const vector
<int>& newacting
, int new_acting_primary
,
5588 ObjectStore::Transaction
*t
)
5590 const OSDMapRef osdmap
= get_osdmap();
5592 set_last_peering_reset();
5594 vector
<int> oldacting
, oldup
;
5595 int oldrole
= get_role();
5599 pg_shard_t old_acting_primary
= get_primary();
5600 pg_shard_t old_up_primary
= up_primary
;
5601 bool was_old_primary
= is_primary();
5602 bool was_old_replica
= is_replica();
5604 acting
.swap(oldacting
);
5606 init_primary_up_acting(
5610 new_acting_primary
);
5612 if (info
.stats
.up
!= up
||
5613 info
.stats
.acting
!= acting
||
5614 info
.stats
.up_primary
!= new_up_primary
||
5615 info
.stats
.acting_primary
!= new_acting_primary
) {
5617 info
.stats
.up_primary
= new_up_primary
;
5618 info
.stats
.acting
= acting
;
5619 info
.stats
.acting_primary
= new_acting_primary
;
5620 info
.stats
.mapping_epoch
= osdmap
->get_epoch();
5623 pg_stats_publish_lock
.Lock();
5624 pg_stats_publish_valid
= false;
5625 pg_stats_publish_lock
.Unlock();
5627 // This will now be remapped during a backfill in cases
5628 // that it would not have been before.
5630 state_set(PG_STATE_REMAPPED
);
5632 state_clear(PG_STATE_REMAPPED
);
5634 int role
= osdmap
->calc_pg_role(osd
->whoami
, acting
, acting
.size());
5635 if (pool
.info
.is_replicated() || role
== pg_whoami
.shard
)
5640 // did acting, up, primary|acker change?
5642 dout(10) << " no lastmap" << dendl
;
5644 dirty_big_info
= true;
5645 info
.history
.same_interval_since
= osdmap
->get_epoch();
5647 std::stringstream debug
;
5648 assert(info
.history
.same_interval_since
!= 0);
5649 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable(
5650 get_is_recoverable_predicate());
5651 bool new_interval
= PastIntervals::check_new_interval(
5652 old_acting_primary
.osd
,
5654 oldacting
, newacting
,
5658 info
.history
.same_interval_since
,
5659 info
.history
.last_epoch_clean
,
5666 dout(10) << __func__
<< ": check_new_interval output: "
5667 << debug
.str() << dendl
;
5669 if (osdmap
->get_epoch() == osd
->get_superblock().oldest_map
&&
5670 info
.history
.last_epoch_clean
< osdmap
->get_epoch()) {
5671 dout(10) << " map gap, clearing past_intervals and faking" << dendl
;
5672 // our information is incomplete and useless; someone else was clean
5673 // after everything we know if osdmaps were trimmed.
5674 past_intervals
.clear();
5676 dout(10) << " noting past " << past_intervals
<< dendl
;
5679 dirty_big_info
= true;
5680 info
.history
.same_interval_since
= osdmap
->get_epoch();
5681 if (info
.pgid
.pgid
.is_split(lastmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5682 osdmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5684 info
.history
.last_epoch_split
= osdmap
->get_epoch();
5689 if (old_up_primary
!= up_primary
||
5691 info
.history
.same_up_since
= osdmap
->get_epoch();
5693 // this comparison includes primary rank via pg_shard_t
5694 if (old_acting_primary
!= get_primary()) {
5695 info
.history
.same_primary_since
= osdmap
->get_epoch();
5700 dout(1) << __func__
<< " up " << oldup
<< " -> " << up
5701 << ", acting " << oldacting
<< " -> " << acting
5702 << ", acting_primary " << old_acting_primary
<< " -> " << new_acting_primary
5703 << ", up_primary " << old_up_primary
<< " -> " << new_up_primary
5704 << ", role " << oldrole
<< " -> " << role
5705 << ", features acting " << acting_features
5706 << " upacting " << upacting_features
5710 state_clear(PG_STATE_ACTIVE
);
5711 state_clear(PG_STATE_PEERED
);
5712 state_clear(PG_STATE_DOWN
);
5713 state_clear(PG_STATE_RECOVERY_WAIT
);
5714 state_clear(PG_STATE_RECOVERY_TOOFULL
);
5715 state_clear(PG_STATE_RECOVERING
);
5717 peer_purged
.clear();
5718 actingbackfill
.clear();
5719 scrub_queued
= false;
5721 // reset primary/replica state?
5722 if (was_old_primary
|| is_primary()) {
5723 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
5724 } else if (was_old_replica
|| is_replica()) {
5725 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
5727 clear_primary_state();
5733 projected_last_update
= eversion_t();
5737 // should we tell the primary we are here?
5738 send_notify
= !is_primary();
5740 if (role
!= oldrole
||
5741 was_old_primary
!= is_primary()) {
5742 // did primary change?
5743 if (was_old_primary
!= is_primary()) {
5744 state_clear(PG_STATE_CLEAN
);
5745 clear_publish_stats();
5750 // take active waiters
5751 requeue_ops(waiting_for_peered
);
5755 // did primary change?
5756 if (get_primary() != old_acting_primary
) {
5757 dout(10) << *this << " " << oldacting
<< " -> " << acting
5758 << ", acting primary "
5759 << old_acting_primary
<< " -> " << get_primary()
5762 // primary is the same.
5764 // i am (still) primary. but my replica set changed.
5765 state_clear(PG_STATE_CLEAN
);
5767 dout(10) << oldacting
<< " -> " << acting
5768 << ", replicas changed" << dendl
;
5774 if (acting
.empty() && !up
.empty() && up_primary
== pg_whoami
) {
5775 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl
;
5776 osd
->queue_want_pg_temp(info
.pgid
.pgid
, acting
);
5780 void PG::on_new_interval()
5782 const OSDMapRef osdmap
= get_osdmap();
5786 // initialize features
5787 acting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
5788 upacting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
5789 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
) {
5790 if (*p
== CRUSH_ITEM_NONE
)
5792 uint64_t f
= osdmap
->get_xinfo(*p
).features
;
5793 acting_features
&= f
;
5794 upacting_features
&= f
;
5796 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
5797 if (*p
== CRUSH_ITEM_NONE
)
5799 upacting_features
&= osdmap
->get_xinfo(*p
).features
;
5805 void PG::proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&oinfo
)
5807 assert(!is_primary());
5809 update_history(oinfo
.history
);
5810 if (!info
.stats
.stats_invalid
&& info
.stats
.stats
.sum
.num_scrub_errors
) {
5811 info
.stats
.stats
.sum
.num_scrub_errors
= 0;
5812 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= 0;
5813 info
.stats
.stats
.sum
.num_deep_scrub_errors
= 0;
5817 if (!(info
.purged_snaps
== oinfo
.purged_snaps
)) {
5818 dout(10) << __func__
<< " updating purged_snaps to " << oinfo
.purged_snaps
5820 info
.purged_snaps
= oinfo
.purged_snaps
;
5822 dirty_big_info
= true;
5826 ostream
& operator<<(ostream
& out
, const PG
& pg
)
5828 out
<< "pg[" << pg
.info
5830 if (pg
.acting
!= pg
.up
)
5831 out
<< "/" << pg
.acting
;
5833 out
<< "p" << pg
.get_primary();
5834 out
<< " r=" << pg
.get_role();
5835 out
<< " lpr=" << pg
.get_last_peering_reset();
5837 if (!pg
.past_intervals
.empty()) {
5838 out
<< " pi=[" << pg
.past_intervals
.get_bounds()
5839 << ")/" << pg
.past_intervals
.size();
5842 if (pg
.is_peered()) {
5843 if (pg
.last_update_ondisk
!= pg
.info
.last_update
)
5844 out
<< " luod=" << pg
.last_update_ondisk
;
5845 if (pg
.last_update_applied
!= pg
.info
.last_update
)
5846 out
<< " lua=" << pg
.last_update_applied
;
5849 if (pg
.recovery_ops_active
)
5850 out
<< " rops=" << pg
.recovery_ops_active
;
5852 if (pg
.pg_log
.get_tail() != pg
.info
.log_tail
||
5853 pg
.pg_log
.get_head() != pg
.info
.last_update
)
5854 out
<< " (info mismatch, " << pg
.pg_log
.get_log() << ")";
5856 if (!pg
.pg_log
.get_log().empty()) {
5857 if ((pg
.pg_log
.get_log().log
.begin()->version
<= pg
.pg_log
.get_tail())) {
5858 out
<< " (log bound mismatch, actual=["
5859 << pg
.pg_log
.get_log().log
.begin()->version
<< ","
5860 << pg
.pg_log
.get_log().log
.rbegin()->version
<< "]";
5865 if (!pg
.backfill_targets
.empty())
5866 out
<< " bft=" << pg
.backfill_targets
;
5867 out
<< " crt=" << pg
.pg_log
.get_can_rollback_to();
5869 if (pg
.last_complete_ondisk
!= pg
.info
.last_complete
)
5870 out
<< " lcod " << pg
.last_complete_ondisk
;
5872 if (pg
.is_primary()) {
5873 out
<< " mlcod " << pg
.min_last_complete_ondisk
;
5876 out
<< " " << pg_state_string(pg
.get_state());
5877 if (pg
.should_send_notify())
5880 if (pg
.scrubber
.must_repair
)
5881 out
<< " MUST_REPAIR";
5882 if (pg
.scrubber
.auto_repair
)
5883 out
<< " AUTO_REPAIR";
5884 if (pg
.scrubber
.must_deep_scrub
)
5885 out
<< " MUST_DEEP_SCRUB";
5886 if (pg
.scrubber
.must_scrub
)
5887 out
<< " MUST_SCRUB";
5889 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5890 if (pg
.pg_log
.get_missing().num_missing()) {
5891 out
<< " m=" << pg
.pg_log
.get_missing().num_missing();
5892 if (pg
.is_primary()) {
5893 uint64_t unfound
= pg
.get_num_unfound();
5895 out
<< " u=" << unfound
;
5898 if (pg
.snap_trimq
.size())
5899 out
<< " snaptrimq=" << pg
.snap_trimq
;
5907 bool PG::can_discard_op(OpRequestRef
& op
)
5909 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
5910 if (cct
->_conf
->osd_discard_disconnected_ops
&& OSD::op_is_discardable(m
)) {
5911 dout(20) << " discard " << *m
<< dendl
;
5915 if (m
->get_map_epoch() < info
.history
.same_primary_since
) {
5916 dout(7) << " changed after " << m
->get_map_epoch()
5917 << ", dropping " << *m
<< dendl
;
5921 if (m
->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT
)) {
5922 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend()) {
5923 dout(7) << __func__
<< " sent before last_force_op_resend "
5924 << pool
.info
.last_force_op_resend
<< ", dropping" << *m
<< dendl
;
5927 if (m
->get_map_epoch() < info
.history
.last_epoch_split
) {
5928 dout(7) << __func__
<< " pg split in "
5929 << info
.history
.last_epoch_split
<< ", dropping" << dendl
;
5932 } else if (m
->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND
)) {
5933 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_preluminous()) {
5934 dout(7) << __func__
<< " sent before last_force_op_resend_preluminous "
5935 << pool
.info
.last_force_op_resend_preluminous
5936 << ", dropping" << *m
<< dendl
;
5944 template<typename T
, int MSGTYPE
>
5945 bool PG::can_discard_replica_op(OpRequestRef
& op
)
5947 const T
*m
= static_cast<const T
*>(op
->get_req());
5948 assert(m
->get_type() == MSGTYPE
);
5950 int from
= m
->get_source().num();
5952 // if a repop is replied after a replica goes down in a new osdmap, and
5953 // before the pg advances to this new osdmap, the repop replies before this
5954 // repop can be discarded by that replica OSD, because the primary resets the
5955 // connection to it when handling the new osdmap marking it down, and also
5956 // resets the messenger sesssion when the replica reconnects. to avoid the
5957 // out-of-order replies, the messages from that replica should be discarded.
5958 if (osd
->get_osdmap()->is_down(from
))
5960 /* Mostly, this overlaps with the old_peering_msg
5961 * condition. An important exception is pushes
5962 * sent by replicas not in the acting set, since
5963 * if such a replica goes down it does not cause
5964 * a new interval. */
5965 if (get_osdmap()->get_down_at(from
) >= m
->map_epoch
)
5969 // if pg changes _at all_, we reset and repeer!
5970 if (old_peering_msg(m
->map_epoch
, m
->map_epoch
)) {
5971 dout(10) << "can_discard_replica_op pg changed " << info
.history
5972 << " after " << m
->map_epoch
5973 << ", dropping" << dendl
;
5979 bool PG::can_discard_scan(OpRequestRef op
)
5981 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
5982 assert(m
->get_type() == MSG_OSD_PG_SCAN
);
5984 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
5985 dout(10) << " got old scan, ignoring" << dendl
;
5991 bool PG::can_discard_backfill(OpRequestRef op
)
5993 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
5994 assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
5996 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
5997 dout(10) << " got old backfill, ignoring" << dendl
;
6005 bool PG::can_discard_request(OpRequestRef
& op
)
6007 switch (op
->get_req()->get_type()) {
6008 case CEPH_MSG_OSD_OP
:
6009 return can_discard_op(op
);
6010 case CEPH_MSG_OSD_BACKOFF
:
6011 return false; // never discard
6013 return can_discard_replica_op
<MOSDSubOp
, MSG_OSD_SUBOP
>(op
);
6015 return can_discard_replica_op
<MOSDRepOp
, MSG_OSD_REPOP
>(op
);
6016 case MSG_OSD_PG_PUSH
:
6017 return can_discard_replica_op
<MOSDPGPush
, MSG_OSD_PG_PUSH
>(op
);
6018 case MSG_OSD_PG_PULL
:
6019 return can_discard_replica_op
<MOSDPGPull
, MSG_OSD_PG_PULL
>(op
);
6020 case MSG_OSD_PG_PUSH_REPLY
:
6021 return can_discard_replica_op
<MOSDPGPushReply
, MSG_OSD_PG_PUSH_REPLY
>(op
);
6022 case MSG_OSD_SUBOPREPLY
:
6023 return can_discard_replica_op
<MOSDSubOpReply
, MSG_OSD_SUBOPREPLY
>(op
);
6024 case MSG_OSD_REPOPREPLY
:
6025 return can_discard_replica_op
<MOSDRepOpReply
, MSG_OSD_REPOPREPLY
>(op
);
6026 case MSG_OSD_PG_RECOVERY_DELETE
:
6027 return can_discard_replica_op
<MOSDPGRecoveryDelete
, MSG_OSD_PG_RECOVERY_DELETE
>(op
);
6029 case MSG_OSD_PG_RECOVERY_DELETE_REPLY
:
6030 return can_discard_replica_op
<MOSDPGRecoveryDeleteReply
, MSG_OSD_PG_RECOVERY_DELETE_REPLY
>(op
);
6032 case MSG_OSD_EC_WRITE
:
6033 return can_discard_replica_op
<MOSDECSubOpWrite
, MSG_OSD_EC_WRITE
>(op
);
6034 case MSG_OSD_EC_WRITE_REPLY
:
6035 return can_discard_replica_op
<MOSDECSubOpWriteReply
, MSG_OSD_EC_WRITE_REPLY
>(op
);
6036 case MSG_OSD_EC_READ
:
6037 return can_discard_replica_op
<MOSDECSubOpRead
, MSG_OSD_EC_READ
>(op
);
6038 case MSG_OSD_EC_READ_REPLY
:
6039 return can_discard_replica_op
<MOSDECSubOpReadReply
, MSG_OSD_EC_READ_REPLY
>(op
);
6040 case MSG_OSD_REP_SCRUB
:
6041 return can_discard_replica_op
<MOSDRepScrub
, MSG_OSD_REP_SCRUB
>(op
);
6042 case MSG_OSD_SCRUB_RESERVE
:
6043 return can_discard_replica_op
<MOSDScrubReserve
, MSG_OSD_SCRUB_RESERVE
>(op
);
6044 case MSG_OSD_REP_SCRUBMAP
:
6045 return can_discard_replica_op
<MOSDRepScrubMap
, MSG_OSD_REP_SCRUBMAP
>(op
);
6046 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
6047 return can_discard_replica_op
<
6048 MOSDPGUpdateLogMissing
, MSG_OSD_PG_UPDATE_LOG_MISSING
>(op
);
6049 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
6050 return can_discard_replica_op
<
6051 MOSDPGUpdateLogMissingReply
, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
>(op
);
6053 case MSG_OSD_PG_SCAN
:
6054 return can_discard_scan(op
);
6055 case MSG_OSD_PG_BACKFILL
:
6056 return can_discard_backfill(op
);
6057 case MSG_OSD_PG_BACKFILL_REMOVE
:
6058 return can_discard_replica_op
<MOSDPGBackfillRemove
,
6059 MSG_OSD_PG_BACKFILL_REMOVE
>(op
);
6064 void PG::take_waiters()
6066 dout(10) << "take_waiters" << dendl
;
6067 requeue_map_waiters();
6068 for (list
<CephPeeringEvtRef
>::iterator i
= peering_waiters
.begin();
6069 i
!= peering_waiters
.end();
6070 ++i
) osd
->queue_for_peering(this);
6071 peering_queue
.splice(peering_queue
.begin(), peering_waiters
,
6072 peering_waiters
.begin(), peering_waiters
.end());
6075 void PG::handle_peering_event(CephPeeringEvtRef evt
, RecoveryCtx
*rctx
)
6077 dout(10) << "handle_peering_event: " << evt
->get_desc() << dendl
;
6078 if (!have_same_or_newer_map(evt
->get_epoch_sent())) {
6079 dout(10) << "deferring event " << evt
->get_desc() << dendl
;
6080 peering_waiters
.push_back(evt
);
6083 if (old_peering_evt(evt
))
6085 recovery_state
.handle_event(evt
, rctx
);
6088 void PG::queue_peering_event(CephPeeringEvtRef evt
)
6090 if (old_peering_evt(evt
))
6092 peering_queue
.push_back(evt
);
6093 osd
->queue_for_peering(this);
6096 void PG::queue_null(epoch_t msg_epoch
,
6097 epoch_t query_epoch
)
6099 dout(10) << "null" << dendl
;
6100 queue_peering_event(
6101 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
6105 void PG::queue_flushed(epoch_t e
)
6107 dout(10) << "flushed" << dendl
;
6108 queue_peering_event(
6109 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(e
, e
,
6113 void PG::queue_query(epoch_t msg_epoch
,
6114 epoch_t query_epoch
,
6115 pg_shard_t from
, const pg_query_t
& q
)
6117 dout(10) << "handle_query " << q
<< " from replica " << from
<< dendl
;
6118 queue_peering_event(
6119 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
6120 MQuery(from
, q
, query_epoch
))));
6123 void PG::handle_advance_map(
6124 OSDMapRef osdmap
, OSDMapRef lastmap
,
6125 vector
<int>& newup
, int up_primary
,
6126 vector
<int>& newacting
, int acting_primary
,
6129 assert(lastmap
->get_epoch() == osdmap_ref
->get_epoch());
6130 assert(lastmap
== osdmap_ref
);
6131 dout(10) << "handle_advance_map "
6132 << newup
<< "/" << newacting
6133 << " -- " << up_primary
<< "/" << acting_primary
6135 update_osdmap_ref(osdmap
);
6136 pool
.update(osdmap
);
6137 past_intervals
.update_type_from_map(pool
.info
.ec_pool(), *osdmap
);
6138 if (cct
->_conf
->osd_debug_verify_cached_snaps
) {
6139 interval_set
<snapid_t
> actual_removed_snaps
;
6140 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
6142 pi
->build_removed_snaps(actual_removed_snaps
);
6143 if (!(actual_removed_snaps
== pool
.cached_removed_snaps
)) {
6144 derr
<< __func__
<< ": mismatch between the actual removed snaps "
6145 << actual_removed_snaps
<< " and pool.cached_removed_snaps "
6146 << " pool.cached_removed_snaps " << pool
.cached_removed_snaps
6149 assert(actual_removed_snaps
== pool
.cached_removed_snaps
);
6152 osdmap
, lastmap
, newup
, up_primary
,
6153 newacting
, acting_primary
);
6154 recovery_state
.handle_event(evt
, rctx
);
6155 if (pool
.info
.last_change
== osdmap_ref
->get_epoch()) {
6157 update_store_with_options();
6161 void PG::handle_activate_map(RecoveryCtx
*rctx
)
6163 dout(10) << "handle_activate_map " << dendl
;
6165 recovery_state
.handle_event(evt
, rctx
);
6166 if (osdmap_ref
->get_epoch() - last_persisted_osdmap_ref
->get_epoch() >
6167 cct
->_conf
->osd_pg_epoch_persisted_max_stale
) {
6168 dout(20) << __func__
<< ": Dirtying info: last_persisted is "
6169 << last_persisted_osdmap_ref
->get_epoch()
6170 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
6173 dout(20) << __func__
<< ": Not dirtying info: last_persisted is "
6174 << last_persisted_osdmap_ref
->get_epoch()
6175 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
6177 if (osdmap_ref
->check_new_blacklist_entries()) check_blacklisted_watchers();
6180 void PG::handle_loaded(RecoveryCtx
*rctx
)
6182 dout(10) << "handle_loaded" << dendl
;
6184 recovery_state
.handle_event(evt
, rctx
);
6187 void PG::handle_create(RecoveryCtx
*rctx
)
6189 dout(10) << "handle_create" << dendl
;
6190 rctx
->created_pgs
.insert(this);
6192 recovery_state
.handle_event(evt
, rctx
);
6194 recovery_state
.handle_event(evt2
, rctx
);
6196 rctx
->on_applied
->add(make_lambda_context([this]() {
6197 update_store_with_options();
6201 void PG::handle_query_state(Formatter
*f
)
6203 dout(10) << "handle_query_state" << dendl
;
6205 recovery_state
.handle_event(q
, 0);
6208 void PG::update_store_with_options()
6210 auto r
= osd
->store
->set_collection_opts(coll
, pool
.info
.opts
);
6211 if(r
< 0 && r
!= -EOPNOTSUPP
) {
6212 derr
<< __func__
<< " set_collection_opts returns error:" << r
<< dendl
;
6216 void PG::update_store_on_load()
6218 if (osd
->store
->get_type() == "filestore") {
6219 // legacy filestore didn't store collection bit width; fix.
6220 int bits
= osd
->store
->collection_bits(coll
);
6222 assert(!coll
.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6223 bits
= info
.pgid
.get_split_bits(pool
.info
.get_pg_num());
6224 lderr(cct
) << __func__
<< " setting bit width to " << bits
<< dendl
;
6225 ObjectStore::Transaction t
;
6226 t
.collection_set_bits(coll
, bits
);
6227 osd
->store
->apply_transaction(osr
.get(), std::move(t
));
6232 /*------------ Recovery State Machine----------------*/
6234 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6235 << "state<" << get_state_name() << ">: ")
6237 /*------Crashed-------*/
6238 PG::RecoveryState::Crashed::Crashed(my_context ctx
)
6240 NamedState(context
< RecoveryMachine
>().pg
, "Crashed")
6242 context
< RecoveryMachine
>().log_enter(state_name
);
6243 assert(0 == "we got a bad state machine event");
6247 /*------Initial-------*/
6248 PG::RecoveryState::Initial::Initial(my_context ctx
)
6250 NamedState(context
< RecoveryMachine
>().pg
, "Initial")
6252 context
< RecoveryMachine
>().log_enter(state_name
);
6255 boost::statechart::result
PG::RecoveryState::Initial::react(const Load
& l
)
6257 PG
*pg
= context
< RecoveryMachine
>().pg
;
6259 // do we tell someone we're here?
6260 pg
->send_notify
= (!pg
->is_primary());
6261 pg
->update_store_with_options();
6263 pg
->update_store_on_load();
6265 return transit
< Reset
>();
6268 boost::statechart::result
PG::RecoveryState::Initial::react(const MNotifyRec
& notify
)
6270 PG
*pg
= context
< RecoveryMachine
>().pg
;
6271 pg
->proc_replica_info(
6272 notify
.from
, notify
.notify
.info
, notify
.notify
.epoch_sent
);
6273 pg
->set_last_peering_reset();
6274 return transit
< Primary
>();
6277 boost::statechart::result
PG::RecoveryState::Initial::react(const MInfoRec
& i
)
6279 PG
*pg
= context
< RecoveryMachine
>().pg
;
6280 assert(!pg
->is_primary());
6282 return transit
< Stray
>();
6285 boost::statechart::result
PG::RecoveryState::Initial::react(const MLogRec
& i
)
6287 PG
*pg
= context
< RecoveryMachine
>().pg
;
6288 assert(!pg
->is_primary());
6290 return transit
< Stray
>();
6293 void PG::RecoveryState::Initial::exit()
6295 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6296 PG
*pg
= context
< RecoveryMachine
>().pg
;
6297 utime_t dur
= ceph_clock_now() - enter_time
;
6298 pg
->osd
->recoverystate_perf
->tinc(rs_initial_latency
, dur
);
6301 /*------Started-------*/
6302 PG::RecoveryState::Started::Started(my_context ctx
)
6304 NamedState(context
< RecoveryMachine
>().pg
, "Started")
6306 context
< RecoveryMachine
>().log_enter(state_name
);
6309 boost::statechart::result
6310 PG::RecoveryState::Started::react(const IntervalFlush
&)
6312 PG
*pg
= context
< RecoveryMachine
>().pg
;
6313 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
6314 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
6315 return discard_event();
6319 boost::statechart::result
6320 PG::RecoveryState::Started::react(const FlushedEvt
&)
6322 PG
*pg
= context
< RecoveryMachine
>().pg
;
6324 return discard_event();
6328 boost::statechart::result
PG::RecoveryState::Started::react(const AdvMap
& advmap
)
6330 PG
*pg
= context
< RecoveryMachine
>().pg
;
6331 ldout(pg
->cct
, 10) << "Started advmap" << dendl
;
6332 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
6333 if (pg
->should_restart_peering(
6335 advmap
.acting_primary
,
6340 ldout(pg
->cct
, 10) << "should_restart_peering, transitioning to Reset"
6343 return transit
< Reset
>();
6345 pg
->remove_down_peer_info(advmap
.osdmap
);
6346 return discard_event();
6349 boost::statechart::result
PG::RecoveryState::Started::react(const QueryState
& q
)
6351 q
.f
->open_object_section("state");
6352 q
.f
->dump_string("name", state_name
);
6353 q
.f
->dump_stream("enter_time") << enter_time
;
6354 q
.f
->close_section();
6355 return discard_event();
6358 void PG::RecoveryState::Started::exit()
6360 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6361 PG
*pg
= context
< RecoveryMachine
>().pg
;
6362 utime_t dur
= ceph_clock_now() - enter_time
;
6363 pg
->osd
->recoverystate_perf
->tinc(rs_started_latency
, dur
);
6366 /*--------Reset---------*/
6367 PG::RecoveryState::Reset::Reset(my_context ctx
)
6369 NamedState(context
< RecoveryMachine
>().pg
, "Reset")
6371 context
< RecoveryMachine
>().log_enter(state_name
);
6372 PG
*pg
= context
< RecoveryMachine
>().pg
;
6374 pg
->flushes_in_progress
= 0;
6375 pg
->set_last_peering_reset();
6378 boost::statechart::result
6379 PG::RecoveryState::Reset::react(const FlushedEvt
&)
6381 PG
*pg
= context
< RecoveryMachine
>().pg
;
6383 return discard_event();
6386 boost::statechart::result
6387 PG::RecoveryState::Reset::react(const IntervalFlush
&)
6389 PG
*pg
= context
< RecoveryMachine
>().pg
;
6390 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
6391 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
6392 return discard_event();
6395 boost::statechart::result
PG::RecoveryState::Reset::react(const AdvMap
& advmap
)
6397 PG
*pg
= context
< RecoveryMachine
>().pg
;
6398 ldout(pg
->cct
, 10) << "Reset advmap" << dendl
;
6400 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
6402 if (pg
->should_restart_peering(
6404 advmap
.acting_primary
,
6409 ldout(pg
->cct
, 10) << "should restart peering, calling start_peering_interval again"
6411 pg
->start_peering_interval(
6413 advmap
.newup
, advmap
.up_primary
,
6414 advmap
.newacting
, advmap
.acting_primary
,
6415 context
< RecoveryMachine
>().get_cur_transaction());
6417 pg
->remove_down_peer_info(advmap
.osdmap
);
6418 pg
->check_past_interval_bounds();
6419 return discard_event();
6422 boost::statechart::result
PG::RecoveryState::Reset::react(const ActMap
&)
6424 PG
*pg
= context
< RecoveryMachine
>().pg
;
6425 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
6426 context
< RecoveryMachine
>().send_notify(
6429 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
6430 pg
->get_osdmap()->get_epoch(),
6431 pg
->get_osdmap()->get_epoch(),
6433 pg
->past_intervals
);
6436 pg
->update_heartbeat_peers();
6439 return transit
< Started
>();
6442 boost::statechart::result
PG::RecoveryState::Reset::react(const QueryState
& q
)
6444 q
.f
->open_object_section("state");
6445 q
.f
->dump_string("name", state_name
);
6446 q
.f
->dump_stream("enter_time") << enter_time
;
6447 q
.f
->close_section();
6448 return discard_event();
6451 void PG::RecoveryState::Reset::exit()
6453 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6454 PG
*pg
= context
< RecoveryMachine
>().pg
;
6455 utime_t dur
= ceph_clock_now() - enter_time
;
6456 pg
->osd
->recoverystate_perf
->tinc(rs_reset_latency
, dur
);
6459 /*-------Start---------*/
6460 PG::RecoveryState::Start::Start(my_context ctx
)
6462 NamedState(context
< RecoveryMachine
>().pg
, "Start")
6464 context
< RecoveryMachine
>().log_enter(state_name
);
6466 PG
*pg
= context
< RecoveryMachine
>().pg
;
6467 if (pg
->is_primary()) {
6468 ldout(pg
->cct
, 1) << "transitioning to Primary" << dendl
;
6469 post_event(MakePrimary());
6471 ldout(pg
->cct
, 1) << "transitioning to Stray" << dendl
;
6472 post_event(MakeStray());
6476 void PG::RecoveryState::Start::exit()
6478 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6479 PG
*pg
= context
< RecoveryMachine
>().pg
;
6480 utime_t dur
= ceph_clock_now() - enter_time
;
6481 pg
->osd
->recoverystate_perf
->tinc(rs_start_latency
, dur
);
6484 /*---------Primary--------*/
6485 PG::RecoveryState::Primary::Primary(my_context ctx
)
6487 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary")
6489 context
< RecoveryMachine
>().log_enter(state_name
);
6490 PG
*pg
= context
< RecoveryMachine
>().pg
;
6491 assert(pg
->want_acting
.empty());
6493 // set CREATING bit until we have peered for the first time.
6494 if (pg
->info
.history
.last_epoch_started
== 0) {
6495 pg
->state_set(PG_STATE_CREATING
);
6496 // use the history timestamp, which ultimately comes from the
6497 // monitor in the create case.
6498 utime_t t
= pg
->info
.history
.last_scrub_stamp
;
6499 pg
->info
.stats
.last_fresh
= t
;
6500 pg
->info
.stats
.last_active
= t
;
6501 pg
->info
.stats
.last_change
= t
;
6502 pg
->info
.stats
.last_peered
= t
;
6503 pg
->info
.stats
.last_clean
= t
;
6504 pg
->info
.stats
.last_unstale
= t
;
6505 pg
->info
.stats
.last_undegraded
= t
;
6506 pg
->info
.stats
.last_fullsized
= t
;
6507 pg
->info
.stats
.last_scrub_stamp
= t
;
6508 pg
->info
.stats
.last_deep_scrub_stamp
= t
;
6509 pg
->info
.stats
.last_clean_scrub_stamp
= t
;
6513 boost::statechart::result
PG::RecoveryState::Primary::react(const MNotifyRec
& notevt
)
6515 PG
*pg
= context
< RecoveryMachine
>().pg
;
6516 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
6517 pg
->proc_replica_info(
6518 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
6519 return discard_event();
6522 boost::statechart::result
PG::RecoveryState::Primary::react(const ActMap
&)
6524 PG
*pg
= context
< RecoveryMachine
>().pg
;
6525 ldout(pg
->cct
, 7) << "handle ActMap primary" << dendl
;
6526 pg
->publish_stats_to_osd();
6528 return discard_event();
6531 void PG::RecoveryState::Primary::exit()
6533 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6534 PG
*pg
= context
< RecoveryMachine
>().pg
;
6535 pg
->want_acting
.clear();
6536 utime_t dur
= ceph_clock_now() - enter_time
;
6537 pg
->osd
->recoverystate_perf
->tinc(rs_primary_latency
, dur
);
6538 pg
->clear_primary_state();
6539 pg
->state_clear(PG_STATE_CREATING
);
6542 /*---------Peering--------*/
6543 PG::RecoveryState::Peering::Peering(my_context ctx
)
6545 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering"),
6546 history_les_bound(false)
6548 context
< RecoveryMachine
>().log_enter(state_name
);
6550 PG
*pg
= context
< RecoveryMachine
>().pg
;
6551 assert(!pg
->is_peered());
6552 assert(!pg
->is_peering());
6553 assert(pg
->is_primary());
6554 pg
->state_set(PG_STATE_PEERING
);
6557 boost::statechart::result
PG::RecoveryState::Peering::react(const AdvMap
& advmap
)
6559 PG
*pg
= context
< RecoveryMachine
>().pg
;
6560 ldout(pg
->cct
, 10) << "Peering advmap" << dendl
;
6561 if (prior_set
.affected_by_map(*(advmap
.osdmap
), pg
)) {
6562 ldout(pg
->cct
, 1) << "Peering, affected_by_map, going to Reset" << dendl
;
6564 return transit
< Reset
>();
6567 pg
->adjust_need_up_thru(advmap
.osdmap
);
6569 return forward_event();
6572 boost::statechart::result
PG::RecoveryState::Peering::react(const QueryState
& q
)
6574 PG
*pg
= context
< RecoveryMachine
>().pg
;
6576 q
.f
->open_object_section("state");
6577 q
.f
->dump_string("name", state_name
);
6578 q
.f
->dump_stream("enter_time") << enter_time
;
6580 q
.f
->open_array_section("past_intervals");
6581 pg
->past_intervals
.dump(q
.f
);
6582 q
.f
->close_section();
6584 q
.f
->open_array_section("probing_osds");
6585 for (set
<pg_shard_t
>::iterator p
= prior_set
.probe
.begin();
6586 p
!= prior_set
.probe
.end();
6588 q
.f
->dump_stream("osd") << *p
;
6589 q
.f
->close_section();
6591 if (prior_set
.pg_down
)
6592 q
.f
->dump_string("blocked", "peering is blocked due to down osds");
6594 q
.f
->open_array_section("down_osds_we_would_probe");
6595 for (set
<int>::iterator p
= prior_set
.down
.begin();
6596 p
!= prior_set
.down
.end();
6598 q
.f
->dump_int("osd", *p
);
6599 q
.f
->close_section();
6601 q
.f
->open_array_section("peering_blocked_by");
6602 for (map
<int,epoch_t
>::iterator p
= prior_set
.blocked_by
.begin();
6603 p
!= prior_set
.blocked_by
.end();
6605 q
.f
->open_object_section("osd");
6606 q
.f
->dump_int("osd", p
->first
);
6607 q
.f
->dump_int("current_lost_at", p
->second
);
6608 q
.f
->dump_string("comment", "starting or marking this osd lost may let us proceed");
6609 q
.f
->close_section();
6611 q
.f
->close_section();
6613 if (history_les_bound
) {
6614 q
.f
->open_array_section("peering_blocked_by_detail");
6615 q
.f
->open_object_section("item");
6616 q
.f
->dump_string("detail","peering_blocked_by_history_les_bound");
6617 q
.f
->close_section();
6618 q
.f
->close_section();
6621 q
.f
->close_section();
6622 return forward_event();
6625 void PG::RecoveryState::Peering::exit()
6627 PG
*pg
= context
< RecoveryMachine
>().pg
;
6628 ldout(pg
->cct
, 10) << "Leaving Peering" << dendl
;
6629 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6630 pg
->state_clear(PG_STATE_PEERING
);
6631 pg
->clear_probe_targets();
6633 utime_t dur
= ceph_clock_now() - enter_time
;
6634 pg
->osd
->recoverystate_perf
->tinc(rs_peering_latency
, dur
);
6638 /*------Backfilling-------*/
6639 PG::RecoveryState::Backfilling::Backfilling(my_context ctx
)
6641 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Backfilling")
6643 context
< RecoveryMachine
>().log_enter(state_name
);
6644 PG
*pg
= context
< RecoveryMachine
>().pg
;
6645 pg
->backfill_reserved
= true;
6646 pg
->queue_recovery();
6647 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
6648 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
6649 pg
->state_set(PG_STATE_BACKFILLING
);
6650 pg
->publish_stats_to_osd();
6653 boost::statechart::result
6654 PG::RecoveryState::Backfilling::react(const DeferBackfill
&c
)
6656 PG
*pg
= context
< RecoveryMachine
>().pg
;
6657 ldout(pg
->cct
, 10) << "defer backfill, retry delay " << c
.delay
<< dendl
;
6658 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6660 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6661 pg
->state_clear(PG_STATE_BACKFILLING
);
6663 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6664 it
!= pg
->backfill_targets
.end();
6666 assert(*it
!= pg
->pg_whoami
);
6667 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6668 it
->osd
, pg
->get_osdmap()->get_epoch());
6670 pg
->osd
->send_message_osd_cluster(
6671 new MBackfillReserve(
6672 MBackfillReserve::REJECT
,
6673 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6674 pg
->get_osdmap()->get_epoch()),
6680 if (!pg
->waiting_on_backfill
.empty()) {
6681 pg
->waiting_on_backfill
.clear();
6682 pg
->finish_recovery_op(hobject_t::get_max());
6685 pg
->schedule_backfill_retry(c
.delay
);
6686 return transit
<NotBackfilling
>();
6689 boost::statechart::result
6690 PG::RecoveryState::Backfilling::react(const UnfoundBackfill
&c
)
6692 PG
*pg
= context
< RecoveryMachine
>().pg
;
6693 ldout(pg
->cct
, 10) << "backfill has unfound, can't continue" << dendl
;
6694 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6696 pg
->state_set(PG_STATE_BACKFILL_UNFOUND
);
6697 pg
->state_clear(PG_STATE_BACKFILLING
);
6699 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6700 it
!= pg
->backfill_targets
.end();
6702 assert(*it
!= pg
->pg_whoami
);
6703 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6704 it
->osd
, pg
->get_osdmap()->get_epoch());
6706 pg
->osd
->send_message_osd_cluster(
6707 new MBackfillReserve(
6708 MBackfillReserve::REJECT
,
6709 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6710 pg
->get_osdmap()->get_epoch()),
6715 pg
->waiting_on_backfill
.clear();
6717 return transit
<NotBackfilling
>();
6720 boost::statechart::result
6721 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected
&)
6723 PG
*pg
= context
< RecoveryMachine
>().pg
;
6724 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6725 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
6727 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6728 it
!= pg
->backfill_targets
.end();
6730 assert(*it
!= pg
->pg_whoami
);
6731 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6732 it
->osd
, pg
->get_osdmap()->get_epoch());
6734 pg
->osd
->send_message_osd_cluster(
6735 new MBackfillReserve(
6736 MBackfillReserve::REJECT
,
6737 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6738 pg
->get_osdmap()->get_epoch()),
6743 if (!pg
->waiting_on_backfill
.empty()) {
6744 pg
->waiting_on_backfill
.clear();
6745 pg
->finish_recovery_op(hobject_t::get_max());
6748 pg
->schedule_backfill_retry(pg
->cct
->_conf
->osd_recovery_retry_interval
);
6749 return transit
<NotBackfilling
>();
6752 void PG::RecoveryState::Backfilling::exit()
6754 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6755 PG
*pg
= context
< RecoveryMachine
>().pg
;
6756 pg
->backfill_reserved
= false;
6757 pg
->backfill_reserving
= false;
6758 pg
->state_clear(PG_STATE_BACKFILLING
);
6759 pg
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
6760 utime_t dur
= ceph_clock_now() - enter_time
;
6761 pg
->osd
->recoverystate_perf
->tinc(rs_backfilling_latency
, dur
);
6764 /*--WaitRemoteBackfillReserved--*/
6766 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx
)
6768 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6769 backfill_osd_it(context
< Active
>().remote_shards_to_reserve_backfill
.begin())
6771 context
< RecoveryMachine
>().log_enter(state_name
);
6772 PG
*pg
= context
< RecoveryMachine
>().pg
;
6773 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6774 pg
->publish_stats_to_osd();
6775 post_event(RemoteBackfillReserved());
6778 boost::statechart::result
6779 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved
&evt
)
6781 PG
*pg
= context
< RecoveryMachine
>().pg
;
6783 if (backfill_osd_it
!= context
< Active
>().remote_shards_to_reserve_backfill
.end()) {
6784 //The primary never backfills itself
6785 assert(*backfill_osd_it
!= pg
->pg_whoami
);
6786 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6787 backfill_osd_it
->osd
, pg
->get_osdmap()->get_epoch());
6789 pg
->osd
->send_message_osd_cluster(
6790 new MBackfillReserve(
6791 MBackfillReserve::REQUEST
,
6792 spg_t(pg
->info
.pgid
.pgid
, backfill_osd_it
->shard
),
6793 pg
->get_osdmap()->get_epoch(),
6794 pg
->get_backfill_priority()),
6799 post_event(AllBackfillsReserved());
6801 return discard_event();
6804 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6806 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6807 PG
*pg
= context
< RecoveryMachine
>().pg
;
6808 utime_t dur
= ceph_clock_now() - enter_time
;
6809 pg
->osd
->recoverystate_perf
->tinc(rs_waitremotebackfillreserved_latency
, dur
);
6812 boost::statechart::result
6813 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected
&evt
)
6815 PG
*pg
= context
< RecoveryMachine
>().pg
;
6816 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6818 // Send REJECT to all previously acquired reservations
6819 set
<pg_shard_t
>::const_iterator it
, begin
, end
, next
;
6820 begin
= context
< Active
>().remote_shards_to_reserve_backfill
.begin();
6821 end
= context
< Active
>().remote_shards_to_reserve_backfill
.end();
6822 assert(begin
!= end
);
6823 for (next
= it
= begin
, ++next
; next
!= backfill_osd_it
; ++it
, ++next
) {
6824 //The primary never backfills itself
6825 assert(*it
!= pg
->pg_whoami
);
6826 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6827 it
->osd
, pg
->get_osdmap()->get_epoch());
6829 pg
->osd
->send_message_osd_cluster(
6830 new MBackfillReserve(
6831 MBackfillReserve::REJECT
,
6832 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6833 pg
->get_osdmap()->get_epoch()),
6838 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
6839 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
6840 pg
->publish_stats_to_osd();
6842 pg
->schedule_backfill_retry(pg
->cct
->_conf
->osd_recovery_retry_interval
);
6844 return transit
<NotBackfilling
>();
6847 /*--WaitLocalBackfillReserved--*/
6848 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx
)
6850 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalBackfillReserved")
6852 context
< RecoveryMachine
>().log_enter(state_name
);
6853 PG
*pg
= context
< RecoveryMachine
>().pg
;
6854 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6855 pg
->osd
->local_reserver
.request_reservation(
6857 new QueuePeeringEvt
<LocalBackfillReserved
>(
6858 pg
, pg
->get_osdmap()->get_epoch(),
6859 LocalBackfillReserved()),
6860 pg
->get_backfill_priority(),
6861 new QueuePeeringEvt
<DeferBackfill
>(
6862 pg
, pg
->get_osdmap()->get_epoch(),
6863 DeferBackfill(0.0)));
6864 pg
->publish_stats_to_osd();
6867 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6869 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6870 PG
*pg
= context
< RecoveryMachine
>().pg
;
6871 utime_t dur
= ceph_clock_now() - enter_time
;
6872 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalbackfillreserved_latency
, dur
);
6875 /*----NotBackfilling------*/
6876 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx
)
6878 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotBackfilling")
6880 context
< RecoveryMachine
>().log_enter(state_name
);
6881 PG
*pg
= context
< RecoveryMachine
>().pg
;
6882 pg
->publish_stats_to_osd();
6885 boost::statechart::result
6886 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved
&evt
)
6888 return discard_event();
6891 boost::statechart::result
6892 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected
&evt
)
6894 return discard_event();
6897 void PG::RecoveryState::NotBackfilling::exit()
6899 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6900 PG
*pg
= context
< RecoveryMachine
>().pg
;
6901 pg
->state_clear(PG_STATE_BACKFILL_UNFOUND
);
6902 utime_t dur
= ceph_clock_now() - enter_time
;
6903 pg
->osd
->recoverystate_perf
->tinc(rs_notbackfilling_latency
, dur
);
6906 /*----NotRecovering------*/
6907 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx
)
6909 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotRecovering")
6911 context
< RecoveryMachine
>().log_enter(state_name
);
6912 PG
*pg
= context
< RecoveryMachine
>().pg
;
6913 pg
->publish_stats_to_osd();
6916 void PG::RecoveryState::NotRecovering::exit()
6918 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6919 PG
*pg
= context
< RecoveryMachine
>().pg
;
6920 pg
->state_clear(PG_STATE_RECOVERY_UNFOUND
);
6921 utime_t dur
= ceph_clock_now() - enter_time
;
6922 pg
->osd
->recoverystate_perf
->tinc(rs_notrecovering_latency
, dur
);
6925 /*---RepNotRecovering----*/
6926 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx
)
6928 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepNotRecovering")
6930 context
< RecoveryMachine
>().log_enter(state_name
);
6933 boost::statechart::result
6934 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation
&evt
)
6936 PG
*pg
= context
< RecoveryMachine
>().pg
;
6937 pg
->reject_reservation();
6938 post_event(RemoteReservationRejected());
6939 return discard_event();
6942 void PG::RecoveryState::RepNotRecovering::exit()
6944 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6945 PG
*pg
= context
< RecoveryMachine
>().pg
;
6946 utime_t dur
= ceph_clock_now() - enter_time
;
6947 pg
->osd
->recoverystate_perf
->tinc(rs_repnotrecovering_latency
, dur
);
6950 /*---RepWaitRecoveryReserved--*/
6951 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx
)
6953 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitRecoveryReserved")
6955 context
< RecoveryMachine
>().log_enter(state_name
);
6956 PG
*pg
= context
< RecoveryMachine
>().pg
;
6958 pg
->osd
->remote_reserver
.request_reservation(
6960 new QueuePeeringEvt
<RemoteRecoveryReserved
>(
6961 pg
, pg
->get_osdmap()->get_epoch(),
6962 RemoteRecoveryReserved()),
6963 pg
->get_recovery_priority());
6966 boost::statechart::result
6967 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved
&evt
)
6969 PG
*pg
= context
< RecoveryMachine
>().pg
;
6970 pg
->osd
->send_message_osd_cluster(
6972 new MRecoveryReserve(
6973 MRecoveryReserve::GRANT
,
6974 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
6975 pg
->get_osdmap()->get_epoch()),
6976 pg
->get_osdmap()->get_epoch());
6977 return transit
<RepRecovering
>();
6980 boost::statechart::result
6981 PG::RecoveryState::RepWaitRecoveryReserved::react(
6982 const RemoteReservationCanceled
&evt
)
6984 PG
*pg
= context
< RecoveryMachine
>().pg
;
6985 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
6986 return transit
<RepNotRecovering
>();
6989 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6991 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6992 PG
*pg
= context
< RecoveryMachine
>().pg
;
6993 utime_t dur
= ceph_clock_now() - enter_time
;
6994 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitrecoveryreserved_latency
, dur
);
6997 /*-RepWaitBackfillReserved*/
6998 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx
)
7000 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitBackfillReserved")
7002 context
< RecoveryMachine
>().log_enter(state_name
);
7005 boost::statechart::result
7006 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio
&evt
)
7008 PG
*pg
= context
< RecoveryMachine
>().pg
;
7011 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
7012 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
7013 ldout(pg
->cct
, 10) << "backfill reservation rejected: failure injection"
7015 post_event(RejectRemoteReservation());
7016 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
7017 pg
->osd
->check_backfill_full(ss
)) {
7018 ldout(pg
->cct
, 10) << "backfill reservation rejected: "
7019 << ss
.str() << dendl
;
7020 post_event(RejectRemoteReservation());
7022 pg
->osd
->remote_reserver
.request_reservation(
7024 new QueuePeeringEvt
<RemoteBackfillReserved
>(
7025 pg
, pg
->get_osdmap()->get_epoch(),
7026 RemoteBackfillReserved()), evt
.priority
);
7028 return transit
<RepWaitBackfillReserved
>();
7031 void PG::RecoveryState::RepWaitBackfillReserved::exit()
7033 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7034 PG
*pg
= context
< RecoveryMachine
>().pg
;
7035 utime_t dur
= ceph_clock_now() - enter_time
;
7036 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitbackfillreserved_latency
, dur
);
7039 boost::statechart::result
7040 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved
&evt
)
7042 PG
*pg
= context
< RecoveryMachine
>().pg
;
7045 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
7046 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
7047 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
7048 << "failure injection" << dendl
;
7049 post_event(RejectRemoteReservation());
7050 return discard_event();
7051 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
7052 pg
->osd
->check_backfill_full(ss
)) {
7053 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
7054 << ss
.str() << dendl
;
7055 post_event(RejectRemoteReservation());
7056 return discard_event();
7058 pg
->osd
->send_message_osd_cluster(
7060 new MBackfillReserve(
7061 MBackfillReserve::GRANT
,
7062 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
7063 pg
->get_osdmap()->get_epoch()),
7064 pg
->get_osdmap()->get_epoch());
7065 return transit
<RepRecovering
>();
7069 boost::statechart::result
7070 PG::RecoveryState::RepWaitBackfillReserved::react(
7071 const RejectRemoteReservation
&evt
)
7073 PG
*pg
= context
< RecoveryMachine
>().pg
;
7074 pg
->reject_reservation();
7075 post_event(RemoteReservationRejected());
7076 return discard_event();
7079 boost::statechart::result
7080 PG::RecoveryState::RepWaitBackfillReserved::react(
7081 const RemoteReservationRejected
&evt
)
7083 PG
*pg
= context
< RecoveryMachine
>().pg
;
7084 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7085 return transit
<RepNotRecovering
>();
7088 boost::statechart::result
7089 PG::RecoveryState::RepWaitBackfillReserved::react(
7090 const RemoteReservationCanceled
&evt
)
7092 PG
*pg
= context
< RecoveryMachine
>().pg
;
7093 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7094 return transit
<RepNotRecovering
>();
7097 /*---RepRecovering-------*/
7098 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx
)
7100 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepRecovering")
7102 context
< RecoveryMachine
>().log_enter(state_name
);
7105 boost::statechart::result
7106 PG::RecoveryState::RepRecovering::react(const BackfillTooFull
&)
7108 PG
*pg
= context
< RecoveryMachine
>().pg
;
7109 pg
->reject_reservation();
7110 return discard_event();
7113 void PG::RecoveryState::RepRecovering::exit()
7115 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7116 PG
*pg
= context
< RecoveryMachine
>().pg
;
7117 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7118 utime_t dur
= ceph_clock_now() - enter_time
;
7119 pg
->osd
->recoverystate_perf
->tinc(rs_reprecovering_latency
, dur
);
7122 /*------Activating--------*/
7123 PG::RecoveryState::Activating::Activating(my_context ctx
)
7125 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Activating")
7127 context
< RecoveryMachine
>().log_enter(state_name
);
7130 void PG::RecoveryState::Activating::exit()
7132 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7133 PG
*pg
= context
< RecoveryMachine
>().pg
;
7134 utime_t dur
= ceph_clock_now() - enter_time
;
7135 pg
->osd
->recoverystate_perf
->tinc(rs_activating_latency
, dur
);
7138 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx
)
7140 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalRecoveryReserved")
7142 context
< RecoveryMachine
>().log_enter(state_name
);
7143 PG
*pg
= context
< RecoveryMachine
>().pg
;
7145 // Make sure all nodes that part of the recovery aren't full
7146 if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_recovery
&&
7147 pg
->osd
->check_osdmap_full(pg
->actingbackfill
)) {
7148 post_event(RecoveryTooFull());
7152 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
7153 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
7154 pg
->osd
->local_reserver
.request_reservation(
7156 new QueuePeeringEvt
<LocalRecoveryReserved
>(
7157 pg
, pg
->get_osdmap()->get_epoch(),
7158 LocalRecoveryReserved()),
7159 pg
->get_recovery_priority(),
7160 new QueuePeeringEvt
<DeferRecovery
>(
7161 pg
, pg
->get_osdmap()->get_epoch(),
7162 DeferRecovery(0.0)));
7163 pg
->publish_stats_to_osd();
7166 boost::statechart::result
7167 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull
&evt
)
7169 PG
*pg
= context
< RecoveryMachine
>().pg
;
7170 pg
->state_set(PG_STATE_RECOVERY_TOOFULL
);
7171 pg
->schedule_recovery_retry(pg
->cct
->_conf
->osd_recovery_retry_interval
);
7172 return transit
<NotRecovering
>();
7175 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7177 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7178 PG
*pg
= context
< RecoveryMachine
>().pg
;
7179 utime_t dur
= ceph_clock_now() - enter_time
;
7180 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalrecoveryreserved_latency
, dur
);
7183 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx
)
7185 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7186 remote_recovery_reservation_it(context
< Active
>().remote_shards_to_reserve_recovery
.begin())
7188 context
< RecoveryMachine
>().log_enter(state_name
);
7189 post_event(RemoteRecoveryReserved());
7192 boost::statechart::result
7193 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved
&evt
) {
7194 PG
*pg
= context
< RecoveryMachine
>().pg
;
7196 if (remote_recovery_reservation_it
!= context
< Active
>().remote_shards_to_reserve_recovery
.end()) {
7197 assert(*remote_recovery_reservation_it
!= pg
->pg_whoami
);
7198 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7199 remote_recovery_reservation_it
->osd
, pg
->get_osdmap()->get_epoch());
7201 pg
->osd
->send_message_osd_cluster(
7202 new MRecoveryReserve(
7203 MRecoveryReserve::REQUEST
,
7204 spg_t(pg
->info
.pgid
.pgid
, remote_recovery_reservation_it
->shard
),
7205 pg
->get_osdmap()->get_epoch()),
7208 ++remote_recovery_reservation_it
;
7210 post_event(AllRemotesReserved());
7212 return discard_event();
7215 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7217 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7218 PG
*pg
= context
< RecoveryMachine
>().pg
;
7219 utime_t dur
= ceph_clock_now() - enter_time
;
7220 pg
->osd
->recoverystate_perf
->tinc(rs_waitremoterecoveryreserved_latency
, dur
);
7223 PG::RecoveryState::Recovering::Recovering(my_context ctx
)
7225 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovering")
7227 context
< RecoveryMachine
>().log_enter(state_name
);
7229 PG
*pg
= context
< RecoveryMachine
>().pg
;
7230 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
7231 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
7232 pg
->state_set(PG_STATE_RECOVERING
);
7233 assert(!pg
->state_test(PG_STATE_ACTIVATING
));
7234 pg
->publish_stats_to_osd();
7235 pg
->queue_recovery();
7238 void PG::RecoveryState::Recovering::release_reservations(bool cancel
)
7240 PG
*pg
= context
< RecoveryMachine
>().pg
;
7241 assert(cancel
|| !pg
->pg_log
.get_missing().have_missing());
7243 // release remote reservations
7244 for (set
<pg_shard_t
>::const_iterator i
=
7245 context
< Active
>().remote_shards_to_reserve_recovery
.begin();
7246 i
!= context
< Active
>().remote_shards_to_reserve_recovery
.end();
7248 if (*i
== pg
->pg_whoami
) // skip myself
7250 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
7251 i
->osd
, pg
->get_osdmap()->get_epoch());
7253 pg
->osd
->send_message_osd_cluster(
7254 new MRecoveryReserve(
7255 MRecoveryReserve::RELEASE
,
7256 spg_t(pg
->info
.pgid
.pgid
, i
->shard
),
7257 pg
->get_osdmap()->get_epoch()),
7263 boost::statechart::result
7264 PG::RecoveryState::Recovering::react(const AllReplicasRecovered
&evt
)
7266 PG
*pg
= context
< RecoveryMachine
>().pg
;
7267 pg
->state_clear(PG_STATE_RECOVERING
);
7268 pg
->state_clear(PG_STATE_FORCED_RECOVERY
);
7269 release_reservations();
7270 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7271 return transit
<Recovered
>();
7274 boost::statechart::result
7275 PG::RecoveryState::Recovering::react(const RequestBackfill
&evt
)
7277 PG
*pg
= context
< RecoveryMachine
>().pg
;
7278 pg
->state_clear(PG_STATE_RECOVERING
);
7279 pg
->state_clear(PG_STATE_FORCED_RECOVERY
);
7280 release_reservations();
7281 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7282 // XXX: Is this needed?
7283 pg
->publish_stats_to_osd();
7284 return transit
<WaitLocalBackfillReserved
>();
7287 boost::statechart::result
7288 PG::RecoveryState::Recovering::react(const DeferRecovery
&evt
)
7290 PG
*pg
= context
< RecoveryMachine
>().pg
;
7291 if (!pg
->state_test(PG_STATE_RECOVERING
)) {
7292 // we may have finished recovery and have an AllReplicasRecovered
7293 // event queued to move us to the next state.
7294 ldout(pg
->cct
, 10) << "got defer recovery but not recovering" << dendl
;
7295 return discard_event();
7297 ldout(pg
->cct
, 10) << "defer recovery, retry delay " << evt
.delay
<< dendl
;
7298 pg
->state_clear(PG_STATE_RECOVERING
);
7299 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
7300 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7301 release_reservations(true);
7302 pg
->schedule_recovery_retry(evt
.delay
);
7303 return transit
<NotRecovering
>();
7306 boost::statechart::result
7307 PG::RecoveryState::Recovering::react(const UnfoundRecovery
&evt
)
7309 PG
*pg
= context
< RecoveryMachine
>().pg
;
7310 ldout(pg
->cct
, 10) << "recovery has unfound, can't continue" << dendl
;
7311 pg
->state_set(PG_STATE_RECOVERY_UNFOUND
);
7312 pg
->state_clear(PG_STATE_RECOVERING
);
7313 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7314 release_reservations(true);
7315 return transit
<NotRecovering
>();
7318 void PG::RecoveryState::Recovering::exit()
7320 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7321 PG
*pg
= context
< RecoveryMachine
>().pg
;
7322 utime_t dur
= ceph_clock_now() - enter_time
;
7323 pg
->osd
->recoverystate_perf
->tinc(rs_recovering_latency
, dur
);
7326 PG::RecoveryState::Recovered::Recovered(my_context ctx
)
7328 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovered")
7330 pg_shard_t auth_log_shard
;
7332 context
< RecoveryMachine
>().log_enter(state_name
);
7334 PG
*pg
= context
< RecoveryMachine
>().pg
;
7336 assert(!pg
->needs_recovery());
7338 // if we finished backfill, all acting are active; recheck if
7339 // DEGRADED | UNDERSIZED is appropriate.
7340 assert(!pg
->actingbackfill
.empty());
7341 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <=
7342 pg
->actingbackfill
.size()) {
7343 pg
->state_clear(PG_STATE_FORCED_BACKFILL
| PG_STATE_FORCED_RECOVERY
);
7344 pg
->publish_stats_to_osd();
7347 // trim pglog on recovered
7350 // adjust acting set? (e.g. because backfill completed...)
7351 bool history_les_bound
= false;
7352 if (pg
->acting
!= pg
->up
&& !pg
->choose_acting(auth_log_shard
,
7353 true, &history_les_bound
))
7354 assert(pg
->want_acting
.size());
7356 if (context
< Active
>().all_replicas_activated
)
7357 post_event(GoClean());
7360 void PG::RecoveryState::Recovered::exit()
7362 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7363 PG
*pg
= context
< RecoveryMachine
>().pg
;
7364 utime_t dur
= ceph_clock_now() - enter_time
;
7365 pg
->osd
->recoverystate_perf
->tinc(rs_recovered_latency
, dur
);
7368 PG::RecoveryState::Clean::Clean(my_context ctx
)
7370 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Clean")
7372 context
< RecoveryMachine
>().log_enter(state_name
);
7374 PG
*pg
= context
< RecoveryMachine
>().pg
;
7376 if (pg
->info
.last_complete
!= pg
->info
.last_update
) {
7379 pg
->finish_recovery(*context
< RecoveryMachine
>().get_on_safe_context_list());
7381 if (pg
->is_active()) {
7385 pg
->share_pg_info();
7386 pg
->publish_stats_to_osd();
7387 pg
->requeue_ops(pg
->waiting_for_clean_to_primary_repair
);
7390 void PG::RecoveryState::Clean::exit()
7392 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7393 PG
*pg
= context
< RecoveryMachine
>().pg
;
7394 pg
->state_clear(PG_STATE_CLEAN
);
7395 utime_t dur
= ceph_clock_now() - enter_time
;
7396 pg
->osd
->recoverystate_perf
->tinc(rs_clean_latency
, dur
);
7399 template <typename T
>
7400 set
<pg_shard_t
> unique_osd_shard_set(const pg_shard_t
& skip
, const T
&in
)
7402 set
<int> osds_found
;
7403 set
<pg_shard_t
> out
;
7404 for (typename
T::const_iterator i
= in
.begin();
7407 if (*i
!= skip
&& !osds_found
.count(i
->osd
)) {
7408 osds_found
.insert(i
->osd
);
7415 /*---------Active---------*/
7416 PG::RecoveryState::Active::Active(my_context ctx
)
7418 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active"),
7419 remote_shards_to_reserve_recovery(
7420 unique_osd_shard_set(
7421 context
< RecoveryMachine
>().pg
->pg_whoami
,
7422 context
< RecoveryMachine
>().pg
->actingbackfill
)),
7423 remote_shards_to_reserve_backfill(
7424 unique_osd_shard_set(
7425 context
< RecoveryMachine
>().pg
->pg_whoami
,
7426 context
< RecoveryMachine
>().pg
->backfill_targets
)),
7427 all_replicas_activated(false)
7429 context
< RecoveryMachine
>().log_enter(state_name
);
7431 PG
*pg
= context
< RecoveryMachine
>().pg
;
7433 assert(!pg
->backfill_reserving
);
7434 assert(!pg
->backfill_reserved
);
7435 assert(pg
->is_primary());
7436 ldout(pg
->cct
, 10) << "In Active, about to call activate" << dendl
;
7438 context
< RecoveryMachine
>().get_cur_transaction(),
7439 context
< RecoveryMachine
>().get_on_applied_context_list(),
7440 context
< RecoveryMachine
>().get_on_safe_context_list());
7441 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
7442 pg
->get_osdmap()->get_epoch(),
7443 *context
< RecoveryMachine
>().get_on_safe_context_list(),
7444 *context
< RecoveryMachine
>().get_query_map(),
7445 context
< RecoveryMachine
>().get_info_map(),
7446 context
< RecoveryMachine
>().get_recovery_ctx());
7448 // everyone has to commit/ack before we are truly active
7449 pg
->blocked_by
.clear();
7450 for (set
<pg_shard_t
>::iterator p
= pg
->actingbackfill
.begin();
7451 p
!= pg
->actingbackfill
.end();
7453 if (p
->shard
!= pg
->pg_whoami
.shard
) {
7454 pg
->blocked_by
.insert(p
->shard
);
7457 pg
->publish_stats_to_osd();
7458 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
7461 boost::statechart::result
PG::RecoveryState::Active::react(const AdvMap
& advmap
)
7463 PG
*pg
= context
< RecoveryMachine
>().pg
;
7464 ldout(pg
->cct
, 10) << "Active advmap" << dendl
;
7465 if (!pg
->pool
.newly_removed_snaps
.empty()) {
7466 pg
->snap_trimq
.union_of(pg
->pool
.newly_removed_snaps
);
7467 ldout(pg
->cct
, 10) << *pg
<< " snap_trimq now " << pg
->snap_trimq
<< dendl
;
7468 pg
->dirty_info
= true;
7469 pg
->dirty_big_info
= true;
7472 for (size_t i
= 0; i
< pg
->want_acting
.size(); i
++) {
7473 int osd
= pg
->want_acting
[i
];
7474 if (!advmap
.osdmap
->is_up(osd
)) {
7475 pg_shard_t
osd_with_shard(osd
, shard_id_t(i
));
7476 assert(pg
->is_acting(osd_with_shard
) || pg
->is_up(osd_with_shard
));
7480 bool need_publish
= false;
7481 /* Check for changes in pool size (if the acting set changed as a result,
7482 * this does not matter) */
7483 if (advmap
.lastmap
->get_pg_size(pg
->info
.pgid
.pgid
) !=
7484 pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
)) {
7485 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <= pg
->actingset
.size()) {
7486 pg
->state_clear(PG_STATE_UNDERSIZED
);
7488 pg
->state_set(PG_STATE_UNDERSIZED
);
7490 // degraded changes will be detected by call from publish_stats_to_osd()
7491 need_publish
= true;
7494 // if we haven't reported our PG stats in a long time, do so now.
7495 if (pg
->info
.stats
.reported_epoch
+ pg
->cct
->_conf
->osd_pg_stat_report_interval_max
< advmap
.osdmap
->get_epoch()) {
7496 ldout(pg
->cct
, 20) << "reporting stats to osd after " << (advmap
.osdmap
->get_epoch() - pg
->info
.stats
.reported_epoch
)
7497 << " epochs" << dendl
;
7498 need_publish
= true;
7502 pg
->publish_stats_to_osd();
7504 return forward_event();
7507 boost::statechart::result
PG::RecoveryState::Active::react(const ActMap
&)
7509 PG
*pg
= context
< RecoveryMachine
>().pg
;
7510 ldout(pg
->cct
, 10) << "Active: handling ActMap" << dendl
;
7511 assert(pg
->is_primary());
7513 if (pg
->have_unfound()) {
7514 // object may have become unfound
7515 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7518 if (pg
->cct
->_conf
->osd_check_for_log_corruption
)
7519 pg
->check_log_for_corruption(pg
->osd
->store
);
7521 uint64_t unfound
= pg
->missing_loc
.num_unfound();
7523 pg
->all_unfound_are_queried_or_lost(pg
->get_osdmap())) {
7524 if (pg
->cct
->_conf
->osd_auto_mark_unfound_lost
) {
7525 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has " << unfound
7526 << " objects unfound and apparently lost, would automatically "
7527 << "mark these objects lost but this feature is not yet implemented "
7528 << "(osd_auto_mark_unfound_lost)";
7530 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has "
7531 << unfound
<< " objects unfound and apparently lost";
7534 if (pg
->is_active()) {
7535 ldout(pg
->cct
, 10) << "Active: kicking snap trim" << dendl
;
7536 pg
->kick_snap_trim();
7539 if (pg
->is_peered() &&
7541 !pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
) &&
7542 (!pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) || pg
->is_degraded())) {
7543 pg
->queue_recovery();
7545 return forward_event();
7548 boost::statechart::result
PG::RecoveryState::Active::react(const MNotifyRec
& notevt
)
7550 PG
*pg
= context
< RecoveryMachine
>().pg
;
7551 assert(pg
->is_primary());
7552 if (pg
->peer_info
.count(notevt
.from
)) {
7553 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7554 << ", already have info from that osd, ignoring"
7556 } else if (pg
->peer_purged
.count(notevt
.from
)) {
7557 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7558 << ", already purged that peer, ignoring"
7561 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7562 << ", calling proc_replica_info and discover_all_missing"
7564 pg
->proc_replica_info(
7565 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
7566 if (pg
->have_unfound()) {
7567 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7570 return discard_event();
7573 boost::statechart::result
PG::RecoveryState::Active::react(const MInfoRec
& infoevt
)
7575 PG
*pg
= context
< RecoveryMachine
>().pg
;
7576 assert(pg
->is_primary());
7578 assert(!pg
->actingbackfill
.empty());
7579 // don't update history (yet) if we are active and primary; the replica
7580 // may be telling us they have activated (and committed) but we can't
7581 // share that until _everyone_ does the same.
7582 if (pg
->is_actingbackfill(infoevt
.from
)) {
7583 ldout(pg
->cct
, 10) << " peer osd." << infoevt
.from
7584 << " activated and committed" << dendl
;
7585 pg
->peer_activated
.insert(infoevt
.from
);
7586 pg
->blocked_by
.erase(infoevt
.from
.shard
);
7587 pg
->publish_stats_to_osd();
7588 if (pg
->peer_activated
.size() == pg
->actingbackfill
.size()) {
7589 pg
->all_activated_and_committed();
7592 return discard_event();
7595 boost::statechart::result
PG::RecoveryState::Active::react(const MLogRec
& logevt
)
7597 PG
*pg
= context
< RecoveryMachine
>().pg
;
7598 ldout(pg
->cct
, 10) << "searching osd." << logevt
.from
7599 << " log for unfound items" << dendl
;
7600 pg
->proc_replica_log(
7601 logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
7602 bool got_missing
= pg
->search_for_missing(
7603 pg
->peer_info
[logevt
.from
],
7604 pg
->peer_missing
[logevt
.from
],
7606 context
< RecoveryMachine
>().get_recovery_ctx());
7607 // If there are missing AND we are "fully" active then start recovery now
7608 if (got_missing
&& pg
->state_test(PG_STATE_ACTIVE
)) {
7609 post_event(DoRecovery());
7611 return discard_event();
7614 boost::statechart::result
PG::RecoveryState::Active::react(const QueryState
& q
)
7616 PG
*pg
= context
< RecoveryMachine
>().pg
;
7618 q
.f
->open_object_section("state");
7619 q
.f
->dump_string("name", state_name
);
7620 q
.f
->dump_stream("enter_time") << enter_time
;
7623 q
.f
->open_array_section("might_have_unfound");
7624 for (set
<pg_shard_t
>::iterator p
= pg
->might_have_unfound
.begin();
7625 p
!= pg
->might_have_unfound
.end();
7627 q
.f
->open_object_section("osd");
7628 q
.f
->dump_stream("osd") << *p
;
7629 if (pg
->peer_missing
.count(*p
)) {
7630 q
.f
->dump_string("status", "already probed");
7631 } else if (pg
->peer_missing_requested
.count(*p
)) {
7632 q
.f
->dump_string("status", "querying");
7633 } else if (!pg
->get_osdmap()->is_up(p
->osd
)) {
7634 q
.f
->dump_string("status", "osd is down");
7636 q
.f
->dump_string("status", "not queried");
7638 q
.f
->close_section();
7640 q
.f
->close_section();
7643 q
.f
->open_object_section("recovery_progress");
7644 pg
->dump_recovery_info(q
.f
);
7645 q
.f
->close_section();
7649 q
.f
->open_object_section("scrub");
7650 q
.f
->dump_stream("scrubber.epoch_start") << pg
->scrubber
.epoch_start
;
7651 q
.f
->dump_bool("scrubber.active", pg
->scrubber
.active
);
7652 q
.f
->dump_string("scrubber.state", Scrubber::state_string(pg
->scrubber
.state
));
7653 q
.f
->dump_stream("scrubber.start") << pg
->scrubber
.start
;
7654 q
.f
->dump_stream("scrubber.end") << pg
->scrubber
.end
;
7655 q
.f
->dump_stream("scrubber.max_end") << pg
->scrubber
.max_end
;
7656 q
.f
->dump_stream("scrubber.subset_last_update") << pg
->scrubber
.subset_last_update
;
7657 q
.f
->dump_bool("scrubber.deep", pg
->scrubber
.deep
);
7659 q
.f
->open_array_section("scrubber.waiting_on_whom");
7660 for (set
<pg_shard_t
>::iterator p
= pg
->scrubber
.waiting_on_whom
.begin();
7661 p
!= pg
->scrubber
.waiting_on_whom
.end();
7663 q
.f
->dump_stream("shard") << *p
;
7665 q
.f
->close_section();
7667 q
.f
->close_section();
7670 q
.f
->close_section();
7671 return forward_event();
// All replicas have activated: flip the PG to ACTIVE (or merely PEERED when
// the acting set is below the pool's min_size), persist the new
// last_epoch/interval_started history, and release any ops that were queued
// waiting for the PG to become peered.
boost::statechart::result
PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
{
  PG *pg = context< RecoveryMachine >().pg;
  all_replicas_activated = true;

  pg->state_clear(PG_STATE_ACTIVATING);
  pg->state_clear(PG_STATE_CREATING);
  if (pg->acting.size() >= pg->pool.info.min_size) {
    pg->state_set(PG_STATE_ACTIVE);
  } else {
    // not enough acting instances to serve I/O, but data is consistent
    pg->state_set(PG_STATE_PEERED);
  }

  // info.last_epoch_started is set during activate()
  pg->info.history.last_epoch_started = pg->info.last_epoch_started;
  pg->info.history.last_interval_started = pg->info.last_interval_started;
  pg->dirty_info = true;

  pg->share_pg_info();
  pg->publish_stats_to_osd();

  pg->check_local();

  // waiters: requeue now, or park them on waiting_for_flush until the
  // in-progress flushes complete.
  if (pg->flushes_in_progress == 0) {
    pg->requeue_ops(pg->waiting_for_peered);
  } else if (!pg->waiting_for_peered.empty()) {
    ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
                       << pg->waiting_for_peered.size()
                       << " items to waiting_for_flush" << dendl;
    assert(pg->waiting_for_flush.empty());
    pg->waiting_for_flush.swap(pg->waiting_for_peered);
  }

  pg->on_activate();

  return discard_event();
}

// Leaving Active: drop the local recovery reservation and clear every
// transient state bit owned by this state; record time spent for perf stats.
void PG::RecoveryState::Active::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);

  pg->blocked_by.clear();
  pg->backfill_reserved = false;
  pg->backfill_reserving = false;
  pg->state_clear(PG_STATE_ACTIVATING);
  pg->state_clear(PG_STATE_DEGRADED);
  pg->state_clear(PG_STATE_UNDERSIZED);
  pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
  pg->state_clear(PG_STATE_BACKFILL_WAIT);
  pg->state_clear(PG_STATE_RECOVERY_WAIT);
  pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
}
/*------ReplicaActive-----*/
// State for a non-primary shard that is actively serving as a replica.
PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  pg->start_flush(
    context< RecoveryMachine >().get_cur_transaction(),
    context< RecoveryMachine >().get_on_applied_context_list(),
    context< RecoveryMachine >().get_on_safe_context_list());
}

// Primary told us to activate: run activate() against the current
// transaction.  The query/info/ctx outputs are unused on a replica
// (query_map is discarded, info/ctx passed as NULL).
boost::statechart::result PG::RecoveryState::ReplicaActive::react(
  const Activate& actevt) {
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
  map<int, map<spg_t, pg_query_t> > query_map;
  pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
               actevt.activation_epoch,
               *context< RecoveryMachine >().get_on_safe_context_list(),
               query_map, NULL, NULL);
  ldout(pg->cct, 10) << "Activate Finished" << dendl;
  return discard_event();
}

// Info update from the primary: fold it into our local info.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
                        infoevt.info);
  return discard_event();
}

// Log update from the primary: merge it and verify our log head matches
// our recorded last_update.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
  pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
  assert(pg->pg_log.get_head() == pg->info.last_update);

  return discard_event();
}

// New OSDMap interval work: (re)notify the primary of our existence if a
// notify is due and a primary is known.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
    context< RecoveryMachine >().send_notify(
      pg->get_primary(),
      pg_notify_t(
        pg->get_primary().shard, pg->pg_whoami.shard,
        pg->get_osdmap()->get_epoch(),
        pg->get_osdmap()->get_epoch(),
        pg->info),
      pg->past_intervals);
  }
  pg->take_waiters();  // NOTE(review): reconstructed from gap — confirm against upstream
  return discard_event();
}

// A MISSING query from the primary is answered with our log; anything else
// predates activation and is ignored.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (query.query.type == pg_query_t::MISSING) {
    pg->update_history(query.query.history);
    pg->fulfill_log(query.from, query.query, query.query_epoch);
  } // else: from prior to activation, safe to ignore
  return discard_event();
}

// Dump our state name/enter time for `ceph pg ... query`.
boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->close_section();
  return forward_event();
}

// Leaving ReplicaActive: give back any remote recovery reservation held on
// behalf of the primary.
void PG::RecoveryState::ReplicaActive::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
}
/*------Stray---*/
// A shard that is neither primary nor (yet) an active replica.  It waits for
// the primary to either send us a log/info (promoting us to ReplicaActive)
// or to remove us.
PG::RecoveryState::Stray::Stray(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Stray")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  assert(!pg->is_peered());
  assert(!pg->is_peering());
  assert(!pg->is_primary());
  pg->start_flush(
    context< RecoveryMachine >().get_cur_transaction(),
    context< RecoveryMachine >().get_on_applied_context_list(),
    context< RecoveryMachine >().get_on_safe_context_list());
}

// Primary sent us info+log.  A last_backfill of hobject_t() means we are a
// fresh backfill target: adopt the primary's info wholesale and claim its
// log (rolling back anything local).  Otherwise merge normally.  Either way
// we activate and become a replica.
boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  MOSDPGLog *msg = logevt.msg.get();
  ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;

  ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
  if (msg->info.last_backfill == hobject_t()) {
    // restart backfill
    pg->unreg_next_scrub();
    pg->info = msg->info;
    pg->reg_next_scrub();
    pg->dirty_info = true;
    pg->dirty_big_info = true;  // maybe.

    PGLogEntryHandler rollbacker{pg, t};
    pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);

    pg->pg_log.reset_backfill();
  } else {
    pg->merge_log(*t, msg->info, msg->log, logevt.from);
  }

  assert(pg->pg_log.get_head() == pg->info.last_update);

  post_event(Activate(logevt.msg->info.last_epoch_started));
  return transit<ReplicaActive>();
}

// Primary sent only info (no log needed).  If our log runs ahead of the
// primary's last_update, rewind the divergent tail first; then activate.
boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;

  if (pg->info.last_update > infoevt.info.last_update) {
    // rewind divergent log entries
    ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
    pg->rewind_divergent_log(*t, infoevt.info.last_update);
    pg->info.stats = infoevt.info.stats;
    pg->info.hit_set = infoevt.info.hit_set;
  }

  assert(infoevt.info.last_update == pg->info.last_update);
  assert(pg->pg_log.get_head() == pg->info.last_update);

  post_event(Activate(infoevt.info.last_epoch_started));
  return transit<ReplicaActive>();
}

// Answer an INFO query with a notify, anything else with our log.
boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (query.query.type == pg_query_t::INFO) {
    pair<pg_shard_t, pg_info_t> notify_info;
    pg->update_history(query.query.history);
    pg->fulfill_info(query.from, query.query, notify_info);
    context< RecoveryMachine >().send_notify(
      notify_info.first,
      pg_notify_t(
        notify_info.first.shard, pg->pg_whoami.shard,
        query.query_epoch,
        pg->get_osdmap()->get_epoch(),
        notify_info.second),
      pg->past_intervals);
  } else {
    pg->fulfill_log(query.from, query.query, query.query_epoch);
  }
  return discard_event();
}

// New map interval: notify the primary that we exist, same as ReplicaActive.
boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
    context< RecoveryMachine >().send_notify(
      pg->get_primary(),
      pg_notify_t(
        pg->get_primary().shard, pg->pg_whoami.shard,
        pg->get_osdmap()->get_epoch(),
        pg->get_osdmap()->get_epoch(),
        pg->info),
      pg->past_intervals);
  }
  pg->take_waiters();  // NOTE(review): reconstructed from gap — confirm against upstream
  return discard_event();
}

void PG::RecoveryState::Stray::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
}
/*--------GetInfo---------*/
// First peering substate on the primary: build the prior set and collect
// pg_info_t from every shard we must probe before we can choose an
// authoritative log.
PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  pg->check_past_interval_bounds();
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  assert(pg->blocked_by.empty());

  prior_set = pg->build_prior();

  pg->reset_min_peer_features();
  get_infos();
  if (prior_set.pg_down) {
    post_event(IsDown());
  } else if (peer_info_requested.empty()) {
    // nothing to ask anyone — we already have all the infos we need
    post_event(GotInfo());
  }
}

// Send an INFO query to every probe-set peer we have not yet heard from;
// peers we are waiting on (or have already asked) are recorded in
// blocked_by for reporting purposes.
void PG::RecoveryState::GetInfo::get_infos()
{
  PG *pg = context< RecoveryMachine >().pg;
  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;

  pg->blocked_by.clear();
  for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
       it != prior_set.probe.end();
       ++it) {
    pg_shard_t peer = *it;
    if (peer == pg->pg_whoami) {
      continue;
    }
    if (pg->peer_info.count(peer)) {
      ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
      continue;
    }
    if (peer_info_requested.count(peer)) {
      ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
      pg->blocked_by.insert(peer.osd);
    } else if (!pg->get_osdmap()->is_up(peer.osd)) {
      ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
    } else {
      ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
      context< RecoveryMachine >().send_query(
        peer, pg_query_t(pg_query_t::INFO,
                         it->shard, pg->pg_whoami.shard,
                         pg->info.history,
                         pg->get_osdmap()->get_epoch()));
      peer_info_requested.insert(peer);
      pg->blocked_by.insert(peer.osd);
    }
  }

  pg->publish_stats_to_osd();
}

// A peer answered our INFO query.  Fold the info in; if it moved
// last_epoch_started forward the prior set is stale and must be rebuilt
// (and re-probed).  Once every outstanding request is answered and nothing
// is down, we are done here.
boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
{
  PG *pg = context< RecoveryMachine >().pg;

  set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
  if (p != peer_info_requested.end()) {
    peer_info_requested.erase(p);
    pg->blocked_by.erase(infoevt.from.osd);
  }

  epoch_t old_start = pg->info.history.last_epoch_started;
  if (pg->proc_replica_info(
        infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
    // we got something new ...
    PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
    if (old_start < pg->info.history.last_epoch_started) {
      ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
      prior_set = pg->build_prior();

      // filter out any osds that got dropped from the probe set from
      // peer_info_requested. this is less expensive than restarting
      // peering (which would re-probe everyone).
      set<pg_shard_t>::iterator p = peer_info_requested.begin();
      while (p != peer_info_requested.end()) {
        if (prior_set.probe.count(*p) == 0) {
          ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
          peer_info_requested.erase(p++);
        } else {
          ++p;
        }
      }
      get_infos();
    }
    ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
                       << hex << infoevt.features << dec << dendl;
    pg->apply_peer_features(infoevt.features);

    // are we done getting everything?
    if (peer_info_requested.empty() && !prior_set.pg_down) {
      ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
      ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
      ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
      post_event(GotInfo());
    }
  }
  return discard_event();
}

// Dump state plus the set of peers whose info we are still waiting on.
boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
{
  PG *pg = context< RecoveryMachine >().pg;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("requested_info_from");
  for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
       p != peer_info_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (pg->peer_info.count(*p)) {
      q.f->open_object_section("got_info");
      pg->peer_info[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::GetInfo::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}
/*------GetLog------------*/
// Choose the authoritative log shard and, if it is not us, request enough
// of its log (from the oldest last_update any acting/backfill peer still
// needs) to bring everyone contiguous.
PG::RecoveryState::GetLog::GetLog(my_context ctx)
  : my_base(ctx),
    NamedState(
      context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
    msg(0)
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;

  // adjust acting?
  if (!pg->choose_acting(auth_log_shard, false,
                         &context< Peering >().history_les_bound)) {
    if (!pg->want_acting.empty()) {
      post_event(NeedActingChange());
    } else {
      post_event(IsIncomplete());
    }
    return;
  }

  // am i the best?
  if (auth_log_shard == pg->pg_whoami) {
    post_event(GotLog());
    return;
  }

  const pg_info_t& best = pg->peer_info[auth_log_shard];

  // am i broken?
  if (pg->info.last_update < best.log_tail) {
    ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
    post_event(IsIncomplete());
    return;
  }

  // how much log to request?
  eversion_t request_log_from = pg->info.last_update;
  assert(!pg->actingbackfill.empty());
  for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
       p != pg->actingbackfill.end();
       ++p) {
    if (*p == pg->pg_whoami) continue;
    pg_info_t& ri = pg->peer_info[*p];
    // pull back far enough to cover peers contiguous with the auth log
    if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
        ri.last_update < request_log_from)
      request_log_from = ri.last_update;
  }

  // how much?
  ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
  context<RecoveryMachine>().send_query(
    auth_log_shard,
    pg_query_t(
      pg_query_t::LOG,
      auth_log_shard.shard, pg->pg_whoami.shard,
      request_log_from, pg->info.history,
      pg->get_osdmap()->get_epoch()));

  assert(pg->blocked_by.empty());
  pg->blocked_by.insert(auth_log_shard.osd);
  pg->publish_stats_to_osd();
}

// Restart peering if our chosen log source goes down.
boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
{
  PG *pg = context< RecoveryMachine >().pg;
  // make sure our log source didn't go down. we need to check
  // explicitly because it may not be part of the prior set, which
  // means the Peering state check won't catch it going down.
  if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
    ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
                       << auth_log_shard.osd << " went down" << dendl;
    post_event(advmap);
    return transit< Reset >();
  }

  // let the Peering state do its checks.
  return forward_event();
}

// Accept the master log only from the shard we asked; stash it and signal
// GotLog so processing happens in one place.
boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  assert(!msg);
  if (logevt.from != auth_log_shard) {
    ldout(pg->cct, 10) << "GetLog: discarding log from "
                       << "non-auth_log_shard osd." << logevt.from << dendl;
    return discard_event();
  }
  ldout(pg->cct, 10) << "GetLog: received master log from osd"
                     << logevt.from << dendl;
  msg = logevt.msg;
  post_event(GotLog());
  return discard_event();
}

// We have the authoritative log (ours, or the stashed message): merge it
// and move on to GetMissing.
boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "leaving GetLog" << dendl;
  if (msg) {
    ldout(pg->cct, 10) << "processing master log" << dendl;
    pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
                        msg->info, msg->log, msg->missing,
                        auth_log_shard);
  }
  pg->start_flush(
    context< RecoveryMachine >().get_cur_transaction(),
    context< RecoveryMachine >().get_on_applied_context_list(),
    context< RecoveryMachine >().get_on_safe_context_list());
  return transit< GetMissing >();
}

boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_stream("auth_log_shard") << auth_log_shard;
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::GetLog::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}
/*------WaitActingChange--------*/
// Parked while the monitor applies our requested pg_temp / acting-set
// change.  All peering messages are ignored until the map flips; we only
// watch for a wanted target going down (which forces a Reset).
PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
{
  context< RecoveryMachine >().log_enter(state_name);
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
{
  PG *pg = context< RecoveryMachine >().pg;
  OSDMapRef osdmap = advmap.osdmap;

  ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
  for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
    if (!osdmap->is_up(*p)) {
      ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
      post_event(advmap);
      return transit< Reset >();
    }
  }
  return forward_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  // (log message says "MLocRec" in the original — upstream typo preserved)
  ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for pg acting set to change");
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::WaitActingChange::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
}
/*------Down--------*/
// Not enough up prior-interval OSDs to peer; advertise which OSDs block us.
PG::RecoveryState::Down::Down(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
{
  context< RecoveryMachine >().log_enter(state_name);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_PEERING);
  pg->state_set(PG_STATE_DOWN);

  auto &prior_set = context< Peering >().prior_set;
  assert(pg->blocked_by.empty());
  pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pg->publish_stats_to_osd();
}

void PG::RecoveryState::Down::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_DOWN);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);

  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}

boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment",
                   "not enough up instances of this PG to go active");
  q.f->close_section();
  return forward_event();
}
/*------Incomplete--------*/
// Peered contact succeeded but no complete copy of the data is reachable.
// We retry when either the pool's min_size shrinks or a peer with new info
// shows up.
PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
{
  context< RecoveryMachine >().log_enter(state_name);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_PEERING);
  pg->state_set(PG_STATE_INCOMPLETE);

  PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
  assert(pg->blocked_by.empty());
  pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
  pg->publish_stats_to_osd();
}

boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
  PG *pg = context< RecoveryMachine >().pg;
  int64_t poolnum = pg->info.pgid.pool();

  // Reset if min_size turn smaller than previous value, pg might now be able to go active
  if (!advmap.osdmap->have_pg_pool(poolnum) ||
      advmap.lastmap->get_pools().find(poolnum)->second.min_size >
      advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
    post_event(advmap);
    return transit< Reset >();
  }

  return forward_event();
}

boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
  if (pg->proc_replica_info(
        notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
    // We got something new, try again!
    return transit< GetLog >();
  } else {
    return discard_event();
  }
}

boost::statechart::result PG::RecoveryState::Incomplete::react(
  const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "not enough complete instances of this PG");
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::Incomplete::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;

  pg->state_clear(PG_STATE_INCOMPLETE);
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);

  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}
/*------GetMissing--------*/
// Ask every acting/backfill peer that could hold divergent objects for its
// log+missing set.  Peers that are empty, fully-backfilling, or provably up
// to date are skipped (their missing set is just cleared).
PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
{
  context< RecoveryMachine >().log_enter(state_name);

  PG *pg = context< RecoveryMachine >().pg;
  assert(!pg->actingbackfill.empty());
  eversion_t since;
  for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
       i != pg->actingbackfill.end();
       ++i) {
    if (*i == pg->get_primary()) continue;
    const pg_info_t& pi = pg->peer_info[*i];
    // reset this so to make sure the pg_missing_t is initialized and
    // has the correct semantics even if we don't need to get a
    // missing set from a shard. This way later additions due to
    // lost+unfound delete work properly.
    pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();

    if (pi.is_empty())
      continue; // no pg data, nothing divergent

    if (pi.last_update < pg->pg_log.get_tail()) {
      ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
      pg->peer_missing[*i].clear();
      continue;
    }
    if (pi.last_backfill == hobject_t()) {
      ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
      pg->peer_missing[*i].clear();
      continue;
    }

    if (pi.last_update == pi.last_complete &&  // peer has no missing
        pi.last_update == pg->info.last_update) { // peer is up to date
      // replica has no missing and identical log as us. no need to
      // pull anything.
      // FIXME: we can do better here. if last_update==last_complete we
      //        can infer the rest!
      ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
      pg->peer_missing[*i].clear();
      continue;
    }

    // We pull the log from the peer's last_epoch_started to ensure we
    // get enough log to detect divergent updates.
    since.epoch = pi.last_epoch_started;
    assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
    if (pi.log_tail <= since) {
      ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
      context< RecoveryMachine >().send_query(
        *i,
        pg_query_t(
          pg_query_t::LOG,
          i->shard, pg->pg_whoami.shard,
          since, pg->info.history,
          pg->get_osdmap()->get_epoch()));
    } else {
      ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
                         << " (want since " << since << " < log.tail "
                         << pi.log_tail << ")" << dendl;
      context< RecoveryMachine >().send_query(
        *i, pg_query_t(
          pg_query_t::FULLLOG,
          i->shard, pg->pg_whoami.shard,
          pg->info.history, pg->get_osdmap()->get_epoch()));
    }
    peer_missing_requested.insert(*i);
    pg->blocked_by.insert(i->osd);
  }

  if (peer_missing_requested.empty()) {
    if (pg->need_up_thru) {
      ldout(pg->cct, 10) << " still need up_thru update before going active"
                         << dendl;
      post_event(NeedUpThru());
      return;
    }

    // all good!
    post_event(Activate(pg->get_osdmap()->get_epoch()));
  } else {
    pg->publish_stats_to_osd();
  }
}

// One of the requested log+missing replies arrived; when the last one is in
// we can either wait for up_thru or activate immediately.
boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;

  peer_missing_requested.erase(logevt.from);
  pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);

  if (peer_missing_requested.empty()) {
    if (pg->need_up_thru) {
      ldout(pg->cct, 10) << " still need up_thru update before going active"
                         << dendl;
      post_event(NeedUpThru());
    } else {
      ldout(pg->cct, 10) << "Got last missing, don't need missing "
                         << "posting Activate" << dendl;
      post_event(Activate(pg->get_osdmap()->get_epoch()));
    }
  }
  return discard_event();
}

// Dump the peers we are still waiting on (plus anything already received).
boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
{
  PG *pg = context< RecoveryMachine >().pg;
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;

  q.f->open_array_section("peer_missing_requested");
  for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
       p != peer_missing_requested.end();
       ++p) {
    q.f->open_object_section("osd");
    q.f->dump_stream("osd") << *p;
    if (pg->peer_missing.count(*p)) {
      q.f->open_object_section("got_missing");
      pg->peer_missing[*p].dump(q.f);
      q.f->close_section();
    }
    q.f->close_section();
  }
  q.f->close_section();

  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::GetMissing::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
  pg->blocked_by.clear();
  pg->publish_stats_to_osd();
}
/*------WaitUpThru--------*/
// Everything else is ready; we are only waiting for the OSDMap to record a
// new up_thru for this OSD before going active.
PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
  : my_base(ctx),
    NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
{
  context< RecoveryMachine >().log_enter(state_name);
}

boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
{
  PG *pg = context< RecoveryMachine >().pg;
  if (!pg->need_up_thru) {
    post_event(Activate(pg->get_osdmap()->get_epoch()));
  }
  return forward_event();
}

// Late log+missing replies can still arrive here; record them so recovery
// has accurate peer state.
boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
  pg->peer_missing[logevt.from].claim(logevt.msg->missing);
  pg->peer_info[logevt.from] = logevt.msg->info;
  return discard_event();
}

boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
{
  q.f->open_object_section("state");
  q.f->dump_string("name", state_name);
  q.f->dump_stream("enter_time") << enter_time;
  q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
  q.f->close_section();
  return forward_event();
}

void PG::RecoveryState::WaitUpThru::exit()
{
  context< RecoveryMachine >().log_exit(state_name, enter_time);
  PG *pg = context< RecoveryMachine >().pg;
  utime_t dur = ceph_clock_now() - enter_time;
  pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
}
/*----RecoveryState::RecoveryMachine Methods-----*/
#undef dout_prefix
#define dout_prefix *_dout << pg->gen_prefix()

// Log entry into a state and bump the per-state enter counter.
void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
{
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 5) << "enter " << state_name << dendl;
  pg->osd->pg_recovery_stats.log_enter(state_name);
}

// Log exit from a state, feed duration/event tallies into
// pg_recovery_stats, and reset the per-state counters.
void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
{
  utime_t dur = ceph_clock_now() - enter_time;
  PG *pg = context< RecoveryMachine >().pg;
  ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
  pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
                                      event_count, event_time);
  event_count = 0;
  event_time = utime_t();
}
/*---------------------------------------------------*/
#undef dout_prefix
#define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")

// Begin processing with a caller-supplied RecoveryCtx.  While outgoing
// messages are blocked (messages_pending_flush set), route them into the
// buffer instead of the real context.
// NOTE(review): several guard lines here were lost in extraction and are
// reconstructed — confirm the asserts/else branch against upstream.
void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
  assert(!rctx);
  assert(!orig_ctx);
  orig_ctx = new_ctx;
  if (new_ctx) {
    if (messages_pending_flush) {
      rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
    } else {
      rctx = *new_ctx;
    }
    rctx->start_time = ceph_clock_now();
  }
}

// Start buffering outgoing recovery messages until the transaction flushes.
void PG::RecoveryState::begin_block_outgoing() {
  assert(!messages_pending_flush);
  assert(orig_ctx);
  assert(rctx);
  messages_pending_flush = BufferedRecoveryMessages();
  rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
}

// Throw away any buffered outgoing messages (e.g. on interval change).
void PG::RecoveryState::clear_blocked_outgoing() {
  assert(orig_ctx);
  assert(rctx);
  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
}

// Stop buffering: replay the buffered messages into the real context.
void PG::RecoveryState::end_block_outgoing() {
  assert(messages_pending_flush);
  assert(orig_ctx);
  assert(rctx);

  rctx = RecoveryCtx(*orig_ctx);
  rctx->accept_buffered_messages(*messages_pending_flush);
  messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
}

// Finish processing: accumulate elapsed time/event count and drop the ctx.
void PG::RecoveryState::end_handle() {
  if (rctx) {
    utime_t dur = ceph_clock_now() - rctx->start_time;
    machine.event_time += dur;
  }

  machine.event_count++;
  rctx = boost::optional<RecoveryCtx>();
  orig_ctx = NULL;
}
// Human-readable dump of a BackfillInterval for logging.
ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
{
  out << "BackfillInfo(" << bi.begin << "-" << bi.end
      << " " << bi.objects.size() << " objects";
  if (!bi.objects.empty())
    out << " " << bi.objects;
  out << ")";
  return out;
}

// boost::intrusive_ptr<PG> hooks: delegate to PG's tagged refcounting.
void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8669 #ifdef PG_DEBUG_REFS
8670 uint64_t get_with_id(PG
*pg
) { return pg
->get_with_id(); }
8671 void put_with_id(PG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }