1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
21 #include "common/errno.h"
22 #include "common/config.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
69 #define tracepoint(...)
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
77 #define dout_prefix _prefix(_dout, this)
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
// Keys stored in the pgmeta object's omap.  Each is prefixed with '_' so
// that PGLog::read_log_and_missing() can easily skip them when scanning
// the omap for log entries.
const std::string infover_key("_infover");
const std::string info_key("_info");
const std::string biginfo_key("_biginfo");
const std::string epoch_key("_epoch");
const std::string fastinfo_key("_fastinfo");
88 static ostream
& _prefix(std::ostream
*_dout
, T
*t
)
90 return *_dout
<< t
->gen_prefix();
// Register PG::CephPeeringEvt allocations with the 'osd' mempool under the
// stats name 'pg_peering_evt', so peering-event memory use is accounted.
93 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt
, pg_peering_evt
, osd
);
95 void PGStateHistory::enter(PG
* pg
, const utime_t entime
, const char* state
)
97 // Ignore trimming state machine for now
98 if (::strstr(state
, "Trimming") != NULL
) {
100 } else if (pi
!= nullptr) {
101 pi
->enter_state(entime
, state
);
103 // Store current state since we can't reliably take the PG lock here
104 if ( tmppi
== nullptr) {
105 tmppi
= std::unique_ptr
<PGStateInstance
>(new PGStateInstance
);
109 tmppi
->enter_state(entime
, state
);
// Record exit from a PG state-machine state: move the staged instance
// ('tmppi') into the history buffer, stamp it with the current epoch and
// exit time.  Trimming states, and exits during PG destruction, are ignored.
// NOTE(review): several interior lines of this function are absent from
// this extract (gaps in the original numbering); the fragments below are
// preserved verbatim.
113 void PGStateHistory::exit(const char* state
) {
114 // Ignore trimming state machine for now
115 // Do nothing if PG is being destroyed!
116 if (::strstr(state
, "Trimming") != NULL
|| pg_in_destructor
) {
// Track whether we had to take the PG lock ourselves so it can be
// released on the matching (not visible here) path.
119 bool ilocked
= false;
120 if(!thispg
->is_locked()) {
// Transfer ownership of the staged instance into the history buffer and
// point the active-instance pointer 'pi' at it.
125 buffer
.push_back(std::unique_ptr
<PGStateInstance
>(tmppi
.release()));
126 pi
= buffer
.back().get();
127 pi
->setepoch(thispg
->get_osdmap()->get_epoch());
// Stamp the exit time; "Reset" marks an interval boundary below.
130 pi
->exit_state(ceph_clock_now());
131 if (::strcmp(state
, "Reset") == 0) {
140 void PGStateHistory::dump(Formatter
* f
) const {
141 f
->open_array_section("history");
142 for (auto pi
= buffer
.begin(); pi
!= buffer
.end(); ++pi
) {
143 f
->open_object_section("states");
144 f
->dump_stream("epoch") << (*pi
)->this_epoch
;
145 for (auto she
: (*pi
)->state_history
) {
146 f
->dump_string("state", std::get
<2>(she
));
147 f
->dump_stream("enter") << std::get
<0>(she
);
148 f
->dump_stream("exit") << std::get
<1>(she
);
// Take a reference on this PG, attributed to 'tag' for ref-leak debugging.
// NOTE(review): the interior of this function (the actual ref increment and
// the per-tag count update) is missing from this extract — confirm against
// the full source.
155 void PG::get(const char* tag
)
// Serialize updates to the debug ref-tracking state.
159 Mutex::Locker
l(_ref_id_lock
);
// Drop a reference on this PG previously taken with get(tag), updating the
// per-tag debug accounting.  NOTE(review): the trailing part of this
// function (the actual ref decrement / destruction path) is missing from
// this extract.
164 void PG::put(const char* tag
)
// Serialize updates to the debug ref-tracking state.
168 Mutex::Locker
l(_ref_id_lock
);
// Every put() must pair with a get() under the same tag, so an entry
// must exist.
169 auto tag_counts_entry
= _tag_counts
.find(tag
);
170 assert(tag_counts_entry
!= _tag_counts
.end());
171 --tag_counts_entry
->second
;
// Drop the map entry once the last reference under this tag is gone.
172 if (tag_counts_entry
->second
== 0) {
173 _tag_counts
.erase(tag_counts_entry
);
182 uint64_t PG::get_with_id()
185 Mutex::Locker
l(_ref_id_lock
);
186 uint64_t id
= ++_ref_id
;
190 dout(20) << __func__
<< ": " << info
.pgid
<< " got id " << id
<< " (new) ref==" << ref
<< dendl
;
191 assert(!_live_ids
.count(id
));
192 _live_ids
.insert(make_pair(id
, ss
.str()));
196 void PG::put_with_id(uint64_t id
)
198 dout(20) << __func__
<< ": " << info
.pgid
<< " put id " << id
<< " (current) ref==" << ref
<< dendl
;
200 Mutex::Locker
l(_ref_id_lock
);
201 assert(_live_ids
.count(id
));
208 void PG::dump_live_ids()
210 Mutex::Locker
l(_ref_id_lock
);
211 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live ids:" << dendl
;
212 for (map
<uint64_t, string
>::iterator i
= _live_ids
.begin();
213 i
!= _live_ids
.end();
215 dout(0) << "\t\tid: " << *i
<< dendl
;
217 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live tags:" << dendl
;
218 for (map
<string
, uint64_t>::iterator i
= _tag_counts
.begin();
219 i
!= _tag_counts
.end();
221 dout(0) << "\t\tid: " << *i
<< dendl
;
226 void PGPool::update(OSDMapRef map
)
228 const pg_pool_t
*pi
= map
->get_pg_pool(id
);
232 name
= map
->get_pool_name(id
);
233 bool updated
= false;
234 if ((map
->get_epoch() != cached_epoch
+ 1) ||
235 (pi
->get_snap_epoch() == map
->get_epoch())) {
237 pi
->build_removed_snaps(newly_removed_snaps
);
238 interval_set
<snapid_t
> intersection
;
239 intersection
.intersection_of(newly_removed_snaps
, cached_removed_snaps
);
240 if (intersection
== cached_removed_snaps
) {
241 newly_removed_snaps
.subtract(cached_removed_snaps
);
242 cached_removed_snaps
.union_of(newly_removed_snaps
);
244 lgeneric_subdout(cct
, osd
, 0) << __func__
245 << " cached_removed_snaps shrank from " << cached_removed_snaps
246 << " to " << newly_removed_snaps
<< dendl
;
247 cached_removed_snaps
= newly_removed_snaps
;
248 newly_removed_snaps
.clear();
250 snapc
= pi
->get_snap_context();
252 /* 1) map->get_epoch() == cached_epoch + 1 &&
253 * 2) pi->get_snap_epoch() != map->get_epoch()
255 * From the if branch, 1 && 2 must be true. From 2, we know that
256 * this map didn't change the set of removed snaps. From 1, we
257 * know that our cached_removed_snaps matches the previous map.
258 * Thus, from 1 && 2, cached_removed snaps matches the current
259 * set of removed snaps and all we have to do is clear
260 * newly_removed_snaps.
262 newly_removed_snaps
.clear();
264 cached_epoch
= map
->get_epoch();
265 lgeneric_subdout(cct
, osd
, 20)
266 << "PGPool::update cached_removed_snaps "
267 << cached_removed_snaps
268 << " newly_removed_snaps "
269 << newly_removed_snaps
270 << " snapc " << snapc
271 << (updated
? " (updated)":" (no change)")
275 PG::PG(OSDService
*o
, OSDMapRef curmap
,
276 const PGPool
&_pool
, spg_t p
) :
279 osdriver(osd
->store
, coll_t(), OSD::make_snapmapper_oid()),
284 p
.get_split_bits(curmap
->get_pg_num(_pool
.id
)),
287 osdmap_ref(curmap
), last_persisted_osdmap_ref(curmap
), pool(_pool
),
290 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
293 trace_endpoint("0.0.0.0", 0, "PG"),
294 dirty_info(false), dirty_big_info(false),
297 coll(p
), pg_log(cct
),
298 pgmeta_oid(p
.make_pgmeta_oid()),
301 curmap
->get_pools().at(p
.pgid
.pool()).ec_pool(),
303 stat_queue_item(this),
305 recovery_queued(false),
306 recovery_ops_active(0),
310 pg_whoami(osd
->whoami
, p
.shard
),
312 last_peering_reset(0),
313 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
314 backfill_reserved(false),
315 backfill_reserving(false),
316 flushes_in_progress(0),
317 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
318 pg_stats_publish_valid(false),
319 osr(osd
->osr_registry
.lookup_or_create(p
, (stringify(p
)))),
320 finish_sync_event(NULL
),
321 backoff_lock("PG::backoff_lock"),
322 scrub_after_recovery(false),
324 recovery_state(this),
326 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
327 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
328 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
332 osd
->add_pgid(p
, this);
335 std::stringstream ss
;
336 ss
<< "PG " << info
.pgid
;
337 trace_endpoint
.copy_name(ss
.str());
344 pgstate_history
.set_pg_in_destructor();
346 osd
->remove_pgid(info
.pgid
, this);
350 void PG::lock_suspend_timeout(ThreadPool::TPHandle
&handle
)
352 handle
.suspend_tp_timeout();
354 handle
.reset_tp_timeout();
// Acquire the PG's primary mutex.  'no_lockdep' is forwarded to the mutex
// so callers with known-safe ordering can bypass lockdep checking.
357 void PG::lock(bool no_lockdep
) const
359 _lock
.Lock(no_lockdep
);
// Dirty state must have been persisted before the lock was last dropped;
360 // if we have unrecorded dirty state with the lock dropped, there is a bug
362 assert(!dirty_big_info
);
364 dout(30) << "lock" << dendl
;
367 std::string
PG::gen_prefix() const
370 OSDMapRef mapref
= osdmap_ref
;
371 if (_lock
.is_locked_by_me()) {
372 out
<< "osd." << osd
->whoami
373 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
374 << " " << *this << " ";
376 out
<< "osd." << osd
->whoami
377 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
378 << " pg[" << info
.pgid
<< "(unlocked)] ";
383 /********* PG **********/
385 void PG::proc_master_log(
386 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
387 pg_log_t
&olog
, pg_missing_t
& omissing
, pg_shard_t from
)
389 dout(10) << "proc_master_log for osd." << from
<< ": "
390 << olog
<< " " << omissing
<< dendl
;
391 assert(!is_peered() && is_primary());
393 // merge log into our own log to build master log. no need to
394 // make any adjustments to their missing map; we are taking their
395 // log to be authoritative (i.e., their entries are by definitely
397 merge_log(t
, oinfo
, olog
, from
);
398 peer_info
[from
] = oinfo
;
399 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
400 might_have_unfound
.insert(from
);
402 // See doc/dev/osd_internals/last_epoch_started
403 if (oinfo
.last_epoch_started
> info
.last_epoch_started
) {
404 info
.last_epoch_started
= oinfo
.last_epoch_started
;
407 if (oinfo
.last_interval_started
> info
.last_interval_started
) {
408 info
.last_interval_started
= oinfo
.last_interval_started
;
411 update_history(oinfo
.history
);
412 assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
413 info
.last_epoch_started
>= info
.history
.last_epoch_started
);
415 peer_missing
[from
].claim(omissing
);
418 void PG::proc_replica_log(
420 const pg_log_t
&olog
,
421 pg_missing_t
& omissing
,
424 dout(10) << "proc_replica_log for osd." << from
<< ": "
425 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
427 pg_log
.proc_replica_log(oinfo
, olog
, omissing
, from
);
429 peer_info
[from
] = oinfo
;
430 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
431 might_have_unfound
.insert(from
);
433 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
434 omissing
.get_items().begin();
435 i
!= omissing
.get_items().end();
437 dout(20) << " after missing " << i
->first
<< " need " << i
->second
.need
438 << " have " << i
->second
.have
<< dendl
;
440 peer_missing
[from
].claim(omissing
);
443 bool PG::proc_replica_info(
444 pg_shard_t from
, const pg_info_t
&oinfo
, epoch_t send_epoch
)
446 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.find(from
);
447 if (p
!= peer_info
.end() && p
->second
.last_update
== oinfo
.last_update
) {
448 dout(10) << " got dup osd." << from
<< " info " << oinfo
<< ", identical to ours" << dendl
;
452 if (!get_osdmap()->has_been_up_since(from
.osd
, send_epoch
)) {
453 dout(10) << " got info " << oinfo
<< " from down osd." << from
454 << " discarding" << dendl
;
458 dout(10) << " got osd." << from
<< " " << oinfo
<< dendl
;
459 assert(is_primary());
460 peer_info
[from
] = oinfo
;
461 might_have_unfound
.insert(from
);
463 update_history(oinfo
.history
);
466 if (!is_up(from
) && !is_acting(from
)) {
467 dout(10) << " osd." << from
<< " has stray content: " << oinfo
<< dendl
;
468 stray_set
.insert(from
);
474 // was this a new info? if so, update peers!
475 if (p
== peer_info
.end())
476 update_heartbeat_peers();
481 void PG::remove_snap_mapped_object(
482 ObjectStore::Transaction
&t
, const hobject_t
&soid
)
486 ghobject_t(soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
));
487 clear_object_snap_mapping(&t
, soid
);
490 void PG::clear_object_snap_mapping(
491 ObjectStore::Transaction
*t
, const hobject_t
&soid
)
493 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
494 if (soid
.snap
< CEPH_MAXSNAP
) {
495 int r
= snap_mapper
.remove_oid(
498 if (!(r
== 0 || r
== -ENOENT
)) {
499 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
505 void PG::update_object_snap_mapping(
506 ObjectStore::Transaction
*t
, const hobject_t
&soid
, const set
<snapid_t
> &snaps
)
508 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
509 assert(soid
.snap
< CEPH_MAXSNAP
);
510 int r
= snap_mapper
.remove_oid(
513 if (!(r
== 0 || r
== -ENOENT
)) {
514 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
524 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
, pg_shard_t from
)
526 PGLogEntryHandler rollbacker
{this, &t
};
528 oinfo
, olog
, from
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
531 void PG::rewind_divergent_log(ObjectStore::Transaction
& t
, eversion_t newhead
)
533 PGLogEntryHandler rollbacker
{this, &t
};
534 pg_log
.rewind_divergent_log(
535 newhead
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
539 * Process information from a replica to determine if it could have any
540 * objects that i need.
542 * TODO: if the missing set becomes very large, this could get expensive.
543 * Instead, we probably want to just iterate over our unfound set.
545 bool PG::search_for_missing(
546 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
550 uint64_t num_unfound_before
= missing_loc
.num_unfound();
551 bool found_missing
= missing_loc
.add_source_info(
552 from
, oinfo
, omissing
, ctx
->handle
);
553 if (found_missing
&& num_unfound_before
!= missing_loc
.num_unfound())
554 publish_stats_to_osd();
556 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
557 CEPH_FEATURE_OSD_ERASURE_CODES
)) {
558 pg_info_t
tinfo(oinfo
);
559 tinfo
.pgid
.shard
= pg_whoami
.shard
;
560 (*(ctx
->info_map
))[from
.osd
].push_back(
563 from
.shard
, pg_whoami
.shard
,
564 get_osdmap()->get_epoch(),
565 get_osdmap()->get_epoch(),
569 return found_missing
;
572 bool PG::MissingLoc::readable_with_acting(
573 const hobject_t
&hoid
,
574 const set
<pg_shard_t
> &acting
) const {
575 if (!needs_recovery(hoid
)) return true;
576 auto missing_loc_entry
= missing_loc
.find(hoid
);
577 if (missing_loc_entry
== missing_loc
.end()) return false;
578 const set
<pg_shard_t
> &locs
= missing_loc_entry
->second
;
579 ldout(pg
->cct
, 10) << __func__
<< ": locs:" << locs
<< dendl
;
580 set
<pg_shard_t
> have_acting
;
581 for (set
<pg_shard_t
>::const_iterator i
= locs
.begin();
584 if (acting
.count(*i
))
585 have_acting
.insert(*i
);
587 return (*is_readable
)(have_acting
);
590 void PG::MissingLoc::add_batch_sources_info(
591 const set
<pg_shard_t
> &sources
, ThreadPool::TPHandle
* handle
)
593 ldout(pg
->cct
, 10) << __func__
<< ": adding sources in batch "
594 << sources
.size() << dendl
;
596 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= needs_recovery_map
.begin();
597 i
!= needs_recovery_map
.end();
599 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
600 handle
->reset_tp_timeout();
603 missing_loc
[i
->first
].insert(sources
.begin(), sources
.end());
604 missing_loc_sources
.insert(sources
.begin(), sources
.end());
608 bool PG::MissingLoc::add_source_info(
610 const pg_info_t
&oinfo
,
611 const pg_missing_t
&omissing
,
612 ThreadPool::TPHandle
* handle
)
614 bool found_missing
= false;
617 for (map
<hobject_t
,pg_missing_item
>::const_iterator p
= needs_recovery_map
.begin();
618 p
!= needs_recovery_map
.end();
620 const hobject_t
&soid(p
->first
);
621 eversion_t need
= p
->second
.need
;
622 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
623 handle
->reset_tp_timeout();
626 if (oinfo
.last_update
< need
) {
627 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
628 << " also missing on osd." << fromosd
629 << " (last_update " << oinfo
.last_update
630 << " < needed " << need
<< ")" << dendl
;
633 if (!oinfo
.last_backfill
.is_max() &&
634 !oinfo
.last_backfill_bitwise
) {
635 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
636 << " also missing on osd." << fromosd
637 << " (last_backfill " << oinfo
.last_backfill
638 << " but with wrong sort order)"
642 if (p
->first
>= oinfo
.last_backfill
) {
643 // FIXME: this is _probably_ true, although it could conceivably
644 // be in the undefined region! Hmm!
645 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
646 << " also missing on osd." << fromosd
647 << " (past last_backfill " << oinfo
.last_backfill
651 if (oinfo
.last_complete
< need
) {
652 if (omissing
.is_missing(soid
)) {
653 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
654 << " also missing on osd." << fromosd
<< dendl
;
659 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
660 << " is on osd." << fromosd
<< dendl
;
662 missing_loc
[soid
].insert(fromosd
);
663 missing_loc_sources
.insert(fromosd
);
664 found_missing
= true;
667 ldout(pg
->cct
, 20) << "needs_recovery_map missing " << needs_recovery_map
669 return found_missing
;
672 void PG::discover_all_missing(map
<int, map
<spg_t
,pg_query_t
> > &query_map
)
674 auto &missing
= pg_log
.get_missing();
675 uint64_t unfound
= get_num_unfound();
678 dout(10) << __func__
<< " "
679 << missing
.num_missing() << " missing, "
680 << unfound
<< " unfound"
683 std::set
<pg_shard_t
>::const_iterator m
= might_have_unfound
.begin();
684 std::set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
685 for (; m
!= mend
; ++m
) {
688 if (!get_osdmap()->is_up(peer
.osd
)) {
689 dout(20) << __func__
<< " skipping down osd." << peer
<< dendl
;
693 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(peer
);
694 if (iter
!= peer_info
.end() &&
695 (iter
->second
.is_empty() || iter
->second
.dne())) {
696 // ignore empty peers
700 // If we've requested any of this stuff, the pg_missing_t information
701 // should be on its way.
702 // TODO: coalsce requested_* into a single data structure
703 if (peer_missing
.find(peer
) != peer_missing
.end()) {
704 dout(20) << __func__
<< ": osd." << peer
705 << ": we already have pg_missing_t" << dendl
;
708 if (peer_log_requested
.find(peer
) != peer_log_requested
.end()) {
709 dout(20) << __func__
<< ": osd." << peer
710 << ": in peer_log_requested" << dendl
;
713 if (peer_missing_requested
.find(peer
) != peer_missing_requested
.end()) {
714 dout(20) << __func__
<< ": osd." << peer
715 << ": in peer_missing_requested" << dendl
;
720 dout(10) << __func__
<< ": osd." << peer
<< ": requesting pg_missing_t"
722 peer_missing_requested
.insert(peer
);
723 query_map
[peer
.osd
][spg_t(info
.pgid
.pgid
, peer
.shard
)] =
726 peer
.shard
, pg_whoami
.shard
,
727 info
.history
, get_osdmap()->get_epoch());
731 /******* PG ***********/
732 bool PG::needs_recovery() const
734 assert(is_primary());
736 auto &missing
= pg_log
.get_missing();
738 if (missing
.num_missing()) {
739 dout(10) << __func__
<< " primary has " << missing
.num_missing()
740 << " missing" << dendl
;
744 assert(!actingbackfill
.empty());
745 set
<pg_shard_t
>::const_iterator end
= actingbackfill
.end();
746 set
<pg_shard_t
>::const_iterator a
= actingbackfill
.begin();
747 for (; a
!= end
; ++a
) {
748 if (*a
== get_primary()) continue;
749 pg_shard_t peer
= *a
;
750 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
751 if (pm
== peer_missing
.end()) {
752 dout(10) << __func__
<< " osd." << peer
<< " doesn't have missing set"
756 if (pm
->second
.num_missing()) {
757 dout(10) << __func__
<< " osd." << peer
<< " has "
758 << pm
->second
.num_missing() << " missing" << dendl
;
763 dout(10) << __func__
<< " is recovered" << dendl
;
767 bool PG::needs_backfill() const
769 assert(is_primary());
771 // We can assume that only possible osds that need backfill
772 // are on the backfill_targets vector nodes.
773 set
<pg_shard_t
>::const_iterator end
= backfill_targets
.end();
774 set
<pg_shard_t
>::const_iterator a
= backfill_targets
.begin();
775 for (; a
!= end
; ++a
) {
776 pg_shard_t peer
= *a
;
777 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
778 if (!pi
->second
.last_backfill
.is_max()) {
779 dout(10) << __func__
<< " osd." << peer
<< " has last_backfill " << pi
->second
.last_backfill
<< dendl
;
784 dout(10) << __func__
<< " does not need backfill" << dendl
;
789 void PG::check_past_interval_bounds() const
791 auto rpib
= get_required_past_interval_bounds(
793 osd
->get_superblock().oldest_map
);
794 if (rpib
.first
>= rpib
.second
) {
795 if (!past_intervals
.empty()) {
796 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
797 << " empty [" << rpib
<< ") but past_intervals is not: "
799 derr
<< info
.pgid
<< " required past_interval bounds are"
800 << " empty [" << rpib
<< ") but past_intervals is not: "
801 << past_intervals
<< dendl
;
804 if (past_intervals
.empty()) {
805 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
806 << " not empty [" << rpib
<< ") but past_intervals "
807 << past_intervals
<< " is empty";
808 derr
<< info
.pgid
<< " required past_interval bounds are"
809 << " not empty [" << rpib
<< ") but past_intervals "
810 << past_intervals
<< " is empty" << dendl
;
811 assert(!past_intervals
.empty());
814 auto apib
= past_intervals
.get_bounds();
815 if (apib
.first
> rpib
.first
) {
816 osd
->clog
->error() << info
.pgid
<< " past_intervals [" << apib
817 << ") start interval does not contain the required"
818 << " bound [" << rpib
<< ") start";
819 derr
<< info
.pgid
<< " past_intervals [" << apib
820 << ") start interval does not contain the required"
821 << " bound [" << rpib
<< ") start" << dendl
;
822 assert(0 == "past_interval start interval mismatch");
824 if (apib
.second
!= rpib
.second
) {
825 osd
->clog
->error() << info
.pgid
<< " past_interal bound [" << apib
826 << ") end does not match required [" << rpib
828 derr
<< info
.pgid
<< " past_interal bound [" << apib
829 << ") end does not match required [" << rpib
831 assert(0 == "past_interval end mismatch");
836 bool PG::adjust_need_up_thru(const OSDMapRef osdmap
)
838 epoch_t up_thru
= osdmap
->get_up_thru(osd
->whoami
);
840 up_thru
>= info
.history
.same_interval_since
) {
841 dout(10) << "adjust_need_up_thru now " << up_thru
<< ", need_up_thru now false" << dendl
;
842 need_up_thru
= false;
848 void PG::remove_down_peer_info(const OSDMapRef osdmap
)
850 // Remove any downed osds from peer_info
851 bool removed
= false;
852 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
853 while (p
!= peer_info
.end()) {
854 if (!osdmap
->is_up(p
->first
.osd
)) {
855 dout(10) << " dropping down osd." << p
->first
<< " info " << p
->second
<< dendl
;
856 peer_missing
.erase(p
->first
);
857 peer_log_requested
.erase(p
->first
);
858 peer_missing_requested
.erase(p
->first
);
859 peer_info
.erase(p
++);
865 // if we removed anyone, update peers (which include peer_info)
867 update_heartbeat_peers();
868 check_recovery_sources(osdmap
);
872 * Returns true unless there is a non-lost OSD in might_have_unfound.
874 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap
) const
876 assert(is_primary());
878 set
<pg_shard_t
>::const_iterator peer
= might_have_unfound
.begin();
879 set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
880 for (; peer
!= mend
; ++peer
) {
881 if (peer_missing
.count(*peer
))
883 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(*peer
);
884 if (iter
!= peer_info
.end() &&
885 (iter
->second
.is_empty() || iter
->second
.dne()))
887 if (!osdmap
->exists(peer
->osd
))
889 const osd_info_t
&osd_info(osdmap
->get_info(peer
->osd
));
890 if (osd_info
.lost_at
<= osd_info
.up_from
) {
891 // If there is even one OSD in might_have_unfound that isn't lost, we
892 // still might retrieve our unfound.
896 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
897 << " have been queried or are marked lost" << dendl
;
901 PastIntervals::PriorSet
PG::build_prior()
905 for (map
<pg_shard_t
,pg_info_t
>::iterator it
= peer_info
.begin();
906 it
!= peer_info
.end();
908 assert(info
.history
.last_epoch_started
>= it
->second
.history
.last_epoch_started
);
912 const OSDMap
&osdmap
= *get_osdmap();
913 PastIntervals::PriorSet prior
= past_intervals
.get_prior_set(
915 info
.history
.last_epoch_started
,
916 get_pgbackend()->get_is_recoverable_predicate(),
917 [&](epoch_t start
, int osd
, epoch_t
*lost_at
) {
918 const osd_info_t
*pinfo
= 0;
919 if (osdmap
.exists(osd
)) {
920 pinfo
= &osdmap
.get_info(osd
);
922 *lost_at
= pinfo
->lost_at
;
925 if (osdmap
.is_up(osd
)) {
926 return PastIntervals::UP
;
928 return PastIntervals::DNE
;
929 } else if (pinfo
->lost_at
> start
) {
930 return PastIntervals::LOST
;
932 return PastIntervals::DOWN
;
940 state_set(PG_STATE_DOWN
);
943 if (get_osdmap()->get_up_thru(osd
->whoami
) < info
.history
.same_interval_since
) {
944 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
945 << " < same_since " << info
.history
.same_interval_since
946 << ", must notify monitor" << dendl
;
949 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
950 << " >= same_since " << info
.history
.same_interval_since
951 << ", all is well" << dendl
;
952 need_up_thru
= false;
954 set_probe_targets(prior
.probe
);
958 void PG::clear_primary_state()
960 dout(10) << "clear_primary_state" << dendl
;
962 // clear peering state
964 peer_log_requested
.clear();
965 peer_missing_requested
.clear();
967 peer_missing
.clear();
968 need_up_thru
= false;
969 peer_last_complete_ondisk
.clear();
970 peer_activated
.clear();
971 min_last_complete_ondisk
= eversion_t();
972 pg_trim_to
= eversion_t();
973 might_have_unfound
.clear();
974 projected_log
= PGLog::IndexedLog();
976 last_update_ondisk
= eversion_t();
980 finish_sync_event
= 0; // so that _finish_recovery doesn't go off in another thread
984 release_pg_backoffs();
986 pg_log
.reset_recovery_pointers();
988 scrubber
.reserved_peers
.clear();
989 scrub_after_recovery
= false;
994 PG::Scrubber::Scrubber()
995 : reserved(false), reserve_failed(false),
998 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
999 must_scrub(false), must_deep_scrub(false), must_repair(false),
1001 num_digest_updates_pending(0),
1007 PG::Scrubber::~Scrubber() {}
1012 * Returns an iterator to the best info in infos sorted by:
1013 * 1) Prefer newer last_update
1014 * 2) Prefer longer tail if it brings another info into contiguity
1015 * 3) Prefer current primary
1017 map
<pg_shard_t
, pg_info_t
>::const_iterator
PG::find_best_info(
1018 const map
<pg_shard_t
, pg_info_t
> &infos
,
1019 bool restrict_to_up_acting
,
1020 bool *history_les_bound
) const
1022 assert(history_les_bound
);
1023 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1024 * to make changes to this process. Also, make sure to update it
1025 * when you find bugs! */
1026 eversion_t min_last_update_acceptable
= eversion_t::max();
1027 epoch_t max_last_epoch_started_found
= 0;
1028 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1031 if (!cct
->_conf
->osd_find_best_info_ignore_history_les
&&
1032 max_last_epoch_started_found
< i
->second
.history
.last_epoch_started
) {
1033 *history_les_bound
= true;
1034 max_last_epoch_started_found
= i
->second
.history
.last_epoch_started
;
1036 if (!i
->second
.is_incomplete() &&
1037 max_last_epoch_started_found
< i
->second
.last_epoch_started
) {
1038 max_last_epoch_started_found
= i
->second
.last_epoch_started
;
1041 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1044 if (max_last_epoch_started_found
<= i
->second
.last_epoch_started
) {
1045 if (min_last_update_acceptable
> i
->second
.last_update
)
1046 min_last_update_acceptable
= i
->second
.last_update
;
1049 if (min_last_update_acceptable
== eversion_t::max())
1052 map
<pg_shard_t
, pg_info_t
>::const_iterator best
= infos
.end();
1053 // find osd with newest last_update (oldest for ec_pool).
1054 // if there are multiples, prefer
1055 // - a longer tail, if it brings another peer into log contiguity
1056 // - the current primary
1057 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= infos
.begin();
1060 if (restrict_to_up_acting
&& !is_up(p
->first
) &&
1061 !is_acting(p
->first
))
1063 // Only consider peers with last_update >= min_last_update_acceptable
1064 if (p
->second
.last_update
< min_last_update_acceptable
)
1066 // Disqualify anyone with a too old last_epoch_started
1067 if (p
->second
.last_epoch_started
< max_last_epoch_started_found
)
1069 // Disqualify anyone who is incomplete (not fully backfilled)
1070 if (p
->second
.is_incomplete())
1072 if (best
== infos
.end()) {
1076 // Prefer newer last_update
1077 if (pool
.info
.require_rollback()) {
1078 if (p
->second
.last_update
> best
->second
.last_update
)
1080 if (p
->second
.last_update
< best
->second
.last_update
) {
1085 if (p
->second
.last_update
< best
->second
.last_update
)
1087 if (p
->second
.last_update
> best
->second
.last_update
) {
1093 // Prefer longer tail
1094 if (p
->second
.log_tail
> best
->second
.log_tail
) {
1096 } else if (p
->second
.log_tail
< best
->second
.log_tail
) {
1101 // prefer current primary (usually the caller), all things being equal
1102 if (p
->first
== pg_whoami
) {
1103 dout(10) << "calc_acting prefer osd." << p
->first
1104 << " because it is current primary" << dendl
;
1112 void PG::calc_ec_acting(
1113 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1115 const vector
<int> &acting
,
1116 pg_shard_t acting_primary
,
1117 const vector
<int> &up
,
1118 pg_shard_t up_primary
,
1119 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1120 bool restrict_to_up_acting
,
1122 set
<pg_shard_t
> *backfill
,
1123 set
<pg_shard_t
> *acting_backfill
,
1124 pg_shard_t
*want_primary
,
1127 vector
<int> want(size
, CRUSH_ITEM_NONE
);
1128 map
<shard_id_t
, set
<pg_shard_t
> > all_info_by_shard
;
1129 unsigned usable
= 0;
1130 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= all_info
.begin();
1131 i
!= all_info
.end();
1133 all_info_by_shard
[i
->first
.shard
].insert(i
->first
);
1135 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1136 ss
<< "For position " << (unsigned)i
<< ": ";
1137 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
&&
1138 !all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1139 all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.last_update
>=
1140 auth_log_shard
->second
.log_tail
) {
1141 ss
<< " selecting up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
)) << std::endl
;
1146 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
) {
1147 ss
<< " backfilling up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
))
1149 backfill
->insert(pg_shard_t(up
[i
], shard_id_t(i
)));
1152 if (acting
.size() > (unsigned)i
&& acting
[i
] != CRUSH_ITEM_NONE
&&
1153 !all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1154 all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.last_update
>=
1155 auth_log_shard
->second
.log_tail
) {
1156 ss
<< " selecting acting[i]: " << pg_shard_t(acting
[i
], shard_id_t(i
)) << std::endl
;
1157 want
[i
] = acting
[i
];
1159 } else if (!restrict_to_up_acting
) {
1160 for (set
<pg_shard_t
>::iterator j
= all_info_by_shard
[shard_id_t(i
)].begin();
1161 j
!= all_info_by_shard
[shard_id_t(i
)].end();
1163 assert(j
->shard
== i
);
1164 if (!all_info
.find(*j
)->second
.is_incomplete() &&
1165 all_info
.find(*j
)->second
.last_update
>=
1166 auth_log_shard
->second
.log_tail
) {
1167 ss
<< " selecting stray: " << *j
<< std::endl
;
1173 if (want
[i
] == CRUSH_ITEM_NONE
)
1174 ss
<< " failed to fill position " << (int)i
<< std::endl
;
1178 bool found_primary
= false;
1179 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1180 if (want
[i
] != CRUSH_ITEM_NONE
) {
1181 acting_backfill
->insert(pg_shard_t(want
[i
], shard_id_t(i
)));
1182 if (!found_primary
) {
1183 *want_primary
= pg_shard_t(want
[i
], shard_id_t(i
));
1184 found_primary
= true;
1188 acting_backfill
->insert(backfill
->begin(), backfill
->end());
1193 * calculate the desired acting set.
1195 * Choose an appropriate acting set. Prefer up[0], unless it is
1196 * incomplete, or another osd has a longer tail that allows us to
1197 * bring other up nodes up to date.
1199 void PG::calc_replicated_acting(
1200 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1202 const vector
<int> &acting
,
1203 pg_shard_t acting_primary
,
1204 const vector
<int> &up
,
1205 pg_shard_t up_primary
,
1206 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1207 bool restrict_to_up_acting
,
1209 set
<pg_shard_t
> *backfill
,
1210 set
<pg_shard_t
> *acting_backfill
,
1211 pg_shard_t
*want_primary
,
1214 ss
<< "calc_acting newest update on osd." << auth_log_shard
->first
1215 << " with " << auth_log_shard
->second
1216 << (restrict_to_up_acting
? " restrict_to_up_acting" : "") << std::endl
;
1217 pg_shard_t auth_log_shard_id
= auth_log_shard
->first
;
1220 map
<pg_shard_t
,pg_info_t
>::const_iterator primary
;
1222 !all_info
.find(up_primary
)->second
.is_incomplete() &&
1223 all_info
.find(up_primary
)->second
.last_update
>=
1224 auth_log_shard
->second
.log_tail
) {
1225 ss
<< "up_primary: " << up_primary
<< ") selected as primary" << std::endl
;
1226 primary
= all_info
.find(up_primary
); // prefer up[0], all things being equal
1228 assert(!auth_log_shard
->second
.is_incomplete());
1229 ss
<< "up[0] needs backfill, osd." << auth_log_shard_id
1230 << " selected as primary instead" << std::endl
;
1231 primary
= auth_log_shard
;
1234 ss
<< "calc_acting primary is osd." << primary
->first
1235 << " with " << primary
->second
<< std::endl
;
1236 *want_primary
= primary
->first
;
1237 want
->push_back(primary
->first
.osd
);
1238 acting_backfill
->insert(primary
->first
);
1239 unsigned usable
= 1;
1241 // select replicas that have log contiguity with primary.
1242 // prefer up, then acting, then any peer_info osds
1243 for (vector
<int>::const_iterator i
= up
.begin();
1246 pg_shard_t up_cand
= pg_shard_t(*i
, shard_id_t::NO_SHARD
);
1247 if (up_cand
== primary
->first
)
1249 const pg_info_t
&cur_info
= all_info
.find(up_cand
)->second
;
1250 if (cur_info
.is_incomplete() ||
1251 cur_info
.last_update
< MIN(
1252 primary
->second
.log_tail
,
1253 auth_log_shard
->second
.log_tail
)) {
1254 /* We include auth_log_shard->second.log_tail because in GetLog,
1255 * we will request logs back to the min last_update over our
1256 * acting_backfill set, which will result in our log being extended
1257 * as far backwards as necessary to pick up any peers which can
1258 * be log recovered by auth_log_shard's log */
1259 ss
<< " shard " << up_cand
<< " (up) backfill " << cur_info
<< std::endl
;
1260 backfill
->insert(up_cand
);
1261 acting_backfill
->insert(up_cand
);
1263 want
->push_back(*i
);
1264 acting_backfill
->insert(up_cand
);
1266 ss
<< " osd." << *i
<< " (up) accepted " << cur_info
<< std::endl
;
1270 // This no longer has backfill OSDs, but they are covered above.
1271 for (vector
<int>::const_iterator i
= acting
.begin();
1274 pg_shard_t
acting_cand(*i
, shard_id_t::NO_SHARD
);
1278 // skip up osds we already considered above
1279 if (acting_cand
== primary
->first
)
1281 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), acting_cand
.osd
);
1282 if (up_it
!= up
.end())
1285 const pg_info_t
&cur_info
= all_info
.find(acting_cand
)->second
;
1286 if (cur_info
.is_incomplete() ||
1287 cur_info
.last_update
< primary
->second
.log_tail
) {
1288 ss
<< " shard " << acting_cand
<< " (stray) REJECTED "
1289 << cur_info
<< std::endl
;
1291 want
->push_back(*i
);
1292 acting_backfill
->insert(acting_cand
);
1293 ss
<< " shard " << acting_cand
<< " (stray) accepted "
1294 << cur_info
<< std::endl
;
1299 if (restrict_to_up_acting
) {
1302 for (map
<pg_shard_t
,pg_info_t
>::const_iterator i
= all_info
.begin();
1303 i
!= all_info
.end();
1308 // skip up osds we already considered above
1309 if (i
->first
== primary
->first
)
1311 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
->first
.osd
);
1312 if (up_it
!= up
.end())
1314 vector
<int>::const_iterator acting_it
= find(
1315 acting
.begin(), acting
.end(), i
->first
.osd
);
1316 if (acting_it
!= acting
.end())
1319 if (i
->second
.is_incomplete() ||
1320 i
->second
.last_update
< primary
->second
.log_tail
) {
1321 ss
<< " shard " << i
->first
<< " (stray) REJECTED "
1322 << i
->second
<< std::endl
;
1324 want
->push_back(i
->first
.osd
);
1325 acting_backfill
->insert(i
->first
);
1326 ss
<< " shard " << i
->first
<< " (stray) accepted "
1327 << i
->second
<< std::endl
;
1336 * calculate the desired acting, and request a change with the monitor
1337 * if it differs from the current acting.
1339 * if restrict_to_up_acting=true, we filter out anything that's not in
1340 * up/acting. in order to lift this restriction, we need to
1341 * 1) check whether it's worth switching the acting set any time we get
1342 * a new pg info (not just here, when recovery finishes)
1343 * 2) check whether anything in want_acting went down on each new map
1344 * (and, if so, calculate a new want_acting)
1345 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1348 bool PG::choose_acting(pg_shard_t
&auth_log_shard_id
,
1349 bool restrict_to_up_acting
,
1350 bool *history_les_bound
)
1352 map
<pg_shard_t
, pg_info_t
> all_info(peer_info
.begin(), peer_info
.end());
1353 all_info
[pg_whoami
] = info
;
1355 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= all_info
.begin();
1356 p
!= all_info
.end();
1358 dout(10) << "calc_acting osd." << p
->first
<< " " << p
->second
<< dendl
;
1361 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
=
1362 find_best_info(all_info
, restrict_to_up_acting
, history_les_bound
);
1364 if (auth_log_shard
== all_info
.end()) {
1366 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1367 << " reverting to up" << dendl
;
1370 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1372 dout(10) << "choose_acting failed" << dendl
;
1373 assert(want_acting
.empty());
1378 assert(!auth_log_shard
->second
.is_incomplete());
1379 auth_log_shard_id
= auth_log_shard
->first
;
1381 set
<pg_shard_t
> want_backfill
, want_acting_backfill
;
1383 pg_shard_t want_primary
;
1385 if (!pool
.info
.ec_pool())
1386 calc_replicated_acting(
1388 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1394 restrict_to_up_acting
,
1397 &want_acting_backfill
,
1403 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1409 restrict_to_up_acting
,
1412 &want_acting_backfill
,
1415 dout(10) << ss
.str() << dendl
;
1417 unsigned num_want_acting
= 0;
1418 set
<pg_shard_t
> have
;
1419 for (int i
= 0; i
< (int)want
.size(); ++i
) {
1420 if (want
[i
] != CRUSH_ITEM_NONE
) {
1425 pool
.info
.ec_pool() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1429 // We go incomplete if below min_size for ec_pools since backfill
1430 // does not currently maintain rollbackability
1431 // Otherwise, we will go "peered", but not "active"
1432 if (num_want_acting
< pool
.info
.min_size
&&
1433 (pool
.info
.ec_pool() ||
1434 !cct
->_conf
->osd_allow_recovery_below_min_size
)) {
1435 want_acting
.clear();
1436 dout(10) << "choose_acting failed, below min size" << dendl
;
1440 /* Check whether we have enough acting shards to later perform recovery */
1441 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable_predicate(
1442 get_pgbackend()->get_is_recoverable_predicate());
1443 if (!(*recoverable_predicate
)(have
)) {
1444 want_acting
.clear();
1445 dout(10) << "choose_acting failed, not recoverable" << dendl
;
1449 if (want
!= acting
) {
1450 dout(10) << "choose_acting want " << want
<< " != acting " << acting
1451 << ", requesting pg_temp change" << dendl
;
1454 if (want_acting
== up
) {
1455 // There can't be any pending backfill if
1456 // want is the same as crush map up OSDs.
1457 assert(want_backfill
.empty());
1459 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1461 osd
->queue_want_pg_temp(info
.pgid
.pgid
, want
);
1464 want_acting
.clear();
1465 actingbackfill
= want_acting_backfill
;
1466 dout(10) << "actingbackfill is " << actingbackfill
<< dendl
;
1467 assert(backfill_targets
.empty() || backfill_targets
== want_backfill
);
1468 if (backfill_targets
.empty()) {
1469 // Caller is GetInfo
1470 backfill_targets
= want_backfill
;
1472 // Will not change if already set because up would have had to change
1473 // Verify that nothing in backfill is in stray_set
1474 for (set
<pg_shard_t
>::iterator i
= want_backfill
.begin();
1475 i
!= want_backfill
.end();
1477 assert(stray_set
.find(*i
) == stray_set
.end());
1479 dout(10) << "choose_acting want " << want
<< " (== acting) backfill_targets "
1480 << want_backfill
<< dendl
;
1484 /* Build the might_have_unfound set.
1486 * This is used by the primary OSD during recovery.
1488 * This set tracks the OSDs which might have unfound objects that the primary
1489 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1490 * will remove the OSD from the set.
// Build the set of shards that may hold copies of objects this (primary) PG
// still lists as unfound: everything past_intervals says might have them,
// plus every peer we currently hold a pg_info_t for.  Must only be called on
// the primary, and only when the set is empty (asserted below).
1492 void PG::build_might_have_unfound()
1494 assert(might_have_unfound
.empty());
1495 assert(is_primary());
1497 dout(10) << __func__
<< dendl
;
1499 check_past_interval_bounds();
// Seed from past intervals; the ec_pool() flag is forwarded so
// get_might_have_unfound can account for shard identity on EC pools.
1501 might_have_unfound
= past_intervals
.get_might_have_unfound(
1503 pool
.info
.ec_pool());
1505 // include any (stray) peers
1506 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
1507 p
!= peer_info
.end();
1509 might_have_unfound
.insert(p
->first
);
1511 dout(15) << __func__
<< ": built " << might_have_unfound
<< dendl
;
// Transaction-completion callback registered by PG::activate(): once the
// activation transaction commits, hand control back to the PG.
// NOTE(review): the members that hold the PG reference and the queueing
// epoch are not visible in this extract (source lines dropped) -- confirm
// against the full file.
1514 struct C_PG_ActivateCommitted
: public Context
{
1517 epoch_t activation_epoch
;
// Capture the PG, the epoch at queueing time (e), and the epoch the PG was
// activated in (ae).
1518 C_PG_ActivateCommitted(PG
*p
, epoch_t e
, epoch_t ae
)
1519 : pg(p
), epoch(e
), activation_epoch(ae
) {}
// On commit, forward both epochs to PG::_activate_committed(), which
// ignores the event if the PG has reset since `epoch`.
1520 void finish(int r
) override
{
1521 pg
->_activate_committed(epoch
, activation_epoch
);
1525 void PG::activate(ObjectStore::Transaction
& t
,
1526 epoch_t activation_epoch
,
1527 list
<Context
*>& tfin
,
1528 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1532 PastIntervals
> > > *activator_map
,
1535 assert(!is_peered());
1536 assert(scrubber
.callbacks
.empty());
1537 assert(callbacks_for_degraded_object
.empty());
1540 state_clear(PG_STATE_DOWN
);
1542 send_notify
= false;
1545 // only update primary last_epoch_started if we will go active
1546 if (acting
.size() >= pool
.info
.min_size
) {
1547 assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
1548 info
.last_epoch_started
<= activation_epoch
);
1549 info
.last_epoch_started
= activation_epoch
;
1550 info
.last_interval_started
= info
.history
.same_interval_since
;
1552 } else if (is_acting(pg_whoami
)) {
1553 /* update last_epoch_started on acting replica to whatever the primary sent
1554 * unless it's smaller (could happen if we are going peered rather than
1555 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1556 if (info
.last_epoch_started
< activation_epoch
) {
1557 info
.last_epoch_started
= activation_epoch
;
1558 info
.last_interval_started
= info
.history
.same_interval_since
;
1562 auto &missing
= pg_log
.get_missing();
1565 last_update_ondisk
= info
.last_update
;
1566 min_last_complete_ondisk
= eversion_t(0,0); // we don't know (yet)!
1568 last_update_applied
= info
.last_update
;
1569 last_rollback_info_trimmed_to_applied
= pg_log
.get_can_rollback_to();
1571 need_up_thru
= false;
1573 // write pg info, log
1575 dirty_big_info
= true; // maybe
1577 // find out when we commit
1578 t
.register_on_complete(
1579 new C_PG_ActivateCommitted(
1581 get_osdmap()->get_epoch(),
1584 // initialize snap_trimq
1586 dout(20) << "activate - purged_snaps " << info
.purged_snaps
1587 << " cached_removed_snaps " << pool
.cached_removed_snaps
<< dendl
;
1588 snap_trimq
= pool
.cached_removed_snaps
;
1589 interval_set
<snapid_t
> intersection
;
1590 intersection
.intersection_of(snap_trimq
, info
.purged_snaps
);
1591 if (intersection
== info
.purged_snaps
) {
1592 snap_trimq
.subtract(info
.purged_snaps
);
1594 dout(0) << "warning: info.purged_snaps (" << info
.purged_snaps
1595 << ") is not a subset of pool.cached_removed_snaps ("
1596 << pool
.cached_removed_snaps
<< ")" << dendl
;
1597 snap_trimq
.subtract(intersection
);
1601 // init complete pointer
1602 if (missing
.num_missing() == 0) {
1603 dout(10) << "activate - no missing, moving last_complete " << info
.last_complete
1604 << " -> " << info
.last_update
<< dendl
;
1605 info
.last_complete
= info
.last_update
;
1606 pg_log
.reset_recovery_pointers();
1608 dout(10) << "activate - not complete, " << missing
<< dendl
;
1609 pg_log
.activate_not_complete(info
);
1617 // start up replicas
1619 assert(!actingbackfill
.empty());
1620 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1621 i
!= actingbackfill
.end();
1623 if (*i
== pg_whoami
) continue;
1624 pg_shard_t peer
= *i
;
1625 assert(peer_info
.count(peer
));
1626 pg_info_t
& pi
= peer_info
[peer
];
1628 dout(10) << "activate peer osd." << peer
<< " " << pi
<< dendl
;
1631 pg_missing_t
& pm
= peer_missing
[peer
];
1633 bool needs_past_intervals
= pi
.dne();
1636 * cover case where peer sort order was different and
1637 * last_backfill cannot be interpreted
1639 bool force_restart_backfill
=
1640 !pi
.last_backfill
.is_max() &&
1641 !pi
.last_backfill_bitwise
;
1643 if (pi
.last_update
== info
.last_update
&& !force_restart_backfill
) {
1645 if (!pi
.last_backfill
.is_max())
1646 osd
->clog
->info() << info
.pgid
<< " continuing backfill to osd."
1648 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1649 << "] " << pi
.last_backfill
1650 << " to " << info
.last_update
;
1651 if (!pi
.is_empty() && activator_map
) {
1652 dout(10) << "activate peer osd." << peer
<< " is up to date, queueing in pending_activators" << dendl
;
1653 (*activator_map
)[peer
.osd
].push_back(
1656 peer
.shard
, pg_whoami
.shard
,
1657 get_osdmap()->get_epoch(),
1658 get_osdmap()->get_epoch(),
1662 dout(10) << "activate peer osd." << peer
<< " is up to date, but sending pg_log anyway" << dendl
;
1664 i
->shard
, pg_whoami
.shard
,
1665 get_osdmap()->get_epoch(), info
);
1668 pg_log
.get_tail() > pi
.last_update
||
1669 pi
.last_backfill
== hobject_t() ||
1670 force_restart_backfill
||
1671 (backfill_targets
.count(*i
) && pi
.last_backfill
.is_max())) {
1672 /* ^ This last case covers a situation where a replica is not contiguous
1673 * with the auth_log, but is contiguous with this replica. Reshuffling
1674 * the active set to handle this would be tricky, so instead we just go
1675 * ahead and backfill it anyway. This is probably preferable in any
1676 * case since the replica in question would have to be significantly
1680 osd
->clog
->debug() << info
.pgid
<< " starting backfill to osd." << peer
1681 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1682 << "] " << pi
.last_backfill
1683 << " to " << info
.last_update
;
1685 pi
.last_update
= info
.last_update
;
1686 pi
.last_complete
= info
.last_update
;
1687 pi
.set_last_backfill(hobject_t());
1688 pi
.last_epoch_started
= info
.last_epoch_started
;
1689 pi
.last_interval_started
= info
.last_interval_started
;
1690 pi
.history
= info
.history
;
1691 pi
.hit_set
= info
.hit_set
;
1692 pi
.stats
.stats
.clear();
1694 // initialize peer with our purged_snaps.
1695 pi
.purged_snaps
= info
.purged_snaps
;
1698 i
->shard
, pg_whoami
.shard
,
1699 get_osdmap()->get_epoch(), pi
);
1701 // send some recent log, so that op dup detection works well.
1702 m
->log
.copy_up_to(pg_log
.get_log(), cct
->_conf
->osd_min_pg_log_entries
);
1703 m
->info
.log_tail
= m
->log
.tail
;
1704 pi
.log_tail
= m
->log
.tail
; // sigh...
1709 assert(pg_log
.get_tail() <= pi
.last_update
);
1711 i
->shard
, pg_whoami
.shard
,
1712 get_osdmap()->get_epoch(), info
);
1713 // send new stuff to append to replicas log
1714 m
->log
.copy_after(pg_log
.get_log(), pi
.last_update
);
1717 // share past_intervals if we are creating the pg on the replica
1718 // based on whether our info for that peer was dne() *before*
1719 // updating pi.history in the backfill block above.
1720 if (m
&& needs_past_intervals
)
1721 m
->past_intervals
= past_intervals
;
1723 // update local version of peer's missing list!
1724 if (m
&& pi
.last_backfill
!= hobject_t()) {
1725 for (list
<pg_log_entry_t
>::iterator p
= m
->log
.log
.begin();
1726 p
!= m
->log
.log
.end();
1728 if (p
->soid
<= pi
.last_backfill
&&
1730 pm
.add_next_event(*p
);
1734 dout(10) << "activate peer osd." << peer
<< " sending " << m
->log
<< dendl
;
1735 //m->log.print(cout);
1736 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
1740 pi
.last_update
= info
.last_update
;
1742 // update our missing
1743 if (pm
.num_missing() == 0) {
1744 pi
.last_complete
= pi
.last_update
;
1745 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " uptodate" << dendl
;
1747 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " missing " << pm
<< dendl
;
1751 // Set up missing_loc
1752 set
<pg_shard_t
> complete_shards
;
1753 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1754 i
!= actingbackfill
.end();
1756 if (*i
== get_primary()) {
1757 missing_loc
.add_active_missing(missing
);
1758 if (!missing
.have_missing())
1759 complete_shards
.insert(*i
);
1761 auto peer_missing_entry
= peer_missing
.find(*i
);
1762 assert(peer_missing_entry
!= peer_missing
.end());
1763 missing_loc
.add_active_missing(peer_missing_entry
->second
);
1764 if (!peer_missing_entry
->second
.have_missing() &&
1765 peer_info
[*i
].last_backfill
.is_max())
1766 complete_shards
.insert(*i
);
1769 // If necessary, create might_have_unfound to help us find our unfound objects.
1770 // NOTE: It's important that we build might_have_unfound before trimming the
1772 might_have_unfound
.clear();
1773 if (needs_recovery()) {
1774 // If only one shard has missing, we do a trick to add all others as recovery
1775 // source, this is considered safe since the PGLogs have been merged locally,
1776 // and covers vast majority of the use cases, like one OSD/host is down for
1777 // a while for hardware repairing
1778 if (complete_shards
.size() + 1 == actingbackfill
.size()) {
1779 missing_loc
.add_batch_sources_info(complete_shards
, ctx
->handle
);
1781 missing_loc
.add_source_info(pg_whoami
, info
, pg_log
.get_missing(),
1783 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1784 i
!= actingbackfill
.end();
1786 if (*i
== pg_whoami
) continue;
1787 dout(10) << __func__
<< ": adding " << *i
<< " as a source" << dendl
;
1788 assert(peer_missing
.count(*i
));
1789 assert(peer_info
.count(*i
));
1790 missing_loc
.add_source_info(
1797 for (map
<pg_shard_t
, pg_missing_t
>::iterator i
= peer_missing
.begin();
1798 i
!= peer_missing
.end();
1800 if (is_actingbackfill(i
->first
))
1802 assert(peer_info
.count(i
->first
));
1804 peer_info
[i
->first
],
1810 build_might_have_unfound();
1812 state_set(PG_STATE_DEGRADED
);
1814 discover_all_missing(query_map
);
1818 if (get_osdmap()->get_pg_size(info
.pgid
.pgid
) > actingset
.size()) {
1819 state_set(PG_STATE_DEGRADED
);
1820 state_set(PG_STATE_UNDERSIZED
);
1823 state_set(PG_STATE_ACTIVATING
);
1824 release_pg_backoffs();
1825 projected_last_update
= info
.last_update
;
1827 if (acting
.size() >= pool
.info
.min_size
) {
1828 PGLogEntryHandler handler
{this, &t
};
1829 pg_log
.roll_forward(&handler
);
// Capability check for client ops: returns whether the requesting session's
// OSDCap grants the access this MOSDOp needs against this pool / namespace /
// object key.  Message types other than CEPH_MSG_OSD_OP are not checked here.
1833 bool PG::op_has_sufficient_caps(OpRequestRef
& op
)
1835 // only check MOSDOp
1836 if (op
->get_req()->get_type() != CEPH_MSG_OSD_OP
)
1839 const MOSDOp
*req
= static_cast<const MOSDOp
*>(op
->get_req());
// NOTE(review): get_priv() conventionally returns a referenced object; the
// matching put() is not visible in this extract -- confirm in the full file.
1841 Session
*session
= static_cast<Session
*>(req
->get_connection()->get_priv());
// No session attached to the connection: log and (presumably) deny -- the
// return statement itself is not visible in this extract.
1843 dout(0) << "op_has_sufficient_caps: no session for op " << *req
<< dendl
;
1846 OSDCap
& caps
= session
->caps
;
// The cap check keys on the object's locator key when present, otherwise on
// the object name.
1849 const string
&key
= req
->get_hobj().get_key().empty() ?
1850 req
->get_oid().name
:
1851 req
->get_hobj().get_key();
1853 bool cap
= caps
.is_capable(pool
.name
, req
->get_hobj().nspace
,
1855 op
->need_read_cap(),
1856 op
->need_write_cap(),
// Trace exactly what was asked for and the verdict, for cap debugging.
1859 dout(20) << "op_has_sufficient_caps pool=" << pool
.id
<< " (" << pool
.name
1860 << " " << req
->get_hobj().nspace
1861 << ") owner=" << pool
.auid
1862 << " need_read_cap=" << op
->need_read_cap()
1863 << " need_write_cap=" << op
->need_write_cap()
1864 << " classes=" << op
->classes()
1865 << " -> " << (cap
? "yes" : "NO")
// Called (via C_PG_ActivateCommitted) once the activation transaction has
// committed to disk.  `epoch` is the map epoch at the time the callback was
// queued; `activation_epoch` is the epoch the PG went active in.
// Three cases: the interval changed since queueing (ignore), we are primary
// (record our own activation, maybe finish), or we are a replica (notify the
// primary that our activation committed).
1870 void PG::_activate_committed(epoch_t epoch
, epoch_t activation_epoch
)
1873 if (pg_has_reset_since(epoch
)) {
// Stale callback from a previous interval -- nothing to do.
1874 dout(10) << "_activate_committed " << epoch
1875 << ", that was an old interval" << dendl
;
1876 } else if (is_primary()) {
// Primary: count ourselves as activated; when every member of
// actingbackfill has reported in, the activation is cluster-wide stable.
1877 peer_activated
.insert(pg_whoami
);
1878 dout(10) << "_activate_committed " << epoch
1879 << " peer_activated now " << peer_activated
1880 << " last_interval_started " << info
.history
.last_interval_started
1881 << " last_epoch_started " << info
.history
.last_epoch_started
1882 << " same_interval_since " << info
.history
.same_interval_since
<< dendl
;
1883 assert(!actingbackfill
.empty());
1884 if (peer_activated
.size() == actingbackfill
.size())
1885 all_activated_and_committed();
// Replica: tell the primary our activation committed via MOSDPGInfo.
1887 dout(10) << "_activate_committed " << epoch
<< " telling primary" << dendl
;
1888 MOSDPGInfo
*m
= new MOSDPGInfo(epoch
);
1889 pg_notify_t i
= pg_notify_t(
1890 get_primary().shard
, pg_whoami
.shard
,
1891 get_osdmap()->get_epoch(),
1892 get_osdmap()->get_epoch(),
// Advertise the activation epoch in the notify's history so the primary
// sees our last_epoch_started/last_interval_started as of activation.
1895 i
.info
.history
.last_epoch_started
= activation_epoch
;
1896 i
.info
.history
.last_interval_started
= i
.info
.history
.same_interval_since
;
// Enough replicas for full activity -> ACTIVE; otherwise only PEERED.
1897 if (acting
.size() >= pool
.info
.min_size
) {
1898 state_set(PG_STATE_ACTIVE
);
1900 state_set(PG_STATE_PEERED
);
1903 m
->pg_list
.push_back(make_pair(i
, PastIntervals()));
1904 osd
->send_message_osd_cluster(get_primary().osd
, m
, get_osdmap()->get_epoch());
// With no flushes outstanding, ops queued while un-peered can now proceed.
1907 if (flushes_in_progress
== 0) {
1908 requeue_ops(waiting_for_peered
);
// By this point the activation state must already be persisted.
1912 assert(!dirty_info
);
1918 * update info.history.last_epoch_started ONLY after we and all
1919 * replicas have activated AND committed the activate transaction
1920 * (i.e. the peering results are stable on disk).
// Primary-only: every member of actingbackfill (including ourselves) has
// activated AND committed the activation transaction, so the peering result
// is stable on disk everywhere.  Kick the state machine with
// AllReplicasActivated via the peering queue.
1922 void PG::all_activated_and_committed()
1924 dout(10) << "all_activated_and_committed" << dendl
;
1925 assert(is_primary());
1926 assert(peer_activated
.size() == actingbackfill
.size());
1927 assert(!actingbackfill
.empty());
1928 assert(blocked_by
.empty());
1930 queue_peering_event(
1932 std::make_shared
<CephPeeringEvt
>(
1933 get_osdmap()->get_epoch(),
1934 get_osdmap()->get_epoch(),
1935 AllReplicasActivated())));
// (Re)queue this PG for scrubbing on the OSD's scrub work queue.  Caller
// must hold the PG lock.  The dout messages show the two outcomes: already
// queued (no-op) vs newly queued; the boolean result presumably reflects
// whether queueing happened -- the return lines are not visible in this
// extract.
1938 bool PG::requeue_scrub(bool high_priority
)
1940 assert(is_locked());
1942 dout(10) << __func__
<< ": already queued" << dendl
;
1945 dout(10) << __func__
<< ": queueing" << dendl
;
// Mark queued before handing off to the OSD so a concurrent requeue sees it.
1946 scrub_queued
= true;
1947 osd
->queue_for_scrub(this, high_priority
);
// Queue this PG on the OSD's recovery work queue.  Only a peered primary may
// be queued; `front` requests front-of-queue placement.  Re-queueing while
// already queued is a logged no-op.
1952 void PG::queue_recovery(bool front
)
1954 if (!is_primary() || !is_peered()) {
1955 dout(10) << "queue_recovery -- not primary or not peered " << dendl
;
// A non-primary / un-peered PG must never have been left queued.
1956 assert(!recovery_queued
);
1957 } else if (recovery_queued
) {
1958 dout(10) << "queue_recovery -- already queued" << dendl
;
1960 dout(10) << "queue_recovery -- queuing" << dendl
;
1961 recovery_queued
= true;
1962 osd
->queue_for_recovery(this, front
);
// Transition this PG into the scrubbing state and consume the one-shot
// "must" flags: priority is boosted for an operator-requested scrub, and
// DEEP_SCRUB / REPAIR state bits are set as demanded.  Caller must hold the
// PG lock; a PG already scrubbing is rejected (the early return itself is
// not visible in this extract).
1966 bool PG::queue_scrub()
1968 assert(is_locked());
1969 if (is_scrubbing()) {
// Operator-requested scrubs run at osd_requested_scrub_priority; regular
// scrubs use the pool/config-derived priority.
1972 scrubber
.priority
= scrubber
.must_scrub
?
1973 cct
->_conf
->osd_requested_scrub_priority
: get_scrub_priority();
// must_* flags are one-shot: clear each as it is honored.
1974 scrubber
.must_scrub
= false;
1975 state_set(PG_STATE_SCRUBBING
);
1976 if (scrubber
.must_deep_scrub
) {
1977 state_set(PG_STATE_DEEP_SCRUB
);
1978 scrubber
.must_deep_scrub
= false;
1980 if (scrubber
.must_repair
|| scrubber
.auto_repair
) {
1981 state_set(PG_STATE_REPAIR
);
1982 scrubber
.must_repair
= false;
// Scrub priority for this PG: the per-pool SCRUB_PRIORITY option wins when
// it is set to a positive value, otherwise fall back to the global
// osd_scrub_priority config value.
1988 unsigned PG::get_scrub_priority()
1990 // a higher value -> a higher priority
1991 int pool_scrub_priority
= 0;
1992 pool
.info
.opts
.get(pool_opts_t::SCRUB_PRIORITY
, &pool_scrub_priority
);
1993 return pool_scrub_priority
> 0 ? pool_scrub_priority
: cct
->_conf
->osd_scrub_priority
;
// Completion callback queued by PG::finish_recovery(): when the final sync
// completes, call back into PG::_finish_recovery(), passing itself so the PG
// can verify this is still the current finish_sync_event.
// NOTE(review): the member holding the PG reference is not visible in this
// extract (source line dropped) -- confirm against the full file.
1996 struct C_PG_FinishRecovery
: public Context
{
1998 explicit C_PG_FinishRecovery(PG
*p
) : pg(p
) {}
1999 void finish(int r
) override
{
2000 pg
->_finish_recovery(this);
// Mark the PG clean, but only when the acting set is at the pool's full
// target size for this osdmap: record the clean epoch/interval in history,
// and drop past_intervals (no longer needed once clean).
2004 void PG::mark_clean()
2006 if (actingset
.size() == get_osdmap()->get_pg_size(info
.pgid
.pgid
)) {
2007 state_set(PG_STATE_CLEAN
);
2008 info
.history
.last_epoch_clean
= get_osdmap()->get_epoch();
2009 info
.history
.last_interval_clean
= info
.history
.same_interval_since
;
2010 past_intervals
.clear();
// past_intervals lives in the "big" info payload, so flag it dirty.
2011 dirty_big_info
= true;
// Recovery priority for this PG: the base recovery priority adjusted by the
// pool's RECOVERY_PRIORITY option, clamped into
// [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX].
2018 unsigned PG::get_recovery_priority()
2020 // a higher value -> a higher priority
2022 int pool_recovery_priority
= 0;
2023 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2025 int ret
= OSD_RECOVERY_PRIORITY_BASE
+ pool_recovery_priority
;
2027 // Clamp to valid range
2028 if (ret
> OSD_RECOVERY_PRIORITY_MAX
) {
2029 ret
= OSD_RECOVERY_PRIORITY_MAX
;
2030 } else if (ret
< OSD_RECOVERY_PRIORITY_MIN
) {
2031 ret
= OSD_RECOVERY_PRIORITY_MIN
;
// Compile-time guards: the clamp range must be sane, and MIN must be
// non-negative since the result is returned as unsigned.
2034 static_assert(OSD_RECOVERY_PRIORITY_MIN
< OSD_RECOVERY_PRIORITY_MAX
, "Invalid priority range");
2035 static_assert(OSD_RECOVERY_PRIORITY_MIN
>= 0, "Priority range must match unsigned type");
2037 return static_cast<unsigned>(ret
);
// Backfill priority for this PG.  Severity picks the base: below min_size
// (IO is blocked) is most urgent, then undersized, then merely degraded,
// then the plain backfill base.  The pool's RECOVERY_PRIORITY option is
// added on top, and the result is clamped to the valid priority range.
2040 unsigned PG::get_backfill_priority()
2042 // a higher value -> a higher priority
2044 int ret
= OSD_BACKFILL_PRIORITY_BASE
;
2045 if (acting
.size() < pool
.info
.min_size
) {
2046 // inactive: no. of replicas < min_size, highest priority since it blocks IO
// The further below min_size we are, the higher the boost.
2047 ret
= OSD_BACKFILL_INACTIVE_PRIORITY_BASE
+ (pool
.info
.min_size
- acting
.size());
2049 } else if (is_undersized()) {
2050 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2051 assert(pool
.info
.size
> actingset
.size());
2052 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
+ (pool
.info
.size
- actingset
.size());
2054 } else if (is_degraded()) {
2055 // degraded: baseline degraded
2056 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
2059 // Adjust with pool's recovery priority
2060 int pool_recovery_priority
= 0;
2061 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2062 ret
+= pool_recovery_priority
;
2064 // Clamp to valid range
2065 if (ret
> OSD_RECOVERY_PRIORITY_MAX
) {
2066 ret
= OSD_RECOVERY_PRIORITY_MAX
;
2067 } else if (ret
< OSD_RECOVERY_PRIORITY_MIN
) {
2068 ret
= OSD_RECOVERY_PRIORITY_MIN
;
2071 return static_cast<unsigned>(ret
);
// Recovery is complete (last_complete has caught up to last_update): tear
// down recovery state and queue a C_PG_FinishRecovery on `tfin` so the
// final cleanup (_finish_recovery) runs after everything is synced, without
// blocking here.
2074 void PG::finish_recovery(list
<Context
*>& tfin
)
2076 dout(10) << "finish_recovery" << dendl
;
2077 assert(info
.last_complete
== info
.last_update
);
2079 clear_recovery_state();
2082 * sync all this before purging strays. but don't block!
// finish_sync_event is remembered so _finish_recovery can tell whether the
// callback that eventually fires is still the current one.
2084 finish_sync_event
= new C_PG_FinishRecovery(this);
2085 tfin
.push_back(finish_sync_event
);
// Final recovery cleanup, invoked by C_PG_FinishRecovery.  Only acts if `c`
// is still the pending finish_sync_event; otherwise the event is stale
// (recovery was restarted or the interval changed) and is just logged.
2088 void PG::_finish_recovery(Context
*c
)
2095 if (c
== finish_sync_event
) {
2096 dout(10) << "_finish_recovery" << dendl
;
// Consume the event marker so later stale callbacks are recognized.
2097 finish_sync_event
= 0;
2100 publish_stats_to_osd();
// A scrub deferred until after recovery is now due; it is promoted to a
// deep scrub.
2102 if (scrub_after_recovery
) {
2103 dout(10) << "_finish_recovery requeueing for scrub" << dendl
;
2104 scrub_after_recovery
= false;
2105 scrubber
.must_deep_scrub
= true;
2109 dout(10) << "_finish_recovery -- stale" << dendl
;
// Account the start of one recovery operation on object `soid`: bump the
// active-op counter and inform the OSD (which tracks per-OSD recovery
// limits).  With DEBUG_RECOVERY_OIDS, also track the exact object set and
// assert against double-starting the same object.
2114 void PG::start_recovery_op(const hobject_t
& soid
)
2116 dout(10) << "start_recovery_op " << soid
2117 #ifdef DEBUG_RECOVERY_OIDS
2118 << " (" << recovering_oids
<< ")"
2121 assert(recovery_ops_active
>= 0);
2122 recovery_ops_active
++;
2123 #ifdef DEBUG_RECOVERY_OIDS
2124 assert(recovering_oids
.count(soid
) == 0);
2125 recovering_oids
.insert(soid
);
2127 osd
->start_recovery_op(this, soid
);
// Counterpart of start_recovery_op(): account the completion of one recovery
// operation on `soid`, decrement the active-op counter, and notify the OSD
// (`dequeue` is forwarded to the OSD-side bookkeeping).  With
// DEBUG_RECOVERY_OIDS, assert the object really was being recovered.
2130 void PG::finish_recovery_op(const hobject_t
& soid
, bool dequeue
)
2132 dout(10) << "finish_recovery_op " << soid
2133 #ifdef DEBUG_RECOVERY_OIDS
2134 << " (" << recovering_oids
<< ")"
2137 assert(recovery_ops_active
> 0);
2138 recovery_ops_active
--;
2139 #ifdef DEBUG_RECOVERY_OIDS
2140 assert(recovering_oids
.count(soid
));
2141 recovering_oids
.erase(soid
);
2143 osd
->finish_recovery_op(this, soid
, dequeue
);
2150 void PG::split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
2152 child
->update_snap_mapper_bits(split_bits
);
2153 child
->update_osdmap_ref(get_osdmap());
2158 pg_log
.split_into(child_pgid
, split_bits
, &(child
->pg_log
));
2159 child
->info
.last_complete
= info
.last_complete
;
2161 info
.last_update
= pg_log
.get_head();
2162 child
->info
.last_update
= child
->pg_log
.get_head();
2164 child
->info
.last_user_version
= info
.last_user_version
;
2166 info
.log_tail
= pg_log
.get_tail();
2167 child
->info
.log_tail
= child
->pg_log
.get_tail();
2169 if (info
.last_complete
< pg_log
.get_tail())
2170 info
.last_complete
= pg_log
.get_tail();
2171 if (child
->info
.last_complete
< child
->pg_log
.get_tail())
2172 child
->info
.last_complete
= child
->pg_log
.get_tail();
2175 child
->info
.history
= info
.history
;
2176 child
->info
.history
.epoch_created
= get_osdmap()->get_epoch();
2177 child
->info
.purged_snaps
= info
.purged_snaps
;
2179 if (info
.last_backfill
.is_max()) {
2180 child
->info
.set_last_backfill(hobject_t::get_max());
2182 // restart backfill on parent and child to be safe. we could
2183 // probably do better in the bitwise sort case, but it's more
2184 // fragile (there may be special work to do on backfill completion
2186 info
.set_last_backfill(hobject_t());
2187 child
->info
.set_last_backfill(hobject_t());
2190 child
->info
.stats
= info
.stats
;
2191 child
->info
.stats
.parent_split_bits
= split_bits
;
2192 info
.stats
.stats_invalid
= true;
2193 child
->info
.stats
.stats_invalid
= true;
2194 child
->info
.last_epoch_started
= info
.last_epoch_started
;
2195 child
->info
.last_interval_started
= info
.last_interval_started
;
2197 child
->snap_trimq
= snap_trimq
;
2199 // There can't be recovery/backfill going on now
2200 int primary
, up_primary
;
2201 vector
<int> newup
, newacting
;
2202 get_osdmap()->pg_to_up_acting_osds(
2203 child
->info
.pgid
.pgid
, &newup
, &up_primary
, &newacting
, &primary
);
2204 child
->init_primary_up_acting(
2209 child
->role
= OSDMap::calc_pg_role(osd
->whoami
, child
->acting
);
2211 // this comparison includes primary rank via pg_shard_t
2212 if (get_primary() != child
->get_primary())
2213 child
->info
.history
.same_primary_since
= get_osdmap()->get_epoch();
2215 child
->info
.stats
.up
= up
;
2216 child
->info
.stats
.up_primary
= up_primary
;
2217 child
->info
.stats
.acting
= acting
;
2218 child
->info
.stats
.acting_primary
= primary
;
2219 child
->info
.stats
.mapping_epoch
= get_osdmap()->get_epoch();
2222 child
->past_intervals
= past_intervals
;
2224 _split_into(child_pgid
, child
, split_bits
);
2226 // release all backoffs for simplicity
2227 release_backoffs(hobject_t(), hobject_t::get_max());
2229 child
->on_new_interval();
2231 child
->dirty_info
= true;
2232 child
->dirty_big_info
= true;
2234 dirty_big_info
= true;
2237 void PG::add_backoff(SessionRef s
, const hobject_t
& begin
, const hobject_t
& end
)
2239 ConnectionRef con
= s
->con
;
2240 if (!con
) // OSD::ms_handle_reset clears s->con without a lock
2242 BackoffRef
b(s
->have_backoff(info
.pgid
, begin
));
2244 derr
<< __func__
<< " already have backoff for " << s
<< " begin " << begin
2245 << " " << *b
<< dendl
;
2248 Mutex::Locker
l(backoff_lock
);
2250 b
= new Backoff(info
.pgid
, this, s
, ++s
->backoff_seq
, begin
, end
);
2251 backoffs
[begin
].insert(b
);
2253 dout(10) << __func__
<< " session " << s
<< " added " << *b
<< dendl
;
2258 get_osdmap()->get_epoch(),
2259 CEPH_OSD_BACKOFF_OP_BLOCK
,
2265 void PG::release_backoffs(const hobject_t
& begin
, const hobject_t
& end
)
2267 dout(10) << __func__
<< " [" << begin
<< "," << end
<< ")" << dendl
;
2268 vector
<BackoffRef
> bv
;
2270 Mutex::Locker
l(backoff_lock
);
2271 auto p
= backoffs
.lower_bound(begin
);
2272 while (p
!= backoffs
.end()) {
2273 int r
= cmp(p
->first
, end
);
2274 dout(20) << __func__
<< " ? " << r
<< " " << p
->first
2275 << " " << p
->second
<< dendl
;
2276 // note: must still examine begin=end=p->first case
2277 if (r
> 0 || (r
== 0 && begin
< end
)) {
2280 dout(20) << __func__
<< " checking " << p
->first
2281 << " " << p
->second
<< dendl
;
2282 auto q
= p
->second
.begin();
2283 while (q
!= p
->second
.end()) {
2284 dout(20) << __func__
<< " checking " << *q
<< dendl
;
2285 int r
= cmp((*q
)->begin
, begin
);
2286 if (r
== 0 || (r
> 0 && (*q
)->end
< end
)) {
2288 q
= p
->second
.erase(q
);
2293 if (p
->second
.empty()) {
2294 p
= backoffs
.erase(p
);
2301 Mutex::Locker
l(b
->lock
);
2302 dout(10) << __func__
<< " " << *b
<< dendl
;
2304 assert(b
->pg
== this);
2305 ConnectionRef con
= b
->session
->con
;
2306 if (con
) { // OSD::ms_handle_reset clears s->con without a lock
2310 get_osdmap()->get_epoch(),
2311 CEPH_OSD_BACKOFF_OP_UNBLOCK
,
2317 b
->state
= Backoff::STATE_DELETING
;
2319 b
->session
->rm_backoff(b
);
2327 void PG::clear_backoffs()
2329 dout(10) << __func__
<< " " << dendl
;
2330 map
<hobject_t
,set
<BackoffRef
>> ls
;
2332 Mutex::Locker
l(backoff_lock
);
2335 for (auto& p
: ls
) {
2336 for (auto& b
: p
.second
) {
2337 Mutex::Locker
l(b
->lock
);
2338 dout(10) << __func__
<< " " << *b
<< dendl
;
2340 assert(b
->pg
== this);
2342 b
->state
= Backoff::STATE_DELETING
;
2344 b
->session
->rm_backoff(b
);
2353 // called by Session::clear_backoffs()
2354 void PG::rm_backoff(BackoffRef b
)
2356 dout(10) << __func__
<< " " << *b
<< dendl
;
2357 Mutex::Locker
l(backoff_lock
);
2358 assert(b
->lock
.is_locked_by_me());
2359 assert(b
->pg
== this);
2360 auto p
= backoffs
.find(b
->begin
);
2361 // may race with release_backoffs()
2362 if (p
!= backoffs
.end()) {
2363 auto q
= p
->second
.find(b
);
2364 if (q
!= p
->second
.end()) {
2366 if (p
->second
.empty()) {
2373 void PG::clear_recovery_state()
2375 dout(10) << "clear_recovery_state" << dendl
;
2377 pg_log
.reset_recovery_pointers();
2378 finish_sync_event
= 0;
2381 while (recovery_ops_active
> 0) {
2382 #ifdef DEBUG_RECOVERY_OIDS
2383 soid
= *recovering_oids
.begin();
2385 finish_recovery_op(soid
, true);
2388 backfill_targets
.clear();
2389 backfill_info
.clear();
2390 peer_backfill_info
.clear();
2391 waiting_on_backfill
.clear();
2392 _clear_recovery_state(); // pg impl specific hook
2395 void PG::cancel_recovery()
2397 dout(10) << "cancel_recovery" << dendl
;
2398 clear_recovery_state();
2402 void PG::purge_strays()
2404 dout(10) << "purge_strays " << stray_set
<< dendl
;
2406 bool removed
= false;
2407 for (set
<pg_shard_t
>::iterator p
= stray_set
.begin();
2408 p
!= stray_set
.end();
2410 assert(!is_actingbackfill(*p
));
2411 if (get_osdmap()->is_up(p
->osd
)) {
2412 dout(10) << "sending PGRemove to osd." << *p
<< dendl
;
2413 vector
<spg_t
> to_remove
;
2414 to_remove
.push_back(spg_t(info
.pgid
.pgid
, p
->shard
));
2415 MOSDPGRemove
*m
= new MOSDPGRemove(
2416 get_osdmap()->get_epoch(),
2418 osd
->send_message_osd_cluster(p
->osd
, m
, get_osdmap()->get_epoch());
2420 dout(10) << "not sending PGRemove to down osd." << *p
<< dendl
;
2422 peer_missing
.erase(*p
);
2423 peer_info
.erase(*p
);
2424 peer_purged
.insert(*p
);
2428 // if we removed anyone, update peers (which include peer_info)
2430 update_heartbeat_peers();
2434 // clear _requested maps; we may have to peer() again if we discover
2435 // (more) stray content
2436 peer_log_requested
.clear();
2437 peer_missing_requested
.clear();
2440 void PG::set_probe_targets(const set
<pg_shard_t
> &probe_set
)
2442 Mutex::Locker
l(heartbeat_peer_lock
);
2443 probe_targets
.clear();
2444 for (set
<pg_shard_t
>::iterator i
= probe_set
.begin();
2445 i
!= probe_set
.end();
2447 probe_targets
.insert(i
->osd
);
2451 void PG::clear_probe_targets()
2453 Mutex::Locker
l(heartbeat_peer_lock
);
2454 probe_targets
.clear();
2457 void PG::update_heartbeat_peers()
2459 assert(is_locked());
2465 for (unsigned i
=0; i
<acting
.size(); i
++) {
2466 if (acting
[i
] != CRUSH_ITEM_NONE
)
2467 new_peers
.insert(acting
[i
]);
2469 for (unsigned i
=0; i
<up
.size(); i
++) {
2470 if (up
[i
] != CRUSH_ITEM_NONE
)
2471 new_peers
.insert(up
[i
]);
2473 for (map
<pg_shard_t
,pg_info_t
>::iterator p
= peer_info
.begin();
2474 p
!= peer_info
.end();
2476 new_peers
.insert(p
->first
.osd
);
2478 bool need_update
= false;
2479 heartbeat_peer_lock
.Lock();
2480 if (new_peers
== heartbeat_peers
) {
2481 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " unchanged" << dendl
;
2483 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " -> " << new_peers
<< dendl
;
2484 heartbeat_peers
.swap(new_peers
);
2487 heartbeat_peer_lock
.Unlock();
2490 osd
->need_heartbeat_peer_update();
2494 bool PG::check_in_progress_op(
2495 const osd_reqid_t
&r
,
2496 eversion_t
*version
,
2497 version_t
*user_version
,
2498 int *return_code
) const
2501 projected_log
.get_request(r
, version
, user_version
, return_code
) ||
2502 pg_log
.get_log().get_request(r
, version
, user_version
, return_code
));
2505 void PG::_update_calc_stats()
2507 info
.stats
.version
= info
.last_update
;
2508 info
.stats
.created
= info
.history
.epoch_created
;
2509 info
.stats
.last_scrub
= info
.history
.last_scrub
;
2510 info
.stats
.last_scrub_stamp
= info
.history
.last_scrub_stamp
;
2511 info
.stats
.last_deep_scrub
= info
.history
.last_deep_scrub
;
2512 info
.stats
.last_deep_scrub_stamp
= info
.history
.last_deep_scrub_stamp
;
2513 info
.stats
.last_clean_scrub_stamp
= info
.history
.last_clean_scrub_stamp
;
2514 info
.stats
.last_epoch_clean
= info
.history
.last_epoch_clean
;
2516 info
.stats
.log_size
= pg_log
.get_head().version
- pg_log
.get_tail().version
;
2517 info
.stats
.ondisk_log_size
= info
.stats
.log_size
;
2518 info
.stats
.log_start
= pg_log
.get_tail();
2519 info
.stats
.ondisk_log_start
= pg_log
.get_tail();
2521 // If actingset is larger then upset we will have misplaced,
2522 // so we will report based on actingset size.
2524 // If upset is larger then we will have degraded,
2525 // so we will report based on upset size.
2527 // If target is the largest of them all, it will contribute to
2528 // the degraded count because num_object_copies is
2529 // computed using target and eventual used to get degraded total.
2531 unsigned target
= get_osdmap()->get_pg_size(info
.pgid
.pgid
);
2532 unsigned nrep
= MAX(actingset
.size(), upset
.size());
2533 // calc num_object_copies
2534 info
.stats
.stats
.calc_copies(MAX(target
, nrep
));
2535 info
.stats
.stats
.sum
.num_objects_degraded
= 0;
2536 info
.stats
.stats
.sum
.num_objects_unfound
= 0;
2537 info
.stats
.stats
.sum
.num_objects_misplaced
= 0;
2538 if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2539 // NOTE: we only generate copies, degraded, misplaced and unfound
2540 // values for the summation, not individual stat categories.
2541 int64_t num_objects
= info
.stats
.stats
.sum
.num_objects
;
2543 // Total sum of all missing
2544 int64_t missing
= 0;
2545 // Objects that have arrived backfilled to up OSDs (not in acting)
2546 int64_t backfilled
= 0;
2547 // A misplaced object is not stored on the correct OSD
2548 int64_t misplaced
= 0;
2549 // Total of object copies/shards found
2550 int64_t object_copies
= 0;
2552 // num_objects_missing on each peer
2553 for (map
<pg_shard_t
, pg_info_t
>::iterator pi
=
2555 pi
!= peer_info
.end();
2557 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
=
2558 peer_missing
.find(pi
->first
);
2559 if (pm
!= peer_missing
.end()) {
2560 pi
->second
.stats
.stats
.sum
.num_objects_missing
=
2561 pm
->second
.num_missing();
2565 assert(!actingbackfill
.empty());
2566 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
2567 i
!= actingbackfill
.end();
2569 const pg_shard_t
&p
= *i
;
2571 bool in_up
= (upset
.find(p
) != upset
.end());
2572 bool in_acting
= (actingset
.find(p
) != actingset
.end());
2573 assert(in_up
|| in_acting
);
2575 // in acting Compute total objects excluding num_missing
2576 // in acting and not in up Compute misplaced objects excluding num_missing
2577 // in up and not in acting Compute total objects already backfilled
2579 unsigned osd_missing
;
2581 if (p
== pg_whoami
) {
2582 osd_missing
= pg_log
.get_missing().num_missing();
2583 info
.stats
.stats
.sum
.num_objects_missing_on_primary
=
2585 object_copies
+= num_objects
; // My local (primary) count
2587 assert(peer_missing
.count(p
));
2588 osd_missing
= peer_missing
[p
].num_missing();
2589 object_copies
+= peer_info
[p
].stats
.stats
.sum
.num_objects
;
2591 missing
+= osd_missing
;
2592 // Count non-missing objects not in up as misplaced
2593 if (!in_up
&& num_objects
> osd_missing
)
2594 misplaced
+= num_objects
- osd_missing
;
2596 assert(in_up
&& !in_acting
);
2598 // If this peer has more objects then it should, ignore them
2599 backfilled
+= MIN(num_objects
, peer_info
[p
].stats
.stats
.sum
.num_objects
);
2603 // Any objects that have been backfilled to up OSDs can deducted from misplaced
2604 misplaced
= MAX(0, misplaced
- backfilled
);
2606 // Deduct computed total missing on acting nodes
2607 object_copies
-= missing
;
2608 // Include computed backfilled objects on up nodes
2609 object_copies
+= backfilled
;
2610 // a degraded objects has fewer replicas or EC shards than the
2611 // pool specifies. num_object_copies will never be smaller than target * num_copies.
2612 int64_t degraded
= MAX(0, info
.stats
.stats
.sum
.num_object_copies
- object_copies
);
2614 info
.stats
.stats
.sum
.num_objects_degraded
= degraded
;
2615 info
.stats
.stats
.sum
.num_objects_unfound
= get_num_unfound();
2616 info
.stats
.stats
.sum
.num_objects_misplaced
= misplaced
;
2620 void PG::_update_blocked_by()
2622 // set a max on the number of blocking peers we report. if we go
2623 // over, report a random subset. keep the result sorted.
2624 unsigned keep
= MIN(blocked_by
.size(), cct
->_conf
->osd_max_pg_blocked_by
);
2625 unsigned skip
= blocked_by
.size() - keep
;
2626 info
.stats
.blocked_by
.clear();
2627 info
.stats
.blocked_by
.resize(keep
);
2629 for (set
<int>::iterator p
= blocked_by
.begin();
2630 p
!= blocked_by
.end() && keep
> 0;
2632 if (skip
> 0 && (rand() % (skip
+ keep
) < skip
)) {
2635 info
.stats
.blocked_by
[pos
++] = *p
;
2641 void PG::publish_stats_to_osd()
2646 pg_stats_publish_lock
.Lock();
2648 if (info
.stats
.stats
.sum
.num_scrub_errors
)
2649 state_set(PG_STATE_INCONSISTENT
);
2651 state_clear(PG_STATE_INCONSISTENT
);
2653 utime_t now
= ceph_clock_now();
2654 if (info
.stats
.state
!= state
) {
2655 info
.stats
.last_change
= now
;
2656 // Optimistic estimation, if we just find out an inactive PG,
2657 // assumt it is active till now.
2658 if (!(state
& PG_STATE_ACTIVE
) &&
2659 (info
.stats
.state
& PG_STATE_ACTIVE
))
2660 info
.stats
.last_active
= now
;
2662 if ((state
& PG_STATE_ACTIVE
) &&
2663 !(info
.stats
.state
& PG_STATE_ACTIVE
))
2664 info
.stats
.last_became_active
= now
;
2665 if ((state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)) &&
2666 !(info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)))
2667 info
.stats
.last_became_peered
= now
;
2668 if (!(state
& PG_STATE_CREATING
) &&
2669 (info
.stats
.state
& PG_STATE_CREATING
)) {
2670 osd
->send_pg_created(get_pgid().pgid
);
2672 info
.stats
.state
= state
;
2675 _update_calc_stats();
2676 _update_blocked_by();
2678 bool publish
= false;
2679 pg_stat_t pre_publish
= info
.stats
;
2680 pre_publish
.stats
.add(unstable_stats
);
2681 utime_t cutoff
= now
;
2682 cutoff
-= cct
->_conf
->osd_pg_stat_report_interval_max
;
2683 if (pg_stats_publish_valid
&& pre_publish
== pg_stats_publish
&&
2684 info
.stats
.last_fresh
> cutoff
) {
2685 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
2686 << ": no change since " << info
.stats
.last_fresh
<< dendl
;
2688 // update our stat summary and timestamps
2689 info
.stats
.reported_epoch
= get_osdmap()->get_epoch();
2690 ++info
.stats
.reported_seq
;
2692 info
.stats
.last_fresh
= now
;
2694 if (info
.stats
.state
& PG_STATE_CLEAN
)
2695 info
.stats
.last_clean
= now
;
2696 if (info
.stats
.state
& PG_STATE_ACTIVE
)
2697 info
.stats
.last_active
= now
;
2698 if (info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
))
2699 info
.stats
.last_peered
= now
;
2700 info
.stats
.last_unstale
= now
;
2701 if ((info
.stats
.state
& PG_STATE_DEGRADED
) == 0)
2702 info
.stats
.last_undegraded
= now
;
2703 if ((info
.stats
.state
& PG_STATE_UNDERSIZED
) == 0)
2704 info
.stats
.last_fullsized
= now
;
2706 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2707 // care of this by sending MMonMgrReport to mon.
2709 osd
->osd
->get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
2710 pg_stats_publish_valid
= true;
2711 pg_stats_publish
= pre_publish
;
2713 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
2714 << ":" << pg_stats_publish
.reported_seq
<< dendl
;
2716 pg_stats_publish_lock
.Unlock();
2719 osd
->pg_stat_queue_enqueue(this);
2722 void PG::clear_publish_stats()
2724 dout(15) << "clear_stats" << dendl
;
2725 pg_stats_publish_lock
.Lock();
2726 pg_stats_publish_valid
= false;
2727 pg_stats_publish_lock
.Unlock();
2729 osd
->pg_stat_queue_dequeue(this);
2733 * initialize a newly instantiated pg
2735 * Initialize PG state, as when a PG is initially created, or when it
2736 * is first instantiated on the current node.
2738 * @param role our role/rank
2739 * @param newup up set
2740 * @param newacting acting set
2741 * @param history pg history
2742 * @param pi past_intervals
2743 * @param backfill true if info should be marked as backfill
2744 * @param t transaction to write out our new state in
2748 const vector
<int>& newup
, int new_up_primary
,
2749 const vector
<int>& newacting
, int new_acting_primary
,
2750 const pg_history_t
& history
,
2751 const PastIntervals
& pi
,
2753 ObjectStore::Transaction
*t
)
2755 dout(10) << "init role " << role
<< " up " << newup
<< " acting " << newacting
2756 << " history " << history
2757 << " past_intervals " << pi
2763 init_primary_up_acting(
2767 new_acting_primary
);
2769 info
.history
= history
;
2770 past_intervals
= pi
;
2773 info
.stats
.up_primary
= new_up_primary
;
2774 info
.stats
.acting
= acting
;
2775 info
.stats
.acting_primary
= new_acting_primary
;
2776 info
.stats
.mapping_epoch
= info
.history
.same_interval_since
;
2779 dout(10) << __func__
<< ": Setting backfill" << dendl
;
2780 info
.set_last_backfill(hobject_t());
2781 info
.last_complete
= info
.last_update
;
2782 pg_log
.mark_log_for_rewrite();
2788 dirty_big_info
= true;
2792 #pragma GCC diagnostic ignored "-Wpragmas"
2793 #pragma GCC diagnostic push
2794 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2796 void PG::upgrade(ObjectStore
*store
)
2798 assert(info_struct_v
<= 10);
2799 ObjectStore::Transaction t
;
2801 assert(info_struct_v
>= 7);
2804 if (info_struct_v
<= 7) {
2805 pg_log
.mark_log_for_rewrite();
2806 ghobject_t
log_oid(OSD::make_pg_log_oid(pg_id
));
2807 ghobject_t
biginfo_oid(OSD::make_pg_biginfo_oid(pg_id
));
2808 t
.remove(coll_t::meta(), log_oid
);
2809 t
.remove(coll_t::meta(), biginfo_oid
);
2810 t
.touch(coll
, pgmeta_oid
);
2814 if (info_struct_v
<= 8) {
2815 // no special action needed.
2819 if (info_struct_v
<= 9) {
2820 // previous versions weren't (as) aggressively clearing past_intervals
2821 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
2822 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
2823 past_intervals
.clear();
2827 // update infover_key
2828 if (info_struct_v
< cur_struct_v
) {
2829 map
<string
,bufferlist
> v
;
2830 __u8 ver
= cur_struct_v
;
2831 ::encode(ver
, v
[infover_key
]);
2832 t
.omap_setkeys(coll
, pgmeta_oid
, v
);
2836 dirty_big_info
= true;
2839 ceph::shared_ptr
<ObjectStore::Sequencer
> osr (std::make_shared
<
2840 ObjectStore::Sequencer
>("upgrade"));
2841 int r
= store
->apply_transaction(osr
.get(), std::move(t
));
2843 derr
<< __func__
<< ": apply_transaction returned "
2844 << cpp_strerror(r
) << dendl
;
2850 if (!osr
->flush_commit(&waiter
)) {
2855 #pragma GCC diagnostic pop
2856 #pragma GCC diagnostic warning "-Wpragmas"
2858 int PG::_prepare_write_info(CephContext
* cct
,
2859 map
<string
,bufferlist
> *km
,
2861 pg_info_t
&info
, pg_info_t
&last_written_info
,
2862 PastIntervals
&past_intervals
,
2863 bool dirty_big_info
,
2866 PerfCounters
*logger
)
2869 ::encode(epoch
, (*km
)[epoch_key
]);
2873 logger
->inc(l_osd_pg_info
);
2875 // try to do info efficiently?
2876 if (!dirty_big_info
&& try_fast_info
&&
2877 info
.last_update
> last_written_info
.last_update
) {
2878 pg_fast_info_t fast
;
2879 fast
.populate_from(info
);
2880 bool did
= fast
.try_apply_to(&last_written_info
);
2881 assert(did
); // we verified last_update increased above
2882 if (info
== last_written_info
) {
2883 ::encode(fast
, (*km
)[fastinfo_key
]);
2885 logger
->inc(l_osd_pg_fastinfo
);
2888 generic_dout(30) << __func__
<< " fastinfo failed, info:\n";
2890 JSONFormatter
jf(true);
2891 jf
.dump_object("info", info
);
2895 *_dout
<< "\nlast_written_info:\n";
2896 JSONFormatter
jf(true);
2897 jf
.dump_object("last_written_info", last_written_info
);
2902 last_written_info
= info
;
2904 // info. store purged_snaps separately.
2905 interval_set
<snapid_t
> purged_snaps
;
2906 purged_snaps
.swap(info
.purged_snaps
);
2907 ::encode(info
, (*km
)[info_key
]);
2908 purged_snaps
.swap(info
.purged_snaps
);
2910 if (dirty_big_info
) {
2911 // potentially big stuff
2912 bufferlist
& bigbl
= (*km
)[biginfo_key
];
2913 ::encode(past_intervals
, bigbl
);
2914 ::encode(info
.purged_snaps
, bigbl
);
2915 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2917 logger
->inc(l_osd_pg_biginfo
);
2923 void PG::_create(ObjectStore::Transaction
& t
, spg_t pgid
, int bits
)
2926 t
.create_collection(coll
, bits
);
2929 void PG::_init(ObjectStore::Transaction
& t
, spg_t pgid
, const pg_pool_t
*pool
)
2934 // Give a hint to the PG collection
2936 uint32_t pg_num
= pool
->get_pg_num();
2937 uint64_t expected_num_objects_pg
= pool
->expected_num_objects
/ pg_num
;
2938 ::encode(pg_num
, hint
);
2939 ::encode(expected_num_objects_pg
, hint
);
2940 uint32_t hint_type
= ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
;
2941 t
.collection_hint(coll
, hint_type
, hint
);
2944 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
2945 t
.touch(coll
, pgmeta_oid
);
2946 map
<string
,bufferlist
> values
;
2947 __u8 struct_v
= cur_struct_v
;
2948 ::encode(struct_v
, values
[infover_key
]);
2949 t
.omap_setkeys(coll
, pgmeta_oid
, values
);
2952 void PG::prepare_write_info(map
<string
,bufferlist
> *km
)
2954 info
.stats
.stats
.add(unstable_stats
);
2955 unstable_stats
.clear();
2957 bool need_update_epoch
= last_epoch
< get_osdmap()->get_epoch();
2958 int ret
= _prepare_write_info(cct
, km
, get_osdmap()->get_epoch(),
2962 dirty_big_info
, need_update_epoch
,
2963 cct
->_conf
->osd_fast_info
,
2966 if (need_update_epoch
)
2967 last_epoch
= get_osdmap()->get_epoch();
2968 last_persisted_osdmap_ref
= osdmap_ref
;
2971 dirty_big_info
= false;
2974 #pragma GCC diagnostic ignored "-Wpragmas"
2975 #pragma GCC diagnostic push
2976 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2978 bool PG::_has_removal_flag(ObjectStore
*store
,
2982 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
2984 // first try new way
2986 keys
.insert("_remove");
2987 map
<string
,bufferlist
> values
;
2988 if (store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
) == 0 &&
2995 int PG::peek_map_epoch(ObjectStore
*store
,
3001 ghobject_t
legacy_infos_oid(OSD::make_infos_oid());
3002 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3003 epoch_t cur_epoch
= 0;
3007 // validate collection name
3008 assert(coll
.is_pg());
3013 keys
.insert(infover_key
);
3014 keys
.insert(epoch_key
);
3015 map
<string
,bufferlist
> values
;
3016 int r
= store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
);
3018 assert(values
.size() == 2);
3020 // sanity check version
3021 bufferlist::iterator bp
= values
[infover_key
].begin();
3023 ::decode(struct_v
, bp
);
3024 assert(struct_v
>= 8);
3027 bp
= values
[epoch_key
].begin();
3028 ::decode(cur_epoch
, bp
);
3030 // probably bug 10617; see OSD::load_pgs()
3034 *pepoch
= cur_epoch
;
3038 #pragma GCC diagnostic pop
3039 #pragma GCC diagnostic warning "-Wpragmas"
3041 void PG::write_if_dirty(ObjectStore::Transaction
& t
)
3043 map
<string
,bufferlist
> km
;
3044 if (dirty_big_info
|| dirty_info
)
3045 prepare_write_info(&km
);
3046 pg_log
.write_log_and_missing(t
, &km
, coll
, pgmeta_oid
, pool
.info
.require_rollback());
3048 t
.omap_setkeys(coll
, pgmeta_oid
, km
);
3053 assert(is_primary());
3055 dout(10) << __func__
<< " to " << pg_trim_to
<< dendl
;
3056 if (pg_trim_to
!= eversion_t()) {
3057 // inform peers to trim log
3058 assert(!actingbackfill
.empty());
3059 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
3060 i
!= actingbackfill
.end();
3062 if (*i
== pg_whoami
) continue;
3063 osd
->send_message_osd_cluster(
3066 get_osdmap()->get_epoch(),
3067 spg_t(info
.pgid
.pgid
, i
->shard
),
3069 get_osdmap()->get_epoch());
3072 // trim primary as well
3073 pg_log
.trim(pg_trim_to
, info
);
3078 void PG::add_log_entry(const pg_log_entry_t
& e
, bool applied
)
3080 // raise last_complete only if we were previously up to date
3081 if (info
.last_complete
== info
.last_update
)
3082 info
.last_complete
= e
.version
;
3084 // raise last_update.
3085 assert(e
.version
> info
.last_update
);
3086 info
.last_update
= e
.version
;
3088 // raise user_version, if it increased (it may have not get bumped
3089 // by all logged updates)
3090 if (e
.user_version
> info
.last_user_version
)
3091 info
.last_user_version
= e
.user_version
;
3094 pg_log
.add(e
, applied
);
3095 dout(10) << "add_log_entry " << e
<< dendl
;
3099 void PG::append_log(
3100 const vector
<pg_log_entry_t
>& logv
,
3102 eversion_t roll_forward_to
,
3103 ObjectStore::Transaction
&t
,
3104 bool transaction_applied
)
3106 if (transaction_applied
)
3107 update_snap_map(logv
, t
);
3109 /* The primary has sent an info updating the history, but it may not
3110 * have arrived yet. We want to make sure that we cannot remember this
3111 * write without remembering that it happened in an interval which went
3112 * active in epoch history.last_epoch_started.
3114 if (info
.last_epoch_started
!= info
.history
.last_epoch_started
) {
3115 info
.history
.last_epoch_started
= info
.last_epoch_started
;
3117 if (info
.last_interval_started
!= info
.history
.last_interval_started
) {
3118 info
.history
.last_interval_started
= info
.last_interval_started
;
3120 dout(10) << "append_log " << pg_log
.get_log() << " " << logv
<< dendl
;
3122 PGLogEntryHandler handler
{this, &t
};
3123 if (!transaction_applied
) {
3124 /* We must be a backfill peer, so it's ok if we apply
3125 * out-of-turn since we won't be considered when
3126 * determining a min possible last_update.
3128 pg_log
.roll_forward(&handler
);
3131 for (vector
<pg_log_entry_t
>::const_iterator p
= logv
.begin();
3134 add_log_entry(*p
, transaction_applied
);
3136 /* We don't want to leave the rollforward artifacts around
3137 * here past last_backfill. It's ok for the same reason as
3139 if (transaction_applied
&&
3140 p
->soid
> info
.last_backfill
) {
3141 pg_log
.roll_forward(&handler
);
3144 auto last
= logv
.rbegin();
3145 if (is_primary() && last
!= logv
.rend()) {
3146 projected_log
.skip_can_rollback_to_to_head();
3147 projected_log
.trim(cct
, last
->version
, nullptr);
3150 if (transaction_applied
&& roll_forward_to
> pg_log
.get_can_rollback_to()) {
3151 pg_log
.roll_forward_to(
3154 t
.register_on_applied(
3155 new C_UpdateLastRollbackInfoTrimmedToApplied(
3157 get_osdmap()->get_epoch(),
3161 pg_log
.trim(trim_to
, info
);
3163 // update the local pg, pg log
3168 bool PG::check_log_for_corruption(ObjectStore
*store
)
3170 /// TODO: this method needs to work with the omap log
3174 //! Get the name we're going to save our corrupt page log as
3175 std::string
PG::get_corrupt_pg_log_name() const
3177 const int MAX_BUF
= 512;
3180 time_t my_time(time(NULL
));
3181 const struct tm
*t
= localtime_r(&my_time
, &tm_buf
);
3182 int ret
= strftime(buf
, sizeof(buf
), "corrupt_log_%Y-%m-%d_%k:%M_", t
);
3184 dout(0) << "strftime failed" << dendl
;
3185 return "corrupt_log_unknown_time";
3188 out
+= stringify(info
.pgid
);
3193 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
, bufferlist
&bl
,
3194 pg_info_t
&info
, PastIntervals
&past_intervals
,
3197 // try for v8 or later
3199 keys
.insert(infover_key
);
3200 keys
.insert(info_key
);
3201 keys
.insert(biginfo_key
);
3202 keys
.insert(fastinfo_key
);
3203 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3204 map
<string
,bufferlist
> values
;
3205 int r
= store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
);
3207 assert(values
.size() == 3 ||
3208 values
.size() == 4);
3210 bufferlist::iterator p
= values
[infover_key
].begin();
3211 ::decode(struct_v
, p
);
3212 assert(struct_v
>= 8);
3214 p
= values
[info_key
].begin();
3217 p
= values
[biginfo_key
].begin();
3218 if (struct_v
>= 10) {
3219 ::decode(past_intervals
, p
);
3221 past_intervals
.decode_classic(p
);
3223 ::decode(info
.purged_snaps
, p
);
3225 p
= values
[fastinfo_key
].begin();
3227 pg_fast_info_t fast
;
3229 fast
.try_apply_to(&info
);
3235 ghobject_t
infos_oid(OSD::make_infos_oid());
3236 bufferlist::iterator p
= bl
.begin();
3237 ::decode(struct_v
, p
);
3238 assert(struct_v
== 7);
3240 // get info out of leveldb
3241 string k
= get_info_key(info
.pgid
);
3242 string bk
= get_biginfo_key(info
.pgid
);
3247 store
->omap_get_values(coll_t::meta(), ghobject_t(infos_oid
), keys
, &values
);
3248 assert(values
.size() == 2);
3250 p
= values
[k
].begin();
3253 p
= values
[bk
].begin();
3254 ::decode(past_intervals
, p
);
3255 interval_set
<snapid_t
> snap_collections
; // obsolete
3256 ::decode(snap_collections
, p
);
3257 ::decode(info
.purged_snaps
, p
);
3261 void PG::read_state(ObjectStore
*store
, bufferlist
&bl
)
3263 int r
= read_info(store
, pg_id
, coll
, bl
, info
, past_intervals
,
3267 last_written_info
= info
;
3270 pg_log
.read_log_and_missing(
3273 info_struct_v
< 8 ? coll_t::meta() : coll
,
3274 ghobject_t(info_struct_v
< 8 ? OSD::make_pg_log_oid(pg_id
) : pgmeta_oid
),
3277 cct
->_conf
->osd_ignore_stale_divergent_priors
,
3278 cct
->_conf
->osd_debug_verify_missing_on_start
);
3280 osd
->clog
->error() << oss
.rdbuf();
3282 // log any weirdness
3286 void PG::log_weirdness()
3288 if (pg_log
.get_tail() != info
.log_tail
)
3289 osd
->clog
->error() << info
.pgid
3290 << " info mismatch, log.tail " << pg_log
.get_tail()
3291 << " != info.log_tail " << info
.log_tail
;
3292 if (pg_log
.get_head() != info
.last_update
)
3293 osd
->clog
->error() << info
.pgid
3294 << " info mismatch, log.head " << pg_log
.get_head()
3295 << " != info.last_update " << info
.last_update
;
3297 if (!pg_log
.get_log().empty()) {
3299 if ((pg_log
.get_log().log
.begin()->version
<= pg_log
.get_tail()))
3300 osd
->clog
->error() << info
.pgid
3301 << " log bound mismatch, info (" << pg_log
.get_tail() << ","
3302 << pg_log
.get_head() << "]"
3304 << pg_log
.get_log().log
.begin()->version
<< ","
3305 << pg_log
.get_log().log
.rbegin()->version
<< "]";
3308 if (pg_log
.get_log().caller_ops
.size() > pg_log
.get_log().log
.size()) {
3309 osd
->clog
->error() << info
.pgid
3310 << " caller_ops.size " << pg_log
.get_log().caller_ops
.size()
3311 << " > log size " << pg_log
.get_log().log
.size();
3315 void PG::update_snap_map(
3316 const vector
<pg_log_entry_t
> &log_entries
,
3317 ObjectStore::Transaction
&t
)
3319 for (vector
<pg_log_entry_t
>::const_iterator i
= log_entries
.begin();
3320 i
!= log_entries
.end();
3322 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
3323 if (i
->soid
.snap
< CEPH_MAXSNAP
) {
3324 if (i
->is_delete()) {
3325 int r
= snap_mapper
.remove_oid(
3329 } else if (i
->is_update()) {
3330 assert(i
->snaps
.length() > 0);
3331 vector
<snapid_t
> snaps
;
3332 bufferlist snapbl
= i
->snaps
;
3333 bufferlist::iterator p
= snapbl
.begin();
3339 set
<snapid_t
> _snaps(snaps
.begin(), snaps
.end());
3341 if (i
->is_clone() || i
->is_promote()) {
3342 snap_mapper
.add_oid(
3346 } else if (i
->is_modify()) {
3347 assert(i
->is_modify());
3348 int r
= snap_mapper
.update_snaps(
3355 assert(i
->is_clean());
3363 * filter trimming|trimmed snaps out of snapcontext
3365 void PG::filter_snapc(vector
<snapid_t
> &snaps
)
3367 //nothing needs to trim, we can return immediately
3368 if(snap_trimq
.empty() && info
.purged_snaps
.empty())
3371 bool filtering
= false;
3372 vector
<snapid_t
> newsnaps
;
3373 for (vector
<snapid_t
>::iterator p
= snaps
.begin();
3376 if (snap_trimq
.contains(*p
) || info
.purged_snaps
.contains(*p
)) {
3378 // start building a new vector with what we've seen so far
3379 dout(10) << "filter_snapc filtering " << snaps
<< dendl
;
3380 newsnaps
.insert(newsnaps
.begin(), snaps
.begin(), p
);
3383 dout(20) << "filter_snapc removing trimq|purged snap " << *p
<< dendl
;
3386 newsnaps
.push_back(*p
); // continue building new vector
3390 snaps
.swap(newsnaps
);
3391 dout(10) << "filter_snapc result " << snaps
<< dendl
;
3395 void PG::requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
)
3397 for (map
<hobject_t
, list
<OpRequestRef
>>::iterator it
= m
.begin();
3400 requeue_ops(it
->second
);
3404 void PG::requeue_op(OpRequestRef op
)
3406 auto p
= waiting_for_map
.find(op
->get_source());
3407 if (p
!= waiting_for_map
.end()) {
3408 dout(20) << __func__
<< " " << op
<< " (waiting_for_map " << p
->first
<< ")"
3410 p
->second
.push_front(op
);
3412 dout(20) << __func__
<< " " << op
<< dendl
;
3413 osd
->enqueue_front(info
.pgid
, PGQueueable(op
, get_osdmap()->get_epoch()));
3417 void PG::requeue_ops(list
<OpRequestRef
> &ls
)
3419 for (list
<OpRequestRef
>::reverse_iterator i
= ls
.rbegin();
3422 auto p
= waiting_for_map
.find((*i
)->get_source());
3423 if (p
!= waiting_for_map
.end()) {
3424 dout(20) << __func__
<< " " << *i
<< " (waiting_for_map " << p
->first
3426 p
->second
.push_front(*i
);
3428 dout(20) << __func__
<< " " << *i
<< dendl
;
3429 osd
->enqueue_front(info
.pgid
, PGQueueable(*i
, get_osdmap()->get_epoch()));
3435 void PG::requeue_map_waiters()
3437 epoch_t epoch
= get_osdmap()->get_epoch();
3438 auto p
= waiting_for_map
.begin();
3439 while (p
!= waiting_for_map
.end()) {
3440 if (epoch
< p
->second
.front()->min_epoch
) {
3441 dout(20) << __func__
<< " " << p
->first
<< " front op "
3442 << p
->second
.front() << " must still wait, doing nothing"
3446 dout(20) << __func__
<< " " << p
->first
<< " " << p
->second
<< dendl
;
3447 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
3448 osd
->enqueue_front(info
.pgid
, PGQueueable(*q
, epoch
));
3450 p
= waiting_for_map
.erase(p
);
3456 // ==========================================================================================
3460 * when holding pg and sched_scrub_lock, then the states are:
3462 * scrubber.reserved = true
3463 * scrubber.reserved_peers includes whoami
3464 * osd->scrub_pending++
3465 * scheduling, replica declined:
3466 * scrubber.reserved = true
3467 * scrubber.reserved_peers includes -1
3468 * osd->scrub_pending++
3470 * scrubber.reserved = true
3471 * scrubber.reserved_peers.size() == acting.size();
3473 * osd->scrub_pending++
3475 * scrubber.reserved = false;
3476 * scrubber.reserved_peers empty
3477 * osd->scrubber.active++
3480 // returns true if a scrub has been newly kicked off
3481 bool PG::sched_scrub()
3483 bool nodeep_scrub
= false;
3484 assert(is_locked());
3485 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3489 double deep_scrub_interval
= 0;
3490 pool
.info
.opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &deep_scrub_interval
);
3491 if (deep_scrub_interval
<= 0) {
3492 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
3494 bool time_for_deep
= ceph_clock_now() >=
3495 info
.history
.last_deep_scrub_stamp
+ deep_scrub_interval
;
3497 bool deep_coin_flip
= false;
3498 // Only add random deep scrubs when NOT user initiated scrub
3499 if (!scrubber
.must_scrub
)
3500 deep_coin_flip
= (rand() % 100) < cct
->_conf
->osd_deep_scrub_randomize_ratio
* 100;
3501 dout(20) << __func__
<< ": time_for_deep=" << time_for_deep
<< " deep_coin_flip=" << deep_coin_flip
<< dendl
;
3503 time_for_deep
= (time_for_deep
|| deep_coin_flip
);
3505 //NODEEP_SCRUB so ignore time initiated deep-scrub
3506 if (osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB
) ||
3507 pool
.info
.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB
)) {
3508 time_for_deep
= false;
3509 nodeep_scrub
= true;
3512 if (!scrubber
.must_scrub
) {
3513 assert(!scrubber
.must_deep_scrub
);
3515 //NOSCRUB so skip regular scrubs
3516 if ((osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB
) ||
3517 pool
.info
.has_flag(pg_pool_t::FLAG_NOSCRUB
)) && !time_for_deep
) {
3518 if (scrubber
.reserved
) {
3519 // cancel scrub if it is still in scheduling,
3520 // so pgs from other pools where scrub are still legal
3521 // have a chance to go ahead with scrubbing.
3522 clear_scrub_reserved();
3523 scrub_unreserve_replicas();
3529 if (cct
->_conf
->osd_scrub_auto_repair
3530 && get_pgbackend()->auto_repair_supported()
3532 // respect the command from user, and not do auto-repair
3533 && !scrubber
.must_repair
3534 && !scrubber
.must_scrub
3535 && !scrubber
.must_deep_scrub
) {
3536 dout(20) << __func__
<< ": auto repair with deep scrubbing" << dendl
;
3537 scrubber
.auto_repair
= true;
3539 // this happens when user issue the scrub/repair command during
3540 // the scheduling of the scrub/repair (e.g. request reservation)
3541 scrubber
.auto_repair
= false;
3545 if (!scrubber
.reserved
) {
3546 assert(scrubber
.reserved_peers
.empty());
3547 if (osd
->inc_scrubs_pending()) {
3548 dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl
;
3549 scrubber
.reserved
= true;
3550 scrubber
.reserved_peers
.insert(pg_whoami
);
3551 scrub_reserve_replicas();
3553 dout(20) << "sched_scrub: failed to reserve locally" << dendl
;
3557 if (scrubber
.reserved
) {
3558 if (scrubber
.reserve_failed
) {
3559 dout(20) << "sched_scrub: failed, a peer declined" << dendl
;
3560 clear_scrub_reserved();
3561 scrub_unreserve_replicas();
3563 } else if (scrubber
.reserved_peers
.size() == acting
.size()) {
3564 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl
;
3565 if (time_for_deep
) {
3566 dout(10) << "sched_scrub: scrub will be deep" << dendl
;
3567 state_set(PG_STATE_DEEP_SCRUB
);
3568 } else if (!scrubber
.must_deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
) {
3569 if (!nodeep_scrub
) {
3570 osd
->clog
->info() << "osd." << osd
->whoami
3571 << " pg " << info
.pgid
3572 << " Deep scrub errors, upgrading scrub to deep-scrub";
3573 state_set(PG_STATE_DEEP_SCRUB
);
3574 } else if (!scrubber
.must_scrub
) {
3575 osd
->clog
->error() << "osd." << osd
->whoami
3576 << " pg " << info
.pgid
3577 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3578 clear_scrub_reserved();
3579 scrub_unreserve_replicas();
3582 osd
->clog
->error() << "osd." << osd
->whoami
3583 << " pg " << info
.pgid
3584 << " Regular scrub request, losing deep-scrub details";
3589 // none declined, since scrubber.reserved is set
3590 dout(20) << "sched_scrub: reserved " << scrubber
.reserved_peers
<< ", waiting for replicas" << dendl
;
3597 void PG::reg_next_scrub()
3603 if (scrubber
.must_scrub
||
3604 (info
.stats
.stats_invalid
&& cct
->_conf
->osd_scrub_invalid_stats
)) {
3605 reg_stamp
= ceph_clock_now();
3607 reg_stamp
= info
.history
.last_scrub_stamp
;
3609 // note down the sched_time, so we can locate this scrub, and remove it
3611 double scrub_min_interval
= 0, scrub_max_interval
= 0;
3612 pool
.info
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &scrub_min_interval
);
3613 pool
.info
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &scrub_max_interval
);
3614 assert(scrubber
.scrub_reg_stamp
== utime_t());
3615 scrubber
.scrub_reg_stamp
= osd
->reg_pg_scrub(info
.pgid
,
3619 scrubber
.must_scrub
);
3622 void PG::unreg_next_scrub()
3625 osd
->unreg_pg_scrub(info
.pgid
, scrubber
.scrub_reg_stamp
);
3626 scrubber
.scrub_reg_stamp
= utime_t();
3630 void PG::do_replica_scrub_map(OpRequestRef op
)
3632 const MOSDRepScrubMap
*m
= static_cast<const MOSDRepScrubMap
*>(op
->get_req());
3633 dout(7) << __func__
<< " " << *m
<< dendl
;
3634 if (m
->map_epoch
< info
.history
.same_interval_since
) {
3635 dout(10) << __func__
<< " discarding old from "
3636 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
3640 if (!scrubber
.is_chunky_scrub_active()) {
3641 dout(10) << __func__
<< " scrub isn't active" << dendl
;
3647 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3648 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
3649 dout(10) << "map version is "
3650 << scrubber
.received_maps
[m
->from
].valid_through
3653 --scrubber
.waiting_on
;
3654 scrubber
.waiting_on_whom
.erase(m
->from
);
3655 if (scrubber
.waiting_on
== 0) {
3656 if (ops_blocked_by_scrub()) {
3657 requeue_scrub(true);
3659 requeue_scrub(false);
3664 void PG::sub_op_scrub_map(OpRequestRef op
)
3666 // for legacy jewel compatibility only
3667 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
3668 assert(m
->get_type() == MSG_OSD_SUBOP
);
3669 dout(7) << "sub_op_scrub_map" << dendl
;
3671 if (m
->map_epoch
< info
.history
.same_interval_since
) {
3672 dout(10) << "sub_op_scrub discarding old sub_op from "
3673 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
<< dendl
;
3677 if (!scrubber
.is_chunky_scrub_active()) {
3678 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl
;
3684 dout(10) << " got " << m
->from
<< " scrub map" << dendl
;
3685 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3687 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
3688 dout(10) << "map version is "
3689 << scrubber
.received_maps
[m
->from
].valid_through
3692 --scrubber
.waiting_on
;
3693 scrubber
.waiting_on_whom
.erase(m
->from
);
3695 if (scrubber
.waiting_on
== 0) {
3696 if (ops_blocked_by_scrub()) {
3697 requeue_scrub(true);
3699 requeue_scrub(false);
3704 // send scrub v3 messages (chunky scrub)
3705 void PG::_request_scrub_map(
3706 pg_shard_t replica
, eversion_t version
,
3707 hobject_t start
, hobject_t end
,
3708 bool deep
, uint32_t seed
)
3710 assert(replica
!= pg_whoami
);
3711 dout(10) << "scrub requesting scrubmap from osd." << replica
3712 << " deep " << (int)deep
<< " seed " << seed
<< dendl
;
3713 MOSDRepScrub
*repscrubop
= new MOSDRepScrub(
3714 spg_t(info
.pgid
.pgid
, replica
.shard
), version
,
3715 get_osdmap()->get_epoch(),
3716 get_last_peering_reset(),
3717 start
, end
, deep
, seed
);
3718 // default priority, we want the rep scrub processed prior to any recovery
3719 // or client io messages (we are holding a lock!)
3720 osd
->send_message_osd_cluster(
3721 replica
.osd
, repscrubop
, get_osdmap()->get_epoch());
3724 void PG::handle_scrub_reserve_request(OpRequestRef op
)
3726 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3728 if (scrubber
.reserved
) {
3729 dout(10) << __func__
<< " ignoring reserve request: Already reserved"
3733 scrubber
.reserved
= osd
->inc_scrubs_pending();
3734 if (op
->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE
) {
3735 const MOSDScrubReserve
*m
=
3736 static_cast<const MOSDScrubReserve
*>(op
->get_req());
3737 Message
*reply
= new MOSDScrubReserve(
3738 spg_t(info
.pgid
.pgid
, primary
.shard
),
3740 scrubber
.reserved
? MOSDScrubReserve::GRANT
: MOSDScrubReserve::REJECT
,
3742 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
3744 // for jewel compat only
3745 const MOSDSubOp
*req
= static_cast<const MOSDSubOp
*>(op
->get_req());
3746 assert(req
->get_type() == MSG_OSD_SUBOP
);
3747 MOSDSubOpReply
*reply
= new MOSDSubOpReply(
3748 req
, pg_whoami
, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK
);
3749 ::encode(scrubber
.reserved
, reply
->get_data());
3750 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
3754 void PG::handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
)
3756 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3758 if (!scrubber
.reserved
) {
3759 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
3762 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
3763 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
3765 dout(10) << " osd." << from
<< " scrub reserve = success" << dendl
;
3766 scrubber
.reserved_peers
.insert(from
);
3771 void PG::handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
)
3773 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3775 if (!scrubber
.reserved
) {
3776 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
3779 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
3780 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
3782 /* One decline stops this pg from being scheduled for scrubbing. */
3783 dout(10) << " osd." << from
<< " scrub reserve = fail" << dendl
;
3784 scrubber
.reserve_failed
= true;
3789 void PG::handle_scrub_reserve_release(OpRequestRef op
)
3791 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3793 clear_scrub_reserved();
3796 void PG::reject_reservation()
3798 osd
->send_message_osd_cluster(
3800 new MBackfillReserve(
3801 MBackfillReserve::REJECT
,
3802 spg_t(info
.pgid
.pgid
, primary
.shard
),
3803 get_osdmap()->get_epoch()),
3804 get_osdmap()->get_epoch());
3807 void PG::schedule_backfill_full_retry()
3809 Mutex::Locker
lock(osd
->recovery_request_lock
);
3810 osd
->recovery_request_timer
.add_event_after(
3811 cct
->_conf
->osd_backfill_retry_interval
,
3812 new QueuePeeringEvt
<RequestBackfill
>(
3813 this, get_osdmap()->get_epoch(),
3814 RequestBackfill()));
3817 void PG::schedule_recovery_full_retry()
3819 Mutex::Locker
lock(osd
->recovery_request_lock
);
3820 osd
->recovery_request_timer
.add_event_after(
3821 cct
->_conf
->osd_recovery_retry_interval
,
3822 new QueuePeeringEvt
<DoRecovery
>(
3823 this, get_osdmap()->get_epoch(),
3827 void PG::clear_scrub_reserved()
3829 scrubber
.reserved_peers
.clear();
3830 scrubber
.reserve_failed
= false;
3832 if (scrubber
.reserved
) {
3833 scrubber
.reserved
= false;
3834 osd
->dec_scrubs_pending();
3838 void PG::scrub_reserve_replicas()
3840 assert(backfill_targets
.empty());
3841 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
3842 i
!= actingbackfill
.end();
3844 if (*i
== pg_whoami
) continue;
3845 dout(10) << "scrub requesting reserve from osd." << *i
<< dendl
;
3846 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
3847 osd
->send_message_osd_cluster(
3849 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
3850 get_osdmap()->get_epoch(),
3851 MOSDScrubReserve::REQUEST
, pg_whoami
),
3852 get_osdmap()->get_epoch());
3854 // for jewel compat only
3855 vector
<OSDOp
> scrub(1);
3856 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_RESERVE
;
3860 MOSDSubOp
*subop
= new MOSDSubOp(
3861 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
3862 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
3864 osd
->send_message_osd_cluster(
3865 i
->osd
, subop
, get_osdmap()->get_epoch());
3870 void PG::scrub_unreserve_replicas()
3872 assert(backfill_targets
.empty());
3873 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
3874 i
!= actingbackfill
.end();
3876 if (*i
== pg_whoami
) continue;
3877 dout(10) << "scrub requesting unreserve from osd." << *i
<< dendl
;
3878 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
3879 osd
->send_message_osd_cluster(
3881 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
3882 get_osdmap()->get_epoch(),
3883 MOSDScrubReserve::RELEASE
, pg_whoami
),
3884 get_osdmap()->get_epoch());
3886 // for jewel compat only
3887 vector
<OSDOp
> scrub(1);
3888 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_UNRESERVE
;
3892 MOSDSubOp
*subop
= new MOSDSubOp(
3893 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
3894 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
3896 osd
->send_message_osd_cluster(i
->osd
, subop
, get_osdmap()->get_epoch());
3901 void PG::_scan_rollback_obs(
3902 const vector
<ghobject_t
> &rollback_obs
,
3903 ThreadPool::TPHandle
&handle
)
3905 ObjectStore::Transaction t
;
3906 eversion_t trimmed_to
= last_rollback_info_trimmed_to_applied
;
3907 for (vector
<ghobject_t
>::const_iterator i
= rollback_obs
.begin();
3908 i
!= rollback_obs
.end();
3910 if (i
->generation
< trimmed_to
.version
) {
3911 osd
->clog
->error() << "osd." << osd
->whoami
3912 << " pg " << info
.pgid
3913 << " found obsolete rollback obj "
3914 << *i
<< " generation < trimmed_to "
3921 derr
<< __func__
<< ": queueing trans to clean up obsolete rollback objs"
3923 osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3927 void PG::_scan_snaps(ScrubMap
&smap
)
3931 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
3932 i
!= smap
.objects
.rend();
3934 const hobject_t
&hoid
= i
->first
;
3935 ScrubMap::object
&o
= i
->second
;
3937 if (hoid
.is_head() || hoid
.is_snapdir()) {
3938 // parse the SnapSet
3940 if (o
.attrs
.find(SS_ATTR
) == o
.attrs
.end()) {
3943 bl
.push_back(o
.attrs
[SS_ATTR
]);
3944 auto p
= bl
.begin();
3946 ::decode(snapset
, p
);
3950 head
= hoid
.get_head();
3953 if (hoid
.snap
< CEPH_MAXSNAP
) {
3954 // check and if necessary fix snap_mapper
3955 if (hoid
.get_head() != head
) {
3956 derr
<< __func__
<< " no head for " << hoid
<< " (have " << head
<< ")"
3960 set
<snapid_t
> obj_snaps
;
3961 if (!snapset
.is_legacy()) {
3962 auto p
= snapset
.clone_snaps
.find(hoid
.snap
);
3963 if (p
== snapset
.clone_snaps
.end()) {
3964 derr
<< __func__
<< " no clone_snaps for " << hoid
<< " in " << snapset
3968 obj_snaps
.insert(p
->second
.begin(), p
->second
.end());
3971 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
3974 bl
.push_back(o
.attrs
[OI_ATTR
]);
3981 obj_snaps
.insert(oi
.legacy_snaps
.begin(), oi
.legacy_snaps
.end());
3983 set
<snapid_t
> cur_snaps
;
3984 int r
= snap_mapper
.get_snaps(hoid
, &cur_snaps
);
3985 if (r
!= 0 && r
!= -ENOENT
) {
3986 derr
<< __func__
<< ": get_snaps returned " << cpp_strerror(r
) << dendl
;
3989 if (r
== -ENOENT
|| cur_snaps
!= obj_snaps
) {
3990 ObjectStore::Transaction t
;
3991 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
3993 r
= snap_mapper
.remove_oid(hoid
, &_t
);
3995 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
)
3999 osd
->clog
->error() << "osd." << osd
->whoami
4000 << " found snap mapper error on pg "
4002 << " oid " << hoid
<< " snaps in mapper: "
4003 << cur_snaps
<< ", oi: "
4007 osd
->clog
->error() << "osd." << osd
->whoami
4008 << " found snap mapper error on pg "
4010 << " oid " << hoid
<< " snaps missing in mapper"
4015 snap_mapper
.add_oid(hoid
, obj_snaps
, &_t
);
4016 r
= osd
->store
->apply_transaction(osr
.get(), std::move(t
));
4018 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
4026 void PG::_repair_oinfo_oid(ScrubMap
&smap
)
4028 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4029 i
!= smap
.objects
.rend();
4031 const hobject_t
&hoid
= i
->first
;
4032 ScrubMap::object
&o
= i
->second
;
4035 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
4038 bl
.push_back(o
.attrs
[OI_ATTR
]);
4045 if (oi
.soid
!= hoid
) {
4046 ObjectStore::Transaction t
;
4047 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4048 osd
->clog
->error() << "osd." << osd
->whoami
4049 << " found object info error on pg "
4051 << " oid " << hoid
<< " oid in object info: "
4057 ::encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4059 bufferptr
bp(bl
.c_str(), bl
.length());
4060 o
.attrs
[OI_ATTR
] = bp
;
4062 t
.setattr(coll
, ghobject_t(hoid
), OI_ATTR
, bl
);
4063 int r
= osd
->store
->apply_transaction(osr
.get(), std::move(t
));
4065 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
4073 * build a scrub map over a chunk without releasing the lock
4074 * only used by chunky scrub
4076 int PG::build_scrub_map_chunk(
4078 hobject_t start
, hobject_t end
, bool deep
, uint32_t seed
,
4079 ThreadPool::TPHandle
&handle
)
4081 dout(10) << __func__
<< " [" << start
<< "," << end
<< ") "
4082 << " seed " << seed
<< dendl
;
4084 map
.valid_through
= info
.last_update
;
4087 vector
<hobject_t
> ls
;
4088 vector
<ghobject_t
> rollback_obs
;
4089 int ret
= get_pgbackend()->objects_list_range(
4096 dout(5) << "objects_list_range error: " << ret
<< dendl
;
4101 get_pgbackend()->be_scan_list(map
, ls
, deep
, seed
, handle
);
4102 _scan_rollback_obs(rollback_obs
, handle
);
4104 _repair_oinfo_oid(map
);
4106 dout(20) << __func__
<< " done" << dendl
;
4110 void PG::Scrubber::cleanup_store(ObjectStore::Transaction
*t
) {
4113 struct OnComplete
: Context
{
4114 std::unique_ptr
<Scrub::Store
> store
;
4116 std::unique_ptr
<Scrub::Store
> &&store
)
4117 : store(std::move(store
)) {}
4118 void finish(int) override
{}
4121 t
->register_on_complete(new OnComplete(std::move(store
)));
4125 void PG::repair_object(
4126 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
4127 pg_shard_t bad_peer
)
4129 list
<pg_shard_t
> op_shards
;
4130 for (auto i
: *ok_peers
) {
4131 op_shards
.push_back(i
.second
);
4133 dout(10) << "repair_object " << soid
<< " bad_peer osd."
4134 << bad_peer
<< " ok_peers osd.{" << op_shards
<< "}" << dendl
;
4135 ScrubMap::object
&po
= ok_peers
->back().first
;
4138 bv
.push_back(po
.attrs
[OI_ATTR
]);
4141 bufferlist::iterator bliter
= bv
.begin();
4142 ::decode(oi
, bliter
);
4144 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
4147 if (bad_peer
!= primary
) {
4148 peer_missing
[bad_peer
].add(soid
, oi
.version
, eversion_t());
4150 // We should only be scrubbing if the PG is clean.
4151 assert(waiting_for_unreadable_object
.empty());
4153 pg_log
.missing_add(soid
, oi
.version
, eversion_t());
4155 pg_log
.set_last_requested(0);
4156 dout(10) << __func__
<< ": primary = " << primary
<< dendl
;
4159 if (is_ec_pg() || bad_peer
== primary
) {
4160 // we'd better collect all shard for EC pg, and prepare good peers as the
4161 // source of pull in the case of replicated pg.
4162 missing_loc
.add_missing(soid
, oi
.version
, eversion_t());
4163 list
<pair
<ScrubMap::object
, pg_shard_t
> >::iterator i
;
4164 for (i
= ok_peers
->begin();
4165 i
!= ok_peers
->end();
4167 missing_loc
.add_location(soid
, i
->second
);
4173 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4174 * for pushes to complete in case of recent recovery. Build a single
4175 * scrubmap of objects that are in the range [msg->start, msg->end).
4177 void PG::replica_scrub(
4179 ThreadPool::TPHandle
&handle
)
4181 const MOSDRepScrub
*msg
= static_cast<const MOSDRepScrub
*>(op
->get_req());
4182 assert(!scrubber
.active_rep_scrub
);
4183 dout(7) << "replica_scrub" << dendl
;
4185 if (msg
->map_epoch
< info
.history
.same_interval_since
) {
4186 dout(10) << "replica_scrub discarding old replica_scrub from "
4187 << msg
->map_epoch
<< " < " << info
.history
.same_interval_since
4194 assert(msg
->chunky
);
4195 if (last_update_applied
< msg
->scrub_to
) {
4196 dout(10) << "waiting for last_update_applied to catch up" << dendl
;
4197 scrubber
.active_rep_scrub
= op
;
4201 if (active_pushes
> 0) {
4202 dout(10) << "waiting for active pushes to finish" << dendl
;
4203 scrubber
.active_rep_scrub
= op
;
4207 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4208 hobject_t start
= msg
->start
;
4209 hobject_t end
= msg
->end
;
4210 if (!start
.is_max())
4211 start
.pool
= info
.pgid
.pool();
4213 end
.pool
= info
.pgid
.pool();
4215 build_scrub_map_chunk(
4216 map
, start
, end
, msg
->deep
, msg
->seed
,
4219 if (HAVE_FEATURE(acting_features
, SERVER_LUMINOUS
)) {
4220 MOSDRepScrubMap
*reply
= new MOSDRepScrubMap(
4221 spg_t(info
.pgid
.pgid
, get_primary().shard
),
4224 ::encode(map
, reply
->get_data());
4225 osd
->send_message_osd_cluster(reply
, msg
->get_connection());
4227 // for jewel compatibility
4228 vector
<OSDOp
> scrub(1);
4229 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_MAP
;
4233 MOSDSubOp
*subop
= new MOSDSubOp(
4236 spg_t(info
.pgid
.pgid
, get_primary().shard
),
4242 ::encode(map
, subop
->get_data());
4244 osd
->send_message_osd_cluster(subop
, msg
->get_connection());
4249 * PG_STATE_SCRUBBING is set when the scrub is queued
4251 * scrub will be chunky if all OSDs in PG support chunky scrub
4252 * scrub will fail if OSDs are too old.
4254 void PG::scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
)
4256 if (cct
->_conf
->osd_scrub_sleep
> 0 &&
4257 (scrubber
.state
== PG::Scrubber::NEW_CHUNK
||
4258 scrubber
.state
== PG::Scrubber::INACTIVE
) &&
4259 scrubber
.needs_sleep
) {
4260 ceph_assert(!scrubber
.sleeping
);
4261 dout(20) << __func__
<< " state is INACTIVE|NEW_CHUNK, sleeping" << dendl
;
4263 // Do an async sleep so we don't block the op queue
4264 OSDService
*osds
= osd
;
4265 spg_t pgid
= get_pgid();
4266 int state
= scrubber
.state
;
4267 auto scrub_requeue_callback
=
4268 new FunctionContext([osds
, pgid
, state
](int r
) {
4269 PG
*pg
= osds
->osd
->lookup_lock_pg(pgid
);
4270 if (pg
== nullptr) {
4271 lgeneric_dout(osds
->osd
->cct
, 20)
4272 << "scrub_requeue_callback: Could not find "
4273 << "PG " << pgid
<< " can't complete scrub requeue after sleep"
4277 pg
->scrubber
.sleeping
= false;
4278 pg
->scrubber
.needs_sleep
= false;
4279 lgeneric_dout(pg
->cct
, 20)
4280 << "scrub_requeue_callback: slept for "
4281 << ceph_clock_now() - pg
->scrubber
.sleep_start
4282 << ", re-queuing scrub with state " << state
<< dendl
;
4283 pg
->scrub_queued
= false;
4284 pg
->requeue_scrub();
4285 pg
->scrubber
.sleep_start
= utime_t();
4288 Mutex::Locker
l(osd
->scrub_sleep_lock
);
4289 osd
->scrub_sleep_timer
.add_event_after(cct
->_conf
->osd_scrub_sleep
,
4290 scrub_requeue_callback
);
4291 scrubber
.sleeping
= true;
4292 scrubber
.sleep_start
= ceph_clock_now();
4295 if (pg_has_reset_since(queued
)) {
4298 assert(scrub_queued
);
4299 scrub_queued
= false;
4300 scrubber
.needs_sleep
= true;
4302 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4303 dout(10) << "scrub -- not primary or active or not clean" << dendl
;
4304 state_clear(PG_STATE_SCRUBBING
);
4305 state_clear(PG_STATE_REPAIR
);
4306 state_clear(PG_STATE_DEEP_SCRUB
);
4307 publish_stats_to_osd();
4311 if (!scrubber
.active
) {
4312 assert(backfill_targets
.empty());
4314 scrubber
.deep
= state_test(PG_STATE_DEEP_SCRUB
);
4316 dout(10) << "starting a new chunky scrub" << dendl
;
4319 chunky_scrub(handle
);
4323 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4326 * The object store is partitioned into chunks which end on hash boundaries. For
4327 * each chunk, the following logic is performed:
4329 * (1) Block writes on the chunk
4330 * (2) Request maps from replicas
4331 * (3) Wait for pushes to be applied (after recovery)
4332 * (4) Wait for writes to flush on the chunk
4333 * (5) Wait for maps from replicas
4334 * (6) Compare / repair all scrub maps
4335 * (7) Wait for digest updates to apply
4337 * This logic is encoded in the mostly linear state machine:
4339 * +------------------+
4340 * _________v__________ |
4343 * |____________________| |
4346 * _________v___v______ | |
4349 * |____________________| | |
4351 * _________v__________ | |
4353 * | WAIT_PUSHES | | |
4354 * |____________________| | |
4356 * _________v__________ | |
4358 * | WAIT_LAST_UPDATE | | |
4359 * |____________________| | |
4361 * _________v__________ | |
4364 * |____________________| | |
4366 * _________v__________ | |
4368 * | WAIT_REPLICAS | | |
4369 * |____________________| | |
4371 * _________v__________ | |
4373 * | COMPARE_MAPS | | |
4374 * |____________________| | |
4377 * _________v__________ | |
4379 * |WAIT_DIGEST_UPDATES | | |
4380 * |____________________| | |
4383 * _________v__________ |
4386 * |____________________| |
4388 * +------------------+
4390 * The primary determines the last update from the subset by walking the log. If
4391 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4392 * to wait until that update is applied before building a scrub map. Both the
4393 * primary and replicas will wait for any active pushes to be applied.
4395 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4397 * scrubber.state encodes the current state of the scrub (refer to state diagram
4400 void PG::chunky_scrub(ThreadPool::TPHandle
&handle
)
4402 // check for map changes
4403 if (scrubber
.is_chunky_scrub_active()) {
4404 if (scrubber
.epoch_start
!= info
.history
.same_interval_since
) {
4405 dout(10) << "scrub pg changed, aborting" << dendl
;
4406 scrub_clear_state();
4407 scrub_unreserve_replicas();
4416 dout(20) << "scrub state " << Scrubber::state_string(scrubber
.state
)
4417 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")" << dendl
;
4419 switch (scrubber
.state
) {
4420 case PG::Scrubber::INACTIVE
:
4421 dout(10) << "scrub start" << dendl
;
4423 publish_stats_to_osd();
4424 scrubber
.epoch_start
= info
.history
.same_interval_since
;
4425 scrubber
.active
= true;
4427 osd
->inc_scrubs_active(scrubber
.reserved
);
4428 if (scrubber
.reserved
) {
4429 scrubber
.reserved
= false;
4430 scrubber
.reserved_peers
.clear();
4434 ObjectStore::Transaction t
;
4435 scrubber
.cleanup_store(&t
);
4436 scrubber
.store
.reset(Scrub::Store::create(osd
->store
, &t
,
4438 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
4441 // Don't include temporary objects when scrubbing
4442 scrubber
.start
= info
.pgid
.pgid
.get_hobj_start();
4443 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4446 bool repair
= state_test(PG_STATE_REPAIR
);
4447 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4448 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4450 oss
<< info
.pgid
.pgid
<< " " << mode
<< " starts" << std::endl
;
4451 osd
->clog
->info(oss
);
4458 case PG::Scrubber::NEW_CHUNK
:
4459 scrubber
.primary_scrubmap
= ScrubMap();
4460 scrubber
.received_maps
.clear();
4463 /* get the start and end of our scrub chunk
4465 * Our scrub chunk has an important restriction we're going to need to
4466 * respect. We can't let head or snapdir be start or end.
4467 * Using a half-open interval means that if end == head|snapdir,
4468 * we'd scrub/lock head and the clone right next to head in different
4469 * chunks which would allow us to miss clones created between
4470 * scrubbing that chunk and scrubbing the chunk including head.
4471 * This isn't true for any of the other clones since clones can
4472 * only be created "just to the left of" head. There is one exception
4473 * to this: promotion of clones which always happens to the left of the
4474 * left-most clone, but promote_object checks the scrubber in that
4475 * case, so it should be ok. Also, it's ok to "miss" clones at the
4476 * left end of the range if we are a tier because they may legitimately
4477 * not exist (see _scrub).
4479 int min
= MAX(3, cct
->_conf
->osd_scrub_chunk_min
);
4480 hobject_t start
= scrubber
.start
;
4481 hobject_t candidate_end
;
4482 vector
<hobject_t
> objects
;
4483 ret
= get_pgbackend()->objects_list_partial(
4486 MAX(min
, cct
->_conf
->osd_scrub_chunk_max
),
4491 if (!objects
.empty()) {
4492 hobject_t back
= objects
.back();
4493 while (candidate_end
.has_snapset() &&
4494 candidate_end
.get_head() == back
.get_head()) {
4495 candidate_end
= back
;
4497 if (objects
.empty()) {
4499 "Somehow we got more than 2 objects which"
4500 "have the same head but are not clones");
4502 back
= objects
.back();
4504 if (candidate_end
.has_snapset()) {
4505 assert(candidate_end
.get_head() != back
.get_head());
4506 candidate_end
= candidate_end
.get_object_boundary();
4509 assert(candidate_end
.is_max());
4512 if (!_range_available_for_scrub(scrubber
.start
, candidate_end
)) {
4513 // we'll be requeued by whatever made us unavailable for scrub
4514 dout(10) << __func__
<< ": scrub blocked somewhere in range "
4515 << "[" << scrubber
.start
<< ", " << candidate_end
<< ")"
4520 scrubber
.end
= candidate_end
;
4523 // walk the log to find the latest update that affects our chunk
4524 scrubber
.subset_last_update
= eversion_t();
4525 for (auto p
= projected_log
.log
.rbegin();
4526 p
!= projected_log
.log
.rend();
4528 if (p
->soid
>= scrubber
.start
&&
4529 p
->soid
< scrubber
.end
) {
4530 scrubber
.subset_last_update
= p
->version
;
4534 if (scrubber
.subset_last_update
== eversion_t()) {
4535 for (list
<pg_log_entry_t
>::const_reverse_iterator p
=
4536 pg_log
.get_log().log
.rbegin();
4537 p
!= pg_log
.get_log().log
.rend();
4539 if (p
->soid
>= scrubber
.start
&&
4540 p
->soid
< scrubber
.end
) {
4541 scrubber
.subset_last_update
= p
->version
;
4547 // ask replicas to wait until
4548 // last_update_applied >= scrubber.subset_last_update and then scan
4549 scrubber
.waiting_on_whom
.insert(pg_whoami
);
4550 ++scrubber
.waiting_on
;
4552 // request maps from replicas
4553 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4554 i
!= actingbackfill
.end();
4556 if (*i
== pg_whoami
) continue;
4557 _request_scrub_map(*i
, scrubber
.subset_last_update
,
4558 scrubber
.start
, scrubber
.end
, scrubber
.deep
,
4560 scrubber
.waiting_on_whom
.insert(*i
);
4561 ++scrubber
.waiting_on
;
4564 scrubber
.state
= PG::Scrubber::WAIT_PUSHES
;
4568 case PG::Scrubber::WAIT_PUSHES
:
4569 if (active_pushes
== 0) {
4570 scrubber
.state
= PG::Scrubber::WAIT_LAST_UPDATE
;
4572 dout(15) << "wait for pushes to apply" << dendl
;
4577 case PG::Scrubber::WAIT_LAST_UPDATE
:
4578 if (last_update_applied
>= scrubber
.subset_last_update
) {
4579 scrubber
.state
= PG::Scrubber::BUILD_MAP
;
4581 // will be requeued by op_applied
4582 dout(15) << "wait for writes to flush" << dendl
;
4587 case PG::Scrubber::BUILD_MAP
:
4588 assert(last_update_applied
>= scrubber
.subset_last_update
);
4590 // build my own scrub map
4591 ret
= build_scrub_map_chunk(scrubber
.primary_scrubmap
,
4592 scrubber
.start
, scrubber
.end
,
4593 scrubber
.deep
, scrubber
.seed
,
4596 dout(5) << "error building scrub map: " << ret
<< ", aborting" << dendl
;
4597 scrub_clear_state();
4598 scrub_unreserve_replicas();
4602 --scrubber
.waiting_on
;
4603 scrubber
.waiting_on_whom
.erase(pg_whoami
);
4605 scrubber
.state
= PG::Scrubber::WAIT_REPLICAS
;
4608 case PG::Scrubber::WAIT_REPLICAS
:
4609 if (scrubber
.waiting_on
> 0) {
4610 // will be requeued by sub_op_scrub_map
4611 dout(10) << "wait for replicas to build scrub map" << dendl
;
4614 scrubber
.state
= PG::Scrubber::COMPARE_MAPS
;
4618 case PG::Scrubber::COMPARE_MAPS
:
4619 assert(last_update_applied
>= scrubber
.subset_last_update
);
4620 assert(scrubber
.waiting_on
== 0);
4622 scrub_compare_maps();
4623 scrubber
.start
= scrubber
.end
;
4624 scrubber
.run_callbacks();
4626 // requeue the writes from the chunk that just finished
4627 requeue_ops(waiting_for_scrub
);
4629 scrubber
.state
= PG::Scrubber::WAIT_DIGEST_UPDATES
;
4633 case PG::Scrubber::WAIT_DIGEST_UPDATES
:
4634 if (scrubber
.num_digest_updates_pending
) {
4635 dout(10) << __func__
<< " waiting on "
4636 << scrubber
.num_digest_updates_pending
4637 << " digest updates" << dendl
;
4642 if (!(scrubber
.end
.is_max())) {
4643 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4647 scrubber
.state
= PG::Scrubber::FINISH
;
4652 case PG::Scrubber::FINISH
:
4654 scrubber
.state
= PG::Scrubber::INACTIVE
;
4657 if (!snap_trimq
.empty()) {
4658 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl
;
4659 snap_trimmer_scrub_complete();
4668 dout(20) << "scrub final state " << Scrubber::state_string(scrubber
.state
)
4669 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")" << dendl
;
4672 void PG::scrub_clear_state()
4674 assert(is_locked());
4675 state_clear(PG_STATE_SCRUBBING
);
4676 state_clear(PG_STATE_REPAIR
);
4677 state_clear(PG_STATE_DEEP_SCRUB
);
4678 publish_stats_to_osd();
4680 // active -> nothing.
4681 if (scrubber
.active
)
4682 osd
->dec_scrubs_active();
4684 requeue_ops(waiting_for_scrub
);
4688 // type-specific state clear
4689 _scrub_clear_state();
4692 void PG::scrub_compare_maps()
4694 dout(10) << __func__
<< " has maps, analyzing" << dendl
;
4696 // construct authoritative scrub map for type specific scrubbing
4697 scrubber
.cleaned_meta_map
.insert(scrubber
.primary_scrubmap
);
4698 map
<hobject_t
, pair
<uint32_t, uint32_t>> missing_digest
;
4700 if (acting
.size() > 1) {
4701 dout(10) << __func__
<< " comparing replica scrub maps" << dendl
;
4705 // Map from object with errors to good peer
4706 map
<hobject_t
, list
<pg_shard_t
>> authoritative
;
4707 map
<pg_shard_t
, ScrubMap
*> maps
;
4709 dout(2) << __func__
<< " osd." << acting
[0] << " has "
4710 << scrubber
.primary_scrubmap
.objects
.size() << " items" << dendl
;
4711 maps
[pg_whoami
] = &scrubber
.primary_scrubmap
;
4713 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4714 i
!= actingbackfill
.end();
4716 if (*i
== pg_whoami
) continue;
4717 dout(2) << __func__
<< " replica " << *i
<< " has "
4718 << scrubber
.received_maps
[*i
].objects
.size()
4719 << " items" << dendl
;
4720 maps
[*i
] = &scrubber
.received_maps
[*i
];
4723 get_pgbackend()->be_compare_scrubmaps(
4725 state_test(PG_STATE_REPAIR
),
4727 scrubber
.inconsistent
,
4730 scrubber
.shallow_errors
,
4731 scrubber
.deep_errors
,
4732 scrubber
.store
.get(),
4735 dout(2) << ss
.str() << dendl
;
4737 if (!ss
.str().empty()) {
4738 osd
->clog
->error(ss
);
4741 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
4742 i
!= authoritative
.end();
4744 list
<pair
<ScrubMap::object
, pg_shard_t
> > good_peers
;
4745 for (list
<pg_shard_t
>::const_iterator j
= i
->second
.begin();
4746 j
!= i
->second
.end();
4748 good_peers
.push_back(make_pair(maps
[*j
]->objects
[i
->first
], *j
));
4750 scrubber
.authoritative
.insert(
4756 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
4757 i
!= authoritative
.end();
4759 scrubber
.cleaned_meta_map
.objects
.erase(i
->first
);
4760 scrubber
.cleaned_meta_map
.objects
.insert(
4761 *(maps
[i
->second
.back()]->objects
.find(i
->first
))
4766 ScrubMap for_meta_scrub
;
4767 if (scrubber
.end
.is_max() ||
4768 scrubber
.cleaned_meta_map
.objects
.empty()) {
4769 scrubber
.cleaned_meta_map
.swap(for_meta_scrub
);
4771 auto iter
= scrubber
.cleaned_meta_map
.objects
.end();
4772 --iter
; // not empty, see if clause
4773 auto begin
= scrubber
.cleaned_meta_map
.objects
.begin();
4774 while (iter
!= begin
) {
4776 if (next
->first
.get_head() != iter
->first
.get_head()) {
4781 for_meta_scrub
.objects
.insert(begin
, iter
);
4782 scrubber
.cleaned_meta_map
.objects
.erase(begin
, iter
);
4785 // ok, do the pg-type specific scrubbing
4786 scrub_snapshot_metadata(for_meta_scrub
, missing_digest
);
4787 if (!scrubber
.store
->empty()) {
4788 if (state_test(PG_STATE_REPAIR
)) {
4789 dout(10) << __func__
<< ": discarding scrub results" << dendl
;
4790 scrubber
.store
->flush(nullptr);
4792 dout(10) << __func__
<< ": updating scrub object" << dendl
;
4793 ObjectStore::Transaction t
;
4794 scrubber
.store
->flush(&t
);
4795 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
4800 bool PG::scrub_process_inconsistent()
4802 dout(10) << __func__
<< ": checking authoritative" << dendl
;
4803 bool repair
= state_test(PG_STATE_REPAIR
);
4804 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4805 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4807 // authoriative only store objects which missing or inconsistent.
4808 if (!scrubber
.authoritative
.empty()) {
4810 ss
<< info
.pgid
<< " " << mode
<< " "
4811 << scrubber
.missing
.size() << " missing, "
4812 << scrubber
.inconsistent
.size() << " inconsistent objects";
4813 dout(2) << ss
.str() << dendl
;
4814 osd
->clog
->error(ss
);
4816 state_clear(PG_STATE_CLEAN
);
4817 for (map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >>::iterator i
=
4818 scrubber
.authoritative
.begin();
4819 i
!= scrubber
.authoritative
.end();
4821 set
<pg_shard_t
>::iterator j
;
4823 auto missing_entry
= scrubber
.missing
.find(i
->first
);
4824 if (missing_entry
!= scrubber
.missing
.end()) {
4825 for (j
= missing_entry
->second
.begin();
4826 j
!= missing_entry
->second
.end();
4835 if (scrubber
.inconsistent
.count(i
->first
)) {
4836 for (j
= scrubber
.inconsistent
[i
->first
].begin();
4837 j
!= scrubber
.inconsistent
[i
->first
].end();
4839 repair_object(i
->first
,
4848 return (!scrubber
.authoritative
.empty() && repair
);
4851 bool PG::ops_blocked_by_scrub() const {
4852 return (waiting_for_scrub
.size() != 0);
4855 // the part that actually finalizes a scrub
4856 void PG::scrub_finish()
4858 bool repair
= state_test(PG_STATE_REPAIR
);
4859 // if the repair request comes from auto-repair and large number of errors,
4860 // we would like to cancel auto-repair
4861 if (repair
&& scrubber
.auto_repair
4862 && scrubber
.authoritative
.size() > cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
4863 state_clear(PG_STATE_REPAIR
);
4866 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4867 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4869 // type-specific finish (can tally more errors)
4872 bool has_error
= scrub_process_inconsistent();
4876 oss
<< info
.pgid
.pgid
<< " " << mode
<< " ";
4877 int total_errors
= scrubber
.shallow_errors
+ scrubber
.deep_errors
;
4879 oss
<< total_errors
<< " errors";
4882 if (!deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
)
4883 oss
<< " ( " << info
.stats
.stats
.sum
.num_deep_scrub_errors
4884 << " remaining deep scrub error details lost)";
4886 oss
<< ", " << scrubber
.fixed
<< " fixed";
4888 osd
->clog
->error(oss
);
4890 osd
->clog
->info(oss
);
4895 utime_t now
= ceph_clock_now();
4896 info
.history
.last_scrub
= info
.last_update
;
4897 info
.history
.last_scrub_stamp
= now
;
4898 if (scrubber
.deep
) {
4899 info
.history
.last_deep_scrub
= info
.last_update
;
4900 info
.history
.last_deep_scrub_stamp
= now
;
4902 // Since we don't know which errors were fixed, we can only clear them
4903 // when every one has been fixed.
4905 if (scrubber
.fixed
== scrubber
.shallow_errors
+ scrubber
.deep_errors
) {
4907 scrubber
.shallow_errors
= scrubber
.deep_errors
= 0;
4909 // Deep scrub in order to get corrected error counts
4910 scrub_after_recovery
= true;
4914 if ((scrubber
.shallow_errors
== 0) && (scrubber
.deep_errors
== 0))
4915 info
.history
.last_clean_scrub_stamp
= now
;
4916 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
4917 info
.stats
.stats
.sum
.num_deep_scrub_errors
= scrubber
.deep_errors
;
4919 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
4920 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
4921 // because of deep-scrub errors
4922 if (scrubber
.shallow_errors
== 0)
4923 info
.history
.last_clean_scrub_stamp
= now
;
4925 info
.stats
.stats
.sum
.num_scrub_errors
=
4926 info
.stats
.stats
.sum
.num_shallow_scrub_errors
+
4927 info
.stats
.stats
.sum
.num_deep_scrub_errors
;
4931 ObjectStore::Transaction t
;
4934 int tr
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
4940 queue_peering_event(
4942 std::make_shared
<CephPeeringEvt
>(
4943 get_osdmap()->get_epoch(),
4944 get_osdmap()->get_epoch(),
4948 scrub_clear_state();
4949 scrub_unreserve_replicas();
4951 if (is_active() && is_primary()) {
4956 void PG::share_pg_info()
4958 dout(10) << "share_pg_info" << dendl
;
4960 // share new pg_info_t with replicas
4961 assert(!actingbackfill
.empty());
4962 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4963 i
!= actingbackfill
.end();
4965 if (*i
== pg_whoami
) continue;
4966 pg_shard_t peer
= *i
;
4967 if (peer_info
.count(peer
)) {
4968 peer_info
[peer
].last_epoch_started
= info
.last_epoch_started
;
4969 peer_info
[peer
].last_interval_started
= info
.last_interval_started
;
4970 peer_info
[peer
].history
.merge(info
.history
);
4972 MOSDPGInfo
*m
= new MOSDPGInfo(get_osdmap()->get_epoch());
4973 m
->pg_list
.push_back(
4976 peer
.shard
, pg_whoami
.shard
,
4977 get_osdmap()->get_epoch(),
4978 get_osdmap()->get_epoch(),
4981 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
4985 bool PG::append_log_entries_update_missing(
4986 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
4987 ObjectStore::Transaction
&t
)
4989 assert(!entries
.empty());
4990 assert(entries
.begin()->version
> info
.last_update
);
4992 PGLogEntryHandler rollbacker
{this, &t
};
4993 bool invalidate_stats
=
4994 pg_log
.append_new_log_entries(info
.last_backfill
,
4995 info
.last_backfill_bitwise
,
4998 info
.last_update
= pg_log
.get_head();
5000 if (pg_log
.get_missing().num_missing() == 0) {
5001 // advance last_complete since nothing else is missing!
5002 info
.last_complete
= info
.last_update
;
5005 info
.stats
.stats_invalid
= info
.stats
.stats_invalid
|| invalidate_stats
;
5008 return invalidate_stats
;
5012 void PG::merge_new_log_entries(
5013 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
5014 ObjectStore::Transaction
&t
)
5016 dout(10) << __func__
<< " " << entries
<< dendl
;
5017 assert(is_primary());
5019 bool rebuild_missing
= append_log_entries_update_missing(entries
, t
);
5020 for (set
<pg_shard_t
>::const_iterator i
= actingbackfill
.begin();
5021 i
!= actingbackfill
.end();
5023 pg_shard_t
peer(*i
);
5024 if (peer
== pg_whoami
) continue;
5025 assert(peer_missing
.count(peer
));
5026 assert(peer_info
.count(peer
));
5027 pg_missing_t
& pmissing(peer_missing
[peer
]);
5028 pg_info_t
& pinfo(peer_info
[peer
]);
5029 bool invalidate_stats
= PGLog::append_log_entries_update_missing(
5030 pinfo
.last_backfill
,
5031 info
.last_backfill_bitwise
,
5038 pinfo
.last_update
= info
.last_update
;
5039 pinfo
.stats
.stats_invalid
= pinfo
.stats
.stats_invalid
|| invalidate_stats
;
5040 rebuild_missing
= rebuild_missing
|| invalidate_stats
;
5043 if (!rebuild_missing
) {
5047 for (auto &&i
: entries
) {
5048 missing_loc
.rebuild(
5053 pg_log
.get_missing(),
5059 void PG::update_history(const pg_history_t
& new_history
)
5062 if (info
.history
.merge(new_history
)) {
5063 dout(20) << __func__
<< " advanced history from " << new_history
<< dendl
;
5065 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
5066 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
5067 past_intervals
.clear();
5068 dirty_big_info
= true;
5074 void PG::fulfill_info(
5075 pg_shard_t from
, const pg_query_t
&query
,
5076 pair
<pg_shard_t
, pg_info_t
> ¬ify_info
)
5078 assert(from
== primary
);
5079 assert(query
.type
== pg_query_t::INFO
);
5082 dout(10) << "sending info" << dendl
;
5083 notify_info
= make_pair(from
, info
);
5086 void PG::fulfill_log(
5087 pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
)
5089 dout(10) << "log request from " << from
<< dendl
;
5090 assert(from
== primary
);
5091 assert(query
.type
!= pg_query_t::INFO
);
5092 ConnectionRef con
= osd
->get_con_osd_cluster(
5093 from
.osd
, get_osdmap()->get_epoch());
5096 MOSDPGLog
*mlog
= new MOSDPGLog(
5097 from
.shard
, pg_whoami
.shard
,
5098 get_osdmap()->get_epoch(),
5100 mlog
->missing
= pg_log
.get_missing();
5102 // primary -> other, when building master log
5103 if (query
.type
== pg_query_t::LOG
) {
5104 dout(10) << " sending info+missing+log since " << query
.since
5106 if (query
.since
!= eversion_t() && query
.since
< pg_log
.get_tail()) {
5107 osd
->clog
->error() << info
.pgid
<< " got broken pg_query_t::LOG since " << query
.since
5108 << " when my log.tail is " << pg_log
.get_tail()
5109 << ", sending full log instead";
5110 mlog
->log
= pg_log
.get_log(); // primary should not have requested this!!
5112 mlog
->log
.copy_after(pg_log
.get_log(), query
.since
);
5114 else if (query
.type
== pg_query_t::FULLLOG
) {
5115 dout(10) << " sending info+missing+full log" << dendl
;
5116 mlog
->log
= pg_log
.get_log();
5119 dout(10) << " sending " << mlog
->log
<< " " << mlog
->missing
<< dendl
;
5121 osd
->share_map_peer(from
.osd
, con
.get(), get_osdmap());
5122 osd
->send_message_osd_cluster(mlog
, con
.get());
5125 void PG::check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
)
5127 bool changed
= false;
5128 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
) &&
5129 !lastmap
->test_flag(CEPH_OSDMAP_FULL
)) {
5130 dout(10) << " cluster was marked full in " << osdmap
->get_epoch() << dendl
;
5133 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
5135 if (pi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5136 const pg_pool_t
*opi
= lastmap
->get_pg_pool(info
.pgid
.pool());
5137 if (!opi
|| !opi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5138 dout(10) << " pool was marked full in " << osdmap
->get_epoch() << dendl
;
5143 info
.history
.last_epoch_marked_full
= osdmap
->get_epoch();
5148 bool PG::should_restart_peering(
5150 int newactingprimary
,
5151 const vector
<int>& newup
,
5152 const vector
<int>& newacting
,
5156 if (PastIntervals::is_new_interval(
5168 dout(20) << "new interval newup " << newup
5169 << " newacting " << newacting
<< dendl
;
5176 bool PG::old_peering_msg(epoch_t reply_epoch
, epoch_t query_epoch
)
5178 if (last_peering_reset
> reply_epoch
||
5179 last_peering_reset
> query_epoch
) {
5180 dout(10) << "old_peering_msg reply_epoch " << reply_epoch
<< " query_epoch " << query_epoch
5181 << " last_peering_reset " << last_peering_reset
5188 void PG::set_last_peering_reset()
5190 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl
;
5191 if (last_peering_reset
!= get_osdmap()->get_epoch()) {
5192 last_peering_reset
= get_osdmap()->get_epoch();
5193 reset_interval_flush();
5200 FlushState(PG
*pg
, epoch_t epoch
) : pg(pg
), epoch(epoch
) {}
5203 if (!pg
->pg_has_reset_since(epoch
))
5204 pg
->queue_flushed(epoch
);
5208 typedef ceph::shared_ptr
<FlushState
> FlushStateRef
;
5210 void PG::start_flush(ObjectStore::Transaction
*t
,
5211 list
<Context
*> *on_applied
,
5212 list
<Context
*> *on_safe
)
5214 // flush in progress ops
5215 FlushStateRef
flush_trigger (std::make_shared
<FlushState
>(
5216 this, get_osdmap()->get_epoch()));
5218 flushes_in_progress
++;
5219 on_applied
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
5220 on_safe
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
5223 void PG::reset_interval_flush()
5225 dout(10) << "Clearing blocked outgoing recovery messages" << dendl
;
5226 recovery_state
.clear_blocked_outgoing();
5228 Context
*c
= new QueuePeeringEvt
<IntervalFlush
>(
5229 this, get_osdmap()->get_epoch(), IntervalFlush());
5230 if (!osr
->flush_commit(c
)) {
5231 dout(10) << "Beginning to block outgoing recovery messages" << dendl
;
5232 recovery_state
.begin_block_outgoing();
5234 dout(10) << "Not blocking outgoing recovery messages" << dendl
;
5239 /* Called before initializing peering during advance_map */
5240 void PG::start_peering_interval(
5241 const OSDMapRef lastmap
,
5242 const vector
<int>& newup
, int new_up_primary
,
5243 const vector
<int>& newacting
, int new_acting_primary
,
5244 ObjectStore::Transaction
*t
)
5246 const OSDMapRef osdmap
= get_osdmap();
5248 set_last_peering_reset();
5250 vector
<int> oldacting
, oldup
;
5251 int oldrole
= get_role();
5255 pg_shard_t old_acting_primary
= get_primary();
5256 pg_shard_t old_up_primary
= up_primary
;
5257 bool was_old_primary
= is_primary();
5259 acting
.swap(oldacting
);
5261 init_primary_up_acting(
5265 new_acting_primary
);
5267 if (info
.stats
.up
!= up
||
5268 info
.stats
.acting
!= acting
||
5269 info
.stats
.up_primary
!= new_up_primary
||
5270 info
.stats
.acting_primary
!= new_acting_primary
) {
5272 info
.stats
.up_primary
= new_up_primary
;
5273 info
.stats
.acting
= acting
;
5274 info
.stats
.acting_primary
= new_acting_primary
;
5275 info
.stats
.mapping_epoch
= osdmap
->get_epoch();
5278 pg_stats_publish_lock
.Lock();
5279 pg_stats_publish_valid
= false;
5280 pg_stats_publish_lock
.Unlock();
5282 // This will now be remapped during a backfill in cases
5283 // that it would not have been before.
5285 state_set(PG_STATE_REMAPPED
);
5287 state_clear(PG_STATE_REMAPPED
);
5289 int role
= osdmap
->calc_pg_role(osd
->whoami
, acting
, acting
.size());
5290 if (pool
.info
.is_replicated() || role
== pg_whoami
.shard
)
5295 // did acting, up, primary|acker change?
5297 dout(10) << " no lastmap" << dendl
;
5299 dirty_big_info
= true;
5300 info
.history
.same_interval_since
= osdmap
->get_epoch();
5302 std::stringstream debug
;
5303 assert(info
.history
.same_interval_since
!= 0);
5304 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable(
5305 get_is_recoverable_predicate());
5306 bool new_interval
= PastIntervals::check_new_interval(
5307 old_acting_primary
.osd
,
5309 oldacting
, newacting
,
5313 info
.history
.same_interval_since
,
5314 info
.history
.last_epoch_clean
,
5321 dout(10) << __func__
<< ": check_new_interval output: "
5322 << debug
.str() << dendl
;
5324 if (osdmap
->get_epoch() == osd
->get_superblock().oldest_map
&&
5325 info
.history
.last_epoch_clean
< osdmap
->get_epoch()) {
5326 dout(10) << " map gap, clearing past_intervals and faking" << dendl
;
5327 // our information is incomplete and useless; someone else was clean
5328 // after everything we know if osdmaps were trimmed.
5329 past_intervals
.clear();
5331 dout(10) << " noting past " << past_intervals
<< dendl
;
5334 dirty_big_info
= true;
5335 info
.history
.same_interval_since
= osdmap
->get_epoch();
5336 if (info
.pgid
.pgid
.is_split(lastmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5337 osdmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5339 info
.history
.last_epoch_split
= osdmap
->get_epoch();
5344 if (old_up_primary
!= up_primary
||
5346 info
.history
.same_up_since
= osdmap
->get_epoch();
5348 // this comparison includes primary rank via pg_shard_t
5349 if (old_acting_primary
!= get_primary()) {
5350 info
.history
.same_primary_since
= osdmap
->get_epoch();
5355 dout(1) << __func__
<< " up " << oldup
<< " -> " << up
5356 << ", acting " << oldacting
<< " -> " << acting
5357 << ", acting_primary " << old_acting_primary
<< " -> " << new_acting_primary
5358 << ", up_primary " << old_up_primary
<< " -> " << new_up_primary
5359 << ", role " << oldrole
<< " -> " << role
5360 << ", features acting " << acting_features
5361 << " upacting " << upacting_features
5365 state_clear(PG_STATE_ACTIVE
);
5366 state_clear(PG_STATE_PEERED
);
5367 state_clear(PG_STATE_DOWN
);
5368 state_clear(PG_STATE_RECOVERY_WAIT
);
5369 state_clear(PG_STATE_RECOVERY_TOOFULL
);
5370 state_clear(PG_STATE_RECOVERING
);
5372 peer_purged
.clear();
5373 actingbackfill
.clear();
5374 scrub_queued
= false;
5376 // reset primary state?
5377 if (was_old_primary
|| is_primary()) {
5378 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
5380 clear_primary_state();
5386 projected_last_update
= eversion_t();
5390 // should we tell the primary we are here?
5391 send_notify
= !is_primary();
5393 if (role
!= oldrole
||
5394 was_old_primary
!= is_primary()) {
5395 // did primary change?
5396 if (was_old_primary
!= is_primary()) {
5397 state_clear(PG_STATE_CLEAN
);
5398 clear_publish_stats();
5403 // take active waiters
5404 requeue_ops(waiting_for_peered
);
5408 // did primary change?
5409 if (get_primary() != old_acting_primary
) {
5410 dout(10) << *this << " " << oldacting
<< " -> " << acting
5411 << ", acting primary "
5412 << old_acting_primary
<< " -> " << get_primary()
5415 // primary is the same.
5417 // i am (still) primary. but my replica set changed.
5418 state_clear(PG_STATE_CLEAN
);
5420 dout(10) << oldacting
<< " -> " << acting
5421 << ", replicas changed" << dendl
;
5427 if (acting
.empty() && !up
.empty() && up_primary
== pg_whoami
) {
5428 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl
;
5429 osd
->queue_want_pg_temp(info
.pgid
.pgid
, acting
);
5433 void PG::on_new_interval()
5435 const OSDMapRef osdmap
= get_osdmap();
5439 // initialize features
5440 acting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
5441 upacting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
5442 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
) {
5443 if (*p
== CRUSH_ITEM_NONE
)
5445 uint64_t f
= osdmap
->get_xinfo(*p
).features
;
5446 acting_features
&= f
;
5447 upacting_features
&= f
;
5449 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
5450 if (*p
== CRUSH_ITEM_NONE
)
5452 upacting_features
&= osdmap
->get_xinfo(*p
).features
;
5455 assert(osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
));
5460 void PG::proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&oinfo
)
5462 assert(!is_primary());
5464 update_history(oinfo
.history
);
5466 if (last_complete_ondisk
.epoch
>= info
.history
.last_epoch_started
) {
5467 // DEBUG: verify that the snaps are empty in snap_mapper
5468 if (cct
->_conf
->osd_debug_verify_snaps_on_info
) {
5469 interval_set
<snapid_t
> p
;
5470 p
.union_of(oinfo
.purged_snaps
, info
.purged_snaps
);
5471 p
.subtract(info
.purged_snaps
);
5473 for (interval_set
<snapid_t
>::iterator i
= p
.begin();
5476 for (snapid_t snap
= i
.get_start();
5477 snap
!= i
.get_len() + i
.get_start();
5479 vector
<hobject_t
> hoids
;
5480 int r
= snap_mapper
.get_next_objects_to_trim(snap
, 1, &hoids
);
5481 if (r
!= 0 && r
!= -ENOENT
) {
5482 derr
<< __func__
<< ": snap_mapper get_next_object_to_trim returned "
5483 << cpp_strerror(r
) << dendl
;
5485 } else if (r
!= -ENOENT
) {
5486 assert(!hoids
.empty());
5487 derr
<< __func__
<< ": snap_mapper get_next_object_to_trim returned "
5488 << cpp_strerror(r
) << " for object "
5489 << hoids
[0] << " on snap " << snap
5490 << " which should have been fully trimmed " << dendl
;
5497 info
.purged_snaps
= oinfo
.purged_snaps
;
5499 dirty_big_info
= true;
5503 ostream
& operator<<(ostream
& out
, const PG
& pg
)
5505 out
<< "pg[" << pg
.info
5507 if (pg
.acting
!= pg
.up
)
5508 out
<< "/" << pg
.acting
;
5509 out
<< " r=" << pg
.get_role();
5510 out
<< " lpr=" << pg
.get_last_peering_reset();
5512 if (!pg
.past_intervals
.empty()) {
5513 out
<< " pi=[" << pg
.past_intervals
.get_bounds()
5514 << ")/" << pg
.past_intervals
.size();
5517 if (pg
.is_peered()) {
5518 if (pg
.last_update_ondisk
!= pg
.info
.last_update
)
5519 out
<< " luod=" << pg
.last_update_ondisk
;
5520 if (pg
.last_update_applied
!= pg
.info
.last_update
)
5521 out
<< " lua=" << pg
.last_update_applied
;
5524 if (pg
.recovery_ops_active
)
5525 out
<< " rops=" << pg
.recovery_ops_active
;
5527 if (pg
.pg_log
.get_tail() != pg
.info
.log_tail
||
5528 pg
.pg_log
.get_head() != pg
.info
.last_update
)
5529 out
<< " (info mismatch, " << pg
.pg_log
.get_log() << ")";
5531 if (!pg
.pg_log
.get_log().empty()) {
5532 if ((pg
.pg_log
.get_log().log
.begin()->version
<= pg
.pg_log
.get_tail())) {
5533 out
<< " (log bound mismatch, actual=["
5534 << pg
.pg_log
.get_log().log
.begin()->version
<< ","
5535 << pg
.pg_log
.get_log().log
.rbegin()->version
<< "]";
5540 if (!pg
.backfill_targets
.empty())
5541 out
<< " bft=" << pg
.backfill_targets
;
5542 out
<< " crt=" << pg
.pg_log
.get_can_rollback_to();
5544 if (pg
.last_complete_ondisk
!= pg
.info
.last_complete
)
5545 out
<< " lcod " << pg
.last_complete_ondisk
;
5547 if (pg
.is_primary()) {
5548 out
<< " mlcod " << pg
.min_last_complete_ondisk
;
5551 out
<< " " << pg_state_string(pg
.get_state());
5552 if (pg
.should_send_notify())
5555 if (pg
.scrubber
.must_repair
)
5556 out
<< " MUST_REPAIR";
5557 if (pg
.scrubber
.auto_repair
)
5558 out
<< " AUTO_REPAIR";
5559 if (pg
.scrubber
.must_deep_scrub
)
5560 out
<< " MUST_DEEP_SCRUB";
5561 if (pg
.scrubber
.must_scrub
)
5562 out
<< " MUST_SCRUB";
5564 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5565 if (pg
.pg_log
.get_missing().num_missing()) {
5566 out
<< " m=" << pg
.pg_log
.get_missing().num_missing();
5567 if (pg
.is_primary()) {
5568 uint64_t unfound
= pg
.get_num_unfound();
5570 out
<< " u=" << unfound
;
5573 if (pg
.snap_trimq
.size())
5574 out
<< " snaptrimq=" << pg
.snap_trimq
;
5582 bool PG::can_discard_op(OpRequestRef
& op
)
5584 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
5585 if (cct
->_conf
->osd_discard_disconnected_ops
&& OSD::op_is_discardable(m
)) {
5586 dout(20) << " discard " << *m
<< dendl
;
5590 if (m
->get_map_epoch() < info
.history
.same_primary_since
) {
5591 dout(7) << " changed after " << m
->get_map_epoch()
5592 << ", dropping " << *m
<< dendl
;
5596 if (m
->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT
)) {
5597 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend()) {
5598 dout(7) << __func__
<< " sent before last_force_op_resend "
5599 << pool
.info
.last_force_op_resend
<< ", dropping" << *m
<< dendl
;
5602 if (m
->get_map_epoch() < info
.history
.last_epoch_split
) {
5603 dout(7) << __func__
<< " pg split in "
5604 << info
.history
.last_epoch_split
<< ", dropping" << dendl
;
5607 } else if (m
->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND
)) {
5608 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_preluminous()) {
5609 dout(7) << __func__
<< " sent before last_force_op_resend_preluminous "
5610 << pool
.info
.last_force_op_resend_preluminous
5611 << ", dropping" << *m
<< dendl
;
5619 template<typename T
, int MSGTYPE
>
5620 bool PG::can_discard_replica_op(OpRequestRef
& op
)
5622 const T
*m
= static_cast<const T
*>(op
->get_req());
5623 assert(m
->get_type() == MSGTYPE
);
5625 /* Mostly, this overlaps with the old_peering_msg
5626 * condition. An important exception is pushes
5627 * sent by replicas not in the acting set, since
5628 * if such a replica goes down it does not cause
5629 * a new interval. */
5630 int from
= m
->get_source().num();
5631 if (get_osdmap()->get_down_at(from
) >= m
->map_epoch
)
5635 // if pg changes _at all_, we reset and repeer!
5636 if (old_peering_msg(m
->map_epoch
, m
->map_epoch
)) {
5637 dout(10) << "can_discard_replica_op pg changed " << info
.history
5638 << " after " << m
->map_epoch
5639 << ", dropping" << dendl
;
5645 bool PG::can_discard_scan(OpRequestRef op
)
5647 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
5648 assert(m
->get_type() == MSG_OSD_PG_SCAN
);
5650 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
5651 dout(10) << " got old scan, ignoring" << dendl
;
5657 bool PG::can_discard_backfill(OpRequestRef op
)
5659 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
5660 assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
5662 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
5663 dout(10) << " got old backfill, ignoring" << dendl
;
5671 bool PG::can_discard_request(OpRequestRef
& op
)
5673 switch (op
->get_req()->get_type()) {
5674 case CEPH_MSG_OSD_OP
:
5675 return can_discard_op(op
);
5676 case CEPH_MSG_OSD_BACKOFF
:
5677 return false; // never discard
5679 return can_discard_replica_op
<MOSDSubOp
, MSG_OSD_SUBOP
>(op
);
5681 return can_discard_replica_op
<MOSDRepOp
, MSG_OSD_REPOP
>(op
);
5682 case MSG_OSD_PG_PUSH
:
5683 return can_discard_replica_op
<MOSDPGPush
, MSG_OSD_PG_PUSH
>(op
);
5684 case MSG_OSD_PG_PULL
:
5685 return can_discard_replica_op
<MOSDPGPull
, MSG_OSD_PG_PULL
>(op
);
5686 case MSG_OSD_PG_PUSH_REPLY
:
5687 return can_discard_replica_op
<MOSDPGPushReply
, MSG_OSD_PG_PUSH_REPLY
>(op
);
5688 case MSG_OSD_SUBOPREPLY
:
5689 return can_discard_replica_op
<MOSDSubOpReply
, MSG_OSD_SUBOPREPLY
>(op
);
5690 case MSG_OSD_REPOPREPLY
:
5691 return can_discard_replica_op
<MOSDRepOpReply
, MSG_OSD_REPOPREPLY
>(op
);
5693 case MSG_OSD_EC_WRITE
:
5694 return can_discard_replica_op
<MOSDECSubOpWrite
, MSG_OSD_EC_WRITE
>(op
);
5695 case MSG_OSD_EC_WRITE_REPLY
:
5696 return can_discard_replica_op
<MOSDECSubOpWriteReply
, MSG_OSD_EC_WRITE_REPLY
>(op
);
5697 case MSG_OSD_EC_READ
:
5698 return can_discard_replica_op
<MOSDECSubOpRead
, MSG_OSD_EC_READ
>(op
);
5699 case MSG_OSD_EC_READ_REPLY
:
5700 return can_discard_replica_op
<MOSDECSubOpReadReply
, MSG_OSD_EC_READ_REPLY
>(op
);
5701 case MSG_OSD_REP_SCRUB
:
5702 return can_discard_replica_op
<MOSDRepScrub
, MSG_OSD_REP_SCRUB
>(op
);
5703 case MSG_OSD_SCRUB_RESERVE
:
5704 return can_discard_replica_op
<MOSDScrubReserve
, MSG_OSD_SCRUB_RESERVE
>(op
);
5705 case MSG_OSD_REP_SCRUBMAP
:
5706 return can_discard_replica_op
<MOSDRepScrubMap
, MSG_OSD_REP_SCRUBMAP
>(op
);
5707 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
5708 return can_discard_replica_op
<
5709 MOSDPGUpdateLogMissing
, MSG_OSD_PG_UPDATE_LOG_MISSING
>(op
);
5710 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
5711 return can_discard_replica_op
<
5712 MOSDPGUpdateLogMissingReply
, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
>(op
);
5714 case MSG_OSD_PG_SCAN
:
5715 return can_discard_scan(op
);
5716 case MSG_OSD_PG_BACKFILL
:
5717 return can_discard_backfill(op
);
5718 case MSG_OSD_PG_BACKFILL_REMOVE
:
5719 return can_discard_replica_op
<MOSDPGBackfillRemove
,
5720 MSG_OSD_PG_BACKFILL_REMOVE
>(op
);
5725 void PG::take_waiters()
5727 dout(10) << "take_waiters" << dendl
;
5728 requeue_map_waiters();
5729 for (list
<CephPeeringEvtRef
>::iterator i
= peering_waiters
.begin();
5730 i
!= peering_waiters
.end();
5731 ++i
) osd
->queue_for_peering(this);
5732 peering_queue
.splice(peering_queue
.begin(), peering_waiters
,
5733 peering_waiters
.begin(), peering_waiters
.end());
5736 void PG::handle_peering_event(CephPeeringEvtRef evt
, RecoveryCtx
*rctx
)
5738 dout(10) << "handle_peering_event: " << evt
->get_desc() << dendl
;
5739 if (!have_same_or_newer_map(evt
->get_epoch_sent())) {
5740 dout(10) << "deferring event " << evt
->get_desc() << dendl
;
5741 peering_waiters
.push_back(evt
);
5744 if (old_peering_evt(evt
))
5746 recovery_state
.handle_event(evt
, rctx
);
5749 void PG::queue_peering_event(CephPeeringEvtRef evt
)
5751 if (old_peering_evt(evt
))
5753 peering_queue
.push_back(evt
);
5754 osd
->queue_for_peering(this);
5757 void PG::queue_null(epoch_t msg_epoch
,
5758 epoch_t query_epoch
)
5760 dout(10) << "null" << dendl
;
5761 queue_peering_event(
5762 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
5766 void PG::queue_flushed(epoch_t e
)
5768 dout(10) << "flushed" << dendl
;
5769 queue_peering_event(
5770 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(e
, e
,
5774 void PG::queue_query(epoch_t msg_epoch
,
5775 epoch_t query_epoch
,
5776 pg_shard_t from
, const pg_query_t
& q
)
5778 dout(10) << "handle_query " << q
<< " from replica " << from
<< dendl
;
5779 queue_peering_event(
5780 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
5781 MQuery(from
, q
, query_epoch
))));
5784 void PG::handle_advance_map(
5785 OSDMapRef osdmap
, OSDMapRef lastmap
,
5786 vector
<int>& newup
, int up_primary
,
5787 vector
<int>& newacting
, int acting_primary
,
5790 assert(lastmap
->get_epoch() == osdmap_ref
->get_epoch());
5791 assert(lastmap
== osdmap_ref
);
5792 dout(10) << "handle_advance_map "
5793 << newup
<< "/" << newacting
5794 << " -- " << up_primary
<< "/" << acting_primary
5796 update_osdmap_ref(osdmap
);
5797 pool
.update(osdmap
);
5798 past_intervals
.update_type_from_map(pool
.info
.ec_pool(), *osdmap
);
5799 if (cct
->_conf
->osd_debug_verify_cached_snaps
) {
5800 interval_set
<snapid_t
> actual_removed_snaps
;
5801 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
5803 pi
->build_removed_snaps(actual_removed_snaps
);
5804 if (!(actual_removed_snaps
== pool
.cached_removed_snaps
)) {
5805 derr
<< __func__
<< ": mismatch between the actual removed snaps "
5806 << actual_removed_snaps
<< " and pool.cached_removed_snaps "
5807 << " pool.cached_removed_snaps " << pool
.cached_removed_snaps
5810 assert(actual_removed_snaps
== pool
.cached_removed_snaps
);
5813 osdmap
, lastmap
, newup
, up_primary
,
5814 newacting
, acting_primary
);
5815 recovery_state
.handle_event(evt
, rctx
);
5816 if (pool
.info
.last_change
== osdmap_ref
->get_epoch()) {
5818 update_store_with_options();
5822 void PG::handle_activate_map(RecoveryCtx
*rctx
)
5824 dout(10) << "handle_activate_map " << dendl
;
5826 recovery_state
.handle_event(evt
, rctx
);
5827 if (osdmap_ref
->get_epoch() - last_persisted_osdmap_ref
->get_epoch() >
5828 cct
->_conf
->osd_pg_epoch_persisted_max_stale
) {
5829 dout(20) << __func__
<< ": Dirtying info: last_persisted is "
5830 << last_persisted_osdmap_ref
->get_epoch()
5831 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
5834 dout(20) << __func__
<< ": Not dirtying info: last_persisted is "
5835 << last_persisted_osdmap_ref
->get_epoch()
5836 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
5838 if (osdmap_ref
->check_new_blacklist_entries()) check_blacklisted_watchers();
5841 void PG::handle_loaded(RecoveryCtx
*rctx
)
5843 dout(10) << "handle_loaded" << dendl
;
5845 recovery_state
.handle_event(evt
, rctx
);
5848 void PG::handle_create(RecoveryCtx
*rctx
)
5850 dout(10) << "handle_create" << dendl
;
5851 rctx
->created_pgs
.insert(this);
5853 recovery_state
.handle_event(evt
, rctx
);
5855 recovery_state
.handle_event(evt2
, rctx
);
5858 void PG::handle_query_state(Formatter
*f
)
5860 dout(10) << "handle_query_state" << dendl
;
5862 recovery_state
.handle_event(q
, 0);
5865 void PG::update_store_with_options()
5867 auto r
= osd
->store
->set_collection_opts(coll
, pool
.info
.opts
);
5868 if(r
< 0 && r
!= -EOPNOTSUPP
) {
5869 derr
<< __func__
<< "set_collection_opts returns error:" << r
<< dendl
;
5873 void PG::update_store_on_load()
5875 if (osd
->store
->get_type() == "filestore") {
5876 // legacy filestore didn't store collection bit width; fix.
5877 int bits
= osd
->store
->collection_bits(coll
);
5882 bits
= info
.pgid
.get_split_bits(pool
.info
.get_pg_num());
5883 lderr(cct
) << __func__
<< " setting bit width to " << bits
<< dendl
;
5884 ObjectStore::Transaction t
;
5885 t
.collection_set_bits(coll
, bits
);
5886 osd
->store
->apply_transaction(osr
.get(), std::move(t
));
5891 /*------------ Recovery State Machine----------------*/
5893 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5894 << "state<" << get_state_name() << ">: ")
5896 /*------Crashed-------*/
5897 PG::RecoveryState::Crashed::Crashed(my_context ctx
)
5899 NamedState(context
< RecoveryMachine
>().pg
, "Crashed")
5901 context
< RecoveryMachine
>().log_enter(state_name
);
5902 assert(0 == "we got a bad state machine event");
5906 /*------Initial-------*/
5907 PG::RecoveryState::Initial::Initial(my_context ctx
)
5909 NamedState(context
< RecoveryMachine
>().pg
, "Initial")
5911 context
< RecoveryMachine
>().log_enter(state_name
);
5914 boost::statechart::result
PG::RecoveryState::Initial::react(const Load
& l
)
5916 PG
*pg
= context
< RecoveryMachine
>().pg
;
5918 // do we tell someone we're here?
5919 pg
->send_notify
= (!pg
->is_primary());
5920 pg
->update_store_with_options();
5922 pg
->update_store_on_load();
5924 return transit
< Reset
>();
5927 boost::statechart::result
PG::RecoveryState::Initial::react(const MNotifyRec
& notify
)
5929 PG
*pg
= context
< RecoveryMachine
>().pg
;
5930 pg
->proc_replica_info(
5931 notify
.from
, notify
.notify
.info
, notify
.notify
.epoch_sent
);
5932 pg
->set_last_peering_reset();
5933 return transit
< Primary
>();
5936 boost::statechart::result
PG::RecoveryState::Initial::react(const MInfoRec
& i
)
5938 PG
*pg
= context
< RecoveryMachine
>().pg
;
5939 assert(!pg
->is_primary());
5941 return transit
< Stray
>();
5944 boost::statechart::result
PG::RecoveryState::Initial::react(const MLogRec
& i
)
5946 PG
*pg
= context
< RecoveryMachine
>().pg
;
5947 assert(!pg
->is_primary());
5949 return transit
< Stray
>();
5952 void PG::RecoveryState::Initial::exit()
5954 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
5955 PG
*pg
= context
< RecoveryMachine
>().pg
;
5956 utime_t dur
= ceph_clock_now() - enter_time
;
5957 pg
->osd
->recoverystate_perf
->tinc(rs_initial_latency
, dur
);
5960 /*------Started-------*/
5961 PG::RecoveryState::Started::Started(my_context ctx
)
5963 NamedState(context
< RecoveryMachine
>().pg
, "Started")
5965 context
< RecoveryMachine
>().log_enter(state_name
);
5968 boost::statechart::result
5969 PG::RecoveryState::Started::react(const IntervalFlush
&)
5971 PG
*pg
= context
< RecoveryMachine
>().pg
;
5972 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
5973 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
5974 return discard_event();
5978 boost::statechart::result
5979 PG::RecoveryState::Started::react(const FlushedEvt
&)
5981 PG
*pg
= context
< RecoveryMachine
>().pg
;
5983 return discard_event();
5987 boost::statechart::result
PG::RecoveryState::Started::react(const AdvMap
& advmap
)
5989 PG
*pg
= context
< RecoveryMachine
>().pg
;
5990 ldout(pg
->cct
, 10) << "Started advmap" << dendl
;
5991 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
5992 if (pg
->should_restart_peering(
5994 advmap
.acting_primary
,
5999 ldout(pg
->cct
, 10) << "should_restart_peering, transitioning to Reset"
6002 return transit
< Reset
>();
6004 pg
->remove_down_peer_info(advmap
.osdmap
);
6005 return discard_event();
6008 boost::statechart::result
PG::RecoveryState::Started::react(const QueryState
& q
)
6010 q
.f
->open_object_section("state");
6011 q
.f
->dump_string("name", state_name
);
6012 q
.f
->dump_stream("enter_time") << enter_time
;
6013 q
.f
->close_section();
6014 return discard_event();
6017 void PG::RecoveryState::Started::exit()
6019 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6020 PG
*pg
= context
< RecoveryMachine
>().pg
;
6021 utime_t dur
= ceph_clock_now() - enter_time
;
6022 pg
->osd
->recoverystate_perf
->tinc(rs_started_latency
, dur
);
6025 /*--------Reset---------*/
6026 PG::RecoveryState::Reset::Reset(my_context ctx
)
6028 NamedState(context
< RecoveryMachine
>().pg
, "Reset")
6030 context
< RecoveryMachine
>().log_enter(state_name
);
6031 PG
*pg
= context
< RecoveryMachine
>().pg
;
6033 pg
->flushes_in_progress
= 0;
6034 pg
->set_last_peering_reset();
6037 boost::statechart::result
6038 PG::RecoveryState::Reset::react(const FlushedEvt
&)
6040 PG
*pg
= context
< RecoveryMachine
>().pg
;
6042 return discard_event();
6045 boost::statechart::result
6046 PG::RecoveryState::Reset::react(const IntervalFlush
&)
6048 PG
*pg
= context
< RecoveryMachine
>().pg
;
6049 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
6050 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
6051 return discard_event();
6054 boost::statechart::result
PG::RecoveryState::Reset::react(const AdvMap
& advmap
)
6056 PG
*pg
= context
< RecoveryMachine
>().pg
;
6057 ldout(pg
->cct
, 10) << "Reset advmap" << dendl
;
6059 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
6061 if (pg
->should_restart_peering(
6063 advmap
.acting_primary
,
6068 ldout(pg
->cct
, 10) << "should restart peering, calling start_peering_interval again"
6070 pg
->start_peering_interval(
6072 advmap
.newup
, advmap
.up_primary
,
6073 advmap
.newacting
, advmap
.acting_primary
,
6074 context
< RecoveryMachine
>().get_cur_transaction());
6076 pg
->remove_down_peer_info(advmap
.osdmap
);
6077 pg
->check_past_interval_bounds();
6078 return discard_event();
6081 boost::statechart::result
PG::RecoveryState::Reset::react(const ActMap
&)
6083 PG
*pg
= context
< RecoveryMachine
>().pg
;
6084 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
6085 context
< RecoveryMachine
>().send_notify(
6088 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
6089 pg
->get_osdmap()->get_epoch(),
6090 pg
->get_osdmap()->get_epoch(),
6092 pg
->past_intervals
);
6095 pg
->update_heartbeat_peers();
6098 return transit
< Started
>();
6101 boost::statechart::result
PG::RecoveryState::Reset::react(const QueryState
& q
)
6103 q
.f
->open_object_section("state");
6104 q
.f
->dump_string("name", state_name
);
6105 q
.f
->dump_stream("enter_time") << enter_time
;
6106 q
.f
->close_section();
6107 return discard_event();
6110 void PG::RecoveryState::Reset::exit()
6112 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6113 PG
*pg
= context
< RecoveryMachine
>().pg
;
6114 utime_t dur
= ceph_clock_now() - enter_time
;
6115 pg
->osd
->recoverystate_perf
->tinc(rs_reset_latency
, dur
);
6118 /*-------Start---------*/
6119 PG::RecoveryState::Start::Start(my_context ctx
)
6121 NamedState(context
< RecoveryMachine
>().pg
, "Start")
6123 context
< RecoveryMachine
>().log_enter(state_name
);
6125 PG
*pg
= context
< RecoveryMachine
>().pg
;
6126 if (pg
->is_primary()) {
6127 ldout(pg
->cct
, 1) << "transitioning to Primary" << dendl
;
6128 post_event(MakePrimary());
6130 ldout(pg
->cct
, 1) << "transitioning to Stray" << dendl
;
6131 post_event(MakeStray());
6135 void PG::RecoveryState::Start::exit()
6137 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6138 PG
*pg
= context
< RecoveryMachine
>().pg
;
6139 utime_t dur
= ceph_clock_now() - enter_time
;
6140 pg
->osd
->recoverystate_perf
->tinc(rs_start_latency
, dur
);
6143 /*---------Primary--------*/
6144 PG::RecoveryState::Primary::Primary(my_context ctx
)
6146 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary")
6148 context
< RecoveryMachine
>().log_enter(state_name
);
6149 PG
*pg
= context
< RecoveryMachine
>().pg
;
6150 assert(pg
->want_acting
.empty());
6152 // set CREATING bit until we have peered for the first time.
6153 if (pg
->info
.history
.last_epoch_started
== 0) {
6154 pg
->state_set(PG_STATE_CREATING
);
6155 // use the history timestamp, which ultimately comes from the
6156 // monitor in the create case.
6157 utime_t t
= pg
->info
.history
.last_scrub_stamp
;
6158 pg
->info
.stats
.last_fresh
= t
;
6159 pg
->info
.stats
.last_active
= t
;
6160 pg
->info
.stats
.last_change
= t
;
6161 pg
->info
.stats
.last_peered
= t
;
6162 pg
->info
.stats
.last_clean
= t
;
6163 pg
->info
.stats
.last_unstale
= t
;
6164 pg
->info
.stats
.last_undegraded
= t
;
6165 pg
->info
.stats
.last_fullsized
= t
;
6166 pg
->info
.stats
.last_scrub_stamp
= t
;
6167 pg
->info
.stats
.last_deep_scrub_stamp
= t
;
6168 pg
->info
.stats
.last_clean_scrub_stamp
= t
;
6172 boost::statechart::result
PG::RecoveryState::Primary::react(const MNotifyRec
& notevt
)
6174 PG
*pg
= context
< RecoveryMachine
>().pg
;
6175 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
6176 pg
->proc_replica_info(
6177 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
6178 return discard_event();
6181 boost::statechart::result
PG::RecoveryState::Primary::react(const ActMap
&)
6183 PG
*pg
= context
< RecoveryMachine
>().pg
;
6184 ldout(pg
->cct
, 7) << "handle ActMap primary" << dendl
;
6185 pg
->publish_stats_to_osd();
6187 return discard_event();
6190 void PG::RecoveryState::Primary::exit()
6192 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6193 PG
*pg
= context
< RecoveryMachine
>().pg
;
6194 pg
->want_acting
.clear();
6195 utime_t dur
= ceph_clock_now() - enter_time
;
6196 pg
->osd
->recoverystate_perf
->tinc(rs_primary_latency
, dur
);
6197 pg
->clear_primary_state();
6198 pg
->state_clear(PG_STATE_CREATING
);
6201 /*---------Peering--------*/
6202 PG::RecoveryState::Peering::Peering(my_context ctx
)
6204 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering"),
6205 history_les_bound(false)
6207 context
< RecoveryMachine
>().log_enter(state_name
);
6209 PG
*pg
= context
< RecoveryMachine
>().pg
;
6210 assert(!pg
->is_peered());
6211 assert(!pg
->is_peering());
6212 assert(pg
->is_primary());
6213 pg
->state_set(PG_STATE_PEERING
);
6216 boost::statechart::result
PG::RecoveryState::Peering::react(const AdvMap
& advmap
)
6218 PG
*pg
= context
< RecoveryMachine
>().pg
;
6219 ldout(pg
->cct
, 10) << "Peering advmap" << dendl
;
6220 if (prior_set
.affected_by_map(*(advmap
.osdmap
), pg
)) {
6221 ldout(pg
->cct
, 1) << "Peering, affected_by_map, going to Reset" << dendl
;
6223 return transit
< Reset
>();
6226 pg
->adjust_need_up_thru(advmap
.osdmap
);
6228 return forward_event();
6231 boost::statechart::result
PG::RecoveryState::Peering::react(const QueryState
& q
)
6233 PG
*pg
= context
< RecoveryMachine
>().pg
;
6235 q
.f
->open_object_section("state");
6236 q
.f
->dump_string("name", state_name
);
6237 q
.f
->dump_stream("enter_time") << enter_time
;
6239 q
.f
->open_array_section("past_intervals");
6240 pg
->past_intervals
.dump(q
.f
);
6241 q
.f
->close_section();
6243 q
.f
->open_array_section("probing_osds");
6244 for (set
<pg_shard_t
>::iterator p
= prior_set
.probe
.begin();
6245 p
!= prior_set
.probe
.end();
6247 q
.f
->dump_stream("osd") << *p
;
6248 q
.f
->close_section();
6250 if (prior_set
.pg_down
)
6251 q
.f
->dump_string("blocked", "peering is blocked due to down osds");
6253 q
.f
->open_array_section("down_osds_we_would_probe");
6254 for (set
<int>::iterator p
= prior_set
.down
.begin();
6255 p
!= prior_set
.down
.end();
6257 q
.f
->dump_int("osd", *p
);
6258 q
.f
->close_section();
6260 q
.f
->open_array_section("peering_blocked_by");
6261 for (map
<int,epoch_t
>::iterator p
= prior_set
.blocked_by
.begin();
6262 p
!= prior_set
.blocked_by
.end();
6264 q
.f
->open_object_section("osd");
6265 q
.f
->dump_int("osd", p
->first
);
6266 q
.f
->dump_int("current_lost_at", p
->second
);
6267 q
.f
->dump_string("comment", "starting or marking this osd lost may let us proceed");
6268 q
.f
->close_section();
6270 q
.f
->close_section();
6272 if (history_les_bound
) {
6273 q
.f
->open_array_section("peering_blocked_by_detail");
6274 q
.f
->open_object_section("item");
6275 q
.f
->dump_string("detail","peering_blocked_by_history_les_bound");
6276 q
.f
->close_section();
6277 q
.f
->close_section();
6280 q
.f
->close_section();
6281 return forward_event();
6284 void PG::RecoveryState::Peering::exit()
6286 PG
*pg
= context
< RecoveryMachine
>().pg
;
6287 ldout(pg
->cct
, 10) << "Leaving Peering" << dendl
;
6288 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6289 pg
->state_clear(PG_STATE_PEERING
);
6290 pg
->clear_probe_targets();
6292 utime_t dur
= ceph_clock_now() - enter_time
;
6293 pg
->osd
->recoverystate_perf
->tinc(rs_peering_latency
, dur
);
6297 /*------Backfilling-------*/
6298 PG::RecoveryState::Backfilling::Backfilling(my_context ctx
)
6300 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Backfilling")
6302 context
< RecoveryMachine
>().log_enter(state_name
);
6303 PG
*pg
= context
< RecoveryMachine
>().pg
;
6304 pg
->backfill_reserved
= true;
6305 pg
->queue_recovery();
6306 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
6307 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
6308 pg
->state_set(PG_STATE_BACKFILL
);
6309 pg
->publish_stats_to_osd();
6312 boost::statechart::result
6313 PG::RecoveryState::Backfilling::react(const CancelBackfill
&)
6315 PG
*pg
= context
< RecoveryMachine
>().pg
;
6316 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6317 // XXX: Add a new pg state so user can see why backfill isn't proceeding
6318 // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
6319 //pg->state_set(PG_STATE_BACKFILL_STALLED????);
6321 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6322 it
!= pg
->backfill_targets
.end();
6324 assert(*it
!= pg
->pg_whoami
);
6325 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6326 it
->osd
, pg
->get_osdmap()->get_epoch());
6328 pg
->osd
->send_message_osd_cluster(
6329 new MBackfillReserve(
6330 MBackfillReserve::REJECT
,
6331 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6332 pg
->get_osdmap()->get_epoch()),
6337 pg
->waiting_on_backfill
.clear();
6339 pg
->schedule_backfill_full_retry();
6340 return transit
<NotBackfilling
>();
6343 boost::statechart::result
6344 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected
&)
6346 PG
*pg
= context
< RecoveryMachine
>().pg
;
6347 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6348 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
6350 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6351 it
!= pg
->backfill_targets
.end();
6353 assert(*it
!= pg
->pg_whoami
);
6354 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6355 it
->osd
, pg
->get_osdmap()->get_epoch());
6357 pg
->osd
->send_message_osd_cluster(
6358 new MBackfillReserve(
6359 MBackfillReserve::REJECT
,
6360 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6361 pg
->get_osdmap()->get_epoch()),
6366 pg
->waiting_on_backfill
.clear();
6367 pg
->finish_recovery_op(hobject_t::get_max());
6369 pg
->schedule_backfill_full_retry();
6370 return transit
<NotBackfilling
>();
6373 void PG::RecoveryState::Backfilling::exit()
6375 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6376 PG
*pg
= context
< RecoveryMachine
>().pg
;
6377 pg
->backfill_reserved
= false;
6378 pg
->backfill_reserving
= false;
6379 pg
->state_clear(PG_STATE_BACKFILL
);
6380 utime_t dur
= ceph_clock_now() - enter_time
;
6381 pg
->osd
->recoverystate_perf
->tinc(rs_backfilling_latency
, dur
);
6384 /*--WaitRemoteBackfillReserved--*/
6386 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx
)
6388 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6389 backfill_osd_it(context
< Active
>().remote_shards_to_reserve_backfill
.begin())
6391 context
< RecoveryMachine
>().log_enter(state_name
);
6392 PG
*pg
= context
< RecoveryMachine
>().pg
;
6393 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6394 pg
->publish_stats_to_osd();
6395 post_event(RemoteBackfillReserved());
6398 boost::statechart::result
6399 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved
&evt
)
6401 PG
*pg
= context
< RecoveryMachine
>().pg
;
6403 if (backfill_osd_it
!= context
< Active
>().remote_shards_to_reserve_backfill
.end()) {
6404 //The primary never backfills itself
6405 assert(*backfill_osd_it
!= pg
->pg_whoami
);
6406 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6407 backfill_osd_it
->osd
, pg
->get_osdmap()->get_epoch());
6409 pg
->osd
->send_message_osd_cluster(
6410 new MBackfillReserve(
6411 MBackfillReserve::REQUEST
,
6412 spg_t(pg
->info
.pgid
.pgid
, backfill_osd_it
->shard
),
6413 pg
->get_osdmap()->get_epoch(),
6414 pg
->get_backfill_priority()),
6419 post_event(AllBackfillsReserved());
6421 return discard_event();
6424 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6426 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6427 PG
*pg
= context
< RecoveryMachine
>().pg
;
6428 utime_t dur
= ceph_clock_now() - enter_time
;
6429 pg
->osd
->recoverystate_perf
->tinc(rs_waitremotebackfillreserved_latency
, dur
);
6432 boost::statechart::result
6433 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected
&evt
)
6435 PG
*pg
= context
< RecoveryMachine
>().pg
;
6436 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6438 // Send REJECT to all previously acquired reservations
6439 set
<pg_shard_t
>::const_iterator it
, begin
, end
, next
;
6440 begin
= context
< Active
>().remote_shards_to_reserve_backfill
.begin();
6441 end
= context
< Active
>().remote_shards_to_reserve_backfill
.end();
6442 assert(begin
!= end
);
6443 for (next
= it
= begin
, ++next
; next
!= backfill_osd_it
; ++it
, ++next
) {
6444 //The primary never backfills itself
6445 assert(*it
!= pg
->pg_whoami
);
6446 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6447 it
->osd
, pg
->get_osdmap()->get_epoch());
6449 pg
->osd
->send_message_osd_cluster(
6450 new MBackfillReserve(
6451 MBackfillReserve::REJECT
,
6452 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6453 pg
->get_osdmap()->get_epoch()),
6458 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
6459 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
6460 pg
->publish_stats_to_osd();
6462 pg
->schedule_backfill_full_retry();
6464 return transit
<NotBackfilling
>();
6467 /*--WaitLocalBackfillReserved--*/
6468 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx
)
6470 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalBackfillReserved")
6472 context
< RecoveryMachine
>().log_enter(state_name
);
6473 PG
*pg
= context
< RecoveryMachine
>().pg
;
6474 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6475 pg
->osd
->local_reserver
.request_reservation(
6477 new QueuePeeringEvt
<LocalBackfillReserved
>(
6478 pg
, pg
->get_osdmap()->get_epoch(),
6479 LocalBackfillReserved()),
6480 pg
->get_backfill_priority());
6481 pg
->publish_stats_to_osd();
6484 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6486 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6487 PG
*pg
= context
< RecoveryMachine
>().pg
;
6488 utime_t dur
= ceph_clock_now() - enter_time
;
6489 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalbackfillreserved_latency
, dur
);
6492 /*----NotBackfilling------*/
6493 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx
)
6495 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotBackfilling")
6497 context
< RecoveryMachine
>().log_enter(state_name
);
6498 PG
*pg
= context
< RecoveryMachine
>().pg
;
6499 pg
->publish_stats_to_osd();
6502 boost::statechart::result
6503 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved
&evt
)
6505 return discard_event();
6508 boost::statechart::result
6509 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected
&evt
)
6511 return discard_event();
6514 void PG::RecoveryState::NotBackfilling::exit()
6516 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6517 PG
*pg
= context
< RecoveryMachine
>().pg
;
6518 utime_t dur
= ceph_clock_now() - enter_time
;
6519 pg
->osd
->recoverystate_perf
->tinc(rs_notbackfilling_latency
, dur
);
6522 /*----NotRecovering------*/
6523 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx
)
6525 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotRecovering")
6527 context
< RecoveryMachine
>().log_enter(state_name
);
6528 PG
*pg
= context
< RecoveryMachine
>().pg
;
6529 pg
->publish_stats_to_osd();
6532 void PG::RecoveryState::NotRecovering::exit()
6534 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6535 PG
*pg
= context
< RecoveryMachine
>().pg
;
6536 utime_t dur
= ceph_clock_now() - enter_time
;
6537 pg
->osd
->recoverystate_perf
->tinc(rs_notrecovering_latency
, dur
);
6540 /*---RepNotRecovering----*/
6541 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx
)
6543 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepNotRecovering")
6545 context
< RecoveryMachine
>().log_enter(state_name
);
6548 void PG::RecoveryState::RepNotRecovering::exit()
6550 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6551 PG
*pg
= context
< RecoveryMachine
>().pg
;
6552 utime_t dur
= ceph_clock_now() - enter_time
;
6553 pg
->osd
->recoverystate_perf
->tinc(rs_repnotrecovering_latency
, dur
);
6556 /*---RepWaitRecoveryReserved--*/
6557 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx
)
6559 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitRecoveryReserved")
6561 context
< RecoveryMachine
>().log_enter(state_name
);
6562 PG
*pg
= context
< RecoveryMachine
>().pg
;
6564 pg
->osd
->remote_reserver
.request_reservation(
6566 new QueuePeeringEvt
<RemoteRecoveryReserved
>(
6567 pg
, pg
->get_osdmap()->get_epoch(),
6568 RemoteRecoveryReserved()),
6569 pg
->get_recovery_priority());
6572 boost::statechart::result
6573 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved
&evt
)
6575 PG
*pg
= context
< RecoveryMachine
>().pg
;
6576 pg
->osd
->send_message_osd_cluster(
6578 new MRecoveryReserve(
6579 MRecoveryReserve::GRANT
,
6580 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
6581 pg
->get_osdmap()->get_epoch()),
6582 pg
->get_osdmap()->get_epoch());
6583 return transit
<RepRecovering
>();
6586 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6588 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6589 PG
*pg
= context
< RecoveryMachine
>().pg
;
6590 utime_t dur
= ceph_clock_now() - enter_time
;
6591 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitrecoveryreserved_latency
, dur
);
6594 /*-RepWaitBackfillReserved*/
6595 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx
)
6597 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitBackfillReserved")
6599 context
< RecoveryMachine
>().log_enter(state_name
);
6602 boost::statechart::result
6603 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio
&evt
)
6605 PG
*pg
= context
< RecoveryMachine
>().pg
;
6608 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
6609 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
6610 ldout(pg
->cct
, 10) << "backfill reservation rejected: failure injection"
6612 post_event(RemoteReservationRejected());
6613 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
6614 pg
->osd
->check_backfill_full(ss
)) {
6615 ldout(pg
->cct
, 10) << "backfill reservation rejected: "
6616 << ss
.str() << dendl
;
6617 post_event(RemoteReservationRejected());
6619 pg
->osd
->remote_reserver
.request_reservation(
6621 new QueuePeeringEvt
<RemoteBackfillReserved
>(
6622 pg
, pg
->get_osdmap()->get_epoch(),
6623 RemoteBackfillReserved()), evt
.priority
);
6625 return transit
<RepWaitBackfillReserved
>();
6628 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6630 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6631 PG
*pg
= context
< RecoveryMachine
>().pg
;
6632 utime_t dur
= ceph_clock_now() - enter_time
;
6633 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitbackfillreserved_latency
, dur
);
6636 boost::statechart::result
6637 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved
&evt
)
6639 PG
*pg
= context
< RecoveryMachine
>().pg
;
6642 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
6643 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
6644 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
6645 << "failure injection" << dendl
;
6646 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
6647 post_event(RemoteReservationRejected());
6648 return discard_event();
6649 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
6650 pg
->osd
->check_backfill_full(ss
)) {
6651 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
6652 << ss
.str() << dendl
;
6653 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
6654 post_event(RemoteReservationRejected());
6655 return discard_event();
6657 pg
->osd
->send_message_osd_cluster(
6659 new MBackfillReserve(
6660 MBackfillReserve::GRANT
,
6661 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
6662 pg
->get_osdmap()->get_epoch()),
6663 pg
->get_osdmap()->get_epoch());
6664 return transit
<RepRecovering
>();
6668 boost::statechart::result
6669 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected
&evt
)
6671 PG
*pg
= context
< RecoveryMachine
>().pg
;
6672 pg
->reject_reservation();
6673 return transit
<RepNotRecovering
>();
6676 /*---RepRecovering-------*/
6677 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx
)
6679 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepRecovering")
6681 context
< RecoveryMachine
>().log_enter(state_name
);
6684 boost::statechart::result
6685 PG::RecoveryState::RepRecovering::react(const BackfillTooFull
&)
6687 PG
*pg
= context
< RecoveryMachine
>().pg
;
6688 pg
->reject_reservation();
6689 return discard_event();
6692 void PG::RecoveryState::RepRecovering::exit()
6694 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6695 PG
*pg
= context
< RecoveryMachine
>().pg
;
6696 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
6697 utime_t dur
= ceph_clock_now() - enter_time
;
6698 pg
->osd
->recoverystate_perf
->tinc(rs_reprecovering_latency
, dur
);
6701 /*------Activating--------*/
6702 PG::RecoveryState::Activating::Activating(my_context ctx
)
6704 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Activating")
6706 context
< RecoveryMachine
>().log_enter(state_name
);
6709 void PG::RecoveryState::Activating::exit()
6711 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6712 PG
*pg
= context
< RecoveryMachine
>().pg
;
6713 utime_t dur
= ceph_clock_now() - enter_time
;
6714 pg
->osd
->recoverystate_perf
->tinc(rs_activating_latency
, dur
);
6717 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx
)
6719 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalRecoveryReserved")
6721 context
< RecoveryMachine
>().log_enter(state_name
);
6722 PG
*pg
= context
< RecoveryMachine
>().pg
;
6724 // Make sure all nodes that part of the recovery aren't full
6725 if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_recovery
&&
6726 pg
->osd
->check_osdmap_full(pg
->actingbackfill
)) {
6727 post_event(RecoveryTooFull());
6731 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
6732 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
6733 pg
->osd
->local_reserver
.request_reservation(
6735 new QueuePeeringEvt
<LocalRecoveryReserved
>(
6736 pg
, pg
->get_osdmap()->get_epoch(),
6737 LocalRecoveryReserved()),
6738 pg
->get_recovery_priority());
6739 pg
->publish_stats_to_osd();
6742 boost::statechart::result
6743 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull
&evt
)
6745 PG
*pg
= context
< RecoveryMachine
>().pg
;
6746 pg
->state_set(PG_STATE_RECOVERY_TOOFULL
);
6747 pg
->schedule_recovery_full_retry();
6748 return transit
<NotRecovering
>();
6751 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6753 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6754 PG
*pg
= context
< RecoveryMachine
>().pg
;
6755 utime_t dur
= ceph_clock_now() - enter_time
;
6756 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalrecoveryreserved_latency
, dur
);
6759 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx
)
6761 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6762 remote_recovery_reservation_it(context
< Active
>().remote_shards_to_reserve_recovery
.begin())
6764 context
< RecoveryMachine
>().log_enter(state_name
);
6765 post_event(RemoteRecoveryReserved());
6768 boost::statechart::result
6769 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved
&evt
) {
6770 PG
*pg
= context
< RecoveryMachine
>().pg
;
6772 if (remote_recovery_reservation_it
!= context
< Active
>().remote_shards_to_reserve_recovery
.end()) {
6773 assert(*remote_recovery_reservation_it
!= pg
->pg_whoami
);
6774 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6775 remote_recovery_reservation_it
->osd
, pg
->get_osdmap()->get_epoch());
6777 pg
->osd
->send_message_osd_cluster(
6778 new MRecoveryReserve(
6779 MRecoveryReserve::REQUEST
,
6780 spg_t(pg
->info
.pgid
.pgid
, remote_recovery_reservation_it
->shard
),
6781 pg
->get_osdmap()->get_epoch()),
6784 ++remote_recovery_reservation_it
;
6786 post_event(AllRemotesReserved());
6788 return discard_event();
6791 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6793 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6794 PG
*pg
= context
< RecoveryMachine
>().pg
;
6795 utime_t dur
= ceph_clock_now() - enter_time
;
6796 pg
->osd
->recoverystate_perf
->tinc(rs_waitremoterecoveryreserved_latency
, dur
);
6799 PG::RecoveryState::Recovering::Recovering(my_context ctx
)
6801 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovering")
6803 context
< RecoveryMachine
>().log_enter(state_name
);
6805 PG
*pg
= context
< RecoveryMachine
>().pg
;
6806 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
6807 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
6808 pg
->state_set(PG_STATE_RECOVERING
);
6809 pg
->publish_stats_to_osd();
6810 pg
->queue_recovery();
6813 void PG::RecoveryState::Recovering::release_reservations(bool cancel
)
6815 PG
*pg
= context
< RecoveryMachine
>().pg
;
6816 assert(cancel
|| !pg
->pg_log
.get_missing().have_missing());
6818 // release remote reservations
6819 for (set
<pg_shard_t
>::const_iterator i
=
6820 context
< Active
>().remote_shards_to_reserve_recovery
.begin();
6821 i
!= context
< Active
>().remote_shards_to_reserve_recovery
.end();
6823 if (*i
== pg
->pg_whoami
) // skip myself
6825 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6826 i
->osd
, pg
->get_osdmap()->get_epoch());
6828 pg
->osd
->send_message_osd_cluster(
6829 new MRecoveryReserve(
6830 MRecoveryReserve::RELEASE
,
6831 spg_t(pg
->info
.pgid
.pgid
, i
->shard
),
6832 pg
->get_osdmap()->get_epoch()),
6838 boost::statechart::result
6839 PG::RecoveryState::Recovering::react(const AllReplicasRecovered
&evt
)
6841 PG
*pg
= context
< RecoveryMachine
>().pg
;
6842 pg
->state_clear(PG_STATE_RECOVERING
);
6843 release_reservations();
6844 return transit
<Recovered
>();
6847 boost::statechart::result
6848 PG::RecoveryState::Recovering::react(const RequestBackfill
&evt
)
6850 PG
*pg
= context
< RecoveryMachine
>().pg
;
6851 pg
->state_clear(PG_STATE_RECOVERING
);
6852 release_reservations();
6853 return transit
<WaitRemoteBackfillReserved
>();
6856 boost::statechart::result
6857 PG::RecoveryState::Recovering::react(const CancelRecovery
&evt
)
6859 PG
*pg
= context
< RecoveryMachine
>().pg
;
6860 pg
->state_clear(PG_STATE_RECOVERING
);
6861 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6862 release_reservations(true);
6863 pg
->schedule_recovery_full_retry();
6864 return transit
<NotRecovering
>();
6867 void PG::RecoveryState::Recovering::exit()
6869 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6870 PG
*pg
= context
< RecoveryMachine
>().pg
;
6871 utime_t dur
= ceph_clock_now() - enter_time
;
6872 pg
->osd
->recoverystate_perf
->tinc(rs_recovering_latency
, dur
);
// Recovered state entry: recovery is complete on all shards.  Cancels
// the local reservation, re-evaluates DEGRADED, possibly re-chooses the
// acting set, and posts GoClean once all replicas have activated.
// NOTE(review): this block was mangled by extraction -- scaffolding
// lines (initializer list, braces, and the statement(s) following the
// "trim pglog on recovered" comment) are missing from this view, so
// the annotations below cover only the visible logic.
6875 PG::RecoveryState::Recovered::Recovered(my_context ctx
)
6877 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovered")
6879 pg_shard_t auth_log_shard
;
6881 context
< RecoveryMachine
>().log_enter(state_name
);
6883 PG
*pg
= context
< RecoveryMachine
>().pg
;
// We no longer need the local recovery reservation slot.
6884 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
// Sanity: reaching Recovered implies nothing is left to recover.
6886 assert(!pg
->needs_recovery());
6888 // if we finished backfill, all acting are active; recheck if
6889 // DEGRADED | UNDERSIZED is appropriate.
6890 assert(!pg
->actingbackfill
.empty());
6891 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <=
6892 pg
->actingbackfill
.size()) {
6893 pg
->state_clear(PG_STATE_DEGRADED
);
6894 pg
->publish_stats_to_osd();
6897 // trim pglog on recovered
// NOTE(review): the statement(s) implementing the log trim are not
// visible in this extraction -- presumably a pg log trim call; confirm
// against the upstream file.
6900 // adjust acting set? (e.g. because backfill completed...)
6901 bool history_les_bound
= false;
// If choose_acting fails here it must be because it wants a different
// acting set (want_acting non-empty), not a hard error.
6902 if (pg
->acting
!= pg
->up
&& !pg
->choose_acting(auth_log_shard
,
6903 true, &history_les_bound
))
6904 assert(pg
->want_acting
.size());
// Only go clean once every replica has activated and committed.
6906 if (context
< Active
>().all_replicas_activated
)
6907 post_event(GoClean());
6910 void PG::RecoveryState::Recovered::exit()
6912 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6913 PG
*pg
= context
< RecoveryMachine
>().pg
;
6914 utime_t dur
= ceph_clock_now() - enter_time
;
6915 pg
->osd
->recoverystate_perf
->tinc(rs_recovered_latency
, dur
);
// Clean state entry: the PG is fully recovered and consistent; finish
// recovery bookkeeping, share/publish info, and requeue held ops.
// NOTE(review): extraction dropped the bodies of both conditionals
// below (the branch taken when last_complete != last_update, and the
// statement(s) guarded by is_active()); annotations cover only what is
// visible -- confirm the missing branches against the upstream file.
6918 PG::RecoveryState::Clean::Clean(my_context ctx
)
6920 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Clean")
6922 context
< RecoveryMachine
>().log_enter(state_name
);
6924 PG
*pg
= context
< RecoveryMachine
>().pg
;
// A clean PG must have last_complete == last_update; the (elided)
// branch body handles the violation.
6926 if (pg
->info
.last_complete
!= pg
->info
.last_update
) {
// Flush recovery completion callbacks once the transaction is safe.
6929 pg
->finish_recovery(*context
< RecoveryMachine
>().get_on_safe_context_list());
6931 if (pg
->is_active()) {
// Let peers and the mon know we are clean, then release any ops that
// were waiting for the PG to become clean.
6935 pg
->share_pg_info();
6936 pg
->publish_stats_to_osd();
6937 pg
->requeue_ops(pg
->waiting_for_clean_to_primary_repair
);
6940 void PG::RecoveryState::Clean::exit()
6942 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6943 PG
*pg
= context
< RecoveryMachine
>().pg
;
6944 pg
->state_clear(PG_STATE_CLEAN
);
6945 utime_t dur
= ceph_clock_now() - enter_time
;
6946 pg
->osd
->recoverystate_perf
->tinc(rs_clean_latency
, dur
);
6949 template <typename T
>
6950 set
<pg_shard_t
> unique_osd_shard_set(const pg_shard_t
& skip
, const T
&in
)
6952 set
<int> osds_found
;
6953 set
<pg_shard_t
> out
;
6954 for (typename
T::const_iterator i
= in
.begin();
6957 if (*i
!= skip
&& !osds_found
.count(i
->osd
)) {
6958 osds_found
.insert(i
->osd
);
6965 /*---------Active---------*/
6966 PG::RecoveryState::Active::Active(my_context ctx
)
6968 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active"),
6969 remote_shards_to_reserve_recovery(
6970 unique_osd_shard_set(
6971 context
< RecoveryMachine
>().pg
->pg_whoami
,
6972 context
< RecoveryMachine
>().pg
->actingbackfill
)),
6973 remote_shards_to_reserve_backfill(
6974 unique_osd_shard_set(
6975 context
< RecoveryMachine
>().pg
->pg_whoami
,
6976 context
< RecoveryMachine
>().pg
->backfill_targets
)),
6977 all_replicas_activated(false)
6979 context
< RecoveryMachine
>().log_enter(state_name
);
6981 PG
*pg
= context
< RecoveryMachine
>().pg
;
6983 assert(!pg
->backfill_reserving
);
6984 assert(!pg
->backfill_reserved
);
6985 assert(pg
->is_primary());
6986 ldout(pg
->cct
, 10) << "In Active, about to call activate" << dendl
;
6988 context
< RecoveryMachine
>().get_cur_transaction(),
6989 context
< RecoveryMachine
>().get_on_applied_context_list(),
6990 context
< RecoveryMachine
>().get_on_safe_context_list());
6991 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
6992 pg
->get_osdmap()->get_epoch(),
6993 *context
< RecoveryMachine
>().get_on_safe_context_list(),
6994 *context
< RecoveryMachine
>().get_query_map(),
6995 context
< RecoveryMachine
>().get_info_map(),
6996 context
< RecoveryMachine
>().get_recovery_ctx());
6998 // everyone has to commit/ack before we are truly active
6999 pg
->blocked_by
.clear();
7000 for (set
<pg_shard_t
>::iterator p
= pg
->actingbackfill
.begin();
7001 p
!= pg
->actingbackfill
.end();
7003 if (p
->shard
!= pg
->pg_whoami
.shard
) {
7004 pg
->blocked_by
.insert(p
->shard
);
7007 pg
->publish_stats_to_osd();
7008 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
7011 boost::statechart::result
PG::RecoveryState::Active::react(const AdvMap
& advmap
)
7013 PG
*pg
= context
< RecoveryMachine
>().pg
;
7014 ldout(pg
->cct
, 10) << "Active advmap" << dendl
;
7015 if (!pg
->pool
.newly_removed_snaps
.empty()) {
7016 pg
->snap_trimq
.union_of(pg
->pool
.newly_removed_snaps
);
7017 ldout(pg
->cct
, 10) << *pg
<< " snap_trimq now " << pg
->snap_trimq
<< dendl
;
7018 pg
->dirty_info
= true;
7019 pg
->dirty_big_info
= true;
7022 for (size_t i
= 0; i
< pg
->want_acting
.size(); i
++) {
7023 int osd
= pg
->want_acting
[i
];
7024 if (!advmap
.osdmap
->is_up(osd
)) {
7025 pg_shard_t
osd_with_shard(osd
, shard_id_t(i
));
7026 assert(pg
->is_acting(osd_with_shard
) || pg
->is_up(osd_with_shard
));
7030 bool need_publish
= false;
7031 /* Check for changes in pool size (if the acting set changed as a result,
7032 * this does not matter) */
7033 if (advmap
.lastmap
->get_pg_size(pg
->info
.pgid
.pgid
) !=
7034 pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
)) {
7035 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <= pg
->actingset
.size()) {
7036 pg
->state_clear(PG_STATE_UNDERSIZED
);
7037 if (pg
->needs_recovery()) {
7038 pg
->state_set(PG_STATE_DEGRADED
);
7040 pg
->state_clear(PG_STATE_DEGRADED
);
7043 pg
->state_set(PG_STATE_UNDERSIZED
);
7044 pg
->state_set(PG_STATE_DEGRADED
);
7046 need_publish
= true; // degraded may have changed
7049 // if we haven't reported our PG stats in a long time, do so now.
7050 if (pg
->info
.stats
.reported_epoch
+ pg
->cct
->_conf
->osd_pg_stat_report_interval_max
< advmap
.osdmap
->get_epoch()) {
7051 ldout(pg
->cct
, 20) << "reporting stats to osd after " << (advmap
.osdmap
->get_epoch() - pg
->info
.stats
.reported_epoch
)
7052 << " epochs" << dendl
;
7053 need_publish
= true;
7057 pg
->publish_stats_to_osd();
7059 return forward_event();
7062 boost::statechart::result
PG::RecoveryState::Active::react(const ActMap
&)
7064 PG
*pg
= context
< RecoveryMachine
>().pg
;
7065 ldout(pg
->cct
, 10) << "Active: handling ActMap" << dendl
;
7066 assert(pg
->is_primary());
7068 if (pg
->have_unfound()) {
7069 // object may have become unfound
7070 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7073 if (pg
->cct
->_conf
->osd_check_for_log_corruption
)
7074 pg
->check_log_for_corruption(pg
->osd
->store
);
7076 uint64_t unfound
= pg
->missing_loc
.num_unfound();
7078 pg
->all_unfound_are_queried_or_lost(pg
->get_osdmap())) {
7079 if (pg
->cct
->_conf
->osd_auto_mark_unfound_lost
) {
7080 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has " << unfound
7081 << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED";
7083 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has " << unfound
<< " objects unfound and apparently lost";
7086 if (pg
->is_active()) {
7087 ldout(pg
->cct
, 10) << "Active: kicking snap trim" << dendl
;
7088 pg
->kick_snap_trim();
7091 if (pg
->is_peered() &&
7093 !pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
) &&
7094 (!pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) || pg
->is_degraded())) {
7095 pg
->queue_recovery();
7097 return forward_event();
7100 boost::statechart::result
PG::RecoveryState::Active::react(const MNotifyRec
& notevt
)
7102 PG
*pg
= context
< RecoveryMachine
>().pg
;
7103 assert(pg
->is_primary());
7104 if (pg
->peer_info
.count(notevt
.from
)) {
7105 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7106 << ", already have info from that osd, ignoring"
7108 } else if (pg
->peer_purged
.count(notevt
.from
)) {
7109 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7110 << ", already purged that peer, ignoring"
7113 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7114 << ", calling proc_replica_info and discover_all_missing"
7116 pg
->proc_replica_info(
7117 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
7118 if (pg
->have_unfound()) {
7119 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7122 return discard_event();
7125 boost::statechart::result
PG::RecoveryState::Active::react(const MInfoRec
& infoevt
)
7127 PG
*pg
= context
< RecoveryMachine
>().pg
;
7128 assert(pg
->is_primary());
7130 assert(!pg
->actingbackfill
.empty());
7131 // don't update history (yet) if we are active and primary; the replica
7132 // may be telling us they have activated (and committed) but we can't
7133 // share that until _everyone_ does the same.
7134 if (pg
->is_actingbackfill(infoevt
.from
)) {
7135 ldout(pg
->cct
, 10) << " peer osd." << infoevt
.from
7136 << " activated and committed" << dendl
;
7137 pg
->peer_activated
.insert(infoevt
.from
);
7138 pg
->blocked_by
.erase(infoevt
.from
.shard
);
7139 pg
->publish_stats_to_osd();
7140 if (pg
->peer_activated
.size() == pg
->actingbackfill
.size()) {
7141 pg
->all_activated_and_committed();
7144 return discard_event();
7147 boost::statechart::result
PG::RecoveryState::Active::react(const MLogRec
& logevt
)
7149 PG
*pg
= context
< RecoveryMachine
>().pg
;
7150 ldout(pg
->cct
, 10) << "searching osd." << logevt
.from
7151 << " log for unfound items" << dendl
;
7152 pg
->proc_replica_log(
7153 logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
7154 bool got_missing
= pg
->search_for_missing(
7155 pg
->peer_info
[logevt
.from
],
7156 pg
->peer_missing
[logevt
.from
],
7158 context
< RecoveryMachine
>().get_recovery_ctx());
7159 if (pg
->is_peered() &&
7161 pg
->queue_recovery();
7162 return discard_event();
7165 boost::statechart::result
PG::RecoveryState::Active::react(const QueryState
& q
)
7167 PG
*pg
= context
< RecoveryMachine
>().pg
;
7169 q
.f
->open_object_section("state");
7170 q
.f
->dump_string("name", state_name
);
7171 q
.f
->dump_stream("enter_time") << enter_time
;
7174 q
.f
->open_array_section("might_have_unfound");
7175 for (set
<pg_shard_t
>::iterator p
= pg
->might_have_unfound
.begin();
7176 p
!= pg
->might_have_unfound
.end();
7178 q
.f
->open_object_section("osd");
7179 q
.f
->dump_stream("osd") << *p
;
7180 if (pg
->peer_missing
.count(*p
)) {
7181 q
.f
->dump_string("status", "already probed");
7182 } else if (pg
->peer_missing_requested
.count(*p
)) {
7183 q
.f
->dump_string("status", "querying");
7184 } else if (!pg
->get_osdmap()->is_up(p
->osd
)) {
7185 q
.f
->dump_string("status", "osd is down");
7187 q
.f
->dump_string("status", "not queried");
7189 q
.f
->close_section();
7191 q
.f
->close_section();
7194 q
.f
->open_object_section("recovery_progress");
7195 pg
->dump_recovery_info(q
.f
);
7196 q
.f
->close_section();
7200 q
.f
->open_object_section("scrub");
7201 q
.f
->dump_stream("scrubber.epoch_start") << pg
->scrubber
.epoch_start
;
7202 q
.f
->dump_bool("scrubber.active", pg
->scrubber
.active
);
7203 q
.f
->dump_string("scrubber.state", Scrubber::state_string(pg
->scrubber
.state
));
7204 q
.f
->dump_stream("scrubber.start") << pg
->scrubber
.start
;
7205 q
.f
->dump_stream("scrubber.end") << pg
->scrubber
.end
;
7206 q
.f
->dump_stream("scrubber.subset_last_update") << pg
->scrubber
.subset_last_update
;
7207 q
.f
->dump_bool("scrubber.deep", pg
->scrubber
.deep
);
7208 q
.f
->dump_unsigned("scrubber.seed", pg
->scrubber
.seed
);
7209 q
.f
->dump_int("scrubber.waiting_on", pg
->scrubber
.waiting_on
);
7211 q
.f
->open_array_section("scrubber.waiting_on_whom");
7212 for (set
<pg_shard_t
>::iterator p
= pg
->scrubber
.waiting_on_whom
.begin();
7213 p
!= pg
->scrubber
.waiting_on_whom
.end();
7215 q
.f
->dump_stream("shard") << *p
;
7217 q
.f
->close_section();
7219 q
.f
->close_section();
7222 q
.f
->close_section();
7223 return forward_event();
// All actingbackfill peers have activated and committed: the PG can be
// declared ACTIVE (or merely PEERED if the acting set is below the
// pool's min_size), history epochs are advanced, and waiting ops are
// requeued.
// NOTE(review): extraction dropped several lines of this handler
// (original lines ~7246-7249 and ~7252-7255); annotations below cover
// only the visible statements -- confirm the elided ones upstream.
7226 boost::statechart::result
PG::RecoveryState::Active::react(const AllReplicasActivated
&evt
)
7228 PG
*pg
= context
< RecoveryMachine
>().pg
;
7229 all_replicas_activated
= true;
7231 pg
->state_clear(PG_STATE_ACTIVATING
);
7232 pg
->state_clear(PG_STATE_CREATING
);
// ACTIVE only when we have at least min_size acting shards; otherwise
// the PG is peered but cannot serve I/O.
7233 if (pg
->acting
.size() >= pg
->pool
.info
.min_size
) {
7234 pg
->state_set(PG_STATE_ACTIVE
);
7236 pg
->state_set(PG_STATE_PEERED
);
7239 // info.last_epoch_started is set during activate()
7240 pg
->info
.history
.last_epoch_started
= pg
->info
.last_epoch_started
;
7241 pg
->info
.history
.last_interval_started
= pg
->info
.last_interval_started
;
7242 pg
->dirty_info
= true;
// Propagate the updated info/history to peers and the monitor.
7244 pg
->share_pg_info();
7245 pg
->publish_stats_to_osd();
// Requeue ops held for peering, but only once no flushes are pending.
7250 if (pg
->flushes_in_progress
== 0) {
7251 pg
->requeue_ops(pg
->waiting_for_peered
);
7256 return discard_event();
7259 void PG::RecoveryState::Active::exit()
7261 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7262 PG
*pg
= context
< RecoveryMachine
>().pg
;
7263 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7265 pg
->blocked_by
.clear();
7266 pg
->backfill_reserved
= false;
7267 pg
->backfill_reserving
= false;
7268 pg
->state_clear(PG_STATE_ACTIVATING
);
7269 pg
->state_clear(PG_STATE_DEGRADED
);
7270 pg
->state_clear(PG_STATE_UNDERSIZED
);
7271 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
7272 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
7273 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
7274 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
7275 utime_t dur
= ceph_clock_now() - enter_time
;
7276 pg
->osd
->recoverystate_perf
->tinc(rs_active_latency
, dur
);
7280 /*------ReplicaActive-----*/
7281 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx
)
7283 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive")
7285 context
< RecoveryMachine
>().log_enter(state_name
);
7287 PG
*pg
= context
< RecoveryMachine
>().pg
;
7289 context
< RecoveryMachine
>().get_cur_transaction(),
7290 context
< RecoveryMachine
>().get_on_applied_context_list(),
7291 context
< RecoveryMachine
>().get_on_safe_context_list());
7295 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(
7296 const Activate
& actevt
) {
7297 PG
*pg
= context
< RecoveryMachine
>().pg
;
7298 ldout(pg
->cct
, 10) << "In ReplicaActive, about to call activate" << dendl
;
7299 map
<int, map
<spg_t
, pg_query_t
> > query_map
;
7300 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
7301 actevt
.activation_epoch
,
7302 *context
< RecoveryMachine
>().get_on_safe_context_list(),
7303 query_map
, NULL
, NULL
);
7304 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
7305 return discard_event();
7308 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MInfoRec
& infoevt
)
7310 PG
*pg
= context
< RecoveryMachine
>().pg
;
7311 pg
->proc_primary_info(*context
<RecoveryMachine
>().get_cur_transaction(),
7313 return discard_event();
7316 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MLogRec
& logevt
)
7318 PG
*pg
= context
< RecoveryMachine
>().pg
;
7319 ldout(pg
->cct
, 10) << "received log from " << logevt
.from
<< dendl
;
7320 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
7321 pg
->merge_log(*t
, logevt
.msg
->info
, logevt
.msg
->log
, logevt
.from
);
7322 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
7324 return discard_event();
7327 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const ActMap
&)
7329 PG
*pg
= context
< RecoveryMachine
>().pg
;
7330 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
7331 context
< RecoveryMachine
>().send_notify(
7334 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
7335 pg
->get_osdmap()->get_epoch(),
7336 pg
->get_osdmap()->get_epoch(),
7338 pg
->past_intervals
);
7341 return discard_event();
7344 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MQuery
& query
)
7346 PG
*pg
= context
< RecoveryMachine
>().pg
;
7347 if (query
.query
.type
== pg_query_t::MISSING
) {
7348 pg
->update_history(query
.query
.history
);
7349 pg
->fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
7350 } // else: from prior to activation, safe to ignore
7351 return discard_event();
7354 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const QueryState
& q
)
7356 q
.f
->open_object_section("state");
7357 q
.f
->dump_string("name", state_name
);
7358 q
.f
->dump_stream("enter_time") << enter_time
;
7359 q
.f
->close_section();
7360 return forward_event();
7363 void PG::RecoveryState::ReplicaActive::exit()
7365 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7366 PG
*pg
= context
< RecoveryMachine
>().pg
;
7367 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7368 utime_t dur
= ceph_clock_now() - enter_time
;
7369 pg
->osd
->recoverystate_perf
->tinc(rs_replicaactive_latency
, dur
);
7373 PG::RecoveryState::Stray::Stray(my_context ctx
)
7375 NamedState(context
< RecoveryMachine
>().pg
, "Started/Stray")
7377 context
< RecoveryMachine
>().log_enter(state_name
);
7379 PG
*pg
= context
< RecoveryMachine
>().pg
;
7380 assert(!pg
->is_peered());
7381 assert(!pg
->is_peering());
7382 assert(!pg
->is_primary());
7384 context
< RecoveryMachine
>().get_cur_transaction(),
7385 context
< RecoveryMachine
>().get_on_applied_context_list(),
7386 context
< RecoveryMachine
>().get_on_safe_context_list());
7389 boost::statechart::result
PG::RecoveryState::Stray::react(const MLogRec
& logevt
)
7391 PG
*pg
= context
< RecoveryMachine
>().pg
;
7392 MOSDPGLog
*msg
= logevt
.msg
.get();
7393 ldout(pg
->cct
, 10) << "got info+log from osd." << logevt
.from
<< " " << msg
->info
<< " " << msg
->log
<< dendl
;
7395 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
7396 if (msg
->info
.last_backfill
== hobject_t()) {
7398 pg
->unreg_next_scrub();
7399 pg
->info
= msg
->info
;
7400 pg
->reg_next_scrub();
7401 pg
->dirty_info
= true;
7402 pg
->dirty_big_info
= true; // maybe.
7404 PGLogEntryHandler rollbacker
{pg
, t
};
7405 pg
->pg_log
.reset_backfill_claim_log(msg
->log
, &rollbacker
);
7407 pg
->pg_log
.reset_backfill();
7409 pg
->merge_log(*t
, msg
->info
, msg
->log
, logevt
.from
);
7412 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
7414 post_event(Activate(logevt
.msg
->info
.last_epoch_started
));
7415 return transit
<ReplicaActive
>();
7418 boost::statechart::result
PG::RecoveryState::Stray::react(const MInfoRec
& infoevt
)
7420 PG
*pg
= context
< RecoveryMachine
>().pg
;
7421 ldout(pg
->cct
, 10) << "got info from osd." << infoevt
.from
<< " " << infoevt
.info
<< dendl
;
7423 if (pg
->info
.last_update
> infoevt
.info
.last_update
) {
7424 // rewind divergent log entries
7425 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
7426 pg
->rewind_divergent_log(*t
, infoevt
.info
.last_update
);
7427 pg
->info
.stats
= infoevt
.info
.stats
;
7428 pg
->info
.hit_set
= infoevt
.info
.hit_set
;
7431 assert(infoevt
.info
.last_update
== pg
->info
.last_update
);
7432 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
7434 post_event(Activate(infoevt
.info
.last_epoch_started
));
7435 return transit
<ReplicaActive
>();
7438 boost::statechart::result
PG::RecoveryState::Stray::react(const MQuery
& query
)
7440 PG
*pg
= context
< RecoveryMachine
>().pg
;
7441 if (query
.query
.type
== pg_query_t::INFO
) {
7442 pair
<pg_shard_t
, pg_info_t
> notify_info
;
7443 pg
->update_history(query
.query
.history
);
7444 pg
->fulfill_info(query
.from
, query
.query
, notify_info
);
7445 context
< RecoveryMachine
>().send_notify(
7448 notify_info
.first
.shard
, pg
->pg_whoami
.shard
,
7450 pg
->get_osdmap()->get_epoch(),
7451 notify_info
.second
),
7452 pg
->past_intervals
);
7454 pg
->fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
7456 return discard_event();
7459 boost::statechart::result
PG::RecoveryState::Stray::react(const ActMap
&)
7461 PG
*pg
= context
< RecoveryMachine
>().pg
;
7462 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
7463 context
< RecoveryMachine
>().send_notify(
7466 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
7467 pg
->get_osdmap()->get_epoch(),
7468 pg
->get_osdmap()->get_epoch(),
7470 pg
->past_intervals
);
7473 return discard_event();
7476 void PG::RecoveryState::Stray::exit()
7478 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7479 PG
*pg
= context
< RecoveryMachine
>().pg
;
7480 utime_t dur
= ceph_clock_now() - enter_time
;
7481 pg
->osd
->recoverystate_perf
->tinc(rs_stray_latency
, dur
);
7484 /*--------GetInfo---------*/
7485 PG::RecoveryState::GetInfo::GetInfo(my_context ctx
)
7487 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetInfo")
7489 context
< RecoveryMachine
>().log_enter(state_name
);
7491 PG
*pg
= context
< RecoveryMachine
>().pg
;
7492 pg
->check_past_interval_bounds();
7493 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7495 assert(pg
->blocked_by
.empty());
7497 prior_set
= pg
->build_prior();
7499 pg
->reset_min_peer_features();
7501 if (prior_set
.pg_down
) {
7502 post_event(IsDown());
7503 } else if (peer_info_requested
.empty()) {
7504 post_event(GotInfo());
7508 void PG::RecoveryState::GetInfo::get_infos()
7510 PG
*pg
= context
< RecoveryMachine
>().pg
;
7511 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7513 pg
->blocked_by
.clear();
7514 for (set
<pg_shard_t
>::const_iterator it
= prior_set
.probe
.begin();
7515 it
!= prior_set
.probe
.end();
7517 pg_shard_t peer
= *it
;
7518 if (peer
== pg
->pg_whoami
) {
7521 if (pg
->peer_info
.count(peer
)) {
7522 ldout(pg
->cct
, 10) << " have osd." << peer
<< " info " << pg
->peer_info
[peer
] << dendl
;
7525 if (peer_info_requested
.count(peer
)) {
7526 ldout(pg
->cct
, 10) << " already requested info from osd." << peer
<< dendl
;
7527 pg
->blocked_by
.insert(peer
.osd
);
7528 } else if (!pg
->get_osdmap()->is_up(peer
.osd
)) {
7529 ldout(pg
->cct
, 10) << " not querying info from down osd." << peer
<< dendl
;
7531 ldout(pg
->cct
, 10) << " querying info from osd." << peer
<< dendl
;
7532 context
< RecoveryMachine
>().send_query(
7533 peer
, pg_query_t(pg_query_t::INFO
,
7534 it
->shard
, pg
->pg_whoami
.shard
,
7536 pg
->get_osdmap()->get_epoch()));
7537 peer_info_requested
.insert(peer
);
7538 pg
->blocked_by
.insert(peer
.osd
);
7542 pg
->publish_stats_to_osd();
7545 boost::statechart::result
PG::RecoveryState::GetInfo::react(const MNotifyRec
& infoevt
)
7547 PG
*pg
= context
< RecoveryMachine
>().pg
;
7549 set
<pg_shard_t
>::iterator p
= peer_info_requested
.find(infoevt
.from
);
7550 if (p
!= peer_info_requested
.end()) {
7551 peer_info_requested
.erase(p
);
7552 pg
->blocked_by
.erase(infoevt
.from
.osd
);
7555 epoch_t old_start
= pg
->info
.history
.last_epoch_started
;
7556 if (pg
->proc_replica_info(
7557 infoevt
.from
, infoevt
.notify
.info
, infoevt
.notify
.epoch_sent
)) {
7558 // we got something new ...
7559 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7560 if (old_start
< pg
->info
.history
.last_epoch_started
) {
7561 ldout(pg
->cct
, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl
;
7562 prior_set
= pg
->build_prior();
7564 // filter out any osds that got dropped from the probe set from
7565 // peer_info_requested. this is less expensive than restarting
7566 // peering (which would re-probe everyone).
7567 set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
7568 while (p
!= peer_info_requested
.end()) {
7569 if (prior_set
.probe
.count(*p
) == 0) {
7570 ldout(pg
->cct
, 20) << " dropping osd." << *p
<< " from info_requested, no longer in probe set" << dendl
;
7571 peer_info_requested
.erase(p
++);
7578 ldout(pg
->cct
, 20) << "Adding osd: " << infoevt
.from
.osd
<< " peer features: "
7579 << hex
<< infoevt
.features
<< dec
<< dendl
;
7580 pg
->apply_peer_features(infoevt
.features
);
7582 // are we done getting everything?
7583 if (peer_info_requested
.empty() && !prior_set
.pg_down
) {
7584 ldout(pg
->cct
, 20) << "Common peer features: " << hex
<< pg
->get_min_peer_features() << dec
<< dendl
;
7585 ldout(pg
->cct
, 20) << "Common acting features: " << hex
<< pg
->get_min_acting_features() << dec
<< dendl
;
7586 ldout(pg
->cct
, 20) << "Common upacting features: " << hex
<< pg
->get_min_upacting_features() << dec
<< dendl
;
7587 post_event(GotInfo());
7590 return discard_event();
7593 boost::statechart::result
PG::RecoveryState::GetInfo::react(const QueryState
& q
)
7595 PG
*pg
= context
< RecoveryMachine
>().pg
;
7596 q
.f
->open_object_section("state");
7597 q
.f
->dump_string("name", state_name
);
7598 q
.f
->dump_stream("enter_time") << enter_time
;
7600 q
.f
->open_array_section("requested_info_from");
7601 for (set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
7602 p
!= peer_info_requested
.end();
7604 q
.f
->open_object_section("osd");
7605 q
.f
->dump_stream("osd") << *p
;
7606 if (pg
->peer_info
.count(*p
)) {
7607 q
.f
->open_object_section("got_info");
7608 pg
->peer_info
[*p
].dump(q
.f
);
7609 q
.f
->close_section();
7611 q
.f
->close_section();
7613 q
.f
->close_section();
7615 q
.f
->close_section();
7616 return forward_event();
7619 void PG::RecoveryState::GetInfo::exit()
7621 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7622 PG
*pg
= context
< RecoveryMachine
>().pg
;
7623 utime_t dur
= ceph_clock_now() - enter_time
;
7624 pg
->osd
->recoverystate_perf
->tinc(rs_getinfo_latency
, dur
);
7625 pg
->blocked_by
.clear();
7626 pg
->publish_stats_to_osd();
7629 /*------GetLog------------*/
7630 PG::RecoveryState::GetLog::GetLog(my_context ctx
)
7633 context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetLog"),
7636 context
< RecoveryMachine
>().log_enter(state_name
);
7638 PG
*pg
= context
< RecoveryMachine
>().pg
;
7641 if (!pg
->choose_acting(auth_log_shard
, false,
7642 &context
< Peering
>().history_les_bound
)) {
7643 if (!pg
->want_acting
.empty()) {
7644 post_event(NeedActingChange());
7646 post_event(IsIncomplete());
7652 if (auth_log_shard
== pg
->pg_whoami
) {
7653 post_event(GotLog());
7657 const pg_info_t
& best
= pg
->peer_info
[auth_log_shard
];
7660 if (pg
->info
.last_update
< best
.log_tail
) {
7661 ldout(pg
->cct
, 10) << " not contiguous with osd." << auth_log_shard
<< ", down" << dendl
;
7662 post_event(IsIncomplete());
7666 // how much log to request?
7667 eversion_t request_log_from
= pg
->info
.last_update
;
7668 assert(!pg
->actingbackfill
.empty());
7669 for (set
<pg_shard_t
>::iterator p
= pg
->actingbackfill
.begin();
7670 p
!= pg
->actingbackfill
.end();
7672 if (*p
== pg
->pg_whoami
) continue;
7673 pg_info_t
& ri
= pg
->peer_info
[*p
];
7674 if (ri
.last_update
< pg
->info
.log_tail
&& ri
.last_update
>= best
.log_tail
&&
7675 ri
.last_update
< request_log_from
)
7676 request_log_from
= ri
.last_update
;
7680 ldout(pg
->cct
, 10) << " requesting log from osd." << auth_log_shard
<< dendl
;
7681 context
<RecoveryMachine
>().send_query(
7685 auth_log_shard
.shard
, pg
->pg_whoami
.shard
,
7686 request_log_from
, pg
->info
.history
,
7687 pg
->get_osdmap()->get_epoch()));
7689 assert(pg
->blocked_by
.empty());
7690 pg
->blocked_by
.insert(auth_log_shard
.osd
);
7691 pg
->publish_stats_to_osd();
7694 boost::statechart::result
PG::RecoveryState::GetLog::react(const AdvMap
& advmap
)
7696 PG
*pg
= context
< RecoveryMachine
>().pg
;
7697 // make sure our log source didn't go down. we need to check
7698 // explicitly because it may not be part of the prior set, which
7699 // means the Peering state check won't catch it going down.
7700 if (!advmap
.osdmap
->is_up(auth_log_shard
.osd
)) {
7701 ldout(pg
->cct
, 10) << "GetLog: auth_log_shard osd."
7702 << auth_log_shard
.osd
<< " went down" << dendl
;
7704 return transit
< Reset
>();
7707 // let the Peering state do its checks.
7708 return forward_event();
7711 boost::statechart::result
PG::RecoveryState::GetLog::react(const MLogRec
& logevt
)
7713 PG
*pg
= context
< RecoveryMachine
>().pg
;
7715 if (logevt
.from
!= auth_log_shard
) {
7716 ldout(pg
->cct
, 10) << "GetLog: discarding log from "
7717 << "non-auth_log_shard osd." << logevt
.from
<< dendl
;
7718 return discard_event();
7720 ldout(pg
->cct
, 10) << "GetLog: received master log from osd"
7721 << logevt
.from
<< dendl
;
7723 post_event(GotLog());
7724 return discard_event();
7727 boost::statechart::result
PG::RecoveryState::GetLog::react(const GotLog
&)
7729 PG
*pg
= context
< RecoveryMachine
>().pg
;
7730 ldout(pg
->cct
, 10) << "leaving GetLog" << dendl
;
7732 ldout(pg
->cct
, 10) << "processing master log" << dendl
;
7733 pg
->proc_master_log(*context
<RecoveryMachine
>().get_cur_transaction(),
7734 msg
->info
, msg
->log
, msg
->missing
,
7738 context
< RecoveryMachine
>().get_cur_transaction(),
7739 context
< RecoveryMachine
>().get_on_applied_context_list(),
7740 context
< RecoveryMachine
>().get_on_safe_context_list());
7741 return transit
< GetMissing
>();
7744 boost::statechart::result
PG::RecoveryState::GetLog::react(const QueryState
& q
)
7746 q
.f
->open_object_section("state");
7747 q
.f
->dump_string("name", state_name
);
7748 q
.f
->dump_stream("enter_time") << enter_time
;
7749 q
.f
->dump_stream("auth_log_shard") << auth_log_shard
;
7750 q
.f
->close_section();
7751 return forward_event();
7754 void PG::RecoveryState::GetLog::exit()
7756 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7757 PG
*pg
= context
< RecoveryMachine
>().pg
;
7758 utime_t dur
= ceph_clock_now() - enter_time
;
7759 pg
->osd
->recoverystate_perf
->tinc(rs_getlog_latency
, dur
);
7760 pg
->blocked_by
.clear();
7761 pg
->publish_stats_to_osd();
7764 /*------WaitActingChange--------*/
7765 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx
)
7767 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/WaitActingChange")
7769 context
< RecoveryMachine
>().log_enter(state_name
);
7772 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const AdvMap
& advmap
)
7774 PG
*pg
= context
< RecoveryMachine
>().pg
;
7775 OSDMapRef osdmap
= advmap
.osdmap
;
7777 ldout(pg
->cct
, 10) << "verifying no want_acting " << pg
->want_acting
<< " targets didn't go down" << dendl
;
7778 for (vector
<int>::iterator p
= pg
->want_acting
.begin(); p
!= pg
->want_acting
.end(); ++p
) {
7779 if (!osdmap
->is_up(*p
)) {
7780 ldout(pg
->cct
, 10) << " want_acting target osd." << *p
<< " went down, resetting" << dendl
;
7782 return transit
< Reset
>();
7785 return forward_event();
7788 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MLogRec
& logevt
)
7790 PG
*pg
= context
< RecoveryMachine
>().pg
;
7791 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MLocRec" << dendl
;
7792 return discard_event();
7795 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MInfoRec
& evt
)
7797 PG
*pg
= context
< RecoveryMachine
>().pg
;
7798 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl
;
7799 return discard_event();
7802 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MNotifyRec
& evt
)
7804 PG
*pg
= context
< RecoveryMachine
>().pg
;
7805 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl
;
7806 return discard_event();
7809 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const QueryState
& q
)
7811 q
.f
->open_object_section("state");
7812 q
.f
->dump_string("name", state_name
);
7813 q
.f
->dump_stream("enter_time") << enter_time
;
7814 q
.f
->dump_string("comment", "waiting for pg acting set to change");
7815 q
.f
->close_section();
7816 return forward_event();
7819 void PG::RecoveryState::WaitActingChange::exit()
7821 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7822 PG
*pg
= context
< RecoveryMachine
>().pg
;
7823 utime_t dur
= ceph_clock_now() - enter_time
;
7824 pg
->osd
->recoverystate_perf
->tinc(rs_waitactingchange_latency
, dur
);
7827 /*------Down--------*/
7828 PG::RecoveryState::Down::Down(my_context ctx
)
7830 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Down")
7832 context
< RecoveryMachine
>().log_enter(state_name
);
7833 PG
*pg
= context
< RecoveryMachine
>().pg
;
7835 pg
->state_clear(PG_STATE_PEERING
);
7836 pg
->state_set(PG_STATE_DOWN
);
7838 auto &prior_set
= context
< Peering
>().prior_set
;
7839 assert(pg
->blocked_by
.empty());
7840 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
7841 pg
->publish_stats_to_osd();
7844 void PG::RecoveryState::Down::exit()
7846 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7847 PG
*pg
= context
< RecoveryMachine
>().pg
;
7849 pg
->state_clear(PG_STATE_DOWN
);
7850 utime_t dur
= ceph_clock_now() - enter_time
;
7851 pg
->osd
->recoverystate_perf
->tinc(rs_down_latency
, dur
);
7853 pg
->blocked_by
.clear();
7854 pg
->publish_stats_to_osd();
7857 boost::statechart::result
PG::RecoveryState::Down::react(const QueryState
& q
)
7859 q
.f
->open_object_section("state");
7860 q
.f
->dump_string("name", state_name
);
7861 q
.f
->dump_stream("enter_time") << enter_time
;
7862 q
.f
->dump_string("comment",
7863 "not enough up instances of this PG to go active");
7864 q
.f
->close_section();
7865 return forward_event();
7868 /*------Incomplete--------*/
7869 PG::RecoveryState::Incomplete::Incomplete(my_context ctx
)
7871 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Incomplete")
7873 context
< RecoveryMachine
>().log_enter(state_name
);
7874 PG
*pg
= context
< RecoveryMachine
>().pg
;
7876 pg
->state_clear(PG_STATE_PEERING
);
7877 pg
->state_set(PG_STATE_INCOMPLETE
);
7879 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7880 assert(pg
->blocked_by
.empty());
7881 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
7882 pg
->publish_stats_to_osd();
7885 boost::statechart::result
PG::RecoveryState::Incomplete::react(const AdvMap
&advmap
) {
7886 PG
*pg
= context
< RecoveryMachine
>().pg
;
7887 int64_t poolnum
= pg
->info
.pgid
.pool();
7889 // Reset if min_size turn smaller than previous value, pg might now be able to go active
7890 if (advmap
.lastmap
->get_pools().find(poolnum
)->second
.min_size
>
7891 advmap
.osdmap
->get_pools().find(poolnum
)->second
.min_size
) {
7893 return transit
< Reset
>();
7896 return forward_event();
7899 boost::statechart::result
PG::RecoveryState::Incomplete::react(const MNotifyRec
& notevt
) {
7900 PG
*pg
= context
< RecoveryMachine
>().pg
;
7901 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
7902 if (pg
->proc_replica_info(
7903 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
)) {
7904 // We got something new, try again!
7905 return transit
< GetLog
>();
7907 return discard_event();
7911 boost::statechart::result
PG::RecoveryState::Incomplete::react(
7912 const QueryState
& q
)
7914 q
.f
->open_object_section("state");
7915 q
.f
->dump_string("name", state_name
);
7916 q
.f
->dump_stream("enter_time") << enter_time
;
7917 q
.f
->dump_string("comment", "not enough complete instances of this PG");
7918 q
.f
->close_section();
7919 return forward_event();
7922 void PG::RecoveryState::Incomplete::exit()
7924 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7925 PG
*pg
= context
< RecoveryMachine
>().pg
;
7927 pg
->state_clear(PG_STATE_INCOMPLETE
);
7928 utime_t dur
= ceph_clock_now() - enter_time
;
7929 pg
->osd
->recoverystate_perf
->tinc(rs_incomplete_latency
, dur
);
7931 pg
->blocked_by
.clear();
7932 pg
->publish_stats_to_osd();
7935 /*------GetMissing--------*/
7936 PG::RecoveryState::GetMissing::GetMissing(my_context ctx
)
7938 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetMissing")
7940 context
< RecoveryMachine
>().log_enter(state_name
);
7942 PG
*pg
= context
< RecoveryMachine
>().pg
;
7943 assert(!pg
->actingbackfill
.empty());
7945 for (set
<pg_shard_t
>::iterator i
= pg
->actingbackfill
.begin();
7946 i
!= pg
->actingbackfill
.end();
7948 if (*i
== pg
->get_primary()) continue;
7949 const pg_info_t
& pi
= pg
->peer_info
[*i
];
7952 continue; // no pg data, nothing divergent
7954 if (pi
.last_update
< pg
->pg_log
.get_tail()) {
7955 ldout(pg
->cct
, 10) << " osd." << *i
<< " is not contiguous, will restart backfill" << dendl
;
7956 pg
->peer_missing
[*i
];
7959 if (pi
.last_backfill
== hobject_t()) {
7960 ldout(pg
->cct
, 10) << " osd." << *i
<< " will fully backfill; can infer empty missing set" << dendl
;
7961 pg
->peer_missing
[*i
];
7965 if (pi
.last_update
== pi
.last_complete
&& // peer has no missing
7966 pi
.last_update
== pg
->info
.last_update
) { // peer is up to date
7967 // replica has no missing and identical log as us. no need to
7969 // FIXME: we can do better here. if last_update==last_complete we
7970 // can infer the rest!
7971 ldout(pg
->cct
, 10) << " osd." << *i
<< " has no missing, identical log" << dendl
;
7972 pg
->peer_missing
[*i
];
7976 // We pull the log from the peer's last_epoch_started to ensure we
7977 // get enough log to detect divergent updates.
7978 since
.epoch
= pi
.last_epoch_started
;
7979 assert(pi
.last_update
>= pg
->info
.log_tail
); // or else choose_acting() did a bad thing
7980 if (pi
.log_tail
<= since
) {
7981 ldout(pg
->cct
, 10) << " requesting log+missing since " << since
<< " from osd." << *i
<< dendl
;
7982 context
< RecoveryMachine
>().send_query(
7986 i
->shard
, pg
->pg_whoami
.shard
,
7987 since
, pg
->info
.history
,
7988 pg
->get_osdmap()->get_epoch()));
7990 ldout(pg
->cct
, 10) << " requesting fulllog+missing from osd." << *i
7991 << " (want since " << since
<< " < log.tail "
7992 << pi
.log_tail
<< ")" << dendl
;
7993 context
< RecoveryMachine
>().send_query(
7995 pg_query_t::FULLLOG
,
7996 i
->shard
, pg
->pg_whoami
.shard
,
7997 pg
->info
.history
, pg
->get_osdmap()->get_epoch()));
7999 peer_missing_requested
.insert(*i
);
8000 pg
->blocked_by
.insert(i
->osd
);
8003 if (peer_missing_requested
.empty()) {
8004 if (pg
->need_up_thru
) {
8005 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
8007 post_event(NeedUpThru());
8012 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8014 pg
->publish_stats_to_osd();
8018 boost::statechart::result
PG::RecoveryState::GetMissing::react(const MLogRec
& logevt
)
8020 PG
*pg
= context
< RecoveryMachine
>().pg
;
8022 peer_missing_requested
.erase(logevt
.from
);
8023 pg
->proc_replica_log(logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
8025 if (peer_missing_requested
.empty()) {
8026 if (pg
->need_up_thru
) {
8027 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
8029 post_event(NeedUpThru());
8031 ldout(pg
->cct
, 10) << "Got last missing, don't need missing "
8032 << "posting Activate" << dendl
;
8033 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8036 return discard_event();
8039 boost::statechart::result
PG::RecoveryState::GetMissing::react(const QueryState
& q
)
8041 PG
*pg
= context
< RecoveryMachine
>().pg
;
8042 q
.f
->open_object_section("state");
8043 q
.f
->dump_string("name", state_name
);
8044 q
.f
->dump_stream("enter_time") << enter_time
;
8046 q
.f
->open_array_section("peer_missing_requested");
8047 for (set
<pg_shard_t
>::iterator p
= peer_missing_requested
.begin();
8048 p
!= peer_missing_requested
.end();
8050 q
.f
->open_object_section("osd");
8051 q
.f
->dump_stream("osd") << *p
;
8052 if (pg
->peer_missing
.count(*p
)) {
8053 q
.f
->open_object_section("got_missing");
8054 pg
->peer_missing
[*p
].dump(q
.f
);
8055 q
.f
->close_section();
8057 q
.f
->close_section();
8059 q
.f
->close_section();
8061 q
.f
->close_section();
8062 return forward_event();
8065 void PG::RecoveryState::GetMissing::exit()
8067 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8068 PG
*pg
= context
< RecoveryMachine
>().pg
;
8069 utime_t dur
= ceph_clock_now() - enter_time
;
8070 pg
->osd
->recoverystate_perf
->tinc(rs_getmissing_latency
, dur
);
8071 pg
->blocked_by
.clear();
8072 pg
->publish_stats_to_osd();
8075 /*------WaitUpThru--------*/
8076 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx
)
8078 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/WaitUpThru")
8080 context
< RecoveryMachine
>().log_enter(state_name
);
8083 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const ActMap
& am
)
8085 PG
*pg
= context
< RecoveryMachine
>().pg
;
8086 if (!pg
->need_up_thru
) {
8087 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8089 return forward_event();
8092 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const MLogRec
& logevt
)
8094 PG
*pg
= context
< RecoveryMachine
>().pg
;
8095 ldout(pg
->cct
, 10) << "Noting missing from osd." << logevt
.from
<< dendl
;
8096 pg
->peer_missing
[logevt
.from
].claim(logevt
.msg
->missing
);
8097 pg
->peer_info
[logevt
.from
] = logevt
.msg
->info
;
8098 return discard_event();
8101 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const QueryState
& q
)
8103 q
.f
->open_object_section("state");
8104 q
.f
->dump_string("name", state_name
);
8105 q
.f
->dump_stream("enter_time") << enter_time
;
8106 q
.f
->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8107 q
.f
->close_section();
8108 return forward_event();
8111 void PG::RecoveryState::WaitUpThru::exit()
8113 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8114 PG
*pg
= context
< RecoveryMachine
>().pg
;
8115 utime_t dur
= ceph_clock_now() - enter_time
;
8116 pg
->osd
->recoverystate_perf
->tinc(rs_waitupthru_latency
, dur
);
8119 /*----RecoveryState::RecoveryMachine Methods-----*/
8121 #define dout_prefix *_dout << pg->gen_prefix()
8123 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name
)
8125 PG
*pg
= context
< RecoveryMachine
>().pg
;
8126 ldout(pg
->cct
, 5) << "enter " << state_name
<< dendl
;
8127 pg
->osd
->pg_recovery_stats
.log_enter(state_name
);
8130 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name
, utime_t enter_time
)
8132 utime_t dur
= ceph_clock_now() - enter_time
;
8133 PG
*pg
= context
< RecoveryMachine
>().pg
;
8134 ldout(pg
->cct
, 5) << "exit " << state_name
<< " " << dur
<< " " << event_count
<< " " << event_time
<< dendl
;
8135 pg
->osd
->pg_recovery_stats
.log_exit(state_name
, ceph_clock_now() - enter_time
,
8136 event_count
, event_time
);
8138 event_time
= utime_t();
8142 /*---------------------------------------------------*/
8144 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8146 void PG::RecoveryState::start_handle(RecoveryCtx
*new_ctx
) {
8151 if (messages_pending_flush
) {
8152 rctx
= RecoveryCtx(*messages_pending_flush
, *new_ctx
);
8156 rctx
->start_time
= ceph_clock_now();
8160 void PG::RecoveryState::begin_block_outgoing() {
8161 assert(!messages_pending_flush
);
8164 messages_pending_flush
= BufferedRecoveryMessages();
8165 rctx
= RecoveryCtx(*messages_pending_flush
, *orig_ctx
);
8168 void PG::RecoveryState::clear_blocked_outgoing() {
8171 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
8174 void PG::RecoveryState::end_block_outgoing() {
8175 assert(messages_pending_flush
);
8179 rctx
= RecoveryCtx(*orig_ctx
);
8180 rctx
->accept_buffered_messages(*messages_pending_flush
);
8181 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
8184 void PG::RecoveryState::end_handle() {
8186 utime_t dur
= ceph_clock_now() - rctx
->start_time
;
8187 machine
.event_time
+= dur
;
8190 machine
.event_count
++;
8191 rctx
= boost::optional
<RecoveryCtx
>();
8195 ostream
& operator<<(ostream
& out
, const PG::BackfillInterval
& bi
)
8197 out
<< "BackfillInfo(" << bi
.begin
<< "-" << bi
.end
8198 << " " << bi
.objects
.size() << " objects";
8199 if (!bi
.objects
.empty())
8200 out
<< " " << bi
.objects
;
8205 void intrusive_ptr_add_ref(PG
*pg
) { pg
->get("intptr"); }
8206 void intrusive_ptr_release(PG
*pg
) { pg
->put("intptr"); }
8208 #ifdef PG_DEBUG_REFS
8209 uint64_t get_with_id(PG
*pg
) { return pg
->get_with_id(); }
8210 void put_with_id(PG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }