build_might_have_unfound();
- state_set(PG_STATE_DEGRADED);
if (have_unfound())
discover_all_missing(query_map);
}
- // degraded?
+ // num_objects_degraded if calculated should reflect this too, unless no
+ // missing and we are about to go clean.
if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
- state_set(PG_STATE_DEGRADED);
state_set(PG_STATE_UNDERSIZED);
}
// waiters
if (flushes_in_progress == 0) {
requeue_ops(waiting_for_peered);
+ } else if (!waiting_for_peered.empty()) {
+ dout(10) << __func__ << " flushes in progress, moving "
+ << waiting_for_peered.size() << " items to waiting_for_flush"
+ << dendl;
+ assert(waiting_for_flush.empty());
+ waiting_for_flush.swap(waiting_for_peered);
}
}
assert(!actingbackfill.empty());
assert(blocked_by.empty());
+ // Degraded?
+ _update_calc_stats();
+ if (info.stats.stats.sum.num_objects_degraded) {
+ state_set(PG_STATE_DEGRADED);
+ } else {
+ state_clear(PG_STATE_DEGRADED);
+ }
+
queue_peering_event(
CephPeeringEvtRef(
std::make_shared<CephPeeringEvt>(
info.stats.ondisk_log_size = info.stats.log_size;
info.stats.log_start = pg_log.get_tail();
info.stats.ondisk_log_start = pg_log.get_tail();
+ info.stats.snaptrimq_len = snap_trimq.size();
- // If actingset is larger then upset we will have misplaced,
- // so we will report based on actingset size.
-
- // If upset is larger then we will have degraded,
- // so we will report based on upset size.
+ unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
- // If target is the largest of them all, it will contribute to
- // the degraded count because num_object_copies is
- // computed using target and eventual used to get degraded total.
-
- unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
- unsigned nrep = MAX(actingset.size(), upset.size());
+ // In rare case that upset is too large (usually transient), use as target
+ // for calculations below.
+ unsigned target = std::max(num_shards, (unsigned)upset.size());
+ // Not sure this could ever happen, that actingset > upset
+ // which only matters if actingset > num_shards.
+ unsigned nrep = std::max(actingset.size(), upset.size());
// calc num_object_copies
info.stats.stats.calc_copies(MAX(target, nrep));
info.stats.stats.sum.num_objects_degraded = 0;
info.stats.stats.sum.num_objects_unfound = 0;
info.stats.stats.sum.num_objects_misplaced = 0;
- if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
- // NOTE: we only generate copies, degraded, misplaced and unfound
+ if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
+ dout(20) << __func__ << " actingset " << actingset << " upset "
+ << upset << " actingbackfill " << actingbackfill << dendl;
+ dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
+
+ assert(!actingbackfill.empty());
+
+ // NOTE: we only generate degraded, misplaced and unfound
// values for the summation, not individual stat categories.
int64_t num_objects = info.stats.stats.sum.num_objects;
- // Total sum of all missing
- int64_t missing = 0;
- // Objects that have arrived backfilled to up OSDs (not in acting)
- int64_t backfilled = 0;
- // A misplaced object is not stored on the correct OSD
- int64_t misplaced = 0;
- // Total of object copies/shards found
- int64_t object_copies = 0;
-
- // num_objects_missing on each peer
- for (map<pg_shard_t, pg_info_t>::iterator pi =
- peer_info.begin();
- pi != peer_info.end();
- ++pi) {
- map<pg_shard_t, pg_missing_t>::const_iterator pm =
- peer_missing.find(pi->first);
- if (pm != peer_missing.end()) {
- pi->second.stats.stats.sum.num_objects_missing =
- pm->second.num_missing();
- }
+ // Objects missing from up nodes, sorted by # objects.
+ boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
+ // Objects missing from nodes not in up, sort by # objects
+ boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
+
+ int64_t missing;
+
+ // Primary first
+ missing = pg_log.get_missing().num_missing();
+ assert(actingbackfill.count(pg_whoami));
+ if (upset.count(pg_whoami)) {
+ missing_target_objects.insert(make_pair(missing, pg_whoami));
+ } else {
+ acting_source_objects.insert(make_pair(missing, pg_whoami));
}
+ info.stats.stats.sum.num_objects_missing_on_primary = missing;
- assert(!actingbackfill.empty());
- for (set<pg_shard_t>::iterator i = actingbackfill.begin();
- i != actingbackfill.end();
- ++i) {
- const pg_shard_t &p = *i;
-
- bool in_up = (upset.find(p) != upset.end());
- bool in_acting = (actingset.find(p) != actingset.end());
- assert(in_up || in_acting);
-
- // in acting Compute total objects excluding num_missing
- // in acting and not in up Compute misplaced objects excluding num_missing
- // in up and not in acting Compute total objects already backfilled
- if (in_acting) {
- unsigned osd_missing;
- // primary handling
- if (p == pg_whoami) {
- osd_missing = pg_log.get_missing().num_missing();
- info.stats.stats.sum.num_objects_missing_on_primary =
- osd_missing;
- object_copies += num_objects; // My local (primary) count
- } else {
- assert(peer_missing.count(p));
- osd_missing = peer_missing[p].num_missing();
- object_copies += peer_info[p].stats.stats.sum.num_objects;
+ // All other peers
+ for (auto& peer : peer_info) {
+ // Ignore other peers until we add code to look at detailed missing
+ // information. (recovery)
+ if (!actingbackfill.count(peer.first)) {
+ continue;
+ }
+ missing = 0;
+ // Backfill targets always track num_objects accurately
+ // all other peers track missing accurately.
+ if (is_backfill_targets(peer.first)) {
+ missing = std::max((int64_t)0, num_objects - peer.second.stats.stats.sum.num_objects);
+ } else {
+ if (peer_missing.count(peer.first)) {
+ missing = peer_missing[peer.first].num_missing();
+ } else {
+ dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
}
- missing += osd_missing;
- // Count non-missing objects not in up as misplaced
- if (!in_up && num_objects > osd_missing)
- misplaced += num_objects - osd_missing;
+ }
+ if (upset.count(peer.first)) {
+ missing_target_objects.insert(make_pair(missing, peer.first));
} else {
- assert(in_up && !in_acting);
+ acting_source_objects.insert(make_pair(missing, peer.first));
+ }
+ peer.second.stats.stats.sum.num_objects_missing = missing;
+ }
- // If this peer has more objects then it should, ignore them
- backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
+ if (pool.info.is_replicated()) {
+ // Add to missing_target_objects up to target elements (num_objects missing)
+ assert(target >= missing_target_objects.size());
+ unsigned needed = target - missing_target_objects.size();
+ for (; needed; --needed)
+ missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD)));
+ } else {
+ for (unsigned i = 0 ; i < num_shards; ++i) {
+ shard_id_t shard(i);
+ bool found = false;
+ for (const auto& t : missing_target_objects) {
+ if (std::get<1>(t).shard == shard) {
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
}
}
- // Any objects that have been backfilled to up OSDs can deducted from misplaced
- misplaced = MAX(0, misplaced - backfilled);
+ for (const auto& item : missing_target_objects)
+ dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
+ for (const auto& item : acting_source_objects)
+ dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
+
+ // A misplaced object is not stored on the correct OSD
+ int64_t misplaced = 0;
+ // a degraded objects has fewer replicas or EC shards than the pool specifies.
+ int64_t degraded = 0;
+
+ for (auto m = missing_target_objects.rbegin();
+ m != missing_target_objects.rend(); ++m) {
+
+ int64_t extra_missing = -1;
- // Deduct computed total missing on acting nodes
- object_copies -= missing;
- // Include computed backfilled objects on up nodes
- object_copies += backfilled;
- // a degraded objects has fewer replicas or EC shards than the
- // pool specifies. num_object_copies will never be smaller than target * num_copies.
- int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
+ if (pool.info.is_replicated()) {
+ if (!acting_source_objects.empty()) {
+ auto extra_copy = acting_source_objects.begin();
+ extra_missing = std::get<0>(*extra_copy);
+ acting_source_objects.erase(extra_copy);
+ }
+ } else { // Erasure coded
+ // Use corresponding shard
+ for (const auto& a : acting_source_objects) {
+ if (std::get<1>(a).shard == std::get<1>(*m).shard) {
+ extra_missing = std::get<0>(a);
+ acting_source_objects.erase(a);
+ break;
+ }
+ }
+ }
+
+ if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
+ // We don't know which of the objects on the target
+ // are part of extra_missing so assume are all degraded.
+ misplaced += std::get<0>(*m) - extra_missing;
+ degraded += extra_missing;
+ } else {
+ // 1. extra_missing == -1, more targets than sources so degraded
+ // 2. extra_missing > std::get<0>(m), so that we know that some extra_missing
+ // previously degraded are now present on the target.
+ degraded += std::get<0>(*m);
+ }
+ }
+ // If there are still acting that haven't been accounted for
+ // then they are misplaced
+ for (const auto& a : acting_source_objects) {
+ int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
+ dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
+ misplaced += extra_misplaced;
+ }
+ dout(20) << __func__ << " degraded " << degraded << dendl;
+ dout(20) << __func__ << " misplaced " << misplaced << dendl;
info.stats.stats.sum.num_objects_degraded = degraded;
info.stats.stats.sum.num_objects_unfound = get_num_unfound();
}
_update_calc_stats();
+ if (info.stats.stats.sum.num_objects_degraded) {
+ state_set(PG_STATE_DEGRADED);
+ } else {
+ state_clear(PG_STATE_DEGRADED);
+ }
_update_blocked_by();
bool publish = false;
cct->_conf->osd_ignore_stale_divergent_priors,
cct->_conf->osd_debug_verify_missing_on_start);
if (oss.tellp())
- osd->clog->error() << oss.rdbuf();
+ osd->clog->error() << oss.str();
if (force_rebuild_missing) {
dout(10) << __func__ << " forced rebuild of missing got "
try {
::decode(snaps, p);
} catch (...) {
+ derr << __func__ << " decode snaps failure on " << *i << dendl;
snaps.clear();
}
set<snapid_t> _snaps(snaps.begin(), snaps.end());
<< "...repaired";
}
snap_mapper.add_oid(hoid, obj_snaps, &_t);
- r = osd->store->apply_transaction(osr.get(), std::move(t));
- if (r != 0) {
- derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
- << dendl;
+
+ // wait for repair to apply to avoid confusing other bits of the system.
+ {
+ Cond my_cond;
+ Mutex my_lock("PG::_scan_snaps my_lock");
+ int r = 0;
+ bool done;
+ t.register_on_applied_sync(
+ new C_SafeCond(&my_lock, &my_cond, &done, &r));
+ r = osd->store->apply_transaction(osr.get(), std::move(t));
+ if (r != 0) {
+ derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
+ << dendl;
+ } else {
+ my_lock.Lock();
+ while (!done)
+ my_cond.Wait(my_lock);
+ my_lock.Unlock();
+ }
}
}
}
bool PG::append_log_entries_update_missing(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
- ObjectStore::Transaction &t)
+ ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
+ boost::optional<eversion_t> roll_forward_to)
{
assert(!entries.empty());
assert(entries.begin()->version > info.last_update);
PGLogEntryHandler rollbacker{this, &t};
+ if (roll_forward_to) {
+ pg_log.roll_forward(&rollbacker);
+ }
bool invalidate_stats =
pg_log.append_new_log_entries(info.last_backfill,
info.last_backfill_bitwise,
entries,
&rollbacker);
+
+ if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
+ pg_log.roll_forward(&rollbacker);
+ }
+ if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
+ pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
+ last_rollback_info_trimmed_to_applied = *roll_forward_to;
+ }
+
info.last_update = pg_log.get_head();
if (pg_log.get_missing().num_missing() == 0) {
// advance last_complete since nothing else is missing!
info.last_complete = info.last_update;
}
-
info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
+
+ dout(20) << __func__ << "trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
+ if (trim_to)
+ pg_log.trim(*trim_to, info);
dirty_info = true;
write_if_dirty(t);
return invalidate_stats;
void PG::merge_new_log_entries(
const mempool::osd_pglog::list<pg_log_entry_t> &entries,
- ObjectStore::Transaction &t)
+ ObjectStore::Transaction &t,
+ boost::optional<eversion_t> trim_to,
+ boost::optional<eversion_t> roll_forward_to)
{
dout(10) << __func__ << " " << entries << dendl;
assert(is_primary());
- bool rebuild_missing = append_log_entries_update_missing(entries, t);
+ bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
i != actingbackfill.end();
++i) {
pg_shard_t old_acting_primary = get_primary();
pg_shard_t old_up_primary = up_primary;
bool was_old_primary = is_primary();
+ bool was_old_replica = is_replica();
acting.swap(oldacting);
up.swap(oldup);
actingbackfill.clear();
scrub_queued = false;
- // reset primary state?
+ // reset primary/replica state?
if (was_old_primary || is_primary()) {
osd->remove_want_pg_temp(info.pgid.pgid);
+ } else if (was_old_replica || is_replica()) {
+ osd->remove_want_pg_temp(info.pgid.pgid);
}
clear_primary_state();
assert(!is_primary());
update_history(oinfo.history);
+ if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
+ info.stats.stats.sum.num_scrub_errors = 0;
+ info.stats.stats.sum.num_shallow_scrub_errors = 0;
+ info.stats.stats.sum.num_deep_scrub_errors = 0;
+ dirty_info = true;
+ }
- if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
- // DEBUG: verify that the snaps are empty in snap_mapper
- if (cct->_conf->osd_debug_verify_snaps_on_info) {
- interval_set<snapid_t> p;
- p.union_of(oinfo.purged_snaps, info.purged_snaps);
- p.subtract(info.purged_snaps);
- if (!p.empty()) {
- for (interval_set<snapid_t>::iterator i = p.begin();
- i != p.end();
- ++i) {
- for (snapid_t snap = i.get_start();
- snap != i.get_len() + i.get_start();
- ++snap) {
- vector<hobject_t> hoids;
- int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
- if (r != 0 && r != -ENOENT) {
- derr << __func__ << ": snap_mapper get_next_object_to_trim returned "
- << cpp_strerror(r) << dendl;
- ceph_abort();
- } else if (r != -ENOENT) {
- assert(!hoids.empty());
- derr << __func__ << ": snap_mapper get_next_object_to_trim returned "
- << cpp_strerror(r) << " for object "
- << hoids[0] << " on snap " << snap
- << " which should have been fully trimmed " << dendl;
- ceph_abort();
- }
- }
- }
- }
- }
+ if (!(info.purged_snaps == oinfo.purged_snaps)) {
+ dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
+ << dendl;
info.purged_snaps = oinfo.purged_snaps;
dirty_info = true;
dirty_big_info = true;
<< " " << pg.up;
if (pg.acting != pg.up)
out << "/" << pg.acting;
+ if (pg.is_ec_pg())
+ out << "p" << pg.get_primary();
out << " r=" << pg.get_role();
out << " lpr=" << pg.get_last_peering_reset();
recovery_state.handle_event(evt, rctx);
ActMap evt2;
recovery_state.handle_event(evt2, rctx);
+
+ rctx->on_applied->add(make_lambda_context([this]() {
+ update_store_with_options();
+ }));
}
void PG::handle_query_state(Formatter *f)
{
auto r = osd->store->set_collection_opts(coll, pool.info.opts);
if(r < 0 && r != -EOPNOTSUPP) {
- derr << __func__ << "set_collection_opts returns error:" << r << dendl;
+ derr << __func__ << " set_collection_opts returns error:" << r << dendl;
}
}
return transit<NotBackfilling>();
}
+boost::statechart::result
+PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
+ pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+
+ pg->state_set(PG_STATE_BACKFILL_UNFOUND);
+ pg->state_clear(PG_STATE_BACKFILLING);
+
+ for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
+ it != pg->backfill_targets.end();
+ ++it) {
+ assert(*it != pg->pg_whoami);
+ ConnectionRef con = pg->osd->get_con_osd_cluster(
+ it->osd, pg->get_osdmap()->get_epoch());
+ if (con) {
+ pg->osd->send_message_osd_cluster(
+ new MBackfillReserve(
+ MBackfillReserve::REJECT,
+ spg_t(pg->info.pgid.pgid, it->shard),
+ pg->get_osdmap()->get_epoch()),
+ con.get());
+ }
+ }
+
+ pg->waiting_on_backfill.clear();
+
+ return transit<NotBackfilling>();
+}
+
boost::statechart::result
PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
{
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
+ pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
utime_t dur = ceph_clock_now() - enter_time;
pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
}
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
PG *pg = context< RecoveryMachine >().pg;
+ pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
utime_t dur = ceph_clock_now() - enter_time;
pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
}
pg->state_clear(PG_STATE_RECOVERY_WAIT);
pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
pg->state_set(PG_STATE_RECOVERING);
+ assert(!pg->state_test(PG_STATE_ACTIVATING));
pg->publish_stats_to_osd();
pg->queue_recovery();
}
pg->state_clear(PG_STATE_FORCED_RECOVERY);
release_reservations();
pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+ // XXX: Is this needed?
+ pg->publish_stats_to_osd();
return transit<WaitLocalBackfillReserved>();
}
return transit<NotRecovering>();
}
+boost::statechart::result
+PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
+ pg->state_set(PG_STATE_RECOVERY_UNFOUND);
+ pg->state_clear(PG_STATE_RECOVERING);
+ pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+ release_reservations(true);
+ return transit<NotRecovering>();
+}
+
void PG::RecoveryState::Recovering::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
assert(!pg->actingbackfill.empty());
if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
pg->actingbackfill.size()) {
- pg->state_clear(PG_STATE_DEGRADED);
pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
pg->publish_stats_to_osd();
}
pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
pg->state_clear(PG_STATE_UNDERSIZED);
- if (pg->needs_recovery()) {
- pg->state_set(PG_STATE_DEGRADED);
- } else {
- pg->state_clear(PG_STATE_DEGRADED);
- }
} else {
pg->state_set(PG_STATE_UNDERSIZED);
- pg->state_set(PG_STATE_DEGRADED);
}
- need_publish = true; // degraded may have changed
+ // degraded changes will be detected by call from publish_stats_to_osd()
+ need_publish = true;
}
// if we haven't reported our PG stats in a long time, do so now.
pg->peer_missing[logevt.from],
logevt.from,
context< RecoveryMachine >().get_recovery_ctx());
- if (pg->is_peered() &&
- got_missing)
- pg->queue_recovery();
+ // If there are missing AND we are "fully" active then start recovery now
+ if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
+ post_event(DoRecovery());
+ }
return discard_event();
}
// waiters
if (pg->flushes_in_progress == 0) {
pg->requeue_ops(pg->waiting_for_peered);
+ } else if (!pg->waiting_for_peered.empty()) {
+ ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
+ << pg->waiting_for_peered.size()
+ << " items to waiting_for_flush"
+ << dendl;
+ assert(pg->waiting_for_flush.empty());
+ pg->waiting_for_flush.swap(pg->waiting_for_peered);
}
pg->on_activate();