#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDRepOpReply.h"
#include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
#include "common/BackTrace.h"
#include "common/EventTrace.h"
}
#endif
+
void PGPool::update(OSDMapRef map)
{
const pg_pool_t *pi = map->get_pg_pool(id);
dirty_info(false), dirty_big_info(false),
info(p),
info_struct_v(0),
- coll(p), pg_log(cct),
+ coll(p),
+ pg_log(cct),
pgmeta_oid(p.make_pgmeta_oid()),
missing_loc(this),
past_intervals(
bool PG::MissingLoc::readable_with_acting(
const hobject_t &hoid,
const set<pg_shard_t> &acting) const {
- if (!needs_recovery(hoid)) return true;
+ if (!needs_recovery(hoid))
+ return true;
+ if (is_deleted(hoid))
+ return false;
auto missing_loc_entry = missing_loc.find(hoid);
- if (missing_loc_entry == missing_loc.end()) return false;
+ if (missing_loc_entry == missing_loc.end())
+ return false;
const set<pg_shard_t> &locs = missing_loc_entry->second;
ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
set<pg_shard_t> have_acting;
handle->reset_tp_timeout();
loop = 0;
}
+ if (i->second.is_delete())
+ continue;
missing_loc[i->first].insert(sources.begin(), sources.end());
missing_loc_sources.insert(sources.begin(), sources.end());
}
handle->reset_tp_timeout();
loop = 0;
}
+ if (p->second.is_delete()) {
+ ldout(pg->cct, 10) << __func__ << " " << soid
+ << " delete, ignoring source" << dendl;
+ found_missing = true;
+ continue;
+ }
if (oinfo.last_update < need) {
ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
<< " also missing on osd." << fromosd
PG::Scrubber::Scrubber()
: reserved(false), reserve_failed(false),
epoch_start(0),
- active(false), queue_snap_trim(false),
+ active(false),
waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
must_scrub(false), must_deep_scrub(false), must_repair(false),
auto_repair(false),
dout(10) << "activate peer osd." << peer << " " << pi << dendl;
MOSDPGLog *m = 0;
+ assert(peer_missing.count(peer));
pg_missing_t& pm = peer_missing[peer];
bool needs_past_intervals = pi.dne();
* behind.
*/
// backfill
- osd->clog->info() << info.pgid << " starting backfill to osd." << peer
+ osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
<< " from (" << pi.log_tail << "," << pi.last_update
<< "] " << pi.last_backfill
<< " to " << info.last_update;
if (m && pi.last_backfill != hobject_t()) {
for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
p != m->log.log.end();
- ++p)
+ ++p) {
if (p->soid <= pi.last_backfill &&
- !p->is_error())
- pm.add_next_event(*p);
+ !p->is_error()) {
+ if (perform_deletes_during_peering() && p->is_delete()) {
+ pm.rm(p->soid, p->version);
+ } else {
+ pm.add_next_event(*p);
+ }
+ }
+ }
}
-
+
if (m) {
dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
//m->log.print(cout);
for (set<pg_shard_t>::iterator i = actingbackfill.begin();
i != actingbackfill.end();
++i) {
+ dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
if (*i == get_primary()) {
missing_loc.add_active_missing(missing);
if (!missing.have_missing())
op->need_write_cap(),
op->classes());
- dout(20) << "op_has_sufficient_caps pool=" << pool.id << " (" << pool.name
- << " " << req->get_hobj().nspace
+ dout(20) << "op_has_sufficient_caps "
+ << "session=" << session
+ << " pool=" << pool.id << " (" << pool.name
+ << " " << req->get_hobj().nspace
<< ") owner=" << pool.auid
<< " need_read_cap=" << op->need_read_cap()
<< " need_write_cap=" << op->need_write_cap()
}
}
-void PG::queue_recovery(bool front)
+// Ask the OSD to schedule recovery work for this PG.
+// Only a PG that is primary and peered queues itself; otherwise the
+// request is logged and dropped. recovery_queued is set before the PG is
+// handed to osd->queue_for_recovery().
+void PG::queue_recovery()
{
if (!is_primary() || !is_peered()) {
dout(10) << "queue_recovery -- not primary or not peered " << dendl;
} else {
dout(10) << "queue_recovery -- queuing" << dendl;
recovery_queued = true;
- osd->queue_for_recovery(this, front);
+ osd->queue_for_recovery(this);
}
}
+// Set PG_STATE_CLEAN when the acting set is full-sized, record the clean
+// epoch/interval in the PG history, and then kick snap trimming.
void PG::mark_clean()
{
- // only mark CLEAN if we have the desired number of replicas AND we
- // are not remapped.
- if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
- up == acting)
+ // Only mark CLEAN when the acting set has as many members as the osdmap's
+ // pg size for this PG; the previous "up == acting" condition is gone.
+ if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
+ // the forced-backfill/forced-recovery bits are dropped as we go clean
+ state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
state_set(PG_STATE_CLEAN);
-
- // NOTE: this is actually a bit premature: we haven't purged the
- // strays yet.
- info.history.last_epoch_clean = get_osdmap()->get_epoch();
- info.history.last_interval_clean = info.history.same_interval_since;
-
- past_intervals.clear();
- dirty_big_info = true;
-
- if (is_active()) {
- /* The check is needed because if we are below min_size we're not
- * actually active */
- kick_snap_trim();
+ info.history.last_epoch_clean = get_osdmap()->get_epoch();
+ info.history.last_interval_clean = info.history.same_interval_since;
+ past_intervals.clear();
+ dirty_big_info = true;
+ dirty_info = true;
}
- dirty_info = true;
+ // NOTE(review): kick_snap_trim() is no longer guarded by is_active() in
+ // this function; presumably callers are expected to check is_active()
+ // before calling mark_clean() -- confirm for every call site.
+ kick_snap_trim();
}
-unsigned PG::get_recovery_priority()
+// Set or clear the given state bits on this PG and republish its stats.
+//   new_mode: state flag mask handed to state_set()/state_clear()
+//   clear:    true to clear the bits, false to set them
+// Nothing happens (not even the stats publish) while `deleting` is set.
+void PG::_change_recovery_force_mode(int new_mode, bool clear)
{
- // a higher value -> a higher priority
-
- int pool_recovery_priority = 0;
- pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+ if (!deleting) {
+ // we can't and shouldn't do anything if the PG is being deleted locally
+ if (clear) {
+ state_clear(new_mode);
+ } else {
+ state_set(new_mode);
+ }
+ publish_stats_to_osd();
+ }
+}
- int ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
+// Clamp a priority value into
+// [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX] and return it.
+// The static_asserts verify at compile time that the range is non-empty and
+// non-negative, which is what lets callers cast the result to unsigned.
+inline int PG::clamp_recovery_priority(int priority)
+{
+ static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
+ static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
// Clamp to valid range
- if (ret > OSD_RECOVERY_PRIORITY_MAX) {
- ret = OSD_RECOVERY_PRIORITY_MAX;
- } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
- ret = OSD_RECOVERY_PRIORITY_MIN;
+ if (priority > OSD_RECOVERY_PRIORITY_MAX) {
+ return OSD_RECOVERY_PRIORITY_MAX;
+ } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
+ return OSD_RECOVERY_PRIORITY_MIN;
+ } else {
+ return priority;
}
+}
- static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
- static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
+// Compute the recovery priority for this PG.
+// When PG_STATE_FORCED_RECOVERY is set the result is
+// OSD_RECOVERY_PRIORITY_FORCED, returned without clamping. Otherwise the
+// pool's RECOVERY_PRIORITY option (ret stays 0 if the lookup does not write
+// it) is added to OSD_RECOVERY_PRIORITY_BASE and clamped to the valid range.
+unsigned PG::get_recovery_priority()
+{
+ // a higher value -> a higher priority
+ int ret = 0;
+ if (state & PG_STATE_FORCED_RECOVERY) {
+ ret = OSD_RECOVERY_PRIORITY_FORCED;
+ } else {
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
+ ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
+ }
+ dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
return static_cast<unsigned>(ret);
}
unsigned PG::get_backfill_priority()
{
// a higher value -> a higher priority
-
int ret = OSD_BACKFILL_PRIORITY_BASE;
- if (acting.size() < pool.info.min_size) {
- // inactive: no. of replicas < min_size, highest priority since it blocks IO
- ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
+ if (state & PG_STATE_FORCED_BACKFILL) {
+ ret = OSD_RECOVERY_PRIORITY_FORCED;
+ } else {
+ if (acting.size() < pool.info.min_size) {
+ // inactive: no. of replicas < min_size, highest priority since it blocks IO
+ ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
- } else if (is_undersized()) {
- // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
- assert(pool.info.size > actingset.size());
- ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
+ } else if (is_undersized()) {
+ // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
+ assert(pool.info.size > actingset.size());
+ ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
- } else if (is_degraded()) {
- // degraded: baseline degraded
- ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
- }
+ } else if (is_degraded()) {
+ // degraded: baseline degraded
+ ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
+ }
- // Adjust with pool's recovery priority
- int pool_recovery_priority = 0;
- pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
- ret += pool_recovery_priority;
+ // Adjust with pool's recovery priority
+ int pool_recovery_priority = 0;
+ pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
- // Clamp to valid range
- if (ret > OSD_RECOVERY_PRIORITY_MAX) {
- ret = OSD_RECOVERY_PRIORITY_MAX;
- } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
- ret = OSD_RECOVERY_PRIORITY_MIN;
+ ret = clamp_recovery_priority(pool_recovery_priority + ret);
}
return static_cast<unsigned>(ret);
// in the future).
info.set_last_backfill(hobject_t());
child->info.set_last_backfill(hobject_t());
+ // restarting backfill implies that the missing set is empty,
+ // since it is only used for objects prior to last_backfill
+ pg_log.reset_backfill();
+ child->pg_log.reset_backfill();
}
child->info.stats = info.stats;
auto last = logv.rbegin();
if (is_primary() && last != logv.rend()) {
projected_log.skip_can_rollback_to_to_head();
- projected_log.trim(cct, last->version, nullptr);
+ projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
}
if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
last_written_info = info;
+ // if we are upgrading from jewel, we need to force rebuild of
+ // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
+ // (before kraken). persisted missing set was circa
+ // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
+ // v8 was pre-jewel (per-pg meta object).
+ bool force_rebuild_missing = info_struct_v < 9;
+ if (force_rebuild_missing) {
+ dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
+ << dendl;
+ }
+
ostringstream oss;
pg_log.read_log_and_missing(
store,
info_struct_v < 8 ? coll_t::meta() : coll,
ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
info,
+ force_rebuild_missing,
oss,
cct->_conf->osd_ignore_stale_divergent_priors,
cct->_conf->osd_debug_verify_missing_on_start);
if (oss.tellp())
osd->clog->error() << oss.rdbuf();
+ if (force_rebuild_missing) {
+ dout(10) << __func__ << " forced rebuild of missing got "
+ << pg_log.get_missing()
+ << dendl;
+ }
+
// log any weirdness
log_weirdness();
}
// sloppy check
if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
osd->clog->error() << info.pgid
- << " log bound mismatch, info (" << pg_log.get_tail() << ","
- << pg_log.get_head() << "]"
+ << " log bound mismatch, info (tail,head] ("
+ << pg_log.get_tail() << "," << pg_log.get_head() << "]"
<< " actual ["
<< pg_log.get_log().log.begin()->version << ","
<< pg_log.get_log().log.rbegin()->version << "]";
} else {
osd->clog->error() << "osd." << osd->whoami
<< " pg " << info.pgid
- << " Regular scrub request, losing deep-scrub details";
+ << " Regular scrub request, deep-scrub details will be lost";
}
}
queue_scrub();
}
}
+// Repair objects in a scrub map whose stored object_info_t names a different
+// object (oi.soid) than the key they are filed under in smap.objects.
+//
+// For every mismatch this logs a cluster-log error, re-encodes the object
+// info with soid set to the map key, replaces the OI_ATTR entry held in the
+// scrub map, and applies a setattr transaction to the object store.
+// Objects with no OI_ATTR, or whose OI_ATTR cannot be decoded, are skipped.
+void PG::_repair_oinfo_oid(ScrubMap &smap)
+{
+ // objects are visited in reverse key order
+ for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
+ i != smap.objects.rend();
+ ++i) {
+ const hobject_t &hoid = i->first;
+ ScrubMap::object &o = i->second;
+
+ bufferlist bl;
+ if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
+ continue;
+ }
+ bl.push_back(o.attrs[OI_ATTR]);
+ object_info_t oi;
+ try {
+ oi.decode(bl);
+ } catch(...) {
+ // undecodable object info: nothing to compare, leave the object alone
+ continue;
+ }
+ if (oi.soid != hoid) {
+ ObjectStore::Transaction t;
+ // NOTE(review): _t is not referenced again in this function; presumably
+ // it is kept for its constructor/destructor effects -- confirm.
+ OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+ osd->clog->error() << "osd." << osd->whoami
+ << " found object info error on pg "
+ << info.pgid
+ << " oid " << hoid << " oid in object info: "
+ << oi.soid
+ << "...repaired";
+ // Fix object info
+ oi.soid = hoid;
+ bl.clear();
+ ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+ // keep the scrub map's copy of the attr in step with what is written
+ bufferptr bp(bl.c_str(), bl.length());
+ o.attrs[OI_ATTR] = bp;
+
+ t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
+ int r = osd->store->apply_transaction(osr.get(), std::move(t));
+ // a failed apply is only logged; the loop carries on with the next object
+ if (r != 0) {
+ derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
+ << dendl;
+ }
+ }
+ }
+}
+
/*
* build a scrub map over a chunk without releasing the lock
* only used by chunky scrub
get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
_scan_rollback_obs(rollback_obs, handle);
_scan_snaps(map);
+ _repair_oinfo_oid(map);
dout(20) << __func__ << " done" << dendl;
return 0;
eversion_t v;
bufferlist bv;
bv.push_back(po.attrs[OI_ATTR]);
- object_info_t oi(bv);
+ object_info_t oi;
+ try {
+ bufferlist::iterator bliter = bv.begin();
+ ::decode(oi, bliter);
+ } catch (...) {
+ dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
+ assert(0);
+ }
if (bad_peer != primary) {
- peer_missing[bad_peer].add(soid, oi.version, eversion_t());
+ peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
} else {
// We should only be scrubbing if the PG is clean.
assert(waiting_for_unreadable_object.empty());
const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
stringstream oss;
oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
- osd->clog->info(oss);
+ osd->clog->debug(oss);
}
scrubber.seed = -1;
scrubber.state = PG::Scrubber::INACTIVE;
done = true;
+ if (!snap_trimq.empty()) {
+ dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
+ snap_trimmer_scrub_complete();
+ }
+
break;
default:
requeue_ops(waiting_for_scrub);
- if (scrubber.queue_snap_trim) {
- dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
- snap_trimmer_scrub_complete();
- }
-
scrubber.reset();
// type-specific state clear
if (total_errors)
osd->clog->error(oss);
else
- osd->clog->info(oss);
+ osd->clog->debug(oss);
}
// finish up
assert(peer_missing.count(peer));
assert(peer_info.count(peer));
pg_missing_t& pmissing(peer_missing[peer]);
+ dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
pg_info_t& pinfo(peer_info[peer]);
bool invalidate_stats = PGLog::append_log_entries_update_missing(
pinfo.last_backfill,
return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
case MSG_OSD_REPOPREPLY:
return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
+
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
case MSG_OSD_EC_WRITE:
return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
pg->publish_stats_to_osd();
}
+// React to CancelBackfill while in the Backfilling state.
+// Cancels the local reservation, sends MBackfillReserve::REJECT to every
+// backfill target for which a cluster connection is available, clears
+// waiting_on_backfill, schedules a retry, and transitions to NotBackfilling.
+boost::statechart::result
+PG::RecoveryState::Backfilling::react(const CancelBackfill &)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+ // XXX: Add a new pg state so user can see why backfill isn't proceeding
+ // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
+ //pg->state_set(PG_STATE_BACKFILL_STALLED????);
+
+ for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
+ it != pg->backfill_targets.end();
+ ++it) {
+ assert(*it != pg->pg_whoami);
+ ConnectionRef con = pg->osd->get_con_osd_cluster(
+ it->osd, pg->get_osdmap()->get_epoch());
+ // targets without a connection are skipped; no REJECT is sent to them
+ if (con) {
+ pg->osd->send_message_osd_cluster(
+ new MBackfillReserve(
+ MBackfillReserve::REJECT,
+ spg_t(pg->info.pgid.pgid, it->shard),
+ pg->get_osdmap()->get_epoch()),
+ con.get());
+ }
+ }
+
+ pg->waiting_on_backfill.clear();
+
+ pg->schedule_backfill_full_retry();
+ return transit<NotBackfilling>();
+}
+
boost::statechart::result
PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
{
pg->backfill_reserved = false;
pg->backfill_reserving = false;
pg->state_clear(PG_STATE_BACKFILL);
+ pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
utime_t dur = ceph_clock_now() - enter_time;
pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
}
pg->queue_recovery();
}
-void PG::RecoveryState::Recovering::release_reservations()
+void PG::RecoveryState::Recovering::release_reservations(bool cancel)
{
PG *pg = context< RecoveryMachine >().pg;
- assert(!pg->pg_log.get_missing().have_missing());
+ assert(cancel || !pg->pg_log.get_missing().have_missing());
// release remote reservations
for (set<pg_shard_t>::const_iterator i =
{
PG *pg = context< RecoveryMachine >().pg;
pg->state_clear(PG_STATE_RECOVERING);
+ pg->state_clear(PG_STATE_FORCED_RECOVERY);
release_reservations();
return transit<Recovered>();
}
{
PG *pg = context< RecoveryMachine >().pg;
pg->state_clear(PG_STATE_RECOVERING);
+ pg->state_clear(PG_STATE_FORCED_RECOVERY);
release_reservations();
return transit<WaitRemoteBackfillReserved>();
}
+// React to CancelRecovery while in the Recovering state.
+// Clears PG_STATE_RECOVERING, cancels the local reservation, releases the
+// reservations with cancel=true, schedules a retry, and transitions to
+// NotRecovering.
+boost::statechart::result
+PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
+{
+ PG *pg = context< RecoveryMachine >().pg;
+ pg->state_clear(PG_STATE_RECOVERING);
+ pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+ // cancel=true relaxes release_reservations()'s assert that nothing is
+ // missing, since recovery is being abandoned rather than completed
+ release_reservations(true);
+ pg->schedule_recovery_full_retry();
+ return transit<NotRecovering>();
+}
+
void PG::RecoveryState::Recovering::exit()
{
context< RecoveryMachine >().log_exit(state_name, enter_time);
if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
pg->actingbackfill.size()) {
pg->state_clear(PG_STATE_DEGRADED);
+ pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
pg->publish_stats_to_osd();
}
ceph_abort();
}
pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
- pg->mark_clean();
+
+ if (pg->is_active()) {
+ pg->mark_clean();
+ }
pg->share_pg_info();
pg->publish_stats_to_osd();
-
+ pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
}
void PG::RecoveryState::Clean::exit()
pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
- << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED";
+ << " objects unfound and apparently lost, would automatically "
+ << "mark these objects lost but this feature is not yet implemented "
+ << "(osd_auto_mark_unfound_lost)";
} else
- pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound << " objects unfound and apparently lost";
+ pg->osd->clog->error() << pg->info.pgid.pgid << " has "
+ << unfound << " objects unfound and apparently lost";
}
if (pg->is_active()) {
++i) {
if (*i == pg->get_primary()) continue;
const pg_info_t& pi = pg->peer_info[*i];
+ // reset this so to make sure the pg_missing_t is initialized and
+ // has the correct semantics even if we don't need to get a
+ // missing set from a shard. This way later additions due to
+ // lost+unfound delete work properly.
+ pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
if (pi.is_empty())
continue; // no pg data, nothing divergent
if (pi.last_update < pg->pg_log.get_tail()) {
ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
- pg->peer_missing[*i];
+ pg->peer_missing[*i].clear();
continue;
}
if (pi.last_backfill == hobject_t()) {
ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
- pg->peer_missing[*i];
+ pg->peer_missing[*i].clear();
continue;
}
// FIXME: we can do better here. if last_update==last_complete we
// can infer the rest!
ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
- pg->peer_missing[*i];
+ pg->peer_missing[*i].clear();
continue;
}