#include "OSDMap.h"
#include "PGLog.h"
#include "common/LogClient.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
#define dout_context cct
#define dout_subsys ceph_subsys_osd
return *_dout << pgb->get_parent()->gen_dbg_prefix();
}
+void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
+ RecoveryHandle *h)
+{
+ assert(get_parent()->get_actingbackfill_shards().size() > 0);
+ for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
+ if (shard == get_parent()->whoami_shard())
+ continue;
+ if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+ dout(20) << __func__ << " will remove " << oid << " " << v << " from "
+ << shard << dendl;
+ h->deletes[shard].push_back(make_pair(oid, v));
+ get_parent()->begin_peer_recover(shard, oid);
+ }
+ }
+}
+
+void PGBackend::send_recovery_deletes(int prio,
+ const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
+{
+ epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
+ for (const auto& p : deletes) {
+ const auto& shard = p.first;
+ const auto& objects = p.second;
+ ConnectionRef con = get_parent()->get_con_osd_cluster(
+ shard.osd,
+ get_osdmap()->get_epoch());
+ if (!con)
+ continue;
+ auto it = objects.begin();
+ while (it != objects.end()) {
+ uint64_t cost = 0;
+ uint64_t deletes = 0;
+ spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
+ MOSDPGRecoveryDelete *msg =
+ new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
+ target_pg,
+ get_osdmap()->get_epoch(),
+ min_epoch);
+ msg->set_priority(prio);
+
+ while (it != objects.end() &&
+ cost < cct->_conf->osd_max_push_cost &&
+ deletes < cct->_conf->osd_max_push_objects) {
+ dout(20) << __func__ << ": sending recovery delete << " << it->first
+ << " " << it->second << " to osd." << shard << dendl;
+ msg->objects.push_back(*it);
+ cost += cct->_conf->osd_push_per_object_cost;
+ ++deletes;
+ ++it;
+ }
+
+ msg->set_cost(cost);
+ get_parent()->send_message_osd_cluster(msg, con);
+ }
+ }
+}
+
+bool PGBackend::handle_message(OpRequestRef op)
+{
+ switch (op->get_req()->get_type()) {
+ case MSG_OSD_PG_RECOVERY_DELETE:
+ handle_recovery_delete(op);
+ return true;
+
+ case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+ handle_recovery_delete_reply(op);
+ return true;
+
+ default:
+ break;
+ }
+
+ return _handle_message(op);
+}
+
+void PGBackend::handle_recovery_delete(OpRequestRef op)
+{
+ const MOSDPGRecoveryDelete *m = static_cast<const MOSDPGRecoveryDelete *>(op->get_req());
+ assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
+ dout(20) << __func__ << " " << op << dendl;
+
+ op->mark_started();
+
+ C_GatherBuilder gather(cct);
+ for (const auto &p : m->objects) {
+ get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
+ }
+
+ MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply;
+ reply->from = get_parent()->whoami_shard();
+ reply->set_priority(m->get_priority());
+ reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
+ reply->map_epoch = m->map_epoch;
+ reply->min_epoch = m->min_epoch;
+ reply->objects = m->objects;
+ ConnectionRef conn = m->get_connection();
+
+ gather.set_finisher(new FunctionContext(
+ [=](int r) {
+ if (r != -EAGAIN) {
+ get_parent()->send_message_osd_cluster(reply, conn.get());
+ } else {
+ reply->put();
+ }
+ }));
+ gather.activate();
+}
+
+void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
+{
+ const MOSDPGRecoveryDeleteReply *m = static_cast<const MOSDPGRecoveryDeleteReply *>(op->get_req());
+ assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
+ dout(20) << __func__ << " " << op << dendl;
+
+ for (const auto &p : m->objects) {
+ ObjectRecoveryInfo recovery_info;
+ hobject_t oid = p.first;
+ recovery_info.version = p.second;
+ get_parent()->on_peer_recover(m->from, oid, recovery_info);
+ bool peers_recovered = true;
+ for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
+ if (shard == get_parent()->whoami_shard())
+ continue;
+ if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+ dout(20) << __func__ << " " << oid << " still missing on at least "
+ << shard << dendl;
+ peers_recovered = false;
+ break;
+ }
+ }
+ if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
+ dout(20) << __func__ << " completed recovery, local_missing = "
+ << get_parent()->get_local_missing() << dendl;
+ object_stat_sum_t stat_diff;
+ stat_diff.num_objects_recovered = 1;
+ get_parent()->on_global_recover(p.first, stat_diff, true);
+ }
+ }
+}
+
void PGBackend::rollback(
const pg_log_entry_t &entry,
ObjectStore::Transaction *t)
for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
i != auth.attrs.end();
++i) {
+ // We check system keys seperately
+ if (i->first == OI_ATTR || i->first == SS_ATTR)
+ continue;
if (!candidate.attrs.count(i->first)) {
if (error != CLEAN)
errorstream << ", ";
for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
i != candidate.attrs.end();
++i) {
+ // We check system keys seperately
+ if (i->first == OI_ATTR || i->first == SS_ATTR)
+ continue;
if (!auth.attrs.count(i->first)) {
if (error != CLEAN)
errorstream << ", ";
inconsistent_obj_wrapper &object_error)
{
eversion_t auth_version;
- bufferlist auth_bl;
+ bufferlist first_bl;
- map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+ // Create list of shards with primary last so it will be auth copy all
+ // other things being equal.
+ list<pg_shard_t> shards;
for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
j != maps.end();
++j) {
+ if (j->first == get_parent()->whoami_shard())
+ continue;
+ shards.push_back(j->first);
+ }
+ shards.push_back(get_parent()->whoami_shard());
+
+ map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+ for (auto &l : shards) {
+ map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
map<hobject_t, ScrubMap::object>::iterator i =
j->second->objects.find(obj);
if (i == j->second->objects.end()) {
}
string error_string;
auto& shard_info = shard_map[j->first];
+ if (j->first == get_parent()->whoami_shard())
+ shard_info.primary = true;
if (i->second.read_error) {
shard_info.set_read_error();
error_string += " read_error";
object_info_t oi;
bufferlist bl;
map<string, bufferptr>::iterator k;
+ SnapSet ss;
+ bufferlist ss_bl;
if (i->second.stat_error) {
shard_info.set_stat_error();
goto out;
}
+ // We won't pick an auth copy if the snapset is missing or won't decode.
+ if (obj.is_head() || obj.is_snapdir()) {
+ k = i->second.attrs.find(SS_ATTR);
+ if (k == i->second.attrs.end()) {
+ shard_info.set_ss_attr_missing();
+ error_string += " ss_attr_missing";
+ } else {
+ ss_bl.push_back(k->second);
+ try {
+ bufferlist::iterator bliter = ss_bl.begin();
+ ::decode(ss, bliter);
+ } catch (...) {
+ // invalid snapset, probably corrupt
+ shard_info.set_ss_attr_corrupted();
+ error_string += " ss_attr_corrupted";
+ }
+ }
+ }
+
k = i->second.attrs.find(OI_ATTR);
if (k == i->second.attrs.end()) {
// no object info on object, probably corrupt
goto out;
}
- if (auth_version != eversion_t()) {
- if (!object_error.has_object_info_inconsistency() && !(bl == auth_bl)) {
- object_error.set_object_info_inconsistency();
- error_string += " object_info_inconsistency";
- }
+ // This is automatically corrected in PG::_repair_oinfo_oid()
+ assert(oi.soid == obj);
+
+ if (first_bl.length() == 0) {
+ first_bl.append(bl);
+ } else if (!object_error.has_object_info_inconsistency() && !bl.contents_equal(first_bl)) {
+ object_error.set_object_info_inconsistency();
+ error_string += " object_info_inconsistency";
}
- // Don't use this particular shard because it won't be able to repair data
- // XXX: For now we can't pick one shard for repair and another's object info
- if (i->second.read_error || i->second.ec_hash_mismatch || i->second.ec_size_mismatch)
+ if (i->second.size != be_get_ondisk_size(oi.size)) {
+ dout(5) << __func__ << " size " << i->second.size << " oi size " << oi.size << dendl;
+ shard_info.set_obj_size_oi_mismatch();
+ error_string += " obj_size_oi_mismatch";
+ }
+
+ // Don't use this particular shard due to previous errors
+ // XXX: For now we can't pick one shard for repair and another's object info or snapset
+ if (shard_info.errors)
goto out;
if (auth_version == eversion_t() || oi.version > auth_version ||
auth = j;
*auth_oi = oi;
auth_version = oi.version;
- auth_bl.clear();
- auth_bl.append(bl);
}
out:
be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error);
list<pg_shard_t> auth_list;
+ set<pg_shard_t> object_errors;
if (auth == maps.end()) {
object_error.set_version(0);
- object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, deep_errors);
+ object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
+ deep_errors, get_parent()->whoami_shard());
if (object_error.has_deep_errors())
++deep_errors;
else if (object_error.has_shallow_errors())
if (found)
errorstream << pgid << " shard " << j->first << ": soid " << *k
<< " " << ss.str() << "\n";
+ } else if (found) {
+ // Track possible shard to use as authoritative, if needed
+ // There are errors, without identifying the shard
+ object_errors.insert(j->first);
} else {
// XXX: The auth shard might get here that we don't know
// that it has the "correct" data.
} else {
cur_missing.insert(j->first);
shard_map[j->first].set_missing();
+ shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
// Can't have any other errors if there is no information available
++shallow_errors;
errorstream << pgid << " shard " << j->first << " missing " << *k
}
if (auth_list.empty()) {
- errorstream << pgid.pgid << " soid " << *k
+ if (object_errors.empty()) {
+ errorstream << pgid.pgid << " soid " << *k
<< ": failed to pick suitable auth object\n";
- goto out;
+ goto out;
+ }
+ // Object errors exist and nothing in auth_list
+ // Prefer the auth shard otherwise take first from list.
+ pg_shard_t shard;
+ if (object_errors.count(auth->first)) {
+ shard = auth->first;
+ } else {
+ shard = *(object_errors.begin());
+ }
+ auth_list.push_back(shard);
+ object_errors.erase(shard);
}
+ // At this point auth_list is populated, so we add the object errors shards
+ // as inconsistent.
+ cur_inconsistent.insert(object_errors.begin(), object_errors.end());
if (!cur_missing.empty()) {
missing[*k] = cur_missing;
}