]> git.proxmox.com Git - ceph.git/blobdiff - ceph/src/osd/PGBackend.cc
update sources to v12.2.0
[ceph.git] / ceph / src / osd / PGBackend.cc
index fab9ee826642c676747668e9f08f706a8fd38d65..468e29297d0de41e14bad096c23d4ee82d2219ed 100644 (file)
@@ -27,6 +27,8 @@
 #include "OSDMap.h"
 #include "PGLog.h"
 #include "common/LogClient.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
 
 #define dout_context cct
 #define dout_subsys ceph_subsys_osd
@@ -37,6 +39,146 @@ static ostream& _prefix(std::ostream *_dout, PGBackend *pgb) {
   return *_dout << pgb->get_parent()->gen_dbg_prefix();
 }
 
+void PGBackend::recover_delete_object(const hobject_t &oid, eversion_t v,
+                                     RecoveryHandle *h)
+{
+  assert(get_parent()->get_actingbackfill_shards().size() > 0);
+  for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
+    if (shard == get_parent()->whoami_shard())
+      continue;
+    if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+      dout(20) << __func__ << " will remove " << oid << " " << v << " from "
+              << shard << dendl;
+      h->deletes[shard].push_back(make_pair(oid, v));
+      get_parent()->begin_peer_recover(shard, oid);
+    }
+  }
+}
+
+void PGBackend::send_recovery_deletes(int prio,
+                                     const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes)
+{
+  epoch_t min_epoch = get_parent()->get_last_peering_reset_epoch();
+  for (const auto& p : deletes) {
+    const auto& shard = p.first;
+    const auto& objects = p.second;
+    ConnectionRef con = get_parent()->get_con_osd_cluster(
+      shard.osd,
+      get_osdmap()->get_epoch());
+    if (!con)
+      continue;
+    auto it = objects.begin();
+    while (it != objects.end()) {
+      uint64_t cost = 0;
+      uint64_t deletes = 0;
+      spg_t target_pg = spg_t(get_parent()->get_info().pgid.pgid, shard.shard);
+      MOSDPGRecoveryDelete *msg =
+       new MOSDPGRecoveryDelete(get_parent()->whoami_shard(),
+                                target_pg,
+                                get_osdmap()->get_epoch(),
+                                min_epoch);
+      msg->set_priority(prio);
+
+      while (it != objects.end() &&
+            cost < cct->_conf->osd_max_push_cost &&
+            deletes < cct->_conf->osd_max_push_objects) {
+       dout(20) << __func__ << ": sending recovery delete << " << it->first
+                << " " << it->second << " to osd." << shard << dendl;
+       msg->objects.push_back(*it);
+       cost += cct->_conf->osd_push_per_object_cost;
+       ++deletes;
+       ++it;
+      }
+
+      msg->set_cost(cost);
+      get_parent()->send_message_osd_cluster(msg, con);
+    }
+  }
+}
+
+bool PGBackend::handle_message(OpRequestRef op)
+{
+  switch (op->get_req()->get_type()) {
+  case MSG_OSD_PG_RECOVERY_DELETE:
+    handle_recovery_delete(op);
+    return true;
+
+  case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+    handle_recovery_delete_reply(op);
+    return true;
+
+  default:
+    break;
+  }
+
+  return _handle_message(op);
+}
+
+void PGBackend::handle_recovery_delete(OpRequestRef op)
+{
+  const MOSDPGRecoveryDelete *m = static_cast<const MOSDPGRecoveryDelete *>(op->get_req());
+  assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE);
+  dout(20) << __func__ << " " << op << dendl;
+
+  op->mark_started();
+
+  C_GatherBuilder gather(cct);
+  for (const auto &p : m->objects) {
+    get_parent()->remove_missing_object(p.first, p.second, gather.new_sub());
+  }
+
+  MOSDPGRecoveryDeleteReply *reply = new MOSDPGRecoveryDeleteReply;
+  reply->from = get_parent()->whoami_shard();
+  reply->set_priority(m->get_priority());
+  reply->pgid = spg_t(get_parent()->get_info().pgid.pgid, m->from.shard);
+  reply->map_epoch = m->map_epoch;
+  reply->min_epoch = m->min_epoch;
+  reply->objects = m->objects;
+  ConnectionRef conn = m->get_connection();
+
+  gather.set_finisher(new FunctionContext(
+    [=](int r) {
+      if (r != -EAGAIN) {
+       get_parent()->send_message_osd_cluster(reply, conn.get());
+      } else {
+       reply->put();
+      }
+    }));
+  gather.activate();
+}
+
+void PGBackend::handle_recovery_delete_reply(OpRequestRef op)
+{
+  const MOSDPGRecoveryDeleteReply *m = static_cast<const MOSDPGRecoveryDeleteReply *>(op->get_req());
+  assert(m->get_type() == MSG_OSD_PG_RECOVERY_DELETE_REPLY);
+  dout(20) << __func__ << " " << op << dendl;
+
+  for (const auto &p : m->objects) {
+    ObjectRecoveryInfo recovery_info;
+    hobject_t oid = p.first;
+    recovery_info.version = p.second;
+    get_parent()->on_peer_recover(m->from, oid, recovery_info);
+    bool peers_recovered = true;
+    for (const auto& shard : get_parent()->get_actingbackfill_shards()) {
+      if (shard == get_parent()->whoami_shard())
+       continue;
+      if (get_parent()->get_shard_missing(shard).is_missing(oid)) {
+       dout(20) << __func__ << " " << oid << " still missing on at least "
+                << shard << dendl;
+       peers_recovered = false;
+       break;
+      }
+    }
+    if (peers_recovered && !get_parent()->get_local_missing().is_missing(oid)) {
+      dout(20) << __func__ << " completed recovery, local_missing = "
+              << get_parent()->get_local_missing() << dendl;
+      object_stat_sum_t stat_diff;
+      stat_diff.num_objects_recovered = 1;
+      get_parent()->on_global_recover(p.first, stat_diff, true);
+    }
+  }
+}
+
 void PGBackend::rollback(
   const pg_log_entry_t &entry,
   ObjectStore::Transaction *t)
@@ -571,6 +713,9 @@ bool PGBackend::be_compare_scrub_objects(
   for (map<string,bufferptr>::const_iterator i = auth.attrs.begin();
        i != auth.attrs.end();
        ++i) {
+    // We check system keys seperately
+    if (i->first == OI_ATTR || i->first == SS_ATTR)
+      continue;
     if (!candidate.attrs.count(i->first)) {
       if (error != CLEAN)
         errorstream << ", ";
@@ -588,6 +733,9 @@ bool PGBackend::be_compare_scrub_objects(
   for (map<string,bufferptr>::const_iterator i = candidate.attrs.begin();
        i != candidate.attrs.end();
        ++i) {
+    // We check system keys seperately
+    if (i->first == OI_ATTR || i->first == SS_ATTR)
+      continue;
     if (!auth.attrs.count(i->first)) {
       if (error != CLEAN)
         errorstream << ", ";
@@ -618,12 +766,23 @@ map<pg_shard_t, ScrubMap *>::const_iterator
   inconsistent_obj_wrapper &object_error)
 {
   eversion_t auth_version;
-  bufferlist auth_bl;
+  bufferlist first_bl;
 
-  map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+  // Create list of shards with primary last so it will be auth copy all
+  // other things being equal.
+  list<pg_shard_t> shards;
   for (map<pg_shard_t, ScrubMap *>::const_iterator j = maps.begin();
        j != maps.end();
        ++j) {
+    if (j->first == get_parent()->whoami_shard())
+      continue;
+    shards.push_back(j->first);
+  }
+  shards.push_back(get_parent()->whoami_shard());
+
+  map<pg_shard_t, ScrubMap *>::const_iterator auth = maps.end();
+  for (auto &l : shards) {
+    map<pg_shard_t, ScrubMap *>::const_iterator j = maps.find(l);
     map<hobject_t, ScrubMap::object>::iterator i =
       j->second->objects.find(obj);
     if (i == j->second->objects.end()) {
@@ -631,6 +790,8 @@ map<pg_shard_t, ScrubMap *>::const_iterator
     }
     string error_string;
     auto& shard_info = shard_map[j->first];
+    if (j->first == get_parent()->whoami_shard())
+      shard_info.primary = true;
     if (i->second.read_error) {
       shard_info.set_read_error();
       error_string += " read_error";
@@ -647,6 +808,8 @@ map<pg_shard_t, ScrubMap *>::const_iterator
     object_info_t oi;
     bufferlist bl;
     map<string, bufferptr>::iterator k;
+    SnapSet ss;
+    bufferlist ss_bl;
 
     if (i->second.stat_error) {
       shard_info.set_stat_error();
@@ -656,6 +819,25 @@ map<pg_shard_t, ScrubMap *>::const_iterator
       goto out;
     }
 
+    // We won't pick an auth copy if the snapset is missing or won't decode.
+    if (obj.is_head() || obj.is_snapdir()) {
+      k = i->second.attrs.find(SS_ATTR);
+      if (k == i->second.attrs.end()) {
+       shard_info.set_ss_attr_missing();
+       error_string += " ss_attr_missing";
+      } else {
+        ss_bl.push_back(k->second);
+        try {
+         bufferlist::iterator bliter = ss_bl.begin();
+         ::decode(ss, bliter);
+        } catch (...) {
+         // invalid snapset, probably corrupt
+         shard_info.set_ss_attr_corrupted();
+         error_string += " ss_attr_corrupted";
+        }
+      }
+    }
+
     k = i->second.attrs.find(OI_ATTR);
     if (k == i->second.attrs.end()) {
       // no object info on object, probably corrupt
@@ -674,16 +856,25 @@ map<pg_shard_t, ScrubMap *>::const_iterator
       goto out;
     }
 
-    if (auth_version != eversion_t()) {
-      if (!object_error.has_object_info_inconsistency() && !(bl == auth_bl)) {
-       object_error.set_object_info_inconsistency();
-       error_string += " object_info_inconsistency";
-      }
+    // This is automatically corrected in PG::_repair_oinfo_oid()
+    assert(oi.soid == obj);
+
+    if (first_bl.length() == 0) {
+      first_bl.append(bl);
+    } else if (!object_error.has_object_info_inconsistency() && !bl.contents_equal(first_bl)) {
+      object_error.set_object_info_inconsistency();
+      error_string += " object_info_inconsistency";
     }
 
-    // Don't use this particular shard because it won't be able to repair data
-    // XXX: For now we can't pick one shard for repair and another's object info
-    if (i->second.read_error || i->second.ec_hash_mismatch || i->second.ec_size_mismatch)
+    if (i->second.size != be_get_ondisk_size(oi.size)) {
+      dout(5) << __func__ << " size " << i->second.size << " oi size " << oi.size << dendl;
+      shard_info.set_obj_size_oi_mismatch();
+      error_string += " obj_size_oi_mismatch";
+    }
+
+    // Don't use this particular shard due to previous errors
+    // XXX: For now we can't pick one shard for repair and another's object info or snapset
+    if (shard_info.errors)
       goto out;
 
     if (auth_version == eversion_t() || oi.version > auth_version ||
@@ -691,8 +882,6 @@ map<pg_shard_t, ScrubMap *>::const_iterator
       auth = j;
       *auth_oi = oi;
       auth_version = oi.version;
-      auth_bl.clear();
-      auth_bl.append(bl);
     }
 
 out:
@@ -750,9 +939,11 @@ void PGBackend::be_compare_scrubmaps(
       be_select_auth_object(*k, maps, &auth_oi, shard_map, object_error);
 
     list<pg_shard_t> auth_list;
+    set<pg_shard_t> object_errors;
     if (auth == maps.end()) {
       object_error.set_version(0);
-      object_error.set_auth_missing(*k, maps, shard_map, shallow_errors, deep_errors);
+      object_error.set_auth_missing(*k, maps, shard_map, shallow_errors,
+       deep_errors, get_parent()->whoami_shard());
       if (object_error.has_deep_errors())
        ++deep_errors;
       else if (object_error.has_shallow_errors())
@@ -793,6 +984,10 @@ void PGBackend::be_compare_scrubmaps(
          if (found)
            errorstream << pgid << " shard " << j->first << ": soid " << *k
                      << " " << ss.str() << "\n";
+       } else if (found) {
+         // Track possible shard to use as authoritative, if needed
+         // There are errors, without identifying the shard
+         object_errors.insert(j->first);
        } else {
          // XXX: The auth shard might get here that we don't know
          // that it has the "correct" data.
@@ -801,6 +996,7 @@ void PGBackend::be_compare_scrubmaps(
       } else {
        cur_missing.insert(j->first);
        shard_map[j->first].set_missing();
+        shard_map[j->first].primary = (j->first == get_parent()->whoami_shard());
        // Can't have any other errors if there is no information available
        ++shallow_errors;
        errorstream << pgid << " shard " << j->first << " missing " << *k
@@ -810,10 +1006,25 @@ void PGBackend::be_compare_scrubmaps(
     }
 
     if (auth_list.empty()) {
-      errorstream << pgid.pgid << " soid " << *k
+      if (object_errors.empty()) {
+        errorstream << pgid.pgid << " soid " << *k
                  << ": failed to pick suitable auth object\n";
-      goto out;
+        goto out;
+      }
+      // Object errors exist and nothing in auth_list
+      // Prefer the auth shard otherwise take first from list.
+      pg_shard_t shard;
+      if (object_errors.count(auth->first)) {
+       shard = auth->first;
+      } else {
+       shard = *(object_errors.begin());
+      }
+      auth_list.push_back(shard);
+      object_errors.erase(shard);
     }
+    // At this point auth_list is populated, so we add the object errors shards
+    // as inconsistent.
+    cur_inconsistent.insert(object_errors.begin(), object_errors.end());
     if (!cur_missing.empty()) {
       missing[*k] = cur_missing;
     }