update sources to v12.1.3

[ceph.git] / ceph / src / osd / PG.cc
diff --git a/ceph/src/osd/PG.cc b/ceph/src/osd/PG.cc

index e753310534bd86bac4155092a9de1d1f30451655..e0707efe2d581bf8ce1ca81ff5c3ab2391e8c01c 100644 (file)
--- a/ceph/src/osd/PG.cc
+++ b/ceph/src/osd/PG.cc
@@ -55,6 +55,8 @@
  #include "messages/MOSDSubOpReply.h"
  #include "messages/MOSDRepOpReply.h"
  #include "messages/MOSDRepScrubMap.h"
+#include "messages/MOSDPGRecoveryDelete.h"
+#include "messages/MOSDPGRecoveryDeleteReply.h"
  
  #include "common/BackTrace.h"
  #include "common/EventTrace.h"
@@ -223,6 +225,7 @@ void PG::dump_live_ids()
  }
  #endif
  
+
  void PGPool::update(OSDMapRef map)
  {
    const pg_pool_t *pi = map->get_pg_pool(id);
@@ -294,7 +297,8 @@ PG::PG(OSDService *o, OSDMapRef curmap,
    dirty_info(false), dirty_big_info(false),
    info(p),
    info_struct_v(0),
-  coll(p), pg_log(cct),
+  coll(p),
+  pg_log(cct),
    pgmeta_oid(p.make_pgmeta_oid()),
    missing_loc(this),
    past_intervals(
@@ -572,9 +576,13 @@ bool PG::search_for_missing(
  bool PG::MissingLoc::readable_with_acting(
    const hobject_t &hoid,
    const set<pg_shard_t> &acting) const {
-  if (!needs_recovery(hoid)) return true;
+  if (!needs_recovery(hoid))
+    return true;
+  if (is_deleted(hoid))
+    return false;
    auto missing_loc_entry = missing_loc.find(hoid);
-  if (missing_loc_entry == missing_loc.end()) return false;
+  if (missing_loc_entry == missing_loc.end())
+    return false;
    const set<pg_shard_t> &locs = missing_loc_entry->second;
    ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
    set<pg_shard_t> have_acting;
@@ -600,6 +608,8 @@ void PG::MissingLoc::add_batch_sources_info(
        handle->reset_tp_timeout();
        loop = 0;
      }
+    if (i->second.is_delete())
+      continue;
      missing_loc[i->first].insert(sources.begin(), sources.end());
      missing_loc_sources.insert(sources.begin(), sources.end());
    }
@@ -623,6 +633,12 @@ bool PG::MissingLoc::add_source_info(
        handle->reset_tp_timeout();
        loop = 0;
      }
+    if (p->second.is_delete()) {
+      ldout(pg->cct, 10) << __func__ << " " << soid
+                        << " delete, ignoring source" << dendl;
+      found_missing = true;
+      continue;
+    }
      if (oinfo.last_update < need) {
        ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
                          << " also missing on osd." << fromosd
@@ -994,7 +1010,7 @@ void PG::clear_primary_state()
  PG::Scrubber::Scrubber()
   : reserved(false), reserve_failed(false),
     epoch_start(0),
-   active(false), queue_snap_trim(false),
+   active(false),
     waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
     must_scrub(false), must_deep_scrub(false), must_repair(false),
     auto_repair(false),
@@ -1628,6 +1644,7 @@ void PG::activate(ObjectStore::Transaction& t,
        dout(10) << "activate peer osd." << peer << " " << pi << dendl;
  
        MOSDPGLog *m = 0;
+      assert(peer_missing.count(peer));
        pg_missing_t& pm = peer_missing[peer];
  
        bool needs_past_intervals = pi.dne();
@@ -1677,7 +1694,7 @@ void PG::activate(ObjectStore::Transaction& t,
          * behind.
          */
         // backfill
-       osd->clog->info() << info.pgid << " starting backfill to osd." << peer
+       osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
                          << " from (" << pi.log_tail << "," << pi.last_update
                           << "] " << pi.last_backfill
                          << " to " << info.last_update;
@@ -1724,12 +1741,18 @@ void PG::activate(ObjectStore::Transaction& t,
        if (m && pi.last_backfill != hobject_t()) {
          for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
               p != m->log.log.end();
-             ++p)
+             ++p) {
           if (p->soid <= pi.last_backfill &&
-             !p->is_error())
-           pm.add_next_event(*p);
+             !p->is_error()) {
+           if (perform_deletes_during_peering() && p->is_delete()) {
+             pm.rm(p->soid, p->version);
+           } else {
+             pm.add_next_event(*p);
+           }
+         }
+       }
        }
-      
+
        if (m) {
         dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
         //m->log.print(cout);
@@ -1753,6 +1776,7 @@ void PG::activate(ObjectStore::Transaction& t,
      for (set<pg_shard_t>::iterator i = actingbackfill.begin();
          i != actingbackfill.end();
          ++i) {
+      dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
        if (*i == get_primary()) {
         missing_loc.add_active_missing(missing);
          if (!missing.have_missing())
@@ -1856,8 +1880,10 @@ bool PG::op_has_sufficient_caps(OpRequestRef& op)
                              op->need_write_cap(),
                              op->classes());
  
-  dout(20) << "op_has_sufficient_caps pool=" << pool.id << " (" << pool.name
-                  << " " << req->get_hobj().nspace
+  dout(20) << "op_has_sufficient_caps "
+           << "session=" << session
+           << " pool=" << pool.id << " (" << pool.name
+           << " " << req->get_hobj().nspace
            << ") owner=" << pool.auid
            << " need_read_cap=" << op->need_read_cap()
            << " need_write_cap=" << op->need_write_cap()
@@ -1949,7 +1975,7 @@ bool PG::requeue_scrub(bool high_priority)
    }
  }
  
-void PG::queue_recovery(bool front)
+void PG::queue_recovery()
  {
    if (!is_primary() || !is_peered()) {
      dout(10) << "queue_recovery -- not primary or not peered " << dendl;
@@ -1959,7 +1985,7 @@ void PG::queue_recovery(bool front)
    } else {
      dout(10) << "queue_recovery -- queuing" << dendl;
      recovery_queued = true;
-    osd->queue_for_recovery(this, front);
+    osd->queue_for_recovery(this);
    }
  }
  
@@ -2003,80 +2029,88 @@ struct C_PG_FinishRecovery : public Context {
  
  void PG::mark_clean()
  {
-  // only mark CLEAN if we have the desired number of replicas AND we
-  // are not remapped.
-  if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid) &&
-      up == acting)
+  if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
+    state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
      state_set(PG_STATE_CLEAN);
-
-  // NOTE: this is actually a bit premature: we haven't purged the
-  // strays yet.
-  info.history.last_epoch_clean = get_osdmap()->get_epoch();
-  info.history.last_interval_clean = info.history.same_interval_since;
-
-  past_intervals.clear();
-  dirty_big_info = true;
-
-  if (is_active()) {
-    /* The check is needed because if we are below min_size we're not
-     * actually active */
-    kick_snap_trim();
+    info.history.last_epoch_clean = get_osdmap()->get_epoch();
+    info.history.last_interval_clean = info.history.same_interval_since;
+    past_intervals.clear();
+    dirty_big_info = true;
+    dirty_info = true;
    }
  
-  dirty_info = true;
+  kick_snap_trim();
  }
  
-unsigned PG::get_recovery_priority()
+void PG::_change_recovery_force_mode(int new_mode, bool clear)
  {
-  // a higher value -> a higher priority
-
-  int pool_recovery_priority = 0;
-  pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
+  if (!deleting) {
+    // we can't and shouldn't do anything if the PG is being deleted locally
+    if (clear) {
+      state_clear(new_mode);
+    } else {
+      state_set(new_mode);
+    }
+    publish_stats_to_osd();
+  }
+}
  
-  int ret = OSD_RECOVERY_PRIORITY_BASE + pool_recovery_priority;
+inline int PG::clamp_recovery_priority(int priority)
+{
+  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
+  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
  
    // Clamp to valid range
-  if (ret > OSD_RECOVERY_PRIORITY_MAX) {
-    ret = OSD_RECOVERY_PRIORITY_MAX;
-  } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
-    ret = OSD_RECOVERY_PRIORITY_MIN;
+  if (priority > OSD_RECOVERY_PRIORITY_MAX) {
+    return OSD_RECOVERY_PRIORITY_MAX;
+  } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
+    return OSD_RECOVERY_PRIORITY_MIN;
+  } else {
+    return priority;
    }
+}
  
-  static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
-  static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
+unsigned PG::get_recovery_priority()
+{
+  // a higher value -> a higher priority
+  int ret = 0;
  
+  if (state & PG_STATE_FORCED_RECOVERY) {
+    ret = OSD_RECOVERY_PRIORITY_FORCED;
+  } else {
+    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
+    ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
+  }
+  dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
    return static_cast<unsigned>(ret);
  }
  
  unsigned PG::get_backfill_priority()
  {
    // a higher value -> a higher priority
-
    int ret = OSD_BACKFILL_PRIORITY_BASE;
-  if (acting.size() < pool.info.min_size) {
-    // inactive: no. of replicas < min_size, highest priority since it blocks IO
-    ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
+  if (state & PG_STATE_FORCED_BACKFILL) {
+    ret = OSD_RECOVERY_PRIORITY_FORCED;
+  } else {
+    if (acting.size() < pool.info.min_size) {
+      // inactive: no. of replicas < min_size, highest priority since it blocks IO
+      ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
  
-  } else if (is_undersized()) {
-    // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
-    assert(pool.info.size > actingset.size());
-    ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
+    } else if (is_undersized()) {
+      // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
+      assert(pool.info.size > actingset.size());
+      ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
  
-  } else if (is_degraded()) {
-    // degraded: baseline degraded
-    ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
-  }
+    } else if (is_degraded()) {
+      // degraded: baseline degraded
+      ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
+    }
  
-  // Adjust with pool's recovery priority
-  int pool_recovery_priority = 0;
-  pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
-  ret += pool_recovery_priority;
+    // Adjust with pool's recovery priority
+    int pool_recovery_priority = 0;
+    pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
  
-  // Clamp to valid range
-  if (ret > OSD_RECOVERY_PRIORITY_MAX) {
-    ret = OSD_RECOVERY_PRIORITY_MAX;
-  } else if (ret < OSD_RECOVERY_PRIORITY_MIN) {
-    ret = OSD_RECOVERY_PRIORITY_MIN;
+    ret = clamp_recovery_priority(pool_recovery_priority + ret);
    }
  
    return static_cast<unsigned>(ret);
@@ -2196,6 +2230,10 @@ void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
      // in the future).
      info.set_last_backfill(hobject_t());
      child->info.set_last_backfill(hobject_t());
+    // restarting backfill implies that the missing set is empty,
+    // since it is only used for objects prior to last_backfill
+    pg_log.reset_backfill();
+    child->pg_log.reset_backfill();
    }
  
    child->info.stats = info.stats;
@@ -3155,7 +3193,7 @@ void PG::append_log(
    auto last = logv.rbegin();
    if (is_primary() && last != logv.rend()) {
      projected_log.skip_can_rollback_to_to_head();
-    projected_log.trim(cct, last->version, nullptr);
+    projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
    }
  
    if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
@@ -3277,6 +3315,17 @@ void PG::read_state(ObjectStore *store, bufferlist &bl)
  
    last_written_info = info;
  
+  // if we are upgrading from jewel, we need to force rebuild of
+  // missing set.  v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
+  // (before kraken).  persisted missing set was circa
+  // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
+  // v8 was pre-jewel (per-pg meta object).
+  bool force_rebuild_missing = info_struct_v < 9;
+  if (force_rebuild_missing) {
+    dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
+            << dendl;
+  }
+
    ostringstream oss;
    pg_log.read_log_and_missing(
      store,
@@ -3284,12 +3333,19 @@ void PG::read_state(ObjectStore *store, bufferlist &bl)
      info_struct_v < 8 ? coll_t::meta() : coll,
      ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
      info,
+    force_rebuild_missing,
      oss,
      cct->_conf->osd_ignore_stale_divergent_priors,
      cct->_conf->osd_debug_verify_missing_on_start);
    if (oss.tellp())
      osd->clog->error() << oss.rdbuf();
  
+  if (force_rebuild_missing) {
+    dout(10) << __func__ << " forced rebuild of missing got "
+            << pg_log.get_missing()
+            << dendl;
+  }
+
    // log any weirdness
    log_weirdness();
  }
@@ -3309,8 +3365,8 @@ void PG::log_weirdness()
      // sloppy check
      if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
        osd->clog->error() << info.pgid
-                       << " log bound mismatch, info (" << pg_log.get_tail() << ","
-                       << pg_log.get_head() << "]"
+                       << " log bound mismatch, info (tail,head] ("
+                       << pg_log.get_tail() << "," << pg_log.get_head() << "]"
                         << " actual ["
                         << pg_log.get_log().log.begin()->version << ","
                          << pg_log.get_log().log.rbegin()->version << "]";
@@ -3592,7 +3648,7 @@ bool PG::sched_scrub()
         } else {
           osd->clog->error() << "osd." << osd->whoami
                              << " pg " << info.pgid
-                            << " Regular scrub request, losing deep-scrub details";
+                            << " Regular scrub request, deep-scrub details will be lost";
         }
        }
        queue_scrub();
@@ -4034,6 +4090,52 @@ void PG::_scan_snaps(ScrubMap &smap)
    }
  }
  
+void PG::_repair_oinfo_oid(ScrubMap &smap)
+{
+  for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
+       i != smap.objects.rend();
+       ++i) {
+    const hobject_t &hoid = i->first;
+    ScrubMap::object &o = i->second;
+
+    bufferlist bl;
+    if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
+      continue;
+    }
+    bl.push_back(o.attrs[OI_ATTR]);
+    object_info_t oi;
+    try {
+      oi.decode(bl);
+    } catch(...) {
+      continue;
+    }
+    if (oi.soid != hoid) {
+      ObjectStore::Transaction t;
+      OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
+      osd->clog->error() << "osd." << osd->whoami
+                           << " found object info error on pg "
+                           << info.pgid
+                           << " oid " << hoid << " oid in object info: "
+                           << oi.soid
+                           << "...repaired";
+      // Fix object info
+      oi.soid = hoid;
+      bl.clear();
+      ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
+
+      bufferptr bp(bl.c_str(), bl.length());
+      o.attrs[OI_ATTR] = bp;
+
+      t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
+      int r = osd->store->apply_transaction(osr.get(), std::move(t));
+      if (r != 0) {
+       derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
+            << dendl;
+      }
+    }
+  }
+}
+
  /*
   * build a scrub map over a chunk without releasing the lock
   * only used by chunky scrub
@@ -4066,6 +4168,7 @@ int PG::build_scrub_map_chunk(
    get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
    _scan_rollback_obs(rollback_obs, handle);
    _scan_snaps(map);
+  _repair_oinfo_oid(map);
  
    dout(20) << __func__ << " done" << dendl;
    return 0;
@@ -4100,9 +4203,16 @@ void PG::repair_object(
    eversion_t v;
    bufferlist bv;
    bv.push_back(po.attrs[OI_ATTR]);
-  object_info_t oi(bv);
+  object_info_t oi;
+  try {
+    bufferlist::iterator bliter = bv.begin();
+    ::decode(oi, bliter);
+  } catch (...) {
+    dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
+    assert(0);
+  }
    if (bad_peer != primary) {
-    peer_missing[bad_peer].add(soid, oi.version, eversion_t());
+    peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
    } else {
      // We should only be scrubbing if the PG is clean.
      assert(waiting_for_unreadable_object.empty());
@@ -4405,7 +4515,7 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
           const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
           stringstream oss;
           oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
-         osd->clog->info(oss);
+         osd->clog->debug(oss);
         }
  
         scrubber.seed = -1;
@@ -4611,6 +4721,11 @@ void PG::chunky_scrub(ThreadPool::TPHandle &handle)
          scrubber.state = PG::Scrubber::INACTIVE;
          done = true;
  
+       if (!snap_trimq.empty()) {
+         dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
+         snap_trimmer_scrub_complete();
+       }
+
          break;
  
        default:
@@ -4635,11 +4750,6 @@ void PG::scrub_clear_state()
  
    requeue_ops(waiting_for_scrub);
  
-  if (scrubber.queue_snap_trim) {
-    dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
-    snap_trimmer_scrub_complete();
-  }
-
    scrubber.reset();
  
    // type-specific state clear
@@ -4844,7 +4954,7 @@ void PG::scrub_finish()
      if (total_errors)
        osd->clog->error(oss);
      else
-      osd->clog->info(oss);
+      osd->clog->debug(oss);
    }
  
    // finish up
@@ -4982,6 +5092,7 @@ void PG::merge_new_log_entries(
      assert(peer_missing.count(peer));
      assert(peer_info.count(peer));
      pg_missing_t& pmissing(peer_missing[peer]);
+    dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
      pg_info_t& pinfo(peer_info[peer]);
      bool invalidate_stats = PGLog::append_log_entries_update_missing(
        pinfo.last_backfill,
@@ -5646,6 +5757,11 @@ bool PG::can_discard_request(OpRequestRef& op)
      return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
    case MSG_OSD_REPOPREPLY:
      return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
+  case MSG_OSD_PG_RECOVERY_DELETE:
+    return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
+
+  case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
+    return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
  
    case MSG_OSD_EC_WRITE:
      return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
@@ -6266,6 +6382,37 @@ PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
    pg->publish_stats_to_osd();
  }
  
+boost::statechart::result
+PG::RecoveryState::Backfilling::react(const CancelBackfill &)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  // XXX: Add a new pg state so user can see why backfill isn't proceeding
+  // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
+  //pg->state_set(PG_STATE_BACKFILL_STALLED????);
+
+  for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
+       it != pg->backfill_targets.end();
+       ++it) {
+    assert(*it != pg->pg_whoami);
+    ConnectionRef con = pg->osd->get_con_osd_cluster(
+      it->osd, pg->get_osdmap()->get_epoch());
+    if (con) {
+      pg->osd->send_message_osd_cluster(
+        new MBackfillReserve(
+         MBackfillReserve::REJECT,
+         spg_t(pg->info.pgid.pgid, it->shard),
+         pg->get_osdmap()->get_epoch()),
+       con.get());
+    }
+  }
+
+  pg->waiting_on_backfill.clear();
+
+  pg->schedule_backfill_full_retry();
+  return transit<NotBackfilling>();
+}
+
  boost::statechart::result
  PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
  {
@@ -6303,6 +6450,7 @@ void PG::RecoveryState::Backfilling::exit()
    pg->backfill_reserved = false;
    pg->backfill_reserving = false;
    pg->state_clear(PG_STATE_BACKFILL);
+  pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
    utime_t dur = ceph_clock_now() - enter_time;
    pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
  }
@@ -6736,10 +6884,10 @@ PG::RecoveryState::Recovering::Recovering(my_context ctx)
    pg->queue_recovery();
  }
  
-void PG::RecoveryState::Recovering::release_reservations()
+void PG::RecoveryState::Recovering::release_reservations(bool cancel)
  {
    PG *pg = context< RecoveryMachine >().pg;
-  assert(!pg->pg_log.get_missing().have_missing());
+  assert(cancel || !pg->pg_log.get_missing().have_missing());
  
    // release remote reservations
    for (set<pg_shard_t>::const_iterator i =
@@ -6766,6 +6914,7 @@ PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
  {
    PG *pg = context< RecoveryMachine >().pg;
    pg->state_clear(PG_STATE_RECOVERING);
+  pg->state_clear(PG_STATE_FORCED_RECOVERY);
    release_reservations();
    return transit<Recovered>();
  }
@@ -6775,10 +6924,22 @@ PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
  {
    PG *pg = context< RecoveryMachine >().pg;
    pg->state_clear(PG_STATE_RECOVERING);
+  pg->state_clear(PG_STATE_FORCED_RECOVERY);
    release_reservations();
    return transit<WaitRemoteBackfillReserved>();
  }
  
+boost::statechart::result
+PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
+{
+  PG *pg = context< RecoveryMachine >().pg;
+  pg->state_clear(PG_STATE_RECOVERING);
+  pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
+  release_reservations(true);
+  pg->schedule_recovery_full_retry();
+  return transit<NotRecovering>();
+}
+
  void PG::RecoveryState::Recovering::exit()
  {
    context< RecoveryMachine >().log_exit(state_name, enter_time);
@@ -6806,6 +6967,7 @@ PG::RecoveryState::Recovered::Recovered(my_context ctx)
    if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
        pg->actingbackfill.size()) {
      pg->state_clear(PG_STATE_DEGRADED);
+    pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
      pg->publish_stats_to_osd();
    }
  
@@ -6842,11 +7004,14 @@ PG::RecoveryState::Clean::Clean(my_context ctx)
      ceph_abort();
    }
    pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
-  pg->mark_clean();
+
+  if (pg->is_active()) {
+    pg->mark_clean();
+  }
  
    pg->share_pg_info();
    pg->publish_stats_to_osd();
-
+  pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
  }
  
  void PG::RecoveryState::Clean::exit()
@@ -6990,9 +7155,12 @@ boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
        pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
      if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
        pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
-                           << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED";
+                           << " objects unfound and apparently lost, would automatically "
+                           << "mark these objects lost but this feature is not yet implemented "
+                           << "(osd_auto_mark_unfound_lost)";
      } else
-      pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound << " objects unfound and apparently lost";
+      pg->osd->clog->error() << pg->info.pgid.pgid << " has "
+                             << unfound << " objects unfound and apparently lost";
    }
  
    if (pg->is_active()) {
@@ -7859,18 +8027,23 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
         ++i) {
      if (*i == pg->get_primary()) continue;
      const pg_info_t& pi = pg->peer_info[*i];
+    // reset this to make sure the pg_missing_t is initialized and
+    // has the correct semantics even if we don't need to get a
+    // missing set from a shard. This way later additions due to
+    // lost+unfound delete work properly.
+    pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
  
      if (pi.is_empty())
        continue;                                // no pg data, nothing divergent
  
      if (pi.last_update < pg->pg_log.get_tail()) {
        ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
-      pg->peer_missing[*i];
+      pg->peer_missing[*i].clear();
        continue;
      }
      if (pi.last_backfill == hobject_t()) {
        ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
-      pg->peer_missing[*i];
+      pg->peer_missing[*i].clear();
        continue;
      }
  
@@ -7881,7 +8054,7 @@ PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
        // FIXME: we can do better here.  if last_update==last_complete we
        //        can infer the rest!
        ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
-      pg->peer_missing[*i];
+      pg->peer_missing[*i].clear();
        continue;
      }