update sources to ceph Nautilus 14.2.1
diff --git a/ceph/src/osd/PG.h b/ceph/src/osd/PG.h
index dc64cedb15e58af0d2e23b067178426c3074412a..790c004bed753e514adf6b807070a86f0c21a20a 100644
--- a/ceph/src/osd/PG.h
+++ b/ceph/src/osd/PG.h
 #include <boost/scoped_ptr.hpp>
 #include <boost/circular_buffer.hpp>
 #include <boost/container/flat_set.hpp>
-#include "include/memory.h"
 #include "include/mempool.h"
 
 // re-include our assert to clobber boost's
-#include "include/assert.h" 
+#include "include/ceph_assert.h" 
 
 #include "include/types.h"
 #include "include/stringify.h"
@@ -44,6 +43,9 @@
 #include "messages/MOSDPGLog.h"
 #include "include/str_list.h"
 #include "PGBackend.h"
+#include "PGPeeringEvent.h"
+
+#include "mgr/OSDPerfMetricTypes.h"
 
 #include <atomic>
 #include <list>
 #include <stack>
 #include <string>
 #include <tuple>
-using namespace std;
-
-// #include "include/unordered_map.h"
-// #include "include/unordered_set.h"
 
 //#define DEBUG_RECOVERY_OIDS   // track set of recovering oids explicitly, to find counting bugs
+//#define PG_DEBUG_REFS    // track provenance of pg refs, helpful for finding leaks
 
 class OSD;
 class OSDService;
+class OSDShard;
+class OSDShardPGSlot;
 class MOSDOp;
 class MOSDPGScan;
 class MOSDPGBackfill;
@@ -70,14 +71,12 @@ struct OpRequest;
 typedef OpRequest::Ref OpRequestRef;
 class MOSDPGLog;
 class CephContext;
+class DynamicPerfStats;
 
 namespace Scrub {
   class Store;
 }
 
-void intrusive_ptr_add_ref(PG *pg);
-void intrusive_ptr_release(PG *pg);
-
 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
 using embedded_state = std::pair<utime_t, const char*>;
 
@@ -163,11 +162,11 @@ class PGRecoveryStats {
   PGRecoveryStats() : lock("PGRecoverStats::lock") {}
 
   void reset() {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     info.clear();
   }
   void dump(ostream& out) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
       per_state_info& i = p->second;
       out << i.enter << "\t" << i.exit << "\t"
@@ -179,7 +178,7 @@ class PGRecoveryStats {
   }
 
   void dump_formatted(Formatter *f) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     f->open_array_section("pg_recovery_stats");
     for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
@@ -206,11 +205,11 @@ class PGRecoveryStats {
   }
 
   void log_enter(const char *s) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     info[s].enter++;
   }
   void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     per_state_info &i = info[s];
     i.exit++;
     i.total_time += dur;
@@ -228,28 +227,28 @@ struct PGPool {
   epoch_t cached_epoch;
   int64_t id;
   string name;
-  uint64_t auid;
 
   pg_pool_t info;      
   SnapContext snapc;   // the default pool snapc, ready to go.
 
+  // these two sets are for < mimic only
   interval_set<snapid_t> cached_removed_snaps;      // current removed_snaps set
   interval_set<snapid_t> newly_removed_snaps;  // newly removed in the last epoch
 
-  PGPool(CephContext* cct, OSDMapRef map, int64_t i)
+  PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
+        const string& name)
     : cct(cct),
       cached_epoch(map->get_epoch()),
       id(i),
-      name(map->get_pool_name(id)),
-      auid(map->get_pg_pool(id)->auid) {
-    const pg_pool_t *pi = map->get_pg_pool(id);
-    assert(pi);
-    info = *pi;
-    snapc = pi->get_snap_context();
-    pi->build_removed_snaps(cached_removed_snaps);
+      name(name),
+      info(info) {
+    snapc = info.get_snap_context();
+    if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
+      info.build_removed_snaps(cached_removed_snaps);
+    }
   }
 
-  void update(OSDMapRef map);
+  void update(CephContext *cct, OSDMapRef map);
 };
 
 /** PG - Replica Placement Group
@@ -258,130 +257,369 @@ struct PGPool {
 
 class PG : public DoutPrefixProvider {
 public:
+  // -- members --
+  const spg_t pg_id;
+  const coll_t coll;
+
+  ObjectStore::CollectionHandle ch;
+
+  struct RecoveryCtx;
+
+  // -- methods --
+  std::ostream& gen_prefix(std::ostream& out) const override;
+  CephContext *get_cct() const override {
+    return cct;
+  }
+  unsigned get_subsys() const override {
+    return ceph_subsys_osd;
+  }
+
+  const OSDMapRef& get_osdmap() const {
+    ceph_assert(is_locked());
+    ceph_assert(osdmap_ref);
+    return osdmap_ref;
+  }
+  epoch_t get_osdmap_epoch() const {
+    return osdmap_ref->get_epoch();
+  }
+
+  void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
+    handle.suspend_tp_timeout();
+    lock();
+    handle.reset_tp_timeout();
+  }
+  void lock(bool no_lockdep = false) const;
+  void unlock() const {
+    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
+    ceph_assert(!dirty_info);
+    ceph_assert(!dirty_big_info);
+    _lock.Unlock();
+  }
+  bool is_locked() const {
+    return _lock.is_locked();
+  }
+
+  const spg_t& get_pgid() const {
+    return pg_id;
+  }
+
+  const PGPool& get_pool() const {
+    return pool;
+  }
+  uint64_t get_last_user_version() const {
+    return info.last_user_version;
+  }
+  const pg_history_t& get_history() const {
+    return info.history;
+  }
+  bool get_need_up_thru() const {
+    return need_up_thru;
+  }
+  epoch_t get_same_interval_since() const {
+    return info.history.same_interval_since;
+  }
+
+  void set_last_scrub_stamp(utime_t t) {
+    info.stats.last_scrub_stamp = t;
+    info.history.last_scrub_stamp = t;
+  }
+
+  void set_last_deep_scrub_stamp(utime_t t) {
+    info.stats.last_deep_scrub_stamp = t;
+    info.history.last_deep_scrub_stamp = t;
+  }
+
+  bool is_deleting() const {
+    return deleting;
+  }
+  bool is_deleted() const {
+    return deleted;
+  }
+  bool is_replica() const {
+    return role > 0;
+  }
+  bool is_primary() const {
+    return pg_whoami == primary;
+  }
+  bool pg_has_reset_since(epoch_t e) {
+    ceph_assert(is_locked());
+    return deleted || e < get_last_peering_reset();
+  }
+
+  bool is_ec_pg() const {
+    return pool.info.is_erasure();
+  }
+  int get_role() const {
+    return role;
+  }
+  const vector<int> get_acting() const {
+    return acting;
+  }
+  int get_acting_primary() const {
+    return primary.osd;
+  }
+  pg_shard_t get_primary() const {
+    return primary;
+  }
+  const vector<int> get_up() const {
+    return up;
+  }
+  int get_up_primary() const {
+    return up_primary.osd;
+  }
+  const PastIntervals& get_past_intervals() const {
+    return past_intervals;
+  }
+
+  /// initialize created PG
+  void init(
+    int role,
+    const vector<int>& up,
+    int up_primary,
+    const vector<int>& acting,
+    int acting_primary,
+    const pg_history_t& history,
+    const PastIntervals& pim,
+    bool backfill,
+    ObjectStore::Transaction *t);
+
+  /// read existing pg state off disk
+  void read_state(ObjectStore *store);
+  static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
+
+  static int get_latest_struct_v() {
+    return latest_struct_v;
+  }
+  static int get_compat_struct_v() {
+    return compat_struct_v;
+  }
+  static int read_info(
+    ObjectStore *store, spg_t pgid, const coll_t &coll,
+    pg_info_t &info, PastIntervals &past_intervals,
+    __u8 &);
+  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
+
+  void rm_backoff(BackoffRef b);
+
+  void update_snap_mapper_bits(uint32_t bits) {
+    snap_mapper.update_bits(bits);
+  }
+  void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
+  virtual void split_colls(
+    spg_t child,
+    int split_bits,
+    int seed,
+    const pg_pool_t *pool,
+    ObjectStore::Transaction *t) = 0;
+  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
+  void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
+                 unsigned split_bits,
+                 const pg_merge_meta_t& last_pg_merge_meta);
+  void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);
+
+  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+  void reg_next_scrub();
+  void unreg_next_scrub();
+
+  bool is_forced_recovery_or_backfill() const {
+    return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+  }
   bool set_force_recovery(bool b);
   bool set_force_backfill(bool b);
 
+  void queue_peering_event(PGPeeringEventRef evt);
+  void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx);
+  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
+  void queue_flushed(epoch_t started_at);
+  void handle_advance_map(
+    OSDMapRef osdmap, OSDMapRef lastmap,
+    vector<int>& newup, int up_primary,
+    vector<int>& newacting, int acting_primary,
+    RecoveryCtx *rctx);
+  void handle_activate_map(RecoveryCtx *rctx);
+  void handle_initialize(RecoveryCtx *rctx);
+  void handle_query_state(Formatter *f);
+
+  /**
+   * @param ops_begun returns how many recovery ops the function started
+   * @returns true if any useful work was accomplished; false otherwise
+   */
+  virtual bool start_recovery_ops(
+    uint64_t max,
+    ThreadPool::TPHandle &handle,
+    uint64_t *ops_begun) = 0;
+
+  // more work after the above, but with a RecoveryCtx
+  void find_unfound(epoch_t queued, RecoveryCtx *rctx);
+
+  virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
+
+  void dump_pgstate_history(Formatter *f);
+  void dump_missing(Formatter *f);
+
+  void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
+  void with_heartbeat_peers(std::function<void(int)> f);
+
+  void shutdown();
+  virtual void on_shutdown() = 0;
+
+  bool get_must_scrub() const {
+    return scrubber.must_scrub;
+  }
+  bool sched_scrub();
+
+  virtual void do_request(
+    OpRequestRef& op,
+    ThreadPool::TPHandle &handle
+  ) = 0;
+  virtual void clear_cache() = 0;
+  virtual int get_cache_obj_count() = 0;
+
+  virtual void snap_trimmer(epoch_t epoch_queued) = 0;
+  virtual int do_command(
+    cmdmap_t cmdmap,
+    ostream& ss,
+    bufferlist& idata,
+    bufferlist& odata,
+    ConnectionRef conn,
+    ceph_tid_t tid) = 0;
+
+  virtual bool agent_work(int max) = 0;
+  virtual bool agent_work(int max, int agent_flush_quota) = 0;
+  virtual void agent_stop() = 0;
+  virtual void agent_delay() = 0;
+  virtual void agent_clear() = 0;
+  virtual void agent_choose_mode_restart() = 0;
+
+  virtual void on_removal(ObjectStore::Transaction *t) = 0;
+
+  void _delete_some(ObjectStore::Transaction *t);
+
+  virtual void set_dynamic_perf_stats_queries(
+    const std::list<OSDPerfMetricQuery> &queries) {
+  }
+  virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
+  }
+
+  // reference counting
+#ifdef PG_DEBUG_REFS
+  uint64_t get_with_id();
+  void put_with_id(uint64_t);
+  void dump_live_ids();
+#endif
+  void get(const char* tag);
+  void put(const char* tag);
+  int get_num_ref() {
+    return ref;
+  }
+
+  // ctor
+  PG(OSDService *o, OSDMapRef curmap,
+     const PGPool &pool, spg_t p);
+  ~PG() override;
+
+  // prevent copying
+  explicit PG(const PG& rhs) = delete;
+  PG& operator=(const PG& rhs) = delete;
+
 protected:
+  // -------------
+  // protected
   OSDService *osd;
+public:
+  OSDShard *osd_shard = nullptr;
+  OSDShardPGSlot *pg_slot = nullptr;
+protected:
   CephContext *cct;
+
+  // osdmap
+  OSDMapRef osdmap_ref;
+
+  PGPool pool;
+
+  // locking and reference counting.
+  // I destroy myself when the reference count hits zero.
+  // lock() should be called before doing anything.
+  // get() should be called on pointer copy (to another thread, etc.).
+  // put() should be called on destruction of some previously copied pointer.
+  // unlock() when done with the current pointer (_most common_).
+  mutable Mutex _lock = {"PG::_lock"};
+
+  std::atomic<unsigned int> ref{0};
+
+#ifdef PG_DEBUG_REFS
+  Mutex _ref_id_lock = {"PG::_ref_id_lock"};
+  map<uint64_t, string> _live_ids;
+  map<string, uint64_t> _tag_counts;
+  uint64_t _ref_id = 0;
+
+  friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
+  friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
+#endif
+
+private:
+  friend void intrusive_ptr_add_ref(PG *pg) {
+    pg->get("intptr");
+  }
+  friend void intrusive_ptr_release(PG *pg) {
+    pg->put("intptr");
+  }
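
 // Illustrative sketch (not part of the patch): these friends are what let
 // boost::intrusive_ptr<PG> (aka PGRef) drive the tagged refcount, and the
 // usual access pattern pairs a ref with the PG lock:
 //
 //   PGRef pg = get_pg_somehow();  // intrusive_ptr_add_ref -> get("intptr")
 //   pg->lock();
 //   // ...read or mutate PG state; persist dirty_info before unlocking...
 //   pg->unlock();
 //   pg.reset();                   // intrusive_ptr_release -> put("intptr")
 //
 // get_pg_somehow() is a placeholder; lookup normally goes through the OSD.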
+
+
+  // =====================
+
+protected:
   OSDriver osdriver;
   SnapMapper snap_mapper;
   bool eio_errors_to_process = false;
 
   virtual PGBackend *get_pgbackend() = 0;
-public:
-  std::string gen_prefix() const override;
-  CephContext *get_cct() const override { return cct; }
-  unsigned get_subsys() const override { return ceph_subsys_osd; }
+  virtual const PGBackend* get_pgbackend() const = 0;
 
+protected:
   /*** PG ****/
-  void update_snap_mapper_bits(uint32_t bits) {
-    snap_mapper.update_bits(bits);
-  }
   /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
-  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
+  IsPGRecoverablePredicate *get_is_recoverable_predicate() const {
     return get_pgbackend()->get_is_recoverable_predicate();
   }
 protected:
-  OSDMapRef osdmap_ref;
-  OSDMapRef last_persisted_osdmap_ref;
-  PGPool pool;
+  epoch_t last_persisted_osdmap;
 
   void requeue_map_waiters();
 
   void update_osdmap_ref(OSDMapRef newmap) {
-    assert(_lock.is_locked_by_me());
+    ceph_assert(_lock.is_locked_by_me());
     osdmap_ref = std::move(newmap);
   }
 
-public:
-  OSDMapRef get_osdmap() const {
-    assert(is_locked());
-    assert(osdmap_ref);
-    return osdmap_ref;
-  }
 protected:
 
-  /** locking and reference counting.
-   * I destroy myself when the reference count hits zero.
-   * lock() should be called before doing anything.
-   * get() should be called on pointer copy (to another thread, etc.).
-   * put() should be called on destruction of some previously copied pointer.
-   * unlock() when done with the current pointer (_most common_).
-   */  
-  mutable Mutex _lock;
-  std::atomic_uint ref{0};
-
-#ifdef PG_DEBUG_REFS
-  Mutex _ref_id_lock;
-  map<uint64_t, string> _live_ids;
-  map<string, uint64_t> _tag_counts;
-  uint64_t _ref_id;
-#endif
 
-public:
  bool deleting;  // true while the PG is being removed or the OSD is shutting down
+  atomic<bool> deleted = {false};
 
   ZTracer::Endpoint trace_endpoint;
 
-  void lock_suspend_timeout(ThreadPool::TPHandle &handle);
-  void lock(bool no_lockdep = false) const;
-  void unlock() const {
-    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
-    assert(!dirty_info);
-    assert(!dirty_big_info);
-    _lock.Unlock();
-  }
-
-  bool is_locked() const {
-    return _lock.is_locked();
-  }
-
-#ifdef PG_DEBUG_REFS
-  uint64_t get_with_id();
-  void put_with_id(uint64_t);
-  void dump_live_ids();
-#endif
-  void get(const char* tag);
-  void put(const char* tag);
 
+protected:
   bool dirty_info, dirty_big_info;
 
-public:
-  bool is_ec_pg() const {
-    return pool.info.ec_pool();
-  }
+protected:
   // pg state
   pg_info_t info;               ///< current pg info
   pg_info_t last_written_info;  ///< last written info
-  __u8 info_struct_v;
-  static const __u8 cur_struct_v = 10;
+  __u8 info_struct_v = 0;
+  static const __u8 latest_struct_v = 10;
   // v10 is the new past_intervals encoding
   // v9 was fastinfo_key addition
   // v8 was the move to a per-pg pgmeta object
   // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
   // (first appeared in cuttlefish).
-  static const __u8 compat_struct_v = 7;
-  bool must_upgrade() {
-    return info_struct_v < cur_struct_v;
-  }
-  bool can_upgrade() {
-    return info_struct_v >= compat_struct_v;
-  }
+  static const __u8 compat_struct_v = 10;
   void upgrade(ObjectStore *store);
 
-  const coll_t coll;
-  ObjectStore::CollectionHandle ch;
+protected:
   PGLog  pg_log;
-  static string get_info_key(spg_t pgid) {
-    return stringify(pgid) + "_info";
-  }
-  static string get_biginfo_key(spg_t pgid) {
-    return stringify(pgid) + "_biginfo";
-  }
-  static string get_epoch_key(spg_t pgid) {
-    return stringify(pgid) + "_epoch";
-  }
   ghobject_t    pgmeta_oid;
 
   // ------------------
@@ -402,8 +640,8 @@ public:
                   (l.other < r.other)));
       }
       friend ostream& operator<<(ostream& out, const loc_count_t& l) {
-       assert(l.up >= 0);
-       assert(l.other >= 0);
+       ceph_assert(l.up >= 0);
+       ceph_assert(l.other >= 0);
        return out << "(" << l.up << "+" << l.other << ")";
       }
     };
@@ -458,7 +696,7 @@ public:
       pgs_by_shard_id(s, pgsbs);
       for (auto shard: pgsbs) {
         auto p = missing_by_count[shard.first].find(_get_count(shard.second));
-        assert(p != missing_by_count[shard.first].end());
+        ceph_assert(p != missing_by_count[shard.first].end());
         if (--p->second == 0) {
          missing_by_count[shard.first].erase(p);
         }
@@ -478,7 +716,9 @@ public:
       is_readable.reset(_is_readable);
       is_recoverable.reset(_is_recoverable);
     }
-    string gen_prefix() const { return pg->gen_prefix(); }
+    std::ostream& gen_prefix(std::ostream& out) const {
+      return pg->gen_prefix(out);
+    }
     bool needs_recovery(
       const hobject_t &hoid,
       eversion_t *v = 0) const {
@@ -497,9 +737,15 @@ public:
       return i->second.is_delete();
     }
     bool is_unfound(const hobject_t &hoid) const {
-      return needs_recovery(hoid) && !is_deleted(hoid) && (
-       !missing_loc.count(hoid) ||
-       !(*is_recoverable)(missing_loc.find(hoid)->second));
+      auto it = needs_recovery_map.find(hoid);
+      if (it == needs_recovery_map.end()) {
+        return false;
+      }
+      if (it->second.is_delete()) {
+        return false;
+      }
+      auto mit = missing_loc.find(hoid);
+      return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
     }
     bool readable_with_acting(
       const hobject_t &hoid,
@@ -510,7 +756,10 @@ public:
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
-       if (is_unfound(i->first))
+       if (i->second.is_delete())
+         continue;
+       auto mi = missing_loc.find(i->first);
+       if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          ++ret;
       }
       return ret;
@@ -521,7 +770,10 @@ public:
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
-       if (is_unfound(i->first))
+        if (i->second.is_delete())
+          continue;
+        auto mi = missing_loc.find(i->first);
+        if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          return true;
       }
       return false;
@@ -548,9 +800,22 @@ public:
       if (p != missing_loc.end()) {
        _dec_count(p->second);
        p->second.erase(location);
-       _inc_count(p->second);
+        if (p->second.empty()) {
+          missing_loc.erase(p);
+        } else {
+          _inc_count(p->second);
+        }
       }
     }
+
+    void clear_location(const hobject_t &hoid) {
+      auto p = missing_loc.find(hoid);
+      if (p != missing_loc.end()) {
+       _dec_count(p->second);
+        missing_loc.erase(p);
+      }
+    }
+
     void add_active_missing(const pg_missing_t &missing) {
       for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
@@ -564,17 +829,18 @@ public:
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
-         assert(i->second.need == j->second.need);
+         ceph_assert(i->second.need == j->second.need);
        }
       }
     }
 
-    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
-      needs_recovery_map[hoid] = pg_missing_item(need, have);
+    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
+      needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
     }
     void revise_need(const hobject_t &hoid, eversion_t need) {
-      assert(needs_recovery(hoid));
-      needs_recovery_map[hoid].need = need;
+      auto it = needs_recovery_map.find(hoid);
+      ceph_assert(it != needs_recovery_map.end());
+      it->second.need = need;
     }
 
     /// Adds info about a possible recovery source
@@ -623,7 +889,7 @@ public:
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
-         assert(pmiter != pmissing.end());
+         ceph_assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
@@ -639,13 +905,15 @@ public:
        return;
       auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
-      assert(info.last_backfill.is_max());
-      assert(info.last_update >= item->need);
+      ceph_assert(info.last_backfill.is_max());
+      ceph_assert(info.last_update >= item->need);
       if (!missing.is_missing(hoid))
        mliter->second.insert(self);
       for (auto &&i: pmissing) {
+       if (i.first == self)
+         continue;
        auto pinfoiter = pinfo.find(i.first);
-       assert(pinfoiter != pinfo.end());
+       ceph_assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
@@ -655,8 +923,8 @@ public:
     }
 
     const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
-      return missing_loc.count(hoid) ?
-       missing_loc.find(hoid)->second : empty_set;
+      auto it = missing_loc.find(hoid);
+      return it == missing_loc.end() ? empty_set : it->second;
     }
     const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
       return missing_loc;
@@ -682,55 +950,40 @@ public:
   int recovery_ops_active;
   set<pg_shard_t> waiting_on_backfill;
 #ifdef DEBUG_RECOVERY_OIDS
-  set<hobject_t> recovering_oids;
+  multiset<hobject_t> recovering_oids;
 #endif
 
 protected:
   int         role;    // 0 = primary, 1 = replica, -1=none.
-  unsigned    state;   // PG_STATE_*
+  uint64_t    state;   // PG_STATE_*
 
   bool send_notify;    ///< true if we are non-primary and should notify the primary
 
-public:
+protected:
   eversion_t  last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
   eversion_t  last_complete_ondisk;  // last_complete that has committed.
   eversion_t  last_update_applied;
 
-
-  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
-    PGRef pg;
-    epoch_t e;
-    eversion_t v;
-    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
-      : pg(pg), e(e), v(v) {}
-    void finish(int) override {
-      pg->lock();
-      if (!pg->pg_has_reset_since(e)) {
-       pg->last_rollback_info_trimmed_to_applied = v;
-      }
-      pg->unlock();
-    }
-  };
-  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
-  // and the transaction has applied
+  // entries <= last_rollback_info_trimmed_to_applied have been trimmed
   eversion_t  last_rollback_info_trimmed_to_applied;
 
   // primary state
- public:
+protected:
   pg_shard_t primary;
   pg_shard_t pg_whoami;
   pg_shard_t up_primary;
   vector<int> up, acting, want_acting;
-  set<pg_shard_t> actingbackfill, actingset, upset;
+  // acting_recovery_backfill contains shards that are acting,
+  // async recovery targets, or backfill targets.
+  set<pg_shard_t> acting_recovery_backfill, actingset, upset;
   map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
   eversion_t  min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
   eversion_t  pg_trim_to;
 
   set<int> blocked_by; ///< osds we are blocked by (for pg stats)
 
+protected:
   // [primary only] content recovery state
-
-public:    
   struct BufferedRecoveryMessages {
     map<int, map<spg_t, pg_query_t> > query_map;
     map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
@@ -744,9 +997,6 @@ public:
     map<int, map<spg_t, pg_query_t> > *query_map;
     map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
     map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
-    set<PGRef> created_pgs;
-    C_Contexts *on_applied;
-    C_Contexts *on_safe;
     ObjectStore::Transaction *transaction;
     ThreadPool::TPHandle* handle;
     RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
@@ -754,13 +1004,9 @@ public:
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
-               C_Contexts *on_applied,
-               C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
       : query_map(query_map), info_map(info_map), 
        notify_list(notify_list),
-       on_applied(on_applied),
-       on_safe(on_safe),
        transaction(transaction),
         handle(NULL) {}
 
@@ -768,15 +1014,13 @@ public:
       : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
-       on_applied(rctx.on_applied),
-       on_safe(rctx.on_safe),
        transaction(rctx.transaction),
         handle(rctx.handle) {}
 
     void accept_buffered_messages(BufferedRecoveryMessages &m) {
-      assert(query_map);
-      assert(info_map);
-      assert(notify_list);
+      ceph_assert(query_map);
+      ceph_assert(info_map);
+      ceph_assert(notify_list);
       for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
@@ -809,11 +1053,11 @@ public:
 
     void send_notify(pg_shard_t to,
                     const pg_notify_t &info, const PastIntervals &pi) {
-      assert(notify_list);
+      ceph_assert(notify_list);
       (*notify_list)[to.osd].push_back(make_pair(info, pi));
     }
   };
-
+protected:
 
   PGStateHistory pgstate_history;
 
@@ -840,8 +1084,8 @@ protected:
   
   bool        need_up_thru;
   set<pg_shard_t>    stray_set;   // non-acting osds that have PG data.
-  eversion_t  oldest_update; // acting: lowest (valid) last_update in active set
   map<pg_shard_t, pg_info_t>    peer_info;   // info from peers (stray or prior)
+  map<pg_shard_t, int64_t>    peer_bytes;   // Peer's num_bytes from peer_info
   set<pg_shard_t> peer_purged; // peers purged
   map<pg_shard_t, pg_missing_t> peer_missing;
   set<pg_shard_t> peer_log_requested;  // logs i've requested (and start stamps)
@@ -855,15 +1099,19 @@ protected:
                                        // which are unfound on the primary
   epoch_t last_peering_reset;
 
+  epoch_t get_last_peering_reset() const {
+    return last_peering_reset;
+  }
 
   /* heartbeat peers */
   void set_probe_targets(const set<pg_shard_t> &probe_set);
   void clear_probe_targets();
-public:
+
   Mutex heartbeat_peer_lock;
   set<int> heartbeat_peers;
   set<int> probe_targets;
 
+public:
   /**
    * BackfillInterval
    *
@@ -925,7 +1173,7 @@ public:
 
     /// drop first entry, and adjust @begin accordingly
     void pop_front() {
-      assert(!objects.empty());
+      ceph_assert(!objects.empty());
       objects.erase(objects.begin());
       trim();
     }
@@ -954,15 +1202,107 @@ protected:
   bool backfill_reserved;
   bool backfill_reserving;
 
-  friend class OSD;
+  set<pg_shard_t> backfill_targets,  async_recovery_targets;
 
-public:
-  set<pg_shard_t> backfill_targets;
+  // The primary's num_bytes and local num_bytes for this pg, only valid
+  // during backfill for non-primary shards.
+  // Both of these are adjusted for EC to reflect the on-disk bytes
+  std::atomic<int64_t> primary_num_bytes = 0;
+  std::atomic<int64_t> local_num_bytes = 0;
 
+public:
   bool is_backfill_targets(pg_shard_t osd) {
     return backfill_targets.count(osd);
   }
 
+  // Space reserved for backfill is primary_num_bytes - local_num_bytes.
+  // It doesn't matter that the difference itself isn't computed atomically.
+  uint64_t get_reserved_num_bytes() {
+    int64_t primary = primary_num_bytes.load();
+    int64_t local = local_num_bytes.load();
+    if (primary > local)
+      return primary - local;
+    else
+      return 0;
+  }
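
 // Worked example with made-up numbers: if the primary reports
 // primary_num_bytes = 300 MiB and this shard has written
 // local_num_bytes = 120 MiB so far, get_reserved_num_bytes() returns
 // 300 - 120 = 180 MiB of still-outstanding backfill space; once local
 // catches up to (or exceeds) the primary's figure, it returns 0.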
+
+  bool is_remote_backfilling() {
+    return primary_num_bytes.load() > 0;
+  }
+
+  void set_reserved_num_bytes(int64_t primary, int64_t local);
+  void clear_reserved_num_bytes();
+
+  // If the num_bytes values are inconsistent and local_num_bytes would go
+  // negative, that's ok, because a negative value is clamped and ignored.
+
+  // The value of num_bytes could be negative,
+  // but we don't let local_num_bytes go negative.
+  void add_local_num_bytes(int64_t num_bytes) {
+    if (num_bytes) {
+      int64_t prev_bytes = local_num_bytes.load();
+      int64_t new_bytes;
+      do {
+        new_bytes = prev_bytes + num_bytes;
+        if (new_bytes < 0)
+          new_bytes = 0;
+      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+    }
+  }
+  void sub_local_num_bytes(int64_t num_bytes) {
+    ceph_assert(num_bytes >= 0);
+    if (num_bytes) {
+      int64_t prev_bytes = local_num_bytes.load();
+      int64_t new_bytes;
+      do {
+        new_bytes = prev_bytes - num_bytes;
+        if (new_bytes < 0)
+          new_bytes = 0;
+      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+    }
+  }
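
 // Illustrative sketch (not part of the patch): the clamp-at-zero CAS loop
 // used by add/sub_local_num_bytes, reduced to a free function.
 // compare_exchange_weak reloads prev on failure, so the loop simply
 // recomputes the clamped value until it publishes successfully:
 //
 //   void adjust_clamped(std::atomic<int64_t>& v, int64_t delta) {
 //     int64_t prev = v.load(), next;
 //     do {
 //       next = std::max<int64_t>(prev + delta, 0);  // never below zero
 //     } while (!v.compare_exchange_weak(prev, next));
 //   }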
+  // The value of num_bytes could be negative,
+  // but we don't let info.stats.stats.sum.num_bytes go negative.
+  void add_num_bytes(int64_t num_bytes) {
+    ceph_assert(_lock.is_locked_by_me());
+    if (num_bytes) {
+      info.stats.stats.sum.num_bytes += num_bytes;
+      if (info.stats.stats.sum.num_bytes < 0) {
+        info.stats.stats.sum.num_bytes = 0;
+      }
+    }
+  }
+  void sub_num_bytes(int64_t num_bytes) {
+    ceph_assert(_lock.is_locked_by_me());
+    ceph_assert(num_bytes >= 0);
+    if (num_bytes) {
+      info.stats.stats.sum.num_bytes -= num_bytes;
+      if (info.stats.stats.sum.num_bytes < 0) {
+        info.stats.stats.sum.num_bytes = 0;
+      }
+    }
+  }
+
+  // Only used in testing so not worried about needing the PG lock here
+  int64_t get_stats_num_bytes() {
+    Mutex::Locker l(_lock);
+    int64_t num_bytes = info.stats.stats.sum.num_bytes;
+    if (pool.info.is_erasure()) {
+      num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      // Round up each object by a stripe
+      num_bytes +=  get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
+    }
+    int64_t lnb = local_num_bytes.load();
+    if (lnb && lnb != num_bytes) {
+      lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
+                           << lnb << " vs stats "
+                            << info.stats.stats.sum.num_bytes << " / chunk "
+                            << get_pgbackend()->get_ec_data_chunk_count()
+                            << dendl;
+    }
+    return num_bytes;
+  }
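
 // Worked example with made-up values: on a k=4 erasure-coded pool with a
 // 64 KiB stripe chunk, sum.num_bytes = 400 MiB and num_objects = 100 give
 // 400 MiB / 4 = 100 MiB per shard, plus 100 * 64 KiB = 6.25 MiB of
 // per-object stripe rounding, i.e. ~106.25 MiB expected on disk.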
+
 protected:
 
   /*
@@ -1049,7 +1389,7 @@ protected:
   map<hobject_t, list<Context*>> callbacks_for_degraded_object;
 
   map<eversion_t,
-      list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
+      list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk;
 
   void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
   void requeue_op(OpRequestRef op);
@@ -1063,26 +1403,22 @@ protected:
   bool pg_stats_publish_valid;
   pg_stat_t pg_stats_publish;
 
-  // for ordering writes
-  ceph::shared_ptr<ObjectStore::Sequencer> osr;
-
   void _update_calc_stats();
   void _update_blocked_by();
   friend class TestOpsSocketHook;
   void publish_stats_to_osd();
   void clear_publish_stats();
 
-public:
   void clear_primary_state();
 
-  bool is_actingbackfill(pg_shard_t osd) const {
-    return actingbackfill.count(osd);
+  bool is_acting_recovery_backfill(pg_shard_t osd) const {
+    return acting_recovery_backfill.count(osd);
   }
   bool is_acting(pg_shard_t osd) const {
-    return has_shard(pool.info.ec_pool(), acting, osd);
+    return has_shard(pool.info.is_erasure(), acting, osd);
   }
   bool is_up(pg_shard_t osd) const {
-    return has_shard(pool.info.ec_pool(), up, osd);
+    return has_shard(pool.info.is_erasure(), up, osd);
   }
   static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
     if (ec) {
@@ -1101,18 +1437,20 @@ public:
   unsigned get_recovery_priority();
   /// get backfill reservation priority
   unsigned get_backfill_priority();
+  /// get priority for pg deletion
+  unsigned get_delete_priority();
 
-  void mark_clean();  ///< mark an active pg clean
+  void try_mark_clean();  ///< mark an active pg clean
 
   /// return [start,end) bounds for required past_intervals
   static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
     const pg_info_t &info,
     epoch_t oldest_map) {
-    epoch_t start = MAX(
+    epoch_t start = std::max(
       info.history.last_epoch_clean ? info.history.last_epoch_clean :
        info.history.epoch_pool_created,
       oldest_map);
-    epoch_t end = MAX(
+    epoch_t end = std::max(
       info.history.same_interval_since,
       info.history.epoch_pool_created);
     return make_pair(start, end);
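
 // Worked example with made-up epochs: last_epoch_clean = 100,
 // oldest_map = 150, same_interval_since = 180, epoch_pool_created = 50
 // give start = max(100, 150) = 150 and end = max(180, 50) = 180, so
 // past intervals are required for [150, 180).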
@@ -1127,23 +1465,23 @@ public:
   bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
   virtual void dump_recovery_info(Formatter *f) const = 0;
 
-  bool calc_min_last_complete_ondisk() {
+  void calc_min_last_complete_ondisk() {
     eversion_t min = last_complete_ondisk;
-    assert(!actingbackfill.empty());
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-        i != actingbackfill.end();
+    ceph_assert(!acting_recovery_backfill.empty());
+    for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
+        i != acting_recovery_backfill.end();
         ++i) {
       if (*i == get_primary()) continue;
       if (peer_last_complete_ondisk.count(*i) == 0)
-       return false;   // we don't have complete info
+       return;   // we don't have complete info
       eversion_t a = peer_last_complete_ondisk[*i];
       if (a < min)
        min = a;
     }
     if (min == min_last_complete_ondisk)
-      return false;
+      return;
     min_last_complete_ondisk = min;
-    return true;
+    return;
   }
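
 // Worked example with made-up versions: if last_complete_ondisk on the
 // primary is 122'50200 and peer_last_complete_ondisk records 122'50180
 // and 122'50195 for the other shards, min_last_complete_ondisk becomes
 // 122'50180. If any acting_recovery_backfill peer has not reported yet,
 // the function returns early and leaves the value unchanged.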
 
   virtual void calc_trim_to() = 0;
@@ -1170,7 +1508,7 @@ public:
       pg->get_pgbackend()->try_stash(hoid, v, t);
     }
     void rollback(const pg_log_entry_t &entry) override {
-      assert(entry.can_rollback());
+      ceph_assert(entry.can_rollback());
       pg->get_pgbackend()->rollback(entry, t);
     }
     void rollforward(const pg_log_entry_t &entry) override {
@@ -1197,13 +1535,8 @@ public:
     pg_shard_t fromosd,
     RecoveryCtx*);
 
-  void check_for_lost_objects();
-  void forget_lost_objects();
-
   void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
   
-  void trim_write_ahead();
-
   map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
     const map<pg_shard_t, pg_info_t> &infos,
     bool restrict_to_up_acting,
@@ -1212,21 +1545,18 @@ public:
     map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
     unsigned size,
     const vector<int> &acting,
-    pg_shard_t acting_primary,
     const vector<int> &up,
-    pg_shard_t up_primary,
     const map<pg_shard_t, pg_info_t> &all_info,
     bool restrict_to_up_acting,
     vector<int> *want,
     set<pg_shard_t> *backfill,
     set<pg_shard_t> *acting_backfill,
-    pg_shard_t *want_primary,
     ostream &ss);
   static void calc_replicated_acting(
     map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+    uint64_t force_auth_primary_missing_objects,
     unsigned size,
     const vector<int> &acting,
-    pg_shard_t acting_primary,
     const vector<int> &up,
     pg_shard_t up_primary,
     const map<pg_shard_t, pg_info_t> &all_info,
@@ -1234,8 +1564,18 @@ public:
     vector<int> *want,
     set<pg_shard_t> *backfill,
     set<pg_shard_t> *acting_backfill,
-    pg_shard_t *want_primary,
+    const OSDMapRef osdmap,
     ostream &ss);
+  void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
+                                const pg_info_t &auth_info,
+                                vector<int> *want,
+                                set<pg_shard_t> *async_recovery) const;
+  void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
+                                        const pg_info_t &auth_info,
+                                        vector<int> *want,
+                                        set<pg_shard_t> *async_recovery) const;
+
+  bool recoverable_and_ge_min_size(const vector<int> &want) const;
   bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
@@ -1243,11 +1583,21 @@ public:
   void activate(
     ObjectStore::Transaction& t,
     epoch_t activation_epoch,
-    list<Context*>& tfin,
     map<int, map<spg_t,pg_query_t> >& query_map,
     map<int,
       vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
     RecoveryCtx *ctx);
+
+  struct C_PG_ActivateCommitted : public Context {
+    PGRef pg;
+    epoch_t epoch;
+    epoch_t activation_epoch;
+    C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
+      : pg(p), epoch(e), activation_epoch(ae) {}
+    void finish(int r) override {
+      pg->_activate_committed(epoch, activation_epoch);
+    }
+  };
   void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
   void all_activated_and_committed();
 
@@ -1262,23 +1612,21 @@ public:
 
   virtual void check_local() = 0;
 
-  /**
-   * @param ops_begun returns how many recovery ops the function started
-   * @returns true if any useful work was accomplished; false otherwise
-   */
-  virtual bool start_recovery_ops(
-    uint64_t max,
-    ThreadPool::TPHandle &handle,
-    uint64_t *ops_begun) = 0;
-
   void purge_strays();
 
   void update_heartbeat_peers();
 
   Context *finish_sync_event;
 
-  void finish_recovery(list<Context*>& tfin);
+  Context *finish_recovery();
   void _finish_recovery(Context *c);
+  struct C_PG_FinishRecovery : public Context {
+    PGRef pg;
+    explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
+    void finish(int r) override {
+      pg->_finish_recovery(this);
+    }
+  };
   void cancel_recovery();
   void clear_recovery_state();
   virtual void _clear_recovery_state() = 0;
@@ -1286,10 +1634,10 @@ public:
   void start_recovery_op(const hobject_t& soid);
   void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
 
-  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
   virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
 
   friend class C_OSD_RepModify_Commit;
+  friend class C_DeleteMore;
 
   // -- backoff --
   Mutex backoff_lock;  // orders inside Backoff::lock
@@ -1313,9 +1661,8 @@ public:
     release_backoffs(begin, end);
   }
 
-  void rm_backoff(BackoffRef b);
-
   // -- scrub --
+public:
   struct Scrubber {
     Scrubber();
     ~Scrubber();
@@ -1330,7 +1677,6 @@ public:
     set<pg_shard_t> waiting_on_whom;
     int shallow_errors;
     int deep_errors;
-    int large_omap_objects = 0;
     int fixed;
     ScrubMap primary_scrubmap;
     ScrubMapBuilder primary_scrubmap_pos;
@@ -1341,6 +1687,8 @@ public:
     OpRequestRef active_rep_scrub;
     utime_t scrub_reg_stamp;  // stamp we registered for
 
+    omap_stat_t omap_stats  = (const struct omap_stat_t){ 0 };
+
     // For async sleep
     bool sleeping = false;
     bool needs_sleep = true;
@@ -1350,10 +1698,15 @@ public:
     bool must_scrub, must_deep_scrub, must_repair;
 
     // Priority to use for scrub scheduling
-    unsigned priority;
+    unsigned priority = 0;
 
     // this flag indicates whether we would like to do auto-repair of the PG or not
     bool auto_repair;
+    // this flag indicates that we are scrubbing post repair to verify everything is fixed
+    bool check_repair;
+    // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
+    // we should deep scrub in order to auto repair
+    bool deep_scrub_on_error;
 
     // Maps from objects with errors to missing/inconsistent peers
     map<hobject_t, set<pg_shard_t>> missing;
@@ -1466,6 +1819,8 @@ public:
       must_deep_scrub = false;
       must_repair = false;
       auto_repair = false;
+      check_repair = false;
+      deep_scrub_on_error = false;
 
       state = PG::Scrubber::INACTIVE;
       start = hobject_t();
@@ -1474,8 +1829,8 @@ public:
       subset_last_update = eversion_t();
       shallow_errors = 0;
       deep_errors = 0;
-      large_omap_objects = 0;
       fixed = 0;
+      omap_stats = (const struct omap_stat_t){ 0 };
       deep = false;
       run_callbacks();
       inconsistent.clear();
@@ -1496,6 +1851,7 @@ public:
     void cleanup_store(ObjectStore::Transaction *t);
   } scrubber;
 
+protected:
   bool scrub_after_recovery;
 
   int active_pushes;
@@ -1515,7 +1871,6 @@ public:
     const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
     pg_shard_t bad_peer);
 
-  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
   void chunky_scrub(ThreadPool::TPHandle &handle);
   void scrub_compare_maps();
   /**
@@ -1524,12 +1879,10 @@ public:
   bool scrub_process_inconsistent();
   bool ops_blocked_by_scrub() const;
   void scrub_finish();
-  void scrub_clear_state();
+  void scrub_clear_state(bool keep_repair = false);
   void _scan_snaps(ScrubMap &map);
   void _repair_oinfo_oid(ScrubMap &map);
-  void _scan_rollback_obs(
-    const vector<ghobject_t> &rollback_obs,
-    ThreadPool::TPHandle &handle);
+  void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
   void _request_scrub_map(pg_shard_t replica, eversion_t version,
                           hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
@@ -1552,25 +1905,15 @@ public:
                         boost::optional<uint32_t>>> &missing_digest) { }
   virtual void _scrub_clear_state() { }
   virtual void _scrub_finish() { }
-  virtual void split_colls(
-    spg_t child,
-    int split_bits,
-    int seed,
-    const pg_pool_t *pool,
-    ObjectStore::Transaction *t) = 0;
   void clear_scrub_reserved();
   void scrub_reserve_replicas();
   void scrub_unreserve_replicas();
   bool scrub_all_replicas_reserved() const;
-  bool sched_scrub();
-  void reg_next_scrub();
-  void unreg_next_scrub();
 
   void replica_scrub(
     OpRequestRef op,
     ThreadPool::TPHandle &handle);
   void do_replica_scrub_map(OpRequestRef op);
-  void sub_op_scrub_map(OpRequestRef op);
 
   void handle_scrub_reserve_request(OpRequestRef op);
   void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
@@ -1592,8 +1935,8 @@ public:
       pg(pg), epoch(epoch), evt(evt) {}
     void finish(int r) override {
       pg->lock();
-      pg->queue_peering_event(PG::CephPeeringEvtRef(
-                               new PG::CephPeeringEvt(
+      pg->queue_peering_event(PGPeeringEventRef(
+                               new PGPeeringEvent(
                                  epoch,
                                  epoch,
                                  evt)));
@@ -1601,33 +1944,6 @@ public:
     }
   };
 
-  class CephPeeringEvt {
-    epoch_t epoch_sent;
-    epoch_t epoch_requested;
-    boost::intrusive_ptr< const boost::statechart::event_base > evt;
-    string desc;
-  public:
-    MEMPOOL_CLASS_HELPERS();
-    template <class T>
-    CephPeeringEvt(epoch_t epoch_sent,
-                  epoch_t epoch_requested,
-                  const T &evt_) :
-      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
-      evt(evt_.intrusive_from_this()) {
-      stringstream out;
-      out << "epoch_sent: " << epoch_sent
-         << " epoch_requested: " << epoch_requested << " ";
-      evt_.print(&out);
-      desc = out.str();
-    }
-    epoch_t get_epoch_sent() { return epoch_sent; }
-    epoch_t get_epoch_requested() { return epoch_requested; }
-    const boost::statechart::event_base &get_event() { return *evt; }
-    string get_desc() { return desc; }
-  };
-  typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
-  list<CephPeeringEvtRef> peering_queue;  // op queue
-  list<CephPeeringEvtRef> peering_waiters;
 
   struct QueryState : boost::statechart::event< QueryState > {
     Formatter *f;
@@ -1637,51 +1953,9 @@ public:
     }
   };
 
-  struct MInfoRec : boost::statechart::event< MInfoRec > {
-    pg_shard_t from;
-    pg_info_t info;
-    epoch_t msg_epoch;
-    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
-      from(from), info(info), msg_epoch(msg_epoch) {}
-    void print(std::ostream *out) const {
-      *out << "MInfoRec from " << from << " info: " << info;
-    }
-  };
-
-  struct MLogRec : boost::statechart::event< MLogRec > {
-    pg_shard_t from;
-    boost::intrusive_ptr<MOSDPGLog> msg;
-    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
-      from(from), msg(msg) {}
-    void print(std::ostream *out) const {
-      *out << "MLogRec from " << from;
-    }
-  };
-
-  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
-    pg_shard_t from;
-    pg_notify_t notify;
-    uint64_t features;
-    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
-      from(from), notify(notify), features(f) {}
-    void print(std::ostream *out) const {
-      *out << "MNotifyRec from " << from << " notify: " << notify
-        << " features: 0x" << hex << features << dec;
-    }
-  };
-
-  struct MQuery : boost::statechart::event< MQuery > {
-    pg_shard_t from;
-    pg_query_t query;
-    epoch_t query_epoch;
-    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
-      from(from), query(query), query_epoch(query_epoch) {}
-    void print(std::ostream *out) const {
-      *out << "MQuery from " << from
-          << " query_epoch " << query_epoch
-          << " query: " << query;
-    }
-  };
+public:
+  int pg_stat_adjust(osd_stat_t *new_stat);
+protected:
 
   struct AdvMap : boost::statechart::event< AdvMap > {
     OSDMapRef osdmap;
@@ -1716,35 +1990,7 @@ public:
       *out << "Activate from " << activation_epoch;
     }
   };
-  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
-    unsigned priority;
-    explicit RequestBackfillPrio(unsigned prio) :
-              boost::statechart::event< RequestBackfillPrio >(),
-                         priority(prio) {}
-    void print(std::ostream *out) const {
-      *out << "RequestBackfillPrio: priority " << priority;
-    }
-  };
-#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
-    T() : boost::statechart::event< T >() {}                      \
-    void print(std::ostream *out) const {                         \
-      *out << #T;                                                 \
-    }                                                             \
-  };
-  struct DeferBackfill : boost::statechart::event<DeferBackfill> {
-    float delay;
-    explicit DeferBackfill(float delay) : delay(delay) {}
-    void print(std::ostream *out) const {
-      *out << "DeferBackfill: delay " << delay;
-    }
-  };
-  struct DeferRecovery : boost::statechart::event<DeferRecovery> {
-    float delay;
-    explicit DeferRecovery(float delay) : delay(delay) {}
-    void print(std::ostream *out) const {
-      *out << "DeferRecovery: delay " << delay;
-    }
-  };
+public:
   struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
     explicit UnfoundBackfill() {}
     void print(std::ostream *out) const {
@@ -1757,22 +2003,29 @@ public:
       *out << "UnfoundRecovery";
     }
   };
+
+  struct RequestScrub : boost::statechart::event<RequestScrub> {
+    bool deep;
+    bool repair;
+    explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
+    void print(std::ostream *out) const {
+      *out << "RequestScrub(" << (deep ? "deep" : "shallow")
+          << (repair ? " repair" : "") << ")";
+    }
+  };
+
 protected:
   TrivialEvent(Initialize)
-  TrivialEvent(Load)
   TrivialEvent(GotInfo)
   TrivialEvent(NeedUpThru)
-  TrivialEvent(NullEvt)
-  TrivialEvent(FlushedEvt)
   TrivialEvent(Backfilled)
   TrivialEvent(LocalBackfillReserved)
-  TrivialEvent(RemoteBackfillReserved)
   TrivialEvent(RejectRemoteReservation)
-  TrivialEvent(RemoteReservationRejected)
-  TrivialEvent(RemoteReservationCanceled)
+  public:
   TrivialEvent(RequestBackfill)
-  TrivialEvent(RequestRecovery)
-  TrivialEvent(RecoveryDone)
+  protected:
+  TrivialEvent(RemoteRecoveryPreempted)
+  TrivialEvent(RemoteBackfillPreempted)
   TrivialEvent(BackfillTooFull)
   TrivialEvent(RecoveryTooFull)
 
@@ -1785,7 +2038,8 @@ protected:
   TrivialEvent(AllReplicasRecovered)
   TrivialEvent(DoRecovery)
   TrivialEvent(LocalRecoveryReserved)
-  TrivialEvent(RemoteRecoveryReserved)
+  public:
+  protected:
   TrivialEvent(AllRemotesReserved)
   TrivialEvent(AllBackfillsReserved)
   TrivialEvent(GoClean)
@@ -1794,6 +2048,19 @@ protected:
 
   TrivialEvent(IntervalFlush)
 
+  public:
+  TrivialEvent(DeleteStart)
+  TrivialEvent(DeleteSome)
+
+  TrivialEvent(SetForceRecovery)
+  TrivialEvent(UnsetForceRecovery)
+  TrivialEvent(SetForceBackfill)
+  TrivialEvent(UnsetForceBackfill)
+
+  protected:
+  TrivialEvent(DeleteReserved)
+  TrivialEvent(DeleteInterrupted)
+
   /* Encapsulates PG recovery process */
   class RecoveryState {
     void start_handle(RecoveryCtx *new_ctx);
@@ -1826,47 +2093,35 @@ protected:
 
       /* Accessor functions for state methods */
       ObjectStore::Transaction* get_cur_transaction() {
-       assert(state->rctx);
-       assert(state->rctx->transaction);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->transaction);
        return state->rctx->transaction;
       }
 
       void send_query(pg_shard_t to, const pg_query_t &query) {
-       assert(state->rctx);
-       assert(state->rctx->query_map);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->query_map);
        (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
          query;
       }
 
       map<int, map<spg_t, pg_query_t> > *get_query_map() {
-       assert(state->rctx);
-       assert(state->rctx->query_map);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->query_map);
        return state->rctx->query_map;
       }
 
       map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
-       assert(state->rctx);
-       assert(state->rctx->info_map);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->info_map);
        return state->rctx->info_map;
       }
 
-      list< Context* > *get_on_safe_context_list() {
-       assert(state->rctx);
-       assert(state->rctx->on_safe);
-       return &(state->rctx->on_safe->contexts);
-      }
-
-      list< Context * > *get_on_applied_context_list() {
-       assert(state->rctx);
-       assert(state->rctx->on_applied);
-       return &(state->rctx->on_applied->contexts);
-      }
-
       RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
 
       void send_notify(pg_shard_t to,
                       const pg_notify_t &info, const PastIntervals &pi) {
-       assert(state->rctx);
+       ceph_assert(state->rctx);
        state->rctx->send_notify(to, info, pi);
       }
     };
@@ -1903,6 +2158,10 @@ protected:
     //       RepWaitBackfillReserved
     //       RepWaitRecoveryReserved
     //     Stray
+    //     ToDelete
+    //       WaitDeleteReserved
+    //       Deleting
+    // Crashed
 
     struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
       explicit Crashed(my_context ctx);
@@ -1916,12 +2175,10 @@ protected:
 
       typedef boost::mpl::list <
        boost::statechart::transition< Initialize, Reset >,
-       boost::statechart::custom_reaction< Load >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
 
-      boost::statechart::result react(const Load&);
       boost::statechart::result react(const MNotifyRec&);
       boost::statechart::result react(const MInfoRec&);
       boost::statechart::result react(const MLogRec&);
@@ -1939,14 +2196,12 @@ protected:
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< NullEvt >,
-       boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const AdvMap&);
       boost::statechart::result react(const ActMap&);
-      boost::statechart::result react(const FlushedEvt&);
       boost::statechart::result react(const IntervalFlush&);
       boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
@@ -1962,14 +2217,19 @@ protected:
       typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
-       boost::statechart::custom_reaction< NullEvt >,
-       boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
+       // ignored
+       boost::statechart::custom_reaction< NullEvt >,
+       boost::statechart::custom_reaction<SetForceRecovery>,
+       boost::statechart::custom_reaction<UnsetForceRecovery>,
+       boost::statechart::custom_reaction<SetForceBackfill>,
+       boost::statechart::custom_reaction<UnsetForceBackfill>,
+       boost::statechart::custom_reaction<RequestScrub>,
+       // crash
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const AdvMap&);
-      boost::statechart::result react(const FlushedEvt&);
       boost::statechart::result react(const IntervalFlush&);
       boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
@@ -2001,10 +2261,20 @@ protected:
       typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
-       boost::statechart::transition< NeedActingChange, WaitActingChange >
+       boost::statechart::transition< NeedActingChange, WaitActingChange >,
+       boost::statechart::custom_reaction<SetForceRecovery>,
+       boost::statechart::custom_reaction<UnsetForceRecovery>,
+       boost::statechart::custom_reaction<SetForceBackfill>,
+       boost::statechart::custom_reaction<UnsetForceBackfill>,
+       boost::statechart::custom_reaction<RequestScrub>
        > reactions;
       boost::statechart::result react(const ActMap&);
       boost::statechart::result react(const MNotifyRec&);
+      boost::statechart::result react(const SetForceRecovery&);
+      boost::statechart::result react(const UnsetForceRecovery&);
+      boost::statechart::result react(const SetForceBackfill&);
+      boost::statechart::result react(const UnsetForceBackfill&);
+      boost::statechart::result react(const RequestScrub&);
     };
 
     struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
@@ -2061,12 +2331,15 @@ protected:
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< MLogRec >,
+       boost::statechart::custom_reaction< MTrim >,
        boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< AllReplicasActivated >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
-  boost::statechart::custom_reaction< UnfoundRecovery >,
+       boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
+       boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+       boost::statechart::custom_reaction< RemoteReservationRevoked>,
        boost::statechart::custom_reaction< DoRecovery>
        > reactions;
       boost::statechart::result react(const QueryState& q);
@@ -2075,6 +2348,7 @@ protected:
       boost::statechart::result react(const MInfoRec& infoevt);
       boost::statechart::result react(const MNotifyRec& notevt);
       boost::statechart::result react(const MLogRec& logevt);
+      boost::statechart::result react(const MTrim& trimevt);
       boost::statechart::result react(const Backfilled&) {
        return discard_event();
       }
@@ -2086,10 +2360,16 @@ protected:
        return discard_event();
       }
       boost::statechart::result react(const UnfoundRecovery& evt) {
-  return discard_event();
+       return discard_event();
       }
       boost::statechart::result react(const UnfoundBackfill& evt) {
-  return discard_event();
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteReservationRevoked&) {
+       return discard_event();
       }
       boost::statechart::result react(const DoRecovery&) {
        return discard_event();
@@ -2098,10 +2378,15 @@ protected:
 
     struct Clean : boost::statechart::state< Clean, Active >, NamedState {
       typedef boost::mpl::list<
-       boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
+       boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+       boost::statechart::custom_reaction<SetForceRecovery>,
+       boost::statechart::custom_reaction<SetForceBackfill>
       > reactions;
       explicit Clean(my_context ctx);
       void exit();
+      boost::statechart::result react(const boost::statechart::event_base&) {
+       return discard_event();
+      }
     };
 
     struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
@@ -2120,15 +2405,26 @@ protected:
 
     struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
       typedef boost::mpl::list<
-       boost::statechart::transition< Backfilled, Recovered >,
+       boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
-       boost::statechart::custom_reaction< RemoteReservationRejected >
+       boost::statechart::custom_reaction< RemoteReservationRejected >,
+       boost::statechart::custom_reaction< RemoteReservationRevokedTooFull >,
+       boost::statechart::custom_reaction< RemoteReservationRevoked >
        > reactions;
       explicit Backfilling(my_context ctx);
-      boost::statechart::result react(const RemoteReservationRejected& evt);
+      boost::statechart::result react(const RemoteReservationRejected& evt) {
+       // for compat with old peers
+       post_event(RemoteReservationRevokedTooFull());
+       return discard_event();
+      }
+      void backfill_release_reservations();
+      boost::statechart::result react(const Backfilled& evt);
+      boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
+      boost::statechart::result react(const RemoteReservationRevoked& evt);
       boost::statechart::result react(const DeferBackfill& evt);
       boost::statechart::result react(const UnfoundBackfill& evt);
+      void cancel_backfill();
       void exit();
     };
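// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: Backfilling above uses a common
// boost::statechart idiom, where a custom_reaction translates a legacy event
// into its modern equivalent via post_event() and then discards the original
// (here: RemoteReservationRejected -> RemoteReservationRevokedTooFull, kept
// for compat with old peers).  A minimal stand-alone version of the pattern;
// all names below are invented for illustration:

#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/mpl/list.hpp>
namespace sc = boost::statechart;

struct OldReject : sc::event<OldReject> {};   // legacy event from old peers
struct NewRevoke : sc::event<NewRevoke> {};   // its modern replacement

struct Working;
struct Machine : sc::state_machine<Machine, Working> {};

struct Working : sc::simple_state<Working, Machine> {
  typedef boost::mpl::list<
    sc::custom_reaction<OldReject>,
    sc::custom_reaction<NewRevoke>
  > reactions;
  sc::result react(const OldReject&) {
    post_event(NewRevoke());   // re-queue as the modern event...
    return discard_event();    // ...and drop the legacy one
  }
  sc::result react(const NewRevoke&) {
    return discard_event();    // real handling would go here
  }
};

// usage: Machine m; m.initiate(); m.process_event(OldReject());
// ---------------------------------------------------------------------------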
 
@@ -2136,13 +2432,16 @@ protected:
       typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
+       boost::statechart::custom_reaction< RemoteReservationRevoked >,
        boost::statechart::transition< AllBackfillsReserved, Backfilling >
        > reactions;
       set<pg_shard_t>::const_iterator backfill_osd_it;
       explicit WaitRemoteBackfillReserved(my_context ctx);
+      void retry();
       void exit();
       boost::statechart::result react(const RemoteBackfillReserved& evt);
       boost::statechart::result react(const RemoteReservationRejected& evt);
+      boost::statechart::result react(const RemoteReservationRevoked& evt);
     };
 
     struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
@@ -2183,6 +2482,7 @@ protected:
       void exit();
     };
 
+    struct ToDelete;
     struct RepNotRecovering;
     struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
       explicit ReplicaActive(my_context ctx);
@@ -2194,18 +2494,27 @@ protected:
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MLogRec >,
+       boost::statechart::custom_reaction< MTrim >,
        boost::statechart::custom_reaction< Activate >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
-       boost::statechart::custom_reaction< UnfoundBackfill >
+       boost::statechart::custom_reaction< UnfoundBackfill >,
+       boost::statechart::custom_reaction< RemoteBackfillPreempted >,
+       boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+       boost::statechart::custom_reaction< RecoveryDone >,
+       boost::statechart::transition<DeleteStart, ToDelete>
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const MInfoRec& infoevt);
       boost::statechart::result react(const MLogRec& logevt);
+      boost::statechart::result react(const MTrim& trimevt);
       boost::statechart::result react(const ActMap&);
       boost::statechart::result react(const MQuery&);
       boost::statechart::result react(const Activate&);
+      boost::statechart::result react(const RecoveryDone&) {
+       return discard_event();
+      }
       boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
       }
@@ -2218,6 +2527,12 @@ protected:
       boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
       }
+      boost::statechart::result react(const RemoteBackfillPreempted& evt) {
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
+       return discard_event();
+      }
     };
 
     struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
@@ -2226,10 +2541,14 @@ protected:
        // for compat with old peers
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
-       boost::statechart::custom_reaction< BackfillTooFull >
+       boost::statechart::custom_reaction< BackfillTooFull >,
+       boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+       boost::statechart::custom_reaction< RemoteBackfillPreempted >
        > reactions;
       explicit RepRecovering(my_context ctx);
+      boost::statechart::result react(const RemoteRecoveryPreempted &evt);
       boost::statechart::result react(const BackfillTooFull &evt);
+      boost::statechart::result react(const RemoteBackfillPreempted &evt);
       void exit();
     };
 
@@ -2268,15 +2587,26 @@ protected:
 
     struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
       typedef boost::mpl::list<
+       boost::statechart::custom_reaction< RequestRecoveryPrio >,
        boost::statechart::custom_reaction< RequestBackfillPrio >,
-        boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+       boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+       boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
        > reactions;
       explicit RepNotRecovering(my_context ctx);
+      boost::statechart::result react(const RequestRecoveryPrio &evt);
       boost::statechart::result react(const RequestBackfillPrio &evt);
+      boost::statechart::result react(const RemoteBackfillReserved &evt) {
+       // my reservation completion raced with a RELEASE from the primary
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteRecoveryReserved &evt) {
+       // my reservation completion raced with a RELEASE from the primary
+       return discard_event();
+      }
       boost::statechart::result react(const RejectRemoteReservation &evt);
       void exit();
     };
@@ -2328,9 +2658,8 @@ protected:
       void exit();
     };
 
-    struct Stray : boost::statechart::state< Stray, Started >, NamedState {
-      map<int, pair<pg_query_t, epoch_t> > pending_queries;
-
+    struct Stray : boost::statechart::state< Stray, Started >,
+             NamedState {
       explicit Stray(my_context ctx);
       void exit();
 
@@ -2339,7 +2668,8 @@ protected:
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< ActMap >,
-       boost::statechart::custom_reaction< RecoveryDone >
+       boost::statechart::custom_reaction< RecoveryDone >,
+       boost::statechart::transition<DeleteStart, ToDelete>
        > reactions;
       boost::statechart::result react(const MQuery& query);
       boost::statechart::result react(const MLogRec& logevt);
@@ -2350,6 +2680,43 @@ protected:
       }
     };
 
+    struct WaitDeleteReserved;
+    struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
+      unsigned priority = 0;
+      typedef boost::mpl::list <
+       boost::statechart::custom_reaction< ActMap >,
+       boost::statechart::custom_reaction< DeleteSome >
+       > reactions;
+      explicit ToDelete(my_context ctx);
+      boost::statechart::result react(const ActMap &evt);
+      boost::statechart::result react(const DeleteSome &evt) {
+       // happens if we drop out of Deleting due to reprioritization etc.
+       return discard_event();
+      }
+      void exit();
+    };
+
+    struct Deleting;
+    struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
+                                                        ToDelete>, NamedState {
+      typedef boost::mpl::list <
+       boost::statechart::transition<DeleteReserved, Deleting>
+       > reactions;
+      explicit WaitDeleteReserved(my_context ctx);
+      void exit();
+    };
+
+    struct Deleting : boost::statechart::state<Deleting,
+                                              ToDelete>, NamedState {
+      typedef boost::mpl::list <
+       boost::statechart::custom_reaction< DeleteSome >,
+       boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
+       > reactions;
+      explicit Deleting(my_context ctx);
+      boost::statechart::result react(const DeleteSome &evt);
+      void exit();
+    };
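// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: the delete sub-machine above nests
// its states by naming an inner initial state as the third template argument
// (entering ToDelete immediately enters WaitDeleteReserved); DeleteReserved
// then moves to Deleting, and DeleteInterrupted falls back to waiting.  A
// minimal stand-alone analogue; all names below are invented:

#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event.hpp>
#include <boost/mpl/list.hpp>
namespace sc = boost::statechart;

struct EvReserved    : sc::event<EvReserved> {};
struct EvInterrupted : sc::event<EvInterrupted> {};

struct Waiting;   // forward-declared inner initial state
struct Outer;
struct Fsm : sc::state_machine<Fsm, Outer> {};

// third template argument = inner initial state of this state
struct Outer : sc::simple_state<Outer, Fsm, Waiting> {};

struct Working;
struct Waiting : sc::simple_state<Waiting, Outer> {
  typedef boost::mpl::list<
    sc::transition<EvReserved, Working>
  > reactions;
};
struct Working : sc::simple_state<Working, Outer> {
  typedef boost::mpl::list<
    sc::transition<EvInterrupted, Waiting>
  > reactions;
};

// usage: Fsm f; f.initiate();              // lands in Outer -> Waiting
//        f.process_event(EvReserved());    // -> Working
//        f.process_event(EvInterrupted()); // -> back to Waiting
// ---------------------------------------------------------------------------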
+
     struct GetLog;
 
     struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
@@ -2427,9 +2794,11 @@ protected:
     struct Down : boost::statechart::state< Down, Peering>, NamedState {
       explicit Down(my_context ctx);
       typedef boost::mpl::list <
-       boost::statechart::custom_reaction< QueryState >
+       boost::statechart::custom_reaction< QueryState >,
+       boost::statechart::custom_reaction< MNotifyRec >
        > reactions;
-      boost::statechart::result react(const QueryState& infoevt);
+      boost::statechart::result react(const QueryState& q);
+      boost::statechart::result react(const MNotifyRec& infoevt);
       void exit();
     };
 
@@ -2442,11 +2811,10 @@ protected:
       explicit Incomplete(my_context ctx);
       boost::statechart::result react(const AdvMap &advmap);
       boost::statechart::result react(const MNotifyRec& infoevt);
-      boost::statechart::result react(const QueryState& infoevt);
+      boost::statechart::result react(const QueryState& q);
       void exit();
     };
 
-
     RecoveryMachine machine;
     PG *pg;
 
@@ -2476,7 +2844,7 @@ protected:
       end_handle();
     }
 
-    void handle_event(CephPeeringEvtRef evt,
+    void handle_event(PGPeeringEventRef evt,
                      RecoveryCtx *rctx) {
       start_handle(rctx);
       machine.process_event(evt->get_event());
@@ -2486,35 +2854,18 @@ protected:
   } recovery_state;
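// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: handle_event() above brackets every
// statechart dispatch with start_handle()/end_handle() so that reactions can
// record their side effects in the active RecoveryCtx.  A minimal analogue of
// that bracketing pattern; all names below are invented:

#include <cassert>

struct Ctx { int messages = 0; };   // stand-in for the per-call context

class Dispatcher {
  Ctx* active = nullptr;
public:
  void start_handle(Ctx* c) { assert(!active); active = c; }
  void end_handle()         { active = nullptr; }

  template <typename Ev>
  void handle_event(const Ev& ev, Ctx* c) {
    start_handle(c);   // make the context visible to reactions
    process(ev);       // reactions may append to 'active' here
    end_handle();      // context must not outlive the dispatch
  }
private:
  template <typename Ev>
  void process(const Ev&) { if (active) active->messages++; }
};
// ---------------------------------------------------------------------------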
 
 
- public:
-  PG(OSDService *o, OSDMapRef curmap,
-     const PGPool &pool, spg_t p);
-  ~PG() override;
-  const spg_t pg_id;
 
- private:
-  // Prevent copying
-  explicit PG(const PG& rhs);
-  PG& operator=(const PG& rhs);
   uint64_t peer_features;
   uint64_t acting_features;
   uint64_t upacting_features;
 
   epoch_t last_epoch;
 
- public:
-  const spg_t&      get_pgid() const { return pg_id; }
-
-  void set_last_scrub_stamp(utime_t t) {
-    info.stats.last_scrub_stamp = t;
-    info.history.last_scrub_stamp = t;
-  }
-
-  void set_last_deep_scrub_stamp(utime_t t) {
-    info.stats.last_deep_scrub_stamp = t;
-    info.history.last_deep_scrub_stamp = t;
-  }
+  /// most recently consumed osdmap's require_osd_release
+  unsigned last_require_osd_release = 0;
+  bool delete_needs_sleep = false;
 
+protected:
   void reset_min_peer_features() {
     peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
   }
@@ -2543,7 +2894,7 @@ protected:
        actingset.insert(
          pg_shard_t(
            acting[i],
-           pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
     }
     upset.clear();
     up = newup;
@@ -2552,9 +2903,9 @@ protected:
        upset.insert(
          pg_shard_t(
            up[i],
-           pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
     }
-    if (!pool.info.ec_pool()) {
+    if (!pool.info.is_erasure()) {
       up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
       primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
       return;
@@ -2573,71 +2924,55 @@ protected:
        break;
       }
     }
-    assert(up_primary.osd == new_up_primary);
-    assert(primary.osd == new_acting_primary);
+    ceph_assert(up_primary.osd == new_up_primary);
+    ceph_assert(primary.osd == new_acting_primary);
   }
-  pg_shard_t get_primary() const { return primary; }
-  
-  int        get_role() const { return role; }
-  void       set_role(int r) { role = r; }
 
-  bool       is_primary() const { return pg_whoami == primary; }
-  bool       is_replica() const { return role > 0; }
+  void set_role(int r) {
+    role = r;
+  }
 
-  epoch_t get_last_peering_reset() const { return last_peering_reset; }
-  
-  //int  get_state() const { return state; }
-  bool state_test(int m) const { return (state & m) != 0; }
-  void state_set(int m) { state |= m; }
-  void state_clear(int m) { state &= ~m; }
+  bool state_test(uint64_t m) const { return (state & m) != 0; }
+  void state_set(uint64_t m) { state |= m; }
+  void state_clear(uint64_t m) { state &= ~m; }
 
   bool is_complete() const { return info.last_complete == info.last_update; }
   bool should_send_notify() const { return send_notify; }
 
-  int get_state() const { return state; }
-  bool       is_active() const { return state_test(PG_STATE_ACTIVE); }
-  bool       is_activating() const { return state_test(PG_STATE_ACTIVATING); }
-  bool       is_peering() const { return state_test(PG_STATE_PEERING); }
-  bool       is_down() const { return state_test(PG_STATE_DOWN); }
-  bool       is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
-  bool       is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
-  bool       is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
-  bool       is_clean() const { return state_test(PG_STATE_CLEAN); }
-  bool       is_degraded() const { return state_test(PG_STATE_DEGRADED); }
-  bool       is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
-
-  bool       is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
-  bool       is_remapped() const { return state_test(PG_STATE_REMAPPED); }
-  bool       is_peered() const {
+  uint64_t get_state() const { return state; }
+  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
+  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
+  bool is_peering() const { return state_test(PG_STATE_PEERING); }
+  bool is_down() const { return state_test(PG_STATE_DOWN); }
+  bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
+  bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
+  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
+  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
+  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
+  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
+  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
+  bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
+  bool is_peered() const {
     return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
   }
   bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
+  bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
+  bool is_repair() const { return state_test(PG_STATE_REPAIR); }
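// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: the is_*() helpers above are thin
// wrappers over a single flag word, and this diff widens state_test()/
// state_set()/state_clear() from int to uint64_t so the PG_STATE_* flag space
// can grow past 32 bits.  A minimal stand-alone analogue (names invented):

#include <cassert>
#include <cstdint>

static const uint64_t FLAG_ACTIVE = 1ull << 0;
static const uint64_t FLAG_CLEAN  = 1ull << 40;   // needs a 64-bit word

struct Flags {
  uint64_t state = 0;
  bool test(uint64_t m) const { return (state & m) != 0; }
  void set(uint64_t m)        { state |= m; }
  void clear(uint64_t m)      { state &= ~m; }
};

int main() {
  Flags f;
  f.set(FLAG_ACTIVE | FLAG_CLEAN);
  assert(f.test(FLAG_CLEAN));     // bit 40 would be lost with a 32-bit mask
  f.clear(FLAG_CLEAN);
  assert(!f.test(FLAG_CLEAN));
}
// ---------------------------------------------------------------------------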
 
-  bool  is_empty() const { return info.last_update == eversion_t(0,0); }
-
-  void init(
-    int role,
-    const vector<int>& up,
-    int up_primary,
-    const vector<int>& acting,
-    int acting_primary,
-    const pg_history_t& history,
-    const PastIntervals& pim,
-    bool backfill,
-    ObjectStore::Transaction *t);
+  bool is_empty() const { return info.last_update == eversion_t(0,0); }
 
   // pg on-disk state
   void do_pending_flush();
 
+public:
   static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
   static void _init(ObjectStore::Transaction& t,
                    spg_t pgid, const pg_pool_t *pool);
 
-private:
+protected:
   void prepare_write_info(map<string,bufferlist> *km);
 
   void update_store_with_options();
-  void update_store_on_load();
 
 public:
   static int _prepare_write_info(
@@ -2651,6 +2986,11 @@ public:
     bool dirty_epoch,
     bool try_fast_info,
     PerfCounters *logger = nullptr);
+
+  void write_if_dirty(RecoveryCtx *rctx) {
+    write_if_dirty(*rctx->transaction);
+  }
+protected:
   void write_if_dirty(ObjectStore::Transaction& t);
 
   PGLog::IndexedLog projected_log;
@@ -2662,11 +3002,11 @@ public:
   eversion_t projected_last_update;
   eversion_t get_next_version() const {
     eversion_t at_version(
-      get_osdmap()->get_epoch(),
+      get_osdmap_epoch(),
       projected_last_update.version+1);
-    assert(at_version > info.last_update);
-    assert(at_version > pg_log.get_head());
-    assert(at_version > projected_last_update);
+    ceph_assert(at_version > info.last_update);
+    ceph_assert(at_version > pg_log.get_head());
+    ceph_assert(at_version > projected_last_update);
     return at_version;
   }
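// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: eversion_t orders updates by
// (epoch, version), and get_next_version() above bumps the version component
// under the current map epoch while asserting strict monotonicity against the
// last durable, logged, and projected versions.  A stand-alone analogue
// (names invented, lexicographic compare assumed):

#include <cassert>
#include <tuple>

struct Ver {
  unsigned epoch = 0;
  unsigned long version = 0;
  bool operator>(const Ver& o) const {
    return std::tie(epoch, version) > std::tie(o.epoch, o.version);
  }
};

Ver next_version(unsigned cur_epoch, const Ver& projected) {
  Ver at{cur_epoch, projected.version + 1};
  assert(at > projected);   // strictly newer, as get_next_version() asserts
  return at;
}
// ---------------------------------------------------------------------------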
 
@@ -2676,18 +3016,12 @@ public:
     eversion_t trim_to,
     eversion_t roll_forward_to,
     ObjectStore::Transaction &t,
-    bool transaction_applied = true);
+    bool transaction_applied = true,
+    bool async = false);
   bool check_log_for_corruption(ObjectStore *store);
 
   std::string get_corrupt_pg_log_name() const;
-  static int read_info(
-    ObjectStore *store, spg_t pgid, const coll_t &coll,
-    bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
-    __u8 &);
-  void read_state(ObjectStore *store, bufferlist &bl);
-  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
-  static int peek_map_epoch(ObjectStore *store, spg_t pgid,
-                           epoch_t *pepoch, bufferlist *bl);
+
   void update_snap_map(
     const vector<pg_log_entry_t> &log_entries,
     ObjectStore::Transaction& t);
@@ -2715,7 +3049,7 @@ public:
 
   /**
    * Merge entries updating missing as necessary on all
-   * actingbackfill logs and missings (also missing_loc)
+   * acting_recovery_backfill logs and missing sets (also missing_loc)
    */
   void merge_new_log_entries(
     const mempool::osd_pglog::list<pg_log_entry_t> &entries,
@@ -2731,14 +3065,8 @@ public:
     ObjectStore::Transaction *t);
   void on_new_interval();
   virtual void _on_new_interval() = 0;
-  void start_flush(ObjectStore::Transaction *t,
-                  list<Context *> *on_applied,
-                  list<Context *> *on_safe);
+  void start_flush(ObjectStore::Transaction *t);
   void set_last_peering_reset();
-  bool pg_has_reset_since(epoch_t e) {
-    assert(is_locked());
-    return deleting || e < get_last_peering_reset();
-  }
 
   void update_history(const pg_history_t& history);
   void fulfill_info(pg_shard_t from, const pg_query_t &query,
@@ -2765,14 +3093,14 @@ public:
   bool can_discard_replica_op(OpRequestRef& op);
 
   bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
-  bool old_peering_evt(CephPeeringEvtRef evt) {
+  bool old_peering_evt(PGPeeringEventRef evt) {
     return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
   }
   static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
     return e <= cur_epoch;
   }
   bool have_same_or_newer_map(epoch_t e) {
-    return e <= get_osdmap()->get_epoch();
+    return e <= get_osdmap_epoch();
   }
 
   bool op_has_sufficient_caps(OpRequestRef& op);
@@ -2780,67 +3108,21 @@ public:
 
   // recovery bits
   void take_waiters();
-  void queue_peering_event(CephPeeringEvtRef evt);
-  void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
-  void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
-                  pg_shard_t from, const pg_query_t& q);
-  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
-  void queue_flushed(epoch_t started_at);
-  void handle_advance_map(
-    OSDMapRef osdmap, OSDMapRef lastmap,
-    vector<int>& newup, int up_primary,
-    vector<int>& newacting, int acting_primary,
-    RecoveryCtx *rctx);
-  void handle_activate_map(RecoveryCtx *rctx);
-  void handle_create(RecoveryCtx *rctx);
-  void handle_loaded(RecoveryCtx *rctx);
-  void handle_query_state(Formatter *f);
-
-  virtual void on_removal(ObjectStore::Transaction *t) = 0;
 
 
   // abstract bits
-  virtual void do_request(
-    OpRequestRef& op,
-    ThreadPool::TPHandle &handle
-  ) = 0;
-
-  virtual void do_op(OpRequestRef& op) = 0;
-  virtual void do_sub_op(OpRequestRef op) = 0;
-  virtual void do_sub_op_reply(OpRequestRef op) = 0;
-  virtual void do_scan(
-    OpRequestRef op,
-    ThreadPool::TPHandle &handle
-  ) = 0;
-  virtual void do_backfill(OpRequestRef op) = 0;
-  virtual void snap_trimmer(epoch_t epoch_queued) = 0;
-
-  virtual int do_command(
-    cmdmap_t cmdmap,
-    ostream& ss,
-    bufferlist& idata,
-    bufferlist& odata,
-    ConnectionRef conn,
-    ceph_tid_t tid) = 0;
+  friend class FlushState;
 
   virtual void on_role_change() = 0;
   virtual void on_pool_change() = 0;
   virtual void on_change(ObjectStore::Transaction *t) = 0;
   virtual void on_activate() = 0;
   virtual void on_flushed() = 0;
-  virtual void on_shutdown() = 0;
   virtual void check_blacklisted_watchers() = 0;
-  virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
 
-  virtual bool agent_work(int max) = 0;
-  virtual bool agent_work(int max, int agent_flush_quota) = 0;
-  virtual void agent_stop() = 0;
-  virtual void agent_delay() = 0;
-  virtual void agent_clear() = 0;
-  virtual void agent_choose_mode_restart() = 0;
+  friend ostream& operator<<(ostream& out, const PG& pg);
 };
 
-ostream& operator<<(ostream& out, const PG& pg);
 
 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);