update sources to ceph Nautilus 14.2.1
diff --git a/ceph/src/osd/PG.h b/ceph/src/osd/PG.h
index dc64cedb15e58af0d2e23b067178426c3074412a..790c004bed753e514adf6b807070a86f0c21a20a 100644
--- a/ceph/src/osd/PG.h
+++ b/ceph/src/osd/PG.h
 #include <boost/scoped_ptr.hpp>
 #include <boost/circular_buffer.hpp>
 #include <boost/container/flat_set.hpp>
-#include "include/memory.h"
 #include "include/mempool.h"
 
 // re-include our assert to clobber boost's
-#include "include/assert.h" 
+#include "include/ceph_assert.h" 
 
 #include "include/types.h"
 #include "include/stringify.h"
@@ -44,6 +43,9 @@
 #include "messages/MOSDPGLog.h"
 #include "include/str_list.h"
 #include "PGBackend.h"
+#include "PGPeeringEvent.h"
+
+#include "mgr/OSDPerfMetricTypes.h"
 
 #include <atomic>
 #include <list>
 #include <stack>
 #include <string>
 #include <tuple>
-using namespace std;
-
-// #include "include/unordered_map.h"
-// #include "include/unordered_set.h"
 
 //#define DEBUG_RECOVERY_OIDS   // track set of recovering oids explicitly, to find counting bugs
+//#define PG_DEBUG_REFS    // track provenance of pg refs, helpful for finding leaks
 
 class OSD;
 class OSDService;
+class OSDShard;
+class OSDShardPGSlot;
 class MOSDOp;
 class MOSDPGScan;
 class MOSDPGBackfill;
@@ -70,14 +71,12 @@ struct OpRequest;
 typedef OpRequest::Ref OpRequestRef;
 class MOSDPGLog;
 class CephContext;
+class DynamicPerfStats;
 
 namespace Scrub {
   class Store;
 }
 
-void intrusive_ptr_add_ref(PG *pg);
-void intrusive_ptr_release(PG *pg);
-
 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
 using embedded_state = std::pair<utime_t, const char*>;
 
@@ -163,11 +162,11 @@ class PGRecoveryStats {
   PGRecoveryStats() : lock("PGRecoverStats::lock") {}
 
   void reset() {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     info.clear();
   }
   void dump(ostream& out) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
       per_state_info& i = p->second;
       out << i.enter << "\t" << i.exit << "\t"
@@ -179,7 +178,7 @@ class PGRecoveryStats {
   }
 
   void dump_formatted(Formatter *f) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     f->open_array_section("pg_recovery_stats");
     for (map<const char *,per_state_info>::iterator p = info.begin();
         p != info.end(); ++p) {
@@ -206,11 +205,11 @@ class PGRecoveryStats {
   }
 
   void log_enter(const char *s) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     info[s].enter++;
   }
   void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
-    Mutex::Locker l(lock);
+    std::lock_guard l(lock);
     per_state_info &i = info[s];
     i.exit++;
     i.total_time += dur;
@@ -228,28 +227,28 @@ struct PGPool {
   epoch_t cached_epoch;
   int64_t id;
   string name;
-  uint64_t auid;
 
   pg_pool_t info;      
   SnapContext snapc;   // the default pool snapc, ready to go.
 
+  // these two sets are for < mimic only
   interval_set<snapid_t> cached_removed_snaps;      // current removed_snaps set
   interval_set<snapid_t> newly_removed_snaps;  // newly removed in the last epoch
 
-  PGPool(CephContext* cct, OSDMapRef map, int64_t i)
+  PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
+        const string& name)
     : cct(cct),
       cached_epoch(map->get_epoch()),
       id(i),
-      name(map->get_pool_name(id)),
-      auid(map->get_pg_pool(id)->auid) {
-    const pg_pool_t *pi = map->get_pg_pool(id);
-    assert(pi);
-    info = *pi;
-    snapc = pi->get_snap_context();
-    pi->build_removed_snaps(cached_removed_snaps);
+      name(name),
+      info(info) {
+    snapc = info.get_snap_context();
+    if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
+      info.build_removed_snaps(cached_removed_snaps);
+    }
   }
 
-  void update(OSDMapRef map);
+  void update(CephContext *cct, OSDMapRef map);
 };
 
 /** PG - Replica Placement Group
@@ -258,130 +257,369 @@ struct PGPool {
 
 class PG : public DoutPrefixProvider {
 public:
+  // -- members --
+  const spg_t pg_id;
+  const coll_t coll;
+
+  ObjectStore::CollectionHandle ch;
+
+  struct RecoveryCtx;
+
+  // -- methods --
+  std::ostream& gen_prefix(std::ostream& out) const override;
+  CephContext *get_cct() const override {
+    return cct;
+  }
+  unsigned get_subsys() const override {
+    return ceph_subsys_osd;
+  }
+
+  const OSDMapRef& get_osdmap() const {
+    ceph_assert(is_locked());
+    ceph_assert(osdmap_ref);
+    return osdmap_ref;
+  }
+  epoch_t get_osdmap_epoch() const {
+    return osdmap_ref->get_epoch();
+  }
+
+  void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
+    handle.suspend_tp_timeout();
+    lock();
+    handle.reset_tp_timeout();
+  }
+  void lock(bool no_lockdep = false) const;
+  void unlock() const {
+    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
+    ceph_assert(!dirty_info);
+    ceph_assert(!dirty_big_info);
+    _lock.Unlock();
+  }
+  bool is_locked() const {
+    return _lock.is_locked();
+  }
+
+  const spg_t& get_pgid() const {
+    return pg_id;
+  }
+
+  const PGPool& get_pool() const {
+    return pool;
+  }
+  uint64_t get_last_user_version() const {
+    return info.last_user_version;
+  }
+  const pg_history_t& get_history() const {
+    return info.history;
+  }
+  bool get_need_up_thru() const {
+    return need_up_thru;
+  }
+  epoch_t get_same_interval_since() const {
+    return info.history.same_interval_since;
+  }
+
+  void set_last_scrub_stamp(utime_t t) {
+    info.stats.last_scrub_stamp = t;
+    info.history.last_scrub_stamp = t;
+  }
+
+  void set_last_deep_scrub_stamp(utime_t t) {
+    info.stats.last_deep_scrub_stamp = t;
+    info.history.last_deep_scrub_stamp = t;
+  }
+
+  bool is_deleting() const {
+    return deleting;
+  }
+  bool is_deleted() const {
+    return deleted;
+  }
+  bool is_replica() const {
+    return role > 0;
+  }
+  bool is_primary() const {
+    return pg_whoami == primary;
+  }
+  bool pg_has_reset_since(epoch_t e) {
+    ceph_assert(is_locked());
+    return deleted || e < get_last_peering_reset();
+  }
+
+  bool is_ec_pg() const {
+    return pool.info.is_erasure();
+  }
+  int get_role() const {
+    return role;
+  }
+  const vector<int> get_acting() const {
+    return acting;
+  }
+  int get_acting_primary() const {
+    return primary.osd;
+  }
+  pg_shard_t get_primary() const {
+    return primary;
+  }
+  const vector<int> get_up() const {
+    return up;
+  }
+  int get_up_primary() const {
+    return up_primary.osd;
+  }
+  const PastIntervals& get_past_intervals() const {
+    return past_intervals;
+  }
+
+  /// initialize created PG
+  void init(
+    int role,
+    const vector<int>& up,
+    int up_primary,
+    const vector<int>& acting,
+    int acting_primary,
+    const pg_history_t& history,
+    const PastIntervals& pim,
+    bool backfill,
+    ObjectStore::Transaction *t);
+
+  /// read existing pg state off disk
+  void read_state(ObjectStore *store);
+  static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
+
+  static int get_latest_struct_v() {
+    return latest_struct_v;
+  }
+  static int get_compat_struct_v() {
+    return compat_struct_v;
+  }
+  static int read_info(
+    ObjectStore *store, spg_t pgid, const coll_t &coll,
+    pg_info_t &info, PastIntervals &past_intervals,
+    __u8 &);
+  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
+
+  void rm_backoff(BackoffRef b);
+
+  void update_snap_mapper_bits(uint32_t bits) {
+    snap_mapper.update_bits(bits);
+  }
+  void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
+  virtual void split_colls(
+    spg_t child,
+    int split_bits,
+    int seed,
+    const pg_pool_t *pool,
+    ObjectStore::Transaction *t) = 0;
+  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
+  void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
+                 unsigned split_bits,
+                 const pg_merge_meta_t& last_pg_merge_meta);
+  void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);
+
+  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
+  void reg_next_scrub();
+  void unreg_next_scrub();
+
+  bool is_forced_recovery_or_backfill() const {
+    return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
+  }
   bool set_force_recovery(bool b);
   bool set_force_backfill(bool b);
 
+  void queue_peering_event(PGPeeringEventRef evt);
+  void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx);
+  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
+  void queue_flushed(epoch_t started_at);
+  void handle_advance_map(
+    OSDMapRef osdmap, OSDMapRef lastmap,
+    vector<int>& newup, int up_primary,
+    vector<int>& newacting, int acting_primary,
+    RecoveryCtx *rctx);
+  void handle_activate_map(RecoveryCtx *rctx);
+  void handle_initialize(RecoveryCtx *rctx);
+  void handle_query_state(Formatter *f);
+
+  /**
+   * @param ops_begun returns how many recovery ops the function started
+   * @returns true if any useful work was accomplished; false otherwise
+   */
+  virtual bool start_recovery_ops(
+    uint64_t max,
+    ThreadPool::TPHandle &handle,
+    uint64_t *ops_begun) = 0;
+
+  // more work after the above, but with a RecoveryCtx
+  void find_unfound(epoch_t queued, RecoveryCtx *rctx);
+
+  virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
+
+  void dump_pgstate_history(Formatter *f);
+  void dump_missing(Formatter *f);
+
+  void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
+  void with_heartbeat_peers(std::function<void(int)> f);
+
+  void shutdown();
+  virtual void on_shutdown() = 0;
+
+  bool get_must_scrub() const {
+    return scrubber.must_scrub;
+  }
+  bool sched_scrub();
+
+  virtual void do_request(
+    OpRequestRef& op,
+    ThreadPool::TPHandle &handle
+  ) = 0;
+  virtual void clear_cache() = 0;
+  virtual int get_cache_obj_count() = 0;
+
+  virtual void snap_trimmer(epoch_t epoch_queued) = 0;
+  virtual int do_command(
+    cmdmap_t cmdmap,
+    ostream& ss,
+    bufferlist& idata,
+    bufferlist& odata,
+    ConnectionRef conn,
+    ceph_tid_t tid) = 0;
+
+  virtual bool agent_work(int max) = 0;
+  virtual bool agent_work(int max, int agent_flush_quota) = 0;
+  virtual void agent_stop() = 0;
+  virtual void agent_delay() = 0;
+  virtual void agent_clear() = 0;
+  virtual void agent_choose_mode_restart() = 0;
+
+  virtual void on_removal(ObjectStore::Transaction *t) = 0;
+
+  void _delete_some(ObjectStore::Transaction *t);
+
+  virtual void set_dynamic_perf_stats_queries(
+    const std::list<OSDPerfMetricQuery> &queries) {
+  }
+  virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
+  }
+
+  // reference counting
+#ifdef PG_DEBUG_REFS
+  uint64_t get_with_id();
+  void put_with_id(uint64_t);
+  void dump_live_ids();
+#endif
+  void get(const char* tag);
+  void put(const char* tag);
+  int get_num_ref() {
+    return ref;
+  }
+
+  // ctor
+  PG(OSDService *o, OSDMapRef curmap,
+     const PGPool &pool, spg_t p);
+  ~PG() override;
+
+  // prevent copying
+  explicit PG(const PG& rhs) = delete;
+  PG& operator=(const PG& rhs) = delete;
+
 protected:
+  // -------------
+  // protected
   OSDService *osd;
+public:
+  OSDShard *osd_shard = nullptr;
+  OSDShardPGSlot *pg_slot = nullptr;
+protected:
   CephContext *cct;
+
+  // osdmap
+  OSDMapRef osdmap_ref;
+
+  PGPool pool;
+
+  // locking and reference counting.
+  // I destroy myself when the reference count hits zero.
+  // lock() should be called before doing anything.
+  // get() should be called on pointer copy (to another thread, etc.).
+  // put() should be called on destruction of some previously copied pointer.
+  // unlock() when done with the current pointer (_most common_).
+  mutable Mutex _lock = {"PG::_lock"};
+
+  std::atomic<unsigned int> ref{0};
+
+#ifdef PG_DEBUG_REFS
+  Mutex _ref_id_lock = {"PG::_ref_id_lock"};
+  map<uint64_t, string> _live_ids;
+  map<string, uint64_t> _tag_counts;
+  uint64_t _ref_id = 0;
+
+  friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
+  friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
+#endif
+
+private:
+  friend void intrusive_ptr_add_ref(PG *pg) {
+    pg->get("intptr");
+  }
+  friend void intrusive_ptr_release(PG *pg) {
+    pg->put("intptr");
+  }
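
 // Illustrative sketch (not part of the patch): these friends are what let
 // boost::intrusive_ptr<PG> (aka PGRef) drive the tagged refcount, and the
 // usual access pattern pairs a ref with the PG lock:
 //
 //   PGRef pg = get_pg_somehow();  // intrusive_ptr_add_ref -> get("intptr")
 //   pg->lock();
 //   // ...read or mutate PG state; persist dirty_info before unlocking...
 //   pg->unlock();
 //   pg.reset();                   // intrusive_ptr_release -> put("intptr")
 //
 // get_pg_somehow() is a placeholder; lookup normally goes through the OSD.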
+
+
+  // =====================
+
+protected:
   OSDriver osdriver;
   SnapMapper snap_mapper;
   bool eio_errors_to_process = false;
 
   virtual PGBackend *get_pgbackend() = 0;
-public:
-  std::string gen_prefix() const override;
-  CephContext *get_cct() const override { return cct; }
-  unsigned get_subsys() const override { return ceph_subsys_osd; }
+  virtual const PGBackend* get_pgbackend() const = 0;
 
+protected:
   /*** PG ****/
-  void update_snap_mapper_bits(uint32_t bits) {
-    snap_mapper.update_bits(bits);
-  }
   /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
-  IsPGRecoverablePredicate *get_is_recoverable_predicate() {
+  IsPGRecoverablePredicate *get_is_recoverable_predicate() const {
     return get_pgbackend()->get_is_recoverable_predicate();
   }
 protected:
-  OSDMapRef osdmap_ref;
-  OSDMapRef last_persisted_osdmap_ref;
-  PGPool pool;
+  epoch_t last_persisted_osdmap;
 
   void requeue_map_waiters();
 
   void update_osdmap_ref(OSDMapRef newmap) {
-    assert(_lock.is_locked_by_me());
+    ceph_assert(_lock.is_locked_by_me());
     osdmap_ref = std::move(newmap);
   }
 
-public:
-  OSDMapRef get_osdmap() const {
-    assert(is_locked());
-    assert(osdmap_ref);
-    return osdmap_ref;
-  }
 protected:
 
-  /** locking and reference counting.
-   * I destroy myself when the reference count hits zero.
-   * lock() should be called before doing anything.
-   * get() should be called on pointer copy (to another thread, etc.).
-   * put() should be called on destruction of some previously copied pointer.
-   * unlock() when done with the current pointer (_most common_).
-   */  
-  mutable Mutex _lock;
-  std::atomic_uint ref{0};
-
-#ifdef PG_DEBUG_REFS
-  Mutex _ref_id_lock;
-  map<uint64_t, string> _live_ids;
-  map<string, uint64_t> _tag_counts;
-  uint64_t _ref_id;
-#endif
 
-public:
  bool deleting;  // true while the PG is being removed or the OSD is shutting down
+  atomic<bool> deleted = {false};
 
   ZTracer::Endpoint trace_endpoint;
 
-  void lock_suspend_timeout(ThreadPool::TPHandle &handle);
-  void lock(bool no_lockdep = false) const;
-  void unlock() const {
-    //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
-    assert(!dirty_info);
-    assert(!dirty_big_info);
-    _lock.Unlock();
-  }
-
-  bool is_locked() const {
-    return _lock.is_locked();
-  }
-
-#ifdef PG_DEBUG_REFS
-  uint64_t get_with_id();
-  void put_with_id(uint64_t);
-  void dump_live_ids();
-#endif
-  void get(const char* tag);
-  void put(const char* tag);
 
+protected:
   bool dirty_info, dirty_big_info;
 
-public:
-  bool is_ec_pg() const {
-    return pool.info.ec_pool();
-  }
+protected:
   // pg state
   pg_info_t info;               ///< current pg info
   pg_info_t last_written_info;  ///< last written info
-  __u8 info_struct_v;
-  static const __u8 cur_struct_v = 10;
+  __u8 info_struct_v = 0;
+  static const __u8 latest_struct_v = 10;
   // v10 is the new past_intervals encoding
   // v9 was fastinfo_key addition
   // v8 was the move to a per-pg pgmeta object
   // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
   // (first appeared in cuttlefish).
-  static const __u8 compat_struct_v = 7;
-  bool must_upgrade() {
-    return info_struct_v < cur_struct_v;
-  }
-  bool can_upgrade() {
-    return info_struct_v >= compat_struct_v;
-  }
+  static const __u8 compat_struct_v = 10;
   void upgrade(ObjectStore *store);
 
-  const coll_t coll;
-  ObjectStore::CollectionHandle ch;
+protected:
   PGLog  pg_log;
-  static string get_info_key(spg_t pgid) {
-    return stringify(pgid) + "_info";
-  }
-  static string get_biginfo_key(spg_t pgid) {
-    return stringify(pgid) + "_biginfo";
-  }
-  static string get_epoch_key(spg_t pgid) {
-    return stringify(pgid) + "_epoch";
-  }
   ghobject_t    pgmeta_oid;
 
   // ------------------
@@ -402,8 +640,8 @@ public:
                   (l.other < r.other)));
       }
       friend ostream& operator<<(ostream& out, const loc_count_t& l) {
-       assert(l.up >= 0);
-       assert(l.other >= 0);
+       ceph_assert(l.up >= 0);
+       ceph_assert(l.other >= 0);
        return out << "(" << l.up << "+" << l.other << ")";
       }
     };
@@ -458,7 +696,7 @@ public:
       pgs_by_shard_id(s, pgsbs);
       for (auto shard: pgsbs) {
         auto p = missing_by_count[shard.first].find(_get_count(shard.second));
-        assert(p != missing_by_count[shard.first].end());
+        ceph_assert(p != missing_by_count[shard.first].end());
         if (--p->second == 0) {
          missing_by_count[shard.first].erase(p);
         }
@@ -478,7 +716,9 @@ public:
       is_readable.reset(_is_readable);
       is_recoverable.reset(_is_recoverable);
     }
-    string gen_prefix() const { return pg->gen_prefix(); }
+    std::ostream& gen_prefix(std::ostream& out) const {
+      return pg->gen_prefix(out);
+    }
     bool needs_recovery(
       const hobject_t &hoid,
       eversion_t *v = 0) const {
@@ -497,9 +737,15 @@ public:
       return i->second.is_delete();
     }
     bool is_unfound(const hobject_t &hoid) const {
-      return needs_recovery(hoid) && !is_deleted(hoid) && (
-       !missing_loc.count(hoid) ||
-       !(*is_recoverable)(missing_loc.find(hoid)->second));
+      auto it = needs_recovery_map.find(hoid);
+      if (it == needs_recovery_map.end()) {
+        return false;
+      }
+      if (it->second.is_delete()) {
+        return false;
+      }
+      auto mit = missing_loc.find(hoid);
+      return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
     }
     bool readable_with_acting(
       const hobject_t &hoid,
@@ -510,7 +756,10 @@ public:
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
-       if (is_unfound(i->first))
+       if (i->second.is_delete())
+         continue;
+       auto mi = missing_loc.find(i->first);
+       if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          ++ret;
       }
       return ret;
@@ -521,7 +770,10 @@ public:
             needs_recovery_map.begin();
           i != needs_recovery_map.end();
           ++i) {
-       if (is_unfound(i->first))
+        if (i->second.is_delete())
+          continue;
+        auto mi = missing_loc.find(i->first);
+        if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
          return true;
       }
       return false;
@@ -548,9 +800,22 @@ public:
       if (p != missing_loc.end()) {
        _dec_count(p->second);
        p->second.erase(location);
-       _inc_count(p->second);
+        if (p->second.empty()) {
+          missing_loc.erase(p);
+        } else {
+          _inc_count(p->second);
+        }
       }
     }
+
+    void clear_location(const hobject_t &hoid) {
+      auto p = missing_loc.find(hoid);
+      if (p != missing_loc.end()) {
+       _dec_count(p->second);
+        missing_loc.erase(p);
+      }
+    }
+
     void add_active_missing(const pg_missing_t &missing) {
       for (map<hobject_t, pg_missing_item>::const_iterator i =
             missing.get_items().begin();
@@ -564,17 +829,18 @@ public:
          lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
                                    << i->first << " have " << j->second
                                    << " tried to add " << i->second << dendl;
-         assert(i->second.need == j->second.need);
+         ceph_assert(i->second.need == j->second.need);
        }
       }
     }
 
-    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
-      needs_recovery_map[hoid] = pg_missing_item(need, have);
+    void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
+      needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
     }
     void revise_need(const hobject_t &hoid, eversion_t need) {
-      assert(needs_recovery(hoid));
-      needs_recovery_map[hoid].need = need;
+      auto it = needs_recovery_map.find(hoid);
+      ceph_assert(it != needs_recovery_map.end());
+      it->second.need = need;
     }
 
     /// Adds info about a possible recovery source
@@ -623,7 +889,7 @@ public:
          if (i == self)
            continue;
          auto pmiter = pmissing.find(i);
-         assert(pmiter != pmissing.end());
+         ceph_assert(pmiter != pmissing.end());
          miter = pmiter->second.get_items().find(hoid);
          if (miter != pmiter->second.get_items().end()) {
            item = miter->second;
@@ -639,13 +905,15 @@ public:
        return;
       auto mliter =
        missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
-      assert(info.last_backfill.is_max());
-      assert(info.last_update >= item->need);
+      ceph_assert(info.last_backfill.is_max());
+      ceph_assert(info.last_update >= item->need);
       if (!missing.is_missing(hoid))
        mliter->second.insert(self);
       for (auto &&i: pmissing) {
+       if (i.first == self)
+         continue;
        auto pinfoiter = pinfo.find(i.first);
-       assert(pinfoiter != pinfo.end());
+       ceph_assert(pinfoiter != pinfo.end());
        if (item->need <= pinfoiter->second.last_update &&
            hoid <= pinfoiter->second.last_backfill &&
            !i.second.is_missing(hoid))
@@ -655,8 +923,8 @@ public:
     }
 
     const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
-      return missing_loc.count(hoid) ?
-       missing_loc.find(hoid)->second : empty_set;
+      auto it = missing_loc.find(hoid);
+      return it == missing_loc.end() ? empty_set : it->second;
     }
     const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
       return missing_loc;
@@ -682,55 +950,40 @@ public:
   int recovery_ops_active;
   set<pg_shard_t> waiting_on_backfill;
 #ifdef DEBUG_RECOVERY_OIDS
-  set<hobject_t> recovering_oids;
+  multiset<hobject_t> recovering_oids;
 #endif
 
 protected:
   int         role;    // 0 = primary, 1 = replica, -1=none.
-  unsigned    state;   // PG_STATE_*
+  uint64_t    state;   // PG_STATE_*
 
   bool send_notify;    ///< true if we are non-primary and should notify the primary
 
-public:
+protected:
   eversion_t  last_update_ondisk;    // last_update that has committed; ONLY DEFINED WHEN is_active()
   eversion_t  last_complete_ondisk;  // last_complete that has committed.
   eversion_t  last_update_applied;
 
-
-  struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
-    PGRef pg;
-    epoch_t e;
-    eversion_t v;
-    C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
-      : pg(pg), e(e), v(v) {}
-    void finish(int) override {
-      pg->lock();
-      if (!pg->pg_has_reset_since(e)) {
-       pg->last_rollback_info_trimmed_to_applied = v;
-      }
-      pg->unlock();
-    }
-  };
-  // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
-  // and the transaction has applied
+  // entries <= last_rollback_info_trimmed_to_applied have been trimmed
   eversion_t  last_rollback_info_trimmed_to_applied;
 
   // primary state
- public:
+protected:
   pg_shard_t primary;
   pg_shard_t pg_whoami;
   pg_shard_t up_primary;
   vector<int> up, acting, want_acting;
-  set<pg_shard_t> actingbackfill, actingset, upset;
+  // acting_recovery_backfill contains shards that are acting,
+  // async recovery targets, or backfill targets.
+  set<pg_shard_t> acting_recovery_backfill, actingset, upset;
   map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
   eversion_t  min_last_complete_ondisk;  // up: min over last_complete_ondisk, peer_last_complete_ondisk
   eversion_t  pg_trim_to;
 
   set<int> blocked_by; ///< osds we are blocked by (for pg stats)
 
+protected:
   // [primary only] content recovery state
-
-public:    
   struct BufferedRecoveryMessages {
     map<int, map<spg_t, pg_query_t> > query_map;
     map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
@@ -744,9 +997,6 @@ public:
     map<int, map<spg_t, pg_query_t> > *query_map;
     map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
     map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
-    set<PGRef> created_pgs;
-    C_Contexts *on_applied;
-    C_Contexts *on_safe;
     ObjectStore::Transaction *transaction;
     ThreadPool::TPHandle* handle;
     RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
@@ -754,13 +1004,9 @@ public:
                    vector<pair<pg_notify_t, PastIntervals> > > *info_map,
                map<int,
                    vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
-               C_Contexts *on_applied,
-               C_Contexts *on_safe,
                ObjectStore::Transaction *transaction)
       : query_map(query_map), info_map(info_map), 
        notify_list(notify_list),
-       on_applied(on_applied),
-       on_safe(on_safe),
        transaction(transaction),
         handle(NULL) {}
 
@@ -768,15 +1014,13 @@ public:
       : query_map(&(buf.query_map)),
        info_map(&(buf.info_map)),
        notify_list(&(buf.notify_list)),
-       on_applied(rctx.on_applied),
-       on_safe(rctx.on_safe),
        transaction(rctx.transaction),
         handle(rctx.handle) {}
 
     void accept_buffered_messages(BufferedRecoveryMessages &m) {
-      assert(query_map);
-      assert(info_map);
-      assert(notify_list);
+      ceph_assert(query_map);
+      ceph_assert(info_map);
+      ceph_assert(notify_list);
       for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
           i != m.query_map.end();
           ++i) {
@@ -809,11 +1053,11 @@ public:
 
     void send_notify(pg_shard_t to,
                     const pg_notify_t &info, const PastIntervals &pi) {
-      assert(notify_list);
+      ceph_assert(notify_list);
       (*notify_list)[to.osd].push_back(make_pair(info, pi));
     }
   };
-
+protected:
 
   PGStateHistory pgstate_history;
 
@@ -840,8 +1084,8 @@ protected:
   
   bool        need_up_thru;
   set<pg_shard_t>    stray_set;   // non-acting osds that have PG data.
-  eversion_t  oldest_update; // acting: lowest (valid) last_update in active set
   map<pg_shard_t, pg_info_t>    peer_info;   // info from peers (stray or prior)
+  map<pg_shard_t, int64_t>    peer_bytes;   // Peer's num_bytes from peer_info
   set<pg_shard_t> peer_purged; // peers purged
   map<pg_shard_t, pg_missing_t> peer_missing;
   set<pg_shard_t> peer_log_requested;  // logs i've requested (and start stamps)
@@ -855,15 +1099,19 @@ protected:
                                        // which are unfound on the primary
   epoch_t last_peering_reset;
 
+  epoch_t get_last_peering_reset() const {
+    return last_peering_reset;
+  }
 
   /* heartbeat peers */
   void set_probe_targets(const set<pg_shard_t> &probe_set);
   void clear_probe_targets();
-public:
+
   Mutex heartbeat_peer_lock;
   set<int> heartbeat_peers;
   set<int> probe_targets;
 
+public:
   /**
    * BackfillInterval
    *
@@ -925,7 +1173,7 @@ public:
 
     /// drop first entry, and adjust @begin accordingly
     void pop_front() {
-      assert(!objects.empty());
+      ceph_assert(!objects.empty());
       objects.erase(objects.begin());
       trim();
     }
@@ -954,15 +1202,107 @@ protected:
   bool backfill_reserved;
   bool backfill_reserving;
 
-  friend class OSD;
+  set<pg_shard_t> backfill_targets,  async_recovery_targets;
 
-public:
-  set<pg_shard_t> backfill_targets;
+  // The primary's num_bytes and local num_bytes for this pg, only valid
+  // during backfill for non-primary shards.
+  // Both of these are adjusted for EC to reflect the on-disk bytes
+  std::atomic<int64_t> primary_num_bytes = 0;
+  std::atomic<int64_t> local_num_bytes = 0;
 
+public:
   bool is_backfill_targets(pg_shard_t osd) {
     return backfill_targets.count(osd);
   }
 
+  // Space reserved for backfill is primary_num_bytes - local_num_bytes.
+  // It doesn't matter that the difference itself isn't computed atomically.
+  uint64_t get_reserved_num_bytes() {
+    int64_t primary = primary_num_bytes.load();
+    int64_t local = local_num_bytes.load();
+    if (primary > local)
+      return primary - local;
+    else
+      return 0;
+  }
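
 // Worked example with made-up numbers: if the primary reports
 // primary_num_bytes = 300 MiB and this shard has written
 // local_num_bytes = 120 MiB so far, get_reserved_num_bytes() returns
 // 300 - 120 = 180 MiB of still-outstanding backfill space; once local
 // catches up to (or exceeds) the primary's figure, it returns 0.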
+
+  bool is_remote_backfilling() {
+    return primary_num_bytes.load() > 0;
+  }
+
+  void set_reserved_num_bytes(int64_t primary, int64_t local);
+  void clear_reserved_num_bytes();
+
+  // If the num_bytes values are inconsistent and local_num_bytes would go
+  // negative, that's ok, because a negative value is clamped and ignored.
+
+  // The value of num_bytes could be negative,
+  // but we don't let local_num_bytes go negative.
+  void add_local_num_bytes(int64_t num_bytes) {
+    if (num_bytes) {
+      int64_t prev_bytes = local_num_bytes.load();
+      int64_t new_bytes;
+      do {
+        new_bytes = prev_bytes + num_bytes;
+        if (new_bytes < 0)
+          new_bytes = 0;
+      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+    }
+  }
+  void sub_local_num_bytes(int64_t num_bytes) {
+    ceph_assert(num_bytes >= 0);
+    if (num_bytes) {
+      int64_t prev_bytes = local_num_bytes.load();
+      int64_t new_bytes;
+      do {
+        new_bytes = prev_bytes - num_bytes;
+        if (new_bytes < 0)
+          new_bytes = 0;
+      } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
+    }
+  }
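
 // Illustrative sketch (not part of the patch): the clamp-at-zero CAS loop
 // used by add/sub_local_num_bytes, reduced to a free function.
 // compare_exchange_weak reloads prev on failure, so the loop simply
 // recomputes the clamped value until it publishes successfully:
 //
 //   void adjust_clamped(std::atomic<int64_t>& v, int64_t delta) {
 //     int64_t prev = v.load(), next;
 //     do {
 //       next = std::max<int64_t>(prev + delta, 0);  // never below zero
 //     } while (!v.compare_exchange_weak(prev, next));
 //   }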
+  // The value of num_bytes could be negative,
+  // but we don't let info.stats.stats.sum.num_bytes go negative.
+  void add_num_bytes(int64_t num_bytes) {
+    ceph_assert(_lock.is_locked_by_me());
+    if (num_bytes) {
+      info.stats.stats.sum.num_bytes += num_bytes;
+      if (info.stats.stats.sum.num_bytes < 0) {
+        info.stats.stats.sum.num_bytes = 0;
+      }
+    }
+  }
+  void sub_num_bytes(int64_t num_bytes) {
+    ceph_assert(_lock.is_locked_by_me());
+    ceph_assert(num_bytes >= 0);
+    if (num_bytes) {
+      info.stats.stats.sum.num_bytes -= num_bytes;
+      if (info.stats.stats.sum.num_bytes < 0) {
+        info.stats.stats.sum.num_bytes = 0;
+      }
+    }
+  }
+
+  // Only used in testing so not worried about needing the PG lock here
+  int64_t get_stats_num_bytes() {
+    Mutex::Locker l(_lock);
+    int64_t num_bytes = info.stats.stats.sum.num_bytes;
+    if (pool.info.is_erasure()) {
+      num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
+      // Round up each object by a stripe
+      num_bytes +=  get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
+    }
+    int64_t lnb = local_num_bytes.load();
+    if (lnb && lnb != num_bytes) {
+      lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
+                           << lnb << " vs stats "
+                            << info.stats.stats.sum.num_bytes << " / chunk "
+                            << get_pgbackend()->get_ec_data_chunk_count()
+                            << dendl;
+    }
+    return num_bytes;
+  }
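
 // Worked example with made-up values: on a k=4 erasure-coded pool with a
 // 64 KiB stripe chunk, sum.num_bytes = 400 MiB and num_objects = 100 give
 // 400 MiB / 4 = 100 MiB per shard, plus 100 * 64 KiB = 6.25 MiB of
 // per-object stripe rounding, i.e. ~106.25 MiB expected on disk.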
+
 protected:
 
   /*
@@ -1049,7 +1389,7 @@ protected:
   map<hobject_t, list<Context*>> callbacks_for_degraded_object;
 
   map<eversion_t,
-      list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
+      list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk;
 
   void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
   void requeue_op(OpRequestRef op);
@@ -1063,26 +1403,22 @@ protected:
   bool pg_stats_publish_valid;
   pg_stat_t pg_stats_publish;
 
-  // for ordering writes
-  ceph::shared_ptr<ObjectStore::Sequencer> osr;
-
   void _update_calc_stats();
   void _update_blocked_by();
   friend class TestOpsSocketHook;
   void publish_stats_to_osd();
   void clear_publish_stats();
 
-public:
   void clear_primary_state();
 
-  bool is_actingbackfill(pg_shard_t osd) const {
-    return actingbackfill.count(osd);
+  bool is_acting_recovery_backfill(pg_shard_t osd) const {
+    return acting_recovery_backfill.count(osd);
   }
   bool is_acting(pg_shard_t osd) const {
-    return has_shard(pool.info.ec_pool(), acting, osd);
+    return has_shard(pool.info.is_erasure(), acting, osd);
   }
   bool is_up(pg_shard_t osd) const {
-    return has_shard(pool.info.ec_pool(), up, osd);
+    return has_shard(pool.info.is_erasure(), up, osd);
   }
   static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
     if (ec) {
@@ -1101,18 +1437,20 @@ public:
   unsigned get_recovery_priority();
   /// get backfill reservation priority
   unsigned get_backfill_priority();
+  /// get priority for pg deletion
+  unsigned get_delete_priority();
 
-  void mark_clean();  ///< mark an active pg clean
+  void try_mark_clean();  ///< mark an active pg clean
 
   /// return [start,end) bounds for required past_intervals
   static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
     const pg_info_t &info,
     epoch_t oldest_map) {
-    epoch_t start = MAX(
+    epoch_t start = std::max(
       info.history.last_epoch_clean ? info.history.last_epoch_clean :
        info.history.epoch_pool_created,
       oldest_map);
-    epoch_t end = MAX(
+    epoch_t end = std::max(
       info.history.same_interval_since,
       info.history.epoch_pool_created);
     return make_pair(start, end);
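
 // Worked example with made-up epochs: last_epoch_clean = 100,
 // oldest_map = 150, same_interval_since = 180, epoch_pool_created = 50
 // give start = max(100, 150) = 150 and end = max(180, 50) = 180, so
 // past intervals are required for [150, 180).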
@@ -1127,23 +1465,23 @@ public:
   bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
   virtual void dump_recovery_info(Formatter *f) const = 0;
 
-  bool calc_min_last_complete_ondisk() {
+  void calc_min_last_complete_ondisk() {
     eversion_t min = last_complete_ondisk;
-    assert(!actingbackfill.empty());
-    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
-        i != actingbackfill.end();
+    ceph_assert(!acting_recovery_backfill.empty());
+    for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
+        i != acting_recovery_backfill.end();
         ++i) {
       if (*i == get_primary()) continue;
       if (peer_last_complete_ondisk.count(*i) == 0)
-       return false;   // we don't have complete info
+       return;   // we don't have complete info
       eversion_t a = peer_last_complete_ondisk[*i];
       if (a < min)
        min = a;
     }
     if (min == min_last_complete_ondisk)
-      return false;
+      return;
     min_last_complete_ondisk = min;
-    return true;
+    return;
   }
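
 // Worked example with made-up versions: if last_complete_ondisk on the
 // primary is 122'50200 and peer_last_complete_ondisk records 122'50180
 // and 122'50195 for the other shards, min_last_complete_ondisk becomes
 // 122'50180. If any acting_recovery_backfill peer has not reported yet,
 // the function returns early and leaves the value unchanged.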
 
   virtual void calc_trim_to() = 0;
@@ -1170,7 +1508,7 @@ public:
       pg->get_pgbackend()->try_stash(hoid, v, t);
     }
     void rollback(const pg_log_entry_t &entry) override {
-      assert(entry.can_rollback());
+      ceph_assert(entry.can_rollback());
       pg->get_pgbackend()->rollback(entry, t);
     }
     void rollforward(const pg_log_entry_t &entry) override {
@@ -1197,13 +1535,8 @@ public:
     pg_shard_t fromosd,
     RecoveryCtx*);
 
-  void check_for_lost_objects();
-  void forget_lost_objects();
-
   void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
   
-  void trim_write_ahead();
-
   map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
     const map<pg_shard_t, pg_info_t> &infos,
     bool restrict_to_up_acting,
@@ -1212,21 +1545,18 @@ public:
     map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
     unsigned size,
     const vector<int> &acting,
-    pg_shard_t acting_primary,
     const vector<int> &up,
-    pg_shard_t up_primary,
     const map<pg_shard_t, pg_info_t> &all_info,
     bool restrict_to_up_acting,
     vector<int> *want,
     set<pg_shard_t> *backfill,
     set<pg_shard_t> *acting_backfill,
-    pg_shard_t *want_primary,
     ostream &ss);
   static void calc_replicated_acting(
     map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
+    uint64_t force_auth_primary_missing_objects,
     unsigned size,
     const vector<int> &acting,
-    pg_shard_t acting_primary,
     const vector<int> &up,
     pg_shard_t up_primary,
     const map<pg_shard_t, pg_info_t> &all_info,
@@ -1234,8 +1564,18 @@ public:
     vector<int> *want,
     set<pg_shard_t> *backfill,
     set<pg_shard_t> *acting_backfill,
-    pg_shard_t *want_primary,
+    const OSDMapRef osdmap,
     ostream &ss);
+  void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
+                                const pg_info_t &auth_info,
+                                vector<int> *want,
+                                set<pg_shard_t> *async_recovery) const;
+  void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
+                                        const pg_info_t &auth_info,
+                                        vector<int> *want,
+                                        set<pg_shard_t> *async_recovery) const;
+
+  bool recoverable_and_ge_min_size(const vector<int> &want) const;
   bool choose_acting(pg_shard_t &auth_log_shard,
                     bool restrict_to_up_acting,
                     bool *history_les_bound);
@@ -1243,11 +1583,21 @@ public:
   void activate(
     ObjectStore::Transaction& t,
     epoch_t activation_epoch,
-    list<Context*>& tfin,
     map<int, map<spg_t,pg_query_t> >& query_map,
     map<int,
       vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
     RecoveryCtx *ctx);
+
+  struct C_PG_ActivateCommitted : public Context {
+    PGRef pg;
+    epoch_t epoch;
+    epoch_t activation_epoch;
+    C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
+      : pg(p), epoch(e), activation_epoch(ae) {}
+    void finish(int r) override {
+      pg->_activate_committed(epoch, activation_epoch);
+    }
+  };
   void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
   void all_activated_and_committed();
 
@@ -1262,23 +1612,21 @@ public:
 
   virtual void check_local() = 0;
 
-  /**
-   * @param ops_begun returns how many recovery ops the function started
-   * @returns true if any useful work was accomplished; false otherwise
-   */
-  virtual bool start_recovery_ops(
-    uint64_t max,
-    ThreadPool::TPHandle &handle,
-    uint64_t *ops_begun) = 0;
-
   void purge_strays();
 
   void update_heartbeat_peers();
 
   Context *finish_sync_event;
 
-  void finish_recovery(list<Context*>& tfin);
+  Context *finish_recovery();
   void _finish_recovery(Context *c);
+  struct C_PG_FinishRecovery : public Context {
+    PGRef pg;
+    explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
+    void finish(int r) override {
+      pg->_finish_recovery(this);
+    }
+  };
   void cancel_recovery();
   void clear_recovery_state();
   virtual void _clear_recovery_state() = 0;
@@ -1286,10 +1634,10 @@ public:
   void start_recovery_op(const hobject_t& soid);
   void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
 
-  void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
   virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
 
   friend class C_OSD_RepModify_Commit;
+  friend class C_DeleteMore;
 
   // -- backoff --
   Mutex backoff_lock;  // orders inside Backoff::lock
@@ -1313,9 +1661,8 @@ public:
     release_backoffs(begin, end);
   }
 
-  void rm_backoff(BackoffRef b);
-
   // -- scrub --
+public:
   struct Scrubber {
     Scrubber();
     ~Scrubber();
@@ -1330,7 +1677,6 @@ public:
     set<pg_shard_t> waiting_on_whom;
     int shallow_errors;
     int deep_errors;
-    int large_omap_objects = 0;
     int fixed;
     ScrubMap primary_scrubmap;
     ScrubMapBuilder primary_scrubmap_pos;
@@ -1341,6 +1687,8 @@ public:
     OpRequestRef active_rep_scrub;
     utime_t scrub_reg_stamp;  // stamp we registered for
 
+    omap_stat_t omap_stats  = (const struct omap_stat_t){ 0 };
+
     // For async sleep
     bool sleeping = false;
     bool needs_sleep = true;
@@ -1350,10 +1698,15 @@ public:
     bool must_scrub, must_deep_scrub, must_repair;
 
     // Priority to use for scrub scheduling
-    unsigned priority;
+    unsigned priority = 0;
 
     // this flag indicates whether we would like to do auto-repair of the PG or not
     bool auto_repair;
+    // this flag indicates that we are scrubbing post repair to verify everything is fixed
+    bool check_repair;
+    // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
+    // we should deep scrub in order to auto repair
+    bool deep_scrub_on_error;
 
     // Maps from objects with errors to missing/inconsistent peers
     map<hobject_t, set<pg_shard_t>> missing;
@@ -1466,6 +1819,8 @@ public:
       must_deep_scrub = false;
       must_repair = false;
       auto_repair = false;
+      check_repair = false;
+      deep_scrub_on_error = false;
 
       state = PG::Scrubber::INACTIVE;
       start = hobject_t();
@@ -1474,8 +1829,8 @@ public:
       subset_last_update = eversion_t();
       shallow_errors = 0;
       deep_errors = 0;
-      large_omap_objects = 0;
       fixed = 0;
+      omap_stats = (const struct omap_stat_t){ 0 };
       deep = false;
       run_callbacks();
       inconsistent.clear();
@@ -1496,6 +1851,7 @@ public:
     void cleanup_store(ObjectStore::Transaction *t);
   } scrubber;
 
+protected:
   bool scrub_after_recovery;
 
   int active_pushes;
@@ -1515,7 +1871,6 @@ public:
     const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
     pg_shard_t bad_peer);
 
-  void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
   void chunky_scrub(ThreadPool::TPHandle &handle);
   void scrub_compare_maps();
   /**
@@ -1524,12 +1879,10 @@ public:
   bool scrub_process_inconsistent();
   bool ops_blocked_by_scrub() const;
   void scrub_finish();
-  void scrub_clear_state();
+  void scrub_clear_state(bool keep_repair = false);
   void _scan_snaps(ScrubMap &map);
   void _repair_oinfo_oid(ScrubMap &map);
-  void _scan_rollback_obs(
-    const vector<ghobject_t> &rollback_obs,
-    ThreadPool::TPHandle &handle);
+  void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
   void _request_scrub_map(pg_shard_t replica, eversion_t version,
                           hobject_t start, hobject_t end, bool deep,
                          bool allow_preemption);
@@ -1552,25 +1905,15 @@ public:
                         boost::optional<uint32_t>>> &missing_digest) { }
   virtual void _scrub_clear_state() { }
   virtual void _scrub_finish() { }
-  virtual void split_colls(
-    spg_t child,
-    int split_bits,
-    int seed,
-    const pg_pool_t *pool,
-    ObjectStore::Transaction *t) = 0;
   void clear_scrub_reserved();
   void scrub_reserve_replicas();
   void scrub_unreserve_replicas();
   bool scrub_all_replicas_reserved() const;
-  bool sched_scrub();
-  void reg_next_scrub();
-  void unreg_next_scrub();
 
   void replica_scrub(
     OpRequestRef op,
     ThreadPool::TPHandle &handle);
   void do_replica_scrub_map(OpRequestRef op);
-  void sub_op_scrub_map(OpRequestRef op);
 
   void handle_scrub_reserve_request(OpRequestRef op);
   void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
@@ -1592,8 +1935,8 @@ public:
       pg(pg), epoch(epoch), evt(evt) {}
     void finish(int r) override {
       pg->lock();
-      pg->queue_peering_event(PG::CephPeeringEvtRef(
-                               new PG::CephPeeringEvt(
+      pg->queue_peering_event(PGPeeringEventRef(
+                               new PGPeeringEvent(
                                  epoch,
                                  epoch,
                                  evt)));
@@ -1601,33 +1944,6 @@ public:
     }
   };
 
-  class CephPeeringEvt {
-    epoch_t epoch_sent;
-    epoch_t epoch_requested;
-    boost::intrusive_ptr< const boost::statechart::event_base > evt;
-    string desc;
-  public:
-    MEMPOOL_CLASS_HELPERS();
-    template <class T>
-    CephPeeringEvt(epoch_t epoch_sent,
-                  epoch_t epoch_requested,
-                  const T &evt_) :
-      epoch_sent(epoch_sent), epoch_requested(epoch_requested),
-      evt(evt_.intrusive_from_this()) {
-      stringstream out;
-      out << "epoch_sent: " << epoch_sent
-         << " epoch_requested: " << epoch_requested << " ";
-      evt_.print(&out);
-      desc = out.str();
-    }
-    epoch_t get_epoch_sent() { return epoch_sent; }
-    epoch_t get_epoch_requested() { return epoch_requested; }
-    const boost::statechart::event_base &get_event() { return *evt; }
-    string get_desc() { return desc; }
-  };
-  typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
-  list<CephPeeringEvtRef> peering_queue;  // op queue
-  list<CephPeeringEvtRef> peering_waiters;
 
   struct QueryState : boost::statechart::event< QueryState > {
     Formatter *f;
@@ -1637,51 +1953,9 @@ public:
     }
   };
 
-  struct MInfoRec : boost::statechart::event< MInfoRec > {
-    pg_shard_t from;
-    pg_info_t info;
-    epoch_t msg_epoch;
-    MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
-      from(from), info(info), msg_epoch(msg_epoch) {}
-    void print(std::ostream *out) const {
-      *out << "MInfoRec from " << from << " info: " << info;
-    }
-  };
-
-  struct MLogRec : boost::statechart::event< MLogRec > {
-    pg_shard_t from;
-    boost::intrusive_ptr<MOSDPGLog> msg;
-    MLogRec(pg_shard_t from, MOSDPGLog *msg) :
-      from(from), msg(msg) {}
-    void print(std::ostream *out) const {
-      *out << "MLogRec from " << from;
-    }
-  };
-
-  struct MNotifyRec : boost::statechart::event< MNotifyRec > {
-    pg_shard_t from;
-    pg_notify_t notify;
-    uint64_t features;
-    MNotifyRec(pg_shard_t from, const pg_notify_t &notify, uint64_t f) :
-      from(from), notify(notify), features(f) {}
-    void print(std::ostream *out) const {
-      *out << "MNotifyRec from " << from << " notify: " << notify
-        << " features: 0x" << hex << features << dec;
-    }
-  };
-
-  struct MQuery : boost::statechart::event< MQuery > {
-    pg_shard_t from;
-    pg_query_t query;
-    epoch_t query_epoch;
-    MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
-      from(from), query(query), query_epoch(query_epoch) {}
-    void print(std::ostream *out) const {
-      *out << "MQuery from " << from
-          << " query_epoch " << query_epoch
-          << " query: " << query;
-    }
-  };
+public:
+  int pg_stat_adjust(osd_stat_t *new_stat);
+protected:
 
   struct AdvMap : boost::statechart::event< AdvMap > {
     OSDMapRef osdmap;
@@ -1716,35 +1990,7 @@ public:
       *out << "Activate from " << activation_epoch;
     }
   };
-  struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
-    unsigned priority;
-    explicit RequestBackfillPrio(unsigned prio) :
-              boost::statechart::event< RequestBackfillPrio >(),
-                         priority(prio) {}
-    void print(std::ostream *out) const {
-      *out << "RequestBackfillPrio: priority " << priority;
-    }
-  };
-#define TrivialEvent(T) struct T : boost::statechart::event< T > { \
-    T() : boost::statechart::event< T >() {}                      \
-    void print(std::ostream *out) const {                         \
-      *out << #T;                                                 \
-    }                                                             \
-  };
-  struct DeferBackfill : boost::statechart::event<DeferBackfill> {
-    float delay;
-    explicit DeferBackfill(float delay) : delay(delay) {}
-    void print(std::ostream *out) const {
-      *out << "DeferBackfill: delay " << delay;
-    }
-  };
-  struct DeferRecovery : boost::statechart::event<DeferRecovery> {
-    float delay;
-    explicit DeferRecovery(float delay) : delay(delay) {}
-    void print(std::ostream *out) const {
-      *out << "DeferRecovery: delay " << delay;
-    }
-  };
+public:
   struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
     explicit UnfoundBackfill() {}
     void print(std::ostream *out) const {
@@ -1757,22 +2003,29 @@ public:
       *out << "UnfoundRecovery";
     }
   };
+
+  struct RequestScrub : boost::statechart::event<RequestScrub> {
+    bool deep;
+    bool repair;
+    explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
+    void print(std::ostream *out) const {
+      *out << "RequestScrub(" << (deep ? "deep" : "shallow")
+          << (repair ? " repair" : "") << ")";
+    }
+  };
+
 protected:
   TrivialEvent(Initialize)
-  TrivialEvent(Load)
   TrivialEvent(GotInfo)
   TrivialEvent(NeedUpThru)
-  TrivialEvent(NullEvt)
-  TrivialEvent(FlushedEvt)
   TrivialEvent(Backfilled)
   TrivialEvent(LocalBackfillReserved)
-  TrivialEvent(RemoteBackfillReserved)
   TrivialEvent(RejectRemoteReservation)
-  TrivialEvent(RemoteReservationRejected)
-  TrivialEvent(RemoteReservationCanceled)
+  public:
   TrivialEvent(RequestBackfill)
-  TrivialEvent(RequestRecovery)
-  TrivialEvent(RecoveryDone)
+  protected:
+  TrivialEvent(RemoteRecoveryPreempted)
+  TrivialEvent(RemoteBackfillPreempted)
   TrivialEvent(BackfillTooFull)
   TrivialEvent(RecoveryTooFull)
 
@@ -1785,7 +2038,8 @@ protected:
   TrivialEvent(AllReplicasRecovered)
   TrivialEvent(DoRecovery)
   TrivialEvent(LocalRecoveryReserved)
-  TrivialEvent(RemoteRecoveryReserved)
+  public:
+  protected:
   TrivialEvent(AllRemotesReserved)
   TrivialEvent(AllBackfillsReserved)
   TrivialEvent(GoClean)
@@ -1794,6 +2048,19 @@ protected:
 
   TrivialEvent(IntervalFlush)
 
+  public:
+  TrivialEvent(DeleteStart)
+  TrivialEvent(DeleteSome)
+
+  TrivialEvent(SetForceRecovery)
+  TrivialEvent(UnsetForceRecovery)
+  TrivialEvent(SetForceBackfill)
+  TrivialEvent(UnsetForceBackfill)
+
+  protected:
+  TrivialEvent(DeleteReserved)
+  TrivialEvent(DeleteInterrupted)
+
   /* Encapsulates PG recovery process */
   class RecoveryState {
     void start_handle(RecoveryCtx *new_ctx);
@@ -1826,47 +2093,35 @@ protected:
 
       /* Accessor functions for state methods */
       ObjectStore::Transaction* get_cur_transaction() {
-       assert(state->rctx);
-       assert(state->rctx->transaction);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->transaction);
        return state->rctx->transaction;
       }
 
       void send_query(pg_shard_t to, const pg_query_t &query) {
-       assert(state->rctx);
-       assert(state->rctx->query_map);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->query_map);
        (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
          query;
       }
 
       map<int, map<spg_t, pg_query_t> > *get_query_map() {
-       assert(state->rctx);
-       assert(state->rctx->query_map);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->query_map);
        return state->rctx->query_map;
       }
 
       map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
-       assert(state->rctx);
-       assert(state->rctx->info_map);
+       ceph_assert(state->rctx);
+       ceph_assert(state->rctx->info_map);
        return state->rctx->info_map;
       }
 
-      list< Context* > *get_on_safe_context_list() {
-       assert(state->rctx);
-       assert(state->rctx->on_safe);
-       return &(state->rctx->on_safe->contexts);
-      }
-
-      list< Context * > *get_on_applied_context_list() {
-       assert(state->rctx);
-       assert(state->rctx->on_applied);
-       return &(state->rctx->on_applied->contexts);
-      }
-
       RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
 
       void send_notify(pg_shard_t to,
                       const pg_notify_t &info, const PastIntervals &pi) {
-       assert(state->rctx);
+       ceph_assert(state->rctx);
        state->rctx->send_notify(to, info, pi);
       }
     };
@@ -1903,6 +2158,10 @@ protected:
     //       RepWaitBackfillReserved
     //       RepWaitRecoveryReserved
     //     Stray
+    //     ToDelete
+    //       WaitDeleteReserved
+    //       Deleting
+    // Crashed
 
     struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
       explicit Crashed(my_context ctx);
@@ -1916,12 +2175,10 @@ protected:
 
       typedef boost::mpl::list <
        boost::statechart::transition< Initialize, Reset >,
-       boost::statechart::custom_reaction< Load >,
        boost::statechart::custom_reaction< NullEvt >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
 
-      boost::statechart::result react(const Load&);
       boost::statechart::result react(const MNotifyRec&);
       boost::statechart::result react(const MInfoRec&);
       boost::statechart::result react(const MLogRec&);
@@ -1939,14 +2196,12 @@ protected:
        boost::statechart::custom_reaction< AdvMap >,
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< NullEvt >,
-       boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const AdvMap&);
       boost::statechart::result react(const ActMap&);
-      boost::statechart::result react(const FlushedEvt&);
       boost::statechart::result react(const IntervalFlush&);
       boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
@@ -1962,14 +2217,19 @@ protected:
       typedef boost::mpl::list <
        boost::statechart::custom_reaction< QueryState >,
        boost::statechart::custom_reaction< AdvMap >,
-       boost::statechart::custom_reaction< NullEvt >,
-       boost::statechart::custom_reaction< FlushedEvt >,
        boost::statechart::custom_reaction< IntervalFlush >,
+       // ignored
+       boost::statechart::custom_reaction< NullEvt >,
+       boost::statechart::custom_reaction<SetForceRecovery>,
+       boost::statechart::custom_reaction<UnsetForceRecovery>,
+       boost::statechart::custom_reaction<SetForceBackfill>,
+       boost::statechart::custom_reaction<UnsetForceBackfill>,
+       boost::statechart::custom_reaction<RequestScrub>,
+       // crash
        boost::statechart::transition< boost::statechart::event_base, Crashed >
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const AdvMap&);
-      boost::statechart::result react(const FlushedEvt&);
       boost::statechart::result react(const IntervalFlush&);
       boost::statechart::result react(const boost::statechart::event_base&) {
        return discard_event();
@@ -2001,10 +2261,20 @@ protected:
       typedef boost::mpl::list <
        boost::statechart::custom_reaction< ActMap >,
        boost::statechart::custom_reaction< MNotifyRec >,
-       boost::statechart::transition< NeedActingChange, WaitActingChange >
+       boost::statechart::transition< NeedActingChange, WaitActingChange >,
+       boost::statechart::custom_reaction<SetForceRecovery>,
+       boost::statechart::custom_reaction<UnsetForceRecovery>,
+       boost::statechart::custom_reaction<SetForceBackfill>,
+       boost::statechart::custom_reaction<UnsetForceBackfill>,
+       boost::statechart::custom_reaction<RequestScrub>
        > reactions;
       boost::statechart::result react(const ActMap&);
       boost::statechart::result react(const MNotifyRec&);
+      boost::statechart::result react(const SetForceRecovery&);
+      boost::statechart::result react(const UnsetForceRecovery&);
+      boost::statechart::result react(const SetForceBackfill&);
+      boost::statechart::result react(const UnsetForceBackfill&);
+      boost::statechart::result react(const RequestScrub&);
     };
 
     struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
@@ -2061,12 +2331,15 @@ protected:
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MNotifyRec >,
        boost::statechart::custom_reaction< MLogRec >,
+       boost::statechart::custom_reaction< MTrim >,
        boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< AllReplicasActivated >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
-  boost::statechart::custom_reaction< UnfoundRecovery >,
+       boost::statechart::custom_reaction< UnfoundRecovery >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
+       boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
+       boost::statechart::custom_reaction< RemoteReservationRevoked>,
        boost::statechart::custom_reaction< DoRecovery>
        > reactions;
       boost::statechart::result react(const QueryState& q);
@@ -2075,6 +2348,7 @@ protected:
       boost::statechart::result react(const MInfoRec& infoevt);
       boost::statechart::result react(const MNotifyRec& notevt);
       boost::statechart::result react(const MLogRec& logevt);
+      boost::statechart::result react(const MTrim& trimevt);
       boost::statechart::result react(const Backfilled&) {
        return discard_event();
       }
@@ -2086,10 +2360,16 @@ protected:
        return discard_event();
       }
       boost::statechart::result react(const UnfoundRecovery& evt) {
-  return discard_event();
+       return discard_event();
       }
       boost::statechart::result react(const UnfoundBackfill& evt) {
-  return discard_event();
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteReservationRevoked&) {
+       return discard_event();
       }
       boost::statechart::result react(const DoRecovery&) {
        return discard_event();
@@ -2098,10 +2378,15 @@ protected:
 
     struct Clean : boost::statechart::state< Clean, Active >, NamedState {
       typedef boost::mpl::list<
-       boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
+       boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
+       boost::statechart::custom_reaction<SetForceRecovery>,
+       boost::statechart::custom_reaction<SetForceBackfill>
       > reactions;
       explicit Clean(my_context ctx);
       void exit();
+      boost::statechart::result react(const boost::statechart::event_base&) {
+       return discard_event();
+      }
     };
 
     struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
@@ -2120,15 +2405,26 @@ protected:
 
     struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
       typedef boost::mpl::list<
-       boost::statechart::transition< Backfilled, Recovered >,
+       boost::statechart::custom_reaction< Backfilled >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundBackfill >,
-       boost::statechart::custom_reaction< RemoteReservationRejected >
+       boost::statechart::custom_reaction< RemoteReservationRejected >,
+       boost::statechart::custom_reaction< RemoteReservationRevokedTooFull >,
+       boost::statechart::custom_reaction< RemoteReservationRevoked >
        > reactions;
       explicit Backfilling(my_context ctx);
-      boost::statechart::result react(const RemoteReservationRejected& evt);
+      boost::statechart::result react(const RemoteReservationRejected& evt) {
+       // for compat with old peers
+       post_event(RemoteReservationRevokedTooFull());
+       return discard_event();
+      }
+      void backfill_release_reservations();
+      boost::statechart::result react(const Backfilled& evt);
+      boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
+      boost::statechart::result react(const RemoteReservationRevoked& evt);
       boost::statechart::result react(const DeferBackfill& evt);
       boost::statechart::result react(const UnfoundBackfill& evt);
+      void cancel_backfill();
       void exit();
     };
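// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: Backfilling above uses a common
// boost::statechart idiom, where a custom_reaction translates a legacy event
// into its modern equivalent via post_event() and then discards the original
// (here: RemoteReservationRejected -> RemoteReservationRevokedTooFull, kept
// for compat with old peers).  A minimal stand-alone version of the pattern;
// all names below are invented for illustration:

#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/custom_reaction.hpp>
#include <boost/statechart/event.hpp>
#include <boost/mpl/list.hpp>
namespace sc = boost::statechart;

struct OldReject : sc::event<OldReject> {};   // legacy event from old peers
struct NewRevoke : sc::event<NewRevoke> {};   // its modern replacement

struct Working;
struct Machine : sc::state_machine<Machine, Working> {};

struct Working : sc::simple_state<Working, Machine> {
  typedef boost::mpl::list<
    sc::custom_reaction<OldReject>,
    sc::custom_reaction<NewRevoke>
  > reactions;
  sc::result react(const OldReject&) {
    post_event(NewRevoke());   // re-queue as the modern event...
    return discard_event();    // ...and drop the legacy one
  }
  sc::result react(const NewRevoke&) {
    return discard_event();    // real handling would go here
  }
};

// usage: Machine m; m.initiate(); m.process_event(OldReject());
// ---------------------------------------------------------------------------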
 
@@ -2136,13 +2432,16 @@ protected:
       typedef boost::mpl::list<
        boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::custom_reaction< RemoteReservationRejected >,
+       boost::statechart::custom_reaction< RemoteReservationRevoked >,
        boost::statechart::transition< AllBackfillsReserved, Backfilling >
        > reactions;
       set<pg_shard_t>::const_iterator backfill_osd_it;
       explicit WaitRemoteBackfillReserved(my_context ctx);
+      void retry();
       void exit();
       boost::statechart::result react(const RemoteBackfillReserved& evt);
       boost::statechart::result react(const RemoteReservationRejected& evt);
+      boost::statechart::result react(const RemoteReservationRevoked& evt);
     };
 
     struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
@@ -2183,6 +2482,7 @@ protected:
       void exit();
     };
 
+    struct ToDelete;
     struct RepNotRecovering;
     struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
       explicit ReplicaActive(my_context ctx);
@@ -2194,18 +2494,27 @@ protected:
        boost::statechart::custom_reaction< MQuery >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< MLogRec >,
+       boost::statechart::custom_reaction< MTrim >,
        boost::statechart::custom_reaction< Activate >,
        boost::statechart::custom_reaction< DeferRecovery >,
        boost::statechart::custom_reaction< DeferBackfill >,
        boost::statechart::custom_reaction< UnfoundRecovery >,
-       boost::statechart::custom_reaction< UnfoundBackfill >
+       boost::statechart::custom_reaction< UnfoundBackfill >,
+       boost::statechart::custom_reaction< RemoteBackfillPreempted >,
+       boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+       boost::statechart::custom_reaction< RecoveryDone >,
+       boost::statechart::transition<DeleteStart, ToDelete>
        > reactions;
       boost::statechart::result react(const QueryState& q);
       boost::statechart::result react(const MInfoRec& infoevt);
       boost::statechart::result react(const MLogRec& logevt);
+      boost::statechart::result react(const MTrim& trimevt);
       boost::statechart::result react(const ActMap&);
       boost::statechart::result react(const MQuery&);
       boost::statechart::result react(const Activate&);
+      boost::statechart::result react(const RecoveryDone&) {
+       return discard_event();
+      }
       boost::statechart::result react(const DeferRecovery& evt) {
        return discard_event();
       }
@@ -2218,6 +2527,12 @@ protected:
       boost::statechart::result react(const UnfoundBackfill& evt) {
        return discard_event();
       }
+      boost::statechart::result react(const RemoteBackfillPreempted& evt) {
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
+       return discard_event();
+      }
     };
 
     struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
@@ -2226,10 +2541,14 @@ protected:
        // for compat with old peers
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
-       boost::statechart::custom_reaction< BackfillTooFull >
+       boost::statechart::custom_reaction< BackfillTooFull >,
+       boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
+       boost::statechart::custom_reaction< RemoteBackfillPreempted >
        > reactions;
       explicit RepRecovering(my_context ctx);
+      boost::statechart::result react(const RemoteRecoveryPreempted &evt);
       boost::statechart::result react(const BackfillTooFull &evt);
+      boost::statechart::result react(const RemoteBackfillPreempted &evt);
       void exit();
     };
 
@@ -2268,15 +2587,26 @@ protected:
 
     struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
       typedef boost::mpl::list<
+       boost::statechart::custom_reaction< RequestRecoveryPrio >,
        boost::statechart::custom_reaction< RequestBackfillPrio >,
-        boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
        boost::statechart::custom_reaction< RejectRemoteReservation >,
        boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
        boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
+       boost::statechart::custom_reaction< RemoteRecoveryReserved >,
+       boost::statechart::custom_reaction< RemoteBackfillReserved >,
        boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
        > reactions;
       explicit RepNotRecovering(my_context ctx);
+      boost::statechart::result react(const RequestRecoveryPrio &evt);
       boost::statechart::result react(const RequestBackfillPrio &evt);
+      boost::statechart::result react(const RemoteBackfillReserved &evt) {
+       // my reservation completion raced with a RELEASE from the primary
+       return discard_event();
+      }
+      boost::statechart::result react(const RemoteRecoveryReserved &evt) {
+       // my reservation completion raced with a RELEASE from the primary
+       return discard_event();
+      }
       boost::statechart::result react(const RejectRemoteReservation &evt);
       void exit();
     };
@@ -2328,9 +2658,8 @@ protected:
       void exit();
     };
 
-    struct Stray : boost::statechart::state< Stray, Started >, NamedState {
-      map<int, pair<pg_query_t, epoch_t> > pending_queries;
-
+    struct Stray : boost::statechart::state< Stray, Started >,
+             NamedState {
       explicit Stray(my_context ctx);
       void exit();
 
@@ -2339,7 +2668,8 @@ protected:
        boost::statechart::custom_reaction< MLogRec >,
        boost::statechart::custom_reaction< MInfoRec >,
        boost::statechart::custom_reaction< ActMap >,
-       boost::statechart::custom_reaction< RecoveryDone >
+       boost::statechart::custom_reaction< RecoveryDone >,
+       boost::statechart::transition<DeleteStart, ToDelete>
        > reactions;
       boost::statechart::result react(const MQuery& query);
       boost::statechart::result react(const MLogRec& logevt);
@@ -2350,6 +2680,43 @@ protected:
       }
     };
 
+    struct WaitDeleteReserved;
+    struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
+      unsigned priority = 0;
+      typedef boost::mpl::list <
+       boost::statechart::custom_reaction< ActMap >,
+       boost::statechart::custom_reaction< DeleteSome >
+       > reactions;
+      explicit ToDelete(my_context ctx);
+      boost::statechart::result react(const ActMap &evt);
+      boost::statechart::result react(const DeleteSome &evt) {
+       // happens if we drop out of Deleting due to reprioritization etc.
+       return discard_event();
+      }
+      void exit();
+    };
+
+    struct Deleting;
+    struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
+                                                        ToDelete>, NamedState {
+      typedef boost::mpl::list <
+       boost::statechart::transition<DeleteReserved, Deleting>
+       > reactions;
+      explicit WaitDeleteReserved(my_context ctx);
+      void exit();
+    };
+
+    struct Deleting : boost::statechart::state<Deleting,
+                                              ToDelete>, NamedState {
+      typedef boost::mpl::list <
+       boost::statechart::custom_reaction< DeleteSome >,
+       boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
+       > reactions;
+      explicit Deleting(my_context ctx);
+      boost::statechart::result react(const DeleteSome &evt);
+      void exit();
+    };
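// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: the delete sub-machine above nests
// its states by naming an inner initial state as the third template argument
// (entering ToDelete immediately enters WaitDeleteReserved); DeleteReserved
// then moves to Deleting, and DeleteInterrupted falls back to waiting.  A
// minimal stand-alone analogue; all names below are invented:

#include <boost/statechart/state_machine.hpp>
#include <boost/statechart/simple_state.hpp>
#include <boost/statechart/transition.hpp>
#include <boost/statechart/event.hpp>
#include <boost/mpl/list.hpp>
namespace sc = boost::statechart;

struct EvReserved    : sc::event<EvReserved> {};
struct EvInterrupted : sc::event<EvInterrupted> {};

struct Waiting;   // forward-declared inner initial state
struct Outer;
struct Fsm : sc::state_machine<Fsm, Outer> {};

// third template argument = inner initial state of this state
struct Outer : sc::simple_state<Outer, Fsm, Waiting> {};

struct Working;
struct Waiting : sc::simple_state<Waiting, Outer> {
  typedef boost::mpl::list<
    sc::transition<EvReserved, Working>
  > reactions;
};
struct Working : sc::simple_state<Working, Outer> {
  typedef boost::mpl::list<
    sc::transition<EvInterrupted, Waiting>
  > reactions;
};

// usage: Fsm f; f.initiate();              // lands in Outer -> Waiting
//        f.process_event(EvReserved());    // -> Working
//        f.process_event(EvInterrupted()); // -> back to Waiting
// ---------------------------------------------------------------------------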
+
     struct GetLog;
 
     struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
@@ -2427,9 +2794,11 @@ protected:
     struct Down : boost::statechart::state< Down, Peering>, NamedState {
       explicit Down(my_context ctx);
       typedef boost::mpl::list <
-       boost::statechart::custom_reaction< QueryState >
+       boost::statechart::custom_reaction< QueryState >,
+       boost::statechart::custom_reaction< MNotifyRec >
        > reactions;
-      boost::statechart::result react(const QueryState& infoevt);
+      boost::statechart::result react(const QueryState& q);
+      boost::statechart::result react(const MNotifyRec& infoevt);
       void exit();
     };
 
@@ -2442,11 +2811,10 @@ protected:
       explicit Incomplete(my_context ctx);
       boost::statechart::result react(const AdvMap &advmap);
       boost::statechart::result react(const MNotifyRec& infoevt);
-      boost::statechart::result react(const QueryState& infoevt);
+      boost::statechart::result react(const QueryState& q);
       void exit();
     };
 
-
     RecoveryMachine machine;
     PG *pg;
 
@@ -2476,7 +2844,7 @@ protected:
       end_handle();
     }
 
-    void handle_event(CephPeeringEvtRef evt,
+    void handle_event(PGPeeringEventRef evt,
                      RecoveryCtx *rctx) {
       start_handle(rctx);
       machine.process_event(evt->get_event());
@@ -2486,35 +2854,18 @@ protected:
   } recovery_state;
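// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: handle_event() above brackets every
// statechart dispatch with start_handle()/end_handle() so that reactions can
// record their side effects in the active RecoveryCtx.  A minimal analogue of
// that bracketing pattern; all names below are invented:

#include <cassert>

struct Ctx { int messages = 0; };   // stand-in for the per-call context

class Dispatcher {
  Ctx* active = nullptr;
public:
  void start_handle(Ctx* c) { assert(!active); active = c; }
  void end_handle()         { active = nullptr; }

  template <typename Ev>
  void handle_event(const Ev& ev, Ctx* c) {
    start_handle(c);   // make the context visible to reactions
    process(ev);       // reactions may append to 'active' here
    end_handle();      // context must not outlive the dispatch
  }
private:
  template <typename Ev>
  void process(const Ev&) { if (active) active->messages++; }
};
// ---------------------------------------------------------------------------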
 
 
- public:
-  PG(OSDService *o, OSDMapRef curmap,
-     const PGPool &pool, spg_t p);
-  ~PG() override;
-  const spg_t pg_id;
 
- private:
-  // Prevent copying
-  explicit PG(const PG& rhs);
-  PG& operator=(const PG& rhs);
   uint64_t peer_features;
   uint64_t acting_features;
   uint64_t upacting_features;
 
   epoch_t last_epoch;
 
- public:
-  const spg_t&      get_pgid() const { return pg_id; }
-
-  void set_last_scrub_stamp(utime_t t) {
-    info.stats.last_scrub_stamp = t;
-    info.history.last_scrub_stamp = t;
-  }
-
-  void set_last_deep_scrub_stamp(utime_t t) {
-    info.stats.last_deep_scrub_stamp = t;
-    info.history.last_deep_scrub_stamp = t;
-  }
+  /// most recently consumed osdmap's require_osd_release
+  unsigned last_require_osd_release = 0;
+  bool delete_needs_sleep = false;
 
+protected:
   void reset_min_peer_features() {
     peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
   }
@@ -2543,7 +2894,7 @@ protected:
        actingset.insert(
          pg_shard_t(
            acting[i],
-           pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
     }
     upset.clear();
     up = newup;
@@ -2552,9 +2903,9 @@ protected:
        upset.insert(
          pg_shard_t(
            up[i],
-           pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
+           pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
     }
-    if (!pool.info.ec_pool()) {
+    if (!pool.info.is_erasure()) {
       up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
       primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
       return;
@@ -2573,71 +2924,55 @@ protected:
        break;
       }
     }
-    assert(up_primary.osd == new_up_primary);
-    assert(primary.osd == new_acting_primary);
+    ceph_assert(up_primary.osd == new_up_primary);
+    ceph_assert(primary.osd == new_acting_primary);
   }
-  pg_shard_t get_primary() const { return primary; }
-  
-  int        get_role() const { return role; }
-  void       set_role(int r) { role = r; }
 
-  bool       is_primary() const { return pg_whoami == primary; }
-  bool       is_replica() const { return role > 0; }
+  void set_role(int r) {
+    role = r;
+  }
 
-  epoch_t get_last_peering_reset() const { return last_peering_reset; }
-  
-  //int  get_state() const { return state; }
-  bool state_test(int m) const { return (state & m) != 0; }
-  void state_set(int m) { state |= m; }
-  void state_clear(int m) { state &= ~m; }
+  bool state_test(uint64_t m) const { return (state & m) != 0; }
+  void state_set(uint64_t m) { state |= m; }
+  void state_clear(uint64_t m) { state &= ~m; }
 
   bool is_complete() const { return info.last_complete == info.last_update; }
   bool should_send_notify() const { return send_notify; }
 
-  int get_state() const { return state; }
-  bool       is_active() const { return state_test(PG_STATE_ACTIVE); }
-  bool       is_activating() const { return state_test(PG_STATE_ACTIVATING); }
-  bool       is_peering() const { return state_test(PG_STATE_PEERING); }
-  bool       is_down() const { return state_test(PG_STATE_DOWN); }
-  bool       is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
-  bool       is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
-  bool       is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
-  bool       is_clean() const { return state_test(PG_STATE_CLEAN); }
-  bool       is_degraded() const { return state_test(PG_STATE_DEGRADED); }
-  bool       is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
-
-  bool       is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
-  bool       is_remapped() const { return state_test(PG_STATE_REMAPPED); }
-  bool       is_peered() const {
+  uint64_t get_state() const { return state; }
+  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
+  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
+  bool is_peering() const { return state_test(PG_STATE_PEERING); }
+  bool is_down() const { return state_test(PG_STATE_DOWN); }
+  bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
+  bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
+  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
+  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
+  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
+  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
+  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
+  bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
+  bool is_peered() const {
     return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
   }
   bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
+  bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
+  bool is_repair() const { return state_test(PG_STATE_REPAIR); }
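// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: the is_*() helpers above are thin
// wrappers over a single flag word, and this diff widens state_test()/
// state_set()/state_clear() from int to uint64_t so the PG_STATE_* flag space
// can grow past 32 bits.  A minimal stand-alone analogue (names invented):

#include <cassert>
#include <cstdint>

static const uint64_t FLAG_ACTIVE = 1ull << 0;
static const uint64_t FLAG_CLEAN  = 1ull << 40;   // needs a 64-bit word

struct Flags {
  uint64_t state = 0;
  bool test(uint64_t m) const { return (state & m) != 0; }
  void set(uint64_t m)        { state |= m; }
  void clear(uint64_t m)      { state &= ~m; }
};

int main() {
  Flags f;
  f.set(FLAG_ACTIVE | FLAG_CLEAN);
  assert(f.test(FLAG_CLEAN));     // bit 40 would be lost with a 32-bit mask
  f.clear(FLAG_CLEAN);
  assert(!f.test(FLAG_CLEAN));
}
// ---------------------------------------------------------------------------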
 
-  bool  is_empty() const { return info.last_update == eversion_t(0,0); }
-
-  void init(
-    int role,
-    const vector<int>& up,
-    int up_primary,
-    const vector<int>& acting,
-    int acting_primary,
-    const pg_history_t& history,
-    const PastIntervals& pim,
-    bool backfill,
-    ObjectStore::Transaction *t);
+  bool is_empty() const { return info.last_update == eversion_t(0,0); }
 
   // pg on-disk state
   void do_pending_flush();
 
+public:
   static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
   static void _init(ObjectStore::Transaction& t,
                    spg_t pgid, const pg_pool_t *pool);
 
-private:
+protected:
   void prepare_write_info(map<string,bufferlist> *km);
 
   void update_store_with_options();
-  void update_store_on_load();
 
 public:
   static int _prepare_write_info(
@@ -2651,6 +2986,11 @@ public:
     bool dirty_epoch,
     bool try_fast_info,
     PerfCounters *logger = nullptr);
+
+  void write_if_dirty(RecoveryCtx *rctx) {
+    write_if_dirty(*rctx->transaction);
+  }
+protected:
   void write_if_dirty(ObjectStore::Transaction& t);
 
   PGLog::IndexedLog projected_log;
@@ -2662,11 +3002,11 @@ public:
   eversion_t projected_last_update;
   eversion_t get_next_version() const {
     eversion_t at_version(
-      get_osdmap()->get_epoch(),
+      get_osdmap_epoch(),
       projected_last_update.version+1);
-    assert(at_version > info.last_update);
-    assert(at_version > pg_log.get_head());
-    assert(at_version > projected_last_update);
+    ceph_assert(at_version > info.last_update);
+    ceph_assert(at_version > pg_log.get_head());
+    ceph_assert(at_version > projected_last_update);
     return at_version;
   }
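// ---------------------------------------------------------------------------
// Illustrative sketch, not part of PG.h: eversion_t orders updates by
// (epoch, version), and get_next_version() above bumps the version component
// under the current map epoch while asserting strict monotonicity against the
// last durable, logged, and projected versions.  A stand-alone analogue
// (names invented, lexicographic compare assumed):

#include <cassert>
#include <tuple>

struct Ver {
  unsigned epoch = 0;
  unsigned long version = 0;
  bool operator>(const Ver& o) const {
    return std::tie(epoch, version) > std::tie(o.epoch, o.version);
  }
};

Ver next_version(unsigned cur_epoch, const Ver& projected) {
  Ver at{cur_epoch, projected.version + 1};
  assert(at > projected);   // strictly newer, as get_next_version() asserts
  return at;
}
// ---------------------------------------------------------------------------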
 
@@ -2676,18 +3016,12 @@ public:
     eversion_t trim_to,
     eversion_t roll_forward_to,
     ObjectStore::Transaction &t,
-    bool transaction_applied = true);
+    bool transaction_applied = true,
+    bool async = false);
   bool check_log_for_corruption(ObjectStore *store);
 
   std::string get_corrupt_pg_log_name() const;
-  static int read_info(
-    ObjectStore *store, spg_t pgid, const coll_t &coll,
-    bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
-    __u8 &);
-  void read_state(ObjectStore *store, bufferlist &bl);
-  static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
-  static int peek_map_epoch(ObjectStore *store, spg_t pgid,
-                           epoch_t *pepoch, bufferlist *bl);
+
   void update_snap_map(
     const vector<pg_log_entry_t> &log_entries,
     ObjectStore::Transaction& t);
@@ -2715,7 +3049,7 @@ public:
 
   /**
    * Merge entries updating missing as necessary on all
-   * actingbackfill logs and missings (also missing_loc)
+   * acting_recovery_backfill logs and missing sets (also missing_loc)
    */
   void merge_new_log_entries(
     const mempool::osd_pglog::list<pg_log_entry_t> &entries,
@@ -2731,14 +3065,8 @@ public:
     ObjectStore::Transaction *t);
   void on_new_interval();
   virtual void _on_new_interval() = 0;
-  void start_flush(ObjectStore::Transaction *t,
-                  list<Context *> *on_applied,
-                  list<Context *> *on_safe);
+  void start_flush(ObjectStore::Transaction *t);
   void set_last_peering_reset();
-  bool pg_has_reset_since(epoch_t e) {
-    assert(is_locked());
-    return deleting || e < get_last_peering_reset();
-  }
 
   void update_history(const pg_history_t& history);
   void fulfill_info(pg_shard_t from, const pg_query_t &query,
@@ -2765,14 +3093,14 @@ public:
   bool can_discard_replica_op(OpRequestRef& op);
 
   bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
-  bool old_peering_evt(CephPeeringEvtRef evt) {
+  bool old_peering_evt(PGPeeringEventRef evt) {
     return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
   }
   static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
     return e <= cur_epoch;
   }
   bool have_same_or_newer_map(epoch_t e) {
-    return e <= get_osdmap()->get_epoch();
+    return e <= get_osdmap_epoch();
   }
 
   bool op_has_sufficient_caps(OpRequestRef& op);
@@ -2780,67 +3108,21 @@ public:
 
   // recovery bits
   void take_waiters();
-  void queue_peering_event(CephPeeringEvtRef evt);
-  void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
-  void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
-                  pg_shard_t from, const pg_query_t& q);
-  void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
-  void queue_flushed(epoch_t started_at);
-  void handle_advance_map(
-    OSDMapRef osdmap, OSDMapRef lastmap,
-    vector<int>& newup, int up_primary,
-    vector<int>& newacting, int acting_primary,
-    RecoveryCtx *rctx);
-  void handle_activate_map(RecoveryCtx *rctx);
-  void handle_create(RecoveryCtx *rctx);
-  void handle_loaded(RecoveryCtx *rctx);
-  void handle_query_state(Formatter *f);
-
-  virtual void on_removal(ObjectStore::Transaction *t) = 0;
 
 
   // abstract bits
-  virtual void do_request(
-    OpRequestRef& op,
-    ThreadPool::TPHandle &handle
-  ) = 0;
-
-  virtual void do_op(OpRequestRef& op) = 0;
-  virtual void do_sub_op(OpRequestRef op) = 0;
-  virtual void do_sub_op_reply(OpRequestRef op) = 0;
-  virtual void do_scan(
-    OpRequestRef op,
-    ThreadPool::TPHandle &handle
-  ) = 0;
-  virtual void do_backfill(OpRequestRef op) = 0;
-  virtual void snap_trimmer(epoch_t epoch_queued) = 0;
-
-  virtual int do_command(
-    cmdmap_t cmdmap,
-    ostream& ss,
-    bufferlist& idata,
-    bufferlist& odata,
-    ConnectionRef conn,
-    ceph_tid_t tid) = 0;
+  friend class FlushState;
 
   virtual void on_role_change() = 0;
   virtual void on_pool_change() = 0;
   virtual void on_change(ObjectStore::Transaction *t) = 0;
   virtual void on_activate() = 0;
   virtual void on_flushed() = 0;
-  virtual void on_shutdown() = 0;
   virtual void check_blacklisted_watchers() = 0;
-  virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
 
-  virtual bool agent_work(int max) = 0;
-  virtual bool agent_work(int max, int agent_flush_quota) = 0;
-  virtual void agent_stop() = 0;
-  virtual void agent_delay() = 0;
-  virtual void agent_clear() = 0;
-  virtual void agent_choose_mode_restart() = 0;
+  friend ostream& operator<<(ostream& out, const PG& pg);
 };
 
-ostream& operator<<(ostream& out, const PG& pg);
 
 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);