1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include <boost/container/flat_set.hpp>
28 #include "include/mempool.h"
30 // re-include our assert to clobber boost's
31 #include "include/ceph_assert.h"
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
39 #include "common/Timer.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46 #include "PGPeeringEvent.h"
48 #include "mgr/OSDPerfMetricTypes.h"
57 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
58 //#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
71 typedef OpRequest::Ref OpRequestRef
;
74 class DynamicPerfStats
;
80 using state_history_entry
= std::tuple
<utime_t
, utime_t
, const char*>;
81 using embedded_state
= std::pair
<utime_t
, const char*>;
83 struct PGStateInstance
{
84 // Time spent in pg states
86 void setepoch(const epoch_t current_epoch
) {
87 this_epoch
= current_epoch
;
90 void enter_state(const utime_t entime
, const char* state
) {
91 embedded_states
.push(std::make_pair(entime
, state
));
94 void exit_state(const utime_t extime
) {
95 embedded_state this_state
= embedded_states
.top();
96 state_history
.push_back(state_history_entry
{
97 this_state
.first
, extime
, this_state
.second
});
98 embedded_states
.pop();
103 std::vector
<state_history_entry
> state_history
;
104 std::stack
<embedded_state
> embedded_states
;
107 class PGStateHistory
{
108 // Member access protected with the PG lock
110 PGStateHistory() : buffer(10) {}
112 void enter(PG
* pg
, const utime_t entime
, const char* state
);
114 void exit(const char* state
);
120 void set_pg_in_destructor() { pg_in_destructor
= true; }
122 void dump(Formatter
* f
) const;
124 string
get_current_state() {
125 if (pi
== nullptr) return "unknown";
126 return std::get
<1>(pi
->embedded_states
.top());
130 bool pg_in_destructor
= false;
131 PG
* thispg
= nullptr;
132 std::unique_ptr
<PGStateInstance
> tmppi
;
133 PGStateInstance
* pi
= nullptr;
134 boost::circular_buffer
<std::unique_ptr
<PGStateInstance
>> buffer
;
139 #include "common/tracked_int_ptr.hpp"
140 uint64_t get_with_id(PG
*pg
);
141 void put_with_id(PG
*pg
, uint64_t id
);
142 typedef TrackedIntPtr
<PG
> PGRef
;
144 typedef boost::intrusive_ptr
<PG
> PGRef
;
147 class PGRecoveryStats
{
148 struct per_state_info
{
149 uint64_t enter
, exit
; // enter/exit counts
151 utime_t event_time
; // time spent processing events
152 utime_t total_time
; // total time in state
153 utime_t min_time
, max_time
;
155 // cppcheck-suppress unreachableCode
156 per_state_info() : enter(0), exit(0), events(0) {}
158 map
<const char *,per_state_info
> info
;
162 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
165 std::lock_guard
l(lock
);
168 void dump(ostream
& out
) {
169 std::lock_guard
l(lock
);
170 for (map
<const char *,per_state_info
>::iterator p
= info
.begin(); p
!= info
.end(); ++p
) {
171 per_state_info
& i
= p
->second
;
172 out
<< i
.enter
<< "\t" << i
.exit
<< "\t"
173 << i
.events
<< "\t" << i
.event_time
<< "\t"
174 << i
.total_time
<< "\t"
175 << i
.min_time
<< "\t" << i
.max_time
<< "\t"
180 void dump_formatted(Formatter
*f
) {
181 std::lock_guard
l(lock
);
182 f
->open_array_section("pg_recovery_stats");
183 for (map
<const char *,per_state_info
>::iterator p
= info
.begin();
184 p
!= info
.end(); ++p
) {
185 per_state_info
& i
= p
->second
;
186 f
->open_object_section("recovery_state");
187 f
->dump_int("enter", i
.enter
);
188 f
->dump_int("exit", i
.exit
);
189 f
->dump_int("events", i
.events
);
190 f
->dump_stream("event_time") << i
.event_time
;
191 f
->dump_stream("total_time") << i
.total_time
;
192 f
->dump_stream("min_time") << i
.min_time
;
193 f
->dump_stream("max_time") << i
.max_time
;
194 vector
<string
> states
;
195 get_str_vec(p
->first
, "/", states
);
196 f
->open_array_section("nested_states");
197 for (vector
<string
>::iterator st
= states
.begin();
198 st
!= states
.end(); ++st
) {
199 f
->dump_string("state", *st
);
207 void log_enter(const char *s
) {
208 std::lock_guard
l(lock
);
211 void log_exit(const char *s
, utime_t dur
, uint64_t events
, utime_t event_dur
) {
212 std::lock_guard
l(lock
);
213 per_state_info
&i
= info
[s
];
216 if (dur
> i
.max_time
)
218 if (dur
< i
.min_time
|| i
.min_time
== utime_t())
221 i
.event_time
+= event_dur
;
227 epoch_t cached_epoch
;
232 SnapContext snapc
; // the default pool snapc, ready to go.
234 // these two sets are for < mimic only
235 interval_set
<snapid_t
> cached_removed_snaps
; // current removed_snaps set
236 interval_set
<snapid_t
> newly_removed_snaps
; // newly removed in the last epoch
238 PGPool(CephContext
* cct
, OSDMapRef map
, int64_t i
, const pg_pool_t
& info
,
241 cached_epoch(map
->get_epoch()),
245 snapc
= info
.get_snap_context();
246 if (map
->require_osd_release
< CEPH_RELEASE_MIMIC
) {
247 info
.build_removed_snaps(cached_removed_snaps
);
251 void update(CephContext
*cct
, OSDMapRef map
);
254 /** PG - Replica Placement Group
258 class PG
: public DoutPrefixProvider
{
264 ObjectStore::CollectionHandle ch
;
269 std::ostream
& gen_prefix(std::ostream
& out
) const override
;
270 CephContext
*get_cct() const override
{
273 unsigned get_subsys() const override
{
274 return ceph_subsys_osd
;
277 const OSDMapRef
& get_osdmap() const {
278 ceph_assert(is_locked());
279 ceph_assert(osdmap_ref
);
282 epoch_t
get_osdmap_epoch() const {
283 return osdmap_ref
->get_epoch();
286 void lock_suspend_timeout(ThreadPool::TPHandle
&handle
) {
287 handle
.suspend_tp_timeout();
289 handle
.reset_tp_timeout();
291 void lock(bool no_lockdep
= false) const;
292 void unlock() const {
293 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
294 ceph_assert(!dirty_info
);
295 ceph_assert(!dirty_big_info
);
298 bool is_locked() const {
299 return _lock
.is_locked();
302 const spg_t
& get_pgid() const {
306 const PGPool
& get_pool() const {
309 uint64_t get_last_user_version() const {
310 return info
.last_user_version
;
312 const pg_history_t
& get_history() const {
315 bool get_need_up_thru() const {
318 epoch_t
get_same_interval_since() const {
319 return info
.history
.same_interval_since
;
322 void set_last_scrub_stamp(utime_t t
) {
323 info
.stats
.last_scrub_stamp
= t
;
324 info
.history
.last_scrub_stamp
= t
;
327 void set_last_deep_scrub_stamp(utime_t t
) {
328 info
.stats
.last_deep_scrub_stamp
= t
;
329 info
.history
.last_deep_scrub_stamp
= t
;
332 bool is_deleting() const {
335 bool is_deleted() const {
338 bool is_replica() const {
341 bool is_primary() const {
342 return pg_whoami
== primary
;
344 bool pg_has_reset_since(epoch_t e
) {
345 ceph_assert(is_locked());
346 return deleted
|| e
< get_last_peering_reset();
349 bool is_ec_pg() const {
350 return pool
.info
.is_erasure();
352 int get_role() const {
355 const vector
<int> get_acting() const {
358 int get_acting_primary() const {
361 pg_shard_t
get_primary() const {
364 const vector
<int> get_up() const {
367 int get_up_primary() const {
368 return up_primary
.osd
;
370 const PastIntervals
& get_past_intervals() const {
371 return past_intervals
;
374 /// initialize created PG
377 const vector
<int>& up
,
379 const vector
<int>& acting
,
381 const pg_history_t
& history
,
382 const PastIntervals
& pim
,
384 ObjectStore::Transaction
*t
);
386 /// read existing pg state off disk
387 void read_state(ObjectStore
*store
);
388 static int peek_map_epoch(ObjectStore
*store
, spg_t pgid
, epoch_t
*pepoch
);
390 static int get_latest_struct_v() {
391 return latest_struct_v
;
393 static int get_compat_struct_v() {
394 return compat_struct_v
;
396 static int read_info(
397 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
,
398 pg_info_t
&info
, PastIntervals
&past_intervals
,
400 static bool _has_removal_flag(ObjectStore
*store
, spg_t pgid
);
402 void rm_backoff(BackoffRef b
);
404 void update_snap_mapper_bits(uint32_t bits
) {
405 snap_mapper
.update_bits(bits
);
407 void start_split_stats(const set
<spg_t
>& childpgs
, vector
<object_stat_sum_t
> *v
);
408 virtual void split_colls(
412 const pg_pool_t
*pool
,
413 ObjectStore::Transaction
*t
) = 0;
414 void split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
);
415 void merge_from(map
<spg_t
,PGRef
>& sources
, RecoveryCtx
*rctx
,
417 const pg_merge_meta_t
& last_pg_merge_meta
);
418 void finish_split_stats(const object_stat_sum_t
& stats
, ObjectStore::Transaction
*t
);
420 void scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
);
422 bool is_scrub_registered();
423 void reg_next_scrub();
424 void unreg_next_scrub();
426 void on_info_history_change();
428 void scrub_requested(bool deep
, bool repair
, bool need_auto
= false);
430 bool is_forced_recovery_or_backfill() const {
431 return get_state() & (PG_STATE_FORCED_RECOVERY
| PG_STATE_FORCED_BACKFILL
);
433 bool set_force_recovery(bool b
);
434 bool set_force_backfill(bool b
);
436 void queue_peering_event(PGPeeringEventRef evt
);
437 void do_peering_event(PGPeeringEventRef evt
, RecoveryCtx
*rcx
);
438 void queue_null(epoch_t msg_epoch
, epoch_t query_epoch
);
439 void queue_flushed(epoch_t started_at
);
440 void handle_advance_map(
441 OSDMapRef osdmap
, OSDMapRef lastmap
,
442 vector
<int>& newup
, int up_primary
,
443 vector
<int>& newacting
, int acting_primary
,
445 void handle_activate_map(RecoveryCtx
*rctx
);
446 void handle_initialize(RecoveryCtx
*rctx
);
447 void handle_query_state(Formatter
*f
);
450 * @param ops_begun returns how many recovery ops the function started
451 * @returns true if any useful work was accomplished; false otherwise
453 virtual bool start_recovery_ops(
455 ThreadPool::TPHandle
&handle
,
456 uint64_t *ops_begun
) = 0;
458 // more work after the above, but with a RecoveryCtx
459 void find_unfound(epoch_t queued
, RecoveryCtx
*rctx
);
461 virtual void get_watchers(std::list
<obj_watch_item_t
> *ls
) = 0;
463 void dump_pgstate_history(Formatter
*f
);
464 void dump_missing(Formatter
*f
);
466 void get_pg_stats(std::function
<void(const pg_stat_t
&, epoch_t lec
)> f
);
467 void with_heartbeat_peers(std::function
<void(int)> f
);
470 virtual void on_shutdown() = 0;
472 bool get_must_scrub() const {
473 return scrubber
.must_scrub
;
477 virtual void do_request(
479 ThreadPool::TPHandle
&handle
481 virtual void clear_cache() = 0;
482 virtual int get_cache_obj_count() = 0;
484 virtual void snap_trimmer(epoch_t epoch_queued
) = 0;
485 virtual int do_command(
493 virtual bool agent_work(int max
) = 0;
494 virtual bool agent_work(int max
, int agent_flush_quota
) = 0;
495 virtual void agent_stop() = 0;
496 virtual void agent_delay() = 0;
497 virtual void agent_clear() = 0;
498 virtual void agent_choose_mode_restart() = 0;
500 virtual void on_removal(ObjectStore::Transaction
*t
) = 0;
502 void _delete_some(ObjectStore::Transaction
*t
);
504 virtual void set_dynamic_perf_stats_queries(
505 const std::list
<OSDPerfMetricQuery
> &queries
) {
507 virtual void get_dynamic_perf_stats(DynamicPerfStats
*stats
) {
510 // reference counting
512 uint64_t get_with_id();
513 void put_with_id(uint64_t);
514 void dump_live_ids();
516 void get(const char* tag
);
517 void put(const char* tag
);
523 PG(OSDService
*o
, OSDMapRef curmap
,
524 const PGPool
&pool
, spg_t p
);
528 explicit PG(const PG
& rhs
) = delete;
529 PG
& operator=(const PG
& rhs
) = delete;
536 OSDShard
*osd_shard
= nullptr;
537 OSDShardPGSlot
*pg_slot
= nullptr;
542 OSDMapRef osdmap_ref
;
546 // locking and reference counting.
547 // I destroy myself when the reference count hits zero.
548 // lock() should be called before doing anything.
549 // get() should be called on pointer copy (to another thread, etc.).
550 // put() should be called on destruction of some previously copied pointer.
551 // unlock() when done with the current pointer (_most common_).
552 mutable Mutex _lock
= {"PG::_lock"};
554 std::atomic
<unsigned int> ref
{0};
557 Mutex _ref_id_lock
= {"PG::_ref_id_lock"};
558 map
<uint64_t, string
> _live_ids
;
559 map
<string
, uint64_t> _tag_counts
;
560 uint64_t _ref_id
= 0;
562 friend uint64_t get_with_id(PG
*pg
) { return pg
->get_with_id(); }
563 friend void put_with_id(PG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
567 friend void intrusive_ptr_add_ref(PG
*pg
) {
570 friend void intrusive_ptr_release(PG
*pg
) {
575 // =====================
579 SnapMapper snap_mapper
;
580 bool eio_errors_to_process
= false;
582 virtual PGBackend
*get_pgbackend() = 0;
583 virtual const PGBackend
* get_pgbackend() const = 0;
587 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
588 IsPGRecoverablePredicate
*get_is_recoverable_predicate() const {
589 return get_pgbackend()->get_is_recoverable_predicate();
592 epoch_t last_persisted_osdmap
;
594 void requeue_map_waiters();
596 void update_osdmap_ref(OSDMapRef newmap
) {
597 ceph_assert(_lock
.is_locked_by_me());
598 osdmap_ref
= std::move(newmap
);
604 bool deleting
; // true while in removing or OSD is shutting down
605 atomic
<bool> deleted
= {false};
607 ZTracer::Endpoint trace_endpoint
;
611 bool dirty_info
, dirty_big_info
;
615 pg_info_t info
; ///< current pg info
616 pg_info_t last_written_info
; ///< last written info
617 __u8 info_struct_v
= 0;
618 static const __u8 latest_struct_v
= 10;
619 // v10 is the new past_intervals encoding
620 // v9 was fastinfo_key addition
621 // v8 was the move to a per-pg pgmeta object
622 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
623 // (first appeared in cuttlefish).
624 static const __u8 compat_struct_v
= 10;
625 void upgrade(ObjectStore
*store
);
629 ghobject_t pgmeta_oid
;
631 // ------------------
636 // a loc_count indicates how many locations we know in each of
637 // these distinct sets
640 int other
= 0; //< other
642 friend bool operator<(const loc_count_t
& l
,
643 const loc_count_t
& r
) {
644 return (l
.up
< r
.up
||
646 (l
.other
< r
.other
)));
648 friend ostream
& operator<<(ostream
& out
, const loc_count_t
& l
) {
649 ceph_assert(l
.up
>= 0);
650 ceph_assert(l
.other
>= 0);
651 return out
<< "(" << l
.up
<< "+" << l
.other
<< ")";
658 loc_count_t
_get_count(const set
<pg_shard_t
>& shards
) {
660 for (auto s
: shards
) {
661 if (pg
->upset
.count(s
)) {
670 map
<hobject_t
, pg_missing_item
> needs_recovery_map
;
671 map
<hobject_t
, set
<pg_shard_t
> > missing_loc
;
672 set
<pg_shard_t
> missing_loc_sources
;
674 // for every entry in missing_loc, we count how many of each type of shard we have,
675 // and maintain totals here. The sum of the values for this map will always equal
676 // missing_loc.size().
677 map
< shard_id_t
, map
<loc_count_t
,int> > missing_by_count
;
679 void pgs_by_shard_id(const set
<pg_shard_t
>& s
, map
< shard_id_t
, set
<pg_shard_t
> >& pgsbs
) {
680 if (pg
->get_osdmap()->pg_is_ec(pg
->info
.pgid
.pgid
)) {
681 int num_shards
= pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
);
682 // For completely missing shards initialize with empty set<pg_shard_t>
683 for (int i
= 0 ; i
< num_shards
; ++i
) {
688 pgsbs
[pgs
.shard
].insert(pgs
);
690 pgsbs
[shard_id_t::NO_SHARD
] = s
;
694 void _inc_count(const set
<pg_shard_t
>& s
) {
695 map
< shard_id_t
, set
<pg_shard_t
> > pgsbs
;
696 pgs_by_shard_id(s
, pgsbs
);
697 for (auto shard
: pgsbs
)
698 ++missing_by_count
[shard
.first
][_get_count(shard
.second
)];
700 void _dec_count(const set
<pg_shard_t
>& s
) {
701 map
< shard_id_t
, set
<pg_shard_t
> > pgsbs
;
702 pgs_by_shard_id(s
, pgsbs
);
703 for (auto shard
: pgsbs
) {
704 auto p
= missing_by_count
[shard
.first
].find(_get_count(shard
.second
));
705 ceph_assert(p
!= missing_by_count
[shard
.first
].end());
706 if (--p
->second
== 0) {
707 missing_by_count
[shard
.first
].erase(p
);
713 set
<pg_shard_t
> empty_set
;
715 boost::scoped_ptr
<IsPGReadablePredicate
> is_readable
;
716 boost::scoped_ptr
<IsPGRecoverablePredicate
> is_recoverable
;
717 explicit MissingLoc(PG
*pg
)
719 void set_backend_predicates(
720 IsPGReadablePredicate
*_is_readable
,
721 IsPGRecoverablePredicate
*_is_recoverable
) {
722 is_readable
.reset(_is_readable
);
723 is_recoverable
.reset(_is_recoverable
);
725 std::ostream
& gen_prefix(std::ostream
& out
) const {
726 return pg
->gen_prefix(out
);
729 const hobject_t
&hoid
,
730 eversion_t
*v
= 0) const {
731 map
<hobject_t
, pg_missing_item
>::const_iterator i
=
732 needs_recovery_map
.find(hoid
);
733 if (i
== needs_recovery_map
.end())
739 bool is_deleted(const hobject_t
&hoid
) const {
740 auto i
= needs_recovery_map
.find(hoid
);
741 if (i
== needs_recovery_map
.end())
743 return i
->second
.is_delete();
745 bool is_unfound(const hobject_t
&hoid
) const {
746 auto it
= needs_recovery_map
.find(hoid
);
747 if (it
== needs_recovery_map
.end()) {
750 if (it
->second
.is_delete()) {
753 auto mit
= missing_loc
.find(hoid
);
754 return mit
== missing_loc
.end() || !(*is_recoverable
)(mit
->second
);
756 bool readable_with_acting(
757 const hobject_t
&hoid
,
758 const set
<pg_shard_t
> &acting
) const;
759 uint64_t num_unfound() const {
761 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
762 needs_recovery_map
.begin();
763 i
!= needs_recovery_map
.end();
765 if (i
->second
.is_delete())
767 auto mi
= missing_loc
.find(i
->first
);
768 if (mi
== missing_loc
.end() || !(*is_recoverable
)(mi
->second
))
774 bool have_unfound() const {
775 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
776 needs_recovery_map
.begin();
777 i
!= needs_recovery_map
.end();
779 if (i
->second
.is_delete())
781 auto mi
= missing_loc
.find(i
->first
);
782 if (mi
== missing_loc
.end() || !(*is_recoverable
)(mi
->second
))
788 needs_recovery_map
.clear();
790 missing_loc_sources
.clear();
791 missing_by_count
.clear();
794 void add_location(const hobject_t
&hoid
, pg_shard_t location
) {
795 auto p
= missing_loc
.find(hoid
);
796 if (p
== missing_loc
.end()) {
797 p
= missing_loc
.emplace(hoid
, set
<pg_shard_t
>()).first
;
799 _dec_count(p
->second
);
801 p
->second
.insert(location
);
802 _inc_count(p
->second
);
804 void remove_location(const hobject_t
&hoid
, pg_shard_t location
) {
805 auto p
= missing_loc
.find(hoid
);
806 if (p
!= missing_loc
.end()) {
807 _dec_count(p
->second
);
808 p
->second
.erase(location
);
809 if (p
->second
.empty()) {
810 missing_loc
.erase(p
);
812 _inc_count(p
->second
);
817 void clear_location(const hobject_t
&hoid
) {
818 auto p
= missing_loc
.find(hoid
);
819 if (p
!= missing_loc
.end()) {
820 _dec_count(p
->second
);
821 missing_loc
.erase(p
);
825 void add_active_missing(const pg_missing_t
&missing
) {
826 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
827 missing
.get_items().begin();
828 i
!= missing
.get_items().end();
830 map
<hobject_t
, pg_missing_item
>::const_iterator j
=
831 needs_recovery_map
.find(i
->first
);
832 if (j
== needs_recovery_map
.end()) {
833 needs_recovery_map
.insert(*i
);
835 lgeneric_dout(pg
->cct
, 0) << this << " " << pg
->info
.pgid
<< " unexpected need for "
836 << i
->first
<< " have " << j
->second
837 << " tried to add " << i
->second
<< dendl
;
838 ceph_assert(i
->second
.need
== j
->second
.need
);
843 void add_missing(const hobject_t
&hoid
, eversion_t need
, eversion_t have
, bool is_delete
=false) {
844 needs_recovery_map
[hoid
] = pg_missing_item(need
, have
, is_delete
);
846 void revise_need(const hobject_t
&hoid
, eversion_t need
) {
847 auto it
= needs_recovery_map
.find(hoid
);
848 ceph_assert(it
!= needs_recovery_map
.end());
849 it
->second
.need
= need
;
852 /// Adds info about a possible recovery source
853 bool add_source_info(
854 pg_shard_t source
, ///< [in] source
855 const pg_info_t
&oinfo
, ///< [in] info
856 const pg_missing_t
&omissing
, ///< [in] (optional) missing
857 ThreadPool::TPHandle
* handle
///< [in] ThreadPool handle
858 ); ///< @return whether a new object location was discovered
860 /// Adds recovery sources in batch
861 void add_batch_sources_info(
862 const set
<pg_shard_t
> &sources
, ///< [in] a set of resources which can be used for all objects
863 ThreadPool::TPHandle
* handle
///< [in] ThreadPool handle
866 /// Uses osdmap to update structures for now down sources
867 void check_recovery_sources(const OSDMapRef
& osdmap
);
869 /// Call when hoid is no longer missing in acting set
870 void recovered(const hobject_t
&hoid
) {
871 needs_recovery_map
.erase(hoid
);
872 auto p
= missing_loc
.find(hoid
);
873 if (p
!= missing_loc
.end()) {
874 _dec_count(p
->second
);
875 missing_loc
.erase(p
);
879 /// Call to update structures for hoid after a change
881 const hobject_t
&hoid
,
883 const set
<pg_shard_t
> to_recover
,
884 const pg_info_t
&info
,
885 const pg_missing_t
&missing
,
886 const map
<pg_shard_t
, pg_missing_t
> &pmissing
,
887 const map
<pg_shard_t
, pg_info_t
> &pinfo
) {
889 boost::optional
<pg_missing_item
> item
;
890 auto miter
= missing
.get_items().find(hoid
);
891 if (miter
!= missing
.get_items().end()) {
892 item
= miter
->second
;
894 for (auto &&i
: to_recover
) {
897 auto pmiter
= pmissing
.find(i
);
898 ceph_assert(pmiter
!= pmissing
.end());
899 miter
= pmiter
->second
.get_items().find(hoid
);
900 if (miter
!= pmiter
->second
.get_items().end()) {
901 item
= miter
->second
;
907 return; // recovered!
909 needs_recovery_map
[hoid
] = *item
;
910 if (item
->is_delete())
913 missing_loc
.insert(make_pair(hoid
, set
<pg_shard_t
>())).first
;
914 ceph_assert(info
.last_backfill
.is_max());
915 ceph_assert(info
.last_update
>= item
->need
);
916 if (!missing
.is_missing(hoid
))
917 mliter
->second
.insert(self
);
918 for (auto &&i
: pmissing
) {
921 auto pinfoiter
= pinfo
.find(i
.first
);
922 ceph_assert(pinfoiter
!= pinfo
.end());
923 if (item
->need
<= pinfoiter
->second
.last_update
&&
924 hoid
<= pinfoiter
->second
.last_backfill
&&
925 !i
.second
.is_missing(hoid
))
926 mliter
->second
.insert(i
.first
);
928 _inc_count(mliter
->second
);
931 const set
<pg_shard_t
> &get_locations(const hobject_t
&hoid
) const {
932 auto it
= missing_loc
.find(hoid
);
933 return it
== missing_loc
.end() ? empty_set
: it
->second
;
935 const map
<hobject_t
, set
<pg_shard_t
>> &get_missing_locs() const {
938 const map
<hobject_t
, pg_missing_item
> &get_needs_recovery() const {
939 return needs_recovery_map
;
941 const map
< shard_id_t
, map
<loc_count_t
,int> > &get_missing_by_count() const {
942 return missing_by_count
;
946 PastIntervals past_intervals
;
948 interval_set
<snapid_t
> snap_trimq
;
950 /* You should not use these items without taking their respective queue locks
951 * (if they have one) */
952 xlist
<PG
*>::item stat_queue_item
;
954 bool recovery_queued
;
956 int recovery_ops_active
;
957 set
<pg_shard_t
> waiting_on_backfill
;
958 #ifdef DEBUG_RECOVERY_OIDS
959 multiset
<hobject_t
> recovering_oids
;
963 int role
; // 0 = primary, 1 = replica, -1=none.
964 uint64_t state
; // PG_STATE_*
966 bool send_notify
; ///< true if we are non-primary and should notify the primary
969 eversion_t last_update_ondisk
; // last_update that has committed; ONLY DEFINED WHEN is_active()
970 eversion_t last_complete_ondisk
; // last_complete that has committed.
971 eversion_t last_update_applied
;
973 // entries <= last_rollback_info_trimmed_to_applied have been trimmed
974 eversion_t last_rollback_info_trimmed_to_applied
;
979 pg_shard_t pg_whoami
;
980 pg_shard_t up_primary
;
981 vector
<int> up
, acting
, want_acting
;
982 // acting_recovery_backfill contains shards that are acting,
983 // async recovery targets, or backfill targets.
984 set
<pg_shard_t
> acting_recovery_backfill
, actingset
, upset
;
985 map
<pg_shard_t
,eversion_t
> peer_last_complete_ondisk
;
986 eversion_t min_last_complete_ondisk
; // up: min over last_complete_ondisk, peer_last_complete_ondisk
987 eversion_t pg_trim_to
;
989 set
<int> blocked_by
; ///< osds we are blocked by (for pg stats)
992 // [primary only] content recovery state
993 struct BufferedRecoveryMessages
{
994 map
<int, map
<spg_t
, pg_query_t
> > query_map
;
995 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > info_map
;
996 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > notify_list
;
1000 bool dne() { return info
.dne(); }
1001 struct RecoveryCtx
{
1003 map
<int, map
<spg_t
, pg_query_t
> > *query_map
;
1004 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
;
1005 map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
;
1006 ObjectStore::Transaction
*transaction
;
1007 ThreadPool::TPHandle
* handle
;
1008 RecoveryCtx(map
<int, map
<spg_t
, pg_query_t
> > *query_map
,
1010 vector
<pair
<pg_notify_t
, PastIntervals
> > > *info_map
,
1012 vector
<pair
<pg_notify_t
, PastIntervals
> > > *notify_list
,
1013 ObjectStore::Transaction
*transaction
)
1014 : query_map(query_map
), info_map(info_map
),
1015 notify_list(notify_list
),
1016 transaction(transaction
),
1019 RecoveryCtx(BufferedRecoveryMessages
&buf
, RecoveryCtx
&rctx
)
1020 : query_map(&(buf
.query_map
)),
1021 info_map(&(buf
.info_map
)),
1022 notify_list(&(buf
.notify_list
)),
1023 transaction(rctx
.transaction
),
1024 handle(rctx
.handle
) {}
1026 void accept_buffered_messages(BufferedRecoveryMessages
&m
) {
1027 ceph_assert(query_map
);
1028 ceph_assert(info_map
);
1029 ceph_assert(notify_list
);
1030 for (map
<int, map
<spg_t
, pg_query_t
> >::iterator i
= m
.query_map
.begin();
1031 i
!= m
.query_map
.end();
1033 map
<spg_t
, pg_query_t
> &omap
= (*query_map
)[i
->first
];
1034 for (map
<spg_t
, pg_query_t
>::iterator j
= i
->second
.begin();
1035 j
!= i
->second
.end();
1037 omap
[j
->first
] = j
->second
;
1040 for (map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator i
1041 = m
.info_map
.begin();
1042 i
!= m
.info_map
.end();
1044 vector
<pair
<pg_notify_t
, PastIntervals
> > &ovec
=
1045 (*info_map
)[i
->first
];
1046 ovec
.reserve(ovec
.size() + i
->second
.size());
1047 ovec
.insert(ovec
.end(), i
->second
.begin(), i
->second
.end());
1049 for (map
<int, vector
<pair
<pg_notify_t
, PastIntervals
> > >::iterator i
1050 = m
.notify_list
.begin();
1051 i
!= m
.notify_list
.end();
1053 vector
<pair
<pg_notify_t
, PastIntervals
> > &ovec
=
1054 (*notify_list
)[i
->first
];
1055 ovec
.reserve(ovec
.size() + i
->second
.size());
1056 ovec
.insert(ovec
.end(), i
->second
.begin(), i
->second
.end());
1060 void send_notify(pg_shard_t to
,
1061 const pg_notify_t
&info
, const PastIntervals
&pi
) {
1062 ceph_assert(notify_list
);
1063 (*notify_list
)[to
.osd
].push_back(make_pair(info
, pi
));
1068 PGStateHistory pgstate_history
;
1071 const char *state_name
;
1074 const char *get_state_name() { return state_name
; }
1075 NamedState(PG
*pg_
, const char *state_name_
)
1076 : state_name(state_name_
), enter_time(ceph_clock_now()), pg(pg_
) {
1077 pg
->pgstate_history
.enter(pg
, enter_time
, state_name
);
1079 virtual ~NamedState() { pg
->pgstate_history
.exit(state_name
); }
1087 * peer_info -- projected (updates _before_ replicas ack)
1088 * peer_missing -- committed (updates _after_ replicas ack)
1092 set
<pg_shard_t
> stray_set
; // non-acting osds that have PG data.
1093 map
<pg_shard_t
, pg_info_t
> peer_info
; // info from peers (stray or prior)
1094 map
<pg_shard_t
, int64_t> peer_bytes
; // Peer's num_bytes from peer_info
1095 set
<pg_shard_t
> peer_purged
; // peers purged
1096 map
<pg_shard_t
, pg_missing_t
> peer_missing
;
1097 set
<pg_shard_t
> peer_log_requested
; // logs i've requested (and start stamps)
1098 set
<pg_shard_t
> peer_missing_requested
;
1100 // i deleted these strays; ignore racing PGInfo from them
1101 set
<pg_shard_t
> peer_activated
;
1103 // primary-only, recovery-only state
1104 set
<pg_shard_t
> might_have_unfound
; // These osds might have objects on them
1105 // which are unfound on the primary
1106 epoch_t last_peering_reset
;
1108 epoch_t
get_last_peering_reset() const {
1109 return last_peering_reset
;
1112 /* heartbeat peers */
1113 void set_probe_targets(const set
<pg_shard_t
> &probe_set
);
1114 void clear_probe_targets();
1116 Mutex heartbeat_peer_lock
;
1117 set
<int> heartbeat_peers
;
1118 set
<int> probe_targets
;
1124 * Represents the objects in a range [begin, end)
1127 * 1) begin == end == hobject_t() indicates the the interval is unpopulated
1128 * 2) Else, objects contains all objects in [begin, end)
1130 struct BackfillInterval
{
1131 // info about a backfill interval on a peer
1132 eversion_t version
; /// version at which the scan occurred
1133 map
<hobject_t
,eversion_t
> objects
;
1139 *this = BackfillInterval();
1142 /// clear objects list only
1143 void clear_objects() {
1147 /// reinstantiate with a new start+end position and sort order
1148 void reset(hobject_t start
) {
1150 begin
= end
= start
;
1153 /// true if there are no objects in this interval
1154 bool empty() const {
1155 return objects
.empty();
1158 /// true if interval extends to the end of the range
1159 bool extends_to_end() const {
1160 return end
.is_max();
1163 /// removes items <= soid and adjusts begin to the first object
1164 void trim_to(const hobject_t
&soid
) {
1166 while (!objects
.empty() &&
1167 objects
.begin()->first
<= soid
) {
1172 /// Adjusts begin to the first object
1174 if (!objects
.empty())
1175 begin
= objects
.begin()->first
;
1180 /// drop first entry, and adjust @begin accordingly
1182 ceph_assert(!objects
.empty());
1183 objects
.erase(objects
.begin());
1188 void dump(Formatter
*f
) const {
1189 f
->dump_stream("begin") << begin
;
1190 f
->dump_stream("end") << end
;
1191 f
->open_array_section("objects");
1192 for (map
<hobject_t
, eversion_t
>::const_iterator i
=
1196 f
->open_object_section("object");
1197 f
->dump_stream("object") << i
->first
;
1198 f
->dump_stream("version") << i
->second
;
1206 BackfillInterval backfill_info
;
1207 map
<pg_shard_t
, BackfillInterval
> peer_backfill_info
;
1208 bool backfill_reserved
;
1209 bool backfill_reserving
;
1211 set
<pg_shard_t
> backfill_targets
, async_recovery_targets
;
1213 // The primary's num_bytes and local num_bytes for this pg, only valid
1214 // during backfill for non-primary shards.
1215 // Both of these are adjusted for EC to reflect the on-disk bytes
1216 std::atomic
<int64_t> primary_num_bytes
= 0;
1217 std::atomic
<int64_t> local_num_bytes
= 0;
1220 bool is_backfill_targets(pg_shard_t osd
) {
1221 return backfill_targets
.count(osd
);
1224 // Space reserved for backfill is primary_num_bytes - local_num_bytes
1225 // Don't care that difference itself isn't atomic
1226 uint64_t get_reserved_num_bytes() {
1227 int64_t primary
= primary_num_bytes
.load();
1228 int64_t local
= local_num_bytes
.load();
1229 if (primary
> local
)
1230 return primary
- local
;
1235 bool is_remote_backfilling() {
1236 return primary_num_bytes
.load() > 0;
1239 void set_reserved_num_bytes(int64_t primary
, int64_t local
);
1240 void clear_reserved_num_bytes();
1242 // If num_bytes are inconsistent and local_num- goes negative
1243 // it's ok, because it would then be ignored.
1245 // The value of num_bytes could be negative,
1246 // but we don't let local_num_bytes go negative.
1247 void add_local_num_bytes(int64_t num_bytes
) {
1249 int64_t prev_bytes
= local_num_bytes
.load();
1252 new_bytes
= prev_bytes
+ num_bytes
;
1255 } while(!local_num_bytes
.compare_exchange_weak(prev_bytes
, new_bytes
));
1258 void sub_local_num_bytes(int64_t num_bytes
) {
1259 ceph_assert(num_bytes
>= 0);
1261 int64_t prev_bytes
= local_num_bytes
.load();
1264 new_bytes
= prev_bytes
- num_bytes
;
1267 } while(!local_num_bytes
.compare_exchange_weak(prev_bytes
, new_bytes
));
1270 // The value of num_bytes could be negative,
1271 // but we don't let info.stats.stats.sum.num_bytes go negative.
1272 void add_num_bytes(int64_t num_bytes
) {
1273 ceph_assert(_lock
.is_locked_by_me());
1275 info
.stats
.stats
.sum
.num_bytes
+= num_bytes
;
1276 if (info
.stats
.stats
.sum
.num_bytes
< 0) {
1277 info
.stats
.stats
.sum
.num_bytes
= 0;
1281 void sub_num_bytes(int64_t num_bytes
) {
1282 ceph_assert(_lock
.is_locked_by_me());
1283 ceph_assert(num_bytes
>= 0);
1285 info
.stats
.stats
.sum
.num_bytes
-= num_bytes
;
1286 if (info
.stats
.stats
.sum
.num_bytes
< 0) {
1287 info
.stats
.stats
.sum
.num_bytes
= 0;
1292 // Only used in testing so not worried about needing the PG lock here
1293 int64_t get_stats_num_bytes() {
1294 Mutex::Locker
l(_lock
);
1295 int num_bytes
= info
.stats
.stats
.sum
.num_bytes
;
1296 if (pool
.info
.is_erasure()) {
1297 num_bytes
/= (int)get_pgbackend()->get_ec_data_chunk_count();
1298 // Round up each object by a stripe
1299 num_bytes
+= get_pgbackend()->get_ec_stripe_chunk_size() * info
.stats
.stats
.sum
.num_objects
;
1301 int64_t lnb
= local_num_bytes
.load();
1302 if (lnb
&& lnb
!= num_bytes
) {
1303 lgeneric_dout(cct
, 0) << this << " " << info
.pgid
<< " num_bytes mismatch "
1304 << lnb
<< " vs stats "
1305 << info
.stats
.stats
.sum
.num_bytes
<< " / chunk "
1306 << get_pgbackend()->get_ec_data_chunk_count()
1315 * blocked request wait hierarchy
1317 * In order to preserve request ordering we need to be careful about the
1318 * order in which blocked requests get requeued. Generally speaking, we
1319 * push the requests back up to the op_wq in reverse order (most recent
1320 * request first) so that they come back out again in the original order.
1321 * However, because there are multiple wait queues, we need to requeue
1322 * waitlists in order. Generally speaking, we requeue the wait lists
1323 * that are checked first.
1325 * Here are the various wait lists, in the order they are used during
1326 * request processing, with notes:
1329 * - may start or stop blocking at any time (depending on client epoch)
1330 * - waiting_for_peered
1331 * - !is_peered() or flushes_in_progress
1332 * - only starts blocking on interval change; never restarts
1333 * - waiting_for_active
1335 * - only starts blocking on interval change; never restarts
1336 * - waiting_for_flush
1337 * - is_active() and flushes_in_progress
1338 * - waiting for final flush during activate
1339 * - waiting_for_scrub
1340 * - starts and stops blocking for varying intervals during scrub
1341 * - waiting_for_unreadable_object
1342 * - never restarts once object is readable (* except for EIO?)
1343 * - waiting_for_degraded_object
1344 * - never restarts once object is writeable (* except for EIO?)
1345 * - waiting_for_blocked_object
1346 * - starts and stops based on proxied op activity
1348 * - starts and stops based on read/write activity
1352 * 1. During and interval change, we requeue *everything* in the above order.
1354 * 2. When an obc rwlock is released, we check for a scrub block and requeue
1355 * the op there if it applies. We ignore the unreadable/degraded/blocked
1356 * queues because we assume they cannot apply at that time (this is
1357 * probably mostly true).
1359 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
1362 * These three behaviors are generally sufficient to maintain ordering, with
1363 * the possible exception of cases where we make an object degraded or
1364 * unreadable that was previously okay, e.g. when scrub or op processing
1365 * encounter an unexpected error. FIXME.
1369 unsigned flushes_in_progress
;
1371 // ops with newer maps than our (or blocked behind them)
1372 // track these by client, since inter-request ordering doesn't otherwise
1374 unordered_map
<entity_name_t
,list
<OpRequestRef
>> waiting_for_map
;
1376 // ops waiting on peered
1377 list
<OpRequestRef
> waiting_for_peered
;
1379 // ops waiting on active (require peered as well)
1380 list
<OpRequestRef
> waiting_for_active
;
1381 list
<OpRequestRef
> waiting_for_flush
;
1382 list
<OpRequestRef
> waiting_for_scrub
;
1384 list
<OpRequestRef
> waiting_for_cache_not_full
;
1385 list
<OpRequestRef
> waiting_for_clean_to_primary_repair
;
1386 map
<hobject_t
, list
<OpRequestRef
>> waiting_for_unreadable_object
,
1387 waiting_for_degraded_object
,
1388 waiting_for_blocked_object
;
1390 set
<hobject_t
> objects_blocked_on_cache_full
;
1391 map
<hobject_t
,snapid_t
> objects_blocked_on_degraded_snap
;
1392 map
<hobject_t
,ObjectContextRef
> objects_blocked_on_snap_promotion
;
1394 // Callbacks should assume pg (and nothing else) is locked
1395 map
<hobject_t
, list
<Context
*>> callbacks_for_degraded_object
;
1398 list
<tuple
<OpRequestRef
, version_t
, int> > > waiting_for_ondisk
;
1400 void requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
);
1401 void requeue_op(OpRequestRef op
);
1402 void requeue_ops(list
<OpRequestRef
> &l
);
1404 // stats that persist lazily
1405 object_stat_collection_t unstable_stats
;
1408 Mutex pg_stats_publish_lock
;
1409 bool pg_stats_publish_valid
;
1410 pg_stat_t pg_stats_publish
;
1412 void _update_calc_stats();
1413 void _update_blocked_by();
1414 friend class TestOpsSocketHook
;
1415 void publish_stats_to_osd();
1416 void clear_publish_stats();
1418 void clear_primary_state();
1420 bool is_acting_recovery_backfill(pg_shard_t osd
) const {
1421 return acting_recovery_backfill
.count(osd
);
1423 bool is_acting(pg_shard_t osd
) const {
1424 return has_shard(pool
.info
.is_erasure(), acting
, osd
);
1426 bool is_up(pg_shard_t osd
) const {
1427 return has_shard(pool
.info
.is_erasure(), up
, osd
);
1429 static bool has_shard(bool ec
, const vector
<int>& v
, pg_shard_t osd
) {
1431 return v
.size() > (unsigned)osd
.shard
&& v
[osd
.shard
] == osd
.osd
;
1433 return std::find(v
.begin(), v
.end(), osd
.osd
) != v
.end();
1437 bool needs_recovery() const;
1438 bool needs_backfill() const;
1440 /// clip calculated priority to reasonable range
1441 int clamp_recovery_priority(int prio
, int pool_recovery_prio
, int max
);
1442 /// get log recovery reservation priority
1443 unsigned get_recovery_priority();
1444 /// get backfill reservation priority
1445 unsigned get_backfill_priority();
1446 /// get priority for pg deletion
1447 unsigned get_delete_priority();
1449 void try_mark_clean(); ///< mark an active pg clean
1451 /// return [start,end) bounds for required past_intervals
1452 static pair
<epoch_t
, epoch_t
> get_required_past_interval_bounds(
1453 const pg_info_t
&info
,
1454 epoch_t oldest_map
) {
1455 epoch_t start
= std::max(
1456 info
.history
.last_epoch_clean
? info
.history
.last_epoch_clean
:
1457 info
.history
.epoch_pool_created
,
1459 epoch_t end
= std::max(
1460 info
.history
.same_interval_since
,
1461 info
.history
.epoch_pool_created
);
1462 return make_pair(start
, end
);
1464 void check_past_interval_bounds() const;
1465 PastIntervals::PriorSet
build_prior();
1467 void remove_down_peer_info(const OSDMapRef osdmap
);
1469 bool adjust_need_up_thru(const OSDMapRef osdmap
);
1471 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap
) const;
1472 virtual void dump_recovery_info(Formatter
*f
) const = 0;
1474 void calc_min_last_complete_ondisk() {
1475 eversion_t min
= last_complete_ondisk
;
1476 ceph_assert(!acting_recovery_backfill
.empty());
1477 for (set
<pg_shard_t
>::iterator i
= acting_recovery_backfill
.begin();
1478 i
!= acting_recovery_backfill
.end();
1480 if (*i
== get_primary()) continue;
1481 if (peer_last_complete_ondisk
.count(*i
) == 0)
1482 return; // we don't have complete info
1483 eversion_t a
= peer_last_complete_ondisk
[*i
];
1487 if (min
== min_last_complete_ondisk
)
1489 min_last_complete_ondisk
= min
;
1493 virtual void calc_trim_to() = 0;
1495 virtual void calc_trim_to_aggressive() = 0;
1497 void proc_replica_log(pg_info_t
&oinfo
, const pg_log_t
&olog
,
1498 pg_missing_t
& omissing
, pg_shard_t from
);
1499 void proc_master_log(ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
,
1500 pg_missing_t
& omissing
, pg_shard_t from
);
1501 bool proc_replica_info(
1502 pg_shard_t from
, const pg_info_t
&info
, epoch_t send_epoch
);
1504 struct PGLogEntryHandler
: public PGLog::LogEntryHandler
{
1506 ObjectStore::Transaction
*t
;
1507 PGLogEntryHandler(PG
*pg
, ObjectStore::Transaction
*t
) : pg(pg
), t(t
) {}
1510 void remove(const hobject_t
&hoid
) override
{
1511 pg
->get_pgbackend()->remove(hoid
, t
);
1513 void try_stash(const hobject_t
&hoid
, version_t v
) override
{
1514 pg
->get_pgbackend()->try_stash(hoid
, v
, t
);
1516 void rollback(const pg_log_entry_t
&entry
) override
{
1517 ceph_assert(entry
.can_rollback());
1518 pg
->get_pgbackend()->rollback(entry
, t
);
1520 void rollforward(const pg_log_entry_t
&entry
) override
{
1521 pg
->get_pgbackend()->rollforward(entry
, t
);
1523 void trim(const pg_log_entry_t
&entry
) override
{
1524 pg
->get_pgbackend()->trim(entry
, t
);
1528 void update_object_snap_mapping(
1529 ObjectStore::Transaction
*t
, const hobject_t
&soid
,
1530 const set
<snapid_t
> &snaps
);
1531 void clear_object_snap_mapping(
1532 ObjectStore::Transaction
*t
, const hobject_t
&soid
);
1533 void remove_snap_mapped_object(
1534 ObjectStore::Transaction
& t
, const hobject_t
& soid
);
1536 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
1537 pg_log_t
&olog
, pg_shard_t from
);
1538 void rewind_divergent_log(ObjectStore::Transaction
& t
, eversion_t newhead
);
1539 bool search_for_missing(
1540 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
1544 void discover_all_missing(std::map
<int, map
<spg_t
,pg_query_t
> > &query_map
);
1546 map
<pg_shard_t
, pg_info_t
>::const_iterator
find_best_info(
1547 const map
<pg_shard_t
, pg_info_t
> &infos
,
1548 bool restrict_to_up_acting
,
1549 bool *history_les_bound
) const;
1550 static void calc_ec_acting(
1551 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1553 const vector
<int> &acting
,
1554 const vector
<int> &up
,
1555 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1556 bool restrict_to_up_acting
,
1558 set
<pg_shard_t
> *backfill
,
1559 set
<pg_shard_t
> *acting_backfill
,
1561 static void calc_replicated_acting(
1562 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1563 uint64_t force_auth_primary_missing_objects
,
1565 const vector
<int> &acting
,
1566 const vector
<int> &up
,
1567 pg_shard_t up_primary
,
1568 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1569 bool restrict_to_up_acting
,
1571 set
<pg_shard_t
> *backfill
,
1572 set
<pg_shard_t
> *acting_backfill
,
1573 const OSDMapRef osdmap
,
1575 void choose_async_recovery_ec(const map
<pg_shard_t
, pg_info_t
> &all_info
,
1576 const pg_info_t
&auth_info
,
1578 set
<pg_shard_t
> *async_recovery
,
1579 const OSDMapRef osdmap
) const;
1580 void choose_async_recovery_replicated(const map
<pg_shard_t
, pg_info_t
> &all_info
,
1581 const pg_info_t
&auth_info
,
1583 set
<pg_shard_t
> *async_recovery
,
1584 const OSDMapRef osdmap
) const;
1586 bool recoverable_and_ge_min_size(const vector
<int> &want
) const;
1587 bool choose_acting(pg_shard_t
&auth_log_shard
,
1588 bool restrict_to_up_acting
,
1589 bool *history_les_bound
);
1590 void build_might_have_unfound();
1592 ObjectStore::Transaction
& t
,
1593 epoch_t activation_epoch
,
1594 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1596 vector
<pair
<pg_notify_t
, PastIntervals
> > > *activator_map
,
1599 struct C_PG_ActivateCommitted
: public Context
{
1602 epoch_t activation_epoch
;
1603 C_PG_ActivateCommitted(PG
*p
, epoch_t e
, epoch_t ae
)
1604 : pg(p
), epoch(e
), activation_epoch(ae
) {}
1605 void finish(int r
) override
{
1606 pg
->_activate_committed(epoch
, activation_epoch
);
1609 void _activate_committed(epoch_t epoch
, epoch_t activation_epoch
);
1610 void all_activated_and_committed();
1612 void proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&info
);
1614 bool have_unfound() const {
1615 return missing_loc
.have_unfound();
1617 uint64_t get_num_unfound() const {
1618 return missing_loc
.num_unfound();
1620 bool all_missing_unfound() const {
1621 const auto& missing
= pg_log
.get_missing();
1622 if (!missing
.have_missing())
1624 for (auto& m
: missing
.get_items()) {
1625 if (!missing_loc
.is_unfound(m
.first
))
1631 virtual void check_local() = 0;
1633 void purge_strays();
1635 void update_heartbeat_peers();
1637 Context
*finish_sync_event
;
1639 Context
*finish_recovery();
1640 void _finish_recovery(Context
*c
);
1641 struct C_PG_FinishRecovery
: public Context
{
1643 explicit C_PG_FinishRecovery(PG
*p
) : pg(p
) {}
1644 void finish(int r
) override
{
1645 pg
->_finish_recovery(this);
1648 void cancel_recovery();
1649 void clear_recovery_state();
1650 virtual void _clear_recovery_state() = 0;
1651 virtual void check_recovery_sources(const OSDMapRef
& newmap
) = 0;
1652 void start_recovery_op(const hobject_t
& soid
);
1653 void finish_recovery_op(const hobject_t
& soid
, bool dequeue
=false);
1655 virtual void _split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
) = 0;
1657 friend class C_OSD_RepModify_Commit
;
1658 friend class C_DeleteMore
;
1661 Mutex backoff_lock
; // orders inside Backoff::lock
1662 map
<hobject_t
,set
<BackoffRef
>> backoffs
;
1664 void add_backoff(SessionRef s
, const hobject_t
& begin
, const hobject_t
& end
);
1665 void release_backoffs(const hobject_t
& begin
, const hobject_t
& end
);
1666 void release_backoffs(const hobject_t
& o
) {
1667 release_backoffs(o
, o
);
1669 void clear_backoffs();
1671 void add_pg_backoff(SessionRef s
) {
1672 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1673 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1674 add_backoff(s
, begin
, end
);
1677 void release_pg_backoffs() {
1678 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1679 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1680 release_backoffs(begin
, end
);
1691 set
<pg_shard_t
> reserved_peers
;
1692 bool local_reserved
, remote_reserved
, reserve_failed
;
1693 epoch_t epoch_start
;
1695 // common to both scrubs
1697 set
<pg_shard_t
> waiting_on_whom
;
1701 ScrubMap primary_scrubmap
;
1702 ScrubMapBuilder primary_scrubmap_pos
;
1703 epoch_t replica_scrub_start
= 0;
1704 ScrubMap replica_scrubmap
;
1705 ScrubMapBuilder replica_scrubmap_pos
;
1706 map
<pg_shard_t
, ScrubMap
> received_maps
;
1707 OpRequestRef active_rep_scrub
;
1708 utime_t scrub_reg_stamp
; // stamp we registered for
1710 static utime_t
scrub_must_stamp() { return utime_t(0,1); }
1712 omap_stat_t omap_stats
= (const struct omap_stat_t
){ 0 };
1715 bool sleeping
= false;
1716 bool needs_sleep
= true;
1717 utime_t sleep_start
;
1719 // flags to indicate explicitly requested scrubs (by admin)
1720 bool must_scrub
, must_deep_scrub
, must_repair
, need_auto
;
1722 // Priority to use for scrub scheduling
1723 unsigned priority
= 0;
1726 // this flag indicates whether we would like to do auto-repair of the PG or not
1728 // this flag indicates that we are scrubbing post repair to verify everything is fixed
1730 // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1731 // we should deep scrub in order to auto repair
1732 bool deep_scrub_on_error
;
1734 // Maps from objects with errors to missing/inconsistent peers
1735 map
<hobject_t
, set
<pg_shard_t
>> missing
;
1736 map
<hobject_t
, set
<pg_shard_t
>> inconsistent
;
1738 // Map from object with errors to good peers
1739 map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >> authoritative
;
1741 // Cleaned map pending snap metadata scrub
1742 ScrubMap cleaned_meta_map
;
1744 void clean_meta_map(ScrubMap
&for_meta_scrub
) {
1746 cleaned_meta_map
.objects
.empty()) {
1747 cleaned_meta_map
.swap(for_meta_scrub
);
1749 auto iter
= cleaned_meta_map
.objects
.end();
1750 --iter
; // not empty, see if clause
1751 auto begin
= cleaned_meta_map
.objects
.begin();
1752 if (iter
->first
.has_snapset()) {
1755 while (iter
!= begin
) {
1757 if (next
->first
.get_head() != iter
->first
.get_head()) {
1763 for_meta_scrub
.objects
.insert(begin
, iter
);
1764 cleaned_meta_map
.objects
.erase(begin
, iter
);
1768 // digest updates which we are waiting on
1769 int num_digest_updates_pending
;
1772 hobject_t start
, end
; // [start,end)
1773 hobject_t max_end
; // Largest end that may have been sent to replicas
1774 eversion_t subset_last_update
;
1776 // chunky scrub state
1786 WAIT_DIGEST_UPDATES
,
1791 std::unique_ptr
<Scrub::Store
> store
;
1795 int preempt_divisor
;
1797 list
<Context
*> callbacks
;
1798 void add_callback(Context
*context
) {
1799 callbacks
.push_back(context
);
1801 void run_callbacks() {
1802 list
<Context
*> to_run
;
1803 to_run
.swap(callbacks
);
1804 for (list
<Context
*>::iterator i
= to_run
.begin();
1811 static const char *state_string(const PG::Scrubber::State
& state
) {
1812 const char *ret
= NULL
;
1815 case INACTIVE
: ret
= "INACTIVE"; break;
1816 case NEW_CHUNK
: ret
= "NEW_CHUNK"; break;
1817 case WAIT_PUSHES
: ret
= "WAIT_PUSHES"; break;
1818 case WAIT_LAST_UPDATE
: ret
= "WAIT_LAST_UPDATE"; break;
1819 case BUILD_MAP
: ret
= "BUILD_MAP"; break;
1820 case BUILD_MAP_DONE
: ret
= "BUILD_MAP_DONE"; break;
1821 case WAIT_REPLICAS
: ret
= "WAIT_REPLICAS"; break;
1822 case COMPARE_MAPS
: ret
= "COMPARE_MAPS"; break;
1823 case WAIT_DIGEST_UPDATES
: ret
= "WAIT_DIGEST_UPDATES"; break;
1824 case FINISH
: ret
= "FINISH"; break;
1825 case BUILD_MAP_REPLICA
: ret
= "BUILD_MAP_REPLICA"; break;
1830 bool is_chunky_scrub_active() const { return state
!= INACTIVE
; }
1835 waiting_on_whom
.clear();
1836 if (active_rep_scrub
) {
1837 active_rep_scrub
= OpRequestRef();
1839 received_maps
.clear();
1842 must_deep_scrub
= false;
1843 must_repair
= false;
1845 time_for_deep
= false;
1846 auto_repair
= false;
1847 check_repair
= false;
1848 deep_scrub_on_error
= false;
1850 state
= PG::Scrubber::INACTIVE
;
1851 start
= hobject_t();
1853 max_end
= hobject_t();
1854 subset_last_update
= eversion_t();
1858 omap_stats
= (const struct omap_stat_t
){ 0 };
1861 inconsistent
.clear();
1863 authoritative
.clear();
1864 num_digest_updates_pending
= 0;
1865 primary_scrubmap
= ScrubMap();
1866 primary_scrubmap_pos
.reset();
1867 replica_scrubmap
= ScrubMap();
1868 replica_scrubmap_pos
.reset();
1869 cleaned_meta_map
= ScrubMap();
1872 sleep_start
= utime_t();
1875 void create_results(const hobject_t
& obj
);
1876 void cleanup_store(ObjectStore::Transaction
*t
);
1880 bool scrub_after_recovery
;
1884 bool scrub_can_preempt
= false;
1885 bool scrub_preempted
= false;
1887 // we allow some number of preemptions of the scrub, which mean we do
1888 // not block. then we start to block. once we start blocking, we do
1889 // not stop until the scrub range is completed.
1890 bool write_blocked_by_scrub(const hobject_t
&soid
);
1892 /// true if the given range intersects the scrub interval in any way
1893 bool range_intersects_scrub(const hobject_t
&start
, const hobject_t
& end
);
1896 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
1897 pg_shard_t bad_peer
);
1899 void chunky_scrub(ThreadPool::TPHandle
&handle
);
1900 void scrub_compare_maps();
1902 * return true if any inconsistency/missing is repaired, false otherwise
1904 bool scrub_process_inconsistent();
1905 bool ops_blocked_by_scrub() const;
1906 void scrub_finish();
1907 void scrub_clear_state(bool keep_repair
= false);
1908 void _scan_snaps(ScrubMap
&map
);
1909 void _repair_oinfo_oid(ScrubMap
&map
);
1910 void _scan_rollback_obs(const vector
<ghobject_t
> &rollback_obs
);
1911 void _request_scrub_map(pg_shard_t replica
, eversion_t version
,
1912 hobject_t start
, hobject_t end
, bool deep
,
1913 bool allow_preemption
);
1914 int build_scrub_map_chunk(
1916 ScrubMapBuilder
&pos
,
1917 hobject_t start
, hobject_t end
, bool deep
,
1918 ThreadPool::TPHandle
&handle
);
1920 * returns true if [begin, end) is good to scrub at this time
1921 * a false return value obliges the implementer to requeue scrub when the
1922 * condition preventing scrub clears
1924 virtual bool _range_available_for_scrub(
1925 const hobject_t
&begin
, const hobject_t
&end
) = 0;
1926 virtual void scrub_snapshot_metadata(
1928 const std::map
<hobject_t
,
1929 pair
<boost::optional
<uint32_t>,
1930 boost::optional
<uint32_t>>> &missing_digest
) { }
1931 virtual void _scrub_clear_state() { }
1932 virtual void _scrub_finish() { }
1933 void clear_scrub_reserved();
1934 void scrub_reserve_replicas();
1935 void scrub_unreserve_replicas();
1936 bool scrub_all_replicas_reserved() const;
1940 ThreadPool::TPHandle
&handle
);
1941 void do_replica_scrub_map(OpRequestRef op
);
1943 void handle_scrub_reserve_request(OpRequestRef op
);
1944 void handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
);
1945 void handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
);
1946 void handle_scrub_reserve_release(OpRequestRef op
);
1948 void reject_reservation();
1949 void schedule_backfill_retry(float retry
);
1950 void schedule_recovery_retry(float retry
);
1952 // -- recovery state --
1954 template <class EVT
>
1955 struct QueuePeeringEvt
: Context
{
1959 QueuePeeringEvt(PG
*pg
, epoch_t epoch
, EVT evt
) :
1960 pg(pg
), epoch(epoch
), evt(evt
) {}
1961 void finish(int r
) override
{
1963 pg
->queue_peering_event(PGPeeringEventRef(
1973 struct QueryState
: boost::statechart::event
< QueryState
> {
1975 explicit QueryState(Formatter
*f
) : f(f
) {}
1976 void print(std::ostream
*out
) const {
1982 int pg_stat_adjust(osd_stat_t
*new_stat
);
1985 struct AdvMap
: boost::statechart::event
< AdvMap
> {
1988 vector
<int> newup
, newacting
;
1989 int up_primary
, acting_primary
;
1991 OSDMapRef osdmap
, OSDMapRef lastmap
,
1992 vector
<int>& newup
, int up_primary
,
1993 vector
<int>& newacting
, int acting_primary
):
1994 osdmap(osdmap
), lastmap(lastmap
),
1996 newacting(newacting
),
1997 up_primary(up_primary
),
1998 acting_primary(acting_primary
) {}
1999 void print(std::ostream
*out
) const {
2004 struct ActMap
: boost::statechart::event
< ActMap
> {
2005 ActMap() : boost::statechart::event
< ActMap
>() {}
2006 void print(std::ostream
*out
) const {
2010 struct Activate
: boost::statechart::event
< Activate
> {
2011 epoch_t activation_epoch
;
2012 explicit Activate(epoch_t q
) : boost::statechart::event
< Activate
>(),
2013 activation_epoch(q
) {}
2014 void print(std::ostream
*out
) const {
2015 *out
<< "Activate from " << activation_epoch
;
2019 struct UnfoundBackfill
: boost::statechart::event
<UnfoundBackfill
> {
2020 explicit UnfoundBackfill() {}
2021 void print(std::ostream
*out
) const {
2022 *out
<< "UnfoundBackfill";
2025 struct UnfoundRecovery
: boost::statechart::event
<UnfoundRecovery
> {
2026 explicit UnfoundRecovery() {}
2027 void print(std::ostream
*out
) const {
2028 *out
<< "UnfoundRecovery";
2032 struct RequestScrub
: boost::statechart::event
<RequestScrub
> {
2035 explicit RequestScrub(bool d
, bool r
) : deep(d
), repair(r
) {}
2036 void print(std::ostream
*out
) const {
2037 *out
<< "RequestScrub(" << (deep
? "deep" : "shallow")
2038 << (repair
? " repair" : "");
2043 TrivialEvent(Initialize
)
2044 TrivialEvent(GotInfo
)
2045 TrivialEvent(NeedUpThru
)
2046 TrivialEvent(Backfilled
)
2047 TrivialEvent(LocalBackfillReserved
)
2048 TrivialEvent(RejectTooFullRemoteReservation
)
2050 TrivialEvent(RequestBackfill
)
2052 TrivialEvent(RemoteRecoveryPreempted
)
2053 TrivialEvent(RemoteBackfillPreempted
)
2054 TrivialEvent(BackfillTooFull
)
2055 TrivialEvent(RecoveryTooFull
)
2057 TrivialEvent(MakePrimary
)
2058 TrivialEvent(MakeStray
)
2059 TrivialEvent(NeedActingChange
)
2060 TrivialEvent(IsIncomplete
)
2061 TrivialEvent(IsDown
)
2063 TrivialEvent(AllReplicasRecovered
)
2064 TrivialEvent(DoRecovery
)
2065 TrivialEvent(LocalRecoveryReserved
)
2068 TrivialEvent(AllRemotesReserved
)
2069 TrivialEvent(AllBackfillsReserved
)
2070 TrivialEvent(GoClean
)
2072 TrivialEvent(AllReplicasActivated
)
2074 TrivialEvent(IntervalFlush
)
2077 TrivialEvent(DeleteStart
)
2078 TrivialEvent(DeleteSome
)
2080 TrivialEvent(SetForceRecovery
)
2081 TrivialEvent(UnsetForceRecovery
)
2082 TrivialEvent(SetForceBackfill
)
2083 TrivialEvent(UnsetForceBackfill
)
2086 TrivialEvent(DeleteReserved
)
2087 TrivialEvent(DeleteInterrupted
)
  /* Encapsulates PG recovery process */
  class RecoveryState {
    void start_handle(RecoveryCtx *new_ctx);
    void end_handle();
  public:
    void begin_block_outgoing();
    void end_block_outgoing();
    void clear_blocked_outgoing();
  private:
    struct Initial;
    class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
      RecoveryState *state;
    public:
      PG *pg;

      utime_t event_time;
      uint64_t event_count;

      void clear_event_counters() {
	event_time = utime_t();
	event_count = 0;
      }

      void log_enter(const char *state_name);
      void log_exit(const char *state_name, utime_t duration);

      RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
      /* Accessor functions for state methods */
      ObjectStore::Transaction* get_cur_transaction() {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->transaction);
	return state->rctx->transaction;
      }

      void send_query(pg_shard_t to, const pg_query_t &query) {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->query_map);
	(*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
	  query;
      }

      map<int, map<spg_t, pg_query_t> > *get_query_map() {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->query_map);
	return state->rctx->query_map;
      }

      map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
	ceph_assert(state->rctx);
	ceph_assert(state->rctx->info_map);
	return state->rctx->info_map;
      }

      RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }

      void send_notify(pg_shard_t to,
		       const pg_notify_t &info, const PastIntervals &pi) {
	ceph_assert(state->rctx);
	state->rctx->send_notify(to, info, pi);
      }
    };
    friend class RecoveryMachine;
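    // The accessors above are only valid while a RecoveryCtx is installed,
    // i.e. between start_handle() and end_handle(). Illustrative use from a
    // state's react() method (not a verbatim excerpt):
    //
    //   context< RecoveryMachine >().send_query(
    //     peer, pg_query_t(pg_query_t::INFO, ...));
    //
    // Anything queued this way is flushed when the enclosing handler ends.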
    // WaitRemoteBackfillReserved
    // WaitLocalBackfillReserved
    // WaitRemoteRecoveryReserved
    // WaitLocalRecoveryReserved
    // RepWaitBackfillReserved
    // RepWaitRecoveryReserved
    // WaitDeleteReserved
    struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
      explicit Crashed(my_context ctx);
    };
    struct Reset;

    struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
      explicit Initial(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::transition< Initialize, Reset >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;

      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
      explicit Reset(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::custom_reaction< IntervalFlush >,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Start;

    struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
      explicit Started(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< IntervalFlush >,
	boost::statechart::custom_reaction< NullEvt >,
	boost::statechart::custom_reaction<SetForceRecovery>,
	boost::statechart::custom_reaction<UnsetForceRecovery>,
	boost::statechart::custom_reaction<SetForceBackfill>,
	boost::statechart::custom_reaction<UnsetForceBackfill>,
	boost::statechart::custom_reaction<RequestScrub>,
	boost::statechart::transition< boost::statechart::event_base, Crashed >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const IntervalFlush&);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Primary;
    struct Stray;

    struct Start : boost::statechart::state< Start, Started >, NamedState {
      explicit Start(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::transition< MakePrimary, Primary >,
	boost::statechart::transition< MakeStray, Stray >
	> reactions;
    };

    struct Peering;
    struct WaitActingChange;
    struct Incomplete;
    struct Down;
    struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
      explicit Primary(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::transition< NeedActingChange, WaitActingChange >,
	boost::statechart::custom_reaction<SetForceRecovery>,
	boost::statechart::custom_reaction<UnsetForceRecovery>,
	boost::statechart::custom_reaction<SetForceBackfill>,
	boost::statechart::custom_reaction<UnsetForceBackfill>,
	boost::statechart::custom_reaction<RequestScrub>
	> reactions;
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MNotifyRec&);
      boost::statechart::result react(const SetForceRecovery&);
      boost::statechart::result react(const UnsetForceRecovery&);
      boost::statechart::result react(const SetForceBackfill&);
      boost::statechart::result react(const UnsetForceBackfill&);
      boost::statechart::result react(const RequestScrub&);
    };
    struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary >,
			      NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< MNotifyRec >
	> reactions;
      explicit WaitActingChange(my_context ctx);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MLogRec&);
      boost::statechart::result react(const MInfoRec&);
      boost::statechart::result react(const MNotifyRec&);
    };
    struct GetInfo;
    struct Active;

    struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
      PastIntervals::PriorSet prior_set;
      bool history_les_bound;  //< need osd_find_best_info_ignore_history_les

      explicit Peering(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::transition< Activate, Active >,
	boost::statechart::custom_reaction< AdvMap >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const AdvMap &advmap);
    };
    struct WaitLocalRecoveryReserved;
    struct Activating;

    struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
      explicit Active(my_context ctx);

      const set<pg_shard_t> remote_shards_to_reserve_recovery;
      const set<pg_shard_t> remote_shards_to_reserve_backfill;
      bool all_replicas_activated;

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MTrim >,
	boost::statechart::custom_reaction< Backfilled >,
	boost::statechart::custom_reaction< AllReplicasActivated >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< DeferBackfill >,
	boost::statechart::custom_reaction< UnfoundRecovery >,
	boost::statechart::custom_reaction< UnfoundBackfill >,
	boost::statechart::custom_reaction< RemoteReservationRevokedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevoked >,
	boost::statechart::custom_reaction< DoRecovery >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MNotifyRec& notevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MTrim& trimevt);
      boost::statechart::result react(const Backfilled&) {
	return discard_event();
      }
      boost::statechart::result react(const AllReplicasActivated&);
      boost::statechart::result react(const DeferRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteReservationRevoked&) {
	return discard_event();
      }
      boost::statechart::result react(const DoRecovery&) {
	return discard_event();
      }
    };
    struct Clean : boost::statechart::state< Clean, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::custom_reaction<SetForceRecovery>,
	boost::statechart::custom_reaction<SetForceBackfill>
	> reactions;
      explicit Clean(my_context ctx);
      boost::statechart::result react(const boost::statechart::event_base&) {
	return discard_event();
      }
    };
    struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< GoClean, Clean >,
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::custom_reaction< AllReplicasActivated >
	> reactions;
      explicit Recovered(my_context ctx);
      boost::statechart::result react(const AllReplicasActivated&) {
	post_event(GoClean());
	return forward_event();
      }
    };
    struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< Backfilled >,
	boost::statechart::custom_reaction< DeferBackfill >,
	boost::statechart::custom_reaction< UnfoundBackfill >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevokedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevoked >
	> reactions;
      explicit Backfilling(my_context ctx);
      boost::statechart::result react(const RemoteReservationRejectedTooFull& evt) {
	// for compat with old peers
	post_event(RemoteReservationRevokedTooFull());
	return discard_event();
      }
      void backfill_release_reservations();
      boost::statechart::result react(const Backfilled& evt);
      boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
      boost::statechart::result react(const RemoteReservationRevoked& evt);
      boost::statechart::result react(const DeferBackfill& evt);
      boost::statechart::result react(const UnfoundBackfill& evt);
      void cancel_backfill();
    };
    struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationRevoked >,
	boost::statechart::transition< AllBackfillsReserved, Backfilling >
	> reactions;
      set<pg_shard_t>::const_iterator backfill_osd_it;
      explicit WaitRemoteBackfillReserved(my_context ctx);
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
      boost::statechart::result react(const RemoteReservationRevoked& evt);
    };
    struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
	> reactions;
      explicit WaitLocalBackfillReserved(my_context ctx);
    };
    struct NotBackfilling : boost::statechart::state< NotBackfilling, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >,
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >
	> reactions;
      explicit NotBackfilling(my_context ctx);
      boost::statechart::result react(const RemoteBackfillReserved& evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull& evt);
    };
    struct NotRecovering : boost::statechart::state< NotRecovering, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< UnfoundRecovery >
	> reactions;
      explicit NotRecovering(my_context ctx);
      boost::statechart::result react(const DeferRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
	return discard_event();
      }
    };
    struct ToDelete;
    struct RepNotRecovering;
    struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
      explicit ReplicaActive(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< MQuery >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MTrim >,
	boost::statechart::custom_reaction< Activate >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< DeferBackfill >,
	boost::statechart::custom_reaction< UnfoundRecovery >,
	boost::statechart::custom_reaction< UnfoundBackfill >,
	boost::statechart::custom_reaction< RemoteBackfillPreempted >,
	boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
	boost::statechart::custom_reaction< RecoveryDone >,
	boost::statechart::transition<DeleteStart, ToDelete>
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MTrim& trimevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const MQuery&);
      boost::statechart::result react(const Activate&);
      boost::statechart::result react(const RecoveryDone&) {
	return discard_event();
      }
      boost::statechart::result react(const DeferRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const DeferBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundRecovery& evt) {
	return discard_event();
      }
      boost::statechart::result react(const UnfoundBackfill& evt) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteBackfillPreempted& evt) {
	return discard_event();
      }
      boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
	return discard_event();
      }
    };
    struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< RecoveryDone, RepNotRecovering >,
	// for compat with old peers
	boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
	boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
	boost::statechart::custom_reaction< BackfillTooFull >,
	boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
	boost::statechart::custom_reaction< RemoteBackfillPreempted >
	> reactions;
      explicit RepRecovering(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryPreempted &evt);
      boost::statechart::result react(const BackfillTooFull &evt);
      boost::statechart::result react(const RemoteBackfillPreempted &evt);
    };
    struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationCanceled >
	> reactions;
      explicit RepWaitBackfillReserved(my_context ctx);
      boost::statechart::result react(const RemoteBackfillReserved &evt);
      boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull &evt);
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };
    struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteRecoveryReserved >,
	// for compat with old peers
	boost::statechart::custom_reaction< RemoteReservationRejectedTooFull >,
	boost::statechart::custom_reaction< RemoteReservationCanceled >
	> reactions;
      explicit RepWaitRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
      boost::statechart::result react(const RemoteReservationRejectedTooFull &evt) {
	// for compat with old peers
	post_event(RemoteReservationCanceled());
	return discard_event();
      }
      boost::statechart::result react(const RemoteReservationCanceled &evt);
    };
    struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RequestRecoveryPrio >,
	boost::statechart::custom_reaction< RequestBackfillPrio >,
	boost::statechart::custom_reaction< RejectTooFullRemoteReservation >,
	boost::statechart::transition< RemoteReservationRejectedTooFull, RepNotRecovering >,
	boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
	boost::statechart::custom_reaction< RemoteRecoveryReserved >,
	boost::statechart::custom_reaction< RemoteBackfillReserved >,
	boost::statechart::transition< RecoveryDone, RepNotRecovering >  // for compat with pre-reservation peers
	> reactions;
      explicit RepNotRecovering(my_context ctx);
      boost::statechart::result react(const RequestRecoveryPrio &evt);
      boost::statechart::result react(const RequestBackfillPrio &evt);
      boost::statechart::result react(const RemoteBackfillReserved &evt) {
	// my reservation completion raced with a RELEASE from primary
	return discard_event();
      }
      boost::statechart::result react(const RemoteRecoveryReserved &evt) {
	// my reservation completion raced with a RELEASE from primary
	return discard_event();
      }
      boost::statechart::result react(const RejectTooFullRemoteReservation &evt);
    };
    struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< AllReplicasRecovered >,
	boost::statechart::custom_reaction< DeferRecovery >,
	boost::statechart::custom_reaction< UnfoundRecovery >,
	boost::statechart::custom_reaction< RequestBackfill >
	> reactions;
      explicit Recovering(my_context ctx);
      void release_reservations(bool cancel = false);
      boost::statechart::result react(const AllReplicasRecovered &evt);
      boost::statechart::result react(const DeferRecovery& evt);
      boost::statechart::result react(const UnfoundRecovery& evt);
      boost::statechart::result react(const RequestBackfill &evt);
    };
    struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< RemoteRecoveryReserved >,
	boost::statechart::transition< AllRemotesReserved, Recovering >
	> reactions;
      set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
      explicit WaitRemoteRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RemoteRecoveryReserved &evt);
    };
    struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
	boost::statechart::custom_reaction< RecoveryTooFull >
	> reactions;
      explicit WaitLocalRecoveryReserved(my_context ctx);
      boost::statechart::result react(const RecoveryTooFull &evt);
    };
    struct Activating : boost::statechart::state< Activating, Active >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition< AllReplicasRecovered, Recovered >,
	boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
	boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
	> reactions;
      explicit Activating(my_context ctx);
    };
    struct Stray : boost::statechart::state< Stray, Started >,
		   NamedState {
      explicit Stray(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< MQuery >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< MInfoRec >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< RecoveryDone >,
	boost::statechart::transition<DeleteStart, ToDelete>
	> reactions;
      boost::statechart::result react(const MQuery& query);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const MInfoRec& infoevt);
      boost::statechart::result react(const ActMap&);
      boost::statechart::result react(const RecoveryDone&) {
	return discard_event();
      }
    };
    struct WaitDeleteReserved;
    struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
      unsigned priority = 0;
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< DeleteSome >
	> reactions;
      explicit ToDelete(my_context ctx);
      boost::statechart::result react(const ActMap &evt);
      boost::statechart::result react(const DeleteSome &evt) {
	// happens if we drop out of Deleting due to reprioritization etc.
	return discard_event();
      }
    };
    struct Deleting;
    struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
							 ToDelete>, NamedState {
      typedef boost::mpl::list <
	boost::statechart::transition<DeleteReserved, Deleting>
	> reactions;
      explicit WaitDeleteReserved(my_context ctx);
    };
    struct Deleting : boost::statechart::state<Deleting,
					       ToDelete>, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< DeleteSome >,
	boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
	> reactions;
      explicit Deleting(my_context ctx);
      boost::statechart::result react(const DeleteSome &evt);
    };
    struct GetLog;

    struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
      set<pg_shard_t> peer_info_requested;

      explicit GetInfo(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::transition< GotInfo, GetLog >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::transition< IsDown, Down >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
    };
    struct GotLog : boost::statechart::event< GotLog > {
      GotLog() : boost::statechart::event< GotLog >() {}
    };
    struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
      pg_shard_t auth_log_shard;
      boost::intrusive_ptr<MOSDPGLog> msg;

      explicit GetLog(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::custom_reaction< GotLog >,
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::transition< IsIncomplete, Incomplete >
	> reactions;
      boost::statechart::result react(const AdvMap&);
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
      boost::statechart::result react(const GotLog&);
    };
    struct WaitUpThru;

    struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
      set<pg_shard_t> peer_missing_requested;

      explicit GetMissing(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< MLogRec >,
	boost::statechart::transition< NeedUpThru, WaitUpThru >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MLogRec& logevt);
    };
    struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
      explicit WaitUpThru(my_context ctx);

      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< ActMap >,
	boost::statechart::custom_reaction< MLogRec >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const ActMap& am);
      boost::statechart::result react(const MLogRec& logrec);
    };
    struct Down : boost::statechart::state< Down, Peering >, NamedState {
      explicit Down(my_context ctx);
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< QueryState >,
	boost::statechart::custom_reaction< MNotifyRec >
	> reactions;
      boost::statechart::result react(const QueryState& q);
      boost::statechart::result react(const MNotifyRec& infoevt);
    };
    struct Incomplete : boost::statechart::state< Incomplete, Peering >, NamedState {
      typedef boost::mpl::list <
	boost::statechart::custom_reaction< AdvMap >,
	boost::statechart::custom_reaction< MNotifyRec >,
	boost::statechart::custom_reaction< QueryState >
	> reactions;
      explicit Incomplete(my_context ctx);
      boost::statechart::result react(const AdvMap &advmap);
      boost::statechart::result react(const MNotifyRec& infoevt);
      boost::statechart::result react(const QueryState& q);
    };
    RecoveryMachine machine;
    PG *pg;

    /// context passed in by state machine caller
    RecoveryCtx *orig_ctx;

    /// populated if we are buffering messages pending a flush
    boost::optional<BufferedRecoveryMessages> messages_pending_flush;

    /**
     * populated between start_handle() and end_handle(), points into
     * the message lists for messages_pending_flush while blocking messages
     * or into orig_ctx otherwise
     */
    boost::optional<RecoveryCtx> rctx;
  public:
    explicit RecoveryState(PG *pg)
      : machine(this, pg), pg(pg), orig_ctx(0) {
      machine.initiate();
    }

    void handle_event(const boost::statechart::event_base &evt,
		      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt);
      end_handle();
    }

    void handle_event(PGPeeringEventRef evt,
		      RecoveryCtx *rctx) {
      start_handle(rctx);
      machine.process_event(evt->get_event());
      end_handle();
    }
  } recovery_state;
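  // Minimal sketch of the event flow (illustrative, not a verbatim excerpt):
  // the OSD wraps a peering message or timer tick in an event, builds a
  // RecoveryCtx to collect the resulting transaction, queries and notifies,
  // and hands both to the machine:
  //
  //   PG::RecoveryCtx rctx = create_context();    // assumed OSD-side helper
  //   pg->recovery_state.handle_event(evt, &rctx);
  //   dispatch_context(rctx, pg, curmap);         // assumed OSD-side helper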
  uint64_t peer_features;
  uint64_t acting_features;
  uint64_t upacting_features;

  /// most recently consumed osdmap's require_osd_version
  unsigned last_require_osd_release = 0;
  bool delete_needs_sleep = false;
  void reset_min_peer_features() {
    peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
  }
  uint64_t get_min_peer_features() const { return peer_features; }
  void apply_peer_features(uint64_t f) { peer_features &= f; }

  uint64_t get_min_acting_features() const { return acting_features; }
  uint64_t get_min_upacting_features() const { return upacting_features; }

  bool perform_deletes_during_peering() const {
    return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
  }

  bool hard_limit_pglog() const {
    return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
  }
  void init_primary_up_acting(
    const vector<int> &newup,
    const vector<int> &newacting,
    int new_up_primary,
    int new_acting_primary) {
    actingset.clear();
    acting = newacting;
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] != CRUSH_ITEM_NONE)
	actingset.insert(
	  pg_shard_t(
	    acting[i],
	    pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    upset.clear();
    up = newup;
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] != CRUSH_ITEM_NONE)
	upset.insert(
	  pg_shard_t(
	    up[i],
	    pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
    }
    if (!pool.info.is_erasure()) {
      up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
      primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
      return;
    }
    up_primary = pg_shard_t();
    primary = pg_shard_t();
    for (uint8_t i = 0; i < up.size(); ++i) {
      if (up[i] == new_up_primary) {
	up_primary = pg_shard_t(up[i], shard_id_t(i));
	break;
      }
    }
    for (uint8_t i = 0; i < acting.size(); ++i) {
      if (acting[i] == new_acting_primary) {
	primary = pg_shard_t(acting[i], shard_id_t(i));
	break;
      }
    }
    ceph_assert(up_primary.osd == new_up_primary);
    ceph_assert(primary.osd == new_acting_primary);
  }
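  // Worked example (illustrative): for an erasure-coded pool with
  // acting = [3, CRUSH_ITEM_NONE, 7] and new_acting_primary = 3, the loops
  // above yield actingset = { 3(0), 7(2) } -- the shard id records each
  // OSD's position in the acting vector -- and primary = 3(0). For a
  // replicated pool every shard is NO_SHARD and primary is simply osd.3.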
  void set_role(int r) {
    role = r;
  }

  bool state_test(uint64_t m) const { return (state & m) != 0; }
  void state_set(uint64_t m) { state |= m; }
  void state_clear(uint64_t m) { state &= ~m; }
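  // The pg state word is a bitmask of PG_STATE_* flags, so several states
  // can be asserted at once; e.g. (illustrative):
  //
  //   state_set(PG_STATE_ACTIVE | PG_STATE_DEGRADED);
  //   state_test(PG_STATE_DEGRADED);   // -> true
  //   state_clear(PG_STATE_DEGRADED);  // the active bit survives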
  bool is_complete() const { return info.last_complete == info.last_update; }
  bool should_send_notify() const { return send_notify; }

  uint64_t get_state() const { return state; }
  bool is_active() const { return state_test(PG_STATE_ACTIVE); }
  bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
  bool is_peering() const { return state_test(PG_STATE_PEERING); }
  bool is_down() const { return state_test(PG_STATE_DOWN); }
  bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
  bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
  bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
  bool is_clean() const { return state_test(PG_STATE_CLEAN); }
  bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
  bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
  bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
  bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
  bool is_peered() const {
    return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
  }
  bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
  bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
  bool is_repair() const { return state_test(PG_STATE_REPAIR); }

  bool is_empty() const { return info.last_update == eversion_t(0,0); }

  void do_pending_flush();
  static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
  static void _init(ObjectStore::Transaction& t,
		    spg_t pgid, const pg_pool_t *pool);

  void prepare_write_info(map<string,bufferlist> *km);

  void update_store_with_options();
  static int _prepare_write_info(
    CephContext* cct,
    map<string,bufferlist> *km,
    epoch_t epoch,
    pg_info_t &info,
    pg_info_t &last_written_info,
    PastIntervals &past_intervals,
    bool dirty_big_info,
    bool dirty_epoch,
    bool try_fast_info,
    PerfCounters *logger = nullptr);
  void write_if_dirty(RecoveryCtx *rctx) {
    write_if_dirty(*rctx->transaction);
  }
  void write_if_dirty(ObjectStore::Transaction& t);

  PGLog::IndexedLog projected_log;
  bool check_in_progress_op(
    const osd_reqid_t &r,
    eversion_t *version,
    version_t *user_version,
    int *return_code) const;
  eversion_t projected_last_update;
  eversion_t get_next_version() const {
    eversion_t at_version(
      get_osdmap_epoch(),
      projected_last_update.version+1);
    ceph_assert(at_version > info.last_update);
    ceph_assert(at_version > pg_log.get_head());
    ceph_assert(at_version > projected_last_update);
    return at_version;
  }
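  // eversion_t orders by (epoch, version), so the asserts above enforce
  // strict monotonicity; e.g. (illustrative) with projected_last_update =
  // (e=40, v=11), the next projected version is (e=40, v=12) while the map
  // epoch is still 40, or (e=41, v=12) after an epoch bump.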
  void add_log_entry(const pg_log_entry_t& e, bool applied);
  void append_log(
    const vector<pg_log_entry_t>& logv,
    eversion_t trim_to,
    eversion_t roll_forward_to,
    ObjectStore::Transaction &t,
    bool transaction_applied = true,
    bool async = false);
  bool check_log_for_corruption(ObjectStore *store);

  std::string get_corrupt_pg_log_name() const;

  void update_snap_map(
    const vector<pg_log_entry_t> &log_entries,
    ObjectStore::Transaction& t);

  void filter_snapc(vector<snapid_t> &snaps);

  void log_weirdness();

  virtual void kick_snap_trim() = 0;
  virtual void snap_trimmer_scrub_complete() = 0;
  bool requeue_scrub(bool high_priority = false);
  void queue_recovery();
  unsigned get_scrub_priority();

  /// share pg info after a pg is active
  void share_pg_info();
  bool append_log_entries_update_missing(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);

  /**
   * Merge entries updating missing as necessary on all
   * acting_recovery_backfill logs and missings (also missing_loc)
   */
  void merge_new_log_entries(
    const mempool::osd_pglog::list<pg_log_entry_t> &entries,
    ObjectStore::Transaction &t,
    boost::optional<eversion_t> trim_to,
    boost::optional<eversion_t> roll_forward_to);
  void reset_interval_flush();
  void start_peering_interval(
    const OSDMapRef lastmap,
    const vector<int>& newup, int up_primary,
    const vector<int>& newacting, int acting_primary,
    ObjectStore::Transaction *t);
  void on_new_interval();
  virtual void _on_new_interval() = 0;
  void start_flush(ObjectStore::Transaction *t);
  void set_last_peering_reset();

  void update_history(const pg_history_t& history);
  void fulfill_info(pg_shard_t from, const pg_query_t &query,
		    pair<pg_shard_t, pg_info_t> &notify_info);
  void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
  void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
  void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
  bool should_restart_peering(
    int newupprimary,
    int newactingprimary,
    const vector<int>& newup,
    const vector<int>& newacting,
    OSDMapRef lastmap,
    OSDMapRef osdmap);
  // OpRequest queueing
  bool can_discard_op(OpRequestRef& op);
  bool can_discard_scan(OpRequestRef op);
  bool can_discard_backfill(OpRequestRef op);
  bool can_discard_request(OpRequestRef& op);

  template<typename T, int MSGTYPE>
  bool can_discard_replica_op(OpRequestRef& op);

  bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
  bool old_peering_evt(PGPeeringEventRef evt) {
    return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
  }
  static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
    return e <= cur_epoch;
  }
  bool have_same_or_newer_map(epoch_t e) {
    return e <= get_osdmap_epoch();
  }

  bool op_has_sufficient_caps(OpRequestRef& op);

  void take_waiters();

  friend class FlushState;
  virtual void on_role_change() = 0;
  virtual void on_pool_change() = 0;
  virtual void on_change(ObjectStore::Transaction *t) = 0;
  virtual void on_activate() = 0;
  virtual void on_flushed() = 0;
  virtual void check_blacklisted_watchers() = 0;

  friend ostream& operator<<(ostream& out, const PG& pg);
};

ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);