1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #ifndef CEPH_PG_H
16 #define CEPH_PG_H
17
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include <boost/container/flat_set.hpp>
28 #include "include/mempool.h"
29
30 // re-include our assert to clobber boost's
31 #include "include/ceph_assert.h"
32
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
38 #include "Session.h"
39 #include "common/Timer.h"
40
41 #include "PGLog.h"
42 #include "OSDMap.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
46 #include "PGPeeringEvent.h"
47
48 #include "mgr/OSDPerfMetricTypes.h"
49
50 #include <atomic>
51 #include <list>
52 #include <memory>
53 #include <stack>
54 #include <string>
55 #include <tuple>
56
57 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
58 //#define PG_DEBUG_REFS // track provenance of pg refs, helpful for finding leaks
59
60 class OSD;
61 class OSDService;
62 class OSDShard;
63 class OSDShardPGSlot;
64 class MOSDOp;
65 class MOSDPGScan;
66 class MOSDPGBackfill;
67 class MOSDPGInfo;
68
69 class PG;
70 struct OpRequest;
71 typedef OpRequest::Ref OpRequestRef;
72 class MOSDPGLog;
73 class CephContext;
74 class DynamicPerfStats;
75
76 namespace Scrub {
77 class Store;
78 }
79
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
82
83 struct PGStateInstance {
84 // Time spent in pg states
85
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
88 }
89
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
92 }
93
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
99 }
100
101 epoch_t this_epoch;
102 utime_t enter_time;
103 std::vector<state_history_entry> state_history;
104 std::stack<embedded_state> embedded_states;
105 };
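  /*
   * Illustrative sketch (not part of the interface; state names and
   * timestamps are hypothetical): enter_state()/exit_state() use the
   * embedded_states stack so that nested states unwind into
   * state_history entries in exit order.
   *
   *   PGStateInstance psi;
   *   psi.setepoch(42);
   *   psi.enter_state(t0, "Started");           // stack: [Started]
   *   psi.enter_state(t1, "Started/Primary");   // stack: [Started, Started/Primary]
   *   psi.exit_state(t2);   // records {t1, t2, "Started/Primary"}
   *   psi.exit_state(t3);   // records {t0, t3, "Started"}
   */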
106
107 class PGStateHistory {
108 // Member access protected with the PG lock
109 public:
110 PGStateHistory() : buffer(10) {}
111
112 void enter(PG* pg, const utime_t entime, const char* state);
113
114 void exit(const char* state);
115
116 void reset() {
117 pi = nullptr;
118 }
119
120 void set_pg_in_destructor() { pg_in_destructor = true; }
121
122 void dump(Formatter* f) const;
123
124 string get_current_state() {
125 if (pi == nullptr) return "unknown";
126 return std::get<1>(pi->embedded_states.top());
127 }
128
129 private:
130 bool pg_in_destructor = false;
131 PG* thispg = nullptr;
132 std::unique_ptr<PGStateInstance> tmppi;
133 PGStateInstance* pi = nullptr;
134 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
135
136 };
137
138 #ifdef PG_DEBUG_REFS
139 #include "common/tracked_int_ptr.hpp"
140 uint64_t get_with_id(PG *pg);
141 void put_with_id(PG *pg, uint64_t id);
142 typedef TrackedIntPtr<PG> PGRef;
143 #else
144 typedef boost::intrusive_ptr<PG> PGRef;
145 #endif
146
147 class PGRecoveryStats {
148 struct per_state_info {
149 uint64_t enter, exit; // enter/exit counts
150 uint64_t events;
151 utime_t event_time; // time spent processing events
152 utime_t total_time; // total time in state
153 utime_t min_time, max_time;
154
155 // cppcheck-suppress unreachableCode
156 per_state_info() : enter(0), exit(0), events(0) {}
157 };
158 map<const char *,per_state_info> info;
159 Mutex lock;
160
161 public:
 162   PGRecoveryStats() : lock("PGRecoveryStats::lock") {}
163
164 void reset() {
165 std::lock_guard l(lock);
166 info.clear();
167 }
168 void dump(ostream& out) {
169 std::lock_guard l(lock);
170 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
171 per_state_info& i = p->second;
172 out << i.enter << "\t" << i.exit << "\t"
173 << i.events << "\t" << i.event_time << "\t"
174 << i.total_time << "\t"
175 << i.min_time << "\t" << i.max_time << "\t"
176 << p->first << "\n";
177 }
178 }
179
180 void dump_formatted(Formatter *f) {
181 std::lock_guard l(lock);
182 f->open_array_section("pg_recovery_stats");
183 for (map<const char *,per_state_info>::iterator p = info.begin();
184 p != info.end(); ++p) {
185 per_state_info& i = p->second;
186 f->open_object_section("recovery_state");
187 f->dump_int("enter", i.enter);
188 f->dump_int("exit", i.exit);
189 f->dump_int("events", i.events);
190 f->dump_stream("event_time") << i.event_time;
191 f->dump_stream("total_time") << i.total_time;
192 f->dump_stream("min_time") << i.min_time;
193 f->dump_stream("max_time") << i.max_time;
194 vector<string> states;
195 get_str_vec(p->first, "/", states);
196 f->open_array_section("nested_states");
197 for (vector<string>::iterator st = states.begin();
198 st != states.end(); ++st) {
199 f->dump_string("state", *st);
200 }
201 f->close_section();
202 f->close_section();
203 }
204 f->close_section();
205 }
206
207 void log_enter(const char *s) {
208 std::lock_guard l(lock);
209 info[s].enter++;
210 }
211 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
212 std::lock_guard l(lock);
213 per_state_info &i = info[s];
214 i.exit++;
215 i.total_time += dur;
216 if (dur > i.max_time)
217 i.max_time = dur;
218 if (dur < i.min_time || i.min_time == utime_t())
219 i.min_time = dur;
220 i.events += events;
221 i.event_time += event_dur;
222 }
223 };
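/*
 * Minimal usage sketch (hypothetical caller; the state name and the
 * timings are invented): the peering machinery is expected to call
 * log_enter() when a state is entered and log_exit() with the elapsed
 * time and event counts when it is left.
 *
 *   PGRecoveryStats stats;
 *   stats.log_enter("Started/Primary/Active");
 *   // ... state runs, handling 3 events ...
 *   stats.log_exit("Started/Primary/Active",
 *                  utime_t(2, 0),          // 2s total in the state
 *                  3,                      // events processed
 *                  utime_t(0, 500000000)); // 0.5s spent in event handling
 *   stats.dump(std::cout);                 // one tab-separated row per state
 */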
224
225 struct PGPool {
226 CephContext* cct;
227 epoch_t cached_epoch;
228 int64_t id;
229 string name;
230
231 pg_pool_t info;
232 SnapContext snapc; // the default pool snapc, ready to go.
233
234 // these two sets are for < mimic only
235 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
236 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
237
238 PGPool(CephContext* cct, OSDMapRef map, int64_t i, const pg_pool_t& info,
239 const string& name)
240 : cct(cct),
241 cached_epoch(map->get_epoch()),
242 id(i),
243 name(name),
244 info(info) {
245 snapc = info.get_snap_context();
246 if (map->require_osd_release < CEPH_RELEASE_MIMIC) {
247 info.build_removed_snaps(cached_removed_snaps);
248 }
249 }
250
251 void update(CephContext *cct, OSDMapRef map);
252 };
253
254 /** PG - Replica Placement Group
255 *
256 */
257
258 class PG : public DoutPrefixProvider {
259 public:
260 // -- members --
261 const spg_t pg_id;
262 const coll_t coll;
263
264 ObjectStore::CollectionHandle ch;
265
266 struct RecoveryCtx;
267
268 // -- methods --
269 std::ostream& gen_prefix(std::ostream& out) const override;
270 CephContext *get_cct() const override {
271 return cct;
272 }
273 unsigned get_subsys() const override {
274 return ceph_subsys_osd;
275 }
276
277 const OSDMapRef& get_osdmap() const {
278 ceph_assert(is_locked());
279 ceph_assert(osdmap_ref);
280 return osdmap_ref;
281 }
282 epoch_t get_osdmap_epoch() const {
283 return osdmap_ref->get_epoch();
284 }
285
286 void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
287 handle.suspend_tp_timeout();
288 lock();
289 handle.reset_tp_timeout();
290 }
291 void lock(bool no_lockdep = false) const;
292 void unlock() const {
293 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
294 ceph_assert(!dirty_info);
295 ceph_assert(!dirty_big_info);
296 _lock.Unlock();
297 }
298 bool is_locked() const {
299 return _lock.is_locked();
300 }
301
302 const spg_t& get_pgid() const {
303 return pg_id;
304 }
305
306 const PGPool& get_pool() const {
307 return pool;
308 }
309 uint64_t get_last_user_version() const {
310 return info.last_user_version;
311 }
312 const pg_history_t& get_history() const {
313 return info.history;
314 }
315 bool get_need_up_thru() const {
316 return need_up_thru;
317 }
318 epoch_t get_same_interval_since() const {
319 return info.history.same_interval_since;
320 }
321
322 void set_last_scrub_stamp(utime_t t) {
323 info.stats.last_scrub_stamp = t;
324 info.history.last_scrub_stamp = t;
325 }
326
327 void set_last_deep_scrub_stamp(utime_t t) {
328 info.stats.last_deep_scrub_stamp = t;
329 info.history.last_deep_scrub_stamp = t;
330 }
331
332 bool is_deleting() const {
333 return deleting;
334 }
335 bool is_deleted() const {
336 return deleted;
337 }
338 bool is_replica() const {
339 return role > 0;
340 }
341 bool is_primary() const {
342 return pg_whoami == primary;
343 }
344 bool pg_has_reset_since(epoch_t e) {
345 ceph_assert(is_locked());
346 return deleted || e < get_last_peering_reset();
347 }
348
349 bool is_ec_pg() const {
350 return pool.info.is_erasure();
351 }
352 int get_role() const {
353 return role;
354 }
355 const vector<int> get_acting() const {
356 return acting;
357 }
358 int get_acting_primary() const {
359 return primary.osd;
360 }
361 pg_shard_t get_primary() const {
362 return primary;
363 }
364 const vector<int> get_up() const {
365 return up;
366 }
367 int get_up_primary() const {
368 return up_primary.osd;
369 }
370 const PastIntervals& get_past_intervals() const {
371 return past_intervals;
372 }
373
374 /// initialize created PG
375 void init(
376 int role,
377 const vector<int>& up,
378 int up_primary,
379 const vector<int>& acting,
380 int acting_primary,
381 const pg_history_t& history,
382 const PastIntervals& pim,
383 bool backfill,
384 ObjectStore::Transaction *t);
385
386 /// read existing pg state off disk
387 void read_state(ObjectStore *store);
388 static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
389
390 static int get_latest_struct_v() {
391 return latest_struct_v;
392 }
393 static int get_compat_struct_v() {
394 return compat_struct_v;
395 }
396 static int read_info(
397 ObjectStore *store, spg_t pgid, const coll_t &coll,
398 pg_info_t &info, PastIntervals &past_intervals,
399 __u8 &);
400 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
401
402 void rm_backoff(BackoffRef b);
403
404 void update_snap_mapper_bits(uint32_t bits) {
405 snap_mapper.update_bits(bits);
406 }
407 void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
408 virtual void split_colls(
409 spg_t child,
410 int split_bits,
411 int seed,
412 const pg_pool_t *pool,
413 ObjectStore::Transaction *t) = 0;
414 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
415 void merge_from(map<spg_t,PGRef>& sources, RecoveryCtx *rctx,
416 unsigned split_bits,
417 const pg_merge_meta_t& last_pg_merge_meta);
418 void finish_split_stats(const object_stat_sum_t& stats, ObjectStore::Transaction *t);
419
420 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
421
422 bool is_scrub_registered();
423 void reg_next_scrub();
424 void unreg_next_scrub();
425
426 void on_info_history_change();
427
428 void scrub_requested(bool deep, bool repair, bool need_auto = false);
429
430 bool is_forced_recovery_or_backfill() const {
431 return get_state() & (PG_STATE_FORCED_RECOVERY | PG_STATE_FORCED_BACKFILL);
432 }
433 bool set_force_recovery(bool b);
434 bool set_force_backfill(bool b);
435
436 void queue_peering_event(PGPeeringEventRef evt);
437 void do_peering_event(PGPeeringEventRef evt, RecoveryCtx *rcx);
438 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
439 void queue_flushed(epoch_t started_at);
440 void handle_advance_map(
441 OSDMapRef osdmap, OSDMapRef lastmap,
442 vector<int>& newup, int up_primary,
443 vector<int>& newacting, int acting_primary,
444 RecoveryCtx *rctx);
445 void handle_activate_map(RecoveryCtx *rctx);
446 void handle_initialize(RecoveryCtx *rctx);
447 void handle_query_state(Formatter *f);
448
449 /**
450 * @param ops_begun returns how many recovery ops the function started
451 * @returns true if any useful work was accomplished; false otherwise
452 */
453 virtual bool start_recovery_ops(
454 uint64_t max,
455 ThreadPool::TPHandle &handle,
456 uint64_t *ops_begun) = 0;
457
458 // more work after the above, but with a RecoveryCtx
459 void find_unfound(epoch_t queued, RecoveryCtx *rctx);
460
461 virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
462
463 void dump_pgstate_history(Formatter *f);
464 void dump_missing(Formatter *f);
465
466 void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
467 void with_heartbeat_peers(std::function<void(int)> f);
468
469 void shutdown();
470 virtual void on_shutdown() = 0;
471
472 bool get_must_scrub() const {
473 return scrubber.must_scrub;
474 }
475 bool sched_scrub();
476
477 virtual void do_request(
478 OpRequestRef& op,
479 ThreadPool::TPHandle &handle
480 ) = 0;
481 virtual void clear_cache() = 0;
482 virtual int get_cache_obj_count() = 0;
483
484 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
485 virtual int do_command(
486 cmdmap_t cmdmap,
487 ostream& ss,
488 bufferlist& idata,
489 bufferlist& odata,
490 ConnectionRef conn,
491 ceph_tid_t tid) = 0;
492
493 virtual bool agent_work(int max) = 0;
494 virtual bool agent_work(int max, int agent_flush_quota) = 0;
495 virtual void agent_stop() = 0;
496 virtual void agent_delay() = 0;
497 virtual void agent_clear() = 0;
498 virtual void agent_choose_mode_restart() = 0;
499
500 virtual void on_removal(ObjectStore::Transaction *t) = 0;
501
502 void _delete_some(ObjectStore::Transaction *t);
503
504 virtual void set_dynamic_perf_stats_queries(
505 const std::list<OSDPerfMetricQuery> &queries) {
506 }
507 virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
508 }
509
510 // reference counting
511 #ifdef PG_DEBUG_REFS
512 uint64_t get_with_id();
513 void put_with_id(uint64_t);
514 void dump_live_ids();
515 #endif
516 void get(const char* tag);
517 void put(const char* tag);
518 int get_num_ref() {
519 return ref;
520 }
521
522 // ctor
523 PG(OSDService *o, OSDMapRef curmap,
524 const PGPool &pool, spg_t p);
525 ~PG() override;
526
527 // prevent copying
528 explicit PG(const PG& rhs) = delete;
529 PG& operator=(const PG& rhs) = delete;
530
531 protected:
532 // -------------
533 // protected
534 OSDService *osd;
535 public:
536 OSDShard *osd_shard = nullptr;
537 OSDShardPGSlot *pg_slot = nullptr;
538 protected:
539 CephContext *cct;
540
541 // osdmap
542 OSDMapRef osdmap_ref;
543
544 PGPool pool;
545
546 // locking and reference counting.
547 // I destroy myself when the reference count hits zero.
548 // lock() should be called before doing anything.
549 // get() should be called on pointer copy (to another thread, etc.).
550 // put() should be called on destruction of some previously copied pointer.
551 // unlock() when done with the current pointer (_most common_).
552 mutable Mutex _lock = {"PG::_lock"};
553
554 std::atomic<unsigned int> ref{0};
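  /*
   * Illustrative sketch of the protocol described above (the worker
   * function and queueing helper are hypothetical): a PGRef copy takes
   * an implicit get(), and the holder must lock before touching state.
   *
   *   void queue_for_worker(PGRef pg);                  // copy => get("intptr")
   *
   *   void do_work(PGRef pg, ThreadPool::TPHandle &handle) {
   *     pg->lock_suspend_timeout(handle);  // takes PG::_lock
   *     // ... mutate state; dirty_info/dirty_big_info must be persisted
   *     //     (or cleared) before unlock() ...
   *     pg->unlock();
   *   }  // PGRef destroyed => put("intptr"); PG may delete itself at ref == 0
   */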
555
556 #ifdef PG_DEBUG_REFS
557 Mutex _ref_id_lock = {"PG::_ref_id_lock"};
558 map<uint64_t, string> _live_ids;
559 map<string, uint64_t> _tag_counts;
560 uint64_t _ref_id = 0;
561
562 friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
563 friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
564 #endif
565
566 private:
567 friend void intrusive_ptr_add_ref(PG *pg) {
568 pg->get("intptr");
569 }
570 friend void intrusive_ptr_release(PG *pg) {
571 pg->put("intptr");
572 }
573
574
575 // =====================
576
577 protected:
578 OSDriver osdriver;
579 SnapMapper snap_mapper;
580 bool eio_errors_to_process = false;
581
582 virtual PGBackend *get_pgbackend() = 0;
583 virtual const PGBackend* get_pgbackend() const = 0;
584
585 protected:
586 /*** PG ****/
587 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
588 IsPGRecoverablePredicate *get_is_recoverable_predicate() const {
589 return get_pgbackend()->get_is_recoverable_predicate();
590 }
591 protected:
592 epoch_t last_persisted_osdmap;
593
594 void requeue_map_waiters();
595
596 void update_osdmap_ref(OSDMapRef newmap) {
597 ceph_assert(_lock.is_locked_by_me());
598 osdmap_ref = std::move(newmap);
599 }
600
601 protected:
602
603
 604   bool deleting;  // true while the PG is being removed or the OSD is shutting down
605 atomic<bool> deleted = {false};
606
607 ZTracer::Endpoint trace_endpoint;
608
609
610 protected:
611 bool dirty_info, dirty_big_info;
612
613 protected:
614 // pg state
615 pg_info_t info; ///< current pg info
616 pg_info_t last_written_info; ///< last written info
617 __u8 info_struct_v = 0;
618 static const __u8 latest_struct_v = 10;
619 // v10 is the new past_intervals encoding
620 // v9 was fastinfo_key addition
621 // v8 was the move to a per-pg pgmeta object
622 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
623 // (first appeared in cuttlefish).
624 static const __u8 compat_struct_v = 10;
625 void upgrade(ObjectStore *store);
626
627 protected:
628 PGLog pg_log;
629 ghobject_t pgmeta_oid;
630
631 // ------------------
632 // MissingLoc
633
634 class MissingLoc {
635 public:
636 // a loc_count indicates how many locations we know in each of
637 // these distinct sets
638 struct loc_count_t {
 639       int up = 0;        ///< number of known locations on "up" OSDs
 640       int other = 0;     ///< number of known locations on other OSDs
641
642 friend bool operator<(const loc_count_t& l,
643 const loc_count_t& r) {
644 return (l.up < r.up ||
645 (l.up == r.up &&
646 (l.other < r.other)));
647 }
648 friend ostream& operator<<(ostream& out, const loc_count_t& l) {
649 ceph_assert(l.up >= 0);
650 ceph_assert(l.other >= 0);
651 return out << "(" << l.up << "+" << l.other << ")";
652 }
653 };
654
655
656 private:
657
658 loc_count_t _get_count(const set<pg_shard_t>& shards) {
659 loc_count_t r;
660 for (auto s : shards) {
661 if (pg->upset.count(s)) {
662 r.up++;
663 } else {
664 r.other++;
665 }
666 }
667 return r;
668 }
669
670 map<hobject_t, pg_missing_item> needs_recovery_map;
671 map<hobject_t, set<pg_shard_t> > missing_loc;
672 set<pg_shard_t> missing_loc_sources;
673
674 // for every entry in missing_loc, we count how many of each type of shard we have,
675 // and maintain totals here. The sum of the values for this map will always equal
676 // missing_loc.size().
677 map < shard_id_t, map<loc_count_t,int> > missing_by_count;
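    /*
     * Worked example of the invariant above (hypothetical objects and
     * shards): with two entries in missing_loc for a replicated pool,
     * one known only on an "up" OSD and one known on an "up" and an
     * "other" OSD, we would hold
     *
     *   missing_by_count[shard_id_t::NO_SHARD] == { (1+0) -> 1, (1+1) -> 1 }
     *
     * and the values (1 + 1) sum to missing_loc.size() == 2.
     */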
678
679 void pgs_by_shard_id(const set<pg_shard_t>& s, map< shard_id_t, set<pg_shard_t> >& pgsbs) {
680 if (pg->get_osdmap()->pg_is_ec(pg->info.pgid.pgid)) {
681 int num_shards = pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid);
682 // For completely missing shards initialize with empty set<pg_shard_t>
683 for (int i = 0 ; i < num_shards ; ++i) {
684 shard_id_t shard(i);
685 pgsbs[shard];
686 }
687 for (auto pgs: s)
688 pgsbs[pgs.shard].insert(pgs);
689 } else {
690 pgsbs[shard_id_t::NO_SHARD] = s;
691 }
692 }
693
694 void _inc_count(const set<pg_shard_t>& s) {
695 map< shard_id_t, set<pg_shard_t> > pgsbs;
696 pgs_by_shard_id(s, pgsbs);
697 for (auto shard: pgsbs)
698 ++missing_by_count[shard.first][_get_count(shard.second)];
699 }
700 void _dec_count(const set<pg_shard_t>& s) {
701 map< shard_id_t, set<pg_shard_t> > pgsbs;
702 pgs_by_shard_id(s, pgsbs);
703 for (auto shard: pgsbs) {
704 auto p = missing_by_count[shard.first].find(_get_count(shard.second));
705 ceph_assert(p != missing_by_count[shard.first].end());
706 if (--p->second == 0) {
707 missing_by_count[shard.first].erase(p);
708 }
709 }
710 }
711
712 PG *pg;
713 set<pg_shard_t> empty_set;
714 public:
715 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
716 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
717 explicit MissingLoc(PG *pg)
718 : pg(pg) { }
719 void set_backend_predicates(
720 IsPGReadablePredicate *_is_readable,
721 IsPGRecoverablePredicate *_is_recoverable) {
722 is_readable.reset(_is_readable);
723 is_recoverable.reset(_is_recoverable);
724 }
725 std::ostream& gen_prefix(std::ostream& out) const {
726 return pg->gen_prefix(out);
727 }
728 bool needs_recovery(
729 const hobject_t &hoid,
730 eversion_t *v = 0) const {
731 map<hobject_t, pg_missing_item>::const_iterator i =
732 needs_recovery_map.find(hoid);
733 if (i == needs_recovery_map.end())
734 return false;
735 if (v)
736 *v = i->second.need;
737 return true;
738 }
739 bool is_deleted(const hobject_t &hoid) const {
740 auto i = needs_recovery_map.find(hoid);
741 if (i == needs_recovery_map.end())
742 return false;
743 return i->second.is_delete();
744 }
745 bool is_unfound(const hobject_t &hoid) const {
746 auto it = needs_recovery_map.find(hoid);
747 if (it == needs_recovery_map.end()) {
748 return false;
749 }
750 if (it->second.is_delete()) {
751 return false;
752 }
753 auto mit = missing_loc.find(hoid);
754 return mit == missing_loc.end() || !(*is_recoverable)(mit->second);
755 }
756 bool readable_with_acting(
757 const hobject_t &hoid,
758 const set<pg_shard_t> &acting) const;
759 uint64_t num_unfound() const {
760 uint64_t ret = 0;
761 for (map<hobject_t, pg_missing_item>::const_iterator i =
762 needs_recovery_map.begin();
763 i != needs_recovery_map.end();
764 ++i) {
765 if (i->second.is_delete())
766 continue;
767 auto mi = missing_loc.find(i->first);
768 if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
769 ++ret;
770 }
771 return ret;
772 }
773
774 bool have_unfound() const {
775 for (map<hobject_t, pg_missing_item>::const_iterator i =
776 needs_recovery_map.begin();
777 i != needs_recovery_map.end();
778 ++i) {
779 if (i->second.is_delete())
780 continue;
781 auto mi = missing_loc.find(i->first);
782 if (mi == missing_loc.end() || !(*is_recoverable)(mi->second))
783 return true;
784 }
785 return false;
786 }
787 void clear() {
788 needs_recovery_map.clear();
789 missing_loc.clear();
790 missing_loc_sources.clear();
791 missing_by_count.clear();
792 }
793
794 void add_location(const hobject_t &hoid, pg_shard_t location) {
795 auto p = missing_loc.find(hoid);
796 if (p == missing_loc.end()) {
797 p = missing_loc.emplace(hoid, set<pg_shard_t>()).first;
798 } else {
799 _dec_count(p->second);
800 }
801 p->second.insert(location);
802 _inc_count(p->second);
803 }
804 void remove_location(const hobject_t &hoid, pg_shard_t location) {
805 auto p = missing_loc.find(hoid);
806 if (p != missing_loc.end()) {
807 _dec_count(p->second);
808 p->second.erase(location);
809 if (p->second.empty()) {
810 missing_loc.erase(p);
811 } else {
812 _inc_count(p->second);
813 }
814 }
815 }
816
817 void clear_location(const hobject_t &hoid) {
818 auto p = missing_loc.find(hoid);
819 if (p != missing_loc.end()) {
820 _dec_count(p->second);
821 missing_loc.erase(p);
822 }
823 }
824
825 void add_active_missing(const pg_missing_t &missing) {
826 for (map<hobject_t, pg_missing_item>::const_iterator i =
827 missing.get_items().begin();
828 i != missing.get_items().end();
829 ++i) {
830 map<hobject_t, pg_missing_item>::const_iterator j =
831 needs_recovery_map.find(i->first);
832 if (j == needs_recovery_map.end()) {
833 needs_recovery_map.insert(*i);
834 } else {
835 lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
836 << i->first << " have " << j->second
837 << " tried to add " << i->second << dendl;
838 ceph_assert(i->second.need == j->second.need);
839 }
840 }
841 }
842
843 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have, bool is_delete=false) {
844 needs_recovery_map[hoid] = pg_missing_item(need, have, is_delete);
845 }
846 void revise_need(const hobject_t &hoid, eversion_t need) {
847 auto it = needs_recovery_map.find(hoid);
848 ceph_assert(it != needs_recovery_map.end());
849 it->second.need = need;
850 }
851
852 /// Adds info about a possible recovery source
853 bool add_source_info(
854 pg_shard_t source, ///< [in] source
855 const pg_info_t &oinfo, ///< [in] info
856 const pg_missing_t &omissing, ///< [in] (optional) missing
857 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
858 ); ///< @return whether a new object location was discovered
859
860 /// Adds recovery sources in batch
861 void add_batch_sources_info(
 862       const set<pg_shard_t> &sources,  ///< [in] a set of sources which can be used for all objects
863 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
864 );
865
 866     /// Uses osdmap to update structures for now-down sources
867 void check_recovery_sources(const OSDMapRef& osdmap);
868
869 /// Call when hoid is no longer missing in acting set
870 void recovered(const hobject_t &hoid) {
871 needs_recovery_map.erase(hoid);
872 auto p = missing_loc.find(hoid);
873 if (p != missing_loc.end()) {
874 _dec_count(p->second);
875 missing_loc.erase(p);
876 }
877 }
878
879 /// Call to update structures for hoid after a change
880 void rebuild(
881 const hobject_t &hoid,
882 pg_shard_t self,
883 const set<pg_shard_t> to_recover,
884 const pg_info_t &info,
885 const pg_missing_t &missing,
886 const map<pg_shard_t, pg_missing_t> &pmissing,
887 const map<pg_shard_t, pg_info_t> &pinfo) {
888 recovered(hoid);
889 boost::optional<pg_missing_item> item;
890 auto miter = missing.get_items().find(hoid);
891 if (miter != missing.get_items().end()) {
892 item = miter->second;
893 } else {
894 for (auto &&i: to_recover) {
895 if (i == self)
896 continue;
897 auto pmiter = pmissing.find(i);
898 ceph_assert(pmiter != pmissing.end());
899 miter = pmiter->second.get_items().find(hoid);
900 if (miter != pmiter->second.get_items().end()) {
901 item = miter->second;
902 break;
903 }
904 }
905 }
906 if (!item)
907 return; // recovered!
908
909 needs_recovery_map[hoid] = *item;
910 if (item->is_delete())
911 return;
912 auto mliter =
913 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
914 ceph_assert(info.last_backfill.is_max());
915 ceph_assert(info.last_update >= item->need);
916 if (!missing.is_missing(hoid))
917 mliter->second.insert(self);
918 for (auto &&i: pmissing) {
919 if (i.first == self)
920 continue;
921 auto pinfoiter = pinfo.find(i.first);
922 ceph_assert(pinfoiter != pinfo.end());
923 if (item->need <= pinfoiter->second.last_update &&
924 hoid <= pinfoiter->second.last_backfill &&
925 !i.second.is_missing(hoid))
926 mliter->second.insert(i.first);
927 }
928 _inc_count(mliter->second);
929 }
930
931 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
932 auto it = missing_loc.find(hoid);
933 return it == missing_loc.end() ? empty_set : it->second;
934 }
935 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
936 return missing_loc;
937 }
938 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
939 return needs_recovery_map;
940 }
941 const map < shard_id_t, map<loc_count_t,int> > &get_missing_by_count() const {
942 return missing_by_count;
943 }
944 } missing_loc;
945
946 PastIntervals past_intervals;
947
948 interval_set<snapid_t> snap_trimq;
949
950 /* You should not use these items without taking their respective queue locks
951 * (if they have one) */
952 xlist<PG*>::item stat_queue_item;
953 bool scrub_queued;
954 bool recovery_queued;
955
956 int recovery_ops_active;
957 set<pg_shard_t> waiting_on_backfill;
958 #ifdef DEBUG_RECOVERY_OIDS
959 multiset<hobject_t> recovering_oids;
960 #endif
961
962 protected:
 963   int role;    // 0 = primary, 1 = replica, -1 = none.
964 uint64_t state; // PG_STATE_*
965
966 bool send_notify; ///< true if we are non-primary and should notify the primary
967
968 protected:
969 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
970 eversion_t last_complete_ondisk; // last_complete that has committed.
971 eversion_t last_update_applied;
972
973 // entries <= last_rollback_info_trimmed_to_applied have been trimmed
974 eversion_t last_rollback_info_trimmed_to_applied;
975
976 // primary state
977 protected:
978 pg_shard_t primary;
979 pg_shard_t pg_whoami;
980 pg_shard_t up_primary;
981 vector<int> up, acting, want_acting;
982 // acting_recovery_backfill contains shards that are acting,
983 // async recovery targets, or backfill targets.
984 set<pg_shard_t> acting_recovery_backfill, actingset, upset;
985 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
986 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
987 eversion_t pg_trim_to;
988
989 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
990
991 protected:
992 // [primary only] content recovery state
993 struct BufferedRecoveryMessages {
994 map<int, map<spg_t, pg_query_t> > query_map;
995 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
996 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
997 };
998
999 public:
1000 bool dne() { return info.dne(); }
1001 struct RecoveryCtx {
1002 utime_t start_time;
1003 map<int, map<spg_t, pg_query_t> > *query_map;
1004 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
1005 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
1006 ObjectStore::Transaction *transaction;
1007 ThreadPool::TPHandle* handle;
1008 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
1009 map<int,
1010 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
1011 map<int,
1012 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
1013 ObjectStore::Transaction *transaction)
1014 : query_map(query_map), info_map(info_map),
1015 notify_list(notify_list),
1016 transaction(transaction),
1017 handle(NULL) {}
1018
1019 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
1020 : query_map(&(buf.query_map)),
1021 info_map(&(buf.info_map)),
1022 notify_list(&(buf.notify_list)),
1023 transaction(rctx.transaction),
1024 handle(rctx.handle) {}
1025
1026 void accept_buffered_messages(BufferedRecoveryMessages &m) {
1027 ceph_assert(query_map);
1028 ceph_assert(info_map);
1029 ceph_assert(notify_list);
1030 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
1031 i != m.query_map.end();
1032 ++i) {
1033 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
1034 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
1035 j != i->second.end();
1036 ++j) {
1037 omap[j->first] = j->second;
1038 }
1039 }
1040 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
1041 = m.info_map.begin();
1042 i != m.info_map.end();
1043 ++i) {
1044 vector<pair<pg_notify_t, PastIntervals> > &ovec =
1045 (*info_map)[i->first];
1046 ovec.reserve(ovec.size() + i->second.size());
1047 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
1048 }
1049 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
1050 = m.notify_list.begin();
1051 i != m.notify_list.end();
1052 ++i) {
1053 vector<pair<pg_notify_t, PastIntervals> > &ovec =
1054 (*notify_list)[i->first];
1055 ovec.reserve(ovec.size() + i->second.size());
1056 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
1057 }
1058 }
1059
1060 void send_notify(pg_shard_t to,
1061 const pg_notify_t &info, const PastIntervals &pi) {
1062 ceph_assert(notify_list);
1063 (*notify_list)[to.osd].push_back(make_pair(info, pi));
1064 }
1065 };
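  /*
   * Illustrative sketch of the buffering constructor (the caller-side
   * locals query_map/info_map/notify_list/t are hypothetical): work
   * that must not emit messages immediately can run against a buffered
   * context and the accumulated messages are folded back afterwards.
   *
   *   RecoveryCtx rctx(&query_map, &info_map, &notify_list, &t);
   *   BufferedRecoveryMessages buf;
   *   RecoveryCtx buffered(buf, rctx);     // same transaction and handle,
   *                                        // messages collect in 'buf'
   *   // ... handle peering events with 'buffered' ...
   *   rctx.accept_buffered_messages(buf);  // merge queries/infos/notifies back
   */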
1066 protected:
1067
1068 PGStateHistory pgstate_history;
1069
1070 struct NamedState {
1071 const char *state_name;
1072 utime_t enter_time;
1073 PG* pg;
1074 const char *get_state_name() { return state_name; }
1075 NamedState(PG *pg_, const char *state_name_)
1076 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
1077 pg->pgstate_history.enter(pg, enter_time, state_name);
1078 }
1079 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
1080 };
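  /*
   * Sketch of NamedState as an RAII guard (the derived state below is
   * hypothetical): concrete peering states derive from NamedState so
   * that construction records entry into the state and destruction
   * records the exit in pgstate_history.
   *
   *   struct Activating : NamedState {
   *     explicit Activating(PG *pg)
   *       : NamedState(pg, "Started/Primary/Active/Activating") {}
   *     // ~Activating() -> ~NamedState() -> pgstate_history.exit(...)
   *   };
   */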
1081
1082
1083
1084 protected:
1085
1086 /*
1087 * peer_info -- projected (updates _before_ replicas ack)
1088 * peer_missing -- committed (updates _after_ replicas ack)
1089 */
1090
1091 bool need_up_thru;
1092 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
1093 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
1094 map<pg_shard_t, int64_t> peer_bytes; // Peer's num_bytes from peer_info
1095 set<pg_shard_t> peer_purged; // peers purged
1096 map<pg_shard_t, pg_missing_t> peer_missing;
1097 set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
1098 set<pg_shard_t> peer_missing_requested;
1099
1100 // i deleted these strays; ignore racing PGInfo from them
1101 set<pg_shard_t> peer_activated;
1102
1103 // primary-only, recovery-only state
1104 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
1105 // which are unfound on the primary
1106 epoch_t last_peering_reset;
1107
1108 epoch_t get_last_peering_reset() const {
1109 return last_peering_reset;
1110 }
1111
1112 /* heartbeat peers */
1113 void set_probe_targets(const set<pg_shard_t> &probe_set);
1114 void clear_probe_targets();
1115
1116 Mutex heartbeat_peer_lock;
1117 set<int> heartbeat_peers;
1118 set<int> probe_targets;
1119
1120 public:
1121 /**
1122 * BackfillInterval
1123 *
1124 * Represents the objects in a range [begin, end)
1125 *
1126 * Possible states:
1127    * 1) begin == end == hobject_t() indicates that the interval is unpopulated
1128 * 2) Else, objects contains all objects in [begin, end)
1129 */
1130 struct BackfillInterval {
1131 // info about a backfill interval on a peer
1132 eversion_t version; /// version at which the scan occurred
1133 map<hobject_t,eversion_t> objects;
1134 hobject_t begin;
1135 hobject_t end;
1136
1137 /// clear content
1138 void clear() {
1139 *this = BackfillInterval();
1140 }
1141
1142 /// clear objects list only
1143 void clear_objects() {
1144 objects.clear();
1145 }
1146
1147 /// reinstantiate with a new start+end position and sort order
1148 void reset(hobject_t start) {
1149 clear();
1150 begin = end = start;
1151 }
1152
1153 /// true if there are no objects in this interval
1154 bool empty() const {
1155 return objects.empty();
1156 }
1157
1158 /// true if interval extends to the end of the range
1159 bool extends_to_end() const {
1160 return end.is_max();
1161 }
1162
1163 /// removes items <= soid and adjusts begin to the first object
1164 void trim_to(const hobject_t &soid) {
1165 trim();
1166 while (!objects.empty() &&
1167 objects.begin()->first <= soid) {
1168 pop_front();
1169 }
1170 }
1171
1172 /// Adjusts begin to the first object
1173 void trim() {
1174 if (!objects.empty())
1175 begin = objects.begin()->first;
1176 else
1177 begin = end;
1178 }
1179
1180 /// drop first entry, and adjust @begin accordingly
1181 void pop_front() {
1182 ceph_assert(!objects.empty());
1183 objects.erase(objects.begin());
1184 trim();
1185 }
1186
1187 /// dump
1188 void dump(Formatter *f) const {
1189 f->dump_stream("begin") << begin;
1190 f->dump_stream("end") << end;
1191 f->open_array_section("objects");
1192 for (map<hobject_t, eversion_t>::const_iterator i =
1193 objects.begin();
1194 i != objects.end();
1195 ++i) {
1196 f->open_object_section("object");
1197 f->dump_stream("object") << i->first;
1198 f->dump_stream("version") << i->second;
1199 f->close_section();
1200 }
1201 f->close_section();
1202 }
1203 };
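  /*
   * Worked example (hypothetical objects a < b < c): after a scan the
   * interval might hold objects = {a, b, c} with begin == a.
   *
   *   BackfillInterval bi;   // begin == end == hobject_t(), i.e. unpopulated
   *   // ... scan fills bi.objects, bi.begin, bi.end ...
   *   bi.trim_to(b);         // drops a and b (entries <= b); begin becomes c
   *   bool done = bi.empty() && bi.extends_to_end();
   */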
1204
1205 protected:
1206 BackfillInterval backfill_info;
1207 map<pg_shard_t, BackfillInterval> peer_backfill_info;
1208 bool backfill_reserved;
1209 bool backfill_reserving;
1210
1211 set<pg_shard_t> backfill_targets, async_recovery_targets;
1212
1213 // The primary's num_bytes and local num_bytes for this pg, only valid
1214 // during backfill for non-primary shards.
1215 // Both of these are adjusted for EC to reflect the on-disk bytes
1216 std::atomic<int64_t> primary_num_bytes = 0;
1217 std::atomic<int64_t> local_num_bytes = 0;
1218
1219 public:
1220 bool is_backfill_targets(pg_shard_t osd) {
1221 return backfill_targets.count(osd);
1222 }
1223
1224 // Space reserved for backfill is primary_num_bytes - local_num_bytes
1225 // Don't care that difference itself isn't atomic
1226 uint64_t get_reserved_num_bytes() {
1227 int64_t primary = primary_num_bytes.load();
1228 int64_t local = local_num_bytes.load();
1229 if (primary > local)
1230 return primary - local;
1231 else
1232 return 0;
1233 }
1234
1235 bool is_remote_backfilling() {
1236 return primary_num_bytes.load() > 0;
1237 }
1238
1239 void set_reserved_num_bytes(int64_t primary, int64_t local);
1240 void clear_reserved_num_bytes();
1241
1242   // If num_bytes is inconsistent and local_num_bytes would go negative,
1243   // that's ok, because the value would then be ignored.
1244
1245 // The value of num_bytes could be negative,
1246 // but we don't let local_num_bytes go negative.
1247 void add_local_num_bytes(int64_t num_bytes) {
1248 if (num_bytes) {
1249 int64_t prev_bytes = local_num_bytes.load();
1250 int64_t new_bytes;
1251 do {
1252 new_bytes = prev_bytes + num_bytes;
1253 if (new_bytes < 0)
1254 new_bytes = 0;
1255 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
1256 }
1257 }
1258 void sub_local_num_bytes(int64_t num_bytes) {
1259 ceph_assert(num_bytes >= 0);
1260 if (num_bytes) {
1261 int64_t prev_bytes = local_num_bytes.load();
1262 int64_t new_bytes;
1263 do {
1264 new_bytes = prev_bytes - num_bytes;
1265 if (new_bytes < 0)
1266 new_bytes = 0;
1267 } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
1268 }
1269 }
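  // Illustrative note on the compare_exchange_weak loops above: both
  // updates are lock-free and clamp at zero instead of letting the
  // counter go negative, e.g. (values hypothetical):
  //
  //   add_local_num_bytes(4096);   // local_num_bytes: 0 -> 4096
  //   sub_local_num_bytes(8192);   // clamped: 4096 -> 0, not -4096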
1270 // The value of num_bytes could be negative,
1271 // but we don't let info.stats.stats.sum.num_bytes go negative.
1272 void add_num_bytes(int64_t num_bytes) {
1273 ceph_assert(_lock.is_locked_by_me());
1274 if (num_bytes) {
1275 info.stats.stats.sum.num_bytes += num_bytes;
1276 if (info.stats.stats.sum.num_bytes < 0) {
1277 info.stats.stats.sum.num_bytes = 0;
1278 }
1279 }
1280 }
1281 void sub_num_bytes(int64_t num_bytes) {
1282 ceph_assert(_lock.is_locked_by_me());
1283 ceph_assert(num_bytes >= 0);
1284 if (num_bytes) {
1285 info.stats.stats.sum.num_bytes -= num_bytes;
1286 if (info.stats.stats.sum.num_bytes < 0) {
1287 info.stats.stats.sum.num_bytes = 0;
1288 }
1289 }
1290 }
1291
1292 // Only used in testing so not worried about needing the PG lock here
1293 int64_t get_stats_num_bytes() {
1294 Mutex::Locker l(_lock);
1295     int64_t num_bytes = info.stats.stats.sum.num_bytes;
1296 if (pool.info.is_erasure()) {
1297 num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
1298 // Round up each object by a stripe
1299 num_bytes += get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
1300 }
1301 int64_t lnb = local_num_bytes.load();
1302 if (lnb && lnb != num_bytes) {
1303 lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
1304 << lnb << " vs stats "
1305 << info.stats.stats.sum.num_bytes << " / chunk "
1306 << get_pgbackend()->get_ec_data_chunk_count()
1307 << dendl;
1308 }
1309 return num_bytes;
1310 }
1311
1312 protected:
1313
1314 /*
1315 * blocked request wait hierarchy
1316 *
1317 * In order to preserve request ordering we need to be careful about the
1318 * order in which blocked requests get requeued. Generally speaking, we
1319 * push the requests back up to the op_wq in reverse order (most recent
1320 * request first) so that they come back out again in the original order.
1321 * However, because there are multiple wait queues, we need to requeue
1322 * waitlists in order. Generally speaking, we requeue the wait lists
1323 * that are checked first.
1324 *
1325 * Here are the various wait lists, in the order they are used during
1326 * request processing, with notes:
1327 *
1328 * - waiting_for_map
1329 * - may start or stop blocking at any time (depending on client epoch)
1330 * - waiting_for_peered
1331 * - !is_peered() or flushes_in_progress
1332 * - only starts blocking on interval change; never restarts
1333 * - waiting_for_active
1334 * - !is_active()
1335 * - only starts blocking on interval change; never restarts
1336 * - waiting_for_flush
1337 * - is_active() and flushes_in_progress
1338 * - waiting for final flush during activate
1339 * - waiting_for_scrub
1340 * - starts and stops blocking for varying intervals during scrub
1341 * - waiting_for_unreadable_object
1342 * - never restarts once object is readable (* except for EIO?)
1343 * - waiting_for_degraded_object
1344 * - never restarts once object is writeable (* except for EIO?)
1345 * - waiting_for_blocked_object
1346 * - starts and stops based on proxied op activity
1347 * - obc rwlocks
1348 * - starts and stops based on read/write activity
1349 *
1350 * Notes:
1351 *
1352  * 1. During an interval change, we requeue *everything* in the above order.
1353 *
1354 * 2. When an obc rwlock is released, we check for a scrub block and requeue
1355 * the op there if it applies. We ignore the unreadable/degraded/blocked
1356 * queues because we assume they cannot apply at that time (this is
1357 * probably mostly true).
1358 *
1359 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
1360 * it is non-empty.
1361 *
1362 * These three behaviors are generally sufficient to maintain ordering, with
1363 * the possible exception of cases where we make an object degraded or
1364 * unreadable that was previously okay, e.g. when scrub or op processing
1365 * encounter an unexpected error. FIXME.
1366 */
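  /*
   * Sketch of the interval-change requeue implied by note 1 above (the
   * helper calls follow the wait lists in the documented order; this is
   * an illustration, not the literal implementation):
   *
   *   requeue_ops(waiting_for_peered);
   *   requeue_ops(waiting_for_active);
   *   requeue_ops(waiting_for_flush);
   *   requeue_ops(waiting_for_scrub);
   *   requeue_object_waiters(waiting_for_unreadable_object);
   *   requeue_object_waiters(waiting_for_degraded_object);
   *   requeue_object_waiters(waiting_for_blocked_object);
   *   // requeue_ops pushes each list back to the op_wq most-recent-first,
   *   // so the requests pop out again in their original order.
   */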
1367
1368 // pg waiters
1369 unsigned flushes_in_progress;
1370
1371 // ops with newer maps than our (or blocked behind them)
1372 // track these by client, since inter-request ordering doesn't otherwise
1373 // matter.
1374 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
1375
1376 // ops waiting on peered
1377 list<OpRequestRef> waiting_for_peered;
1378
1379 // ops waiting on active (require peered as well)
1380 list<OpRequestRef> waiting_for_active;
1381 list<OpRequestRef> waiting_for_flush;
1382 list<OpRequestRef> waiting_for_scrub;
1383
1384 list<OpRequestRef> waiting_for_cache_not_full;
1385 list<OpRequestRef> waiting_for_clean_to_primary_repair;
1386 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
1387 waiting_for_degraded_object,
1388 waiting_for_blocked_object;
1389
1390 set<hobject_t> objects_blocked_on_cache_full;
1391 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
1392 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
1393
1394 // Callbacks should assume pg (and nothing else) is locked
1395 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
1396
1397 map<eversion_t,
1398 list<tuple<OpRequestRef, version_t, int> > > waiting_for_ondisk;
1399
1400 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
1401 void requeue_op(OpRequestRef op);
1402 void requeue_ops(list<OpRequestRef> &l);
1403
1404 // stats that persist lazily
1405 object_stat_collection_t unstable_stats;
1406
1407 // publish stats
1408 Mutex pg_stats_publish_lock;
1409 bool pg_stats_publish_valid;
1410 pg_stat_t pg_stats_publish;
1411
1412 void _update_calc_stats();
1413 void _update_blocked_by();
1414 friend class TestOpsSocketHook;
1415 void publish_stats_to_osd();
1416 void clear_publish_stats();
1417
1418 void clear_primary_state();
1419
1420 bool is_acting_recovery_backfill(pg_shard_t osd) const {
1421 return acting_recovery_backfill.count(osd);
1422 }
1423 bool is_acting(pg_shard_t osd) const {
1424 return has_shard(pool.info.is_erasure(), acting, osd);
1425 }
1426 bool is_up(pg_shard_t osd) const {
1427 return has_shard(pool.info.is_erasure(), up, osd);
1428 }
1429 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
1430 if (ec) {
1431 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
1432 } else {
1433 return std::find(v.begin(), v.end(), osd.osd) != v.end();
1434 }
1435 }
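  /*
   * Worked example for has_shard() (hypothetical acting set): for an EC
   * pool membership is positional, for a replicated pool it is a plain
   * linear search.
   *
   *   vector<int> acting = {5, 2, 7};
   *   has_shard(true,  acting, pg_shard_t(2, shard_id_t(1)));        // true: acting[1] == 2
   *   has_shard(true,  acting, pg_shard_t(2, shard_id_t(0)));        // false: acting[0] == 5
   *   has_shard(false, acting, pg_shard_t(2, shard_id_t::NO_SHARD)); // true: 2 is in the vector
   */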
1436
1437 bool needs_recovery() const;
1438 bool needs_backfill() const;
1439
1440 /// clip calculated priority to reasonable range
1441 int clamp_recovery_priority(int prio, int pool_recovery_prio, int max);
1442 /// get log recovery reservation priority
1443 unsigned get_recovery_priority();
1444 /// get backfill reservation priority
1445 unsigned get_backfill_priority();
1446 /// get priority for pg deletion
1447 unsigned get_delete_priority();
1448
1449 void try_mark_clean(); ///< mark an active pg clean
1450
1451 /// return [start,end) bounds for required past_intervals
1452 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
1453 const pg_info_t &info,
1454 epoch_t oldest_map) {
1455 epoch_t start = std::max(
1456 info.history.last_epoch_clean ? info.history.last_epoch_clean :
1457 info.history.epoch_pool_created,
1458 oldest_map);
1459 epoch_t end = std::max(
1460 info.history.same_interval_since,
1461 info.history.epoch_pool_created);
1462 return make_pair(start, end);
1463 }
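  /*
   * Worked example (hypothetical epochs): with last_epoch_clean = 880,
   * epoch_pool_created = 500, same_interval_since = 920 and
   * oldest_map = 900:
   *
   *   start = max(880, 900) = 900
   *   end   = max(920, 500) = 920
   *
   * so past_intervals must cover [900, 920).
   */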
1464 void check_past_interval_bounds() const;
1465 PastIntervals::PriorSet build_prior();
1466
1467 void remove_down_peer_info(const OSDMapRef osdmap);
1468
1469 bool adjust_need_up_thru(const OSDMapRef osdmap);
1470
1471 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1472 virtual void dump_recovery_info(Formatter *f) const = 0;
1473
1474 void calc_min_last_complete_ondisk() {
1475 eversion_t min = last_complete_ondisk;
1476 ceph_assert(!acting_recovery_backfill.empty());
1477 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
1478 i != acting_recovery_backfill.end();
1479 ++i) {
1480 if (*i == get_primary()) continue;
1481 if (peer_last_complete_ondisk.count(*i) == 0)
1482 return; // we don't have complete info
1483 eversion_t a = peer_last_complete_ondisk[*i];
1484 if (a < min)
1485 min = a;
1486 }
1487 if (min == min_last_complete_ondisk)
1488 return;
1489 min_last_complete_ondisk = min;
1490 return;
1491 }
1492
1493 virtual void calc_trim_to() = 0;
1494
1495 virtual void calc_trim_to_aggressive() = 0;
1496
1497 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1498 pg_missing_t& omissing, pg_shard_t from);
1499 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1500 pg_missing_t& omissing, pg_shard_t from);
1501 bool proc_replica_info(
1502 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
1503
1504 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
1505 PG *pg;
1506 ObjectStore::Transaction *t;
1507 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1508
1509 // LogEntryHandler
1510 void remove(const hobject_t &hoid) override {
1511 pg->get_pgbackend()->remove(hoid, t);
1512 }
1513 void try_stash(const hobject_t &hoid, version_t v) override {
1514 pg->get_pgbackend()->try_stash(hoid, v, t);
1515 }
1516 void rollback(const pg_log_entry_t &entry) override {
1517 ceph_assert(entry.can_rollback());
1518 pg->get_pgbackend()->rollback(entry, t);
1519 }
1520 void rollforward(const pg_log_entry_t &entry) override {
1521 pg->get_pgbackend()->rollforward(entry, t);
1522 }
1523 void trim(const pg_log_entry_t &entry) override {
1524 pg->get_pgbackend()->trim(entry, t);
1525 }
1526 };
1527
1528 void update_object_snap_mapping(
1529 ObjectStore::Transaction *t, const hobject_t &soid,
1530 const set<snapid_t> &snaps);
1531 void clear_object_snap_mapping(
1532 ObjectStore::Transaction *t, const hobject_t &soid);
1533 void remove_snap_mapped_object(
1534 ObjectStore::Transaction& t, const hobject_t& soid);
1535 void merge_log(
1536 ObjectStore::Transaction& t, pg_info_t &oinfo,
1537 pg_log_t &olog, pg_shard_t from);
1538 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1539 bool search_for_missing(
1540 const pg_info_t &oinfo, const pg_missing_t &omissing,
1541 pg_shard_t fromosd,
1542 RecoveryCtx*);
1543
1544 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1545
1546 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1547 const map<pg_shard_t, pg_info_t> &infos,
1548 bool restrict_to_up_acting,
1549 bool *history_les_bound) const;
1550 static void calc_ec_acting(
1551 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1552 unsigned size,
1553 const vector<int> &acting,
1554 const vector<int> &up,
1555 const map<pg_shard_t, pg_info_t> &all_info,
1556 bool restrict_to_up_acting,
1557 vector<int> *want,
1558 set<pg_shard_t> *backfill,
1559 set<pg_shard_t> *acting_backfill,
1560 ostream &ss);
1561 static void calc_replicated_acting(
1562 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1563 uint64_t force_auth_primary_missing_objects,
1564 unsigned size,
1565 const vector<int> &acting,
1566 const vector<int> &up,
1567 pg_shard_t up_primary,
1568 const map<pg_shard_t, pg_info_t> &all_info,
1569 bool restrict_to_up_acting,
1570 vector<int> *want,
1571 set<pg_shard_t> *backfill,
1572 set<pg_shard_t> *acting_backfill,
1573 const OSDMapRef osdmap,
1574 ostream &ss);
1575 void choose_async_recovery_ec(const map<pg_shard_t, pg_info_t> &all_info,
1576 const pg_info_t &auth_info,
1577 vector<int> *want,
1578 set<pg_shard_t> *async_recovery,
1579 const OSDMapRef osdmap) const;
1580 void choose_async_recovery_replicated(const map<pg_shard_t, pg_info_t> &all_info,
1581 const pg_info_t &auth_info,
1582 vector<int> *want,
1583 set<pg_shard_t> *async_recovery,
1584 const OSDMapRef osdmap) const;
1585
1586 bool recoverable_and_ge_min_size(const vector<int> &want) const;
1587 bool choose_acting(pg_shard_t &auth_log_shard,
1588 bool restrict_to_up_acting,
1589 bool *history_les_bound);
1590 void build_might_have_unfound();
1591 void activate(
1592 ObjectStore::Transaction& t,
1593 epoch_t activation_epoch,
1594 map<int, map<spg_t,pg_query_t> >& query_map,
1595 map<int,
1596 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1597 RecoveryCtx *ctx);
1598
1599 struct C_PG_ActivateCommitted : public Context {
1600 PGRef pg;
1601 epoch_t epoch;
1602 epoch_t activation_epoch;
1603 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1604 : pg(p), epoch(e), activation_epoch(ae) {}
1605 void finish(int r) override {
1606 pg->_activate_committed(epoch, activation_epoch);
1607 }
1608 };
1609 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1610 void all_activated_and_committed();
1611
1612 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1613
1614 bool have_unfound() const {
1615 return missing_loc.have_unfound();
1616 }
1617 uint64_t get_num_unfound() const {
1618 return missing_loc.num_unfound();
1619 }
1620 bool all_missing_unfound() const {
1621 const auto& missing = pg_log.get_missing();
1622 if (!missing.have_missing())
1623 return false;
1624 for (auto& m : missing.get_items()) {
1625 if (!missing_loc.is_unfound(m.first))
1626 return false;
1627 }
1628 return true;
1629 }
1630
1631 virtual void check_local() = 0;
1632
1633 void purge_strays();
1634
1635 void update_heartbeat_peers();
1636
1637 Context *finish_sync_event;
1638
1639 Context *finish_recovery();
1640 void _finish_recovery(Context *c);
1641 struct C_PG_FinishRecovery : public Context {
1642 PGRef pg;
1643 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1644 void finish(int r) override {
1645 pg->_finish_recovery(this);
1646 }
1647 };
1648 void cancel_recovery();
1649 void clear_recovery_state();
1650 virtual void _clear_recovery_state() = 0;
1651 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1652 void start_recovery_op(const hobject_t& soid);
1653 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1654
1655 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1656
1657 friend class C_OSD_RepModify_Commit;
1658 friend class C_DeleteMore;
1659
1660 // -- backoff --
1661 Mutex backoff_lock; // orders inside Backoff::lock
1662 map<hobject_t,set<BackoffRef>> backoffs;
1663
1664 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1665 void release_backoffs(const hobject_t& begin, const hobject_t& end);
1666 void release_backoffs(const hobject_t& o) {
1667 release_backoffs(o, o);
1668 }
1669 void clear_backoffs();
1670
1671 void add_pg_backoff(SessionRef s) {
1672 hobject_t begin = info.pgid.pgid.get_hobj_start();
1673 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1674 add_backoff(s, begin, end);
1675 }
1676 void release_pg_backoffs() {
1677 hobject_t begin = info.pgid.pgid.get_hobj_start();
1678 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1679 release_backoffs(begin, end);
1680 }
1681
1682 // -- scrub --
1683 public:
1684 struct Scrubber {
1685 Scrubber();
1686 ~Scrubber();
1687
1688 // metadata
1689 set<pg_shard_t> reserved_peers;
1690 bool reserved, reserve_failed;
1691 epoch_t epoch_start;
1692
1693 // common to both scrubs
1694 bool active;
1695 set<pg_shard_t> waiting_on_whom;
1696 int shallow_errors;
1697 int deep_errors;
1698 int fixed;
1699 ScrubMap primary_scrubmap;
1700 ScrubMapBuilder primary_scrubmap_pos;
1701 epoch_t replica_scrub_start = 0;
1702 ScrubMap replica_scrubmap;
1703 ScrubMapBuilder replica_scrubmap_pos;
1704 map<pg_shard_t, ScrubMap> received_maps;
1705 OpRequestRef active_rep_scrub;
1706 utime_t scrub_reg_stamp; // stamp we registered for
1707
1708 static utime_t scrub_must_stamp() { return utime_t(0,1); }
1709
1710 omap_stat_t omap_stats = (const struct omap_stat_t){ 0 };
1711
1712 // For async sleep
1713 bool sleeping = false;
1714 bool needs_sleep = true;
1715 utime_t sleep_start;
1716
1717 // flags to indicate explicitly requested scrubs (by admin)
1718 bool must_scrub, must_deep_scrub, must_repair, need_auto;
1719
1720 // Priority to use for scrub scheduling
1721 unsigned priority = 0;
1722
1723 bool time_for_deep;
1724 // this flag indicates whether we would like to do auto-repair of the PG or not
1725 bool auto_repair;
1726 // this flag indicates that we are scrubbing post repair to verify everything is fixed
1727 bool check_repair;
1728 // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1729 // we should deep scrub in order to auto repair
1730 bool deep_scrub_on_error;
1731
1732 // Maps from objects with errors to missing/inconsistent peers
1733 map<hobject_t, set<pg_shard_t>> missing;
1734 map<hobject_t, set<pg_shard_t>> inconsistent;
1735
1736 // Map from object with errors to good peers
1737 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1738
1739 // Cleaned map pending snap metadata scrub
1740 ScrubMap cleaned_meta_map;
1741
1742 void clean_meta_map(ScrubMap &for_meta_scrub) {
1743 if (end.is_max() ||
1744 cleaned_meta_map.objects.empty()) {
1745 cleaned_meta_map.swap(for_meta_scrub);
1746 } else {
1747 auto iter = cleaned_meta_map.objects.end();
1748 --iter; // not empty, see if clause
1749 auto begin = cleaned_meta_map.objects.begin();
1750 if (iter->first.has_snapset()) {
1751 ++iter;
1752 } else {
1753 while (iter != begin) {
1754 auto next = iter--;
1755 if (next->first.get_head() != iter->first.get_head()) {
1756 ++iter;
1757 break;
1758 }
1759 }
1760 }
1761 for_meta_scrub.objects.insert(begin, iter);
1762 cleaned_meta_map.objects.erase(begin, iter);
1763 }
1764 }
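    /*
     * Illustrative example (hypothetical objects, a < b): with
     * cleaned_meta_map.objects = { a:clone1, a:clone2, a:head, b:clone1 }
     * and a non-max chunk end, everything up to and including a:head is
     * moved into for_meta_scrub, while b:clone1 is kept back so that an
     * object's clones and head/snapset are never split across chunks.
     */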
1765
1766 // digest updates which we are waiting on
1767 int num_digest_updates_pending;
1768
1769 // chunky scrub
1770 hobject_t start, end; // [start,end)
1771 hobject_t max_end; // Largest end that may have been sent to replicas
1772 eversion_t subset_last_update;
1773
1774 // chunky scrub state
1775 enum State {
1776 INACTIVE,
1777 NEW_CHUNK,
1778 WAIT_PUSHES,
1779 WAIT_LAST_UPDATE,
1780 BUILD_MAP,
1781 BUILD_MAP_DONE,
1782 WAIT_REPLICAS,
1783 COMPARE_MAPS,
1784 WAIT_DIGEST_UPDATES,
1785 FINISH,
1786 BUILD_MAP_REPLICA,
1787 } state;
1788
1789 std::unique_ptr<Scrub::Store> store;
1790 // deep scrub
1791 bool deep;
1792 int preempt_left;
1793 int preempt_divisor;
1794
1795 list<Context*> callbacks;
1796 void add_callback(Context *context) {
1797 callbacks.push_back(context);
1798 }
1799 void run_callbacks() {
1800 list<Context*> to_run;
1801 to_run.swap(callbacks);
1802 for (list<Context*>::iterator i = to_run.begin();
1803 i != to_run.end();
1804 ++i) {
1805 (*i)->complete(0);
1806 }
1807 }
1808
1809 static const char *state_string(const PG::Scrubber::State& state) {
1810 const char *ret = NULL;
1811 switch( state )
1812 {
1813 case INACTIVE: ret = "INACTIVE"; break;
1814 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1815 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1816 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1817 case BUILD_MAP: ret = "BUILD_MAP"; break;
1818 case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
1819 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1820 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1821 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1822 case FINISH: ret = "FINISH"; break;
1823 case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
1824 }
1825 return ret;
1826 }
1827
1828 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1829
1830 // clear all state
1831 void reset() {
1832 active = false;
1833 waiting_on_whom.clear();
1834 if (active_rep_scrub) {
1835 active_rep_scrub = OpRequestRef();
1836 }
1837 received_maps.clear();
1838
1839 must_scrub = false;
1840 must_deep_scrub = false;
1841 must_repair = false;
1842 need_auto = false;
1843 time_for_deep = false;
1844 auto_repair = false;
1845 check_repair = false;
1846 deep_scrub_on_error = false;
1847
1848 state = PG::Scrubber::INACTIVE;
1849 start = hobject_t();
1850 end = hobject_t();
1851 max_end = hobject_t();
1852 subset_last_update = eversion_t();
1853 shallow_errors = 0;
1854 deep_errors = 0;
1855 fixed = 0;
1856 omap_stats = (const struct omap_stat_t){ 0 };
1857 deep = false;
1858 run_callbacks();
1859 inconsistent.clear();
1860 missing.clear();
1861 authoritative.clear();
1862 num_digest_updates_pending = 0;
1863 primary_scrubmap = ScrubMap();
1864 primary_scrubmap_pos.reset();
1865 replica_scrubmap = ScrubMap();
1866 replica_scrubmap_pos.reset();
1867 cleaned_meta_map = ScrubMap();
1868 sleeping = false;
1869 needs_sleep = true;
1870 sleep_start = utime_t();
1871 }
1872
1873 void create_results(const hobject_t& obj);
1874 void cleanup_store(ObjectStore::Transaction *t);
1875 } scrubber;
1876
1877 protected:
1878 bool scrub_after_recovery;
1879
1880 int active_pushes;
1881
1882 bool scrub_can_preempt = false;
1883 bool scrub_preempted = false;
1884
1885 // we allow some number of preemptions of the scrub, during which we do
1886 // not block writes.  after that we start to block, and once we start
1887 // blocking we do not stop until the scrub range is completed.
1888 bool write_blocked_by_scrub(const hobject_t &soid);
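// A minimal usage sketch (hypothetical caller and wait list; the real write
// path lives in PrimaryLogPG):
//   if (write_blocked_by_scrub(head_oid)) {
//     waiting_for_scrub.push_back(op);  // requeued when the chunk completes
//     return;
//   }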
1889
1890 /// true if the given range intersects the scrub interval in any way
1891 bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1892
1893 void repair_object(
1894 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1895 pg_shard_t bad_peer);
1896
1897 void chunky_scrub(ThreadPool::TPHandle &handle);
1898 void scrub_compare_maps();
1899 /**
1900 * return true if any inconsistency or missing object was repaired, false otherwise
1901 */
1902 bool scrub_process_inconsistent();
1903 bool ops_blocked_by_scrub() const;
1904 void scrub_finish();
1905 void scrub_clear_state(bool keep_repair = false);
1906 void _scan_snaps(ScrubMap &map);
1907 void _repair_oinfo_oid(ScrubMap &map);
1908 void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
1909 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1910 hobject_t start, hobject_t end, bool deep,
1911 bool allow_preemption);
1912 int build_scrub_map_chunk(
1913 ScrubMap &map,
1914 ScrubMapBuilder &pos,
1915 hobject_t start, hobject_t end, bool deep,
1916 ThreadPool::TPHandle &handle);
1917 /**
1918 * returns true if [begin, end) is good to scrub at this time
1919 * a false return value obliges the implementer to requeue scrub when the
1920 * condition preventing scrub clears
1921 */
1922 virtual bool _range_available_for_scrub(
1923 const hobject_t &begin, const hobject_t &end) = 0;
1924 virtual void scrub_snapshot_metadata(
1925 ScrubMap &map,
1926 const std::map<hobject_t,
1927 pair<boost::optional<uint32_t>,
1928 boost::optional<uint32_t>>> &missing_digest) { }
1929 virtual void _scrub_clear_state() { }
1930 virtual void _scrub_finish() { }
1931 void clear_scrub_reserved();
1932 void scrub_reserve_replicas();
1933 void scrub_unreserve_replicas();
1934 bool scrub_all_replicas_reserved() const;
1935
1936 void replica_scrub(
1937 OpRequestRef op,
1938 ThreadPool::TPHandle &handle);
1939 void do_replica_scrub_map(OpRequestRef op);
1940
1941 void handle_scrub_reserve_request(OpRequestRef op);
1942 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1943 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1944 void handle_scrub_reserve_release(OpRequestRef op);
1945
1946 void reject_reservation();
1947 void schedule_backfill_retry(float retry);
1948 void schedule_recovery_retry(float retry);
1949
1950 // -- recovery state --
1951
1952 template <class EVT>
1953 struct QueuePeeringEvt : Context {
1954 PGRef pg;
1955 epoch_t epoch;
1956 EVT evt;
1957 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1958 pg(pg), epoch(epoch), evt(evt) {}
1959 void finish(int r) override {
1960 pg->lock();
1961 pg->queue_peering_event(PGPeeringEventRef(
1962 new PGPeeringEvent(
1963 epoch,
1964 epoch,
1965 evt)));
1966 pg->unlock();
1967 }
1968 };
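// A usage sketch, assuming a transaction completion hook as used elsewhere
// in the OSD (the transaction `t` here is illustrative):
//   t.register_on_applied(
//     new QueuePeeringEvt<IntervalFlush>(
//       this, get_osdmap_epoch(), IntervalFlush()));
// When the context fires, the event is re-queued under the PG lock at the
// epoch captured above, and dropped later if the interval has changed.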
1969
1970
1971 struct QueryState : boost::statechart::event< QueryState > {
1972 Formatter *f;
1973 explicit QueryState(Formatter *f) : f(f) {}
1974 void print(std::ostream *out) const {
1975 *out << "Query";
1976 }
1977 };
1978
1979 public:
1980 int pg_stat_adjust(osd_stat_t *new_stat);
1981 protected:
1982
1983 struct AdvMap : boost::statechart::event< AdvMap > {
1984 OSDMapRef osdmap;
1985 OSDMapRef lastmap;
1986 vector<int> newup, newacting;
1987 int up_primary, acting_primary;
1988 AdvMap(
1989 OSDMapRef osdmap, OSDMapRef lastmap,
1990 vector<int>& newup, int up_primary,
1991 vector<int>& newacting, int acting_primary):
1992 osdmap(osdmap), lastmap(lastmap),
1993 newup(newup),
1994 newacting(newacting),
1995 up_primary(up_primary),
1996 acting_primary(acting_primary) {}
1997 void print(std::ostream *out) const {
1998 *out << "AdvMap";
1999 }
2000 };
2001
2002 struct ActMap : boost::statechart::event< ActMap > {
2003 ActMap() : boost::statechart::event< ActMap >() {}
2004 void print(std::ostream *out) const {
2005 *out << "ActMap";
2006 }
2007 };
2008 struct Activate : boost::statechart::event< Activate > {
2009 epoch_t activation_epoch;
2010 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
2011 activation_epoch(q) {}
2012 void print(std::ostream *out) const {
2013 *out << "Activate from " << activation_epoch;
2014 }
2015 };
2016 public:
2017 struct UnfoundBackfill : boost::statechart::event<UnfoundBackfill> {
2018 explicit UnfoundBackfill() {}
2019 void print(std::ostream *out) const {
2020 *out << "UnfoundBackfill";
2021 }
2022 };
2023 struct UnfoundRecovery : boost::statechart::event<UnfoundRecovery> {
2024 explicit UnfoundRecovery() {}
2025 void print(std::ostream *out) const {
2026 *out << "UnfoundRecovery";
2027 }
2028 };
2029
2030 struct RequestScrub : boost::statechart::event<RequestScrub> {
2031 bool deep;
2032 bool repair;
2033 explicit RequestScrub(bool d, bool r) : deep(d), repair(r) {}
2034 void print(std::ostream *out) const {
2035 *out << "RequestScrub(" << (deep ? "deep" : "shallow")
2036 << (repair ? " repair)" : ")");
2037 }
2038 };
2039
2040 protected:
2041 TrivialEvent(Initialize)
2042 TrivialEvent(GotInfo)
2043 TrivialEvent(NeedUpThru)
2044 TrivialEvent(Backfilled)
2045 TrivialEvent(LocalBackfillReserved)
2046 TrivialEvent(RejectRemoteReservation)
2047 public:
2048 TrivialEvent(RequestBackfill)
2049 protected:
2050 TrivialEvent(RemoteRecoveryPreempted)
2051 TrivialEvent(RemoteBackfillPreempted)
2052 TrivialEvent(BackfillTooFull)
2053 TrivialEvent(RecoveryTooFull)
2054
2055 TrivialEvent(MakePrimary)
2056 TrivialEvent(MakeStray)
2057 TrivialEvent(NeedActingChange)
2058 TrivialEvent(IsIncomplete)
2059 TrivialEvent(IsDown)
2060
2061 TrivialEvent(AllReplicasRecovered)
2062 TrivialEvent(DoRecovery)
2063 TrivialEvent(LocalRecoveryReserved)
2064 public:
2065 protected:
2066 TrivialEvent(AllRemotesReserved)
2067 TrivialEvent(AllBackfillsReserved)
2068 TrivialEvent(GoClean)
2069
2070 TrivialEvent(AllReplicasActivated)
2071
2072 TrivialEvent(IntervalFlush)
2073
2074 public:
2075 TrivialEvent(DeleteStart)
2076 TrivialEvent(DeleteSome)
2077
2078 TrivialEvent(SetForceRecovery)
2079 TrivialEvent(UnsetForceRecovery)
2080 TrivialEvent(SetForceBackfill)
2081 TrivialEvent(UnsetForceBackfill)
2082
2083 protected:
2084 TrivialEvent(DeleteReserved)
2085 TrivialEvent(DeleteInterrupted)
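// Each TrivialEvent(T) above expands (via the macro defined earlier in this
// header) to a boost::statechart event with a print() method, so peering can
// be driven with plain event objects, e.g. (sketch):
//   recovery_state.handle_event(DoRecovery(), &rctx);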
2086
2087 /* Encapsulates the PG peering and recovery state machine */
2088 class RecoveryState {
2089 void start_handle(RecoveryCtx *new_ctx);
2090 void end_handle();
2091 public:
2092 void begin_block_outgoing();
2093 void end_block_outgoing();
2094 void clear_blocked_outgoing();
2095 private:
2096
2097 /* States */
2098 struct Initial;
2099 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
2100 RecoveryState *state;
2101 public:
2102 PG *pg;
2103
2104 utime_t event_time;
2105 uint64_t event_count;
2106
2107 void clear_event_counters() {
2108 event_time = utime_t();
2109 event_count = 0;
2110 }
2111
2112 void log_enter(const char *state_name);
2113 void log_exit(const char *state_name, utime_t duration);
2114
2115 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
2116
2117 /* Accessor functions for state methods */
2118 ObjectStore::Transaction* get_cur_transaction() {
2119 ceph_assert(state->rctx);
2120 ceph_assert(state->rctx->transaction);
2121 return state->rctx->transaction;
2122 }
2123
2124 void send_query(pg_shard_t to, const pg_query_t &query) {
2125 ceph_assert(state->rctx);
2126 ceph_assert(state->rctx->query_map);
2127 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
2128 query;
2129 }
2130
2131 map<int, map<spg_t, pg_query_t> > *get_query_map() {
2132 ceph_assert(state->rctx);
2133 ceph_assert(state->rctx->query_map);
2134 return state->rctx->query_map;
2135 }
2136
2137 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
2138 ceph_assert(state->rctx);
2139 ceph_assert(state->rctx->info_map);
2140 return state->rctx->info_map;
2141 }
2142
2143 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
2144
2145 void send_notify(pg_shard_t to,
2146 const pg_notify_t &info, const PastIntervals &pi) {
2147 ceph_assert(state->rctx);
2148 state->rctx->send_notify(to, info, pi);
2149 }
2150 };
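// State reactions reach these accessors through the statechart context; a
// sketch of the pattern used by the react() implementations in PG.cc
// (`peer` is illustrative):
//   ObjectStore::Transaction *t =
//     context< RecoveryMachine >().get_cur_transaction();
//   context< RecoveryMachine >().send_query(
//     peer, pg_query_t(pg_query_t::INFO, peer.shard, pg->pg_whoami.shard,
//                      pg->info.history, pg->get_osdmap_epoch()));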
2151 friend class RecoveryMachine;
2152
2153 /* States */
2154 // Initial
2155 // Reset
2156 // Start
2157 // Started
2158 // Primary
2159 // WaitActingChange
2160 // Peering
2161 // GetInfo
2162 // GetLog
2163 // GetMissing
2164 // WaitUpThru
2165 // Incomplete
2166 // Active
2167 // Activating
2168 // Clean
2169 // Recovered
2170 // Backfilling
2171 // WaitRemoteBackfillReserved
2172 // WaitLocalBackfillReserved
2173 // NotBackfilling
2174 // NotRecovering
2175 // Recovering
2176 // WaitRemoteRecoveryReserved
2177 // WaitLocalRecoveryReserved
2178 // ReplicaActive
2179 // RepNotRecovering
2180 // RepRecovering
2181 // RepWaitBackfillReserved
2182 // RepWaitRecoveryReserved
2183 // Stray
2184 // ToDelete
2185 // WaitDeleteReserved
2186 // Deleting
2187 // Crashed
2188
2189 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
2190 explicit Crashed(my_context ctx);
2191 };
2192
2193 struct Reset;
2194
2195 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
2196 explicit Initial(my_context ctx);
2197 void exit();
2198
2199 typedef boost::mpl::list <
2200 boost::statechart::transition< Initialize, Reset >,
2201 boost::statechart::custom_reaction< NullEvt >,
2202 boost::statechart::transition< boost::statechart::event_base, Crashed >
2203 > reactions;
2204
2205 boost::statechart::result react(const MNotifyRec&);
2206 boost::statechart::result react(const MInfoRec&);
2207 boost::statechart::result react(const MLogRec&);
2208 boost::statechart::result react(const boost::statechart::event_base&) {
2209 return discard_event();
2210 }
2211 };
2212
2213 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
2214 explicit Reset(my_context ctx);
2215 void exit();
2216
2217 typedef boost::mpl::list <
2218 boost::statechart::custom_reaction< QueryState >,
2219 boost::statechart::custom_reaction< AdvMap >,
2220 boost::statechart::custom_reaction< ActMap >,
2221 boost::statechart::custom_reaction< NullEvt >,
2222 boost::statechart::custom_reaction< IntervalFlush >,
2223 boost::statechart::transition< boost::statechart::event_base, Crashed >
2224 > reactions;
2225 boost::statechart::result react(const QueryState& q);
2226 boost::statechart::result react(const AdvMap&);
2227 boost::statechart::result react(const ActMap&);
2228 boost::statechart::result react(const IntervalFlush&);
2229 boost::statechart::result react(const boost::statechart::event_base&) {
2230 return discard_event();
2231 }
2232 };
2233
2234 struct Start;
2235
2236 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
2237 explicit Started(my_context ctx);
2238 void exit();
2239
2240 typedef boost::mpl::list <
2241 boost::statechart::custom_reaction< QueryState >,
2242 boost::statechart::custom_reaction< AdvMap >,
2243 boost::statechart::custom_reaction< IntervalFlush >,
2244 // ignored
2245 boost::statechart::custom_reaction< NullEvt >,
2246 boost::statechart::custom_reaction<SetForceRecovery>,
2247 boost::statechart::custom_reaction<UnsetForceRecovery>,
2248 boost::statechart::custom_reaction<SetForceBackfill>,
2249 boost::statechart::custom_reaction<UnsetForceBackfill>,
2250 boost::statechart::custom_reaction<RequestScrub>,
2251 // crash
2252 boost::statechart::transition< boost::statechart::event_base, Crashed >
2253 > reactions;
2254 boost::statechart::result react(const QueryState& q);
2255 boost::statechart::result react(const AdvMap&);
2256 boost::statechart::result react(const IntervalFlush&);
2257 boost::statechart::result react(const boost::statechart::event_base&) {
2258 return discard_event();
2259 }
2260 };
2261
2262 struct Primary;
2263 struct Stray;
2264
2265 struct Start : boost::statechart::state< Start, Started >, NamedState {
2266 explicit Start(my_context ctx);
2267 void exit();
2268
2269 typedef boost::mpl::list <
2270 boost::statechart::transition< MakePrimary, Primary >,
2271 boost::statechart::transition< MakeStray, Stray >
2272 > reactions;
2273 };
2274
2275 struct Peering;
2276 struct WaitActingChange;
2277 struct Incomplete;
2278 struct Down;
2279
2280 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
2281 explicit Primary(my_context ctx);
2282 void exit();
2283
2284 typedef boost::mpl::list <
2285 boost::statechart::custom_reaction< ActMap >,
2286 boost::statechart::custom_reaction< MNotifyRec >,
2287 boost::statechart::transition< NeedActingChange, WaitActingChange >,
2288 boost::statechart::custom_reaction<SetForceRecovery>,
2289 boost::statechart::custom_reaction<UnsetForceRecovery>,
2290 boost::statechart::custom_reaction<SetForceBackfill>,
2291 boost::statechart::custom_reaction<UnsetForceBackfill>,
2292 boost::statechart::custom_reaction<RequestScrub>
2293 > reactions;
2294 boost::statechart::result react(const ActMap&);
2295 boost::statechart::result react(const MNotifyRec&);
2296 boost::statechart::result react(const SetForceRecovery&);
2297 boost::statechart::result react(const UnsetForceRecovery&);
2298 boost::statechart::result react(const SetForceBackfill&);
2299 boost::statechart::result react(const UnsetForceBackfill&);
2300 boost::statechart::result react(const RequestScrub&);
2301 };
2302
2303 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
2304 NamedState {
2305 typedef boost::mpl::list <
2306 boost::statechart::custom_reaction< QueryState >,
2307 boost::statechart::custom_reaction< AdvMap >,
2308 boost::statechart::custom_reaction< MLogRec >,
2309 boost::statechart::custom_reaction< MInfoRec >,
2310 boost::statechart::custom_reaction< MNotifyRec >
2311 > reactions;
2312 explicit WaitActingChange(my_context ctx);
2313 boost::statechart::result react(const QueryState& q);
2314 boost::statechart::result react(const AdvMap&);
2315 boost::statechart::result react(const MLogRec&);
2316 boost::statechart::result react(const MInfoRec&);
2317 boost::statechart::result react(const MNotifyRec&);
2318 void exit();
2319 };
2320
2321 struct GetInfo;
2322 struct Active;
2323
2324 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
2325 PastIntervals::PriorSet prior_set;
2326 bool history_les_bound; ///< whether we need osd_find_best_info_ignore_history_les
2327
2328 explicit Peering(my_context ctx);
2329 void exit();
2330
2331 typedef boost::mpl::list <
2332 boost::statechart::custom_reaction< QueryState >,
2333 boost::statechart::transition< Activate, Active >,
2334 boost::statechart::custom_reaction< AdvMap >
2335 > reactions;
2336 boost::statechart::result react(const QueryState& q);
2337 boost::statechart::result react(const AdvMap &advmap);
2338 };
2339
2340 struct WaitLocalRecoveryReserved;
2341 struct Activating;
2342 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
2343 explicit Active(my_context ctx);
2344 void exit();
2345
2346 const set<pg_shard_t> remote_shards_to_reserve_recovery;
2347 const set<pg_shard_t> remote_shards_to_reserve_backfill;
2348 bool all_replicas_activated;
2349
2350 typedef boost::mpl::list <
2351 boost::statechart::custom_reaction< QueryState >,
2352 boost::statechart::custom_reaction< ActMap >,
2353 boost::statechart::custom_reaction< AdvMap >,
2354 boost::statechart::custom_reaction< MInfoRec >,
2355 boost::statechart::custom_reaction< MNotifyRec >,
2356 boost::statechart::custom_reaction< MLogRec >,
2357 boost::statechart::custom_reaction< MTrim >,
2358 boost::statechart::custom_reaction< Backfilled >,
2359 boost::statechart::custom_reaction< AllReplicasActivated >,
2360 boost::statechart::custom_reaction< DeferRecovery >,
2361 boost::statechart::custom_reaction< DeferBackfill >,
2362 boost::statechart::custom_reaction< UnfoundRecovery >,
2363 boost::statechart::custom_reaction< UnfoundBackfill >,
2364 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2365 boost::statechart::custom_reaction< RemoteReservationRevoked>,
2366 boost::statechart::custom_reaction< DoRecovery>
2367 > reactions;
2368 boost::statechart::result react(const QueryState& q);
2369 boost::statechart::result react(const ActMap&);
2370 boost::statechart::result react(const AdvMap&);
2371 boost::statechart::result react(const MInfoRec& infoevt);
2372 boost::statechart::result react(const MNotifyRec& notevt);
2373 boost::statechart::result react(const MLogRec& logevt);
2374 boost::statechart::result react(const MTrim& trimevt);
2375 boost::statechart::result react(const Backfilled&) {
2376 return discard_event();
2377 }
2378 boost::statechart::result react(const AllReplicasActivated&);
2379 boost::statechart::result react(const DeferRecovery& evt) {
2380 return discard_event();
2381 }
2382 boost::statechart::result react(const DeferBackfill& evt) {
2383 return discard_event();
2384 }
2385 boost::statechart::result react(const UnfoundRecovery& evt) {
2386 return discard_event();
2387 }
2388 boost::statechart::result react(const UnfoundBackfill& evt) {
2389 return discard_event();
2390 }
2391 boost::statechart::result react(const RemoteReservationRevokedTooFull&) {
2392 return discard_event();
2393 }
2394 boost::statechart::result react(const RemoteReservationRevoked&) {
2395 return discard_event();
2396 }
2397 boost::statechart::result react(const DoRecovery&) {
2398 return discard_event();
2399 }
2400 };
2401
2402 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
2403 typedef boost::mpl::list<
2404 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2405 boost::statechart::custom_reaction<SetForceRecovery>,
2406 boost::statechart::custom_reaction<SetForceBackfill>
2407 > reactions;
2408 explicit Clean(my_context ctx);
2409 void exit();
2410 boost::statechart::result react(const boost::statechart::event_base&) {
2411 return discard_event();
2412 }
2413 };
2414
2415 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
2416 typedef boost::mpl::list<
2417 boost::statechart::transition< GoClean, Clean >,
2418 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2419 boost::statechart::custom_reaction< AllReplicasActivated >
2420 > reactions;
2421 explicit Recovered(my_context ctx);
2422 void exit();
2423 boost::statechart::result react(const AllReplicasActivated&) {
2424 post_event(GoClean());
2425 return forward_event();
2426 }
2427 };
2428
2429 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
2430 typedef boost::mpl::list<
2431 boost::statechart::custom_reaction< Backfilled >,
2432 boost::statechart::custom_reaction< DeferBackfill >,
2433 boost::statechart::custom_reaction< UnfoundBackfill >,
2434 boost::statechart::custom_reaction< RemoteReservationRejected >,
2435 boost::statechart::custom_reaction< RemoteReservationRevokedTooFull>,
2436 boost::statechart::custom_reaction< RemoteReservationRevoked>
2437 > reactions;
2438 explicit Backfilling(my_context ctx);
2439 boost::statechart::result react(const RemoteReservationRejected& evt) {
2440 // for compat with old peers
2441 post_event(RemoteReservationRevokedTooFull());
2442 return discard_event();
2443 }
2444 void backfill_release_reservations();
2445 boost::statechart::result react(const Backfilled& evt);
2446 boost::statechart::result react(const RemoteReservationRevokedTooFull& evt);
2447 boost::statechart::result react(const RemoteReservationRevoked& evt);
2448 boost::statechart::result react(const DeferBackfill& evt);
2449 boost::statechart::result react(const UnfoundBackfill& evt);
2450 void cancel_backfill();
2451 void exit();
2452 };
2453
2454 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
2455 typedef boost::mpl::list<
2456 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2457 boost::statechart::custom_reaction< RemoteReservationRejected >,
2458 boost::statechart::custom_reaction< RemoteReservationRevoked >,
2459 boost::statechart::transition< AllBackfillsReserved, Backfilling >
2460 > reactions;
2461 set<pg_shard_t>::const_iterator backfill_osd_it;
2462 explicit WaitRemoteBackfillReserved(my_context ctx);
2463 void retry();
2464 void exit();
2465 boost::statechart::result react(const RemoteBackfillReserved& evt);
2466 boost::statechart::result react(const RemoteReservationRejected& evt);
2467 boost::statechart::result react(const RemoteReservationRevoked& evt);
2468 };
2469
2470 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
2471 typedef boost::mpl::list<
2472 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
2473 > reactions;
2474 explicit WaitLocalBackfillReserved(my_context ctx);
2475 void exit();
2476 };
2477
2478 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
2479 typedef boost::mpl::list<
2480 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
2481 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2482 boost::statechart::custom_reaction< RemoteReservationRejected >
2483 > reactions;
2484 explicit NotBackfilling(my_context ctx);
2485 void exit();
2486 boost::statechart::result react(const RemoteBackfillReserved& evt);
2487 boost::statechart::result react(const RemoteReservationRejected& evt);
2488 };
2489
2490 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
2491 typedef boost::mpl::list<
2492 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2493 boost::statechart::custom_reaction< DeferRecovery >,
2494 boost::statechart::custom_reaction< UnfoundRecovery >
2495 > reactions;
2496 explicit NotRecovering(my_context ctx);
2497 boost::statechart::result react(const DeferRecovery& evt) {
2498 /* no-op */
2499 return discard_event();
2500 }
2501 boost::statechart::result react(const UnfoundRecovery& evt) {
2502 /* no-op */
2503 return discard_event();
2504 }
2505 void exit();
2506 };
2507
2508 struct ToDelete;
2509 struct RepNotRecovering;
2510 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
2511 explicit ReplicaActive(my_context ctx);
2512 void exit();
2513
2514 typedef boost::mpl::list <
2515 boost::statechart::custom_reaction< QueryState >,
2516 boost::statechart::custom_reaction< ActMap >,
2517 boost::statechart::custom_reaction< MQuery >,
2518 boost::statechart::custom_reaction< MInfoRec >,
2519 boost::statechart::custom_reaction< MLogRec >,
2520 boost::statechart::custom_reaction< MTrim >,
2521 boost::statechart::custom_reaction< Activate >,
2522 boost::statechart::custom_reaction< DeferRecovery >,
2523 boost::statechart::custom_reaction< DeferBackfill >,
2524 boost::statechart::custom_reaction< UnfoundRecovery >,
2525 boost::statechart::custom_reaction< UnfoundBackfill >,
2526 boost::statechart::custom_reaction< RemoteBackfillPreempted >,
2527 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2528 boost::statechart::custom_reaction< RecoveryDone >,
2529 boost::statechart::transition<DeleteStart, ToDelete>
2530 > reactions;
2531 boost::statechart::result react(const QueryState& q);
2532 boost::statechart::result react(const MInfoRec& infoevt);
2533 boost::statechart::result react(const MLogRec& logevt);
2534 boost::statechart::result react(const MTrim& trimevt);
2535 boost::statechart::result react(const ActMap&);
2536 boost::statechart::result react(const MQuery&);
2537 boost::statechart::result react(const Activate&);
2538 boost::statechart::result react(const RecoveryDone&) {
2539 return discard_event();
2540 }
2541 boost::statechart::result react(const DeferRecovery& evt) {
2542 return discard_event();
2543 }
2544 boost::statechart::result react(const DeferBackfill& evt) {
2545 return discard_event();
2546 }
2547 boost::statechart::result react(const UnfoundRecovery& evt) {
2548 return discard_event();
2549 }
2550 boost::statechart::result react(const UnfoundBackfill& evt) {
2551 return discard_event();
2552 }
2553 boost::statechart::result react(const RemoteBackfillPreempted& evt) {
2554 return discard_event();
2555 }
2556 boost::statechart::result react(const RemoteRecoveryPreempted& evt) {
2557 return discard_event();
2558 }
2559 };
2560
2561 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
2562 typedef boost::mpl::list<
2563 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
2564 // for compat with old peers
2565 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2566 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2567 boost::statechart::custom_reaction< BackfillTooFull >,
2568 boost::statechart::custom_reaction< RemoteRecoveryPreempted >,
2569 boost::statechart::custom_reaction< RemoteBackfillPreempted >
2570 > reactions;
2571 explicit RepRecovering(my_context ctx);
2572 boost::statechart::result react(const RemoteRecoveryPreempted &evt);
2573 boost::statechart::result react(const BackfillTooFull &evt);
2574 boost::statechart::result react(const RemoteBackfillPreempted &evt);
2575 void exit();
2576 };
2577
2578 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2579 typedef boost::mpl::list<
2580 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2581 boost::statechart::custom_reaction< RejectRemoteReservation >,
2582 boost::statechart::custom_reaction< RemoteReservationRejected >,
2583 boost::statechart::custom_reaction< RemoteReservationCanceled >
2584 > reactions;
2585 explicit RepWaitBackfillReserved(my_context ctx);
2586 void exit();
2587 boost::statechart::result react(const RemoteBackfillReserved &evt);
2588 boost::statechart::result react(const RejectRemoteReservation &evt);
2589 boost::statechart::result react(const RemoteReservationRejected &evt);
2590 boost::statechart::result react(const RemoteReservationCanceled &evt);
2591 };
2592
2593 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2594 typedef boost::mpl::list<
2595 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2596 // for compat with old peers
2597 boost::statechart::custom_reaction< RemoteReservationRejected >,
2598 boost::statechart::custom_reaction< RemoteReservationCanceled >
2599 > reactions;
2600 explicit RepWaitRecoveryReserved(my_context ctx);
2601 void exit();
2602 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2603 boost::statechart::result react(const RemoteReservationRejected &evt) {
2604 // for compat with old peers
2605 post_event(RemoteReservationCanceled());
2606 return discard_event();
2607 }
2608 boost::statechart::result react(const RemoteReservationCanceled &evt);
2609 };
2610
2611 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2612 typedef boost::mpl::list<
2613 boost::statechart::custom_reaction< RequestRecoveryPrio >,
2614 boost::statechart::custom_reaction< RequestBackfillPrio >,
2615 boost::statechart::custom_reaction< RejectRemoteReservation >,
2616 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2617 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2618 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2619 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2620 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2621 > reactions;
2622 explicit RepNotRecovering(my_context ctx);
2623 boost::statechart::result react(const RequestRecoveryPrio &evt);
2624 boost::statechart::result react(const RequestBackfillPrio &evt);
2625 boost::statechart::result react(const RemoteBackfillReserved &evt) {
2626 // my reservation completion raced with a RELEASE from primary
2627 return discard_event();
2628 }
2629 boost::statechart::result react(const RemoteRecoveryReserved &evt) {
2630 // my reservation completion raced with a RELEASE from primary
2631 return discard_event();
2632 }
2633 boost::statechart::result react(const RejectRemoteReservation &evt);
2634 void exit();
2635 };
2636
2637 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2638 typedef boost::mpl::list <
2639 boost::statechart::custom_reaction< AllReplicasRecovered >,
2640 boost::statechart::custom_reaction< DeferRecovery >,
2641 boost::statechart::custom_reaction< UnfoundRecovery >,
2642 boost::statechart::custom_reaction< RequestBackfill >
2643 > reactions;
2644 explicit Recovering(my_context ctx);
2645 void exit();
2646 void release_reservations(bool cancel = false);
2647 boost::statechart::result react(const AllReplicasRecovered &evt);
2648 boost::statechart::result react(const DeferRecovery& evt);
2649 boost::statechart::result react(const UnfoundRecovery& evt);
2650 boost::statechart::result react(const RequestBackfill &evt);
2651 };
2652
2653 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2654 typedef boost::mpl::list <
2655 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2656 boost::statechart::transition< AllRemotesReserved, Recovering >
2657 > reactions;
2658 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2659 explicit WaitRemoteRecoveryReserved(my_context ctx);
2660 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2661 void exit();
2662 };
2663
2664 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2665 typedef boost::mpl::list <
2666 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2667 boost::statechart::custom_reaction< RecoveryTooFull >
2668 > reactions;
2669 explicit WaitLocalRecoveryReserved(my_context ctx);
2670 void exit();
2671 boost::statechart::result react(const RecoveryTooFull &evt);
2672 };
2673
2674 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2675 typedef boost::mpl::list <
2676 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2677 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2678 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2679 > reactions;
2680 explicit Activating(my_context ctx);
2681 void exit();
2682 };
2683
2684 struct Stray : boost::statechart::state< Stray, Started >,
2685 NamedState {
2686 explicit Stray(my_context ctx);
2687 void exit();
2688
2689 typedef boost::mpl::list <
2690 boost::statechart::custom_reaction< MQuery >,
2691 boost::statechart::custom_reaction< MLogRec >,
2692 boost::statechart::custom_reaction< MInfoRec >,
2693 boost::statechart::custom_reaction< ActMap >,
2694 boost::statechart::custom_reaction< RecoveryDone >,
2695 boost::statechart::transition<DeleteStart, ToDelete>
2696 > reactions;
2697 boost::statechart::result react(const MQuery& query);
2698 boost::statechart::result react(const MLogRec& logevt);
2699 boost::statechart::result react(const MInfoRec& infoevt);
2700 boost::statechart::result react(const ActMap&);
2701 boost::statechart::result react(const RecoveryDone&) {
2702 return discard_event();
2703 }
2704 };
2705
2706 struct WaitDeleteReserved;
2707 struct ToDelete : boost::statechart::state<ToDelete, Started, WaitDeleteReserved>, NamedState {
2708 unsigned priority = 0;
2709 typedef boost::mpl::list <
2710 boost::statechart::custom_reaction< ActMap >,
2711 boost::statechart::custom_reaction< DeleteSome >
2712 > reactions;
2713 explicit ToDelete(my_context ctx);
2714 boost::statechart::result react(const ActMap &evt);
2715 boost::statechart::result react(const DeleteSome &evt) {
2716 // happens if we drop out of Deleting due to reprioritization etc.
2717 return discard_event();
2718 }
2719 void exit();
2720 };
2721
2722 struct Deleting;
2723 struct WaitDeleteReserved : boost::statechart::state<WaitDeleteReserved,
2724 ToDelete>, NamedState {
2725 typedef boost::mpl::list <
2726 boost::statechart::transition<DeleteReserved, Deleting>
2727 > reactions;
2728 explicit WaitDeleteReserved(my_context ctx);
2729 void exit();
2730 };
2731
2732 struct Deleting : boost::statechart::state<Deleting,
2733 ToDelete>, NamedState {
2734 typedef boost::mpl::list <
2735 boost::statechart::custom_reaction< DeleteSome >,
2736 boost::statechart::transition<DeleteInterrupted, WaitDeleteReserved>
2737 > reactions;
2738 explicit Deleting(my_context ctx);
2739 boost::statechart::result react(const DeleteSome &evt);
2740 void exit();
2741 };
2742
2743 struct GetLog;
2744
2745 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2746 set<pg_shard_t> peer_info_requested;
2747
2748 explicit GetInfo(my_context ctx);
2749 void exit();
2750 void get_infos();
2751
2752 typedef boost::mpl::list <
2753 boost::statechart::custom_reaction< QueryState >,
2754 boost::statechart::transition< GotInfo, GetLog >,
2755 boost::statechart::custom_reaction< MNotifyRec >,
2756 boost::statechart::transition< IsDown, Down >
2757 > reactions;
2758 boost::statechart::result react(const QueryState& q);
2759 boost::statechart::result react(const MNotifyRec& infoevt);
2760 };
2761
2762 struct GotLog : boost::statechart::event< GotLog > {
2763 GotLog() : boost::statechart::event< GotLog >() {}
2764 };
2765
2766 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2767 pg_shard_t auth_log_shard;
2768 boost::intrusive_ptr<MOSDPGLog> msg;
2769
2770 explicit GetLog(my_context ctx);
2771 void exit();
2772
2773 typedef boost::mpl::list <
2774 boost::statechart::custom_reaction< QueryState >,
2775 boost::statechart::custom_reaction< MLogRec >,
2776 boost::statechart::custom_reaction< GotLog >,
2777 boost::statechart::custom_reaction< AdvMap >,
2778 boost::statechart::transition< IsIncomplete, Incomplete >
2779 > reactions;
2780 boost::statechart::result react(const AdvMap&);
2781 boost::statechart::result react(const QueryState& q);
2782 boost::statechart::result react(const MLogRec& logevt);
2783 boost::statechart::result react(const GotLog&);
2784 };
2785
2786 struct WaitUpThru;
2787
2788 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2789 set<pg_shard_t> peer_missing_requested;
2790
2791 explicit GetMissing(my_context ctx);
2792 void exit();
2793
2794 typedef boost::mpl::list <
2795 boost::statechart::custom_reaction< QueryState >,
2796 boost::statechart::custom_reaction< MLogRec >,
2797 boost::statechart::transition< NeedUpThru, WaitUpThru >
2798 > reactions;
2799 boost::statechart::result react(const QueryState& q);
2800 boost::statechart::result react(const MLogRec& logevt);
2801 };
2802
2803 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2804 explicit WaitUpThru(my_context ctx);
2805 void exit();
2806
2807 typedef boost::mpl::list <
2808 boost::statechart::custom_reaction< QueryState >,
2809 boost::statechart::custom_reaction< ActMap >,
2810 boost::statechart::custom_reaction< MLogRec >
2811 > reactions;
2812 boost::statechart::result react(const QueryState& q);
2813 boost::statechart::result react(const ActMap& am);
2814 boost::statechart::result react(const MLogRec& logrec);
2815 };
2816
2817 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2818 explicit Down(my_context ctx);
2819 typedef boost::mpl::list <
2820 boost::statechart::custom_reaction< QueryState >,
2821 boost::statechart::custom_reaction< MNotifyRec >
2822 > reactions;
2823 boost::statechart::result react(const QueryState& q);
2824 boost::statechart::result react(const MNotifyRec& infoevt);
2825 void exit();
2826 };
2827
2828 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2829 typedef boost::mpl::list <
2830 boost::statechart::custom_reaction< AdvMap >,
2831 boost::statechart::custom_reaction< MNotifyRec >,
2832 boost::statechart::custom_reaction< QueryState >
2833 > reactions;
2834 explicit Incomplete(my_context ctx);
2835 boost::statechart::result react(const AdvMap &advmap);
2836 boost::statechart::result react(const MNotifyRec& infoevt);
2837 boost::statechart::result react(const QueryState& q);
2838 void exit();
2839 };
2840
2841 RecoveryMachine machine;
2842 PG *pg;
2843
2844 /// context passed in by state machine caller
2845 RecoveryCtx *orig_ctx;
2846
2847 /// populated if we are buffering messages pending a flush
2848 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2849
2850 /**
2851 * populated between start_handle() and end_handle(), points into
2852 * the message lists for messages_pending_flush while blocking messages
2853 * or into orig_ctx otherwise
2854 */
2855 boost::optional<RecoveryCtx> rctx;
2856
2857 public:
2858 explicit RecoveryState(PG *pg)
2859 : machine(this, pg), pg(pg), orig_ctx(0) {
2860 machine.initiate();
2861 }
2862
2863 void handle_event(const boost::statechart::event_base &evt,
2864 RecoveryCtx *rctx) {
2865 start_handle(rctx);
2866 machine.process_event(evt);
2867 end_handle();
2868 }
2869
2870 void handle_event(PGPeeringEventRef evt,
2871 RecoveryCtx *rctx) {
2872 start_handle(rctx);
2873 machine.process_event(evt->get_event());
2874 end_handle();
2875 }
2876
2877 } recovery_state;
2878
2879
2880
2881 uint64_t peer_features;
2882 uint64_t acting_features;
2883 uint64_t upacting_features;
2884
2885 epoch_t last_epoch;
2886
2887 /// most recently consumed osdmap's require_osd_release
2888 unsigned last_require_osd_release = 0;
2889 bool delete_needs_sleep = false;
2890
2891 protected:
2892 void reset_min_peer_features() {
2893 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2894 }
2895 uint64_t get_min_peer_features() const { return peer_features; }
2896 void apply_peer_features(uint64_t f) { peer_features &= f; }
2897
2898 uint64_t get_min_acting_features() const { return acting_features; }
2899 uint64_t get_min_upacting_features() const { return upacting_features; }
2900 bool perform_deletes_during_peering() const {
2901 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
2902 }
2903
2904 bool hard_limit_pglog() const {
2905 return (get_osdmap()->test_flag(CEPH_OSDMAP_PGLOG_HARDLIMIT));
2906 }
2907
2908 void init_primary_up_acting(
2909 const vector<int> &newup,
2910 const vector<int> &newacting,
2911 int new_up_primary,
2912 int new_acting_primary) {
2913 actingset.clear();
2914 acting = newacting;
2915 for (uint8_t i = 0; i < acting.size(); ++i) {
2916 if (acting[i] != CRUSH_ITEM_NONE)
2917 actingset.insert(
2918 pg_shard_t(
2919 acting[i],
2920 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2921 }
2922 upset.clear();
2923 up = newup;
2924 for (uint8_t i = 0; i < up.size(); ++i) {
2925 if (up[i] != CRUSH_ITEM_NONE)
2926 upset.insert(
2927 pg_shard_t(
2928 up[i],
2929 pool.info.is_erasure() ? shard_id_t(i) : shard_id_t::NO_SHARD));
2930 }
2931 if (!pool.info.is_erasure()) {
2932 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2933 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
2934 return;
2935 }
2936 up_primary = pg_shard_t();
2937 primary = pg_shard_t();
2938 for (uint8_t i = 0; i < up.size(); ++i) {
2939 if (up[i] == new_up_primary) {
2940 up_primary = pg_shard_t(up[i], shard_id_t(i));
2941 break;
2942 }
2943 }
2944 for (uint8_t i = 0; i < acting.size(); ++i) {
2945 if (acting[i] == new_acting_primary) {
2946 primary = pg_shard_t(acting[i], shard_id_t(i));
2947 break;
2948 }
2949 }
2950 ceph_assert(up_primary.osd == new_up_primary);
2951 ceph_assert(primary.osd == new_acting_primary);
2952 }
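// For an erasure-coded pool the position in up/acting doubles as the shard
// id: e.g. acting = {3, CRUSH_ITEM_NONE, 7} yields
// actingset = { pg_shard_t(3, shard_id_t(0)), pg_shard_t(7, shard_id_t(2)) },
// while a replicated pool uses shard_id_t::NO_SHARD throughout.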
2953
2954 void set_role(int r) {
2955 role = r;
2956 }
2957
2958 bool state_test(uint64_t m) const { return (state & m) != 0; }
2959 void state_set(uint64_t m) { state |= m; }
2960 void state_clear(uint64_t m) { state &= ~m; }
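// Usage sketch with the PG_STATE_* bits from osd_types.h:
//   state_set(PG_STATE_ACTIVE);
//   if (state_test(PG_STATE_ACTIVE | PG_STATE_PEERED)) { /* ... */ }
//   state_clear(PG_STATE_ACTIVATING);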
2961
2962 bool is_complete() const { return info.last_complete == info.last_update; }
2963 bool should_send_notify() const { return send_notify; }
2964
2965 uint64_t get_state() const { return state; }
2966 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2967 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2968 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2969 bool is_down() const { return state_test(PG_STATE_DOWN); }
2970 bool is_recovery_unfound() const { return state_test(PG_STATE_RECOVERY_UNFOUND); }
2971 bool is_backfill_unfound() const { return state_test(PG_STATE_BACKFILL_UNFOUND); }
2972 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2973 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2974 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2975 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2976 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
2977 bool is_remapped() const { return state_test(PG_STATE_REMAPPED); }
2978 bool is_peered() const {
2979 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
2980 }
2981 bool is_recovering() const { return state_test(PG_STATE_RECOVERING); }
2982 bool is_premerge() const { return state_test(PG_STATE_PREMERGE); }
2983 bool is_repair() const { return state_test(PG_STATE_REPAIR); }
2984
2985 bool is_empty() const { return info.last_update == eversion_t(0,0); }
2986
2987 // pg on-disk state
2988 void do_pending_flush();
2989
2990 public:
2991 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2992 static void _init(ObjectStore::Transaction& t,
2993 spg_t pgid, const pg_pool_t *pool);
2994
2995 protected:
2996 void prepare_write_info(map<string,bufferlist> *km);
2997
2998 void update_store_with_options();
2999
3000 public:
3001 static int _prepare_write_info(
3002 CephContext* cct,
3003 map<string,bufferlist> *km,
3004 epoch_t epoch,
3005 pg_info_t &info,
3006 pg_info_t &last_written_info,
3007 PastIntervals &past_intervals,
3008 bool dirty_big_info,
3009 bool dirty_epoch,
3010 bool try_fast_info,
3011 PerfCounters *logger = nullptr);
3012
3013 void write_if_dirty(RecoveryCtx *rctx) {
3014 write_if_dirty(*rctx->transaction);
3015 }
3016 protected:
3017 void write_if_dirty(ObjectStore::Transaction& t);
3018
3019 PGLog::IndexedLog projected_log;
3020 bool check_in_progress_op(
3021 const osd_reqid_t &r,
3022 eversion_t *version,
3023 version_t *user_version,
3024 int *return_code) const;
3025 eversion_t projected_last_update;
3026 eversion_t get_next_version() const {
3027 eversion_t at_version(
3028 get_osdmap_epoch(),
3029 projected_last_update.version+1);
3030 ceph_assert(at_version > info.last_update);
3031 ceph_assert(at_version > pg_log.get_head());
3032 ceph_assert(at_version > projected_last_update);
3033 return at_version;
3034 }
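// e.g. with osdmap epoch 120 and projected_last_update == 118'42 the next
// version handed out is 120'43 (epoch'version), which the asserts above
// require to be strictly newer than the log head and the last projected
// update.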
3035
3036 void add_log_entry(const pg_log_entry_t& e, bool applied);
3037 void append_log(
3038 const vector<pg_log_entry_t>& logv,
3039 eversion_t trim_to,
3040 eversion_t roll_forward_to,
3041 ObjectStore::Transaction &t,
3042 bool transaction_applied = true,
3043 bool async = false);
3044 bool check_log_for_corruption(ObjectStore *store);
3045
3046 std::string get_corrupt_pg_log_name() const;
3047
3048 void update_snap_map(
3049 const vector<pg_log_entry_t> &log_entries,
3050 ObjectStore::Transaction& t);
3051
3052 void filter_snapc(vector<snapid_t> &snaps);
3053
3054 void log_weirdness();
3055
3056 virtual void kick_snap_trim() = 0;
3057 virtual void snap_trimmer_scrub_complete() = 0;
3058 bool requeue_scrub(bool high_priority = false);
3059 void queue_recovery();
3060 bool queue_scrub();
3061 unsigned get_scrub_priority();
3062
3063 /// share pg info after a pg is active
3064 void share_pg_info();
3065
3066
3067 bool append_log_entries_update_missing(
3068 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3069 ObjectStore::Transaction &t,
3070 boost::optional<eversion_t> trim_to,
3071 boost::optional<eversion_t> roll_forward_to);
3072
3073 /**
3074 * Merge entries, updating the missing sets as necessary on all
3075 * acting_recovery_backfill logs and missing sets (including missing_loc)
3076 */
3077 void merge_new_log_entries(
3078 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
3079 ObjectStore::Transaction &t,
3080 boost::optional<eversion_t> trim_to,
3081 boost::optional<eversion_t> roll_forward_to);
3082
3083 void reset_interval_flush();
3084 void start_peering_interval(
3085 const OSDMapRef lastmap,
3086 const vector<int>& newup, int up_primary,
3087 const vector<int>& newacting, int acting_primary,
3088 ObjectStore::Transaction *t);
3089 void on_new_interval();
3090 virtual void _on_new_interval() = 0;
3091 void start_flush(ObjectStore::Transaction *t);
3092 void set_last_peering_reset();
3093
3094 void update_history(const pg_history_t& history);
3095 void fulfill_info(pg_shard_t from, const pg_query_t &query,
3096 pair<pg_shard_t, pg_info_t> &notify_info);
3097 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
3098 void fulfill_query(const MQuery& q, RecoveryCtx *rctx);
3099 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
3100
3101 bool should_restart_peering(
3102 int newupprimary,
3103 int newactingprimary,
3104 const vector<int>& newup,
3105 const vector<int>& newacting,
3106 OSDMapRef lastmap,
3107 OSDMapRef osdmap);
3108
3109 // OpRequest queueing
3110 bool can_discard_op(OpRequestRef& op);
3111 bool can_discard_scan(OpRequestRef op);
3112 bool can_discard_backfill(OpRequestRef op);
3113 bool can_discard_request(OpRequestRef& op);
3114
3115 template<typename T, int MSGTYPE>
3116 bool can_discard_replica_op(OpRequestRef& op);
3117
3118 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
3119 bool old_peering_evt(PGPeeringEventRef evt) {
3120 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
3121 }
3122 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
3123 return e <= cur_epoch;
3124 }
3125 bool have_same_or_newer_map(epoch_t e) {
3126 return e <= get_osdmap_epoch();
3127 }
3128
3129 bool op_has_sufficient_caps(OpRequestRef& op);
3130
3131
3132 // recovery bits
3133 void take_waiters();
3134
3135
3136 // abstract bits
3137 friend class FlushState;
3138
3139 virtual void on_role_change() = 0;
3140 virtual void on_pool_change() = 0;
3141 virtual void on_change(ObjectStore::Transaction *t) = 0;
3142 virtual void on_activate() = 0;
3143 virtual void on_flushed() = 0;
3144 virtual void check_blacklisted_watchers() = 0;
3145
3146 friend ostream& operator<<(ostream& out, const PG& pg);
3147 };
3148
3149
3150 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
3151
3152 #endif