ceph/src/osd/PG.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #ifndef CEPH_PG_H
  16 #define CEPH_PG_H
  17
  18 #include <boost/statechart/custom_reaction.hpp>
  19 #include <boost/statechart/event.hpp>
  20 #include <boost/statechart/simple_state.hpp>
  21 #include <boost/statechart/state.hpp>
  22 #include <boost/statechart/state_machine.hpp>
  23 #include <boost/statechart/transition.hpp>
  24 #include <boost/statechart/event_base.hpp>
  25 #include <boost/scoped_ptr.hpp>
  26 #include <boost/container/flat_set.hpp>
  27 #include "include/mempool.h"
  28
  29 // re-include our assert to clobber boost's
  30 #include "include/ceph_assert.h"
  31 #include "include/common_fwd.h"
  32
  33 #include "include/types.h"
  34 #include "include/stringify.h"
  35 #include "osd_types.h"
  36 #include "include/xlist.h"
  37 #include "SnapMapper.h"
  38 #include "Session.h"
  39 #include "common/Timer.h"
  40
  41 #include "PGLog.h"
  42 #include "OSDMap.h"
  43 #include "messages/MOSDPGLog.h"
  44 #include "include/str_list.h"
  45 #include "PGBackend.h"
  46 #include "PGPeeringEvent.h"
  47 #include "PeeringState.h"
  48 #include "MissingLoc.h"
  49
  50 #include "mgr/OSDPerfMetricTypes.h"
  51
  52 #include <atomic>
  53 #include <list>
  54 #include <memory>
  55 #include <string>
  56 #include <tuple>
  57
  58 //#define DEBUG_RECOVERY_OIDS   // track set of recovering oids explicitly, to find counting bugs
  59 //#define PG_DEBUG_REFS    // track provenance of pg refs, helpful for finding leaks
  60
  61 class OSD;
  62 class OSDService;
  63 class OSDShard;
  64 class OSDShardPGSlot;
  65 class MOSDOp;
  66 class MOSDPGScan;
  67 class MOSDPGBackfill;
  68 class MOSDPGInfo;
  69
  70 class PG;
  71 struct OpRequest;
  72 typedef OpRequest::Ref OpRequestRef;
  73 class MOSDPGLog;
  74 class DynamicPerfStats;
  75
  76 namespace Scrub {
  77   class Store;
  78 }
  79
  80 #ifdef PG_DEBUG_REFS
  81 #include "common/tracked_int_ptr.hpp"
  82   uint64_t get_with_id(PG *pg);
  83   void put_with_id(PG *pg, uint64_t id);
  84   typedef TrackedIntPtr<PG> PGRef;
  85 #else
  86   typedef boost::intrusive_ptr<PG> PGRef;
  87 #endif
  88
  89 class PGRecoveryStats {
  90   struct per_state_info {
  91     uint64_t enter, exit;     // enter/exit counts
  92     uint64_t events;
  93     utime_t event_time;       // time spent processing events
  94     utime_t total_time;       // total time in state
  95     utime_t min_time, max_time;
  96
  97     // cppcheck-suppress unreachableCode
  98     per_state_info() : enter(0), exit(0), events(0) {}
  99   };
 100   map<const char *,per_state_info> info;
 101   ceph::mutex lock = ceph::make_mutex("PGRecoverStats::lock");
 102
 103   public:
 104   PGRecoveryStats() = default;
 105
 106   void reset() {
 107     std::lock_guard l(lock);
 108     info.clear();
 109   }
 110   void dump(ostream& out) {
 111     std::lock_guard l(lock);
 112     for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
 113       per_state_info& i = p->second;
 114       out << i.enter << "\t" << i.exit << "\t"
 115           << i.events << "\t" << i.event_time << "\t"
 116           << i.total_time << "\t"
 117           << i.min_time << "\t" << i.max_time << "\t"
 118           << p->first << "\n";
 119     }
 120   }
 121
 122   void dump_formatted(Formatter *f) {
 123     std::lock_guard l(lock);
 124     f->open_array_section("pg_recovery_stats");
 125     for (map<const char *,per_state_info>::iterator p = info.begin();
 126          p != info.end(); ++p) {
 127       per_state_info& i = p->second;
 128       f->open_object_section("recovery_state");
 129       f->dump_int("enter", i.enter);
 130       f->dump_int("exit", i.exit);
 131       f->dump_int("events", i.events);
 132       f->dump_stream("event_time") << i.event_time;
 133       f->dump_stream("total_time") << i.total_time;
 134       f->dump_stream("min_time") << i.min_time;
 135       f->dump_stream("max_time") << i.max_time;
 136       vector<string> states;
 137       get_str_vec(p->first, "/", states);
 138       f->open_array_section("nested_states");
 139       for (vector<string>::iterator st = states.begin();
 140            st != states.end(); ++st) {
 141         f->dump_string("state", *st);
 142       }
 143       f->close_section();
 144       f->close_section();
 145     }
 146     f->close_section();
 147   }
 148
 149   void log_enter(const char *s) {
 150     std::lock_guard l(lock);
 151     info[s].enter++;
 152   }
 153   void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
 154     std::lock_guard l(lock);
 155     per_state_info &i = info[s];
 156     i.exit++;
 157     i.total_time += dur;
 158     if (dur > i.max_time)
 159       i.max_time = dur;
 160     if (dur < i.min_time || i.min_time == utime_t())
 161       i.min_time = dur;
 162     i.events += events;
 163     i.event_time += event_dur;
 164   }
 165 };
 166
 167 /** PG - Replica Placement Group
 168  *
 169  */
 170
 171 class PG : public DoutPrefixProvider, public PeeringState::PeeringListener {
 172   friend class NamedState;
 173   friend class PeeringState;
 174
 175 public:
 176   const pg_shard_t pg_whoami;
 177   const spg_t pg_id;
 178
 179 public:
 180   // -- members --
 181   const coll_t coll;
 182
 183   ObjectStore::CollectionHandle ch;
 184
 185   // -- methods --
 186   std::ostream& gen_prefix(std::ostream& out) const override;
 187   CephContext *get_cct() const override {
 188     return cct;
 189   }
 190   unsigned get_subsys() const override {
 191     return ceph_subsys_osd;
 192   }
 193
 194   const char* const get_current_state() const {
 195     return recovery_state.get_current_state();
 196   }
 197
 198   const OSDMapRef& get_osdmap() const {
 199     ceph_assert(is_locked());
 200     return recovery_state.get_osdmap();
 201   }
 202
 203   epoch_t get_osdmap_epoch() const override final {
 204     return recovery_state.get_osdmap()->get_epoch();
 205   }
 206
 207   PerfCounters &get_peering_perf() override;
 208   PerfCounters &get_perf_logger() override;
 209   void log_state_enter(const char *state) override;
 210   void log_state_exit(
 211     const char *state_name, utime_t enter_time,
 212     uint64_t events, utime_t event_dur) override;
 213
 214   void lock_suspend_timeout(ThreadPool::TPHandle &handle) {
 215     handle.suspend_tp_timeout();
 216     lock();
 217     handle.reset_tp_timeout();
 218   }
 219   void lock(bool no_lockdep = false) const;
 220   void unlock() const;
 221   bool is_locked() const;
 222
 223   const spg_t& get_pgid() const {
 224     return pg_id;
 225   }
 226
 227   const PGPool& get_pool() const {
 228     return pool;
 229   }
 230   uint64_t get_last_user_version() const {
 231     return info.last_user_version;
 232   }
 233   const pg_history_t& get_history() const {
 234     return info.history;
 235   }
 236   bool get_need_up_thru() const {
 237     return recovery_state.get_need_up_thru();
 238   }
 239   epoch_t get_same_interval_since() const {
 240     return info.history.same_interval_since;
 241   }
 242
 243   static void set_last_scrub_stamp(
 244     utime_t t, pg_history_t &history, pg_stat_t &stats) {
 245     stats.last_scrub_stamp = t;
 246     history.last_scrub_stamp = t;
 247   }
 248
 249   void set_last_scrub_stamp(utime_t t) {
 250     recovery_state.update_stats(
 251       [=](auto &history, auto &stats) {
 252         set_last_scrub_stamp(t, history, stats);
 253         return true;
 254       });
 255   }
 256
 257   static void set_last_deep_scrub_stamp(
 258     utime_t t, pg_history_t &history, pg_stat_t &stats) {
 259     stats.last_deep_scrub_stamp = t;
 260     history.last_deep_scrub_stamp = t;
 261   }
 262
 263   void set_last_deep_scrub_stamp(utime_t t) {
 264     recovery_state.update_stats(
 265       [=](auto &history, auto &stats) {
 266         set_last_deep_scrub_stamp(t, history, stats);
 267         return true;
 268       });
 269   }
 270
 271   bool is_deleting() const {
 272     return recovery_state.is_deleting();
 273   }
 274   bool is_deleted() const {
 275     return recovery_state.is_deleted();
 276   }
 277   bool is_nonprimary() const {
 278     return recovery_state.is_nonprimary();
 279   }
 280   bool is_primary() const {
 281     return recovery_state.is_primary();
 282   }
 283   bool pg_has_reset_since(epoch_t e) {
 284     ceph_assert(is_locked());
 285     return recovery_state.pg_has_reset_since(e);
 286   }
 287
 288   bool is_ec_pg() const {
 289     return recovery_state.is_ec_pg();
 290   }
 291   int get_role() const {
 292     return recovery_state.get_role();
 293   }
 294   const vector<int> get_acting() const {
 295     return recovery_state.get_acting();
 296   }
 297   const set<pg_shard_t> &get_actingset() const {
 298     return recovery_state.get_actingset();
 299   }
 300   int get_acting_primary() const {
 301     return recovery_state.get_acting_primary();
 302   }
 303   pg_shard_t get_primary() const {
 304     return recovery_state.get_primary();
 305   }
 306   const vector<int> get_up() const {
 307     return recovery_state.get_up();
 308   }
 309   int get_up_primary() const {
 310     return recovery_state.get_up_primary();
 311   }
 312   const PastIntervals& get_past_intervals() const {
 313     return recovery_state.get_past_intervals();
 314   }
 315   bool is_acting_recovery_backfill(pg_shard_t osd) const {
 316     return recovery_state.is_acting_recovery_backfill(osd);
 317   }
 318   const set<pg_shard_t> &get_acting_recovery_backfill() const {
 319     return recovery_state.get_acting_recovery_backfill();
 320   }
 321   bool is_acting(pg_shard_t osd) const {
 322     return recovery_state.is_acting(osd);
 323   }
 324   bool is_up(pg_shard_t osd) const {
 325     return recovery_state.is_up(osd);
 326   }
 327   static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
 328     return PeeringState::has_shard(ec, v, osd);
 329   }
 330
 331   /// initialize created PG
 332   void init(
 333     int role,
 334     const vector<int>& up,
 335     int up_primary,
 336     const vector<int>& acting,
 337     int acting_primary,
 338     const pg_history_t& history,
 339     const PastIntervals& pim,
 340     bool backfill,
 341     ObjectStore::Transaction &t);
 342
 343   /// read existing pg state off disk
 344   void read_state(ObjectStore *store);
 345   static int peek_map_epoch(ObjectStore *store, spg_t pgid, epoch_t *pepoch);
 346
 347   static int get_latest_struct_v() {
 348     return pg_latest_struct_v;
 349   }
 350   static int get_compat_struct_v() {
 351     return pg_compat_struct_v;
 352   }
 353   static int read_info(
 354     ObjectStore *store, spg_t pgid, const coll_t &coll,
 355     pg_info_t &info, PastIntervals &past_intervals,
 356     __u8 &);
 357   static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
 358
 359   void rm_backoff(const ceph::ref_t<Backoff>& b);
 360
 361   void update_snap_mapper_bits(uint32_t bits) {
 362     snap_mapper.update_bits(bits);
 363   }
 364   void start_split_stats(const set<spg_t>& childpgs, vector<object_stat_sum_t> *v);
 365   virtual void split_colls(
 366     spg_t child,
 367     int split_bits,
 368     int seed,
 369     const pg_pool_t *pool,
 370     ObjectStore::Transaction &t) = 0;
 371   void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
 372   void merge_from(map<spg_t,PGRef>& sources, PeeringCtx &rctx,
 373                   unsigned split_bits,
 374                   const pg_merge_meta_t& last_pg_merge_meta);
 375   void finish_split_stats(const object_stat_sum_t& stats,
 376                           ObjectStore::Transaction &t);
 377
 378   void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
 379
 380   bool is_scrub_registered();
 381   void reg_next_scrub();
 382   void unreg_next_scrub();
 383
 384   void queue_want_pg_temp(const vector<int> &wanted) override;
 385   void clear_want_pg_temp() override;
 386
 387   void on_new_interval() override;
 388
 389   void on_role_change() override;
 390   virtual void plpg_on_role_change() = 0;
 391
 392   void init_collection_pool_opts();
 393   void on_pool_change() override;
 394   virtual void plpg_on_pool_change() = 0;
 395
 396   void on_info_history_change() override;
 397
 398   void scrub_requested(bool deep, bool repair, bool need_auto = false) override;
 399
 400   uint64_t get_snap_trimq_size() const override {
 401     return snap_trimq.size();
 402   }
 403   unsigned get_target_pg_log_entries() const override;
 404
 405   void clear_publish_stats() override;
 406   void clear_primary_state() override;
 407
 408   epoch_t oldest_stored_osdmap() override;
 409   OstreamTemp get_clog_error() override;
 410   OstreamTemp get_clog_info() override;
 411   OstreamTemp get_clog_debug() override;
 412
 413   void schedule_event_after(
 414     PGPeeringEventRef event,
 415     float delay) override;
 416   void request_local_background_io_reservation(
 417     unsigned priority,
 418     PGPeeringEventRef on_grant,
 419     PGPeeringEventRef on_preempt) override;
 420   void update_local_background_io_priority(
 421     unsigned priority) override;
 422   void cancel_local_background_io_reservation() override;
 423
 424   void request_remote_recovery_reservation(
 425     unsigned priority,
 426     PGPeeringEventRef on_grant,
 427     PGPeeringEventRef on_preempt) override;
 428   void cancel_remote_recovery_reservation() override;
 429
 430   void schedule_event_on_commit(
 431     ObjectStore::Transaction &t,
 432     PGPeeringEventRef on_commit) override;
 433
 434   void on_active_exit() override;
 435
 436   Context *on_clean() override {
 437     if (is_active()) {
 438       kick_snap_trim();
 439     }
 440     requeue_ops(waiting_for_clean_to_primary_repair);
 441     return finish_recovery();
 442   }
 443
 444   void on_activate(interval_set<snapid_t> snaps) override {
 445     ceph_assert(scrubber.callbacks.empty());
 446     ceph_assert(callbacks_for_degraded_object.empty());
 447     snap_trimq = snaps;
 448     release_pg_backoffs();
 449     projected_last_update = info.last_update;
 450   }
 451
 452   void on_activate_committed() override;
 453
 454   void on_active_actmap() override;
 455   void on_active_advmap(const OSDMapRef &osdmap) override;
 456
 457   void queue_snap_retrim(snapid_t snap);
 458
 459   void on_backfill_reserved() override;
 460   void on_backfill_canceled() override;
 461   void on_recovery_reserved() override;
 462
 463   bool is_forced_recovery_or_backfill() const {
 464     return recovery_state.is_forced_recovery_or_backfill();
 465   }
 466
 467   PGLog::LogEntryHandlerRef get_log_handler(
 468     ObjectStore::Transaction &t) override {
 469     return std::make_unique<PG::PGLogEntryHandler>(this, &t);
 470   }
 471
 472   void do_delete_work(ObjectStore::Transaction &t) override;
 473
 474   void clear_ready_to_merge() override;
 475   void set_not_ready_to_merge_target(pg_t pgid, pg_t src) override;
 476   void set_not_ready_to_merge_source(pg_t pgid) override;
 477   void set_ready_to_merge_target(eversion_t lu, epoch_t les, epoch_t lec) override;
 478   void set_ready_to_merge_source(eversion_t lu) override;
 479
 480   void send_pg_created(pg_t pgid) override;
 481
 482   ceph::signedspan get_mnow() override;
 483   HeartbeatStampsRef get_hb_stamps(int peer) override;
 484   void schedule_renew_lease(epoch_t lpr, ceph::timespan delay) override;
 485   void queue_check_readable(epoch_t lpr, ceph::timespan delay) override;
 486
 487   void rebuild_missing_set_with_deletes(PGLog &pglog) override;
 488
 489   void queue_peering_event(PGPeeringEventRef evt);
 490   void do_peering_event(PGPeeringEventRef evt, PeeringCtx &rcx);
 491   void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
 492   void queue_flushed(epoch_t started_at);
 493   void handle_advance_map(
 494     OSDMapRef osdmap, OSDMapRef lastmap,
 495     vector<int>& newup, int up_primary,
 496     vector<int>& newacting, int acting_primary,
 497     PeeringCtx &rctx);
 498   void handle_activate_map(PeeringCtx &rctx);
 499   void handle_initialize(PeeringCtx &rxcx);
 500   void handle_query_state(Formatter *f);
 501
 502   /**
 503    * @param ops_begun returns how many recovery ops the function started
 504    * @returns true if any useful work was accomplished; false otherwise
 505    */
 506   virtual bool start_recovery_ops(
 507     uint64_t max,
 508     ThreadPool::TPHandle &handle,
 509     uint64_t *ops_begun) = 0;
 510
 511   // more work after the above, but with a PeeringCtx
 512   void find_unfound(epoch_t queued, PeeringCtx &rctx);
 513
 514   virtual void get_watchers(std::list<obj_watch_item_t> *ls) = 0;
 515
 516   void dump_pgstate_history(Formatter *f);
 517   void dump_missing(Formatter *f);
 518
 519   void get_pg_stats(std::function<void(const pg_stat_t&, epoch_t lec)> f);
 520   void with_heartbeat_peers(std::function<void(int)> f);
 521
 522   void shutdown();
 523   virtual void on_shutdown() = 0;
 524
 525   bool get_must_scrub() const {
 526     return scrubber.must_scrub;
 527   }
 528   bool sched_scrub();
 529
 530   virtual void do_request(
 531     OpRequestRef& op,
 532     ThreadPool::TPHandle &handle
 533   ) = 0;
 534   virtual void clear_cache() = 0;
 535   virtual int get_cache_obj_count() = 0;
 536
 537   virtual void snap_trimmer(epoch_t epoch_queued) = 0;
 538   virtual void do_command(
 539     const string_view& prefix,
 540     const cmdmap_t& cmdmap,
 541     const bufferlist& idata,
 542     std::function<void(int,const std::string&,bufferlist&)> on_finish) = 0;
 543
 544   virtual bool agent_work(int max) = 0;
 545   virtual bool agent_work(int max, int agent_flush_quota) = 0;
 546   virtual void agent_stop() = 0;
 547   virtual void agent_delay() = 0;
 548   virtual void agent_clear() = 0;
 549   virtual void agent_choose_mode_restart() = 0;
 550
 551   struct C_DeleteMore : public Context {
 552     PGRef pg;
 553     epoch_t epoch;
 554     C_DeleteMore(PG *p, epoch_t e) : pg(p), epoch(e) {}
 555     void finish(int r) override {
 556       ceph_abort();
 557     }
 558     void complete(int r) override;
 559   };
 560
 561   void _delete_some(ObjectStore::Transaction *t);
 562
 563   virtual void set_dynamic_perf_stats_queries(
 564     const std::list<OSDPerfMetricQuery> &queries) {
 565   }
 566   virtual void get_dynamic_perf_stats(DynamicPerfStats *stats) {
 567   }
 568
 569   uint64_t get_min_alloc_size() const;
 570
 571   // reference counting
 572 #ifdef PG_DEBUG_REFS
 573   uint64_t get_with_id();
 574   void put_with_id(uint64_t);
 575   void dump_live_ids();
 576 #endif
 577   void get(const char* tag);
 578   void put(const char* tag);
 579   int get_num_ref() {
 580     return ref;
 581   }
 582
 583   // ctor
 584   PG(OSDService *o, OSDMapRef curmap,
 585      const PGPool &pool, spg_t p);
 586   ~PG() override;
 587
 588   // prevent copying
 589   explicit PG(const PG& rhs) = delete;
 590   PG& operator=(const PG& rhs) = delete;
 591
 592 protected:
 593   // -------------
 594   // protected
 595   OSDService *osd;
 596 public:
 597   OSDShard *osd_shard = nullptr;
 598   OSDShardPGSlot *pg_slot = nullptr;
 599 protected:
 600   CephContext *cct;
 601
 602   // locking and reference counting.
 603   // I destroy myself when the reference count hits zero.
 604   // lock() should be called before doing anything.
 605   // get() should be called on pointer copy (to another thread, etc.).
 606   // put() should be called on destruction of some previously copied pointer.
 607   // unlock() when done with the current pointer (_most common_).
 608   mutable ceph::mutex _lock = ceph::make_mutex("PG::_lock");
 609 #ifndef CEPH_DEBUG_MUTEX
 610   mutable std::thread::id locked_by;
 611 #endif
 612   std::atomic<unsigned int> ref{0};
 613
 614 #ifdef PG_DEBUG_REFS
 615   ceph::mutex _ref_id_lock = ceph::make_mutex("PG::_ref_id_lock");
 616   map<uint64_t, string> _live_ids;
 617   map<string, uint64_t> _tag_counts;
 618   uint64_t _ref_id = 0;
 619
 620   friend uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
 621   friend void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
 622 #endif
 623
 624 private:
 625   friend void intrusive_ptr_add_ref(PG *pg) {
 626     pg->get("intptr");
 627   }
 628   friend void intrusive_ptr_release(PG *pg) {
 629     pg->put("intptr");
 630   }
 631
 632
 633   // =====================
 634
 635 protected:
 636   OSDriver osdriver;
 637   SnapMapper snap_mapper;
 638   bool eio_errors_to_process = false;
 639
 640   virtual PGBackend *get_pgbackend() = 0;
 641   virtual const PGBackend* get_pgbackend() const = 0;
 642
 643 protected:
 644   void requeue_map_waiters();
 645
 646 protected:
 647
 648   ZTracer::Endpoint trace_endpoint;
 649
 650
 651 protected:
 652   __u8 info_struct_v = 0;
 653   void upgrade(ObjectStore *store);
 654
 655 protected:
 656   ghobject_t    pgmeta_oid;
 657
 658   // ------------------
 659   interval_set<snapid_t> snap_trimq;
 660   set<snapid_t> snap_trimq_repeat;
 661
 662   /* You should not use these items without taking their respective queue locks
 663    * (if they have one) */
 664   xlist<PG*>::item stat_queue_item;
 665   bool scrub_queued;
 666   bool recovery_queued;
 667
 668   int recovery_ops_active;
 669   set<pg_shard_t> waiting_on_backfill;
 670 #ifdef DEBUG_RECOVERY_OIDS
 671   multiset<hobject_t> recovering_oids;
 672 #endif
 673
 674 public:
 675   bool dne() { return info.dne(); }
 676
 677   virtual void send_cluster_message(
 678     int osd, Message *m, epoch_t epoch, bool share_map_update) override;
 679
 680 protected:
 681   epoch_t get_last_peering_reset() const {
 682     return recovery_state.get_last_peering_reset();
 683   }
 684
 685   /* heartbeat peers */
 686   void set_probe_targets(const set<pg_shard_t> &probe_set) override;
 687   void clear_probe_targets() override;
 688
 689   ceph::mutex heartbeat_peer_lock =
 690     ceph::make_mutex("PG::heartbeat_peer_lock");
 691   set<int> heartbeat_peers;
 692   set<int> probe_targets;
 693
 694 public:
 695   /**
 696    * BackfillInterval
 697    *
 698    * Represents the objects in a range [begin, end)
 699    *
 700    * Possible states:
   * 1) begin == end == hobject_t() indicates the interval is unpopulated
 702    * 2) Else, objects contains all objects in [begin, end)
 703    */
 704   struct BackfillInterval {
 705     // info about a backfill interval on a peer
 706     eversion_t version; /// version at which the scan occurred
 707     map<hobject_t,eversion_t> objects;
 708     hobject_t begin;
 709     hobject_t end;
 710
 711     /// clear content
 712     void clear() {
 713       *this = BackfillInterval();
 714     }
 715
 716     /// clear objects list only
 717     void clear_objects() {
 718       objects.clear();
 719     }
 720
 721     /// reinstantiate with a new start+end position and sort order
 722     void reset(hobject_t start) {
 723       clear();
 724       begin = end = start;
 725     }
 726
 727     /// true if there are no objects in this interval
 728     bool empty() const {
 729       return objects.empty();
 730     }
 731
 732     /// true if interval extends to the end of the range
 733     bool extends_to_end() const {
 734       return end.is_max();
 735     }
 736
 737     /// removes items <= soid and adjusts begin to the first object
 738     void trim_to(const hobject_t &soid) {
 739       trim();
 740       while (!objects.empty() &&
 741              objects.begin()->first <= soid) {
 742         pop_front();
 743       }
 744     }
 745
 746     /// Adjusts begin to the first object
 747     void trim() {
 748       if (!objects.empty())
 749         begin = objects.begin()->first;
 750       else
 751         begin = end;
 752     }
 753
 754     /// drop first entry, and adjust @begin accordingly
 755     void pop_front() {
 756       ceph_assert(!objects.empty());
 757       objects.erase(objects.begin());
 758       trim();
 759     }
 760
 761     /// dump
 762     void dump(Formatter *f) const {
 763       f->dump_stream("begin") << begin;
 764       f->dump_stream("end") << end;
 765       f->open_array_section("objects");
 766       for (map<hobject_t, eversion_t>::const_iterator i =
 767              objects.begin();
 768            i != objects.end();
 769            ++i) {
 770         f->open_object_section("object");
 771         f->dump_stream("object") << i->first;
 772         f->dump_stream("version") << i->second;
 773         f->close_section();
 774       }
 775       f->close_section();
 776     }
 777   };
 778
 779 protected:
 780   BackfillInterval backfill_info;
 781   map<pg_shard_t, BackfillInterval> peer_backfill_info;
 782   bool backfill_reserving;
 783
 784   // The primary's num_bytes and local num_bytes for this pg, only valid
 785   // during backfill for non-primary shards.
 786   // Both of these are adjusted for EC to reflect the on-disk bytes
 787   std::atomic<int64_t> primary_num_bytes = 0;
 788   std::atomic<int64_t> local_num_bytes = 0;
 789
 790 public:
 791   // Space reserved for backfill is primary_num_bytes - local_num_bytes
 792   // Don't care that difference itself isn't atomic
 793   uint64_t get_reserved_num_bytes() {
 794     int64_t primary = primary_num_bytes.load();
 795     int64_t local = local_num_bytes.load();
 796     if (primary > local)
 797       return primary - local;
 798     else
 799       return 0;
 800   }
 801
 802   bool is_remote_backfilling() {
 803     return primary_num_bytes.load() > 0;
 804   }
 805
 806   bool try_reserve_recovery_space(int64_t primary, int64_t local) override;
 807   void unreserve_recovery_space() override;
 808
  // If the num_bytes values are inconsistent and local_num_bytes would go
  // negative, it's ok, because it would then be ignored.
 811
 812   // The value of num_bytes could be negative,
 813   // but we don't let local_num_bytes go negative.
 814   void add_local_num_bytes(int64_t num_bytes) {
 815     if (num_bytes) {
 816       int64_t prev_bytes = local_num_bytes.load();
 817       int64_t new_bytes;
 818       do {
 819         new_bytes = prev_bytes + num_bytes;
 820         if (new_bytes < 0)
 821           new_bytes = 0;
 822       } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
 823     }
 824   }
 825   void sub_local_num_bytes(int64_t num_bytes) {
 826     ceph_assert(num_bytes >= 0);
 827     if (num_bytes) {
 828       int64_t prev_bytes = local_num_bytes.load();
 829       int64_t new_bytes;
 830       do {
 831         new_bytes = prev_bytes - num_bytes;
 832         if (new_bytes < 0)
 833           new_bytes = 0;
 834       } while(!local_num_bytes.compare_exchange_weak(prev_bytes, new_bytes));
 835     }
 836   }
 837   // The value of num_bytes could be negative,
 838   // but we don't let info.stats.stats.sum.num_bytes go negative.
 839   void add_num_bytes(int64_t num_bytes) {
 840     ceph_assert(ceph_mutex_is_locked_by_me(_lock));
 841     if (num_bytes) {
 842       recovery_state.update_stats(
 843         [num_bytes](auto &history, auto &stats) {
 844           stats.stats.sum.num_bytes += num_bytes;
 845           if (stats.stats.sum.num_bytes < 0) {
 846             stats.stats.sum.num_bytes = 0;
 847           }
 848           return false;
 849         });
 850     }
 851   }
 852   void sub_num_bytes(int64_t num_bytes) {
 853     ceph_assert(ceph_mutex_is_locked_by_me(_lock));
 854     ceph_assert(num_bytes >= 0);
 855     if (num_bytes) {
 856       recovery_state.update_stats(
 857         [num_bytes](auto &history, auto &stats) {
 858           stats.stats.sum.num_bytes -= num_bytes;
 859           if (stats.stats.sum.num_bytes < 0) {
 860             stats.stats.sum.num_bytes = 0;
 861           }
 862           return false;
 863         });
 864     }
 865   }
 866
 867   // Only used in testing so not worried about needing the PG lock here
 868   int64_t get_stats_num_bytes() {
 869     std::lock_guard l{_lock};
 870     int num_bytes = info.stats.stats.sum.num_bytes;
 871     if (pool.info.is_erasure()) {
 872       num_bytes /= (int)get_pgbackend()->get_ec_data_chunk_count();
 873       // Round up each object by a stripe
 874       num_bytes +=  get_pgbackend()->get_ec_stripe_chunk_size() * info.stats.stats.sum.num_objects;
 875     }
 876     int64_t lnb = local_num_bytes.load();
 877     if (lnb && lnb != num_bytes) {
 878       lgeneric_dout(cct, 0) << this << " " << info.pgid << " num_bytes mismatch "
 879                             << lnb << " vs stats "
 880                             << info.stats.stats.sum.num_bytes << " / chunk "
 881                             << get_pgbackend()->get_ec_data_chunk_count()
 882                             << dendl;
 883     }
 884     return num_bytes;
 885   }
 886
 887 protected:
 888
 889   /*
 890    * blocked request wait hierarchy
 891    *
 892    * In order to preserve request ordering we need to be careful about the
 893    * order in which blocked requests get requeued.  Generally speaking, we
 894    * push the requests back up to the op_wq in reverse order (most recent
 895    * request first) so that they come back out again in the original order.
 896    * However, because there are multiple wait queues, we need to requeue
 897    * waitlists in order.  Generally speaking, we requeue the wait lists
 898    * that are checked first.
 899    *
 900    * Here are the various wait lists, in the order they are used during
 901    * request processing, with notes:
 902    *
 903    *  - waiting_for_map
 904    *    - may start or stop blocking at any time (depending on client epoch)
 905    *  - waiting_for_peered
 906    *    - !is_peered()
 907    *    - only starts blocking on interval change; never restarts
 908    *  - waiting_for_flush
 909    *    - flushes_in_progress
 910    *    - waiting for final flush during activate
 911    *  - waiting_for_active
 912    *    - !is_active()
 913    *    - only starts blocking on interval change; never restarts
 914    *  - waiting_for_readable
 915    *    - now > readable_until
 916    *    - unblocks when we get fresh(er) osd_pings
 917    *  - waiting_for_scrub
 918    *    - starts and stops blocking for varying intervals during scrub
 919    *  - waiting_for_unreadable_object
 920    *    - never restarts once object is readable (* except for EIO?)
 921    *  - waiting_for_degraded_object
 922    *    - never restarts once object is writeable (* except for EIO?)
 923    *  - waiting_for_blocked_object
 924    *    - starts and stops based on proxied op activity
 925    *  - obc rwlocks
 926    *    - starts and stops based on read/write activity
 927    *
 928    * Notes:
 929    *
   *  1. During an interval change, we requeue *everything* in the above order.
 931    *
 932    *  2. When an obc rwlock is released, we check for a scrub block and requeue
 933    *     the op there if it applies.  We ignore the unreadable/degraded/blocked
 934    *     queues because we assume they cannot apply at that time (this is
 935    *     probably mostly true).
 936    *
 937    *  3. The requeue_ops helper will push ops onto the waiting_for_map list if
 938    *     it is non-empty.
 939    *
 940    * These three behaviors are generally sufficient to maintain ordering, with
 941    * the possible exception of cases where we make an object degraded or
 942    * unreadable that was previously okay, e.g. when scrub or op processing
 943    * encounter an unexpected error.  FIXME.
 944    */
 945
 946   // ops with newer maps than ours (or blocked behind them)
 947   // track these by client, since inter-request ordering doesn't otherwise
 948   // matter.
       // Keyed by the client's entity_name_t; each value is that client's
       // list of blocked ops.
 949   unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
 950
 951   // ops waiting on peered (blocked while !is_peered())
 952   list<OpRequestRef>            waiting_for_peered;
 953
 954   /// ops waiting on readable (blocked while now > readable_until)
 955   list<OpRequestRef>            waiting_for_readable;
 956
 957   // ops waiting on active (require peered as well)
 958   list<OpRequestRef>            waiting_for_active;
       // ops waiting for the final flush during activate (flushes_in_progress)
 959   list<OpRequestRef>            waiting_for_flush;
       // ops blocked by scrub; blocking starts and stops during a scrub
 960   list<OpRequestRef>            waiting_for_scrub;
 961
       // NOTE(review): these two lists are not covered by the wait-list
       // overview comment above; by name they hold ops blocked on a full cache
       // and on a clean-to-primary repair respectively -- confirm at the users.
 962   list<OpRequestRef>            waiting_for_cache_not_full;
 963   list<OpRequestRef>            waiting_for_clean_to_primary_repair;
       // per-object wait lists, keyed by the object the op is blocked on:
       //  - unreadable: object is not yet readable
       //  - degraded:   object is not yet writeable
       //  - blocked:    starts and stops based on proxied op activity
 964   map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
 965                              waiting_for_degraded_object,
 966                              waiting_for_blocked_object;
 967
 968   set<hobject_t> objects_blocked_on_cache_full;
       // NOTE(review): the objects_blocked_on_* containers are only declared
       // here.  The set above and the two maps below track blocked objects;
       // the mapped snapid_t / ObjectContextRef presumably identify what each
       // object is blocked on -- confirm against the code that fills them.
 969   map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
 970   map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
 971
 972   // Callbacks should assume pg (and nothing else) is locked
       // Keyed by object; the Context pointers are raw (ownership is not
       // expressed by the type).
 973   map<hobject_t, list<Context*>> callbacks_for_degraded_object;
 974
       // Entries waiting on a given eversion_t.  NOTE(review): the tuple
       // fields line up with check_in_progress_op()'s out-parameters, i.e.
       // presumably (op, user_version, return_code, op_returns) -- confirm.
 975   map<eversion_t,
 976       list<
 977         tuple<OpRequestRef, version_t, int,
 978               vector<pg_log_op_return_item_t>>>> waiting_for_ondisk;
 979
 980   void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
       ///< requeue the ops held in a per-object wait map (taken by non-const
       ///< reference; defined out of line)
 981   void requeue_op(OpRequestRef op);
       ///< requeue a single op (defined out of line)
 982   void requeue_ops(list<OpRequestRef> &l);
       ///< requeue a list of ops; per the wait-list notes, ops are pushed onto
       ///< waiting_for_map instead if that list is non-empty
 983
 984   // stats that persist lazily
 985   object_stat_collection_t unstable_stats;
 986
 987   // publish stats
       // NOTE(review): the lock presumably guards pg_stats_publish_valid and
       // pg_stats_publish -- confirm in publish_stats_to_osd().
 988   ceph::mutex pg_stats_publish_lock =
 989     ceph::make_mutex("PG::pg_stats_publish_lock");
       // no in-class initializer: must be set by the PG constructor
 990   bool pg_stats_publish_valid;
 991   pg_stat_t pg_stats_publish;
 992
       // test hook is granted access to PG internals
 993   friend class TestOpsSocketHook;
       // overrides a base-class virtual; defined out of line
 994   void publish_stats_to_osd() override;
 995
 996   bool needs_recovery() const {
         // forwarded to the PeeringState member
 997     return recovery_state.needs_recovery();
 998   }
 999   bool needs_backfill() const {
         // forwarded to the PeeringState member
1000     return recovery_state.needs_backfill();
1001   }
1002
       // Defined out of line.  Takes the OSDMapRef by value.
1003   bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1004
1005   struct PGLogEntryHandler : public PGLog::LogEntryHandler {
         // PGLogEntryHandler: implements PGLog::LogEntryHandler by forwarding
         // every callback to pg->get_pgbackend(), passing along the
         // transaction pointer supplied at construction.  Neither pointer is
         // owned or null-checked by this struct.
1006     PG *pg;
1007     ObjectStore::Transaction *t;
1008     PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
1009
1010     // LogEntryHandler
1011     void remove(const hobject_t &hoid) override {
1012       pg->get_pgbackend()->remove(hoid, t);
1013     }
1014     void try_stash(const hobject_t &hoid, version_t v) override {
1015       pg->get_pgbackend()->try_stash(hoid, v, t);
1016     }
1017     void rollback(const pg_log_entry_t &entry) override {
           // only entries that report can_rollback() may be passed here
1018       ceph_assert(entry.can_rollback());
1019       pg->get_pgbackend()->rollback(entry, t);
1020     }
1021     void rollforward(const pg_log_entry_t &entry) override {
1022       pg->get_pgbackend()->rollforward(entry, t);
1023     }
1024     void trim(const pg_log_entry_t &entry) override {
1025       pg->get_pgbackend()->trim(entry, t);
1026     }
1027   };
1028
1029   void update_object_snap_mapping(
1030     ObjectStore::Transaction *t, const hobject_t &soid,
1031     const set<snapid_t> &snaps);
       // The three snap-mapping helpers are defined out of line.  Note the
       // mixed transaction passing: the first two take a pointer, the last
       // one a reference.
1032   void clear_object_snap_mapping(
1033     ObjectStore::Transaction *t, const hobject_t &soid);
1034   void remove_snap_mapped_object(
1035     ObjectStore::Transaction& t, const hobject_t& soid);
1036
1037   bool have_unfound() const {
         // forwarded to the PeeringState member
1038     return recovery_state.have_unfound();
1039   }
1040   uint64_t get_num_unfound() const {
         // forwarded to the PeeringState member
1041     return recovery_state.get_num_unfound();
1042   }
1043
       // pure virtual: must be provided by the concrete PG implementation
1044   virtual void check_local() = 0;
1045
1046   void purge_strays();
1047
       // overrides a base-class virtual; the peer set is taken by value
1048   void update_heartbeat_peers(set<int> peers) override;
1049
1050   Context *finish_sync_event;
       // NOTE(review): raw pointer with no in-class initializer; presumably
       // tracks the context handed out by finish_recovery() -- confirm.
1051
1052   Context *finish_recovery();
1053   void _finish_recovery(Context *c);
       // Completion context that holds a PGRef to the PG and, when finished,
       // calls pg->_finish_recovery() with itself as the argument.  The
       // result code r is not used.
1054   struct C_PG_FinishRecovery : public Context {
1055     PGRef pg;
1056     explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
1057     void finish(int r) override {
1058       pg->_finish_recovery(this);
1059     }
1060   };
1061   void cancel_recovery();
1062   void clear_recovery_state();
       // pure virtual hook; implemented by the concrete PG
1063   virtual void _clear_recovery_state() = 0;
1064   void start_recovery_op(const hobject_t& soid);
1065   void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1066
1067   virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
       ///< pure virtual split hook, implemented by the concrete PG
1068
       // these classes are granted access to PG's non-public members
1069   friend class C_OSD_RepModify_Commit;
1070   friend class C_DeleteMore;
1071
1072   // -- backoff --
1073   ceph::mutex backoff_lock = // orders inside Backoff::lock
1074     ceph::make_mutex("PG::backoff_lock");
       // Backoff references grouped per hobject_t key.  NOTE(review): the key
       // is presumably the start of the backoff range -- confirm in
       // add_backoff().
1075   map<hobject_t,set<ceph::ref_t<Backoff>>> backoffs;
1076
1077   void add_backoff(const ceph::ref_t<Session>& s, const hobject_t& begin, const hobject_t& end);
1078   void release_backoffs(const hobject_t& begin, const hobject_t& end);
       // single-object convenience overload: uses o as both begin and end
1079   void release_backoffs(const hobject_t& o) {
1080     release_backoffs(o, o);
1081   }
1082   void clear_backoffs();
1083
1084   void add_pg_backoff(const ceph::ref_t<Session>& s) {
         // Add a backoff for session s spanning this PG's object range:
         // from pgid.get_hobj_start() to pgid.get_hobj_end() for the pool's
         // current pg_num.
1085     const auto& raw_pgid = info.pgid.pgid;
1086     const hobject_t range_first = raw_pgid.get_hobj_start();
         const hobject_t range_last =
           raw_pgid.get_hobj_end(pool.info.get_pg_num());
1087     add_backoff(s, range_first, range_last);
1088   }
1089 public:
1090   void release_pg_backoffs() {
         // Release the backoffs spanning this PG's object range: from
         // pgid.get_hobj_start() to pgid.get_hobj_end() for the pool's
         // current pg_num (the same range add_pg_backoff() uses).
1091     const auto& raw_pgid = info.pgid.pgid;
1092     const hobject_t range_first = raw_pgid.get_hobj_start();
         const hobject_t range_last =
           raw_pgid.get_hobj_end(pool.info.get_pg_num());
1093     release_backoffs(range_first, range_last);
1094   }
1095 protected:
1096
1097   // -- scrub --
1098 public:
1099   struct Scrubber {
         // Scrubber: per-PG scrub bookkeeping -- replica reservations, the
         // current chunk range, the scrub maps being built/collected, error
         // tallies and the chunky-scrub state.  The constructor and
         // destructor are defined out of line; members without an in-class
         // initializer are expected to be initialized there (not visible in
         // this header).
1100     Scrubber();
1101     ~Scrubber();
1102
1103     // metadata
1104     set<pg_shard_t> reserved_peers;
1105     bool local_reserved, remote_reserved, reserve_failed;
1106     epoch_t epoch_start;
1107
1108     // common to both scrubs
1109     bool active;
1110     set<pg_shard_t> waiting_on_whom;
1111     int shallow_errors;
1112     int deep_errors;
1113     int fixed;
1114     ScrubMap primary_scrubmap;
1115     ScrubMapBuilder primary_scrubmap_pos;
1116     epoch_t replica_scrub_start = 0;
1117     ScrubMap replica_scrubmap;
1118     ScrubMapBuilder replica_scrubmap_pos;
1119     map<pg_shard_t, ScrubMap> received_maps;
1120     OpRequestRef active_rep_scrub;
1121     utime_t scrub_reg_stamp;  // stamp we registered for
1122
         // Fixed sentinel timestamp, utime_t(0,1).
1123     static utime_t scrub_must_stamp() { return utime_t(0,1); }
1124
         // NOTE(review): the compound-literal initializer is a compiler
         // extension in C++ (accepted by GCC/Clang); kept as is.
1125     omap_stat_t omap_stats  = (const struct omap_stat_t){ 0 };
1126
1127     // For async sleep
1128     bool sleeping = false;
1129     bool needs_sleep = true;
1130     utime_t sleep_start;
1131
1132     // flags to indicate explicitly requested scrubs (by admin)
1133     bool must_scrub, must_deep_scrub, must_repair, need_auto;
1134
1135     // Priority to use for scrub scheduling
1136     unsigned priority = 0;
1137
1138     bool time_for_deep;
1139     // this flag indicates whether we would like to do auto-repair of the PG or not
1140     bool auto_repair;
1141     // this flag indicates that we are scrubbing post repair to verify everything is fixed
1142     bool check_repair;
1143     // this flag indicates that if a regular scrub detects errors <= osd_scrub_auto_repair_num_errors,
1144     // we should deep scrub in order to auto repair
1145     bool deep_scrub_on_error;
1146
1147     // Maps from objects with errors to missing/inconsistent peers
1148     map<hobject_t, set<pg_shard_t>> missing;
1149     map<hobject_t, set<pg_shard_t>> inconsistent;
1150
1151     // Map from object with errors to good peers
1152     map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1153
1154     // Cleaned map pending snap metadata scrub
1155     ScrubMap cleaned_meta_map;
1156
         // Hand the part of cleaned_meta_map that is ready for the snap
         // metadata scrub over to for_meta_scrub.
         //  - If the chunk end is max, or cleaned_meta_map holds no objects,
         //    the two maps are swapped wholesale.
         //  - Otherwise a split point is searched from the back: if the last
         //    object has_snapset(), everything is handed over; if not, the
         //    trailing run of objects sharing the last object's get_head()
         //    stays behind in cleaned_meta_map and only the objects before it
         //    are inserted into for_meta_scrub.objects (and erased here).
         //    NOTE(review): presumably the held-back run waits for its head
         //    object to arrive in a later chunk -- confirm.
1157     void clean_meta_map(ScrubMap &for_meta_scrub) {
1158       if (end.is_max() ||
1159           cleaned_meta_map.objects.empty()) {
1160         cleaned_meta_map.swap(for_meta_scrub);
1161       } else {
1162         auto iter = cleaned_meta_map.objects.end();
1163         --iter; // not empty, see if clause
1164         auto begin = cleaned_meta_map.objects.begin();
1165         if (iter->first.has_snapset()) {
1166           ++iter;
1167         } else {
1168           while (iter != begin) {
1169             auto next = iter--;
1170             if (next->first.get_head() != iter->first.get_head()) {
1171               ++iter;
1172               break;
1173             }
1174           }
1175         }
1176         for_meta_scrub.objects.insert(begin, iter);
1177         cleaned_meta_map.objects.erase(begin, iter);
1178       }
1179     }
1180
1181     // digest updates which we are waiting on
1182     int num_digest_updates_pending;
1183
1184     // chunky scrub
1185     hobject_t start, end;    // [start,end)
1186     hobject_t max_end;       // Largest end that may have been sent to replicas
1187     eversion_t subset_last_update;
1188
1189     // chunky scrub state
1190     enum State {
1191       INACTIVE,
1192       NEW_CHUNK,
1193       WAIT_PUSHES,
1194       WAIT_LAST_UPDATE,
1195       BUILD_MAP,
1196       BUILD_MAP_DONE,
1197       WAIT_REPLICAS,
1198       COMPARE_MAPS,
1199       WAIT_DIGEST_UPDATES,
1200       FINISH,
1201       BUILD_MAP_REPLICA,
1202     } state;
1203
1204     std::unique_ptr<Scrub::Store> store;
1205     // deep scrub
1206     bool deep;
1207     int preempt_left;
1208     int preempt_divisor;
1209
         // Contexts to complete on the next run_callbacks() (reset() calls it).
1210     list<Context*> callbacks;
1211     void add_callback(Context *context) {
1212       callbacks.push_back(context);
1213     }
         // Complete every queued callback with result 0.  The member list is
         // swapped into a local first, so a callback that calls
         // add_callback() appends to the (now empty) member list rather than
         // to the list being iterated.
1214     void run_callbacks() {
1215       list<Context*> to_run;
1216       to_run.swap(callbacks);
1217       for (Context *cb : to_run) {
1220         cb->complete(0);
1221       }
1222     }
1223
         // Name of a State enumerator as a string literal; returns nullptr
         // for a value that matches no enumerator.
1224     static const char *state_string(const PG::Scrubber::State& state) {
1225       const char *ret = nullptr;
1226       switch( state )
1227       {
1228         case INACTIVE: ret = "INACTIVE"; break;
1229         case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1230         case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1231         case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1232         case BUILD_MAP: ret = "BUILD_MAP"; break;
1233         case BUILD_MAP_DONE: ret = "BUILD_MAP_DONE"; break;
1234         case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1235         case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1236         case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1237         case FINISH: ret = "FINISH"; break;
1238         case BUILD_MAP_REPLICA: ret = "BUILD_MAP_REPLICA"; break;
1239       }
1240       return ret;
1241     }
1242
1243     bool is_chunky_scrub_active() const { return state != INACTIVE; }
1244
1245     // clear all state
         // Not touched here: reserved_peers and the *_reserved flags,
         // epoch_start, replica_scrub_start, scrub_reg_stamp, priority,
         // store, preempt_left and preempt_divisor.
1246     void reset() {
1247       active = false;
1248       waiting_on_whom.clear();
1249       if (active_rep_scrub) {
1250         active_rep_scrub = OpRequestRef();
1251       }
1252       received_maps.clear();
1253
1254       must_scrub = false;
1255       must_deep_scrub = false;
1256       must_repair = false;
1257       need_auto = false;
1258       time_for_deep = false;
1259       auto_repair = false;
1260       check_repair = false;
1261       deep_scrub_on_error = false;
1262
1263       state = PG::Scrubber::INACTIVE;
1264       start = hobject_t();
1265       end = hobject_t();
1266       max_end = hobject_t();
1267       subset_last_update = eversion_t();
1268       shallow_errors = 0;
1269       deep_errors = 0;
1270       fixed = 0;
1271       omap_stats = (const struct omap_stat_t){ 0 };
1272       deep = false;
           // Callbacks run mid-reset: flags, range and error counters are
           // already cleared, the error maps and scrub maps below are not.
1273       run_callbacks();
1274       inconsistent.clear();
1275       missing.clear();
1276       authoritative.clear();
1277       num_digest_updates_pending = 0;
1278       primary_scrubmap = ScrubMap();
1279       primary_scrubmap_pos.reset();
1280       replica_scrubmap = ScrubMap();
1281       replica_scrubmap_pos.reset();
1282       cleaned_meta_map = ScrubMap();
1283       sleeping = false;
1284       needs_sleep = true;
1285       sleep_start = utime_t();
1286     }
1287
         // Both defined out of line.
1288     void create_results(const hobject_t& obj);
1289     void cleanup_store(ObjectStore::Transaction *t);
1290   } scrubber;
1291
1292 protected:
1293   bool scrub_after_recovery;
       // NOTE(review): scrub_after_recovery and active_pushes have no in-class
       // initializer; they must be set by the PG constructor (not visible).
1294
1295   int active_pushes;
1296
1297   bool scrub_can_preempt = false;
1298   bool scrub_preempted = false;
1299
1300   // we allow some number of preemptions of the scrub, which means we do
1301   // not block.  then we start to block.  once we start blocking, we do
1302   // not stop until the scrub range is completed.
1303   bool write_blocked_by_scrub(const hobject_t &soid);
1304
1305   /// true if the given range intersects the scrub interval in any way
1306   bool range_intersects_scrub(const hobject_t &start, const hobject_t& end);
1307
       // Defined out of line.  The ok_peers / bad_peers parameter types match
       // the mapped types of Scrubber::authoritative and of
       // Scrubber::missing / Scrubber::inconsistent respectively.
1308   void repair_object(
1309     const hobject_t &soid,
1310     const list<pair<ScrubMap::object, pg_shard_t> > &ok_peers,
1311     const set<pg_shard_t> &bad_peers);
1312
1313   void chunky_scrub(ThreadPool::TPHandle &handle);
       // The scrub helpers in this group are declared here and defined out
       // of line unless an inline body is shown.
1314   void scrub_compare_maps();
1315   /**
1316    * return true if any inconsistency/missing is repaired, false otherwise
1317    */
1318   bool scrub_process_inconsistent();
1319   bool ops_blocked_by_scrub() const;
1320   void scrub_finish();
1321   void scrub_clear_state(bool keep_repair = false);
1322   void _scan_snaps(ScrubMap &map);
1323   void _repair_oinfo_oid(ScrubMap &map);
1324   void _scan_rollback_obs(const vector<ghobject_t> &rollback_obs);
1325   void _request_scrub_map(pg_shard_t replica, eversion_t version,
1326                           hobject_t start, hobject_t end, bool deep,
1327                           bool allow_preemption);
       // NOTE(review): the meaning of the int return value is not visible
       // here -- check the definition before relying on it.
1328   int build_scrub_map_chunk(
1329     ScrubMap &map,
1330     ScrubMapBuilder &pos,
1331     hobject_t start, hobject_t end, bool deep,
1332     ThreadPool::TPHandle &handle);
1333   /**
1334    * returns true if [begin, end) is good to scrub at this time
1335    * a false return value obliges the implementer to requeue scrub when the
1336    * condition preventing scrub clears
1337    */
1338   virtual bool _range_available_for_scrub(
1339     const hobject_t &begin, const hobject_t &end) = 0;
       // Optional hooks for the concrete PG: the three virtuals below have
       // empty default bodies, so overriding them is not required.
1340   virtual void scrub_snapshot_metadata(
1341     ScrubMap &map,
1342     const std::map<hobject_t,
1343                    pair<std::optional<uint32_t>,
1344                         std::optional<uint32_t>>> &missing_digest) { }
1345   virtual void _scrub_clear_state() { }
1346   virtual void _scrub_finish() { }
1347   void clear_scrub_reserved();
1348   void scrub_reserve_replicas();
1349   void scrub_unreserve_replicas();
1350   bool scrub_all_replicas_reserved() const;
1351
1352   void replica_scrub(
1353     OpRequestRef op,
1354     ThreadPool::TPHandle &handle);
1355   void do_replica_scrub_map(OpRequestRef op);
1356
       // handlers for the scrub reservation messages; grant and reject also
       // receive the sending shard
1357   void handle_scrub_reserve_request(OpRequestRef op);
1358   void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1359   void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1360   void handle_scrub_reserve_release(OpRequestRef op);
1361
1362   // -- recovery state --
1363
1364   struct QueuePeeringEvt : Context {
         // QueuePeeringEvt: completion context that queues a peering event on
         // a PG.  It holds a PGRef to the target PG and the event to deliver.
1365     PGRef pg;
1366     PGPeeringEventRef evt;
1367
         // Wrap an arbitrary event payload in a new PGPeeringEvent; the same
         // epoch is passed for both epoch arguments.
1368     template <class EVT>
1369     QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1370       pg(pg), evt(std::make_shared<PGPeeringEvent>(epoch, epoch, evt)) {}
1371
         // Take an already-built event reference.
1372     QueuePeeringEvt(PG *pg, PGPeeringEventRef evt) :
1373       pg(pg), evt(std::move(evt)) {}
1374
         // Queue the event while holding the PG lock.  The result code r is
         // not used.  evt is moved from, so it is left empty afterwards.
1375     void finish(int r) override {
1376       pg->lock();
1377       pg->queue_peering_event(std::move(evt));
1378       pg->unlock();
1379     }
1380   };
1381
1382
1383 public:
1384   int pg_stat_adjust(osd_stat_t *new_stat);
       ///< defined out of line; NOTE(review): the meaning of the int result is
       ///< not visible here
1385 protected:
1386   bool delete_needs_sleep = false;
1387
1388 protected:
1389   bool state_test(uint64_t m) const { return recovery_state.state_test(m); }
       // The state accessors and predicates in this group are thin forwarders
       // to the PeeringState member (recovery_state).  is_scrubbing(),
       // is_laggy() and is_wait() instead test a PG_STATE_* value through
       // state_test().
1390   void state_set(uint64_t m) { recovery_state.state_set(m); }
1391   void state_clear(uint64_t m) { recovery_state.state_clear(m); }
1392
1393   bool is_complete() const {
1394     return recovery_state.is_complete();
1395   }
1396   bool should_send_notify() const {
1397     return recovery_state.should_send_notify();
1398   }
1399
1400   bool is_active() const { return recovery_state.is_active(); }
1401   bool is_activating() const { return recovery_state.is_activating(); }
1402   bool is_peering() const { return recovery_state.is_peering(); }
1403   bool is_down() const { return recovery_state.is_down(); }
1404   bool is_recovery_unfound() const { return recovery_state.is_recovery_unfound(); }
1405   bool is_backfill_unfound() const { return recovery_state.is_backfill_unfound(); }
1406   bool is_incomplete() const { return recovery_state.is_incomplete(); }
1407   bool is_clean() const { return recovery_state.is_clean(); }
1408   bool is_degraded() const { return recovery_state.is_degraded(); }
1409   bool is_undersized() const { return recovery_state.is_undersized(); }
1410   bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
1411   bool is_remapped() const { return recovery_state.is_remapped(); }
1412   bool is_peered() const { return recovery_state.is_peered(); }
1413   bool is_recovering() const { return recovery_state.is_recovering(); }
1414   bool is_premerge() const { return recovery_state.is_premerge(); }
1415   bool is_repair() const { return recovery_state.is_repair(); }
1416   bool is_laggy() const { return state_test(PG_STATE_LAGGY); }
1417   bool is_wait() const { return state_test(PG_STATE_WAIT); }
1418
1419   bool is_empty() const { return recovery_state.is_empty(); }
1420
1421   // pg on-disk state
1422   void do_pending_flush();
1423
1424 public:
       // Overrides a base-class virtual; defined out of line.  Note that the
       // `info` parameter shadows the PG::info member inside the definition.
1425   virtual void prepare_write(
1426     pg_info_t &info,
1427     pg_info_t &last_written_info,
1428     PastIntervals &past_intervals,
1429     PGLog &pglog,
1430     bool dirty_info,
1431     bool dirty_big_info,
1432     bool need_write_epoch,
1433     ObjectStore::Transaction &t) override;
1434
       // Public convenience overload: forwards rctx.transaction to the
       // protected overload below.
1435   void write_if_dirty(PeeringCtx &rctx) {
1436     write_if_dirty(rctx.transaction);
1437   }
1438 protected:
       // Forwards to recovery_state.write_if_dirty().
1439   void write_if_dirty(ObjectStore::Transaction& t) {
1440     recovery_state.write_if_dirty(t);
1441   }
1442
1443   PGLog::IndexedLog projected_log;
       // Defined out of line.  Results are reported through the four pointer
       // out-parameters.  NOTE(review): whether null pointers are accepted is
       // not visible here -- check the definition.
1444   bool check_in_progress_op(
1445     const osd_reqid_t &r,
1446     eversion_t *version,
1447     version_t *user_version,
1448     int *return_code,
1449     vector<pg_log_op_return_item_t> *op_returns) const;
       // get_next_version() derives the next version number from this value.
1450   eversion_t projected_last_update;
1451   eversion_t get_next_version() const {
         // Candidate version: current osdmap epoch paired with the version
         // number following projected_last_update.
1452     eversion_t next(get_osdmap_epoch(), projected_last_update.version + 1);
         // It must be strictly newer than the recorded last update, the head
         // of the pg log, and the projected last update.
1455     ceph_assert(next > info.last_update);
1456     ceph_assert(next > recovery_state.get_pg_log().get_head());
1457     ceph_assert(next > projected_last_update);
1458     return next;
1459   }
1460
1461   bool check_log_for_corruption(ObjectStore *store);
       // The members in this group are defined out of line unless an inline
       // body is shown.
1462
1463   std::string get_corrupt_pg_log_name() const;
1464
1465   void update_snap_map(
1466     const vector<pg_log_entry_t> &log_entries,
1467     ObjectStore::Transaction& t);
1468
       // takes the snap vector by non-const reference
1469   void filter_snapc(vector<snapid_t> &snaps);
1470
       // pure virtual hooks implemented by the concrete PG
1471   virtual void kick_snap_trim() = 0;
1472   virtual void snap_trimmer_scrub_complete() = 0;
1473   bool requeue_scrub(bool high_priority = false);
1474   void queue_recovery();
1475   bool queue_scrub();
1476   unsigned get_scrub_priority();
1477
       // overrides of base-class virtuals
1478   bool try_flush_or_schedule_async() override;
1479   void start_flush_on_transaction(
1480     ObjectStore::Transaction &t) override;
1481
       // Forwards to recovery_state.update_history().
1482   void update_history(const pg_history_t& history) {
1483     recovery_state.update_history(history);
1484   }
1485
1486   // OpRequest queueing
       // Discard predicates; all defined out of line.  Note the mixed
       // passing convention: some take the OpRequestRef by reference, some by
       // value.
1487   bool can_discard_op(OpRequestRef& op);
1488   bool can_discard_scan(OpRequestRef op);
1489   bool can_discard_backfill(OpRequestRef op);
1490   bool can_discard_request(OpRequestRef& op);
1491
       // Templated on the message class T and its message type constant.
1492   template<typename T, int MSGTYPE>
1493   bool can_discard_replica_op(OpRequestRef& op);
1494
1495   bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
       // Event flavour of old_peering_msg(): passes the event's sent epoch
       // as reply_epoch and its requested epoch as query_epoch.
1496   bool old_peering_evt(PGPeeringEventRef evt) {
1497     return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
1498   }
1499   bool have_same_or_newer_map(epoch_t e) const {
         // True when epoch e is not newer than the epoch returned by
         // get_osdmap_epoch().  Read-only, hence const: get_osdmap_epoch()
         // is already callable from const members (see get_next_version()).
1500     return e <= get_osdmap_epoch();
1501   }
1502
1503   bool op_has_sufficient_caps(OpRequestRef& op);
1504
1505   // abstract bits
1506   friend class FlushState;
1507
       // stream output for a PG; declared as a friend so it can read
       // non-public members
1508   friend ostream& operator<<(ostream& out, const PG& pg);
1509
1510 protected:
       // The peering state object that most inline accessors of this class
       // forward to.  It is declared before pool and info, so it is
       // constructed before those two references are bound.
1511   PeeringState recovery_state;
1512
1513   // ref to recovery_state.pool
1514   const PGPool &pool;
1515
1516   // ref to recovery_state.info
1517   const pg_info_t &info;
1518 };
1519
1520
1521 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);
     // Stream output for PG::BackfillInterval; declared here, defined out of
     // line.
1522
1523 #endif