1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 /*
3 * Ceph - scalable distributed file system
4 *
5 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
6 * Copyright (C) 2013 Cloudwatt <libre.licensing@cloudwatt.com>
7 *
8 * Author: Loic Dachary <loic@dachary.org>
9 *
10 * This is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License version 2.1, as published by the Free Software
13 * Foundation. See file COPYING.
14 *
15 */
16
17 #ifndef CEPH_REPLICATEDPG_H
18 #define CEPH_REPLICATEDPG_H
19
20 #include <boost/tuple/tuple.hpp>
21 #include "include/ceph_assert.h"
22 #include "DynamicPerfStats.h"
23 #include "OSD.h"
24 #include "PG.h"
25 #include "Watch.h"
26 #include "TierAgentState.h"
27 #include "messages/MOSDOpReply.h"
28 #include "common/Checksummer.h"
29 #include "common/sharedptr_registry.hpp"
30 #include "common/shared_cache.hpp"
31 #include "ReplicatedBackend.h"
32 #include "PGTransaction.h"
33 #include "cls/cas/cls_cas_ops.h"
34
35 class CopyFromCallback;
36 class PromoteCallback;
37
38 class PrimaryLogPG;
39 class PGLSFilter;
40 class HitSet;
41 struct TierAgentState;
42 class MOSDOp;
43 class MOSDOpReply;
44 class OSDService;
45
46 void intrusive_ptr_add_ref(PrimaryLogPG *pg);
47 void intrusive_ptr_release(PrimaryLogPG *pg);
48 uint64_t get_with_id(PrimaryLogPG *pg);
49 void put_with_id(PrimaryLogPG *pg, uint64_t id);
50
51 #ifdef PG_DEBUG_REFS
52 typedef TrackedIntPtr<PrimaryLogPG> PrimaryLogPGRef;
53 #else
54 typedef boost::intrusive_ptr<PrimaryLogPG> PrimaryLogPGRef;
55 #endif
56
57 struct inconsistent_snapset_wrapper;
58
59 class PrimaryLogPG : public PG, public PGBackend::Listener {
60 friend class OSD;
61 friend class Watch;
62
63 public:
64 MEMPOOL_CLASS_HELPERS();
65
66 /*
67 * state associated with a copy operation
68 */
69 struct OpContext;
70 class CopyCallback;
71
72 /**
73 * CopyResults stores the object metadata of interest to a copy initiator.
74 */
75 struct CopyResults {
76 ceph::real_time mtime; ///< the copy source's mtime
77 uint64_t object_size; ///< the copied object's size
78 bool started_temp_obj; ///< true if the callback needs to delete temp object
79 hobject_t temp_oid; ///< temp object (if any)
80
81 /**
82 * Function to fill in transaction; if non-empty the callback
83 * must execute it before any other accesses to the object
84 * (in order to complete the copy).
85 */
86 std::function<void(PGTransaction *)> fill_in_final_tx;
87
88 version_t user_version; ///< The copy source's user version
89 bool should_requeue; ///< op should be requeued on cancel
90 vector<snapid_t> snaps; ///< src's snaps (if clone)
91 snapid_t snap_seq; ///< src's snap_seq (if head)
92 librados::snap_set_t snapset; ///< src snapset (if head)
93 bool mirror_snapset;
94 bool has_omap;
95 uint32_t flags; // object_copy_data_t::FLAG_*
96 uint32_t source_data_digest, source_omap_digest;
97 uint32_t data_digest, omap_digest;
98 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > reqids; // [(reqid, user_version)]
99 mempool::osd_pglog::map<uint32_t, int> reqid_return_codes; // map reqids by index to error code
100 map<string, bufferlist> attrs; // xattrs
101 uint64_t truncate_seq;
102 uint64_t truncate_size;
103 bool is_data_digest() {
104 return flags & object_copy_data_t::FLAG_DATA_DIGEST;
105 }
106 bool is_omap_digest() {
107 return flags & object_copy_data_t::FLAG_OMAP_DIGEST;
108 }
109 CopyResults()
110 : object_size(0), started_temp_obj(false),
111 user_version(0),
112 should_requeue(false), mirror_snapset(false),
113 has_omap(false),
114 flags(0),
115 source_data_digest(-1), source_omap_digest(-1),
116 data_digest(-1), omap_digest(-1),
117 truncate_seq(0), truncate_size(0)
118 {}
119 };
120
121 struct CopyOp;
122 typedef std::shared_ptr<CopyOp> CopyOpRef;
123
124 struct CopyOp {
125 CopyCallback *cb;
126 ObjectContextRef obc;
127 hobject_t src;
128 object_locator_t oloc;
129 unsigned flags;
130 bool mirror_snapset;
131
132 CopyResults results;
133
134 ceph_tid_t objecter_tid;
135 ceph_tid_t objecter_tid2;
136
137 object_copy_cursor_t cursor;
138 map<string,bufferlist> attrs;
139 bufferlist data;
140 bufferlist omap_header;
141 bufferlist omap_data;
142 int rval;
143
144 object_copy_cursor_t temp_cursor;
145
   146     /*
   147      * For a CopyOp the process is:
   148      * step 1: read the data (attrs, omap, data payload) from the source object
   149      * step 2: write that data out, creating the new object
   150      * src_obj_fadvise_flags is used in step 1;
   151      * dest_obj_fadvise_flags is used in step 2.
   152      */
153 unsigned src_obj_fadvise_flags;
154 unsigned dest_obj_fadvise_flags;
155
156 map<uint64_t, CopyOpRef> chunk_cops;
157 int num_chunk;
158 bool failed;
159 uint64_t start_offset = 0;
160 uint64_t last_offset = 0;
161 vector<OSDOp> chunk_ops;
162
163 CopyOp(CopyCallback *cb_, ObjectContextRef _obc, hobject_t s,
164 object_locator_t l,
165 version_t v,
166 unsigned f,
167 bool ms,
168 unsigned src_obj_fadvise_flags,
169 unsigned dest_obj_fadvise_flags)
170 : cb(cb_), obc(_obc), src(s), oloc(l), flags(f),
171 mirror_snapset(ms),
172 objecter_tid(0),
173 objecter_tid2(0),
174 rval(-1),
175 src_obj_fadvise_flags(src_obj_fadvise_flags),
176 dest_obj_fadvise_flags(dest_obj_fadvise_flags),
177 num_chunk(0),
178 failed(false)
179 {
180 results.user_version = v;
181 results.mirror_snapset = mirror_snapset;
182 }
183 };
184
185 /**
186 * The CopyCallback class defines an interface for completions to the
187 * copy_start code. Users of the copy infrastructure must implement
188 * one and give an instance of the class to start_copy.
189 *
190 * The implementer is responsible for making sure that the CopyCallback
191 * can associate itself with the correct copy operation.
192 */
193 typedef boost::tuple<int, CopyResults*> CopyCallbackResults;
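  // Illustrative sketch only (the concrete CopyCallback base class is declared
  // elsewhere, so the completion shown below is hypothetical): a copy initiator
  // unpacks CopyCallbackResults and honours fill_in_final_tx before touching the
  // destination object.
  //
  //   void handle_copy_done(CopyCallbackResults results) {
  //     int r = results.get<0>();                 // copy return code
  //     CopyResults *cr = results.get<1>();       // metadata described above
  //     if (r == 0 && cr->fill_in_final_tx) {
  //       // must be applied to a PGTransaction before any other access
  //     }
  //   }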
194
195 friend class CopyFromCallback;
196 friend class CopyFromFinisher;
197 friend class PromoteCallback;
198 friend class PromoteFinisher;
199
200 struct ProxyReadOp {
201 OpRequestRef op;
202 hobject_t soid;
203 ceph_tid_t objecter_tid;
204 vector<OSDOp> &ops;
205 version_t user_version;
206 int data_offset;
207 bool canceled; ///< true if canceled
208
209 ProxyReadOp(OpRequestRef _op, hobject_t oid, vector<OSDOp>& _ops)
210 : op(_op), soid(oid),
211 objecter_tid(0), ops(_ops),
212 user_version(0), data_offset(0),
213 canceled(false) { }
214 };
215 typedef std::shared_ptr<ProxyReadOp> ProxyReadOpRef;
216
217 struct ProxyWriteOp {
218 OpContext *ctx;
219 OpRequestRef op;
220 hobject_t soid;
221 ceph_tid_t objecter_tid;
222 vector<OSDOp> &ops;
223 version_t user_version;
224 bool sent_reply;
225 utime_t mtime;
226 bool canceled;
227 osd_reqid_t reqid;
228
229 ProxyWriteOp(OpRequestRef _op, hobject_t oid, vector<OSDOp>& _ops, osd_reqid_t _reqid)
230 : ctx(NULL), op(_op), soid(oid),
231 objecter_tid(0), ops(_ops),
232 user_version(0), sent_reply(false),
233 canceled(false),
234 reqid(_reqid) { }
235 };
236 typedef std::shared_ptr<ProxyWriteOp> ProxyWriteOpRef;
237
238 struct FlushOp {
239 ObjectContextRef obc; ///< obc we are flushing
240 OpRequestRef op; ///< initiating op
241 list<OpRequestRef> dup_ops; ///< bandwagon jumpers
242 version_t flushed_version; ///< user version we are flushing
243 ceph_tid_t objecter_tid; ///< copy-from request tid
244 int rval; ///< copy-from result
245 bool blocking; ///< whether we are blocking updates
246 bool removal; ///< we are removing the backend object
247 boost::optional<std::function<void()>> on_flush; ///< callback, may be null
248 // for chunked object
249 map<uint64_t, int> io_results;
250 map<uint64_t, ceph_tid_t> io_tids;
251 uint64_t chunks;
252
253 FlushOp()
254 : flushed_version(0), objecter_tid(0), rval(0),
255 blocking(false), removal(false), chunks(0) {}
256 ~FlushOp() { ceph_assert(!on_flush); }
257 };
258 typedef std::shared_ptr<FlushOp> FlushOpRef;
259
260 boost::scoped_ptr<PGBackend> pgbackend;
261 PGBackend *get_pgbackend() override {
262 return pgbackend.get();
263 }
264
265 const PGBackend *get_pgbackend() const override {
266 return pgbackend.get();
267 }
268
269 /// Listener methods
270 DoutPrefixProvider *get_dpp() override {
271 return this;
272 }
273
274 void on_local_recover(
275 const hobject_t &oid,
276 const ObjectRecoveryInfo &recovery_info,
277 ObjectContextRef obc,
278 bool is_delete,
279 ObjectStore::Transaction *t
280 ) override;
281 void on_peer_recover(
282 pg_shard_t peer,
283 const hobject_t &oid,
284 const ObjectRecoveryInfo &recovery_info
285 ) override;
286 void begin_peer_recover(
287 pg_shard_t peer,
288 const hobject_t oid) override;
289 void on_global_recover(
290 const hobject_t &oid,
291 const object_stat_sum_t &stat_diff,
292 bool is_delete) override;
293 void failed_push(const list<pg_shard_t> &from,
294 const hobject_t &soid,
295 const eversion_t &need = eversion_t()) override;
296 void primary_failed(const hobject_t &soid) override;
297 bool primary_error(const hobject_t& soid, eversion_t v) override;
298 void cancel_pull(const hobject_t &soid) override;
299 void apply_stats(
300 const hobject_t &soid,
301 const object_stat_sum_t &delta_stats) override;
302 void on_primary_error(const hobject_t &oid, eversion_t v) override;
303 void backfill_add_missing(const hobject_t &oid, eversion_t v) override;
304 void remove_missing_object(const hobject_t &oid,
305 eversion_t v,
306 Context *on_complete) override;
307
308 template<class T> class BlessedGenContext;
309 template<class T> class UnlockedBlessedGenContext;
310 class BlessedContext;
311 Context *bless_context(Context *c) override;
312
313 GenContext<ThreadPool::TPHandle&> *bless_gencontext(
314 GenContext<ThreadPool::TPHandle&> *c) override;
315 GenContext<ThreadPool::TPHandle&> *bless_unlocked_gencontext(
316 GenContext<ThreadPool::TPHandle&> *c) override;
317
318 void send_message(int to_osd, Message *m) override {
319 osd->send_message_osd_cluster(to_osd, m, get_osdmap_epoch());
320 }
321 void queue_transaction(ObjectStore::Transaction&& t,
322 OpRequestRef op) override {
323 osd->store->queue_transaction(ch, std::move(t), op);
324 }
325 void queue_transactions(vector<ObjectStore::Transaction>& tls,
326 OpRequestRef op) override {
327 osd->store->queue_transactions(ch, tls, op, NULL);
328 }
329 epoch_t get_interval_start_epoch() const override {
330 return info.history.same_interval_since;
331 }
332 epoch_t get_last_peering_reset_epoch() const override {
333 return get_last_peering_reset();
334 }
335 const set<pg_shard_t> &get_acting_recovery_backfill_shards() const override {
336 return acting_recovery_backfill;
337 }
338 const set<pg_shard_t> &get_acting_shards() const override {
339 return actingset;
340 }
341 const set<pg_shard_t> &get_backfill_shards() const override {
342 return backfill_targets;
343 }
344
345 std::ostream& gen_dbg_prefix(std::ostream& out) const override {
346 return gen_prefix(out);
347 }
348
349 const map<hobject_t, set<pg_shard_t>>
350 &get_missing_loc_shards() const override {
351 return missing_loc.get_missing_locs();
352 }
353 const map<pg_shard_t, pg_missing_t> &get_shard_missing() const override {
354 return peer_missing;
355 }
356 using PGBackend::Listener::get_shard_missing;
357 const map<pg_shard_t, pg_info_t> &get_shard_info() const override {
358 return peer_info;
359 }
360 using PGBackend::Listener::get_shard_info;
361 const pg_missing_tracker_t &get_local_missing() const override {
362 return pg_log.get_missing();
363 }
364 const PGLog &get_log() const override {
365 return pg_log;
366 }
367 void add_local_next_event(const pg_log_entry_t& e) override {
368 pg_log.missing_add_next_entry(e);
369 }
370 bool pgb_is_primary() const override {
371 return is_primary();
372 }
373 const OSDMapRef& pgb_get_osdmap() const override final {
374 return get_osdmap();
375 }
376 epoch_t pgb_get_osdmap_epoch() const override final {
377 return get_osdmap_epoch();
378 }
379 const pg_info_t &get_info() const override {
380 return info;
381 }
382 const pg_pool_t &get_pool() const override {
383 return pool.info;
384 }
385
386 ObjectContextRef get_obc(
387 const hobject_t &hoid,
388 const map<string, bufferlist> &attrs) override {
389 return get_object_context(hoid, true, &attrs);
390 }
391
392 bool try_lock_for_read(
393 const hobject_t &hoid,
394 ObcLockManager &manager) override {
395 if (is_missing_object(hoid))
396 return false;
397 auto obc = get_object_context(hoid, false, nullptr);
398 if (!obc)
399 return false;
400 return manager.try_get_read_lock(hoid, obc);
401 }
402
403 void release_locks(ObcLockManager &manager) override {
404 release_object_locks(manager);
405 }
406
407 bool pg_is_repair() override {
408 return is_repair();
409 }
410 void inc_osd_stat_repaired() override {
411 osd->inc_osd_stat_repaired();
412 }
413 bool pg_is_remote_backfilling() override {
414 return is_remote_backfilling();
415 }
416 void pg_add_local_num_bytes(int64_t num_bytes) override {
417 add_local_num_bytes(num_bytes);
418 }
419 void pg_sub_local_num_bytes(int64_t num_bytes) override {
420 sub_local_num_bytes(num_bytes);
421 }
422 void pg_add_num_bytes(int64_t num_bytes) override {
423 add_num_bytes(num_bytes);
424 }
425 void pg_sub_num_bytes(int64_t num_bytes) override {
426 sub_num_bytes(num_bytes);
427 }
428
429 void pgb_set_object_snap_mapping(
430 const hobject_t &soid,
431 const set<snapid_t> &snaps,
432 ObjectStore::Transaction *t) override {
433 return update_object_snap_mapping(t, soid, snaps);
434 }
435 void pgb_clear_object_snap_mapping(
436 const hobject_t &soid,
437 ObjectStore::Transaction *t) override {
438 return clear_object_snap_mapping(t, soid);
439 }
440
441 void log_operation(
442 const vector<pg_log_entry_t> &logv,
443 const boost::optional<pg_hit_set_history_t> &hset_history,
444 const eversion_t &trim_to,
445 const eversion_t &roll_forward_to,
446 bool transaction_applied,
447 ObjectStore::Transaction &t,
448 bool async = false) override {
449 if (hset_history) {
450 info.hit_set = *hset_history;
451 }
452 append_log(logv, trim_to, roll_forward_to, t, transaction_applied, async);
453 }
454
455 void op_applied(const eversion_t &applied_version) override;
456
457 bool should_send_op(
458 pg_shard_t peer,
459 const hobject_t &hoid) override;
460
461 bool pg_is_undersized() const override {
462 return is_undersized();
463 }
464
465 bool pg_is_repair() const override {
466 return is_repair();
467 }
468
469 void update_peer_last_complete_ondisk(
470 pg_shard_t fromosd,
471 eversion_t lcod) override {
472 peer_last_complete_ondisk[fromosd] = lcod;
473 }
474
475 void update_last_complete_ondisk(
476 eversion_t lcod) override {
477 last_complete_ondisk = lcod;
478 }
479
480 void update_stats(
481 const pg_stat_t &stat) override {
482 info.stats = stat;
483 }
484
485 void schedule_recovery_work(
486 GenContext<ThreadPool::TPHandle&> *c) override;
487
488 pg_shard_t whoami_shard() const override {
489 return pg_whoami;
490 }
491 spg_t primary_spg_t() const override {
492 return spg_t(info.pgid.pgid, primary.shard);
493 }
494 pg_shard_t primary_shard() const override {
495 return primary;
496 }
497
498 void send_message_osd_cluster(
499 int peer, Message *m, epoch_t from_epoch) override;
500 void send_message_osd_cluster(
501 Message *m, Connection *con) override;
502 void send_message_osd_cluster(
503 Message *m, const ConnectionRef& con) override;
504 ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) override;
505 entity_name_t get_cluster_msgr_name() override {
506 return osd->get_cluster_msgr_name();
507 }
508
509 PerfCounters *get_logger() override;
510
511 ceph_tid_t get_tid() override { return osd->get_tid(); }
512
513 LogClientTemp clog_error() override { return osd->clog->error(); }
514 LogClientTemp clog_warn() override { return osd->clog->warn(); }
515
516 struct watch_disconnect_t {
517 uint64_t cookie;
518 entity_name_t name;
519 bool send_disconnect;
520 watch_disconnect_t(uint64_t c, entity_name_t n, bool sd)
521 : cookie(c), name(n), send_disconnect(sd) {}
522 };
523 void complete_disconnect_watches(
524 ObjectContextRef obc,
525 const list<watch_disconnect_t> &to_disconnect);
526
527 struct OpFinisher {
528 virtual ~OpFinisher() {
529 }
530
531 virtual int execute() = 0;
532 };
533
534 /*
535 * Capture all object state associated with an in-progress read or write.
536 */
537 struct OpContext {
538 OpRequestRef op;
539 osd_reqid_t reqid;
540 vector<OSDOp> *ops;
541
542 const ObjectState *obs; // Old objectstate
543 const SnapSet *snapset; // Old snapset
544
545 ObjectState new_obs; // resulting ObjectState
546 SnapSet new_snapset; // resulting SnapSet (in case of a write)
547 //pg_stat_t new_stats; // resulting Stats
548 object_stat_sum_t delta_stats;
549
550 bool modify; // (force) modification (even if op_t is empty)
551 bool user_modify; // user-visible modification
552 bool undirty; // user explicitly un-dirtying this object
553 bool cache_evict; ///< true if this is a cache eviction
554 bool ignore_cache; ///< true if IGNORE_CACHE flag is set
555 bool ignore_log_op_stats; // don't log op stats
556 bool update_log_only; ///< this is a write that returned an error - just record in pg log for dup detection
557
558 // side effects
559 list<pair<watch_info_t,bool> > watch_connects; ///< new watch + will_ping flag
560 list<watch_disconnect_t> watch_disconnects; ///< old watch + send_discon
561 list<notify_info_t> notifies;
562 struct NotifyAck {
563 boost::optional<uint64_t> watch_cookie;
564 uint64_t notify_id;
565 bufferlist reply_bl;
566 explicit NotifyAck(uint64_t notify_id) : notify_id(notify_id) {}
567 NotifyAck(uint64_t notify_id, uint64_t cookie, bufferlist& rbl)
568 : watch_cookie(cookie), notify_id(notify_id) {
569 reply_bl.claim(rbl);
570 }
571 };
572 list<NotifyAck> notify_acks;
573
574 uint64_t bytes_written, bytes_read;
575
576 utime_t mtime;
577 SnapContext snapc; // writer snap context
578 eversion_t at_version; // pg's current version pointer
579 version_t user_at_version; // pg's current user version pointer
580
581 /// index of the current subop - only valid inside of do_osd_ops()
582 int current_osd_subop_num;
583 /// total number of subops processed in this context for cls_cxx_subop_version()
584 int processed_subop_count = 0;
585
586 PGTransactionUPtr op_t;
587 vector<pg_log_entry_t> log;
588 boost::optional<pg_hit_set_history_t> updated_hset_history;
589
590 interval_set<uint64_t> modified_ranges;
591 ObjectContextRef obc;
592 ObjectContextRef clone_obc; // if we created a clone
593 ObjectContextRef head_obc; // if we also update snapset (see trim_object)
594
595 // FIXME: we may want to kill this msgr hint off at some point!
596 boost::optional<int> data_off = boost::none;
597
598 MOSDOpReply *reply;
599
600 PrimaryLogPG *pg;
601
602 int num_read; ///< count read ops
603 int num_write; ///< count update ops
604
605 mempool::osd_pglog::vector<pair<osd_reqid_t, version_t> > extra_reqids;
606 mempool::osd_pglog::map<uint32_t, int> extra_reqid_return_codes;
607
608 hobject_t new_temp_oid, discard_temp_oid; ///< temp objects we should start/stop tracking
609
610 list<std::function<void()>> on_applied;
611 list<std::function<void()>> on_committed;
612 list<std::function<void()>> on_finish;
613 list<std::function<void()>> on_success;
614 template <typename F>
615 void register_on_finish(F &&f) {
616 on_finish.emplace_back(std::forward<F>(f));
617 }
618 template <typename F>
619 void register_on_success(F &&f) {
620 on_success.emplace_back(std::forward<F>(f));
621 }
622 template <typename F>
623 void register_on_applied(F &&f) {
624 on_applied.emplace_back(std::forward<F>(f));
625 }
626 template <typename F>
627 void register_on_commit(F &&f) {
628 on_committed.emplace_back(std::forward<F>(f));
629 }
630
631 bool sent_reply = false;
632
633 // pending async reads <off, len, op_flags> -> <outbl, outr>
634 list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
635 pair<bufferlist*, Context*> > > pending_async_reads;
636 int inflightreads;
637 friend struct OnReadComplete;
638 void start_async_reads(PrimaryLogPG *pg);
639 void finish_read(PrimaryLogPG *pg);
640 bool async_reads_complete() {
641 return inflightreads == 0;
642 }
643
644 ObjectContext::RWState::State lock_type;
645 ObcLockManager lock_manager;
646
647 std::map<int, std::unique_ptr<OpFinisher>> op_finishers;
648
649 OpContext(const OpContext& other);
650 const OpContext& operator=(const OpContext& other);
651
652 OpContext(OpRequestRef _op, osd_reqid_t _reqid, vector<OSDOp>* _ops,
653 ObjectContextRef& obc,
654 PrimaryLogPG *_pg) :
655 op(_op), reqid(_reqid), ops(_ops),
656 obs(&obc->obs),
657 snapset(0),
658 new_obs(obs->oi, obs->exists),
659 modify(false), user_modify(false), undirty(false), cache_evict(false),
660 ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
661 bytes_written(0), bytes_read(0), user_at_version(0),
662 current_osd_subop_num(0),
663 obc(obc),
664 reply(NULL), pg(_pg),
665 num_read(0),
666 num_write(0),
667 sent_reply(false),
668 inflightreads(0),
669 lock_type(ObjectContext::RWState::RWNONE) {
670 if (obc->ssc) {
671 new_snapset = obc->ssc->snapset;
672 snapset = &obc->ssc->snapset;
673 }
674 }
675 OpContext(OpRequestRef _op, osd_reqid_t _reqid,
676 vector<OSDOp>* _ops, PrimaryLogPG *_pg) :
677 op(_op), reqid(_reqid), ops(_ops), obs(NULL), snapset(0),
678 modify(false), user_modify(false), undirty(false), cache_evict(false),
679 ignore_cache(false), ignore_log_op_stats(false), update_log_only(false),
680 bytes_written(0), bytes_read(0), user_at_version(0),
681 current_osd_subop_num(0),
682 reply(NULL), pg(_pg),
683 num_read(0),
684 num_write(0),
685 inflightreads(0),
686 lock_type(ObjectContext::RWState::RWNONE) {}
687 void reset_obs(ObjectContextRef obc) {
688 new_obs = ObjectState(obc->obs.oi, obc->obs.exists);
689 if (obc->ssc) {
690 new_snapset = obc->ssc->snapset;
691 snapset = &obc->ssc->snapset;
692 }
693 }
694 ~OpContext() {
695 ceph_assert(!op_t);
696 if (reply)
697 reply->put();
698 for (list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
699 pair<bufferlist*, Context*> > >::iterator i =
700 pending_async_reads.begin();
701 i != pending_async_reads.end();
702 pending_async_reads.erase(i++)) {
703 delete i->second.second;
704 }
705 }
706 uint64_t get_features() {
707 if (op && op->get_req()) {
708 return op->get_req()->get_connection()->get_features();
709 }
710 return -1ull;
711 }
712 };
713 using OpContextUPtr = std::unique_ptr<OpContext>;
714 friend struct OpContext;
715
716 /*
717 * State on the PG primary associated with the replicated mutation
718 */
719 class RepGather {
720 public:
721 hobject_t hoid;
722 OpRequestRef op;
723 xlist<RepGather*>::item queue_item;
724 int nref;
725
726 eversion_t v;
727 int r = 0;
728
729 ceph_tid_t rep_tid;
730
731 bool rep_aborted;
732 bool all_committed;
733
734 utime_t start;
735
736 eversion_t pg_local_last_complete;
737
738 ObcLockManager lock_manager;
739
740 list<std::function<void()>> on_committed;
741 list<std::function<void()>> on_success;
742 list<std::function<void()>> on_finish;
743
744 RepGather(
745 OpContext *c, ceph_tid_t rt,
746 eversion_t lc) :
747 hoid(c->obc->obs.oi.soid),
748 op(c->op),
749 queue_item(this),
750 nref(1),
751 rep_tid(rt),
752 rep_aborted(false),
753 all_committed(false),
754 pg_local_last_complete(lc),
755 lock_manager(std::move(c->lock_manager)),
756 on_committed(std::move(c->on_committed)),
757 on_success(std::move(c->on_success)),
758 on_finish(std::move(c->on_finish)) {}
759
760 RepGather(
761 ObcLockManager &&manager,
762 OpRequestRef &&o,
763 boost::optional<std::function<void(void)> > &&on_complete,
764 ceph_tid_t rt,
765 eversion_t lc,
766 int r) :
767 op(o),
768 queue_item(this),
769 nref(1),
770 r(r),
771 rep_tid(rt),
772 rep_aborted(false),
773 all_committed(false),
774 pg_local_last_complete(lc),
775 lock_manager(std::move(manager)) {
776 if (on_complete) {
777 on_success.push_back(std::move(*on_complete));
778 }
779 }
780
781 RepGather *get() {
782 nref++;
783 return this;
784 }
785 void put() {
786 ceph_assert(nref > 0);
787 if (--nref == 0) {
788 delete this;
789 //generic_dout(0) << "deleting " << this << dendl;
790 }
791 }
792 };
793
794
795 protected:
796
797 /**
798 * Grabs locks for OpContext, should be cleaned up in close_op_ctx
799 *
800 * @param ctx [in,out] ctx to get locks for
801 * @return true on success, false if we are queued
802 */
803 bool get_rw_locks(bool write_ordered, OpContext *ctx) {
   804     /* If head_obc is set, !obc->obs.exists and we always take the
   805      * head_obc lock *before* the obc lock. Since every caller (read or
   806      * write) follows this order, acquiring the first lock guarantees we
   807      * can acquire the second.
   808      */
809 if (write_ordered && ctx->op->may_read()) {
810 ctx->lock_type = ObjectContext::RWState::RWEXCL;
811 } else if (write_ordered) {
812 ctx->lock_type = ObjectContext::RWState::RWWRITE;
813 } else {
814 ceph_assert(ctx->op->may_read());
815 ctx->lock_type = ObjectContext::RWState::RWREAD;
816 }
817
818 if (ctx->head_obc) {
819 ceph_assert(!ctx->obc->obs.exists);
820 if (!ctx->lock_manager.get_lock_type(
821 ctx->lock_type,
822 ctx->head_obc->obs.oi.soid,
823 ctx->head_obc,
824 ctx->op)) {
825 ctx->lock_type = ObjectContext::RWState::RWNONE;
826 return false;
827 }
828 }
829 if (ctx->lock_manager.get_lock_type(
830 ctx->lock_type,
831 ctx->obc->obs.oi.soid,
832 ctx->obc,
833 ctx->op)) {
834 return true;
835 } else {
836 ceph_assert(!ctx->head_obc);
837 ctx->lock_type = ObjectContext::RWState::RWNONE;
838 return false;
839 }
840 }
841
842 /**
843 * Cleans up OpContext
844 *
845 * @param ctx [in] ctx to clean up
846 */
847 void close_op_ctx(OpContext *ctx);
848
849 /**
850 * Releases locks
851 *
852 * @param manager [in] manager with locks to release
853 */
854 void release_object_locks(
855 ObcLockManager &lock_manager) {
856 list<pair<ObjectContextRef, list<OpRequestRef> > > to_req;
857 bool requeue_recovery = false;
858 bool requeue_snaptrim = false;
859 lock_manager.put_locks(
860 &to_req,
861 &requeue_recovery,
862 &requeue_snaptrim);
863 if (requeue_recovery)
864 queue_recovery();
865 if (requeue_snaptrim)
866 snap_trimmer_machine.process_event(TrimWriteUnblocked());
867
868 if (!to_req.empty()) {
869 // requeue at front of scrub blocking queue if we are blocked by scrub
870 for (auto &&p: to_req) {
871 if (write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
872 for (auto& op : p.second) {
873 op->mark_delayed("waiting for scrub");
874 }
875
876 waiting_for_scrub.splice(
877 waiting_for_scrub.begin(),
878 p.second,
879 p.second.begin(),
880 p.second.end());
881 } else {
882 requeue_ops(p.second);
883 }
884 }
885 }
886 }
887
888 // replica ops
889 // [primary|tail]
890 xlist<RepGather*> repop_queue;
891
892 friend class C_OSD_RepopCommit;
893 void repop_all_committed(RepGather *repop);
894 void eval_repop(RepGather*);
895 void issue_repop(RepGather *repop, OpContext *ctx);
896 RepGather *new_repop(
897 OpContext *ctx,
898 ObjectContextRef obc,
899 ceph_tid_t rep_tid);
900 boost::intrusive_ptr<RepGather> new_repop(
901 eversion_t version,
902 int r,
903 ObcLockManager &&manager,
904 OpRequestRef &&op,
905 boost::optional<std::function<void(void)> > &&on_complete);
906 void remove_repop(RepGather *repop);
907
908 OpContextUPtr simple_opc_create(ObjectContextRef obc);
909 void simple_opc_submit(OpContextUPtr ctx);
910
911 /**
   912    * Atomically merge entries into the logs of all acting_recovery_backfill
   913    * OSDs, adjusting missing and recovery state as necessary.
914 *
915 * Also used to store error log entries for dup detection.
916 */
917 void submit_log_entries(
918 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
919 ObcLockManager &&manager,
920 boost::optional<std::function<void(void)> > &&on_complete,
921 OpRequestRef op = OpRequestRef(),
922 int r = 0);
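  // Illustrative sketch (hypothetical call site): storing an error log entry
  // for dup detection, per the doc comment above. Entry construction is elided;
  // the lock manager and completion arguments are left empty.
  //
  //   mempool::osd_pglog::list<pg_log_entry_t> entries;
  //   entries.push_back(/* ERROR-type pg_log_entry_t for the failed write */);
  //   submit_log_entries(entries, ObcLockManager(), boost::none, op, r);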
923 struct LogUpdateCtx {
924 boost::intrusive_ptr<RepGather> repop;
925 set<pg_shard_t> waiting_on;
926 };
927 void cancel_log_updates();
928 map<ceph_tid_t, LogUpdateCtx> log_entry_update_waiting_on;
929
930
931 // hot/cold tracking
932 HitSetRef hit_set; ///< currently accumulating HitSet
933 utime_t hit_set_start_stamp; ///< time the current HitSet started recording
934
935
936 void hit_set_clear(); ///< discard any HitSet state
937 void hit_set_setup(); ///< initialize HitSet state
938 void hit_set_create(); ///< create a new HitSet
939 void hit_set_persist(); ///< persist hit info
940 bool hit_set_apply_log(); ///< apply log entries to update in-memory HitSet
941 void hit_set_trim(OpContextUPtr &ctx, unsigned max); ///< discard old HitSets
942 void hit_set_in_memory_trim(uint32_t max_in_memory); ///< discard old in memory HitSets
943 void hit_set_remove_all();
944
945 hobject_t get_hit_set_current_object(utime_t stamp);
946 hobject_t get_hit_set_archive_object(utime_t start,
947 utime_t end,
948 bool using_gmt);
949
950 // agent
951 boost::scoped_ptr<TierAgentState> agent_state;
952
953 void agent_setup(); ///< initialize agent state
954 bool agent_work(int max) override ///< entry point to do some agent work
955 {
956 return agent_work(max, max);
957 }
958 bool agent_work(int max, int agent_flush_quota) override;
959 bool agent_maybe_flush(ObjectContextRef& obc); ///< maybe flush
960 bool agent_maybe_evict(ObjectContextRef& obc, bool after_flush); ///< maybe evict
961
962 void agent_load_hit_sets(); ///< load HitSets, if needed
963
964 /// estimate object atime and temperature
965 ///
966 /// @param oid [in] object name
   967    /// @param temperature [out] relative temperature (considers both access time and frequency)
968 void agent_estimate_temp(const hobject_t& oid, int *temperature);
969
970 /// stop the agent
971 void agent_stop() override;
972 void agent_delay() override;
973
974 /// clear agent state
975 void agent_clear() override;
976
977 /// choose (new) agent mode(s), returns true if op is requeued
978 bool agent_choose_mode(bool restart = false, OpRequestRef op = OpRequestRef());
979 void agent_choose_mode_restart() override;
980
981 /// true if we can send an ondisk/commit for v
982 bool already_complete(eversion_t v);
983 /// true if we can send an ack for v
984 bool already_ack(eversion_t v);
985
986 // projected object info
987 SharedLRU<hobject_t, ObjectContext> object_contexts;
988 // map from oid.snapdir() to SnapSetContext *
989 map<hobject_t, SnapSetContext*> snapset_contexts;
990 Mutex snapset_contexts_lock;
991
992 // debug order that client ops are applied
993 map<hobject_t, map<client_t, ceph_tid_t>> debug_op_order;
994
995 void populate_obc_watchers(ObjectContextRef obc);
996 void check_blacklisted_obc_watchers(ObjectContextRef obc);
997 void check_blacklisted_watchers() override;
998 void get_watchers(list<obj_watch_item_t> *ls) override;
999 void get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers);
1000 public:
1001 void handle_watch_timeout(WatchRef watch);
1002 protected:
1003
1004 ObjectContextRef create_object_context(const object_info_t& oi, SnapSetContext *ssc);
1005 ObjectContextRef get_object_context(
1006 const hobject_t& soid,
1007 bool can_create,
1008 const map<string, bufferlist> *attrs = 0
1009 );
1010
1011 void context_registry_on_change();
1012 void object_context_destructor_callback(ObjectContext *obc);
1013 class C_PG_ObjectContext;
1014
1015 int find_object_context(const hobject_t& oid,
1016 ObjectContextRef *pobc,
1017 bool can_create,
1018 bool map_snapid_to_clone=false,
1019 hobject_t *missing_oid=NULL);
1020
1021 void add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *stat);
1022
1023 void get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc);
1024
1025 SnapSetContext *get_snapset_context(
1026 const hobject_t& oid,
1027 bool can_create,
1028 const map<string, bufferlist> *attrs = 0,
  1029     bool oid_existed = true //whether this oid already exists in the backend
1030 );
1031 void register_snapset_context(SnapSetContext *ssc) {
1032 std::lock_guard l(snapset_contexts_lock);
1033 _register_snapset_context(ssc);
1034 }
1035 void _register_snapset_context(SnapSetContext *ssc) {
1036 ceph_assert(snapset_contexts_lock.is_locked());
1037 if (!ssc->registered) {
1038 ceph_assert(snapset_contexts.count(ssc->oid) == 0);
1039 ssc->registered = true;
1040 snapset_contexts[ssc->oid] = ssc;
1041 }
1042 }
1043 void put_snapset_context(SnapSetContext *ssc);
1044
1045 map<hobject_t, ObjectContextRef> recovering;
1046
1047 /*
1048 * Backfill
1049 *
1050 * peer_info[backfill_target].last_backfill == info.last_backfill on the peer.
1051 *
1052 * objects prior to peer_info[backfill_target].last_backfill
1053 * - are on the peer
1054 * - are included in the peer stats
1055 *
1056 * objects \in (last_backfill, last_backfill_started]
1057 * - are on the peer or are in backfills_in_flight
1058 * - are not included in pg stats (yet)
1059 * - have their stats in pending_backfill_updates on the primary
1060 */
1061 set<hobject_t> backfills_in_flight;
1062 map<hobject_t, pg_stat_t> pending_backfill_updates;
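  // Concrete illustration of the invariant above (hash positions are made up):
  // with peer last_backfill == 0x80 and last_backfill_started == 0xc0, an object
  // hashing to 0x70 is already on the peer and counted in its stats, while an
  // object hashing to 0xa0 is either on the peer or listed in
  // backfills_in_flight, with its stat delta held in pending_backfill_updates
  // until backfill confirms it.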
1063
1064 void dump_recovery_info(Formatter *f) const override {
1065 f->open_array_section("backfill_targets");
1066 for (set<pg_shard_t>::const_iterator p = backfill_targets.begin();
1067 p != backfill_targets.end(); ++p)
1068 f->dump_stream("replica") << *p;
1069 f->close_section();
1070 f->open_array_section("waiting_on_backfill");
1071 for (set<pg_shard_t>::const_iterator p = waiting_on_backfill.begin();
1072 p != waiting_on_backfill.end(); ++p)
1073 f->dump_stream("osd") << *p;
1074 f->close_section();
1075 f->dump_stream("last_backfill_started") << last_backfill_started;
1076 {
1077 f->open_object_section("backfill_info");
1078 backfill_info.dump(f);
1079 f->close_section();
1080 }
1081 {
1082 f->open_array_section("peer_backfill_info");
1083 for (map<pg_shard_t, BackfillInterval>::const_iterator pbi =
1084 peer_backfill_info.begin();
1085 pbi != peer_backfill_info.end(); ++pbi) {
1086 f->dump_stream("osd") << pbi->first;
1087 f->open_object_section("BackfillInterval");
1088 pbi->second.dump(f);
1089 f->close_section();
1090 }
1091 f->close_section();
1092 }
1093 {
1094 f->open_array_section("backfills_in_flight");
1095 for (set<hobject_t>::const_iterator i = backfills_in_flight.begin();
1096 i != backfills_in_flight.end();
1097 ++i) {
1098 f->dump_stream("object") << *i;
1099 }
1100 f->close_section();
1101 }
1102 {
1103 f->open_array_section("recovering");
1104 for (map<hobject_t, ObjectContextRef>::const_iterator i = recovering.begin();
1105 i != recovering.end();
1106 ++i) {
1107 f->dump_stream("object") << i->first;
1108 }
1109 f->close_section();
1110 }
1111 {
1112 f->open_object_section("pg_backend");
1113 pgbackend->dump_recovery_info(f);
1114 f->close_section();
1115 }
1116 }
1117
1118 /// last backfill operation started
1119 hobject_t last_backfill_started;
1120 bool new_backfill;
1121
1122 int prep_object_replica_pushes(const hobject_t& soid, eversion_t v,
1123 PGBackend::RecoveryHandle *h,
1124 bool *work_started);
1125 int prep_object_replica_deletes(const hobject_t& soid, eversion_t v,
1126 PGBackend::RecoveryHandle *h,
1127 bool *work_started);
1128
1129 void finish_degraded_object(const hobject_t& oid) override;
1130
1131 // Cancels/resets pulls from peer
  1132   void check_recovery_sources(const OSDMapRef& map) override;
1133
1134 int recover_missing(
1135 const hobject_t& oid,
1136 eversion_t v,
1137 int priority,
1138 PGBackend::RecoveryHandle *h);
1139
1140 // low level ops
1141
1142 void _make_clone(
1143 OpContext *ctx,
1144 PGTransaction* t,
1145 ObjectContextRef obc,
1146 const hobject_t& head, const hobject_t& coid,
1147 object_info_t *poi);
1148 void execute_ctx(OpContext *ctx);
1149 void finish_ctx(OpContext *ctx, int log_op_type);
1150 void reply_ctx(OpContext *ctx, int err);
1151 void reply_ctx(OpContext *ctx, int err, eversion_t v, version_t uv);
1152 void make_writeable(OpContext *ctx);
1153 void log_op_stats(const OpRequest& op, uint64_t inb, uint64_t outb);
1154
1155 void write_update_size_and_usage(object_stat_sum_t& stats, object_info_t& oi,
1156 interval_set<uint64_t>& modified, uint64_t offset,
1157 uint64_t length, bool write_full=false);
1158 inline void truncate_update_size_and_usage(
1159 object_stat_sum_t& delta_stats,
1160 object_info_t& oi,
1161 uint64_t truncate_size);
1162
1163 enum class cache_result_t {
1164 NOOP,
1165 BLOCKED_FULL,
1166 BLOCKED_PROMOTE,
1167 HANDLED_PROXY,
1168 HANDLED_REDIRECT,
1169 REPLIED_WITH_EAGAIN,
1170 BLOCKED_RECOVERY,
1171 };
1172 cache_result_t maybe_handle_cache_detail(OpRequestRef op,
1173 bool write_ordered,
1174 ObjectContextRef obc, int r,
1175 hobject_t missing_oid,
1176 bool must_promote,
1177 bool in_hit_set,
1178 ObjectContextRef *promote_obc);
1179 cache_result_t maybe_handle_manifest_detail(OpRequestRef op,
1180 bool write_ordered,
1181 ObjectContextRef obc);
1182 bool maybe_handle_manifest(OpRequestRef op,
1183 bool write_ordered,
1184 ObjectContextRef obc) {
1185 return cache_result_t::NOOP != maybe_handle_manifest_detail(
1186 op,
1187 write_ordered,
1188 obc);
1189 }
1190
1191 /**
1192 * This helper function is called from do_op if the ObjectContext lookup fails.
1193 * @returns true if the caching code is handling the Op, false otherwise.
1194 */
1195 bool maybe_handle_cache(OpRequestRef op,
1196 bool write_ordered,
1197 ObjectContextRef obc, int r,
1198 const hobject_t& missing_oid,
1199 bool must_promote,
1200 bool in_hit_set = false) {
1201 return cache_result_t::NOOP != maybe_handle_cache_detail(
1202 op,
1203 write_ordered,
1204 obc,
1205 r,
1206 missing_oid,
1207 must_promote,
1208 in_hit_set,
1209 nullptr);
1210 }
1211
1212 /**
1213 * This helper function checks if a promotion is needed.
1214 */
1215 bool maybe_promote(ObjectContextRef obc,
1216 const hobject_t& missing_oid,
1217 const object_locator_t& oloc,
1218 bool in_hit_set,
1219 uint32_t recency,
1220 OpRequestRef promote_op,
1221 ObjectContextRef *promote_obc = nullptr);
1222 /**
1223 * This helper function tells the client to redirect their request elsewhere.
1224 */
1225 void do_cache_redirect(OpRequestRef op);
1226 /**
1227 * This function attempts to start a promote. Either it succeeds,
1228 * or places op on a wait list. If op is null, failure means that
1229 * this is a noop. If a future user wants to be able to distinguish
1230 * these cases, a return value should be added.
1231 */
1232 void promote_object(
1233 ObjectContextRef obc, ///< [optional] obc
1234 const hobject_t& missing_object, ///< oid (if !obc)
1235 const object_locator_t& oloc, ///< locator for obc|oid
1236 OpRequestRef op, ///< [optional] client op
1237 ObjectContextRef *promote_obc = nullptr ///< [optional] new obc for object
1238 );
1239
1240 int prepare_transaction(OpContext *ctx);
1241 list<pair<OpRequestRef, OpContext*> > in_progress_async_reads;
1242 void complete_read_ctx(int result, OpContext *ctx);
1243
1244 // pg on-disk content
1245 void check_local() override;
1246
1247 void _clear_recovery_state() override;
1248
1249 bool start_recovery_ops(
1250 uint64_t max,
1251 ThreadPool::TPHandle &handle, uint64_t *started) override;
1252
1253 uint64_t recover_primary(uint64_t max, ThreadPool::TPHandle &handle);
1254 uint64_t recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
1255 bool *recovery_started);
1256 hobject_t earliest_peer_backfill() const;
1257 bool all_peer_done() const;
1258 /**
1259 * @param work_started will be set to true if recover_backfill got anywhere
1260 * @returns the number of operations started
1261 */
1262 uint64_t recover_backfill(uint64_t max, ThreadPool::TPHandle &handle,
1263 bool *work_started);
1264
1265 /**
1266 * scan a (hash) range of objects in the current pg
1267 *
1268 * @begin first item should be >= this value
1269 * @min return at least this many items, unless we are done
1270 * @max return no more than this many items
1271 * @bi [out] resulting map of objects to eversion_t's
1272 */
1273 void scan_range(
1274 int min, int max, BackfillInterval *bi,
1275 ThreadPool::TPHandle &handle
1276 );
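  // Illustrative sketch (hypothetical call site; the scan_min/scan_max config
  // option names are assumptions, not taken from this header):
  //
  //   BackfillInterval bi;
  //   bi.begin = last_backfill_started;   // first returned item will be >= begin
  //   scan_range(cct->_conf->osd_backfill_scan_min,
  //              cct->_conf->osd_backfill_scan_max, &bi, handle);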
1277
1278 /// Update a hash range to reflect changes since the last scan
1279 void update_range(
1280 BackfillInterval *bi, ///< [in,out] interval to update
1281 ThreadPool::TPHandle &handle ///< [in] tp handle
1282 );
1283
1284 int prep_backfill_object_push(
1285 hobject_t oid, eversion_t v, ObjectContextRef obc,
1286 vector<pg_shard_t> peers,
1287 PGBackend::RecoveryHandle *h);
1288 void send_remove_op(const hobject_t& oid, eversion_t v, pg_shard_t peer);
1289
1290
1291 class C_OSD_AppliedRecoveredObject;
1292 class C_OSD_CommittedPushedObject;
1293 class C_OSD_AppliedRecoveredObjectReplica;
1294
1295 void _applied_recovered_object(ObjectContextRef obc);
1296 void _applied_recovered_object_replica();
1297 void _committed_pushed_object(epoch_t epoch, eversion_t lc);
1298 void recover_got(hobject_t oid, eversion_t v);
1299
1300 // -- copyfrom --
1301 map<hobject_t, CopyOpRef> copy_ops;
1302
1303 int do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& op,
1304 ObjectContextRef& obc);
1305 int finish_copy_get();
1306
1307 void fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
1308 OSDOp& osd_op);
1309
1310 /**
1311 * To copy an object, call start_copy.
1312 *
1313 * @param cb: The CopyCallback to be activated when the copy is complete
1314 * @param obc: The ObjectContext we are copying into
1315 * @param src: The source object
1316 * @param oloc: the source object locator
1317 * @param version: the version of the source object to copy (0 for any)
1318 */
1319 void start_copy(CopyCallback *cb, ObjectContextRef obc, hobject_t src,
1320 object_locator_t oloc, version_t version, unsigned flags,
1321 bool mirror_snapset, unsigned src_obj_fadvise_flags,
1322 unsigned dest_obj_fadvise_flags);
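  // Illustrative sketch (hypothetical call site, argument values made up):
  //
  //   start_copy(cb, obc, src_oid, src_oloc,
  //              0 /* any version */, flags,
  //              false /* mirror_snapset */,
  //              0 /* src fadvise */, 0 /* dest fadvise */);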
1323 void process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r);
1324 void _write_copy_chunk(CopyOpRef cop, PGTransaction *t);
1325 uint64_t get_copy_chunk_size() const {
1326 uint64_t size = cct->_conf->osd_copyfrom_max_chunk;
1327 if (pool.info.required_alignment()) {
1328 uint64_t alignment = pool.info.required_alignment();
1329 if (size % alignment) {
1330 size += alignment - (size % alignment);
1331 }
1332 }
1333 return size;
1334 }
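  // Worked example (values hypothetical): with osd_copyfrom_max_chunk set to
  // 8388609 bytes and a required pool alignment of 65536, 8388609 % 65536 == 1,
  // so the chunk size is rounded up by 65535 to 8454144, the next multiple of
  // the alignment.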
1335 void _copy_some(ObjectContextRef obc, CopyOpRef cop);
1336 void finish_copyfrom(CopyFromCallback *cb);
1337 void finish_promote(int r, CopyResults *results, ObjectContextRef obc);
1338 void cancel_copy(CopyOpRef cop, bool requeue, vector<ceph_tid_t> *tids);
1339 void cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids);
1340
1341 friend struct C_Copyfrom;
1342
1343 // -- flush --
1344 map<hobject_t, FlushOpRef> flush_ops;
1345
1346 /// start_flush takes ownership of on_flush iff ret == -EINPROGRESS
1347 int start_flush(
1348 OpRequestRef op, ObjectContextRef obc,
1349 bool blocking, hobject_t *pmissing,
1350 boost::optional<std::function<void()>> &&on_flush);
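  // Illustrative caller sketch (hypothetical call site): ownership of on_flush
  // transfers only on -EINPROGRESS, so on any other return the caller still
  // holds the callback and must run or drop it itself.
  //
  //   std::function<void()> resume = [/* captures */]() { /* requeue op */ };
  //   boost::optional<std::function<void()>> on_flush(std::move(resume));
  //   int r = start_flush(op, obc, true /* blocking */, nullptr, std::move(on_flush));
  //   if (r != -EINPROGRESS) {
  //     // on_flush was not consumed; handle r here
  //   }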
1351 void finish_flush(hobject_t oid, ceph_tid_t tid, int r);
1352 int try_flush_mark_clean(FlushOpRef fop);
1353 void cancel_flush(FlushOpRef fop, bool requeue, vector<ceph_tid_t> *tids);
1354 void cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids);
1355
  1356   /// @return false if the clone has been evicted
1357 bool is_present_clone(hobject_t coid);
1358
1359 friend struct C_Flush;
1360
1361 // -- scrub --
1362 bool _range_available_for_scrub(
1363 const hobject_t &begin, const hobject_t &end) override;
1364 void scrub_snapshot_metadata(
1365 ScrubMap &map,
1366 const std::map<hobject_t,
1367 pair<boost::optional<uint32_t>,
1368 boost::optional<uint32_t>>> &missing_digest) override;
1369 void _scrub_clear_state() override;
1370 void _scrub_finish() override;
1371 object_stat_collection_t scrub_cstat;
1372
1373 void _split_into(pg_t child_pgid, PG *child,
1374 unsigned split_bits) override;
1375 void apply_and_flush_repops(bool requeue);
1376
1377 void calc_trim_to() override;
1378 void calc_trim_to_aggressive() override;
1379 int do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr);
1380 int do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr);
1381
1382 // -- checksum --
1383 int do_checksum(OpContext *ctx, OSDOp& osd_op, bufferlist::const_iterator *bl_it);
1384 int finish_checksum(OSDOp& osd_op, Checksummer::CSumType csum_type,
1385 bufferlist::const_iterator *init_value_bl_it,
1386 const bufferlist &read_bl);
1387
1388 friend class C_ChecksumRead;
1389
1390 int do_extent_cmp(OpContext *ctx, OSDOp& osd_op);
1391 int finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl);
1392
1393 friend class C_ExtentCmpRead;
1394
1395 int do_read(OpContext *ctx, OSDOp& osd_op);
1396 int do_sparse_read(OpContext *ctx, OSDOp& osd_op);
1397 int do_writesame(OpContext *ctx, OSDOp& osd_op);
1398
1399 bool pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata);
1400 int get_pgls_filter(bufferlist::const_iterator& iter, PGLSFilter **pfilter);
1401
1402 map<hobject_t, list<OpRequestRef>> in_progress_proxy_ops;
1403 void kick_proxy_ops_blocked(hobject_t& soid);
1404 void cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids);
1405
1406 // -- proxyread --
1407 map<ceph_tid_t, ProxyReadOpRef> proxyread_ops;
1408
1409 void do_proxy_read(OpRequestRef op, ObjectContextRef obc = NULL);
1410 void finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r);
1411 void cancel_proxy_read(ProxyReadOpRef prdop, vector<ceph_tid_t> *tids);
1412
1413 friend struct C_ProxyRead;
1414
1415 // -- proxywrite --
1416 map<ceph_tid_t, ProxyWriteOpRef> proxywrite_ops;
1417
1418 void do_proxy_write(OpRequestRef op, ObjectContextRef obc = NULL);
1419 void finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r);
1420 void cancel_proxy_write(ProxyWriteOpRef pwop, vector<ceph_tid_t> *tids);
1421
1422 friend struct C_ProxyWrite_Commit;
1423
1424 // -- chunkop --
1425 void do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
1426 ObjectContextRef obc, bool write_ordered);
1427 void do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
1428 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
1429 uint64_t req_total_len, bool write_ordered);
1430 bool can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc);
1431 void _copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset);
1432 void process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset);
1433 void finish_promote_manifest(int r, CopyResults *results, ObjectContextRef obc);
1434 void cancel_and_requeue_proxy_ops(hobject_t oid);
1435 int do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop,
1436 uint64_t start_offset, bool block);
1437 int start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking,
1438 boost::optional<std::function<void()>> &&on_flush);
1439 void finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r, ObjectContextRef obc,
1440 uint64_t last_offset);
1441 void handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
1442 uint64_t offset, uint64_t last_offset, epoch_t lpr);
1443 void refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid,
1444 SnapContext snapc, bool get, Context *cb, uint64_t offset);
1445
1446 friend struct C_ProxyChunkRead;
1447 friend class PromoteManifestCallback;
1448 friend class C_CopyChunk;
1449 friend struct C_ManifestFlush;
1450 friend struct RefCountCallback;
1451
1452 public:
1453 PrimaryLogPG(OSDService *o, OSDMapRef curmap,
1454 const PGPool &_pool,
1455 const map<string,string>& ec_profile,
1456 spg_t p);
1457 ~PrimaryLogPG() override {}
1458
1459 int do_command(
1460 cmdmap_t cmdmap,
1461 ostream& ss,
1462 bufferlist& idata,
1463 bufferlist& odata,
1464 ConnectionRef conn,
1465 ceph_tid_t tid) override;
1466
1467 void clear_cache();
1468 int get_cache_obj_count() {
1469 return object_contexts.get_count();
1470 }
1471 void do_request(
1472 OpRequestRef& op,
1473 ThreadPool::TPHandle &handle) override;
1474 void do_op(OpRequestRef& op);
1475 void record_write_error(OpRequestRef op, const hobject_t &soid,
1476 MOSDOpReply *orig_reply, int r);
1477 void do_pg_op(OpRequestRef op);
1478 void do_scan(
1479 OpRequestRef op,
1480 ThreadPool::TPHandle &handle);
1481 void do_backfill(OpRequestRef op);
1482 void do_backfill_remove(OpRequestRef op);
1483
1484 void handle_backoff(OpRequestRef& op);
1485
1486 int trim_object(bool first, const hobject_t &coid, OpContextUPtr *ctxp);
1487 void snap_trimmer(epoch_t e) override;
1488 void kick_snap_trim() override;
1489 void snap_trimmer_scrub_complete() override;
1490 int do_osd_ops(OpContext *ctx, vector<OSDOp>& ops);
1491
1492 int _get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals);
1493 int do_tmap2omap(OpContext *ctx, unsigned flags);
1494 int do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op);
1495 int do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op, bufferlist& bl);
1496
1497 void do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn);
1498 private:
1499 int do_scrub_ls(MOSDOp *op, OSDOp *osd_op);
1500 hobject_t earliest_backfill() const;
1501 bool check_src_targ(const hobject_t& soid, const hobject_t& toid) const;
1502
1503 uint64_t temp_seq; ///< last id for naming temp objects
1504 /// generate a new temp object name
1505 hobject_t generate_temp_object(const hobject_t& target);
1506 /// generate a new temp object name (for recovery)
1507 hobject_t get_temp_recovery_object(const hobject_t& target,
1508 eversion_t version) override;
1509 int get_recovery_op_priority() const {
1510 int64_t pri = 0;
1511 pool.info.opts.get(pool_opts_t::RECOVERY_OP_PRIORITY, &pri);
1512 return pri > 0 ? pri : cct->_conf->osd_recovery_op_priority;
1513 }
1514 void log_missing(unsigned missing,
1515 const boost::optional<hobject_t> &head,
1516 LogChannelRef clog,
1517 const spg_t &pgid,
1518 const char *func,
1519 const char *mode,
1520 bool allow_incomplete_clones);
1521 unsigned process_clones_to(const boost::optional<hobject_t> &head,
1522 const boost::optional<SnapSet> &snapset,
1523 LogChannelRef clog,
1524 const spg_t &pgid,
1525 const char *mode,
1526 bool allow_incomplete_clones,
1527 boost::optional<snapid_t> target,
1528 vector<snapid_t>::reverse_iterator *curclone,
1529 inconsistent_snapset_wrapper &snap_error);
1530
1531 public:
1532 coll_t get_coll() {
1533 return coll;
1534 }
1535 void split_colls(
1536 spg_t child,
1537 int split_bits,
1538 int seed,
1539 const pg_pool_t *pool,
1540 ObjectStore::Transaction *t) override {
1541 coll_t target = coll_t(child);
1542 PG::_create(*t, child, split_bits);
1543 t->split_collection(
1544 coll,
1545 split_bits,
1546 seed,
1547 target);
1548 PG::_init(*t, child, pool);
1549 }
1550 private:
1551
1552 struct DoSnapWork : boost::statechart::event< DoSnapWork > {
1553 DoSnapWork() : boost::statechart::event < DoSnapWork >() {}
1554 };
1555 struct KickTrim : boost::statechart::event< KickTrim > {
1556 KickTrim() : boost::statechart::event < KickTrim >() {}
1557 };
1558 struct RepopsComplete : boost::statechart::event< RepopsComplete > {
1559 RepopsComplete() : boost::statechart::event < RepopsComplete >() {}
1560 };
1561 struct ScrubComplete : boost::statechart::event< ScrubComplete > {
1562 ScrubComplete() : boost::statechart::event < ScrubComplete >() {}
1563 };
1564 struct TrimWriteUnblocked : boost::statechart::event< TrimWriteUnblocked > {
1565 TrimWriteUnblocked() : boost::statechart::event < TrimWriteUnblocked >() {}
1566 };
1567 struct Reset : boost::statechart::event< Reset > {
1568 Reset() : boost::statechart::event< Reset >() {}
1569 };
1570 struct SnapTrimReserved : boost::statechart::event< SnapTrimReserved > {
1571 SnapTrimReserved() : boost::statechart::event< SnapTrimReserved >() {}
1572 };
1573 struct SnapTrimTimerReady : boost::statechart::event< SnapTrimTimerReady > {
1574 SnapTrimTimerReady() : boost::statechart::event< SnapTrimTimerReady >() {}
1575 };
1576
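  // Rough shape of the snap-trim state machine declared below, summarized from
  // the reactions visible in this header (NotTrimming's KickTrim handler,
  // WaitReservation's SnapTrimReserved handler and AwaitAsyncWork's DoSnapWork
  // handler live in the .cc): NotTrimming is the start state; WaitScrub parks
  // trimming until ScrubComplete re-posts KickTrim. Trimming starts in
  // WaitReservation, which asks the OSD's snap_reserver for a slot; once
  // SnapTrimReserved fires, AwaitAsyncWork's DoSnapWork reaction performs the
  // trim work. WaitRepops waits for the resulting repops, WaitTrimTimer applies
  // the optional osd_snap_trim_sleep delay before the next pass, and WaitRWLock
  // waits for a blocking writer (TrimWriteUnblocked). Reset, or a failed
  // can_trim() check, returns to NotTrimming.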
1577 struct NotTrimming;
1578 struct SnapTrimmer : public boost::statechart::state_machine< SnapTrimmer, NotTrimming > {
1579 PrimaryLogPG *pg;
1580 explicit SnapTrimmer(PrimaryLogPG *pg) : pg(pg) {}
1581 void log_enter(const char *state_name);
1582 void log_exit(const char *state_name, utime_t duration);
1583 bool permit_trim() {
1584 return
1585 pg->is_clean() &&
1586 !pg->scrubber.active &&
1587 !pg->snap_trimq.empty();
1588 }
1589 bool can_trim() {
1590 return
1591 permit_trim() &&
1592 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM);
1593 }
1594 } snap_trimmer_machine;
1595
1596 struct WaitReservation;
1597 struct Trimming : boost::statechart::state< Trimming, SnapTrimmer, WaitReservation >, NamedState {
1598 typedef boost::mpl::list <
1599 boost::statechart::custom_reaction< KickTrim >,
1600 boost::statechart::transition< Reset, NotTrimming >
1601 > reactions;
1602
1603 set<hobject_t> in_flight;
1604 snapid_t snap_to_trim;
1605
1606 explicit Trimming(my_context ctx)
1607 : my_base(ctx),
1608 NamedState(context< SnapTrimmer >().pg, "Trimming") {
1609 context< SnapTrimmer >().log_enter(state_name);
1610 ceph_assert(context< SnapTrimmer >().permit_trim());
1611 ceph_assert(in_flight.empty());
1612 }
1613 void exit() {
1614 context< SnapTrimmer >().log_exit(state_name, enter_time);
1615 auto *pg = context< SnapTrimmer >().pg;
1616 pg->osd->snap_reserver.cancel_reservation(pg->get_pgid());
1617 pg->state_clear(PG_STATE_SNAPTRIM);
1618 pg->publish_stats_to_osd();
1619 }
1620 boost::statechart::result react(const KickTrim&) {
1621 return discard_event();
1622 }
1623 };
1624
1625 /* SnapTrimmerStates */
1626 struct WaitTrimTimer : boost::statechart::state< WaitTrimTimer, Trimming >, NamedState {
1627 typedef boost::mpl::list <
1628 boost::statechart::custom_reaction< SnapTrimTimerReady >
1629 > reactions;
1630 Context *wakeup = nullptr;
1631 explicit WaitTrimTimer(my_context ctx)
1632 : my_base(ctx),
1633 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitTrimTimer") {
1634 context< SnapTrimmer >().log_enter(state_name);
1635 ceph_assert(context<Trimming>().in_flight.empty());
1636 struct OnTimer : Context {
1637 PrimaryLogPGRef pg;
1638 epoch_t epoch;
1639 OnTimer(PrimaryLogPGRef pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
1640 void finish(int) override {
1641 pg->lock();
1642 if (!pg->pg_has_reset_since(epoch))
1643 pg->snap_trimmer_machine.process_event(SnapTrimTimerReady());
1644 pg->unlock();
1645 }
1646 };
1647 auto *pg = context< SnapTrimmer >().pg;
1648 if (pg->cct->_conf->osd_snap_trim_sleep > 0) {
1649 std::lock_guard l(pg->osd->sleep_lock);
1650 wakeup = pg->osd->sleep_timer.add_event_after(
1651 pg->cct->_conf->osd_snap_trim_sleep,
1652 new OnTimer{pg, pg->get_osdmap_epoch()});
1653 } else {
1654 post_event(SnapTrimTimerReady());
1655 }
1656 }
1657 void exit() {
1658 context< SnapTrimmer >().log_exit(state_name, enter_time);
1659 auto *pg = context< SnapTrimmer >().pg;
1660 if (wakeup) {
1661 std::lock_guard l(pg->osd->sleep_lock);
1662 pg->osd->sleep_timer.cancel_event(wakeup);
1663 wakeup = nullptr;
1664 }
1665 }
1666 boost::statechart::result react(const SnapTrimTimerReady &) {
1667 wakeup = nullptr;
1668 if (!context< SnapTrimmer >().can_trim()) {
1669 post_event(KickTrim());
1670 return transit< NotTrimming >();
1671 } else {
1672 return transit< AwaitAsyncWork >();
1673 }
1674 }
1675 };
1676
1677 struct WaitRWLock : boost::statechart::state< WaitRWLock, Trimming >, NamedState {
1678 typedef boost::mpl::list <
1679 boost::statechart::custom_reaction< TrimWriteUnblocked >
1680 > reactions;
1681 explicit WaitRWLock(my_context ctx)
1682 : my_base(ctx),
1683 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRWLock") {
1684 context< SnapTrimmer >().log_enter(state_name);
1685 ceph_assert(context<Trimming>().in_flight.empty());
1686 }
1687 void exit() {
1688 context< SnapTrimmer >().log_exit(state_name, enter_time);
1689 }
1690 boost::statechart::result react(const TrimWriteUnblocked&) {
1691 if (!context< SnapTrimmer >().can_trim()) {
1692 post_event(KickTrim());
1693 return transit< NotTrimming >();
1694 } else {
1695 return transit< AwaitAsyncWork >();
1696 }
1697 }
1698 };
1699
1700 struct WaitRepops : boost::statechart::state< WaitRepops, Trimming >, NamedState {
1701 typedef boost::mpl::list <
1702 boost::statechart::custom_reaction< RepopsComplete >
1703 > reactions;
1704 explicit WaitRepops(my_context ctx)
1705 : my_base(ctx),
1706 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitRepops") {
1707 context< SnapTrimmer >().log_enter(state_name);
1708 ceph_assert(!context<Trimming>().in_flight.empty());
1709 }
1710 void exit() {
1711 context< SnapTrimmer >().log_exit(state_name, enter_time);
1712 }
1713 boost::statechart::result react(const RepopsComplete&) {
1714 if (!context< SnapTrimmer >().can_trim()) {
1715 post_event(KickTrim());
1716 return transit< NotTrimming >();
1717 } else {
1718 return transit< WaitTrimTimer >();
1719 }
1720 }
1721 };
1722
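// Queues one batch of trim work to the OSD; the DoSnapWork handler (defined
// in PrimaryLogPG.cc) trims clones for the snap being removed and records the
// resulting repops in Trimming::in_flight.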
1723 struct AwaitAsyncWork : boost::statechart::state< AwaitAsyncWork, Trimming >, NamedState {
1724 typedef boost::mpl::list <
1725 boost::statechart::custom_reaction< DoSnapWork >
1726 > reactions;
1727 explicit AwaitAsyncWork(my_context ctx);
1728 void exit() {
1729 context< SnapTrimmer >().log_exit(state_name, enter_time);
1730 }
1731 boost::statechart::result react(const DoSnapWork&);
1732 };
1733
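// The reservation requested below comes from the OSD-wide snap_reserver,
// which bounds how many PGs may trim at once (the osd_max_trimming_pgs
// option).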
1734 struct WaitReservation : boost::statechart::state< WaitReservation, Trimming >, NamedState {
1735 /* WaitReservation is a sub-state of Trimming simply so that exiting Trimming
1736 * always cancels the reservation */
1737 typedef boost::mpl::list <
1738 boost::statechart::custom_reaction< SnapTrimReserved >
1739 > reactions;
1740 struct ReservationCB : public Context {
1741 PrimaryLogPGRef pg;
1742 bool canceled;
1743 explicit ReservationCB(PrimaryLogPG *pg) : pg(pg), canceled(false) {}
1744 void finish(int) override {
1745 pg->lock();
1746 if (!canceled)
1747 pg->snap_trimmer_machine.process_event(SnapTrimReserved());
1748 pg->unlock();
1749 }
1750 void cancel() {
1751 ceph_assert(pg->is_locked());
1752 ceph_assert(!canceled);
1753 canceled = true;
1754 }
1755 };
1756 ReservationCB *pending = nullptr;
1757
1758 explicit WaitReservation(my_context ctx)
1759 : my_base(ctx),
1760 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitReservation") {
1761 context< SnapTrimmer >().log_enter(state_name);
1762 ceph_assert(context<Trimming>().in_flight.empty());
1763 auto *pg = context< SnapTrimmer >().pg;
1764 pending = new ReservationCB(pg);
1765 pg->osd->snap_reserver.request_reservation(
1766 pg->get_pgid(),
1767 pending,
1768 0);
1769 pg->state_set(PG_STATE_SNAPTRIM_WAIT);
1770 pg->publish_stats_to_osd();
1771 }
1772 boost::statechart::result react(const SnapTrimReserved&);
1773 void exit() {
1774 context< SnapTrimmer >().log_exit(state_name, enter_time);
1775 if (pending)
1776 pending->cancel();
1777 pending = nullptr;
1778 auto *pg = context< SnapTrimmer >().pg;
1779 pg->state_clear(PG_STATE_SNAPTRIM_WAIT);
1780 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
1781 pg->publish_stats_to_osd();
1782 }
1783 };
1784
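// Trimming is deferred while this PG is being scrubbed; ScrubComplete
// re-kicks the trimmer through NotTrimming.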
1785 struct WaitScrub : boost::statechart::state< WaitScrub, SnapTrimmer >, NamedState {
1786 typedef boost::mpl::list <
1787 boost::statechart::custom_reaction< ScrubComplete >,
1788 boost::statechart::custom_reaction< KickTrim >,
1789 boost::statechart::transition< Reset, NotTrimming >
1790 > reactions;
1791 explicit WaitScrub(my_context ctx)
1792 : my_base(ctx),
1793 NamedState(context< SnapTrimmer >().pg, "Trimming/WaitScrub") {
1794 context< SnapTrimmer >().log_enter(state_name);
1795 }
1796 void exit() {
1797 context< SnapTrimmer >().log_exit(state_name, enter_time);
1798 }
1799 boost::statechart::result react(const ScrubComplete&) {
1800 post_event(KickTrim());
1801 return transit< NotTrimming >();
1802 }
1803 boost::statechart::result react(const KickTrim&) {
1804 return discard_event();
1805 }
1806 };
1807
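// Idle state; the KickTrim reaction (implemented in PrimaryLogPG.cc) decides
// whether a new trimming pass should start.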
1808 struct NotTrimming : boost::statechart::state< NotTrimming, SnapTrimmer >, NamedState {
1809 typedef boost::mpl::list <
1810 boost::statechart::custom_reaction< KickTrim >,
1811 boost::statechart::transition< Reset, NotTrimming >
1812 > reactions;
1813 explicit NotTrimming(my_context ctx);
1814 void exit();
1815 boost::statechart::result react(const KickTrim&);
1816 };
1817
1818 int _verify_no_head_clones(const hobject_t& soid,
1819 const SnapSet& ss);
1820 // create the local object if one is needed; for a whiteout or when nothing
1821 // changes, no new object is created.
1822 void maybe_create_new_object(OpContext *ctx, bool ignore_transaction=false);
1823 int _delete_oid(OpContext *ctx, bool no_whiteout, bool try_no_whiteout);
1824 int _rollback_to(OpContext *ctx, ceph_osd_op& op);
1825 public:
1826 bool is_missing_object(const hobject_t& oid) const;
1827 bool is_unreadable_object(const hobject_t &oid) const {
1828 return is_missing_object(oid) ||
1829 !missing_loc.readable_with_acting(oid, actingset);
1830 }
1831 void maybe_kick_recovery(const hobject_t &soid);
1832 void wait_for_unreadable_object(const hobject_t& oid, OpRequestRef op);
1833 void wait_for_all_missing(OpRequestRef op);
1834
1835 bool is_degraded_or_backfilling_object(const hobject_t& oid);
1836 bool is_degraded_on_async_recovery_target(const hobject_t& soid);
1837 void wait_for_degraded_object(const hobject_t& oid, OpRequestRef op);
1838
1839 void block_write_on_full_cache(
1840 const hobject_t& oid, OpRequestRef op);
1841 void block_for_clean(
1842 const hobject_t& oid, OpRequestRef op);
1843 void block_write_on_snap_rollback(
1844 const hobject_t& oid, ObjectContextRef obc, OpRequestRef op);
1845 void block_write_on_degraded_snap(const hobject_t& oid, OpRequestRef op);
1846
1847 bool maybe_await_blocked_head(const hobject_t &soid, OpRequestRef op);
1848 void wait_for_blocked_object(const hobject_t& soid, OpRequestRef op);
1849 void kick_object_context_blocked(ObjectContextRef obc);
1850
1851 void maybe_force_recovery();
1852
1853 void mark_all_unfound_lost(
1854 int what,
1855 ConnectionRef con,
1856 ceph_tid_t tid);
1857 eversion_t pick_newest_available(const hobject_t& oid);
1858
1859 void do_update_log_missing(
1860 OpRequestRef &op);
1861
1862 void do_update_log_missing_reply(
1863 OpRequestRef &op);
1864
1865 void on_role_change() override;
1866 void on_pool_change() override;
1867 void _on_new_interval() override;
1868 void clear_async_reads();
1869 void on_change(ObjectStore::Transaction *t) override;
1870 void on_activate() override;
1871 void on_flushed() override;
1872 void on_removal(ObjectStore::Transaction *t) override;
1873 void on_shutdown() override;
1874 bool check_failsafe_full() override;
1875 bool check_osdmap_full(const set<pg_shard_t> &missing_on) override;
1876 bool maybe_preempt_replica_scrub(const hobject_t& oid) override {
1877 return write_blocked_by_scrub(oid);
1878 }
1879 int rep_repair_primary_object(const hobject_t& soid, OpContext *ctx);
1880
1881 // attr cache handling
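// These wrappers read and write object xattrs through the attr cache kept on
// the ObjectContext when the pool maintains one (notably erasure-coded
// pools), and fall back to the object store otherwise.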
1882 void setattr_maybe_cache(
1883 ObjectContextRef obc,
1884 PGTransaction *t,
1885 const string &key,
1886 bufferlist &val);
1887 void setattrs_maybe_cache(
1888 ObjectContextRef obc,
1889 PGTransaction *t,
1890 map<string, bufferlist> &attrs);
1891 void rmattr_maybe_cache(
1892 ObjectContextRef obc,
1893 PGTransaction *t,
1894 const string &key);
1895 int getattr_maybe_cache(
1896 ObjectContextRef obc,
1897 const string &key,
1898 bufferlist *val);
1899 int getattrs_maybe_cache(
1900 ObjectContextRef obc,
1901 map<string, bufferlist> *out);
1902
1903 public:
1904 void set_dynamic_perf_stats_queries(
1905 const std::list<OSDPerfMetricQuery> &queries) override;
1906 void get_dynamic_perf_stats(DynamicPerfStats *stats) override;
1907
1908 private:
1909 DynamicPerfStats m_dynamic_perf_stats;
1910 };
1911
1912 inline ostream& operator<<(ostream& out, const PrimaryLogPG::RepGather& repop)
1913 {
1914 out << "repgather(" << &repop
1915 << " " << repop.v
1916 << " rep_tid=" << repop.rep_tid
1917 << " committed?=" << repop.all_committed
1918 << " r=" << repop.r
1919 << ")";
1920 return out;
1921 }
1922
1923 inline ostream& operator<<(ostream& out,
1924 const PrimaryLogPG::ProxyWriteOpRef& pwop)
1925 {
1926 out << "proxywrite(" << &pwop
1927 << " " << pwop->user_version
1928 << " pwop_tid=" << pwop->objecter_tid;
1929 if (pwop->ctx->op)
1930 out << " op=" << *(pwop->ctx->op->get_req());
1931 out << ")";
1932 return out;
1933 }
1934
1935 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop);
1936 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop);
1937
1938
1939 #endif