// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#include <boost/intrusive_ptr.hpp>
#include <boost/tuple/tuple.hpp>

#include "PrimaryLogPG.h"

#include "cls/cas/cls_cas_ops.h"
#include "common/CDC.h"
#include "common/EventTrace.h"
#include "common/ceph_crypto.h"
#include "common/config.h"
#include "common/errno.h"
#include "common/perf_counters.h"
#include "common/scrub_types.h"
#include "include/compat.h"
#include "json_spirit/json_spirit_reader.h"
#include "json_spirit/json_spirit_value.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDOp.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGLog.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDScrubReserve.h"
#include "mon/MonClient.h"
#include "objclass/objclass.h"
#include "osd/ClassHandler.h"
#include "osdc/Objecter.h"
#include "osd/scrubber/PrimaryLogScrub.h"
#include "osd/scrubber/ScrubStore.h"
#include "osd/scrubber/pg_scrubber.h"

#include "OpRequest.h"

// required includes order:
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"
71 #include "tracing/osd.h"
73 #define tracepoint(...)
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
80 #define dout_prefix _prefix(_dout, this)
82 #include "osd_tracer.h"
84 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
91 using std::make_unique
;
93 using std::ostringstream
;
96 using std::string_view
;
97 using std::stringstream
;
98 using std::unique_ptr
;
101 using ceph::bufferlist
;
102 using ceph::bufferptr
;
103 using ceph::Formatter
;
105 using ceph::decode_noclear
;
107 using ceph::encode_destructively
;
109 using namespace ceph::osd::scheduler
;
110 using TOPNSPC::common::cmd_getval
;
111 using TOPNSPC::common::cmd_getval_or
;
113 template <typename T
>
114 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
115 return pg
->gen_prefix(*_dout
);
119 * The CopyCallback class defines an interface for completions to the
120 * copy_start code. Users of the copy infrastructure must implement
121 * one and give an instance of the class to start_copy.
123 * The implementer is responsible for making sure that the CopyCallback
124 * can associate itself with the correct copy operation.
126 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
130 * results.get<0>() is the return code: 0 for success; -ECANCELED if
131 * the operation was cancelled by the local OSD; -errno for other issues.
132 * results.get<1>() is a pointer to a CopyResults object, which you are
133 * responsible for deleting.
135 void finish(CopyCallbackResults results_
) override
= 0;
138 /// Provide the final size of the copied object to the CopyCallback
139 ~CopyCallback() override
{}
142 template <typename T
>
143 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
145 unique_ptr
<GenContext
<T
>> c
;
148 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
149 : pg(pg
), c(c
), e(e
) {}
150 void finish(T t
) override
{
151 std::scoped_lock locker
{*pg
};
152 if (pg
->pg_has_reset_since(e
))
155 c
.release()->complete(t
);
157 bool sync_finish(T t
) {
158 // we assume here all blessed/wrapped Contexts can complete synchronously.
159 c
.release()->complete(t
);
164 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
165 GenContext
<ThreadPool::TPHandle
&> *c
) {
166 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
167 this, c
, get_osdmap_epoch());
170 template <typename T
>
171 class PrimaryLogPG::UnlockedBlessedGenContext
: public GenContext
<T
> {
173 unique_ptr
<GenContext
<T
>> c
;
176 UnlockedBlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
177 : pg(pg
), c(c
), e(e
) {}
178 void finish(T t
) override
{
179 if (pg
->pg_has_reset_since(e
))
182 c
.release()->complete(t
);
184 bool sync_finish(T t
) {
185 // we assume here all blessed/wrapped Contexts can complete synchronously.
186 c
.release()->complete(t
);
191 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_unlocked_gencontext(
192 GenContext
<ThreadPool::TPHandle
&> *c
) {
193 return new UnlockedBlessedGenContext
<ThreadPool::TPHandle
&>(
194 this, c
, get_osdmap_epoch());
197 class PrimaryLogPG::BlessedContext
: public Context
{
199 unique_ptr
<Context
> c
;
202 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
203 : pg(pg
), c(c
), e(e
) {}
204 void finish(int r
) override
{
205 std::scoped_lock locker
{*pg
};
206 if (pg
->pg_has_reset_since(e
))
209 c
.release()->complete(r
);
211 bool sync_finish(int r
) override
{
212 // we assume here all blessed/wrapped Contexts can complete synchronously.
213 c
.release()->complete(r
);
218 Context
*PrimaryLogPG::bless_context(Context
*c
) {
219 return new BlessedContext(this, c
, get_osdmap_epoch());
222 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
226 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
228 void finish(int r
) override
{
229 pg
->object_context_destructor_callback(obc
);
233 struct OnReadComplete
: public Context
{
235 PrimaryLogPG::OpContext
*opcontext
;
238 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
239 void finish(int r
) override
{
240 opcontext
->finish_read(pg
);
242 ~OnReadComplete() override
{}
245 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
247 ObjectContextRef obc
;
249 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
251 bool sync_finish(int r
) override
{
252 pg
->_applied_recovered_object(obc
);
255 void finish(int r
) override
{
256 std::scoped_lock locker
{*pg
};
257 pg
->_applied_recovered_object(obc
);
261 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
264 eversion_t last_complete
;
266 C_OSD_CommittedPushedObject(
267 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
268 pg(p
), epoch(epoch
), last_complete(lc
) {
270 void finish(int r
) override
{
271 pg
->_committed_pushed_object(epoch
, last_complete
);
275 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
278 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
280 bool sync_finish(int r
) override
{
281 pg
->_applied_recovered_object_replica();
284 void finish(int r
) override
{
285 std::scoped_lock locker
{*pg
};
286 pg
->_applied_recovered_object_replica();
/// Kick off all queued async reads for this op context; OnReadComplete will
/// re-enter finish_read() when the backend completes them.
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  // NOTE(review): the first two arguments were dropped by the extraction and
  // reconstructed (target soid + the swapped read list) — verify upstream.
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
302 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
304 ceph_assert(inflightreads
> 0);
306 if (async_reads_complete()) {
307 ceph_assert(pg
->in_progress_async_reads
.size());
308 ceph_assert(pg
->in_progress_async_reads
.front().second
== this);
309 pg
->in_progress_async_reads
.pop_front();
311 // Restart the op context now that all reads have been
312 // completed. Read failures will be handled by the op finisher
313 pg
->execute_ctx(this);
317 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
319 PrimaryLogPG::CopyResults
*results
= nullptr;
320 PrimaryLogPG::OpContext
*ctx
;
322 uint32_t truncate_seq
;
323 uint64_t truncate_size
;
324 bool have_truncate
= false;
326 CopyFromCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
327 : ctx(ctx
), osd_op(osd_op
) {
329 ~CopyFromCallback() override
{}
331 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
332 results
= results_
.get
<1>();
333 int r
= results_
.get
<0>();
335 // Only use truncate_{seq,size} from the original object if the client
336 // did not sent us these parameters
337 if (!have_truncate
) {
338 truncate_seq
= results
->truncate_seq
;
339 truncate_size
= results
->truncate_size
;
342 // for finish_copyfrom
343 ctx
->user_at_version
= results
->user_version
;
346 ctx
->pg
->execute_ctx(ctx
);
348 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
350 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
351 } else if (results
->should_requeue
) {
353 ctx
->pg
->requeue_op(ctx
->op
);
355 ctx
->pg
->close_op_ctx(ctx
);
359 bool is_temp_obj_used() {
360 return results
->started_temp_obj
;
362 uint64_t get_data_size() {
363 return results
->object_size
;
365 void set_truncate(uint32_t seq
, uint64_t size
) {
367 truncate_size
= size
;
368 have_truncate
= true;
372 struct CopyFromFinisher
: public PrimaryLogPG::OpFinisher
{
373 CopyFromCallback
*copy_from_callback
;
375 explicit CopyFromFinisher(CopyFromCallback
*copy_from_callback
)
376 : copy_from_callback(copy_from_callback
) {
379 int execute() override
{
380 // instance will be destructed after this method completes
381 copy_from_callback
->ctx
->pg
->finish_copyfrom(copy_from_callback
);
386 // ======================
387 // PGBackend::Listener
389 void PrimaryLogPG::on_local_recover(
390 const hobject_t
&hoid
,
391 const ObjectRecoveryInfo
&_recovery_info
,
392 ObjectContextRef obc
,
394 ObjectStore::Transaction
*t
397 dout(10) << __func__
<< ": " << hoid
<< dendl
;
399 ObjectRecoveryInfo
recovery_info(_recovery_info
);
400 clear_object_snap_mapping(t
, hoid
);
401 if (!is_delete
&& recovery_info
.soid
.is_snap()) {
402 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
404 dout(20) << " snapset " << recovery_info
.ss
<< dendl
;
405 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
406 if (p
!= recovery_info
.ss
.clone_snaps
.end()) {
407 snaps
.insert(p
->second
.begin(), p
->second
.end());
408 dout(20) << " snaps " << snaps
<< dendl
;
414 derr
<< __func__
<< " " << hoid
<< " had no clone_snaps" << dendl
;
417 if (!is_delete
&& recovery_state
.get_pg_log().get_missing().is_missing(recovery_info
.soid
) &&
418 recovery_state
.get_pg_log().get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
419 ceph_assert(is_primary());
420 const pg_log_entry_t
*latest
= recovery_state
.get_pg_log().get_log().objects
.find(recovery_info
.soid
)->second
;
421 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
422 latest
->reverting_to
== recovery_info
.version
) {
423 dout(10) << " got old revert version " << recovery_info
.version
424 << " for " << *latest
<< dendl
;
425 recovery_info
.version
= latest
->version
;
426 // update the attr to the revert event version
427 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
428 recovery_info
.oi
.version
= latest
->version
;
430 encode(recovery_info
.oi
, bl
,
431 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
432 ceph_assert(!pool
.info
.is_erasure());
433 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
435 obc
->attr_cache
[OI_ATTR
] = bl
;
439 // keep track of active pushes for scrub
442 recovery_state
.recover_got(
444 recovery_info
.version
,
450 obc
->obs
.exists
= true;
452 bool got
= obc
->get_recovery_read();
455 ceph_assert(recovering
.count(obc
->obs
.oi
.soid
));
456 recovering
[obc
->obs
.oi
.soid
] = obc
;
457 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
460 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
462 publish_stats_to_osd();
463 release_backoffs(hoid
);
464 if (!is_unreadable_object(hoid
)) {
465 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
466 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
467 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
468 requeue_ops(unreadable_object_entry
->second
);
469 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
473 t
->register_on_applied(
474 new C_OSD_AppliedRecoveredObjectReplica(this));
478 t
->register_on_commit(
479 new C_OSD_CommittedPushedObject(
482 info
.last_complete
));
485 void PrimaryLogPG::on_global_recover(
486 const hobject_t
&soid
,
487 const object_stat_sum_t
&stat_diff
,
490 recovery_state
.object_recovered(soid
, stat_diff
);
491 publish_stats_to_osd();
492 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
493 auto i
= recovering
.find(soid
);
494 ceph_assert(i
!= recovering
.end());
496 if (i
->second
&& i
->second
->rwstate
.recovery_read_marker
) {
497 // recover missing won't have had an obc, but it gets filled in
498 // during on_local_recover
499 ceph_assert(i
->second
);
500 list
<OpRequestRef
> requeue_list
;
501 i
->second
->drop_recovery_read(&requeue_list
);
502 requeue_ops(requeue_list
);
505 backfills_in_flight
.erase(soid
);
508 finish_recovery_op(soid
);
509 release_backoffs(soid
);
510 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
511 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
512 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
513 requeue_ops(degraded_object_entry
->second
);
514 waiting_for_degraded_object
.erase(degraded_object_entry
);
516 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
517 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
518 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
519 requeue_ops(unreadable_object_entry
->second
);
520 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
522 finish_degraded_object(soid
);
525 void PrimaryLogPG::schedule_recovery_work(
526 GenContext
<ThreadPool::TPHandle
&> *c
,
529 osd
->queue_recovery_context(
531 recovery_state
.get_recovery_op_priority());
534 void PrimaryLogPG::replica_clear_repop_obc(
535 const vector
<pg_log_entry_t
> &logv
,
536 ObjectStore::Transaction
&t
)
538 for (auto &&e
: logv
) {
539 /* Have to blast all clones, they share a snapset */
540 object_contexts
.clear_range(
541 e
.soid
.get_object_boundary(), e
.soid
.get_head());
543 snapset_contexts
.find(e
.soid
.get_head()) ==
544 snapset_contexts
.end());
548 bool PrimaryLogPG::should_send_op(
550 const hobject_t
&hoid
) {
551 if (peer
== get_primary())
553 ceph_assert(recovery_state
.has_peer_info(peer
));
555 hoid
.pool
!= (int64_t)info
.pgid
.pool() ||
556 hoid
<= last_backfill_started
||
557 hoid
<= recovery_state
.get_peer_info(peer
).last_backfill
;
559 ceph_assert(is_backfill_target(peer
));
560 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
561 << ", object " << hoid
562 << " beyond std::max(last_backfill_started "
563 << ", peer_info[peer].last_backfill "
564 << recovery_state
.get_peer_info(peer
).last_backfill
568 if (is_async_recovery_target(peer
) &&
569 recovery_state
.get_peer_missing(peer
).is_missing(hoid
)) {
571 dout(10) << __func__
<< " issue_repop shipping empty opt to osd." << peer
572 << ", object " << hoid
573 << " which is pending recovery in async_recovery_targets" << dendl
;
579 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
580 int peer
, epoch_t from_epoch
)
582 return osd
->get_con_osd_cluster(peer
, from_epoch
);
585 PerfCounters
*PrimaryLogPG::get_logger()
591 // ====================
594 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
596 return recovery_state
.get_pg_log().get_missing().get_items().count(soid
);
599 void PrimaryLogPG::maybe_kick_recovery(
600 const hobject_t
&soid
)
603 bool work_started
= false;
604 if (!recovery_state
.get_missing_loc().needs_recovery(soid
, &v
))
607 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
608 if (p
!= recovering
.end()) {
609 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
610 } else if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
611 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
613 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
614 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
615 if (is_missing_object(soid
)) {
616 recover_missing(soid
, v
, CEPH_MSG_PRIO_HIGH
, h
);
617 } else if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
618 prep_object_replica_deletes(soid
, v
, h
, &work_started
);
620 prep_object_replica_pushes(soid
, v
, h
, &work_started
);
622 pgbackend
->run_recovery_op(h
, CEPH_MSG_PRIO_HIGH
);
626 void PrimaryLogPG::wait_for_unreadable_object(
627 const hobject_t
& soid
, OpRequestRef op
)
629 ceph_assert(is_unreadable_object(soid
));
630 maybe_kick_recovery(soid
);
631 waiting_for_unreadable_object
[soid
].push_back(op
);
632 op
->mark_delayed("waiting for missing object");
633 osd
->logger
->inc(l_osd_op_delayed_unreadable
);
636 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
638 /* The conditions below may clear (on_local_recover, before we queue
639 * the transaction) before we actually requeue the degraded waiters
640 * in on_global_recover after the transaction completes.
642 if (waiting_for_degraded_object
.count(soid
))
644 if (recovery_state
.get_pg_log().get_missing().get_items().count(soid
))
646 ceph_assert(!get_acting_recovery_backfill().empty());
647 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
648 i
!= get_acting_recovery_backfill().end();
650 if (*i
== get_primary()) continue;
651 pg_shard_t peer
= *i
;
652 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(peer
);
653 // If an object is missing on an async_recovery_target, return false.
654 // This will not block the op and the object is async recovered later.
655 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
656 peer_missing_entry
->second
.get_items().count(soid
)) {
657 if (is_async_recovery_target(peer
))
662 // Object is degraded if after last_backfill AND
663 // we are backfilling it
664 if (is_backfill_target(peer
) &&
665 recovery_state
.get_peer_info(peer
).last_backfill
<= soid
&&
666 last_backfill_started
>= soid
&&
667 backfills_in_flight
.count(soid
))
673 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t
& soid
)
675 for (auto &i
: get_async_recovery_targets()) {
676 auto peer_missing_entry
= recovery_state
.get_peer_missing().find(i
);
677 if (peer_missing_entry
!= recovery_state
.get_peer_missing().end() &&
678 peer_missing_entry
->second
.get_items().count(soid
)) {
679 dout(30) << __func__
<< " " << soid
<< dendl
;
686 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
688 ceph_assert(is_degraded_or_backfilling_object(soid
) || is_degraded_on_async_recovery_target(soid
));
690 maybe_kick_recovery(soid
);
691 waiting_for_degraded_object
[soid
].push_back(op
);
692 op
->mark_delayed("waiting for degraded object");
693 osd
->logger
->inc(l_osd_op_delayed_degraded
);
696 void PrimaryLogPG::block_write_on_full_cache(
697 const hobject_t
& _oid
, OpRequestRef op
)
699 const hobject_t oid
= _oid
.get_head();
700 dout(20) << __func__
<< ": blocking object " << oid
701 << " on full cache" << dendl
;
702 objects_blocked_on_cache_full
.insert(oid
);
703 waiting_for_cache_not_full
.push_back(op
);
704 op
->mark_delayed("waiting for cache not full");
707 void PrimaryLogPG::block_for_clean(
708 const hobject_t
& oid
, OpRequestRef op
)
710 dout(20) << __func__
<< ": blocking object " << oid
711 << " on primary repair" << dendl
;
712 waiting_for_clean_to_primary_repair
.push_back(op
);
713 op
->mark_delayed("waiting for clean to repair");
716 void PrimaryLogPG::block_write_on_snap_rollback(
717 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
719 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
720 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
721 // otherwise, we'd have blocked in do_op
722 ceph_assert(oid
.is_head());
723 ceph_assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
725 * We block the head object here.
727 * Let's assume that there is racing read When the head object is being rollbacked.
728 * Since the two different ops can trigger promote_object() with the same source,
729 * infinite loop happens by canceling ops each other.
730 * To avoid this, we block the head object during rollback.
731 * So, the racing read will be blocked until the rollback is completed.
732 * see also: https://tracker.ceph.com/issues/49726
734 ObjectContextRef head_obc
= get_object_context(oid
, false);
735 head_obc
->start_block();
736 objects_blocked_on_snap_promotion
[oid
] = obc
;
737 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
740 void PrimaryLogPG::block_write_on_degraded_snap(
741 const hobject_t
& snap
, OpRequestRef op
)
743 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
744 << " on degraded snap " << snap
<< dendl
;
745 // otherwise, we'd have blocked in do_op
746 ceph_assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
747 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
748 wait_for_degraded_object(snap
, op
);
751 bool PrimaryLogPG::maybe_await_blocked_head(
752 const hobject_t
&hoid
,
755 ObjectContextRef obc
;
756 obc
= object_contexts
.lookup(hoid
.get_head());
758 if (obc
->is_blocked()) {
759 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
768 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
770 dout(10) << __func__
<< " " << soid
<< " " << *op
->get_req() << dendl
;
771 waiting_for_blocked_object
[soid
].push_back(op
);
772 op
->mark_delayed("waiting for blocked object");
775 void PrimaryLogPG::maybe_force_recovery()
777 // no force if not in degraded/recovery/backfill states
778 if (!is_degraded() &&
779 !state_test(PG_STATE_RECOVERING
|
780 PG_STATE_RECOVERY_WAIT
|
781 PG_STATE_BACKFILLING
|
782 PG_STATE_BACKFILL_WAIT
|
783 PG_STATE_BACKFILL_TOOFULL
))
786 if (recovery_state
.get_pg_log().get_log().approx_size() <
787 cct
->_conf
->osd_max_pg_log_entries
*
788 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
791 // find the oldest missing object
792 version_t min_version
= recovery_state
.get_pg_log().get_log().head
.version
;
794 if (!recovery_state
.get_pg_log().get_missing().get_rmissing().empty()) {
795 min_version
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->first
;
796 soid
= recovery_state
.get_pg_log().get_missing().get_rmissing().begin()->second
;
798 ceph_assert(!get_acting_recovery_backfill().empty());
799 for (set
<pg_shard_t
>::iterator it
= get_acting_recovery_backfill().begin();
800 it
!= get_acting_recovery_backfill().end();
802 if (*it
== get_primary()) continue;
803 pg_shard_t peer
= *it
;
804 auto it_missing
= recovery_state
.get_peer_missing().find(peer
);
805 if (it_missing
!= recovery_state
.get_peer_missing().end() &&
806 !it_missing
->second
.get_rmissing().empty()) {
807 const auto& min_obj
= recovery_state
.get_peer_missing(peer
).get_rmissing().begin();
808 dout(20) << __func__
<< " peer " << peer
<< " min_version " << min_obj
->first
809 << " oid " << min_obj
->second
<< dendl
;
810 if (min_version
> min_obj
->first
) {
811 min_version
= min_obj
->first
;
812 soid
= min_obj
->second
;
818 if (soid
!= hobject_t())
819 maybe_kick_recovery(soid
);
822 bool PrimaryLogPG::check_laggy(OpRequestRef
& op
)
824 assert(HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
826 if (state_test(PG_STATE_WAIT
)) {
827 dout(10) << __func__
<< " PG is WAIT state" << dendl
;
828 } else if (!state_test(PG_STATE_LAGGY
)) {
829 auto mnow
= osd
->get_mnow();
830 auto ru
= recovery_state
.get_readable_until();
837 << " > readable_until " << ru
<< dendl
;
840 osd
->reply_op_error(op
, -EAGAIN
);
845 state_set(PG_STATE_LAGGY
);
846 publish_stats_to_osd();
848 dout(10) << __func__
<< " not readable" << dendl
;
849 waiting_for_readable
.push_back(op
);
850 op
->mark_delayed("waiting for readable");
854 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef
& op
)
856 assert(HAVE_FEATURE(recovery_state
.get_min_upacting_features(),
858 if (!state_test(PG_STATE_WAIT
) && !state_test(PG_STATE_LAGGY
)) {
859 return true; // not laggy
861 dout(10) << __func__
<< " not readable" << dendl
;
862 waiting_for_readable
.push_front(op
);
863 op
->mark_delayed("waiting for readable");
867 void PrimaryLogPG::recheck_readable()
869 if (!is_wait() && !is_laggy()) {
870 dout(20) << __func__
<< " wasn't wait or laggy" << dendl
;
873 auto mnow
= osd
->get_mnow();
876 auto prior_readable_until_ub
= recovery_state
.get_prior_readable_until_ub();
877 if (mnow
< prior_readable_until_ub
) {
878 dout(10) << __func__
<< " still wait (mnow " << mnow
879 << " < prior_readable_until_ub " << prior_readable_until_ub
882 dout(10) << __func__
<< " no longer wait (mnow " << mnow
883 << " >= prior_readable_until_ub " << prior_readable_until_ub
885 state_clear(PG_STATE_WAIT
);
886 recovery_state
.clear_prior_readable_until_ub();
891 auto ru
= recovery_state
.get_readable_until();
892 if (ru
== ceph::signedspan::zero()) {
893 dout(10) << __func__
<< " still laggy (mnow " << mnow
894 << ", readable_until zero)" << dendl
;
895 } else if (mnow
>= ru
) {
896 dout(10) << __func__
<< " still laggy (mnow " << mnow
897 << " >= readable_until " << ru
<< ")" << dendl
;
899 dout(10) << __func__
<< " no longer laggy (mnow " << mnow
900 << " < readable_until " << ru
<< ")" << dendl
;
901 state_clear(PG_STATE_LAGGY
);
906 publish_stats_to_osd();
908 if (!is_laggy() && !is_wait()) {
909 requeue_ops(waiting_for_readable
);
913 bool PrimaryLogPG::pgls_filter(const PGLSFilter
& filter
, const hobject_t
& sobj
)
917 // If filter has expressed an interest in an xattr, load it.
918 if (!filter
.get_xattr().empty()) {
919 int ret
= pgbackend
->objects_get_attr(
923 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
.get_xattr() << ") returned " << ret
<< dendl
;
925 if (ret
!= -ENODATA
|| filter
.reject_empty_xattr()) {
931 return filter
.filter(sobj
, bl
);
934 std::pair
<int, std::unique_ptr
<const PGLSFilter
>>
935 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator
& iter
)
938 // storing non-const PGLSFilter for the sake of ::init()
939 std::unique_ptr
<PGLSFilter
> filter
;
944 catch (ceph::buffer::error
& e
) {
945 return { -EINVAL
, nullptr };
948 if (type
.compare("plain") == 0) {
949 filter
= std::make_unique
<PGLSPlainFilter
>();
951 std::size_t dot
= type
.find('.');
952 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
953 return { -EINVAL
, nullptr };
956 const std::string class_name
= type
.substr(0, dot
);
957 const std::string filter_name
= type
.substr(dot
+ 1);
958 ClassHandler::ClassData
*cls
= NULL
;
959 int r
= ClassHandler::get_instance().open_class(class_name
, &cls
);
961 derr
<< "Error opening class '" << class_name
<< "': "
962 << cpp_strerror(r
) << dendl
;
963 if (r
!= -EPERM
) // propagate permission error
965 return { r
, nullptr };
970 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
971 if (class_filter
== NULL
) {
972 derr
<< "Error finding filter '" << filter_name
<< "' in class "
973 << class_name
<< dendl
;
974 return { -EINVAL
, nullptr };
976 filter
.reset(class_filter
->fn());
978 // Object classes are obliged to return us something, but let's
979 // give an error rather than asserting out.
980 derr
<< "Buggy class " << class_name
<< " failed to construct "
981 "filter " << filter_name
<< dendl
;
982 return { -EINVAL
, nullptr };
987 int r
= filter
->init(iter
);
989 derr
<< "Error initializing filter " << type
<< ": "
990 << cpp_strerror(r
) << dendl
;
991 return { -EINVAL
, nullptr };
993 // Successfully constructed and initialized, return it.
994 return std::make_pair(0, std::move(filter
));
999 // ==========================================================
1001 void PrimaryLogPG::do_command(
1002 const string_view
& orig_prefix
,
1003 const cmdmap_t
& cmdmap
,
1004 const bufferlist
& idata
,
1005 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
1008 cmd_getval(cmdmap
, "format", format
);
1009 auto f(Formatter::create_unique(format
, "json-pretty", "json-pretty"));
1011 stringstream ss
; // stderr error message stream
1012 bufferlist outbl
; // if empty at end, we'll dump formatter as output
1014 // get final prefix:
1015 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
1016 // - ceph tell <pgid> foo -> prefix=foo
1017 string
prefix(orig_prefix
);
1019 cmd_getval(cmdmap
, "cmd", command
);
1020 if (command
.size()) {
1024 if (prefix
== "query") {
1025 f
->open_object_section("pg");
1026 f
->dump_stream("snap_trimq") << snap_trimq
;
1027 f
->dump_unsigned("snap_trimq_len", snap_trimq
.size());
1028 recovery_state
.dump_peering_state(f
.get());
1030 f
->open_array_section("recovery_state");
1031 handle_query_state(f
.get());
1034 if (is_primary() && is_active() && m_scrubber
) {
1035 m_scrubber
->dump_scrubber(f
.get(), m_planned_scrub
);
1038 f
->open_object_section("agent_state");
1040 agent_state
->dump(f
.get());
1045 else if (prefix
== "log") {
1047 f
->open_object_section("op_log");
1048 f
->open_object_section("pg_log_t");
1049 recovery_state
.get_pg_log().get_log().dump(f
.get());
1053 else if (prefix
== "mark_unfound_lost") {
1055 cmd_getval(cmdmap
, "mulcmd", mulcmd
);
1057 if (mulcmd
== "revert") {
1058 if (pool
.info
.is_erasure()) {
1059 ss
<< "mode must be 'delete' for ec pool";
1063 mode
= pg_log_entry_t::LOST_REVERT
;
1064 } else if (mulcmd
== "delete") {
1065 mode
= pg_log_entry_t::LOST_DELETE
;
1067 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
1071 ceph_assert(mode
== pg_log_entry_t::LOST_REVERT
||
1072 mode
== pg_log_entry_t::LOST_DELETE
);
1074 if (!is_primary()) {
1075 ss
<< "not primary";
1080 uint64_t unfound
= recovery_state
.get_missing_loc().num_unfound();
1082 ss
<< "pg has no unfound objects";
1083 goto out
; // make command idempotent
1086 if (!recovery_state
.all_unfound_are_queried_or_lost(get_osdmap())) {
1087 ss
<< "pg has " << unfound
1088 << " unfound objects but we haven't probed all sources, not marking lost";
1093 mark_all_unfound_lost(mode
, on_finish
);
1097 else if (prefix
== "list_unfound") {
1100 bool show_offset
= false;
1101 if (cmd_getval(cmdmap
, "offset", offset_json
)) {
1102 json_spirit::Value v
;
1104 if (!json_spirit::read(offset_json
, v
))
1105 throw std::runtime_error("bad json");
1107 } catch (std::runtime_error
& e
) {
1108 ss
<< "error parsing offset: " << e
.what();
1114 f
->open_object_section("missing");
1116 f
->open_object_section("offset");
1117 offset
.dump(f
.get());
1120 auto &needs_recovery_map
= recovery_state
.get_missing_loc()
1121 .get_needs_recovery();
1122 f
->dump_int("num_missing", needs_recovery_map
.size());
1123 f
->dump_int("num_unfound", get_num_unfound());
1124 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1125 needs_recovery_map
.upper_bound(offset
);
1127 f
->open_array_section("objects");
1129 for (; p
!= needs_recovery_map
.end() &&
1130 num
< cct
->_conf
->osd_command_max_records
;
1132 if (recovery_state
.get_missing_loc().is_unfound(p
->first
)) {
1133 f
->open_object_section("object");
1135 f
->open_object_section("oid");
1136 p
->first
.dump(f
.get());
1139 p
->second
.dump(f
.get()); // have, need keys
1141 f
->open_array_section("locations");
1142 for (auto &&r
: recovery_state
.get_missing_loc().get_locations(
1144 f
->dump_stream("shard") << r
;
1154 // Get possible locations of missing objects from pg information
1155 PeeringState::QueryUnfound
q(f
.get());
1156 recovery_state
.handle_event(q
, 0);
1157 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1161 else if (prefix
== "scrub" ||
1162 prefix
== "deep_scrub") {
1163 bool deep
= (prefix
== "deep_scrub");
1164 int64_t time
= cmd_getval_or
<int64_t>(cmdmap
, "time", 0);
1167 const pg_pool_t
*p
= &pool
.info
;
1168 double pool_scrub_max_interval
= 0;
1169 double scrub_max_interval
;
1171 p
->opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &pool_scrub_max_interval
);
1172 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1173 pool_scrub_max_interval
: g_conf()->osd_deep_scrub_interval
;
1175 p
->opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &pool_scrub_max_interval
);
1176 scrub_max_interval
= pool_scrub_max_interval
> 0 ?
1177 pool_scrub_max_interval
: g_conf()->osd_scrub_max_interval
;
1179 // Instead of marking must_scrub force a schedule scrub
1180 utime_t stamp
= ceph_clock_now();
1182 stamp
-= scrub_max_interval
;
1184 stamp
-= (float)time
;
1185 stamp
-= 100.0; // push back last scrub more for good measure
1187 set_last_deep_scrub_stamp(stamp
);
1189 set_last_scrub_stamp(stamp
); // for 'deep' as well, as we use this value to order scrubs
1190 f
->open_object_section("result");
1191 f
->dump_bool("deep", deep
);
1192 f
->dump_stream("stamp") << stamp
;
1195 ss
<< "Not primary";
1198 outbl
.append(ss
.str());
1201 else if (prefix
== "block" || prefix
== "unblock" || prefix
== "set" ||
1202 prefix
== "unset") {
1204 cmd_getval(cmdmap
, "value", value
);
1207 ret
= m_scrubber
->asok_debug(prefix
, value
, f
.get(), ss
);
1208 f
->open_object_section("result");
1209 f
->dump_bool("success", true);
1212 ss
<< "Not primary";
1215 outbl
.append(ss
.str());
1219 ss
<< "prefix '" << prefix
<< "' not implemented";
1223 if (ret
>= 0 && outbl
.length() == 0) {
1226 on_finish(ret
, ss
.str(), outbl
);
1230 // ==========================================================
1232 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1234 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
1235 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1236 dout(10) << "do_pg_op " << *m
<< dendl
;
1241 string cname
, mname
;
1243 snapid_t snapid
= m
->get_snapid();
1245 vector
<OSDOp
> ops
= m
->ops
;
1247 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1248 std::unique_ptr
<const PGLSFilter
> filter
;
1250 auto bp
= p
->indata
.cbegin();
1252 case CEPH_OSD_OP_PGNLS_FILTER
:
1257 catch (const ceph::buffer::error
& e
) {
1258 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1262 std::tie(result
, filter
) = get_pgls_filter(bp
);
1266 ceph_assert(filter
);
1270 case CEPH_OSD_OP_PGNLS
:
1271 if (snapid
!= CEPH_NOSNAP
) {
1275 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1276 dout(10) << " pgnls pg=" << m
->get_pg()
1277 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1278 << " != " << info
.pgid
<< dendl
;
1281 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1284 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
1286 // read into a buffer
1287 vector
<hobject_t
> sentries
;
1288 pg_nls_response_t response
;
1290 decode(response
.handle
, bp
);
1292 catch (const ceph::buffer::error
& e
) {
1293 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1299 hobject_t lower_bound
= response
.handle
;
1300 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1301 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1302 dout(10) << " pgnls lower_bound " << lower_bound
1303 << " pg_end " << pg_end
<< dendl
;
1304 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1305 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1306 // this should only happen with a buggy client.
1307 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1313 hobject_t current
= lower_bound
;
1314 int r
= pgbackend
->objects_list_partial(
1325 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1326 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1327 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1328 hobject_t _max
= hobject_t::get_max();
1330 const hobject_t
&mcand
=
1331 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1333 missing_iter
->first
;
1334 const hobject_t
&lcand
=
1335 ls_iter
== sentries
.end() ?
1339 hobject_t candidate
;
1340 if (mcand
== lcand
) {
1342 if (!mcand
.is_max()) {
1346 } else if (mcand
< lcand
) {
1348 ceph_assert(!mcand
.is_max());
1352 ceph_assert(!lcand
.is_max());
1356 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1357 << " vs lower bound 0x" << lower_bound
.get_hash()
1358 << std::dec
<< dendl
;
1360 if (candidate
>= next
) {
1364 if (response
.entries
.size() == list_size
) {
1369 if (candidate
.snap
!= CEPH_NOSNAP
)
1372 // skip internal namespace
1373 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1376 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1379 // skip wrong namespace
1380 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1381 candidate
.get_namespace() != m
->get_hobj().nspace
)
1384 if (filter
&& !pgls_filter(*filter
, candidate
))
1387 dout(20) << "pgnls item 0x" << std::hex
1388 << candidate
.get_hash()
1389 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1391 << candidate
.oid
.name
<< dendl
;
1393 librados::ListObjectImpl item
;
1394 item
.nspace
= candidate
.get_namespace();
1395 item
.oid
= candidate
.oid
.name
;
1396 item
.locator
= candidate
.get_key();
1397 response
.entries
.push_back(item
);
1400 if (next
.is_max() &&
1401 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1402 ls_iter
== sentries
.end()) {
1405 // Set response.handle to the start of the next PG according
1406 // to the object sort order.
1407 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1409 response
.handle
= next
;
1411 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1412 encode(response
, osd_op
.outdata
);
1413 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1414 << osd_op
.outdata
.length() << dendl
;
1418 case CEPH_OSD_OP_PGLS_FILTER
:
1423 catch (const ceph::buffer::error
& e
) {
1424 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1428 std::tie(result
, filter
) = get_pgls_filter(bp
);
1432 ceph_assert(filter
);
1436 case CEPH_OSD_OP_PGLS
:
1437 if (snapid
!= CEPH_NOSNAP
) {
1441 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1442 dout(10) << " pgls pg=" << m
->get_pg()
1443 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1444 << " != " << info
.pgid
<< dendl
;
1447 unsigned list_size
= std::min
<uint64_t>(cct
->_conf
->osd_max_pgls
,
1450 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1451 // read into a buffer
1452 vector
<hobject_t
> sentries
;
1453 pg_ls_response_t response
;
1455 decode(response
.handle
, bp
);
1457 catch (const ceph::buffer::error
& e
) {
1458 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1464 hobject_t current
= response
.handle
;
1465 int r
= pgbackend
->objects_list_partial(
1476 ceph_assert(snapid
== CEPH_NOSNAP
|| recovery_state
.get_pg_log().get_missing().get_items().empty());
1478 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1479 recovery_state
.get_pg_log().get_missing().get_items().lower_bound(current
);
1480 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1481 hobject_t _max
= hobject_t::get_max();
1483 const hobject_t
&mcand
=
1484 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() ?
1486 missing_iter
->first
;
1487 const hobject_t
&lcand
=
1488 ls_iter
== sentries
.end() ?
1492 hobject_t candidate
;
1493 if (mcand
== lcand
) {
1495 if (!mcand
.is_max()) {
1499 } else if (mcand
< lcand
) {
1501 ceph_assert(!mcand
.is_max());
1505 ceph_assert(!lcand
.is_max());
1509 if (candidate
>= next
) {
1513 if (response
.entries
.size() == list_size
) {
1518 if (candidate
.snap
!= CEPH_NOSNAP
)
1521 // skip wrong namespace
1522 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1525 if (recovery_state
.get_missing_loc().is_deleted(candidate
))
1528 if (filter
&& !pgls_filter(*filter
, candidate
))
1531 response
.entries
.push_back(make_pair(candidate
.oid
,
1532 candidate
.get_key()));
1534 if (next
.is_max() &&
1535 missing_iter
== recovery_state
.get_pg_log().get_missing().get_items().end() &&
1536 ls_iter
== sentries
.end()) {
1539 response
.handle
= next
;
1540 encode(response
, osd_op
.outdata
);
1541 dout(10) << " pgls result=" << result
<< " outdata.length()="
1542 << osd_op
.outdata
.length() << dendl
;
1546 case CEPH_OSD_OP_PG_HITSET_LS
:
1548 list
< pair
<utime_t
,utime_t
> > ls
;
1549 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1550 p
!= info
.hit_set
.history
.end();
1552 ls
.push_back(make_pair(p
->begin
, p
->end
));
1554 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1555 encode(ls
, osd_op
.outdata
);
1559 case CEPH_OSD_OP_PG_HITSET_GET
:
1561 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1562 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1563 // read the current in-memory HitSet, not the version we've
1569 encode(*hit_set
, osd_op
.outdata
);
1570 result
= osd_op
.outdata
.length();
1572 // read an archived HitSet.
1574 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1575 p
!= info
.hit_set
.history
.end();
1577 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1578 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1582 if (oid
== hobject_t()) {
1586 if (!pool
.info
.is_replicated()) {
1587 // FIXME: EC not supported yet
1588 result
= -EOPNOTSUPP
;
1591 if (is_unreadable_object(oid
)) {
1592 wait_for_unreadable_object(oid
, op
);
1595 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1600 case CEPH_OSD_OP_SCRUBLS
:
1601 result
= do_scrub_ls(m
, &osd_op
);
1614 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(),
1615 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1617 reply
->claim_op_out_data(ops
);
1618 reply
->set_result(result
);
1619 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1620 osd
->send_message_osd_client(reply
, m
->get_connection());
1623 int PrimaryLogPG::do_scrub_ls(const MOSDOp
*m
, OSDOp
*osd_op
)
1625 if (m
->get_pg() != info
.pgid
.pgid
) {
1626 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1627 return -EINVAL
; // hmm?
1629 auto bp
= osd_op
->indata
.cbegin();
1633 } catch (ceph::buffer::error
&) {
1634 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1639 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1641 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1644 bool store_queried
= m_scrubber
&& m_scrubber
->get_store_errors(arg
, result
);
1645 if (store_queried
) {
1646 encode(result
, osd_op
->outdata
);
1648 // the scrubber's store is not initialized
1657 * Grabs locks for OpContext, should be cleaned up in close_op_ctx
1659 * @param ctx [in,out] ctx to get locks for
1660 * @return true on success, false if we are queued
1662 bool PrimaryLogPG::get_rw_locks(bool write_ordered
, OpContext
*ctx
)
1664 /* If head_obc, !obc->obs->exists and we will always take the
1665 * snapdir lock *before* the head lock. Since all callers will do
1666 * this (read or write) if we get the first we will be guaranteed
1667 * to get the second.
1669 if (write_ordered
&& ctx
->op
->may_read()) {
1670 ctx
->lock_type
= RWState::RWEXCL
;
1671 } else if (write_ordered
) {
1672 ctx
->lock_type
= RWState::RWWRITE
;
1674 ceph_assert(ctx
->op
->may_read());
1675 ctx
->lock_type
= RWState::RWREAD
;
1678 if (ctx
->head_obc
) {
1679 ceph_assert(!ctx
->obc
->obs
.exists
);
1680 if (!ctx
->lock_manager
.get_lock_type(
1682 ctx
->head_obc
->obs
.oi
.soid
,
1685 ctx
->lock_type
= RWState::RWNONE
;
1689 if (ctx
->lock_manager
.get_lock_type(
1691 ctx
->obc
->obs
.oi
.soid
,
1696 ceph_assert(!ctx
->head_obc
);
1697 ctx
->lock_type
= RWState::RWNONE
;
1705 * @param manager [in] manager with locks to release
1707 void PrimaryLogPG::release_object_locks(
1708 ObcLockManager
&lock_manager
) {
1709 std::list
<std::pair
<ObjectContextRef
, std::list
<OpRequestRef
> > > to_req
;
1710 bool requeue_recovery
= false;
1711 bool requeue_snaptrim
= false;
1712 lock_manager
.put_locks(
1716 if (requeue_recovery
)
1718 if (requeue_snaptrim
)
1719 snap_trimmer_machine
.process_event(TrimWriteUnblocked());
1721 if (!to_req
.empty()) {
1722 // requeue at front of scrub blocking queue if we are blocked by scrub
1723 for (auto &&p
: to_req
) {
1724 if (m_scrubber
->write_blocked_by_scrub(p
.first
->obs
.oi
.soid
.get_head())) {
1725 for (auto& op
: p
.second
) {
1726 op
->mark_delayed("waiting for scrub");
1729 waiting_for_scrub
.splice(
1730 waiting_for_scrub
.begin(),
1734 } else if (is_laggy()) {
1735 for (auto& op
: p
.second
) {
1736 op
->mark_delayed("waiting for readable");
1738 waiting_for_readable
.splice(
1739 waiting_for_readable
.begin(),
1744 requeue_ops(p
.second
);
1750 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1751 const PGPool
&_pool
,
1752 const map
<string
,string
>& ec_profile
, spg_t p
) :
1753 PG(o
, curmap
, _pool
, p
),
1755 PGBackend::build_pg_backend(
1756 _pool
.info
, ec_profile
, this, coll_t(p
), ch
, o
->store
, cct
)),
1757 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1758 new_backfill(false),
1760 snap_trimmer_machine(this)
1762 recovery_state
.set_backend_predicates(
1763 pgbackend
->get_is_readable_predicate(),
1764 pgbackend
->get_is_recoverable_predicate());
1765 snap_trimmer_machine
.initiate();
1767 m_scrubber
= make_unique
<PrimaryLogScrub
>(this);
1770 PrimaryLogPG::~PrimaryLogPG()
1775 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1778 if (oloc
.key
.empty())
1779 src_oloc
.key
= oid
.name
;
1782 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1784 auto m
= op
->get_req
<MOSDBackoff
>();
1785 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1788 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1789 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1790 if (begin
< m
->begin
) {
1796 dout(10) << __func__
<< " backoff ack id " << m
->id
1797 << " [" << begin
<< "," << end
<< ")" << dendl
;
1798 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1801 void PrimaryLogPG::do_request(
1803 ThreadPool::TPHandle
&handle
)
1805 if (op
->osd_trace
) {
1806 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1807 op
->pg_trace
.event("do request");
1811 // make sure we have a new enough map
1812 auto p
= waiting_for_map
.find(op
->get_source());
1813 if (p
!= waiting_for_map
.end()) {
1814 // preserve ordering
1815 dout(20) << __func__
<< " waiting_for_map "
1816 << p
->first
<< " not empty, queueing" << dendl
;
1817 p
->second
.push_back(op
);
1818 op
->mark_delayed("waiting_for_map not empty");
1821 if (!have_same_or_newer_map(op
->min_epoch
)) {
1822 dout(20) << __func__
<< " min " << op
->min_epoch
1823 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1824 waiting_for_map
[op
->get_source()].push_back(op
);
1825 op
->mark_delayed("op must wait for map");
1826 osd
->request_osdmap_update(op
->min_epoch
);
1830 if (can_discard_request(op
)) {
1835 const Message
*m
= op
->get_req();
1836 int msg_type
= m
->get_type();
1837 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1838 auto session
= ceph::ref_cast
<Session
>(m
->get_connection()->get_priv());
1841 if (msg_type
== CEPH_MSG_OSD_OP
) {
1842 if (session
->check_backoff(cct
, info
.pgid
,
1843 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1850 (!is_active() && is_peered());
1851 if (g_conf()->osd_backoff_on_peering
&& !backoff
) {
1857 add_pg_backoff(session
);
1861 // pg backoff acks at pg-level
1862 if (msg_type
== CEPH_MSG_OSD_BACKOFF
) {
1863 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1864 if (ba
->begin
!= ba
->end
) {
1872 // Delay unless PGBackend says it's ok
1873 if (pgbackend
->can_handle_while_inactive(op
)) {
1874 bool handled
= pgbackend
->handle_message(op
);
1875 ceph_assert(handled
);
1878 waiting_for_peered
.push_back(op
);
1879 op
->mark_delayed("waiting for peered");
1884 if (recovery_state
.needs_flush()) {
1885 dout(20) << "waiting for flush on " << *op
->get_req() << dendl
;
1886 waiting_for_flush
.push_back(op
);
1887 op
->mark_delayed("waiting for flush");
1891 ceph_assert(is_peered() && !recovery_state
.needs_flush());
1892 if (pgbackend
->handle_message(op
))
1896 case CEPH_MSG_OSD_OP
:
1897 case CEPH_MSG_OSD_BACKOFF
:
1899 dout(20) << " peered, not active, waiting for active on "
1900 << *op
->get_req() << dendl
;
1901 waiting_for_active
.push_back(op
);
1902 op
->mark_delayed("waiting for active");
1906 case CEPH_MSG_OSD_OP
:
1907 // verify client features
1908 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1909 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1910 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1915 case CEPH_MSG_OSD_BACKOFF
:
1916 // object-level backoff acks handled in osdop context
1922 case MSG_OSD_PG_SCAN
:
1923 do_scan(op
, handle
);
1926 case MSG_OSD_PG_BACKFILL
:
1930 case MSG_OSD_PG_BACKFILL_REMOVE
:
1931 do_backfill_remove(op
);
1934 case MSG_OSD_SCRUB_RESERVE
:
1937 osd
->reply_op_error(op
, -EAGAIN
);
1940 auto m
= op
->get_req
<MOSDScrubReserve
>();
1942 case MOSDScrubReserve::REQUEST
:
1943 m_scrubber
->handle_scrub_reserve_request(op
);
1945 case MOSDScrubReserve::GRANT
:
1946 m_scrubber
->handle_scrub_reserve_grant(op
, m
->from
);
1948 case MOSDScrubReserve::REJECT
:
1949 m_scrubber
->handle_scrub_reserve_reject(op
, m
->from
);
1951 case MOSDScrubReserve::RELEASE
:
1952 m_scrubber
->handle_scrub_reserve_release(op
);
1958 case MSG_OSD_REP_SCRUB
:
1959 replica_scrub(op
, handle
);
1962 case MSG_OSD_REP_SCRUBMAP
:
1963 do_replica_scrub_map(op
);
1966 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1967 do_update_log_missing(op
);
1970 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1971 do_update_log_missing_reply(op
);
1975 ceph_abort_msg("bad message type in do_request");
1979 /** do_op - do an op
1980 * pg lock will be held (if multithreaded)
1981 * osd_lock NOT held.
1983 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1986 // NOTE: take a non-const pointer here; we must be careful not to
1987 // change anything that will break other reads on m (operator<<).
1988 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1989 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1990 if (m
->finish_decode()) {
1991 op
->reset_desc(); // for TrackedOp
1995 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1997 const hobject_t head
= m
->get_hobj().get_head();
1999 if (!info
.pgid
.pgid
.contains(
2000 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
2001 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
2002 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
2003 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
2004 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
2006 ceph_assert(!cct
->_conf
->osd_debug_misdirected_ops
);
2011 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
2012 ceph::ref_t
<Session
> session
;
2014 session
= static_cast<Session
*>(m
->get_connection()->get_priv().get());
2015 if (!session
.get()) {
2016 dout(10) << __func__
<< " no session" << dendl
;
2020 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
2025 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
2027 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
2028 osd
->reply_op_error(op
, -EINVAL
);
2033 int r
= op
->maybe_init_op_info(*get_osdmap());
2035 osd
->reply_op_error(op
, r
);
2040 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
2041 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
2043 !(op
->may_write() || op
->may_cache())) {
2044 // balanced reads; any replica will do
2045 if (!(is_primary() || is_nonprimary())) {
2046 osd
->handle_misdirected_op(this, op
);
2050 // normal case; must be primary
2051 if (!is_primary()) {
2052 osd
->handle_misdirected_op(this, op
);
2057 if (!check_laggy(op
)) {
2061 if (!op_has_sufficient_caps(op
)) {
2062 osd
->reply_op_error(op
, -EPERM
);
2066 if (op
->includes_pg_op()) {
2067 return do_pg_op(op
);
2070 // object name too long?
2071 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
2072 dout(4) << "do_op name is longer than "
2073 << cct
->_conf
->osd_max_object_name_len
2074 << " bytes" << dendl
;
2075 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2078 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
2079 dout(4) << "do_op locator is longer than "
2080 << cct
->_conf
->osd_max_object_name_len
2081 << " bytes" << dendl
;
2082 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2085 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
2086 dout(4) << "do_op namespace is longer than "
2087 << cct
->_conf
->osd_max_object_namespace_len
2088 << " bytes" << dendl
;
2089 osd
->reply_op_error(op
, -ENAMETOOLONG
);
2092 if (m
->get_hobj().oid
.name
.empty()) {
2093 dout(4) << "do_op empty oid name is not allowed" << dendl
;
2094 osd
->reply_op_error(op
, -EINVAL
);
2098 if (int r
= osd
->store
->validate_hobject_key(head
)) {
2099 dout(4) << "do_op object " << head
<< " invalid for backing store: "
2101 osd
->reply_op_error(op
, r
);
2106 if (get_osdmap()->is_blocklisted(m
->get_source_addr())) {
2107 dout(10) << "do_op " << m
->get_source_addr() << " is blocklisted" << dendl
;
2108 osd
->reply_op_error(op
, -EBLOCKLISTED
);
2112 // order this op as a write?
2113 bool write_ordered
= op
->rwordered();
2115 // discard due to cluster full transition? (we discard any op that
2116 // originates before the cluster or pool is marked full; the client
2117 // will resend after the full flag is removed or if they expect the
2118 // op to succeed despite being full). The except is FULL_FORCE and
2119 // FULL_TRY ops, which there is no reason to discard because they
2120 // bypass all full checks anyway. If this op isn't write or
2121 // read-ordered, we skip.
2122 // FIXME: we exclude mds writes for now.
2123 if (write_ordered
&& !(m
->get_source().is_mds() ||
2124 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
2125 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
2126 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
2127 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
2131 // mds should have stopped writing before this point.
2132 // We can't allow OSD to become non-startable even if mds
2133 // could be writing as part of file removals.
2134 if (write_ordered
&& osd
->check_failsafe_full(get_dpp()) &&
2135 !m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
2136 dout(10) << __func__
<< " fail-safe full check failed, dropping request." << dendl
;
2139 int64_t poolid
= get_pgid().pool();
2140 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
2144 if (pi
->has_flag(pg_pool_t::FLAG_EIO
)) {
2145 // drop op on the floor; the client will handle returning EIO
2146 if (m
->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO
)) {
2147 dout(10) << __func__
<< " discarding op due to pool EIO flag" << dendl
;
2149 dout(10) << __func__
<< " replying EIO due to pool EIO flag" << dendl
;
2150 osd
->reply_op_error(op
, -EIO
);
2154 if (op
->may_write()) {
2157 if (m
->get_snapid() != CEPH_NOSNAP
) {
2158 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
2159 osd
->reply_op_error(op
, -EINVAL
);
2164 if (cct
->_conf
->osd_max_write_size
&&
2165 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
2166 // journal can't hold commit!
2167 derr
<< "do_op msg data len " << m
->get_data_len()
2168 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
2169 << " on " << *m
<< dendl
;
2170 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
2175 dout(10) << "do_op " << *m
2176 << (op
->may_write() ? " may_write" : "")
2177 << (op
->may_read() ? " may_read" : "")
2178 << (op
->may_cache() ? " may_cache" : "")
2179 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
2180 << " flags " << ceph_osd_flag_string(m
->get_flags())
2185 if (is_unreadable_object(head
)) {
2186 if (!is_primary()) {
2187 osd
->reply_op_error(op
, -EAGAIN
);
2191 (g_conf()->osd_backoff_on_degraded
||
2192 (g_conf()->osd_backoff_on_unfound
&&
2193 recovery_state
.get_missing_loc().is_unfound(head
)))) {
2194 add_backoff(session
, head
, head
);
2195 maybe_kick_recovery(head
);
2197 wait_for_unreadable_object(head
, op
);
2202 if (write_ordered
) {
2204 if (is_degraded_or_backfilling_object(head
)) {
2205 if (can_backoff
&& g_conf()->osd_backoff_on_degraded
) {
2206 add_backoff(session
, head
, head
);
2207 maybe_kick_recovery(head
);
2209 wait_for_degraded_object(head
, op
);
2214 if (m_scrubber
->is_scrub_active() && m_scrubber
->write_blocked_by_scrub(head
)) {
2215 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2216 waiting_for_scrub
.push_back(op
);
2217 op
->mark_delayed("waiting for scrub");
2220 if (!check_laggy_requeue(op
)) {
2225 if (auto blocked_iter
= objects_blocked_on_degraded_snap
.find(head
);
2226 blocked_iter
!= std::end(objects_blocked_on_degraded_snap
)) {
2227 hobject_t
to_wait_on(head
);
2228 to_wait_on
.snap
= blocked_iter
->second
;
2229 wait_for_degraded_object(to_wait_on
, op
);
2232 if (auto blocked_snap_promote_iter
= objects_blocked_on_snap_promotion
.find(head
);
2233 blocked_snap_promote_iter
!= std::end(objects_blocked_on_snap_promotion
)) {
2234 wait_for_blocked_object(blocked_snap_promote_iter
->second
->obs
.oi
.soid
, op
);
2237 if (objects_blocked_on_cache_full
.count(head
)) {
2238 block_write_on_full_cache(head
, op
);
2244 if (op
->may_write() || op
->may_cache()) {
2245 // warning: we will get back *a* request for this reqid, but not
2246 // necessarily the most recent. this happens with flush and
2247 // promote ops, but we can't possible have both in our log where
2248 // the original request is still not stable on disk, so for our
2249 // purposes here it doesn't matter which one we get.
2251 version_t user_version
;
2252 int return_code
= 0;
2253 vector
<pg_log_op_return_item_t
> op_returns
;
2254 bool got
= check_in_progress_op(
2255 m
->get_reqid(), &version
, &user_version
, &return_code
, &op_returns
);
2257 dout(3) << __func__
<< " dup " << m
->get_reqid()
2258 << " version " << version
<< dendl
;
2259 if (already_complete(version
)) {
2260 osd
->reply_op_error(op
, return_code
, version
, user_version
, op_returns
);
2262 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2263 // always queue ondisk waiters, so that we can requeue if needed
2264 waiting_for_ondisk
[version
].emplace_back(op
, user_version
, return_code
,
2266 op
->mark_delayed("waiting for ondisk");
2272 ObjectContextRef obc
;
2273 bool can_create
= op
->may_write();
2274 hobject_t missing_oid
;
2276 // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
2277 const hobject_t
& oid
=
2278 m
->get_snapid() == CEPH_SNAPDIR
? head
: m
->get_hobj();
2280 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2281 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2284 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
) {
2285 if (m
->get_snapid() != CEPH_SNAPDIR
) {
2286 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2287 osd
->reply_op_error(op
, -EINVAL
);
2291 if (m
->get_snapid() == CEPH_SNAPDIR
) {
2292 dout(10) << "non-LIST_SNAPS on snapdir" << dendl
;
2293 osd
->reply_op_error(op
, -EINVAL
);
2299 // io blocked on obc?
2300 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2301 maybe_await_blocked_head(oid
, op
)) {
2305 if (!is_primary()) {
2306 if (!recovery_state
.can_serve_replica_read(oid
)) {
2307 dout(20) << __func__
2308 << ": unstable write on replica, bouncing to primary "
2310 osd
->reply_op_error(op
, -EAGAIN
);
2313 dout(20) << __func__
<< ": serving replica read on oid " << oid
2317 int r
= find_object_context(
2318 oid
, &obc
, can_create
,
2319 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2322 // LIST_SNAPS needs the ssc too
2324 m
->get_snapid() == CEPH_SNAPDIR
&&
2326 obc
->ssc
= get_snapset_context(oid
, true);
2330 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2331 // we have to wait for the object.
2333 // missing the specific snap we need; requeue and wait.
2334 ceph_assert(!op
->may_write()); // only happens on a read/cache
2335 wait_for_unreadable_object(missing_oid
, op
);
2338 } else if (r
== 0) {
2339 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2340 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2341 << " is unreadable, waiting" << dendl
;
2342 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2346 // degraded object? (the check above was for head; this could be a clone)
2347 if (write_ordered
&&
2348 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2349 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2350 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2351 << " is degraded, waiting" << dendl
;
2352 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2357 bool in_hit_set
= false;
2360 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2363 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2366 if (!op
->hitset_inserted
) {
2367 hit_set
->insert(oid
);
2368 op
->hitset_inserted
= true;
2369 if (hit_set
->is_full() ||
2370 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2377 if (agent_choose_mode(false, op
))
2381 if (obc
.get() && obc
->obs
.exists
) {
2382 if (recover_adjacent_clones(obc
, op
)) {
2385 if (maybe_handle_manifest(op
,
2391 if (maybe_handle_cache(op
,
2400 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2401 // copy the reqids for copy get on ENOENT
2403 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2404 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2407 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2408 if (op
->may_write() &&
2409 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2410 record_write_error(op
, oid
, nullptr, r
);
2412 osd
->reply_op_error(op
, r
);
2417 // make sure locator is consistent
2418 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2419 if (m
->get_object_locator() != oloc
) {
2420 dout(10) << " provided locator " << m
->get_object_locator()
2421 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2422 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2423 << " on object " << oloc
2427 // io blocked on obc?
2428 if (obc
->is_blocked() &&
2429 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2430 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2434 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2436 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, obc
, this);
2438 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2439 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2440 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2441 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2443 // verify there is in fact a flush in progress
2444 // FIXME: we could make this a stronger test.
2445 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2446 if (p
== flush_ops
.end()) {
2447 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2448 reply_ctx(ctx
, -EINVAL
);
2451 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2452 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2453 op
->mark_delayed("waiting for rw locks");
2457 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2460 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2461 if (op
->may_write() &&
2462 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
2463 record_write_error(op
, oid
, nullptr, r
,
2464 ctx
->op
->allows_returnvec() ? ctx
: nullptr);
2466 osd
->reply_op_error(op
, r
);
2472 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2473 ctx
->ignore_cache
= true;
2476 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2477 // This object is lost. Reading from it returns an error.
2478 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2479 << " is lost" << dendl
;
2480 reply_ctx(ctx
, -ENFILE
);
2483 if (!op
->may_write() &&
2485 (!obc
->obs
.exists
||
2486 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2487 obc
->obs
.oi
.is_whiteout()))) {
2488 // copy the reqids for copy get on ENOENT
2489 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2490 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2494 reply_ctx(ctx
, -ENOENT
);
2501 utime_t prepare_latency
= ceph_clock_now();
2502 prepare_latency
-= op
->get_dequeued_time();
2503 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2504 if (op
->may_read() && op
->may_write()) {
2505 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2506 } else if (op
->may_read()) {
2507 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2508 } else if (op
->may_write() || op
->may_cache()) {
2509 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2512 // force recovery of the oldest missing object if too many logs
2513 maybe_force_recovery();
2516 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2519 ObjectContextRef obc
)
2522 dout(20) << __func__
<< ": no obc " << dendl
;
2523 return cache_result_t::NOOP
;
2526 if (!obc
->obs
.oi
.has_manifest()) {
2527 dout(20) << __func__
<< ": " << obc
->obs
.oi
.soid
2528 << " is not manifest object " << dendl
;
2529 return cache_result_t::NOOP
;
2531 if (op
->get_req
<MOSDOp
>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2532 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2533 return cache_result_t::NOOP
;
2536 // if it is write-ordered and blocked, stop now
2537 if (obc
->is_blocked() && write_ordered
) {
2538 // we're already doing something with this object
2539 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2540 return cache_result_t::NOOP
;
2543 vector
<OSDOp
> ops
= op
->get_req
<MOSDOp
>()->ops
;
2544 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2546 ceph_osd_op
& op
= osd_op
.op
;
2547 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
||
2548 op
.op
== CEPH_OSD_OP_SET_CHUNK
||
2549 op
.op
== CEPH_OSD_OP_UNSET_MANIFEST
||
2550 op
.op
== CEPH_OSD_OP_TIER_PROMOTE
||
2551 op
.op
== CEPH_OSD_OP_TIER_FLUSH
||
2552 op
.op
== CEPH_OSD_OP_TIER_EVICT
||
2553 op
.op
== CEPH_OSD_OP_ISDIRTY
) {
2554 return cache_result_t::NOOP
;
2558 switch (obc
->obs
.oi
.manifest
.type
) {
2559 case object_manifest_t::TYPE_REDIRECT
:
2560 if (op
->may_write() || write_ordered
) {
2561 do_proxy_write(op
, obc
);
2564 if (obc
->obs
.oi
.size
!= 0) {
2565 return cache_result_t::NOOP
;
2567 do_proxy_read(op
, obc
);
2569 return cache_result_t::HANDLED_PROXY
;
2570 case object_manifest_t::TYPE_CHUNKED
:
2572 if (can_proxy_chunked_read(op
, obc
)) {
2573 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2574 if (p
!= flush_ops
.end()) {
2575 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, true);
2576 return cache_result_t::HANDLED_PROXY
;
2578 do_proxy_chunked_op(op
, obc
->obs
.oi
.soid
, obc
, write_ordered
);
2579 return cache_result_t::HANDLED_PROXY
;
2582 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2583 ceph_assert(m
->get_type() == CEPH_MSG_OSD_OP
);
2584 hobject_t head
= m
->get_hobj();
2586 if (is_degraded_or_backfilling_object(head
)) {
2587 dout(20) << __func__
<< ": " << head
<< " is degraded, waiting" << dendl
;
2588 wait_for_degraded_object(head
, op
);
2589 return cache_result_t::BLOCKED_RECOVERY
;
2592 if (m_scrubber
->write_blocked_by_scrub(head
)) {
2593 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2594 waiting_for_scrub
.push_back(op
);
2595 op
->mark_delayed("waiting for scrub");
2596 return cache_result_t::BLOCKED_RECOVERY
;
2598 if (!check_laggy_requeue(op
)) {
2599 return cache_result_t::BLOCKED_RECOVERY
;
2602 for (auto& p
: obc
->obs
.oi
.manifest
.chunk_map
) {
2603 if (p
.second
.is_missing()) {
2604 auto m
= op
->get_req
<MOSDOp
>();
2605 const object_locator_t oloc
= m
->get_object_locator();
2606 promote_object(obc
, obc
->obs
.oi
.soid
, oloc
, op
, NULL
);
2607 return cache_result_t::BLOCKED_PROMOTE
;
2610 return cache_result_t::NOOP
;
2613 ceph_abort_msg("unrecognized manifest type");
2616 return cache_result_t::NOOP
;
2619 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2620 MOSDOpReply
*orig_reply
, int r
,
2621 OpContext
*ctx_for_op_returns
)
2623 dout(20) << __func__
<< " r=" << r
<< dendl
;
2624 ceph_assert(op
->may_write());
2625 const osd_reqid_t
&reqid
= op
->get_req
<MOSDOp
>()->get_reqid();
2626 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2627 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2628 get_next_version(), eversion_t(), 0,
2629 reqid
, utime_t(), r
));
2630 if (ctx_for_op_returns
) {
2631 entries
.back().set_op_returns(*ctx_for_op_returns
->ops
);
2632 dout(20) << __func__
<< " op_returns=" << entries
.back().op_returns
<< dendl
;
2638 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2643 MOSDOpReply
*orig_reply
,
2646 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2649 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2650 auto m
= op
->get_req
<MOSDOp
>();
2651 MOSDOpReply
*reply
= orig_reply
.detach();
2652 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2653 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2657 ObcLockManager lock_manager
;
2660 std::move(lock_manager
),
2661 std::optional
<std::function
<void(void)> >(
2662 OnComplete(this, op
, orig_reply
, r
)),
2667 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2670 ObjectContextRef obc
,
2671 int r
, hobject_t missing_oid
,
2674 ObjectContextRef
*promote_obc
)
2676 // return quickly if caching is not enabled
2677 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2678 return cache_result_t::NOOP
;
2682 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2683 (op
->get_req
<MOSDOp
>()->get_flags() &
2684 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2685 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2686 return cache_result_t::NOOP
;
2689 must_promote
= must_promote
|| op
->need_promote();
2692 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2693 << (obc
->obs
.exists
? "exists" : "DNE")
2694 << " missing_oid " << missing_oid
2695 << " must_promote " << (int)must_promote
2696 << " in_hit_set " << (int)in_hit_set
2699 dout(25) << __func__
<< " (no obc)"
2700 << " missing_oid " << missing_oid
2701 << " must_promote " << (int)must_promote
2702 << " in_hit_set " << (int)in_hit_set
2705 // if it is write-ordered and blocked, stop now
2706 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2707 // we're already doing something with this object
2708 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2709 return cache_result_t::NOOP
;
2712 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2713 // we know this object is logically absent (e.g., an undefined clone)
2714 return cache_result_t::NOOP
;
2717 if (obc
.get() && obc
->obs
.exists
) {
2718 osd
->logger
->inc(l_osd_op_cache_hit
);
2719 return cache_result_t::NOOP
;
2721 if (!is_primary()) {
2722 dout(20) << __func__
<< " cache miss; ask the primary" << dendl
;
2723 osd
->reply_op_error(op
, -EAGAIN
);
2724 return cache_result_t::REPLIED_WITH_EAGAIN
;
2727 if (missing_oid
== hobject_t() && obc
.get()) {
2728 missing_oid
= obc
->obs
.oi
.soid
;
2731 auto m
= op
->get_req
<MOSDOp
>();
2732 const object_locator_t oloc
= m
->get_object_locator();
2734 if (op
->need_skip_handle_cache()) {
2735 return cache_result_t::NOOP
;
2738 OpRequestRef promote_op
;
2740 switch (pool
.info
.cache_mode
) {
2741 case pg_pool_t::CACHEMODE_WRITEBACK
:
2743 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2744 if (!op
->may_write() && !op
->may_cache() &&
2745 !write_ordered
&& !must_promote
) {
2746 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2748 return cache_result_t::HANDLED_PROXY
;
2750 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2751 block_write_on_full_cache(missing_oid
, op
);
2752 return cache_result_t::BLOCKED_FULL
;
2755 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2756 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2757 return cache_result_t::BLOCKED_PROMOTE
;
2760 if (op
->may_write() || op
->may_cache()) {
2764 if (!op
->need_skip_promote() &&
2765 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2766 pool
.info
.min_write_recency_for_promote
,
2769 return cache_result_t::BLOCKED_PROMOTE
;
2771 return cache_result_t::HANDLED_PROXY
;
2775 // Avoid duplicate promotion
2776 if (obc
.get() && obc
->is_blocked()) {
2779 return cache_result_t::BLOCKED_PROMOTE
;
2783 if (!op
->need_skip_promote()) {
2784 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2785 pool
.info
.min_read_recency_for_promote
,
2786 promote_op
, promote_obc
);
2789 return cache_result_t::HANDLED_PROXY
;
2791 ceph_abort_msg("unreachable");
2792 return cache_result_t::NOOP
;
2794 case pg_pool_t::CACHEMODE_READONLY
:
2795 // TODO: clean this case up
2796 if (!obc
.get() && r
== -ENOENT
) {
2797 // we don't have the object and op's a read
2798 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2799 return cache_result_t::BLOCKED_PROMOTE
;
2801 if (!r
) { // it must be a write
2802 do_cache_redirect(op
);
2803 return cache_result_t::HANDLED_REDIRECT
;
2805 // crap, there was a failure of some kind
2806 return cache_result_t::NOOP
;
2808 case pg_pool_t::CACHEMODE_FORWARD
:
2809 // this mode is deprecated; proxy instead
2810 case pg_pool_t::CACHEMODE_PROXY
:
2811 if (!must_promote
) {
2812 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2814 return cache_result_t::HANDLED_PROXY
;
2817 return cache_result_t::HANDLED_PROXY
;
2820 // ugh, we're forced to promote.
2822 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2823 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2824 block_write_on_full_cache(missing_oid
, op
);
2825 return cache_result_t::BLOCKED_FULL
;
2827 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2828 return cache_result_t::BLOCKED_PROMOTE
;
2830 case pg_pool_t::CACHEMODE_READFORWARD
:
2831 // this mode is deprecated; proxy instead
2832 case pg_pool_t::CACHEMODE_READPROXY
:
2833 // Do writeback to the cache tier for writes
2834 if (op
->may_write() || write_ordered
|| must_promote
) {
2836 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2837 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2838 block_write_on_full_cache(missing_oid
, op
);
2839 return cache_result_t::BLOCKED_FULL
;
2841 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2842 return cache_result_t::BLOCKED_PROMOTE
;
2845 // If it is a read, we can read, we need to proxy it
2847 return cache_result_t::HANDLED_PROXY
;
2850 ceph_abort_msg("unrecognized cache_mode");
2852 return cache_result_t::NOOP
;
2855 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2856 const hobject_t
& missing_oid
,
2857 const object_locator_t
& oloc
,
2860 OpRequestRef promote_op
,
2861 ObjectContextRef
*promote_obc
)
2863 dout(20) << __func__
<< " missing_oid " << missing_oid
2864 << " in_hit_set " << in_hit_set
<< dendl
;
2870 // Check if in the current hit set
2880 unsigned count
= (int)in_hit_set
;
2882 // Check if in other hit sets
2883 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
2884 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2885 agent_state
->hit_set_map
.rbegin();
2886 itor
!= agent_state
->hit_set_map
.rend();
2888 if (!itor
->second
->contains(oid
)) {
2892 if (count
>= recency
) {
2897 if (count
>= recency
) {
2900 return false; // not promoting
2905 if (osd
->promote_throttle()) {
2906 dout(10) << __func__
<< " promote throttled" << dendl
;
2909 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
2913 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2915 auto m
= op
->get_req
<MOSDOp
>();
2916 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
2917 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
, get_osdmap_epoch(),
2919 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2920 reply
->set_redirect(redir
);
2921 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2922 << *op
->get_req() << dendl
;
2923 m
->get_connection()->send_message(reply
);
2927 struct C_ProxyRead
: public Context
{
2930 epoch_t last_peering_reset
;
2932 PrimaryLogPG::ProxyReadOpRef prdop
;
2934 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2935 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2936 : pg(p
), oid(o
), last_peering_reset(lpr
),
2937 tid(0), prdop(prd
), start(ceph_clock_now())
2939 void finish(int r
) override
{
2940 if (prdop
->canceled
)
2942 std::scoped_lock locker
{*pg
};
2943 if (prdop
->canceled
) {
2946 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2947 pg
->finish_proxy_read(oid
, tid
, r
);
2948 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
2953 struct C_ProxyChunkRead
: public Context
{
2956 epoch_t last_peering_reset
;
2958 PrimaryLogPG::ProxyReadOpRef prdop
;
2960 ObjectOperation
*obj_op
;
2962 uint64_t req_offset
= 0;
2963 ObjectContextRef obc
;
2964 uint64_t req_total_len
= 0;
2965 C_ProxyChunkRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2966 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2967 : pg(p
), oid(o
), last_peering_reset(lpr
),
2968 tid(0), prdop(prd
), start(ceph_clock_now()), obj_op(NULL
)
2970 void finish(int r
) override
{
2971 if (prdop
->canceled
)
2973 std::scoped_lock locker
{*pg
};
2974 if (prdop
->canceled
) {
2977 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2979 if (!prdop
->ops
[op_index
].outdata
.length()) {
2980 ceph_assert(req_total_len
);
2982 bufferptr
bptr(req_total_len
);
2983 list
.push_back(std::move(bptr
));
2984 prdop
->ops
[op_index
].outdata
.append(list
);
2986 ceph_assert(obj_op
);
2987 uint64_t copy_offset
;
2988 if (req_offset
>= prdop
->ops
[op_index
].op
.extent
.offset
) {
2989 copy_offset
= req_offset
- prdop
->ops
[op_index
].op
.extent
.offset
;
2993 prdop
->ops
[op_index
].outdata
.begin(copy_offset
).copy_in(
2994 obj_op
->ops
[0].outdata
.length(),
2995 obj_op
->ops
[0].outdata
.c_str());
2998 pg
->finish_proxy_read(oid
, tid
, r
);
2999 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
3007 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
3009 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3010 // stash the result in the request's OSDOp vector
3011 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3012 object_locator_t oloc
;
3014 /* extensible tier */
3015 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3016 switch (obc
->obs
.oi
.manifest
.type
) {
3017 case object_manifest_t::TYPE_REDIRECT
:
3018 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3019 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3022 ceph_abort_msg("unrecognized manifest type");
3026 soid
= m
->get_hobj();
3027 oloc
= object_locator_t(m
->get_object_locator());
3028 oloc
.pool
= pool
.info
.tier_of
;
3030 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3032 // pass through some original flags that make sense.
3033 // - leave out redirection and balancing flags since we are
3034 // already proxying through the primary
3035 // - leave off read/write/exec flags that are derived from the op
3036 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
3037 CEPH_OSD_FLAG_ORDERSNAP
|
3038 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
3039 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
3041 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
3043 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
3045 ObjectOperation obj_op
;
3046 obj_op
.dup(prdop
->ops
);
3048 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
3049 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
3050 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
3051 ceph_osd_op op
= obj_op
.ops
[i
].op
;
3053 case CEPH_OSD_OP_READ
:
3054 case CEPH_OSD_OP_SYNC_READ
:
3055 case CEPH_OSD_OP_SPARSE_READ
:
3056 case CEPH_OSD_OP_CHECKSUM
:
3057 case CEPH_OSD_OP_CMPEXT
:
3058 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
3059 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
3064 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
3066 ceph_tid_t tid
= osd
->objecter
->read(
3067 soid
.oid
, oloc
, obj_op
,
3068 m
->get_snapid(), NULL
,
3069 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3070 &prdop
->user_version
,
3071 &prdop
->data_offset
,
3074 prdop
->objecter_tid
= tid
;
3075 proxyread_ops
[tid
] = prdop
;
3076 in_progress_proxy_ops
[soid
].push_back(op
);
3079 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
3081 dout(10) << __func__
<< " " << oid
<< " tid " << tid
3082 << " " << cpp_strerror(r
) << dendl
;
3084 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
3085 if (p
== proxyread_ops
.end()) {
3086 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
3089 ProxyReadOpRef prdop
= p
->second
;
3090 if (tid
!= prdop
->objecter_tid
) {
3091 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
3092 << " tid " << prdop
->objecter_tid
<< dendl
;
3095 if (oid
!= prdop
->soid
) {
3096 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
3097 << " soid " << prdop
->soid
<< dendl
;
3100 proxyread_ops
.erase(tid
);
3102 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
3103 if (q
== in_progress_proxy_ops
.end()) {
3104 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3107 ceph_assert(q
->second
.size());
3108 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
3111 ceph_assert(it
!= q
->second
.end());
3112 OpRequestRef op
= *it
;
3113 q
->second
.erase(it
);
3114 if (q
->second
.size() == 0) {
3115 in_progress_proxy_ops
.erase(oid
);
3116 } else if (std::find(q
->second
.begin(),
3118 prdop
->op
) != q
->second
.end()) {
3119 /* multiple read case */
3120 dout(20) << __func__
<< " " << oid
<< " is not completed " << dendl
;
3124 osd
->logger
->inc(l_osd_tier_proxy_read
);
3126 auto m
= op
->get_req
<MOSDOp
>();
3127 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &prdop
->ops
, this);
3128 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3129 ctx
->user_at_version
= prdop
->user_version
;
3130 ctx
->data_off
= prdop
->data_offset
;
3131 ctx
->ignore_log_op_stats
= true;
3132 complete_read_ctx(r
, ctx
);
3135 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
3137 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
3138 if (p
== in_progress_proxy_ops
.end())
3141 list
<OpRequestRef
>& ls
= p
->second
;
3142 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
3144 in_progress_proxy_ops
.erase(p
);
3147 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
,
3148 vector
<ceph_tid_t
> *tids
)
3150 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
3151 prdop
->canceled
= true;
3153 // cancel objecter op, if we can
3154 if (prdop
->objecter_tid
) {
3155 tids
->push_back(prdop
->objecter_tid
);
3156 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
3157 prdop
->ops
[i
].outdata
.clear();
3159 proxyread_ops
.erase(prdop
->objecter_tid
);
3160 prdop
->objecter_tid
= 0;
3164 void PrimaryLogPG::cancel_proxy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3166 dout(10) << __func__
<< dendl
;
3168 // cancel proxy reads
3169 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
3170 while (p
!= proxyread_ops
.end()) {
3171 cancel_proxy_read((p
++)->second
, tids
);
3174 // cancel proxy writes
3175 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
3176 while (q
!= proxywrite_ops
.end()) {
3177 cancel_proxy_write((q
++)->second
, tids
);
3181 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
3182 in_progress_proxy_ops
.begin();
3183 while (p
!= in_progress_proxy_ops
.end()) {
3184 list
<OpRequestRef
>& ls
= p
->second
;
3185 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
3186 << " requests" << dendl
;
3188 in_progress_proxy_ops
.erase(p
++);
3191 in_progress_proxy_ops
.clear();
3195 struct C_ProxyWrite_Commit
: public Context
{
3198 epoch_t last_peering_reset
;
3200 PrimaryLogPG::ProxyWriteOpRef pwop
;
3201 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
3202 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
3203 : pg(p
), oid(o
), last_peering_reset(lpr
),
3206 void finish(int r
) override
{
3209 std::scoped_lock locker
{*pg
};
3210 if (pwop
->canceled
) {
3213 if (last_peering_reset
== pg
->get_last_peering_reset()) {
3214 pg
->finish_proxy_write(oid
, tid
, r
);
3219 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, ObjectContextRef obc
)
3221 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3222 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3223 object_locator_t oloc
;
3224 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
3226 /* extensible tier */
3227 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
3228 switch (obc
->obs
.oi
.manifest
.type
) {
3229 case object_manifest_t::TYPE_REDIRECT
:
3230 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
3231 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
3234 ceph_abort_msg("unrecognized manifest type");
3238 soid
= m
->get_hobj();
3239 oloc
= object_locator_t(m
->get_object_locator());
3240 oloc
.pool
= pool
.info
.tier_of
;
3243 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
3244 if (!(op
->may_write() || op
->may_cache())) {
3245 flags
|= CEPH_OSD_FLAG_RWORDERED
;
3247 if (op
->allows_returnvec()) {
3248 flags
|= CEPH_OSD_FLAG_RETURNVEC
;
3251 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
3253 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
3254 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), &pwop
->ops
, this);
3255 pwop
->mtime
= m
->get_mtime();
3257 ObjectOperation obj_op
;
3258 obj_op
.dup(pwop
->ops
);
3260 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
3261 this, soid
, get_last_peering_reset(), pwop
);
3262 ceph_tid_t tid
= osd
->objecter
->mutate(
3263 soid
.oid
, oloc
, obj_op
, snapc
,
3264 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
3265 flags
, new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
3266 &pwop
->user_version
, pwop
->reqid
);
3268 pwop
->objecter_tid
= tid
;
3269 proxywrite_ops
[tid
] = pwop
;
3270 in_progress_proxy_ops
[soid
].push_back(op
);
3273 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op
, const hobject_t
& missing_oid
,
3274 ObjectContextRef obc
, bool write_ordered
)
3276 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3277 OSDOp
*osd_op
= NULL
;
3278 for (unsigned int i
= 0; i
< m
->ops
.size(); i
++) {
3279 osd_op
= &m
->ops
[i
];
3280 uint64_t cursor
= osd_op
->op
.extent
.offset
;
3281 uint64_t op_length
= osd_op
->op
.extent
.offset
+ osd_op
->op
.extent
.length
;
3282 uint64_t chunk_length
= 0, chunk_index
= 0, req_len
= 0;
3283 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
3284 map
<uint64_t, map
<uint64_t, uint64_t>> chunk_read
;
3286 while (cursor
< op_length
) {
3289 /* find the right chunk position for cursor */
3290 for (auto &p
: manifest
->chunk_map
) {
3291 if (p
.first
<= cursor
&& p
.first
+ p
.second
.length
> cursor
) {
3292 chunk_length
= p
.second
.length
;
3293 chunk_index
= p
.first
;
3298 if (!chunk_index
&& !chunk_length
) {
3299 if (cursor
== osd_op
->op
.extent
.offset
) {
3300 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), &m
->ops
, this);
3301 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
3302 ctx
->data_off
= osd_op
->op
.extent
.offset
;
3303 ctx
->ignore_log_op_stats
= true;
3304 complete_read_ctx(0, ctx
);
3308 uint64_t next_length
= chunk_length
;
3309 /* the size to read -> | op length | */
3311 if (cursor
+ next_length
> op_length
) {
3312 next_length
= op_length
- cursor
;
3314 /* the size to read -> | op length | */
3316 if (cursor
+ next_length
> chunk_index
+ chunk_length
) {
3317 next_length
= chunk_index
+ chunk_length
- cursor
;
3320 chunk_read
[cursor
] = {{chunk_index
, next_length
}};
3321 cursor
+= next_length
;
3324 req_len
= cursor
- osd_op
->op
.extent
.offset
;
3325 for (auto &p
: chunk_read
) {
3326 auto chunks
= p
.second
.begin();
3327 dout(20) << __func__
<< " chunk_index: " << chunks
->first
3328 << " next_length: " << chunks
->second
<< " cursor: "
3329 << p
.first
<< dendl
;
3330 do_proxy_chunked_read(op
, obc
, i
, chunks
->first
, p
.first
, chunks
->second
, req_len
, write_ordered
);
3335 struct RefCountCallback
: public Context
{
3337 PrimaryLogPG::OpContext
*ctx
;
3339 bool requeue
= false;
3341 RefCountCallback(PrimaryLogPG::OpContext
*ctx
, OSDOp
&osd_op
)
3342 : ctx(ctx
), osd_op(osd_op
) {}
3343 void finish(int r
) override
{
3344 // NB: caller must already have pg->lock held
3345 ctx
->obc
->stop_block();
3346 ctx
->pg
->kick_object_context_blocked(ctx
->obc
);
3349 ctx
->pg
->execute_ctx(ctx
);
3351 // on cancel simply toss op out,
3352 // or requeue as requested
3353 if (r
!= -ECANCELED
) {
3355 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
3356 } else if (requeue
) {
3358 ctx
->pg
->requeue_op(ctx
->op
);
3360 ctx
->pg
->close_op_ctx(ctx
);
3363 void set_requeue(bool rq
) {
3368 struct SetManifestFinisher
: public PrimaryLogPG::OpFinisher
{
3371 explicit SetManifestFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
3374 int execute() override
{
3379 struct C_SetManifestRefCountDone
: public Context
{
3384 C_SetManifestRefCountDone(PrimaryLogPG
*p
,
3385 hobject_t soid
, uint64_t offset
) :
3386 pg(p
), soid(soid
), offset(offset
) {}
3387 void finish(int r
) override
{
3388 if (r
== -ECANCELED
)
3390 std::scoped_lock locker
{*pg
};
3391 pg
->finish_set_manifest_refcount(soid
, r
, tid
, offset
);
3395 struct C_SetDedupChunks
: public Context
{
3398 epoch_t last_peering_reset
;
3402 C_SetDedupChunks(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
, uint64_t offset
)
3403 : pg(p
), oid(o
), last_peering_reset(lpr
),
3404 tid(0), offset(offset
)
3406 void finish(int r
) override
{
3407 if (r
== -ECANCELED
)
3409 std::scoped_lock locker
{*pg
};
3410 if (last_peering_reset
!= pg
->get_last_peering_reset()) {
3413 pg
->finish_set_dedup(oid
, r
, tid
, offset
);
3417 void PrimaryLogPG::cancel_manifest_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
3419 dout(10) << __func__
<< dendl
;
3420 auto p
= manifest_ops
.begin();
3421 while (p
!= manifest_ops
.end()) {
3422 auto mop
= p
->second
;
3423 // cancel objecter op, if we can
3424 if (mop
->objecter_tid
) {
3425 tids
->push_back(mop
->objecter_tid
);
3426 mop
->objecter_tid
= 0;
3427 } else if (!mop
->tids
.empty()) {
3428 for (auto &p
: mop
->tids
) {
3429 tids
->push_back(p
.second
);
3433 mop
->cb
->set_requeue(requeue
);
3434 mop
->cb
->complete(-ECANCELED
);
3436 manifest_ops
.erase(p
++);
3440 int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc
, std::string
& fp_oid
, OpRequestRef op
)
3444 for (auto &p
: obc
->obs
.oi
.manifest
.chunk_map
) {
3445 if (p
.second
.oid
.oid
.name
== fp_oid
) {
3450 SnapSet
& ss
= obc
->ssc
->snapset
;
3451 const OSDMapRef
& osdmap
= get_osdmap();
3452 for (vector
<snapid_t
>::const_reverse_iterator p
= ss
.clones
.rbegin();
3453 p
!= ss
.clones
.rend();
3455 object_ref_delta_t refs
;
3456 ObjectContextRef obc_l
= nullptr;
3457 ObjectContextRef obc_g
= nullptr;
3458 hobject_t clone_oid
= obc
->obs
.oi
.soid
;
3459 clone_oid
.snap
= *p
;
3460 if (osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
3463 if (is_unreadable_object(clone_oid
)) {
3464 dout(10) << __func__
<< ": " << clone_oid
3465 << " is unreadable. Need to wait for recovery" << dendl
;
3466 wait_for_unreadable_object(clone_oid
, op
);
3469 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
3473 if (recover_adjacent_clones(clone_obc
, op
)) {
3476 get_adjacent_clones(clone_obc
, obc_l
, obc_g
);
3477 clone_obc
->obs
.oi
.manifest
.calc_refs_to_inc_on_set(
3478 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr ,
3481 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3482 if (p
->first
.oid
.name
== fp_oid
&& p
->second
> 0) {
3491 bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc
, OpRequestRef op
)
3493 if (!obc
->ssc
|| !obc
->ssc
->snapset
.clones
.size()) {
3496 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
3497 bool has_manifest_op
= std::any_of(
3500 [](const auto& osd_op
) {
3501 return osd_op
.op
.op
== CEPH_OSD_OP_SET_CHUNK
;
3503 if (!obc
->obs
.oi
.manifest
.is_chunked() && !has_manifest_op
) {
3508 const SnapSet
& snapset
= obc
->ssc
->snapset
;
3509 auto s
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), obc
->obs
.oi
.soid
.snap
);
3510 auto is_unreadable_snap
= [this, obc
, &snapset
, op
](auto iter
) -> bool {
3511 hobject_t cid
= obc
->obs
.oi
.soid
;
3512 cid
.snap
= (iter
== snapset
.clones
.end()) ? snapid_t(CEPH_NOSNAP
) : *iter
;
3513 if (is_unreadable_object(cid
)) {
3514 dout(10) << __func__
<< ": clone " << cid
3515 << " is unreadable, waiting" << dendl
;
3516 wait_for_unreadable_object(cid
, op
);
3521 if (s
!= snapset
.clones
.begin()) {
3522 if (is_unreadable_snap(s
- 1)) {
3526 if (s
!= snapset
.clones
.end()) {
3527 if (is_unreadable_snap(s
+ 1)) {
3534 ObjectContextRef
PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc
)
3536 auto s
= std::find(obc
->ssc
->snapset
.clones
.begin(), obc
->ssc
->snapset
.clones
.end(),
3537 obc
->obs
.oi
.soid
.snap
);
3538 if (s
!= obc
->ssc
->snapset
.clones
.begin()) {
3539 auto s_iter
= s
- 1;
3540 hobject_t cid
= obc
->obs
.oi
.soid
;
3541 object_ref_delta_t refs
;
3543 ObjectContextRef cobc
= get_object_context(cid
, false, NULL
);
3550 void PrimaryLogPG::dec_refcount(const hobject_t
& soid
, const object_ref_delta_t
& refs
)
3552 for (auto p
= refs
.begin(); p
!= refs
.end(); ++p
) {
3553 int dec_ref_count
= p
->second
;
3554 ceph_assert(dec_ref_count
< 0);
3555 while (dec_ref_count
< 0) {
3556 dout(10) << __func__
<< ": decrement reference on offset oid: " << p
->first
<< dendl
;
3557 refcount_manifest(soid
, p
->first
,
3558 refcount_t::DECREMENT_REF
, NULL
, std::nullopt
);
// Populate `_l` and `_g` with the object contexts of the clones adjacent to
// `src_obc` in snapset order: `_l` = next lower (older) clone, `_g` = next
// greater (newer) clone or head. Either output is left untouched when no
// such neighbor exists.
// NOTE(review): lambda return and closing braces are missing from this
// excerpt (gaps after original lines 3575 and 3592).
void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
ObjectContextRef& _l, ObjectContextRef& _g)
const SnapSet& snapset = src_obc->ssc->snapset;
const object_info_t& oi = src_obc->obs.oi;
// helper: map a clone-list iterator to that clone's ObjectContext;
// clones.end() is treated as head (CEPH_NOSNAP)
auto get_context = [this, &oi, &snapset](auto iter)
-> ObjectContextRef {
hobject_t cid = oi.soid;
cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
ObjectContextRef obc = get_object_context(cid, false, NULL);
// check adjacent clones
auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
// We *must* find the clone iff it's not head,
// let s == snapset.clones.end() mean head
ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
if (s != snapset.clones.begin()) {
// an older clone exists
_l = get_context(s - 1);
if (s != snapset.clones.end()) {
// a newer clone (or head) exists
_g = get_context(s + 1);
// For a set-chunk operation, compute the reference deltas implied by the new
// chunk map (relative to the adjacent clones' manifests) and issue the
// increments to the target (CAS) objects before object_info is updated.
// Negative deltas are deferred to commit time. Returns true when at least
// one increment was issued (callers must then wait for the RefCountCallback).
// NOTE(review): this excerpt is missing the tail of the parameter list
// (original lines 3597-3598), several closing braces, and `continue`/lambda
// bodies (gaps in original numbering); tokens preserved as-is.
bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
object_ref_delta_t refs;
ObjectContextRef obc_l, obc_g;
// deltas are computed against the neighboring clones' manifests so shared
// chunks are not double-counted
get_adjacent_clones(ctx->obc, obc_l, obc_g);
set_chunk.calc_refs_to_inc_on_set(
obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
bool need_inc_ref = false;
if (!refs.is_empty()) {
ManifestOpRef mop(std::make_shared<ManifestOp>(ctx->obc, nullptr));
for (auto c : set_chunk.chunk_map) {
auto p = refs.find(c.second.oid);
if (p == refs.end()) {
int inc_ref_count = p->second;
if (inc_ref_count > 0) {
/*
* In set-chunk case, the first thing we should do is to increment
* the reference the targe object has prior to update object_manifest in object_info_t.
* So, call directly refcount_manifest.
*/
auto target_oid = p->first;
auto offset = c.first;
auto length = c.second.length;
auto* fin = new C_SetManifestRefCountDone(this, ctx->obs->oi.soid, offset);
ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, target_oid,
refcount_t::INCREMENT_REF, fin, std::nullopt);
// remember the chunk and the outstanding objecter tid for this offset
mop->chunks[target_oid] = make_pair(offset, length);
mop->tids[offset] = tid;
if (!ctx->obc->is_blocked()) {
dout(15) << fmt::format("{}: blocking object on rc: tid:{}", __func__, tid) << dendl;
// block further ops on the object until the refcount round-trips
ctx->obc->start_block();
need_inc_ref = true;
} else if (inc_ref_count < 0) {
// decrement is safe to defer until the local update commits
hobject_t src = ctx->obs->oi.soid;
hobject_t tgt = p->first;
ctx->register_on_commit(
refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
if (mop->tids.size()) {
// at least one increment is in flight: arrange for the op to resume
// (RefCountCallback) once all acks arrive
mop->cb = new RefCountCallback(ctx, osd_op);
manifest_ops[ctx->obs->oi.soid] = mop;
manifest_ops[ctx->obs->oi.soid]->op = ctx->op;
return need_inc_ref;
// Drop chunk-map entries whose extent has been dirtied by this op; if that
// empties the chunk map, demote the object from manifest to a plain object
// and adjust the manifest-object stats.
// NOTE(review): closing braces are missing from this excerpt (gap after
// original line 3672); tokens preserved as-is.
void PrimaryLogPG::update_chunk_map_by_dirty(OpContext* ctx) {
/*
* We should consider two cases here:
* 1) just modification: This created dirty regions, but didn't update chunk_map.
* 2) rollback: In rollback, head will be converted to the clone the rollback targets.
* Also, rollback already updated chunk_map.
* So, we should do here is to check whether chunk_map is updated and the clean_region has dirty regions.
* In case of the rollback, chunk_map doesn't need to be clear
*/
// iterate the OLD chunk map; erase from the NEW one, so iteration is safe
for (auto& p : ctx->obs->oi.manifest.chunk_map) {
if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
// no chunks left: object is no longer a manifest object
ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
ctx->delta_stats.num_objects_manifest--;
// When a manifest object is modified, compute which chunk references become
// droppable (comparing against the previous clone's manifest so chunks still
// referenced by that clone are kept) and schedule the decrements at commit.
// NOTE(review): the trailing argument of calc_refs_to_drop_on_modify and
// closing braces are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
object_ref_delta_t refs;
ObjectContextRef cobc = nullptr;
ObjectContextRef obc = ctx->obc;
// Look over previous snapshot, then figure out whether updated chunk needs to be deleted
cobc = get_prev_clone_obc(obc);
obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
cobc ? &cobc->obs.oi.manifest : nullptr,
if (!refs.is_empty()) {
hobject_t soid = obc->obs.oi.soid;
// decrements only become valid once the local update is durable
ctx->register_on_commit(
[soid, this, refs](){
dec_refcount(soid, refs);
// On object removal, drop every reference its manifest holds:
// - chunked manifests: compute drops against the adjacent clones (shared
//   chunks survive) and schedule dec_refcount at commit;
// - redirect manifests that hold a reference: decrement the redirect target.
// NOTE(review): several argument tails, lambda bodies and closing braces are
// missing from this excerpt (gaps in original numbering); tokens as-is.
void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
ceph_assert(oi.has_manifest());
ceph_assert(ctx->obc->ssc);
if (oi.manifest.is_chunked()) {
object_ref_delta_t refs;
ObjectContextRef obc_l, obc_g, obc;
/* in trim_object, oi and ctx can have different oid */
obc = get_object_context(oi.soid, false, NULL);
get_adjacent_clones(obc, obc_l, obc_g);
oi.manifest.calc_refs_to_drop_on_removal(
obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
if (!refs.is_empty()) {
/* dec_refcount will use head object anyway */
hobject_t soid = ctx->obc->obs.oi.soid;
// defer the actual decrements until the removal commits
ctx->register_on_commit(
[soid, this, refs](){
dec_refcount(soid, refs);
} else if (oi.manifest.is_redirect() &&
oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
ctx->register_on_commit(
refcount_manifest(oi.soid, oi.manifest.redirect_target,
refcount_t::DECREMENT_REF, NULL, std::nullopt);
// Send a cls_cas refcount mutation (get/put/create-or-get) for chunk
// `tgt_soid` on behalf of `src_soid` via the objecter, bypassing cache
// tiering. `cb` (optional) fires on completion via the objecter finisher;
// `chunk` supplies the data payload for CREATE_OR_GET_REF. Returns the
// objecter tid of the mutation.
// NOTE(review): encode() calls for the first two ops, the else-branches'
// braces, the mutate() tail and the return are missing from this excerpt.
ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
Context* cb, std::optional<bufferlist> chunk)
// operate directly on the base pool and order against writes
unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
CEPH_OSD_FLAG_RWORDERED;
dout(10) << __func__ << " Start refcount from " << src_soid
<< " to " << tgt_soid << dendl;
ObjectOperation obj_op;
if (type == refcount_t::INCREMENT_REF) {
cls_cas_chunk_get_ref_op call;
// the reference is recorded against the head object
call.source = src_soid.get_head();
obj_op.call("cas", "chunk_get_ref", in);
} else if (type == refcount_t::DECREMENT_REF) {
cls_cas_chunk_put_ref_op call;
call.source = src_soid.get_head();
obj_op.call("cas", "chunk_put_ref", in);
} else if (type == refcount_t::CREATE_OR_GET_REF) {
cls_cas_chunk_create_or_get_ref_op get_call;
get_call.source = src_soid.get_head();
// chunk payload is consumed; only valid for CREATE_OR_GET_REF
get_call.data = std::move(*chunk);
::encode(get_call, in);
obj_op.call("cas", "chunk_create_or_get_ref", in);
ceph_abort_msg("unrecognized type");
Context* c = nullptr;
// run the completion on the objecter finisher for this shard
c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
object_locator_t oloc(tgt_soid);
ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
ceph_assert(src_obc);
auto tid = osd->objecter->mutate(
tgt_soid.oid, oloc, obj_op, SnapContext(),
ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
// Proxy one chunk of a client READ of a chunked-manifest object to the chunk's
// backing (CAS) object: translates the requested extent into the chunk
// object's coordinates, issues an objecter read, and tracks it in
// proxyread_ops / in_progress_proxy_ops keyed by the original object.
// NOTE(review): early-return bodies, the C_ProxyChunkRead constructor tail,
// the read() call tail and closing braces are missing from this excerpt.
void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
uint64_t req_total_len, bool write_ordered)
MOSDOp* m = static_cast<MOSDOp*>(op->get_nonconst_req());
object_manifest_t* manifest = &obc->obs.oi.manifest;
// the requested chunk must be present in the manifest
if (!manifest->chunk_map.count(chunk_index)) {
uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
hobject_t soid = manifest->chunk_map[chunk_index].oid;
hobject_t ori_soid = m->get_hobj();
object_locator_t oloc(soid);
unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
if (write_ordered) {
flags |= CEPH_OSD_FLAG_RWORDERED;
// a zero-length or unnamed chunk cannot be proxied
if (!chunk_length || soid == hobject_t()) {
/* same as do_proxy_read() */
flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
CEPH_OSD_FLAG_ORDERSNAP |
CEPH_OSD_FLAG_ENFORCE_SNAPC |
CEPH_OSD_FLAG_MAP_SNAP_CLONE);
dout(10) << __func__ << " Start do chunk proxy read for " << *m
<< " index: " << op_index << " oid: " << soid.oid.name
<< " req_offset: " << req_offset
<< " req_length: " << req_length << dendl;
ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
ObjectOperation* pobj_op = new ObjectOperation;
OSDOp& osd_op = pobj_op->add_op(m->ops[op_index].op.op);
if (chunk_index <= req_offset) {
// rebase the request offset into the chunk object's address space
osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset +
req_offset - chunk_index;
ceph_abort_msg("chunk_index > req_offset");
osd_op.op.extent.length = req_length;
ObjectOperation obj_op;
obj_op.dup(pobj_op->ops);
C_ProxyChunkRead* fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
// stash translation state so the completion can splice the data back
// into the original op's reply
fin->obj_op = pobj_op;
fin->op_index = op_index;
fin->req_offset = req_offset;
fin->req_total_len = req_total_len;
ceph_tid_t tid = osd->objecter->read(
soid.oid, oloc, obj_op,
m->get_snapid(), NULL,
flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
&prdop->user_version,
&prdop->data_offset,
prdop->objecter_tid = tid;
// track by tid and by the ORIGINAL object id, not the chunk's
proxyread_ops[tid] = prdop;
in_progress_proxy_ops[ori_soid].push_back(op);
// Decide whether a client read can be served by proxying to chunk objects:
// every READ/SYNC_READ extent in the message must be fully covered by
// non-missing entries of the object's chunk_map. Returns false when any
// requested range falls outside the chunk map.
// NOTE(review): the switch statement head, several break/return lines and
// closing braces are missing from this excerpt; tokens preserved as-is.
bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
MOSDOp* m = static_cast<MOSDOp*>(op->get_nonconst_req());
OSDOp* osd_op = NULL;
for (unsigned int i = 0; i < m->ops.size(); i++) {
osd_op = &m->ops[i];
ceph_osd_op op = osd_op->op;
case CEPH_OSD_OP_READ:
case CEPH_OSD_OP_SYNC_READ: {
// walk the requested extent chunk by chunk
uint64_t cursor = osd_op->op.extent.offset;
uint64_t remain = osd_op->op.extent.length;
/* requested chunks exist in chunk_map ? */
for (auto& p : obc->obs.oi.manifest.chunk_map) {
// does this chunk cover the current cursor position?
if (p.first <= cursor && p.first + p.second.length > cursor) {
if (!p.second.is_missing()) {
if (p.second.length >= remain) {
remain = remain - p.second.length;
cursor += p.second.length;
dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
// Completion path for a proxied write: validate and unlink the tracking
// entries (proxywrite_ops by tid, in_progress_proxy_ops by oid), bump the
// tier proxy-write counter, and send the client its commit reply if it has
// not been sent already.
// NOTE(review): early `return`s after the "not found" logs, the std::find
// third argument, and closing braces are missing from this excerpt.
void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
dout(10) << __func__ << " " << oid << " tid " << tid
<< " " << cpp_strerror(r) << dendl;
map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
if (p == proxywrite_ops.end()) {
// racing cancel may have already removed the entry
dout(10) << __func__ << " no proxywrite_op found" << dendl;
ProxyWriteOpRef pwop = p->second;
ceph_assert(tid == pwop->objecter_tid);
ceph_assert(oid == pwop->soid);
proxywrite_ops.erase(tid);
map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
if (q == in_progress_proxy_ops.end()) {
dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
list<OpRequestRef>& in_progress_op = q->second;
ceph_assert(in_progress_op.size());
list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
in_progress_op.end(),
ceph_assert(it != in_progress_op.end());
in_progress_op.erase(it);
if (in_progress_op.size() == 0) {
// last in-flight proxy op for this object
in_progress_proxy_ops.erase(oid);
} else if (std::find(in_progress_op.begin(),
in_progress_op.end(),
pwop->op) != in_progress_op.end()) {
dout(20) << __func__ << " " << oid << " tid " << tid
<< " in_progress_op size: "
<< in_progress_op.size() << dendl;
osd->logger->inc(l_osd_tier_proxy_write);
auto m = pwop->op->get_req<MOSDOp>();
ceph_assert(m != NULL);
if (!pwop->sent_reply) {
// the proxied write owns the reply; the local ctx must not also have one
assert(pwop->ctx->reply == nullptr);
MOSDOpReply* reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
true /* we claim it below */);
reply->set_reply_versions(eversion_t(), pwop->user_version);
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
reply->claim_op_out_data(pwop->ops);
dout(10) << " sending commit on " << pwop << " " << reply << dendl;
osd->send_message_osd_client(reply, m->get_connection());
pwop->sent_reply = true;
pwop->ctx->op->mark_commit_sent();
// Cancel a proxied write: mark it canceled, hand its objecter tid to the
// caller (for batched objecter cancellation) and drop it from the tracking
// map. NOTE(review): closing braces and any requeue logic after original
// line 3970 are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
vector<ceph_tid_t>* tids)
dout(10) << __func__ << " " << pwop->soid << dendl;
pwop->canceled = true;
// cancel objecter op, if we can
if (pwop->objecter_tid) {
// caller is responsible for actually canceling the collected tids
tids->push_back(pwop->objecter_tid);
proxywrite_ops.erase(pwop->objecter_tid);
pwop->objecter_tid = 0;
// Copy-completion callback for tier/manifest promotion: dispatches to
// finish_promote_manifest() for chunked-manifest objects and to
// finish_promote() otherwise, then records promotion latency.
// NOTE(review): member list (pg, start), constructor init-list head,
// access specifiers and closing braces are missing from this excerpt.
class PromoteCallback : public PrimaryLogPG::CopyCallback {
ObjectContextRef obc;
PromoteCallback(ObjectContextRef obc_, PrimaryLogPG* pg_)
start(ceph_clock_now()) {}
void finish(PrimaryLogPG::CopyCallbackResults results) override {
PrimaryLogPG::CopyResults* results_data = results.get<1>();
int r = results.get<0>();
// chunked manifests have their own promotion finisher
if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
pg->finish_promote_manifest(r, results_data, obc);
pg->finish_promote(r, results_data, obc);
pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
// Copy-completion callback used when promoting via a manifest-aware op
// context: on success of a redirect promotion it re-executes the original op
// context; on failure it replies/requeues/closes depending on the error.
// Stores the copy results so PromoteFinisher can replay them.
// NOTE(review): members (pg, start), constructor init-list head, several
// branch braces and the success/failure structure lines are missing from
// this excerpt; tokens preserved as-is.
class PromoteManifestCallback : public PrimaryLogPG::CopyCallback {
ObjectContextRef obc;
PrimaryLogPG::OpContext* ctx;
PrimaryLogPG::CopyCallbackResults promote_results;
PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG* pg_, PrimaryLogPG::OpContext* ctx)
start(ceph_clock_now()), ctx(ctx) {}
void finish(PrimaryLogPG::CopyCallbackResults results) override {
PrimaryLogPG::CopyResults* results_data = results.get<1>();
int r = results.get<0>();
// keep a copy for PromoteFinisher::execute()
promote_results = results;
if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
ctx->user_at_version = results_data->user_version;
// re-run the original client op now that the object is local
ctx->pg->execute_ctx(ctx);
if (r != -ECANCELED) {
ctx->pg->osd->reply_op_error(ctx->op, r);
} else if (results_data->should_requeue) {
ctx->pg->requeue_op(ctx->op);
ctx->pg->close_op_ctx(ctx);
pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
friend struct PromoteFinisher;
// OpFinisher that replays a stored PromoteManifestCallback result: routes to
// finish_promote() for redirect manifests, finish_promote_manifest() for
// chunked manifests, and aborts on any other manifest type.
// NOTE(review): the constructor body, the `return` of execute() and closing
// braces are missing from this excerpt; tokens preserved as-is.
struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
PromoteManifestCallback* promote_callback;
explicit PromoteFinisher(PromoteManifestCallback* promote_callback)
: promote_callback(promote_callback) {
int execute() override {
if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
promote_callback->promote_results.get<1>(),
promote_callback->obc);
} else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
promote_callback->promote_results.get<1>(),
promote_callback->obc);
// any other manifest type here is a programming error
ceph_abort_msg("unrecognized manifest type");
// Promote an object from the base tier (or a manifest target) into this
// cache/primary pool: pick the copy source from the manifest type, start the
// copy with a PromoteCallback, block the object until the copy completes,
// and count the promotion in the PG stats. If the object is blocked by scrub
// or the OSD is laggy, the op is queued/requeued instead.
// NOTE(review): the `op` parameter, several `return`s, if/else braces and
// declarations of src_hoid/cb are missing from this excerpt; tokens as-is.
void PrimaryLogPG::promote_object(ObjectContextRef obc,
const hobject_t& missing_oid,
const object_locator_t& oloc,
ObjectContextRef* promote_obc)
// either a live context or the oid of the missing object must be given
hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
ceph_assert(hoid != hobject_t());
if (m_scrubber->write_blocked_by_scrub(hoid)) {
dout(10) << __func__ << " " << hoid
<< " blocked by scrub" << dendl;
// park the op until scrub releases the object
waiting_for_scrub.push_back(op);
op->mark_delayed("waiting for scrub");
dout(10) << __func__ << " " << hoid
<< " placing op in waiting_for_scrub" << dendl;
dout(10) << __func__ << " " << hoid
<< " no op, dropping on the floor" << dendl;
if (op && !check_laggy_requeue(op)) {
if (!obc) { // we need to create an ObjectContext
ceph_assert(missing_oid != hobject_t());
obc = get_object_context(missing_oid, true);
/*
* Before promote complete, if there are proxy-reads for the object,
* for this case we don't use DONTNEED.
*/
unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
if (q == in_progress_proxy_ops.end()) {
src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
object_locator_t my_oloc;
if (!obc->obs.oi.has_manifest()) {
// classic cache tiering: copy from the base tier
my_oloc.pool = pool.info.tier_of;
src_hoid = obc->obs.oi.soid;
cb = new PromoteCallback(obc, this);
if (obc->obs.oi.manifest.is_chunked()) {
src_hoid = obc->obs.oi.soid;
cb = new PromoteCallback(obc, this);
} else if (obc->obs.oi.manifest.is_redirect()) {
// redirect: copy from the redirect target object
object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
src_hoid = obc->obs.oi.manifest.redirect_target;
cb = new PromoteCallback(obc, this);
ceph_abort_msg("unrecognized manifest type");
unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
obc->obs.oi.soid.snap == CEPH_NOSNAP,
src_fadvise_flags, 0);
// start_copy must have blocked the object; the op waits on it
ceph_assert(obc->is_blocked());
wait_for_blocked_object(obc->obs.oi.soid, op);
recovery_state.update_stats(
[](auto& history, auto& stats) {
stats.stats.sum.num_promote++;
// Core client-op execution: (re)initializes the op context, resolves the
// snap context for writes, prepares the transaction, builds the client
// reply, and either completes immediately (read / error / log-only) or
// issues replica writes via a RepGather. Designed to be idempotent — it may
// run multiple times for the same ctx (e.g. after a blocked promote).
// NOTE(review): many structural lines are missing from this excerpt (braces,
// `return`s, lambda bodies of the register_on_* callbacks, MOSDOpReply
// constructor tails); tokens below are preserved exactly as extracted.
void PrimaryLogPG::execute_ctx(OpContext* ctx)
dout(10) << __func__ << " " << ctx << dendl;
ctx->reset_obs(ctx->obc);
ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
OpRequestRef op = ctx->op;
auto m = op->get_req<MOSDOp>();
ObjectContextRef obc = ctx->obc;
const hobject_t& soid = obc->obs.oi.soid;
// this method must be idempotent since we may call it several times
// before we finally apply the resulting transaction.
ctx->op_t.reset(new PGTransaction);
if (op->may_write() || op->may_cache()) {
// snap context: pool snaps unless the client enforces its own snapc
if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
pool.info.is_pool_snaps_mode()) {
ctx->snapc = pool.snapc;
// client specified snapc
ctx->snapc.seq = m->get_snap_seq();
ctx->snapc.snaps = m->get_snaps();
filter_snapc(ctx->snapc.snaps);
// ORDERSNAP: reject writes whose snapc is older than the object's snapset
if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
ctx->snapc.seq < obc->ssc->snapset.seq) {
dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
<< " < snapset seq " << obc->ssc->snapset.seq
<< " on " << obc->obs.oi.soid << dendl;
reply_ctx(ctx, -EOLDSNAPC);
ctx->at_version = get_next_version();
ctx->mtime = m->get_mtime();
dout(10) << __func__ << " " << soid << " " << *ctx->ops
<< " ov " << obc->obs.oi.version << " av " << ctx->at_version
<< " snapc " << ctx->snapc
<< " snapset " << obc->ssc->snapset
dout(10) << __func__ << " " << soid << " " << *ctx->ops
<< " ov " << obc->obs.oi.version
if (!ctx->user_at_version)
ctx->user_at_version = obc->obs.oi.user_version;
dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
osd_reqid_t reqid = ctx->op->get_reqid();
tracepoint(osd, prepare_tx_enter, reqid.name._type,
reqid.name._num, reqid.tid, reqid.inc);
// run the per-op handlers and fill the transaction
int result = prepare_transaction(ctx);
osd_reqid_t reqid = ctx->op->get_reqid();
tracepoint(osd, prepare_tx_exit, reqid.name._type,
reqid.name._num, reqid.tid, reqid.inc);
bool pending_async_reads = !ctx->pending_async_reads.empty();
if (result == -EINPROGRESS || pending_async_reads) {
// suspended; will resume later (EC async reads go through this path)
if (pending_async_reads) {
ceph_assert(pool.info.is_erasure());
in_progress_async_reads.push_back(make_pair(op, ctx));
ctx->start_async_reads(this);
if (result == -EAGAIN) {
// clean up after the ctx
bool ignore_out_data = false;
if (!ctx->op_t->empty() &&
// successful update
if (ctx->op->allows_returnvec()) {
// enforce reasonable bound on the return buffer sizes
for (auto& i : *ctx->ops) {
if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
result = -EOVERFLOW; // overall result is overflow
i.rval = -EOVERFLOW;
// legacy behavior -- zero result and return data etc.
ignore_out_data = true;
// prepare the reply
ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
dout(20) << __func__ << " alloc reply " << ctx->reply
<< " result " << result << dendl;
// read-only or failed ops complete here without touching the log
if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
// finish side-effects
do_osd_op_effects(ctx, m->get_connection());
complete_read_ctx(result, ctx);
ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
ceph_assert(op->may_write() || op->may_cache());
recovery_state.update_trim_to();
// verify that we are doing this in order?
if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
!pool.info.is_tier() && !pool.info.has_tiers()) {
map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
ceph_tid_t t = m->get_tid();
client_t n = m->get_source().num();
map<client_t,ceph_tid_t>::iterator p = cm.find(n);
if (p == cm.end()) {
dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
dout(20) << " op order client." << n << " tid " << t
<< " last was " << p->second << dendl;
if (p->second > t) {
derr << "bad op order, already applied " << p->second
<< " > this " << t << dendl;
ceph_abort_msg("out of order op");
if (ctx->update_log_only) {
// error path that still records a pg-log entry for dup detection
do_osd_op_effects(ctx, m->get_connection());
dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
// save just what we need from ctx
MOSDOpReply* reply = ctx->reply;
ctx->reply = nullptr;
reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
if (result == -ENOENT) {
reply->set_enoent_reply_versions(info.last_update,
info.last_user_version);
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
// append to pg log for dup detection - don't save buffers for now
record_write_error(op, soid, reply, result,
ctx->op->allows_returnvec() ? ctx : nullptr);
// no need to capture PG ref, repop cancel will handle that
// Can capture the ctx by pointer, it's owned by the repop
ctx->register_on_commit(
log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
if (m && !ctx->sent_reply) {
MOSDOpReply* reply = ctx->reply;
ctx->reply = nullptr;
reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
dout(10) << " sending reply on " << *m << " " << reply << dendl;
osd->send_message_osd_client(reply, m->get_connection());
ctx->sent_reply = true;
ctx->op->mark_commit_sent();
ctx->register_on_success(
ctx->op ? ctx->op->get_req()->get_connection() :
ctx->register_on_finish(
// issue replica writes
ceph_tid_t rep_tid = osd->get_tid();
RepGather* repop = new_repop(ctx, rep_tid);
issue_repop(repop, ctx);
// Tear down an op context: release its object locks and run (then erase)
// every registered on_finish callback. The erase(p++) idiom keeps the
// iterator valid while removing the current element.
// NOTE(review): the loop body and the ctx deletion after original line 4362
// are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::close_op_ctx(OpContext* ctx) {
release_object_locks(ctx->lock_manager);
for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
ctx->on_finish.erase(p++)) {
// Reply to the client with error `r` and dispose of the op context.
// NOTE(review): the close_op_ctx(ctx) call after original line 4371 is
// missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::reply_ctx(OpContext* ctx, int r)
osd->reply_op_error(ctx->op, r);
// Record per-op performance counters: overall op count/bytes/latency, plus
// the rw / read-only / write-only breakdowns (with latency histograms binned
// by byte count), and feed the dynamic per-client perf stats if enabled.
// `latency` is measured from message receive; `process_latency` from dequeue.
// NOTE(review): the `inb` parameter (original line 4376) and several closing
// braces are missing from this excerpt; tokens preserved as-is.
void PrimaryLogPG::log_op_stats(const OpRequest& op,
const uint64_t outb)
auto m = op.get_req<MOSDOp>();
const utime_t now = ceph_clock_now();
const utime_t latency = now - m->get_recv_stamp();
const utime_t process_latency = now - op.get_dequeued_time();
osd->logger->inc(l_osd_op);
osd->logger->inc(l_osd_op_outb, outb);
osd->logger->inc(l_osd_op_inb, inb);
osd->logger->tinc(l_osd_op_lat, latency);
osd->logger->tinc(l_osd_op_process_lat, process_latency);
if (op.may_read() && op.may_write()) {
// read-modify-write ops
osd->logger->inc(l_osd_op_rw);
osd->logger->inc(l_osd_op_rw_inb, inb);
osd->logger->inc(l_osd_op_rw_outb, outb);
osd->logger->tinc(l_osd_op_rw_lat, latency);
osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
} else if (op.may_read()) {
// pure reads
osd->logger->inc(l_osd_op_r);
osd->logger->inc(l_osd_op_r_outb, outb);
osd->logger->tinc(l_osd_op_r_lat, latency);
osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
} else if (op.may_write() || op.may_cache()) {
// pure writes / cache ops
osd->logger->inc(l_osd_op_w);
osd->logger->inc(l_osd_op_w_inb, inb);
osd->logger->tinc(l_osd_op_w_lat, latency);
osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
dout(15) << "log_op_stats " << *m
<< " lat " << latency << dendl;
if (m_dynamic_perf_stats.is_enabled()) {
m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
// Install the set of OSD perf-metric queries driving dynamic per-client
// performance statistics for this PG.
void PrimaryLogPG::set_dynamic_perf_stats_queries(
const std::list<OSDPerfMetricQuery>& queries)
m_dynamic_perf_stats.set_queries(queries);
// Hand the accumulated dynamic perf stats to the caller and reset the local
// accumulator in one step (swap avoids copying and clears our state).
void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats* stats)
std::swap(m_dynamic_perf_stats, *stats);
// Handle an MOSDPGScan during backfill:
// - OP_SCAN_GET_DIGEST (on a backfill target): refuse if the OSD is too
//   full, otherwise scan a range of objects and reply with OP_SCAN_DIGEST;
// - OP_SCAN_DIGEST (on the primary): record the peer's interval and, once
//   all targets have answered, finish the recovery op.
// NOTE(review): the `op` parameter, the switch head, the scan_range call
// head and several braces are missing from this excerpt; tokens as-is.
void PrimaryLogPG::do_scan(
ThreadPool::TPHandle& handle)
auto m = op->get_req<MOSDPGScan>();
ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
dout(10) << "do_scan " << *m << dendl;
case MOSDPGScan::OP_SCAN_GET_DIGEST:
auto dpp = get_dpp();
if (osd->check_backfill_full(dpp)) {
// this OSD cannot accept backfill data: abort via peering event
dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
queue_peering_event(
std::make_shared<PGPeeringEvent>(
PeeringState::BackfillTooFull())));
BackfillInterval bi;
bi.begin = m->begin;
// No need to flush, there won't be any in progress writes occuring
cct->_conf->osd_backfill_scan_min,
cct->_conf->osd_backfill_scan_max,
MOSDPGScan* reply = new MOSDPGScan(
MOSDPGScan::OP_SCAN_DIGEST,
get_osdmap_epoch(), m->query_epoch,
spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
encode(bi.objects, reply->get_data());
osd->send_message_osd_cluster(reply, m->get_connection());
case MOSDPGScan::OP_SCAN_DIGEST:
pg_shard_t from = m->from;
// Check that from is in backfill_targets vector
ceph_assert(is_backfill_target(from));
BackfillInterval& bi = peer_backfill_info[from];
bi.begin = m->begin;
auto p = m->get_data().cbegin();
// take care to preserve ordering!
decode_noclear(bi.objects, p);
dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
<< " bi.objects.size()=" << bi.objects.size() << dendl;
if (waiting_on_backfill.erase(from)) {
if (waiting_on_backfill.empty()) {
// all backfill targets have replied with their digest
peer_backfill_info.size() ==
get_backfill_targets().size());
finish_recovery_op(hobject_t::get_max());
// we canceled backfill for a while due to a too full, and this
// is an extra response from a non-too-full peer
dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
// Handle an MOSDPGBackfill message:
// - OP_BACKFILL_FINISH (on a target): ack back to the primary, then queue a
//   peering event; - OP_BACKFILL_PROGRESS: persist updated backfill progress
//   in a transaction; - OP_BACKFILL_FINISH_ACK (on the primary): complete
//   the recovery op. osd_kill_backfill_at asserts are test injection points.
// NOTE(review): switch head, case braces, the progress update lambda body
// and the FINISH reply argument lines are missing from this excerpt.
void PrimaryLogPG::do_backfill(OpRequestRef op)
auto m = op->get_req<MOSDPGBackfill>();
ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
dout(10) << "do_backfill " << *m << dendl;
case MOSDPGBackfill::OP_BACKFILL_FINISH:
ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
MOSDPGBackfill* reply = new MOSDPGBackfill(
MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
spg_t(info.pgid.pgid, get_primary().shard));
reply->set_priority(recovery_state.get_recovery_op_priority());
osd->send_message_osd_cluster(reply, m->get_connection());
queue_peering_event(
std::make_shared<PGPeeringEvent>(
case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
ObjectStore::Transaction t;
recovery_state.update_backfill_progress(
m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
ceph_assert(tr == 0);
case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
// only the primary may receive the final ack
ceph_assert(is_primary());
ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
finish_recovery_op(hobject_t::get_max());
// Remove a list of objects on a backfill target (MOSDPGBackfillRemove).
// While remote-backfilling, also adjust the byte accounting: local bytes
// from on-disk stat(), logical (user) bytes from object_info — scaled by the
// EC data-chunk count for erasure pools. All removals go into one
// transaction that is queued at the end.
// NOTE(review): get_req() argument, `struct stat st` declaration, attribute
// name/bufferlist args of getattr, and result checks are missing from this
// excerpt (gaps in original numbering); tokens preserved as-is.
void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
const MOSDPGBackfillRemove* m = static_cast<const MOSDPGBackfillRemove*>(
ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
dout(7) << __func__ << " " << m->ls << dendl;
ObjectStore::Transaction t;
for (auto& p : m->ls) {
if (is_remote_backfilling()) {
// physical size on this shard
int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
pg_whoami.shard) , &st);
sub_local_num_bytes(st.st_size);
if (pool.info.is_erasure()) {
// logical size = per-shard size scaled by data chunk count
int r = osd->store->getattr(
ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
object_info_t oi(bv);
usersize = oi.size * pgbackend->get_ec_data_chunk_count();
dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
<< " can't get object info" << dendl;
usersize = st.st_size;
sub_num_bytes(usersize);
dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
<< " sub actual data by " << st.st_size
<< " sub num_bytes by " << usersize
remove_snap_mapped_object(t, p.first);
int r = osd->store->queue_transaction(ch, std::move(t), NULL);
ceph_assert(r == 0);
// Trim one clone object (coid) for snap_to_trim: recompute its remaining
// snaps, and either delete the clone (when no snaps remain) or rewrite its
// adjusted clone_snaps.  On success an OpContext describing the transaction
// is handed back through *ctxp.
// NOTE(review): this extract is lossy — several original lines (braces,
// early returns, log-entry construction arguments) are missing; comments
// below describe only what the visible code demonstrates.
4620 int PrimaryLogPG::trim_object(
4621 bool first
, const hobject_t
&coid
, snapid_t snap_to_trim
,
4622 PrimaryLogPG::OpContextUPtr
*ctxp
)
// Look up the clone's object context; without an obc/ssc the object
// cannot be trimmed and a cluster-log error is emitted.
4628 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
4629 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
4630 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4631 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
// Also need the head object's context to update the snapset afterwards.
4635 hobject_t head_oid
= coid
.get_head();
4636 ObjectContextRef head_obc
= get_object_context(head_oid
, false);
4638 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
4639 << " repair needed, no snapset obc for " << head_oid
;
4643 SnapSet
& snapset
= obc
->ssc
->snapset
;
4645 object_info_t
&coi
= obc
->obs
.oi
;
// The clone must be listed in the snapset's clone_snaps map.
4646 auto citer
= snapset
.clone_snaps
.find(coid
.snap
);
4647 if (citer
== snapset
.clone_snaps
.end()) {
4648 osd
->clog
->error() << "No clone_snaps in snapset " << snapset
4649 << " for object " << coid
<< "\n";
4652 set
<snapid_t
> old_snaps(citer
->second
.begin(), citer
->second
.end());
4653 if (old_snaps
.empty()) {
4654 osd
->clog
->error() << "No object info snaps for object " << coid
;
4658 dout(10) << coid
<< " old_snaps " << old_snaps
4659 << " old snapset " << snapset
<< dendl
;
4660 if (snapset
.seq
== 0) {
4661 osd
->clog
->error() << "No snapset.seq for object " << coid
;
// new_snaps = snaps the clone still belongs to after dropping
// snap_to_trim and anything already in the removed-snaps queue.
4665 set
<snapid_t
> new_snaps
;
4666 const OSDMapRef
& osdmap
= get_osdmap();
4667 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
4668 i
!= old_snaps
.end();
4670 if (!osdmap
->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *i
) &&
4671 *i
!= snap_to_trim
) {
4672 new_snaps
.insert(*i
);
4676 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
// If nothing keeps this clone alive, locate it in snapset.clones so it
// can be erased below.
4678 if (new_snaps
.empty()) {
4679 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
4680 if (p
== snapset
.clones
.end()) {
4681 osd
->clog
->error() << "Snap " << coid
.snap
<< " not in clones";
// Build an op context and take snaptrimmer write locks on both the
// clone and the head; failure to lock aborts the trim attempt.
4686 OpContextUPtr ctx
= simple_opc_create(obc
);
4687 ctx
->head_obc
= head_obc
;
4689 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4693 close_op_ctx(ctx
.release());
4694 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
4698 if (!ctx
->lock_manager
.get_snaptrimmer_write(
4702 close_op_ctx(ctx
.release());
4703 dout(10) << __func__
<< ": Unable to get a wlock on " << head_oid
<< dendl
;
4707 ctx
->at_version
= get_next_version();
4709 PGTransaction
*t
= ctx
->op_t
.get();
// Remember the object count so the number trimmed can be reported later.
4711 int64_t num_objects_before_trim
= ctx
->delta_stats
.num_objects
;
// Case 1: no snaps remain -> delete the clone entirely.
4713 if (new_snaps
.empty()) {
4715 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
4716 << new_snaps
<< " ... deleting" << dendl
;
4719 ceph_assert(p
!= snapset
.clones
.end());
4721 snapid_t last
= coid
.snap
;
4722 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
4724 if (p
!= snapset
.clones
.begin()) {
4725 // not the oldest... merge overlap into next older clone
4726 vector
<snapid_t
>::iterator n
= p
- 1;
4727 hobject_t prev_coid
= coid
;
4728 prev_coid
.snap
= *n
;
// Only adjust byte accounting for the previous clone if it is
// actually present on this OSD.
4729 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
4731 if (adjust_prev_bytes
)
4732 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
4734 snapset
.clone_overlap
[*n
].intersection_of(
4735 snapset
.clone_overlap
[*p
]);
4737 if (adjust_prev_bytes
)
4738 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
// Decrement the per-PG stats the deleted clone contributed to.
4740 ctx
->delta_stats
.num_objects
--;
4742 ctx
->delta_stats
.num_objects_dirty
--;
4744 ctx
->delta_stats
.num_objects_omap
--;
4745 if (coi
.is_whiteout()) {
4746 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
4747 ctx
->delta_stats
.num_whiteouts
--;
4749 ctx
->delta_stats
.num_object_clones
--;
4750 if (coi
.is_cache_pinned())
4751 ctx
->delta_stats
.num_objects_pinned
--;
4752 if (coi
.has_manifest()) {
4753 dec_all_refcount_manifest(coi
, ctx
.get());
4754 ctx
->delta_stats
.num_objects_manifest
--;
4756 obc
->obs
.exists
= false;
// Remove every snapset record of the deleted clone.
4758 snapset
.clones
.erase(p
);
4759 snapset
.clone_overlap
.erase(last
);
4760 snapset
.clone_size
.erase(last
);
4761 snapset
.clone_snaps
.erase(last
);
4765 pg_log_entry_t::DELETE
,
4768 ctx
->obs
->oi
.version
,
4780 coi
= object_info_t(coid
);
4782 ctx
->at_version
.version
++;
// Case 2: snaps remain -> just rewrite the clone's adjusted snap list.
4784 // save adjusted snaps for this object
4785 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
4786 snapset
.clone_snaps
[coid
.snap
] =
4787 vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
4788 // we still do a 'modify' event on this object just to trigger a
4789 // snapmapper.update ... :(
4791 coi
.prior_version
= coi
.version
;
4792 coi
.version
= ctx
->at_version
;
4794 encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4795 t
->setattr(coid
, OI_ATTR
, bl
);
4799 pg_log_entry_t::MODIFY
,
4808 ctx
->at_version
.version
++;
// Now update (or remove) the head object's snapset.
4816 // save head snapset
4817 dout(10) << coid
<< " new snapset " << snapset
<< " on "
4818 << head_obc
->obs
.oi
<< dendl
;
// If no clones remain and the head is only a whiteout (and not a dirty
// cache-tier object or cache-pinned), remove the head itself.
4819 if (snapset
.clones
.empty() &&
4820 (head_obc
->obs
.oi
.is_whiteout() &&
4821 !(head_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
4822 !head_obc
->obs
.oi
.is_cache_pinned())) {
4823 // NOTE: this arguably constitutes minor interference with the
4824 // tiering agent if this is a cache tier since a snap trim event
4825 // is effectively evicting a whiteout we might otherwise want to
4827 dout(10) << coid
<< " removing " << head_oid
<< dendl
;
4830 pg_log_entry_t::DELETE
,
4833 head_obc
->obs
.oi
.version
,
4839 dout(10) << "removing snap head" << dendl
;
4840 object_info_t
& oi
= head_obc
->obs
.oi
;
// Undo the head object's contribution to the PG stats.
4841 ctx
->delta_stats
.num_objects
--;
4842 if (oi
.is_dirty()) {
4843 ctx
->delta_stats
.num_objects_dirty
--;
4846 ctx
->delta_stats
.num_objects_omap
--;
4847 if (oi
.is_whiteout()) {
4848 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
4849 ctx
->delta_stats
.num_whiteouts
--;
4851 if (oi
.is_cache_pinned()) {
4852 ctx
->delta_stats
.num_objects_pinned
--;
4854 if (oi
.has_manifest()) {
4855 ctx
->delta_stats
.num_objects_manifest
--;
4856 dec_all_refcount_manifest(oi
, ctx
.get());
4858 head_obc
->obs
.exists
= false;
4859 head_obc
->obs
.oi
= object_info_t(head_oid
);
4860 t
->remove(head_oid
);
// Otherwise keep the head and write back the updated snapset/object-info
// attributes; pre-octopus peers still expect SnapSet::snaps filtered.
4862 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
4863 // filter SnapSet::snaps for the benefit of pre-octopus
4864 // peers. This is perhaps overly conservative in that I'm not
4865 // certain they need this, but let's be conservative here.
4866 dout(10) << coid
<< " filtering snapset on " << head_oid
<< dendl
;
4867 snapset
.filter(pool
.info
);
4869 snapset
.snaps
.clear();
4871 dout(10) << coid
<< " writing updated snapset on " << head_oid
4872 << ", snapset is " << snapset
<< dendl
;
4875 pg_log_entry_t::MODIFY
,
4878 head_obc
->obs
.oi
.version
,
4885 head_obc
->obs
.oi
.prior_version
= head_obc
->obs
.oi
.version
;
4886 head_obc
->obs
.oi
.version
= ctx
->at_version
;
4888 map
<string
, bufferlist
, less
<>> attrs
;
4890 encode(snapset
, bl
);
4891 attrs
[SS_ATTR
] = std::move(bl
);
4894 encode(head_obc
->obs
.oi
, bl
,
4895 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4896 attrs
[OI_ATTR
] = std::move(bl
);
4897 t
->setattrs(head_oid
, attrs
);
4900 // Stats reporting - Set number of objects trimmed
4901 if (num_objects_before_trim
> ctx
->delta_stats
.num_objects
) {
4902 int64_t num_objects_trimmed
=
4903 num_objects_before_trim
- ctx
->delta_stats
.num_objects
;
4904 add_objects_trimmed_count(num_objects_trimmed
);
// Hand the prepared op context back to the caller.
4907 *ctxp
= std::move(ctx
);
// Kick the snap-trimmer state machine when this (active, primary) PG has
// snaps queued to trim, unless the cluster-wide nosnaptrim flag is set
// or the PG is pre-merge.
4911 void PrimaryLogPG::kick_snap_trim()
4913 ceph_assert(is_active());
4914 ceph_assert(is_primary());
4916 !state_test(PG_STATE_PREMERGE
) &&
4917 !snap_trimq
.empty()) {
// Respect the osdmap-level NOSNAPTRIM flag: log and do nothing.
4918 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM
)) {
4919 dout(10) << __func__
<< ": nosnaptrim set, not kicking" << dendl
;
4921 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
// Reset trim counters/stamps before driving the state machine.
4922 reset_objects_trimmed();
4923 set_snaptrim_begin_stamp();
4924 snap_trimmer_machine
.process_event(KickTrim());
// Called when a scrub finishes: if trimming work is pending on this clean,
// active primary, requeue the snap trimmer via a ScrubComplete event.
4929 void PrimaryLogPG::snap_trimmer_scrub_complete()
4931 if (is_primary() && is_active() && is_clean() && !snap_trimq
.empty()) {
4932 dout(10) << "scrub finished - requeuing snap_trimmer" << dendl
;
4933 snap_trimmer_machine
.process_event(ScrubComplete());
// Snap-trim work entry point.  Bails out if the PG is being deleted or has
// been reset since the work item was queued (queued epoch), then posts a
// DoSnapWork event to the trimmer state machine.
4937 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
4939 if (recovery_state
.is_deleting() || pg_has_reset_since(queued
)) {
4943 ceph_assert(is_primary());
4945 dout(10) << "snap_trimmer posting" << dendl
;
4946 snap_trimmer_machine
.process_event(DoSnapWork());
4947 dout(10) << "snap_trimmer complete" << dendl
;
// Generic xattr comparison helper: applies the CEPH_OSD_CMPXATTR_OP_*
// relational operator 'op' to lhs/rhs.  Lives in an anonymous namespace
// (closed at original line 4974).
// NOTE(review): the per-case return expressions are missing from this
// extract; only the case labels are visible.
4953 template<typename U
, typename V
>
4954 int do_cmp_xattr(int op
, const U
& lhs
, const V
& rhs
)
4957 case CEPH_OSD_CMPXATTR_OP_EQ
:
4959 case CEPH_OSD_CMPXATTR_OP_NE
:
4961 case CEPH_OSD_CMPXATTR_OP_GT
:
4963 case CEPH_OSD_CMPXATTR_OP_GTE
:
4965 case CEPH_OSD_CMPXATTR_OP_LT
:
4967 case CEPH_OSD_CMPXATTR_OP_LTE
:
4974 } // anonymous namespace
// Compare a caller-supplied u64 (v1) against the numeric value stored in
// an xattr bufferlist, using operator 'op'.  The xattr bytes are parsed
// with std::from_chars (locale-free, non-allocating); a parse failure is
// handled in the (elided) error branch.
4976 int PrimaryLogPG::do_xattr_cmp_u64(int op
, uint64_t v1
, bufferlist
& xattr
)
4980 if (xattr
.length()) {
4981 const char* first
= xattr
.c_str();
4982 if (auto [p
, ec
] = std::from_chars(first
, first
+ xattr
.length(), v2
);
4983 ec
!= std::errc()) {
4989 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
// Delegate the actual relational test to the templated helper.
4990 return do_cmp_xattr(op
, v1
, v2
);
// Compare a caller-supplied string (v1s) against the raw bytes of an xattr
// (viewed without copying via string_view), using operator 'op'.
4993 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
4995 string_view
v2s(xattr
.c_str(), xattr
.length());
4996 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
// Delegate the actual relational test to the templated helper.
4997 return do_cmp_xattr(op
, v1s
, v2s
);
// Implement CEPH_OSD_OP_WRITESAME by expanding the repeated data pattern
// into a single ordinary CEPH_OSD_OP_WRITE and re-dispatching it through
// do_osd_ops().
5000 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
5002 ceph_osd_op
& op
= osd_op
.op
;
5003 vector
<OSDOp
> write_ops(1);
5004 OSDOp
& write_op
= write_ops
[0];
5005 uint64_t write_length
= op
.writesame
.length
;
// Validate: data_length must be non-zero and divide the total length...
5011 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
// ...and the supplied pattern must match the declared data_length.
5014 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
5015 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
// Replicate the pattern until the full write length is covered.
5019 while (write_length
) {
5020 write_op
.indata
.append(osd_op
.indata
);
5021 write_length
-= op
.writesame
.data_length
;
// Turn the expanded buffer into a plain WRITE over the same extent.
5024 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
5025 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
5026 write_op
.op
.extent
.length
= op
.writesame
.length
;
5027 result
= do_osd_ops(ctx
, write_ops
);
5029 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
5034 // ========================================================================
5035 // low level osd ops
// Convert a legacy TMAP object to OMAP: read the tmap header/values, then
// issue truncate + omap-set-header + omap-set-vals as sub-ops.
// CEPH_OSD_TMAP2OMAP_NULLOK in 'flags' tolerates a missing tmap (-ENODATA).
5037 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
5039 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
5040 bufferlist header
, vals
;
5041 int r
= _get_tmap(ctx
, &header
, &vals
);
5043 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
// Three sub-ops: wipe the byte data, then install the omap header/values.
5048 vector
<OSDOp
> ops(3);
5050 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
5051 ops
[0].op
.extent
.offset
= 0;
5052 ops
[0].op
.extent
.length
= 0;
5054 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
5055 ops
[1].indata
= std::move(header
);
5057 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
5058 ops
[2].indata
= std::move(vals
);
5060 return do_osd_ops(ctx
, ops
);
// Slow-path TMAPUP: decode the whole tmap ('bl') into an in-memory map,
// apply the update stream 'bp' operation by operation, re-encode, and
// write the result back with a single WRITEFULL.
// NOTE(review): the decode loop bodies and error returns are elided in
// this extract; only the op-code dispatch skeleton is visible.
5063 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
5064 OSDOp
& osd_op
, bufferlist
& bl
)
5068 map
<string
, bufferlist
> m
;
5070 auto p
= bl
.cbegin();
5073 ceph_assert(p
.end());
5083 case CEPH_OSD_TMAP_SET
: // insert key
5091 case CEPH_OSD_TMAP_RM
: // remove key
5093 if (!m
.count(key
)) {
5098 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
5102 case CEPH_OSD_TMAP_HDR
: // update header
// Re-encode header + map and write the whole object back.
5114 encode(header
, obl
);
5118 vector
<OSDOp
> nops(1);
5119 OSDOp
& newop
= nops
[0];
5120 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5121 newop
.op
.extent
.offset
= 0;
5122 newop
.op
.extent
.length
= obl
.length();
5124 do_osd_ops(ctx
, nops
);
// Fast-path TMAPUP: stream-merge the sorted update commands in 'bp' with
// the object's existing (sorted) tmap key data, building the new key data
// incrementally.  Falls back to do_tmapup_slow() when the update stream
// turns out to be unsorted.  Finishes with a WRITEFULL of the merged tmap.
// NOTE(review): this extract is lossy — loop headers, decode statements
// and returns are partially elided.
5128 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::const_iterator
& bp
, OSDOp
& osd_op
)
5130 bufferlist::const_iterator orig_bp
= bp
;
5133 dout(10) << "tmapup is a no-op" << dendl
;
5135 // read the whole object
5136 vector
<OSDOp
> nops(1);
5137 OSDOp
& newop
= nops
[0];
5138 newop
.op
.op
= CEPH_OSD_OP_READ
;
5139 newop
.op
.extent
.offset
= 0;
5140 newop
.op
.extent
.length
= 0;
5141 result
= do_osd_ops(ctx
, nops
);
5143 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
5145 dout(30) << " starting is \n";
5146 newop
.outdata
.hexdump(*_dout
);
5149 auto ip
= newop
.outdata
.cbegin();
5152 dout(30) << "the update command is: \n";
5153 osd_op
.indata
.hexdump(*_dout
);
// Decode the existing header; a leading TMAP_HDR command replaces it.
5159 if (newop
.outdata
.length()) {
5163 dout(10) << "tmapup header " << header
.length() << dendl
;
5165 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
5168 dout(10) << "tmapup new header " << header
.length() << dendl
;
5171 encode(header
, obl
);
5173 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
// Merge loop state: nextkey/nextval is the current key from the object,
// last_in_key tracks update-stream ordering, newkeydata accumulates output.
5176 bufferlist newkeydata
;
5177 string nextkey
, last_in_key
;
5179 bool have_next
= false;
5182 decode(nextkey
, ip
);
5183 decode(nextval
, ip
);
5185 while (!bp
.end() && !result
) {
5192 catch (ceph::buffer::error
& e
) {
// Update keys must arrive sorted; otherwise restart via the slow path.
5195 if (key
< last_in_key
) {
5196 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
5197 << "', falling back to an inefficient (unsorted) update" << dendl
;
5199 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
5203 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
5205 // skip existing intervening keys
5206 bool key_exists
= false;
5207 while (have_next
&& !key_exists
) {
5208 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
// Copy through existing keys that sort before the update key; an
// equal key is dropped (it is being replaced/removed).
5211 if (nextkey
< key
) {
5213 encode(nextkey
, newkeydata
);
5214 encode(nextval
, newkeydata
);
5215 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5217 // don't copy; discard old value. and stop.
5218 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
5223 decode(nextkey
, ip
);
5224 decode(nextval
, ip
);
// Apply the update op itself: SET overwrites, CREATE fails on an
// existing key, RM requires the key, RMSLOPPY tolerates absence.
5230 if (op
== CEPH_OSD_TMAP_SET
) {
5235 catch (ceph::buffer::error
& e
) {
5238 encode(key
, newkeydata
);
5239 encode(val
, newkeydata
);
5240 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
5242 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
5250 catch (ceph::buffer::error
& e
) {
5253 encode(key
, newkeydata
);
5254 encode(val
, newkeydata
);
5255 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
5257 } else if (op
== CEPH_OSD_TMAP_RM
) {
5262 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
5265 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
// Copy any remaining original keys and trailing raw data through.
5272 encode(nextkey
, newkeydata
);
5273 encode(nextval
, newkeydata
);
5274 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
5278 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
5279 dout(20) << " keep trailing " << rest
.length()
5280 << " at " << newkeydata
.length() << dendl
;
5281 newkeydata
.claim_append(rest
);
5284 // encode final key count + key data
5285 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
5287 obl
.claim_append(newkeydata
);
5290 dout(30) << " final is \n";
5291 obl
.hexdump(*_dout
);
// Debug-only sanity decode of the merged tmap before writing it back.
5295 auto tp
= obl
.cbegin();
5298 map
<string
,bufferlist
> d
;
5300 ceph_assert(tp
.end());
5301 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
5306 dout(20) << "tmapput write " << obl
.length() << dendl
;
5307 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
5308 newop
.op
.extent
.offset
= 0;
5309 newop
.op
.extent
.length
= obl
.length();
5311 do_osd_ops(ctx
, nops
);
// Validate an (offset, length) extent against the configured maximum
// object size 'max', logging through the given DoutPrefixProvider when the
// extent is out of range.  The overflow-safe form of the check (offset
// alone, then offset+length) rejects extents that would wrap past 'max'.
5317 static int check_offset_and_length(uint64_t offset
, uint64_t length
,
5318 uint64_t max
, DoutPrefixProvider
*dpp
)
5320 if (offset
>= max
||
5322 offset
+ length
> max
) {
5323 ldpp_dout(dpp
, 10) << __func__
<< " "
5324 << "osd_max_object_size: " << max
5325 << "; Hard limit of object size is 4GB." << dendl
;
// Completion context for async extent reads: records the returned length,
// and — when the read covered the whole object and a stored crc is known
// (maybe_crc) — verifies the data crc32c, raising a cluster-log error on
// mismatch (tolerated only with CEPH_OSD_OP_FLAG_FAILOK).
5332 struct FillInVerifyExtent
: public Context
{
5335 bufferlist
*outdatap
;
5336 std::optional
<uint32_t> maybe_crc
;
5341 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
5342 std::optional
<uint32_t> mc
, uint64_t size
,
5343 OSDService
*osd
, hobject_t soid
, uint32_t flags
) :
5344 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
5345 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
5346 void finish(int len
) override
{
5354 // whole object? can we verify the checksum?
5355 if (maybe_crc
&& *r
== size
) {
5356 uint32_t crc
= outdatap
->crc32c(-1);
5357 if (maybe_crc
!= crc
) {
5358 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
5359 << " != expected 0x" << *maybe_crc
5360 << std::dec
<< " on " << soid
;
5361 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
// Completion context that converts a plain read result into sparse-read
// wire format: a single {offset -> length} extent map followed by the data,
// swapped back into the caller's bufferlist.
5370 struct ToSparseReadResult
: public Context
{
5372 bufferlist
* data_bl
;
5373 uint64_t data_offset
;
5375 ToSparseReadResult(int* result
, bufferlist
* bl
, uint64_t offset
,
5377 : result(result
), data_bl(bl
), data_offset(offset
),len(len
) {}
5378 void finish(int r
) override
{
// Encode one extent covering the bytes actually read (r), then the data.
5386 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
5387 encode(extents
, outdata
);
5388 encode_destructively(*data_bl
, outdata
);
5389 data_bl
->swap(outdata
);
// Debug helper: concatenate the keys of a map<string, V> into one string
// (separator handling is in the elided portion of the loop body).
5393 template<typename V
>
5394 static string
list_keys(const map
<string
, V
>& m
) {
5396 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
5400 s
.append(itr
->first
);
// Debug helper: render the entries of an arbitrary container into one
// string by iterating it (per-entry formatting is elided in this extract).
5405 template<typename T
>
5406 static string
list_entries(const T
& m
) {
5408 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
// Ensure ctx->new_obs describes an existing, non-whiteout object before a
// write: create it (bumping num_objects and, unless ignore_transaction,
// recording the create in the transaction), or clear an existing whiteout
// flag and fix up the whiteout count.
5417 void PrimaryLogPG::maybe_create_new_object(
5419 bool ignore_transaction
)
5421 ObjectState
& obs
= ctx
->new_obs
;
5423 ctx
->delta_stats
.num_objects
++;
5425 ceph_assert(!obs
.oi
.is_whiteout());
5426 obs
.oi
.new_object();
5427 if (!ignore_transaction
)
5428 ctx
->op_t
->create(obs
.oi
.soid
);
5429 } else if (obs
.oi
.is_whiteout()) {
// Object exists as a whiteout: un-whiteout it instead of creating.
5430 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
5431 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
5432 --ctx
->delta_stats
.num_whiteouts
;
// OpFinisher recorded against a sub-op while its async read is in flight;
// execute() completes the read-dependent work once the data has arrived.
5436 struct ReadFinisher
: public PrimaryLogPG::OpFinisher
{
5439 explicit ReadFinisher(OSDOp
& osd_op
) : osd_op(osd_op
) {
5442 int execute() override
{
// Completion for an async CHECKSUM read: first runs FillInVerifyExtent to
// record/verify the read, then (on success) computes the checksum over the
// read data via PrimaryLogPG::finish_checksum, storing the result in
// osd_op.rval.  Owns fill_extent_ctx and deletes it in the destructor.
5447 struct C_ChecksumRead
: public Context
{
5448 PrimaryLogPG
*primary_log_pg
;
5450 Checksummer::CSumType csum_type
;
5451 bufferlist init_value_bl
;
5452 ceph_le64 read_length
;
5454 Context
*fill_extent_ctx
;
5456 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5457 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
5458 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5459 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5460 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5461 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
5462 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5463 &read_bl
, maybe_crc
, size
,
5464 osd
, soid
, flags
)) {
5466 ~C_ChecksumRead() override
{
5467 delete fill_extent_ctx
;
5470 void finish(int r
) override
{
// Run the extent-verify step first; it sets osd_op.rval.
5471 fill_extent_ctx
->complete(r
);
5472 fill_extent_ctx
= nullptr;
5474 if (osd_op
.rval
>= 0) {
5475 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5476 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
5477 &init_value_bl_it
, read_bl
);
// Handle CEPH_OSD_OP_CHECKSUM: validate the request (chunking, bounds),
// read the requested extent — asynchronously for EC pools, synchronously
// otherwise — and compute the requested checksum via finish_checksum().
// Returns -EINPROGRESS when the EC async-read path is taken.
// NOTE(review): several early-return statements are elided in this extract.
5482 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
5483 bufferlist::const_iterator
*bl_it
)
5485 dout(20) << __func__
<< dendl
;
// Chunked checksums require a non-zero, chunk-aligned length.
5487 auto& op
= osd_op
.op
;
5488 if (op
.checksum
.chunk_size
> 0) {
5489 if (op
.checksum
.length
== 0) {
5490 dout(10) << __func__
<< ": length required when chunk size provided"
5494 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5495 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
// Clamp the requested range to the object size (zeroed offset+length
// means "whole object").
5500 auto& oi
= ctx
->new_obs
.oi
;
5501 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
5502 // zeroed offset+length implies checksum whole object
5503 op
.checksum
.length
= oi
.size
;
5504 } else if (op
.checksum
.offset
>= oi
.size
) {
5505 // read size was trimmed to zero, do nothing
5506 // see PrimaryLogPG::do_read
5508 } else if (op
.extent
.offset
+ op
.extent
.length
> oi
.size
) {
5509 op
.extent
.length
= oi
.size
- op
.extent
.offset
;
5510 if (op
.checksum
.chunk_size
> 0 &&
5511 op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
5512 dout(10) << __func__
<< ": length (trimmed to 0x"
5513 << std::hex
<< op
.checksum
.length
5514 << ") not aligned to chunk size 0x"
5515 << op
.checksum
.chunk_size
<< std::dec
// Map the wire checksum type onto the internal Checksummer type.
5521 Checksummer::CSumType csum_type
;
5522 switch (op
.checksum
.type
) {
5523 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
5524 csum_type
= Checksummer::CSUM_XXHASH32
;
5526 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
5527 csum_type
= Checksummer::CSUM_XXHASH64
;
5529 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
5530 csum_type
= Checksummer::CSUM_CRC32C
;
5533 dout(10) << __func__
<< ": unknown crc type ("
5534 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
// The client must supply an init value of the type-specific size.
5538 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
5539 if (bl_it
->get_remaining() < csum_init_value_size
) {
5540 dout(10) << __func__
<< ": init value not provided" << dendl
;
5544 bufferlist init_value_bl
;
5545 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
5546 csum_init_value_size
);
5547 *bl_it
+= csum_init_value_size
;
// EC pools: queue an async read, attaching the stored data digest when
// the read may cover the whole object so the crc can be cross-checked.
5549 if (pool
.info
.is_erasure() && op
.checksum
.length
> 0) {
5550 // If there is a data digest and it is possible we are reading
5551 // entire object, pass the digest.
5552 std::optional
<uint32_t> maybe_crc
;
5553 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5554 op
.checksum
.length
>= oi
.size
) {
5555 maybe_crc
= oi
.data_digest
;
5559 auto& soid
= oi
.soid
;
5560 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
5561 std::move(init_value_bl
), maybe_crc
,
5562 oi
.size
, osd
, soid
, op
.flags
);
5564 ctx
->pending_async_reads
.push_back({
5565 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
5566 {&checksum_ctx
->read_bl
, checksum_ctx
}});
5568 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5569 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5570 new ReadFinisher(osd_op
));
5571 return -EINPROGRESS
;
// Replicated pools: do a synchronous read and checksum it inline.
5575 std::vector
<OSDOp
> read_ops(1);
5576 auto& read_op
= read_ops
[0];
5577 if (op
.checksum
.length
> 0) {
5578 read_op
.op
.op
= CEPH_OSD_OP_READ
;
5579 read_op
.op
.flags
= op
.flags
;
5580 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
5581 read_op
.op
.extent
.length
= op
.checksum
.length
;
5582 read_op
.op
.extent
.truncate_size
= 0;
5583 read_op
.op
.extent
.truncate_seq
= 0;
5585 int r
= do_osd_ops(ctx
, read_ops
);
5587 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
5592 bufferlist::const_iterator init_value_bl_it
= init_value_bl
.begin();
5593 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
// Compute the requested checksum(s) over read_bl and append the result
// (count + per-chunk values) to osd_op.outdata.  The init value is decoded
// from *init_value_bl_it per checksum type.
5597 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
5598 Checksummer::CSumType csum_type
,
5599 bufferlist::const_iterator
*init_value_bl_it
,
5600 const bufferlist
&read_bl
) {
5601 dout(20) << __func__
<< dendl
;
// Sanity: a short read of an explicit-length request is an error.
5603 auto& op
= osd_op
.op
;
5605 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
5606 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
5607 << op
.checksum
.length
<< dendl
;
// chunk_size == 0 means one checksum over the whole read.
5611 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
5612 op
.checksum
.chunk_size
: read_bl
.length());
5613 uint32_t csum_count
= (csum_chunk_size
> 0 ?
5614 read_bl
.length() / csum_chunk_size
: 0);
5617 bufferptr csum_data
;
5618 if (csum_count
> 0) {
5619 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
5620 csum_data
= ceph::buffer::create(csum_value_size
* csum_count
);
5622 csum
.append(csum_data
);
// Dispatch on checksum type; each branch decodes its own init value and
// runs the corresponding Checksummer::calculate instantiation.
5624 switch (csum_type
) {
5625 case Checksummer::CSUM_XXHASH32
:
5627 Checksummer::xxhash32::init_value_t init_value
;
5628 decode(init_value
, *init_value_bl_it
);
5629 Checksummer::calculate
<Checksummer::xxhash32
>(
5630 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5634 case Checksummer::CSUM_XXHASH64
:
5636 Checksummer::xxhash64::init_value_t init_value
;
5637 decode(init_value
, *init_value_bl_it
);
5638 Checksummer::calculate
<Checksummer::xxhash64
>(
5639 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
5643 case Checksummer::CSUM_CRC32C
:
5645 Checksummer::crc32c::init_value_t init_value
;
5646 decode(init_value
, *init_value_bl_it
);
5647 Checksummer::calculate
<Checksummer::crc32c
>(
5648 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
// Reply payload: checksum count followed by the packed checksum values.
5657 encode(csum_count
, osd_op
.outdata
);
5658 osd_op
.outdata
.claim_append(csum
);
// Completion for an async CMPEXT read: runs FillInVerifyExtent on the read
// result, then compares the data against the client payload via
// PrimaryLogPG::finish_extent_cmp, storing the verdict in osd_op.rval.
// Owns fill_extent_ctx (deleted in the destructor or in finish()).
5662 struct C_ExtentCmpRead
: public Context
{
5663 PrimaryLogPG
*primary_log_pg
;
5665 ceph_le64 read_length
{};
5667 Context
*fill_extent_ctx
;
5669 C_ExtentCmpRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
5670 std::optional
<uint32_t> maybe_crc
, uint64_t size
,
5671 OSDService
*osd
, hobject_t soid
, uint32_t flags
)
5672 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
5673 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
5674 &read_bl
, maybe_crc
, size
,
5675 osd
, soid
, flags
)) {
5677 ~C_ExtentCmpRead() override
{
5678 delete fill_extent_ctx
;
5681 void finish(int r
) override
{
5685 delete fill_extent_ctx
;
5687 fill_extent_ctx
->complete(r
);
5689 fill_extent_ctx
= nullptr;
5691 if (osd_op
.rval
>= 0) {
5692 osd_op
.rval
= primary_log_pg
->finish_extent_cmp(osd_op
, read_bl
);
// Handle CEPH_OSD_OP_CMPEXT: clamp the extent to the (possibly truncated)
// object size, read it — asynchronously for EC pools (-EINPROGRESS),
// synchronously otherwise — and compare against the client data via
// finish_extent_cmp().  A missing/whiteout object compares as all zeros.
5697 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
5699 dout(20) << __func__
<< dendl
;
5700 ceph_osd_op
& op
= osd_op
.op
;
// Honor pending truncates: the effective size may be truncate_size.
5702 auto& oi
= ctx
->new_obs
.oi
;
5703 uint64_t size
= oi
.size
;
5704 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5705 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
)) {
5706 size
= op
.extent
.truncate_size
;
5709 if (op
.extent
.offset
>= size
) {
5710 op
.extent
.length
= 0;
5711 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5712 op
.extent
.length
= size
- op
.extent
.offset
;
// Degenerate cases short-circuit straight to the comparison.
5715 if (op
.extent
.length
== 0) {
5716 dout(20) << __func__
<< " zero length extent" << dendl
;
5717 return finish_extent_cmp(osd_op
, bufferlist
{});
5718 } else if (!ctx
->obs
->exists
|| ctx
->obs
->oi
.is_whiteout()) {
5719 dout(20) << __func__
<< " object DNE" << dendl
;
5720 return finish_extent_cmp(osd_op
, {});
5721 } else if (pool
.info
.is_erasure()) {
5722 // If there is a data digest and it is possible we are reading
5723 // entire object, pass the digest.
5724 std::optional
<uint32_t> maybe_crc
;
5725 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
5726 op
.checksum
.length
>= oi
.size
) {
5727 maybe_crc
= oi
.data_digest
;
// EC pool: queue the read and defer comparison to C_ExtentCmpRead.
5731 auto& soid
= oi
.soid
;
5732 auto extent_cmp_ctx
= new C_ExtentCmpRead(this, osd_op
, maybe_crc
, oi
.size
,
5733 osd
, soid
, op
.flags
);
5734 ctx
->pending_async_reads
.push_back({
5735 {op
.extent
.offset
, op
.extent
.length
, op
.flags
},
5736 {&extent_cmp_ctx
->read_bl
, extent_cmp_ctx
}});
5738 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
5740 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5741 new ReadFinisher(osd_op
));
5742 return -EINPROGRESS
;
// Replicated pool: synchronous read of the same extent, then compare.
5746 vector
<OSDOp
> read_ops(1);
5747 OSDOp
& read_op
= read_ops
[0];
5749 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
5750 read_op
.op
.extent
.offset
= op
.extent
.offset
;
5751 read_op
.op
.extent
.length
= op
.extent
.length
;
5752 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
5753 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
5755 int result
= do_osd_ops(ctx
, read_ops
);
5757 derr
<< __func__
<< " failed " << result
<< dendl
;
5760 return finish_extent_cmp(osd_op
, read_op
.outdata
);
// Byte-compare the client payload (osd_op.indata) against the data read
// from the object; bytes beyond the read are treated as zero.  On the
// first mismatch return -MAX_ERRNO - idx so the client can recover the
// offending offset from the error code.
5763 int PrimaryLogPG::finish_extent_cmp(OSDOp
& osd_op
, const bufferlist
&read_bl
)
5765 for (uint64_t idx
= 0; idx
< osd_op
.indata
.length(); ++idx
) {
5766 char read_byte
= (idx
< read_bl
.length() ? read_bl
[idx
] : 0);
5767 if (osd_op
.indata
[idx
] != read_byte
) {
5768 return (-MAX_ERRNO
- idx
);
// Handle CEPH_OSD_OP_READ: clamp the extent for pending truncates and
// object size, then either queue an async read (EC pools, with optional
// full-object crc verification via FillInVerifyExtent) or read
// synchronously through the backend, verifying the stored data digest on
// full-object reads and scheduling repair on mismatch.
5775 int PrimaryLogPG::do_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5776 dout(20) << __func__
<< dendl
;
5777 auto& op
= osd_op
.op
;
5778 auto& oi
= ctx
->new_obs
.oi
;
5779 auto& soid
= oi
.soid
;
5780 __u32 seq
= oi
.truncate_seq
;
5781 uint64_t size
= oi
.size
;
5782 bool trimmed_read
= false;
5784 dout(30) << __func__
<< " oi.size: " << oi
.size
<< dendl
;
5785 dout(30) << __func__
<< " oi.truncate_seq: " << oi
.truncate_seq
<< dendl
;
5786 dout(30) << __func__
<< " op.extent.truncate_seq: " << op
.extent
.truncate_seq
<< dendl
;
5787 dout(30) << __func__
<< " op.extent.truncate_size: " << op
.extent
.truncate_size
<< dendl
;
5789 // are we beyond truncate_size?
5790 if ( (seq
< op
.extent
.truncate_seq
) &&
5791 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5792 (size
> op
.extent
.truncate_size
) )
5793 size
= op
.extent
.truncate_size
;
5795 if (op
.extent
.length
== 0) //length is zero mean read the whole object
5796 op
.extent
.length
= size
;
// Trim the request to the object's effective size; remember that we
// trimmed so a zero-length result is not re-issued as a real read.
5798 if (op
.extent
.offset
>= size
) {
5799 op
.extent
.length
= 0;
5800 trimmed_read
= true;
5801 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
5802 op
.extent
.length
= size
- op
.extent
.offset
;
5803 trimmed_read
= true;
5806 dout(30) << __func__
<< "op.extent.length is now " << op
.extent
.length
<< dendl
;
5808 // read into a buffer
5810 if (trimmed_read
&& op
.extent
.length
== 0) {
5811 // read size was trimmed to zero and it is expected to do nothing
5812 // a read operation of 0 bytes does *not* do nothing, this is why
5813 // the trimmed_read boolean is needed
5814 } else if (pool
.info
.is_erasure()) {
5815 // The initialisation below is required to silence a false positive
5816 // -Wmaybe-uninitialized warning
5817 std::optional
<uint32_t> maybe_crc
;
5818 // If there is a data digest and it is possible we are reading
5819 // entire object, pass the digest. FillInVerifyExtent will
5820 // will check the oi.size again.
5821 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
5822 op
.extent
.length
>= oi
.size
)
5823 maybe_crc
= oi
.data_digest
;
// EC pool: queue the async read; FillInVerifyExtent records the length
// and verifies the crc when the whole object was read.
5824 ctx
->pending_async_reads
.push_back(
5826 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
5827 make_pair(&osd_op
.outdata
,
5828 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
5829 &osd_op
.outdata
, maybe_crc
, oi
.size
,
5830 osd
, soid
, op
.flags
))));
5831 dout(10) << " async_read noted for " << soid
<< dendl
;
5833 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5834 new ReadFinisher(osd_op
));
// Replicated pool: synchronous backend read into osd_op.outdata.
5836 int r
= pgbackend
->objects_read_sync(
5837 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
5838 // whole object? can we verify the checksum?
5839 if (r
>= 0 && op
.extent
.offset
== 0 &&
5840 (uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5841 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
5842 if (oi
.data_digest
!= crc
) {
5843 osd
->clog
->error() << info
.pgid
<< std::hex
5844 << " full-object read crc 0x" << crc
5845 << " != expected 0x" << oi
.data_digest
5846 << std::dec
<< " on " << soid
;
5847 r
= -EIO
; // try repair later
5851 r
= rep_repair_primary_object(soid
, ctx
);
5854 op
.extent
.length
= r
;
5855 else if (r
== -EAGAIN
) {
5859 op
.extent
.length
= 0;
5861 dout(10) << " read got " << r
<< " / " << op
.extent
.length
5862 << " bytes from obj " << soid
<< dendl
;
// Account read stats (num_rd counts ops, num_rd_kb rounds up to KiB).
5865 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5866 ctx
->delta_stats
.num_rd
++;
// Handle CEPH_OSD_OP_SPARSE_READ: clamp the extent, then either translate
// to a normal async read for EC pools (re-encoded as a sparse result via
// ToSparseReadResult) or fiemap + readv synchronously, verifying the
// stored data digest on full-object reads and repairing on mismatch.
5871 int PrimaryLogPG::do_sparse_read(OpContext
*ctx
, OSDOp
& osd_op
) {
5872 dout(20) << __func__
<< dendl
;
5873 auto& op
= osd_op
.op
;
5874 auto& oi
= ctx
->new_obs
.oi
;
5875 auto& soid
= oi
.soid
;
5876 uint64_t size
= oi
.size
;
5877 uint64_t offset
= op
.extent
.offset
;
5878 uint64_t length
= op
.extent
.length
;
5880 // are we beyond truncate_size?
5881 if ((oi
.truncate_seq
< op
.extent
.truncate_seq
) &&
5882 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) &&
5883 (size
> op
.extent
.truncate_size
)) {
5884 size
= op
.extent
.truncate_size
;
// Clamp the requested range to the effective object size.
5887 if (offset
> size
) {
5889 } else if (offset
+ length
> size
) {
5890 length
= size
- offset
;
5894 if (pool
.info
.is_erasure()) {
5895 // translate sparse read to a normal one if not supported
// EC pool: queue a plain async read; ToSparseReadResult rewrites the
// result into sparse-read wire format (extent map + data).
5898 ctx
->pending_async_reads
.push_back(
5900 boost::make_tuple(offset
, length
, op
.flags
),
5903 new ToSparseReadResult(&osd_op
.rval
, &osd_op
.outdata
, offset
,
5904 &op
.extent
.length
))));
5905 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
5907 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
5908 new ReadFinisher(osd_op
));
// Empty clamped range: reply with an empty extent map.
5910 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
5911 map
<uint64_t, uint64_t> extents
;
5912 encode(extents
, osd_op
.outdata
);
5915 // read into a buffer
// Replicated pool: fiemap to find allocated extents, then readv them.
5916 map
<uint64_t, uint64_t> m
;
5917 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
5925 r
= pgbackend
->objects_readv_sync(soid
, std::move(m
), op
.flags
, &data_bl
);
5927 r
= rep_repair_primary_object(soid
, ctx
);
5933 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5934 // Maybe at first, there is no much whole objects. With continued use, more
5935 // and more whole object exist. So from this point, for spare-read add
5936 // checksum make sense.
5937 if ((uint64_t)r
== oi
.size
&& oi
.is_data_digest()) {
5938 uint32_t crc
= data_bl
.crc32c(-1);
5939 if (oi
.data_digest
!= crc
) {
5940 osd
->clog
->error() << info
.pgid
<< std::hex
5941 << " full-object read crc 0x" << crc
5942 << " != expected 0x" << oi
.data_digest
5943 << std::dec
<< " on " << soid
;
5944 r
= rep_repair_primary_object(soid
, ctx
);
5951 op
.extent
.length
= r
;
// Reply payload: (possibly trimmed) extent map followed by the data.
5953 encode(m
, osd_op
.outdata
); // re-encode since it might be modified
5954 ::encode_destructively(data_bl
, osd_op
.outdata
);
5956 dout(10) << " sparse_read got " << r
<< " bytes from object "
// Account read stats (num_rd counts ops, num_rd_kb rounds up to KiB).
5960 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(op
.extent
.length
, 10);
5961 ctx
->delta_stats
.num_rd
++;
5965 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
5968 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
5969 ObjectState
& obs
= ctx
->new_obs
;
5970 object_info_t
& oi
= obs
.oi
;
5971 const hobject_t
& soid
= oi
.soid
;
5972 const bool skip_data_digest
= osd
->store
->has_builtin_csum() &&
5973 osd
->osd_skip_data_digest
;
5975 PGTransaction
* t
= ctx
->op_t
.get();
5977 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
5979 ctx
->current_osd_subop_num
= 0;
5980 for (auto p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++, ctx
->processed_subop_count
++) {
5982 ceph_osd_op
& op
= osd_op
.op
;
5984 OpFinisher
* op_finisher
= nullptr;
5986 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
5987 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
5988 op_finisher
= op_finisher_it
->second
.get();
5992 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5993 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5994 // but the code in this function seems to treat them as native-endian. What should the
5996 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
5998 dout(10) << "do_osd_op " << osd_op
<< dendl
;
6000 auto bp
= osd_op
.indata
.cbegin();
6002 // user-visible modification?
6004 // non user-visible modifications
6005 case CEPH_OSD_OP_WATCH
:
6006 case CEPH_OSD_OP_CACHE_EVICT
:
6007 case CEPH_OSD_OP_CACHE_FLUSH
:
6008 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
6009 case CEPH_OSD_OP_UNDIRTY
:
6010 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
6011 case CEPH_OSD_OP_COPY_FROM2
:
6012 case CEPH_OSD_OP_CACHE_PIN
:
6013 case CEPH_OSD_OP_CACHE_UNPIN
:
6014 case CEPH_OSD_OP_SET_REDIRECT
:
6015 case CEPH_OSD_OP_SET_CHUNK
:
6016 case CEPH_OSD_OP_TIER_PROMOTE
:
6017 case CEPH_OSD_OP_TIER_FLUSH
:
6018 case CEPH_OSD_OP_TIER_EVICT
:
6021 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
6022 ctx
->user_modify
= true;
6025 // munge -1 truncate to 0 truncate
6026 if (ceph_osd_op_uses_extent(op
.op
) &&
6027 op
.extent
.truncate_seq
== 1 &&
6028 op
.extent
.truncate_size
== (-1ULL)) {
6029 op
.extent
.truncate_size
= 0;
6030 op
.extent
.truncate_seq
= 0;
6033 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
6034 if (op
.op
== CEPH_OSD_OP_ZERO
&&
6036 op
.extent
.offset
< static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
6037 op
.extent
.length
>= 1 &&
6038 op
.extent
.length
<= static_cast<Option::size_t>(osd
->osd_max_object_size
) &&
6039 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
6040 if (op
.extent
.offset
>= oi
.size
) {
6044 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
6045 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
6046 op
.op
= CEPH_OSD_OP_TRUNCATE
;
6053 case CEPH_OSD_OP_CMPEXT
:
6055 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(),
6056 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6057 op
.extent
.length
, op
.extent
.truncate_size
,
6058 op
.extent
.truncate_seq
);
6060 if (op_finisher
== nullptr) {
6061 result
= do_extent_cmp(ctx
, osd_op
);
6063 result
= op_finisher
->execute();
6067 case CEPH_OSD_OP_SYNC_READ
:
6068 if (pool
.info
.is_erasure()) {
6069 result
= -EOPNOTSUPP
;
6073 case CEPH_OSD_OP_READ
:
6075 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(),
6076 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6077 op
.extent
.length
, op
.extent
.truncate_size
,
6078 op
.extent
.truncate_seq
);
6079 if (op_finisher
== nullptr) {
6080 if (!ctx
->data_off
) {
6081 ctx
->data_off
= op
.extent
.offset
;
6083 result
= do_read(ctx
, osd_op
);
6085 result
= op_finisher
->execute();
6089 case CEPH_OSD_OP_CHECKSUM
:
6092 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
6093 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
6094 op
.checksum
.offset
, op
.checksum
.length
,
6095 op
.checksum
.chunk_size
);
6097 if (op_finisher
== nullptr) {
6098 result
= do_checksum(ctx
, osd_op
, &bp
);
6100 result
= op_finisher
->execute();
6106 case CEPH_OSD_OP_MAPEXT
:
6107 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6108 if (pool
.info
.is_erasure()) {
6109 result
= -EOPNOTSUPP
;
6114 // read into a buffer
6116 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
6118 op
.extent
.offset
, op
.extent
.length
, bl
);
6119 osd_op
.outdata
= std::move(bl
);
6123 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6124 ctx
->delta_stats
.num_rd
++;
6125 dout(10) << " map_extents done on object " << soid
<< dendl
;
6130 case CEPH_OSD_OP_SPARSE_READ
:
6131 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(),
6132 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
,
6133 op
.extent
.length
, op
.extent
.truncate_size
,
6134 op
.extent
.truncate_seq
);
6135 if (op_finisher
== nullptr) {
6136 result
= do_sparse_read(ctx
, osd_op
);
6138 result
= op_finisher
->execute();
6142 case CEPH_OSD_OP_CALL
:
6144 string cname
, mname
;
6147 bp
.copy(op
.cls
.class_len
, cname
);
6148 bp
.copy(op
.cls
.method_len
, mname
);
6149 bp
.copy(op
.cls
.indata_len
, indata
);
6150 } catch (ceph::buffer::error
& e
) {
6151 dout(10) << "call unable to decode class + method + indata" << dendl
;
6152 dout(30) << "in dump: ";
6153 osd_op
.indata
.hexdump(*_dout
);
6156 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
6159 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
6161 ClassHandler::ClassData
*cls
;
6162 result
= ClassHandler::get_instance().open_class(cname
, &cls
);
6163 ceph_assert(result
== 0); // init_op_flags() already verified this works.
6165 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
);
6167 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
6168 result
= -EOPNOTSUPP
;
6172 int flags
= method
->get_flags();
6173 if (flags
& CLS_METHOD_WR
)
6174 ctx
->user_modify
= true;
6177 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
6178 int prev_rd
= ctx
->num_read
;
6179 int prev_wr
= ctx
->num_write
;
6180 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
6182 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
6183 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
6187 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
6188 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
6193 dout(10) << "method called response length=" << outdata
.length() << dendl
;
6194 op
.extent
.length
= outdata
.length();
6195 osd_op
.outdata
.claim_append(outdata
);
6196 dout(30) << "out dump: ";
6197 osd_op
.outdata
.hexdump(*_dout
);
6202 case CEPH_OSD_OP_STAT
:
6203 // note: stat does not require RD
6205 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6207 if (obs
.exists
&& !oi
.is_whiteout()) {
6208 encode(oi
.size
, osd_op
.outdata
);
6209 encode(oi
.mtime
, osd_op
.outdata
);
6210 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
6213 dout(10) << "stat oi object does not exist" << dendl
;
6216 ctx
->delta_stats
.num_rd
++;
6220 case CEPH_OSD_OP_ISDIRTY
:
6223 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6224 bool is_dirty
= obs
.oi
.is_dirty();
6225 encode(is_dirty
, osd_op
.outdata
);
6226 ctx
->delta_stats
.num_rd
++;
6231 case CEPH_OSD_OP_UNDIRTY
:
6235 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6236 if (oi
.is_dirty()) {
6237 ctx
->undirty
= true; // see make_writeable()
6239 ctx
->delta_stats
.num_wr
++;
6244 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
6248 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6249 if (ctx
->lock_type
!= RWState::RWNONE
) {
6250 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
6254 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6262 if (oi
.is_cache_pinned()) {
6263 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
6267 if (oi
.is_dirty()) {
6268 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, std::nullopt
);
6269 if (result
== -EINPROGRESS
)
6277 case CEPH_OSD_OP_CACHE_FLUSH
:
6281 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6282 if (ctx
->lock_type
== RWState::RWNONE
) {
6283 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
6287 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6295 if (oi
.is_cache_pinned()) {
6296 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
6301 if (oi
.is_dirty()) {
6302 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, std::nullopt
);
6303 if (result
== -EINPROGRESS
)
6308 // Check special return value which has set missing_return
6309 if (result
== -ENOENT
) {
6310 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
6311 ceph_assert(!missing
.is_min());
6312 wait_for_unreadable_object(missing
, ctx
->op
);
6313 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6319 case CEPH_OSD_OP_CACHE_EVICT
:
6323 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6324 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
|| obs
.oi
.has_manifest()) {
6332 if (oi
.is_cache_pinned()) {
6333 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
6337 if (oi
.is_dirty()) {
6341 if (!oi
.watchers
.empty()) {
6345 if (soid
.snap
== CEPH_NOSNAP
) {
6346 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
6350 result
= _delete_oid(ctx
, true, false);
6352 // mark that this is a cache eviction to avoid triggering normal
6353 // make_writeable() clone creation in finish_ctx()
6354 ctx
->cache_operation
= true;
6356 osd
->logger
->inc(l_osd_tier_evict
);
6360 case CEPH_OSD_OP_GETXATTR
:
6364 bp
.copy(op
.xattr
.name_len
, aname
);
6365 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6366 string name
= "_" + aname
;
6367 int r
= getattr_maybe_cache(
6372 op
.xattr
.value_len
= osd_op
.outdata
.length();
6374 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
6378 ctx
->delta_stats
.num_rd
++;
6382 case CEPH_OSD_OP_GETXATTRS
:
6385 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6386 map
<string
, bufferlist
,less
<>> out
;
6387 result
= getattrs_maybe_cache(
6393 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(bl
.length(), 10);
6394 ctx
->delta_stats
.num_rd
++;
6395 osd_op
.outdata
.claim_append(bl
);
6399 case CEPH_OSD_OP_CMPXATTR
:
6403 bp
.copy(op
.xattr
.name_len
, aname
);
6404 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6405 string name
= "_" + aname
;
6406 name
[op
.xattr
.name_len
+ 1] = 0;
6409 result
= getattr_maybe_cache(
6413 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
6416 ctx
->delta_stats
.num_rd
++;
6417 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(xattr
.length(), 10);
6419 switch (op
.xattr
.cmp_mode
) {
6420 case CEPH_OSD_CMPXATTR_MODE_STRING
:
6423 bp
.copy(op
.xattr
.value_len
, val
);
6424 val
[op
.xattr
.value_len
] = 0;
6425 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
6426 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6427 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
6431 case CEPH_OSD_CMPXATTR_MODE_U64
:
6437 catch (ceph::buffer::error
& e
) {
6441 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
6442 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
6443 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
6448 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
6453 dout(10) << "comparison returned false" << dendl
;
6454 result
= -ECANCELED
;
6458 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
6462 dout(10) << "comparison returned true" << dendl
;
6466 case CEPH_OSD_OP_ASSERT_VER
:
6469 uint64_t ver
= op
.assert_ver
.ver
;
6470 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
6473 } else if (ver
< oi
.user_version
) {
6475 } else if (ver
> oi
.user_version
) {
6476 result
= -EOVERFLOW
;
6481 case CEPH_OSD_OP_LIST_WATCHERS
:
6484 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6485 obj_list_watch_response_t resp
;
6487 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
6488 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
6490 dout(20) << "key cookie=" << oi_iter
->first
.first
6491 << " entity=" << oi_iter
->first
.second
<< " "
6492 << oi_iter
->second
<< dendl
;
6493 ceph_assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
6494 ceph_assert(oi_iter
->first
.second
.is_client());
6496 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
6497 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
6498 resp
.entries
.push_back(wi
);
6501 resp
.encode(osd_op
.outdata
, ctx
->get_features());
6504 ctx
->delta_stats
.num_rd
++;
6508 case CEPH_OSD_OP_LIST_SNAPS
:
6511 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6512 obj_list_snap_response_t resp
;
6515 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
6518 dout(20) << " snapset " << ssc
->snapset
<< dendl
;
6520 int clonecount
= ssc
->snapset
.clones
.size();
6521 clonecount
++; // for head
6522 resp
.clones
.reserve(clonecount
);
6523 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
6524 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
6526 ci
.cloneid
= *clone_iter
;
6528 hobject_t clone_oid
= soid
;
6529 clone_oid
.snap
= *clone_iter
;
6531 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
6532 if (p
== ssc
->snapset
.clone_snaps
.end()) {
6533 osd
->clog
->error() << "osd." << osd
->whoami
6534 << ": inconsistent clone_snaps found for oid "
6535 << soid
<< " clone " << *clone_iter
6536 << " snapset " << ssc
->snapset
;
6540 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
6541 ci
.snaps
.push_back(*q
);
6544 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
6546 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
6547 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
6548 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
6549 osd
->clog
->error() << "osd." << osd
->whoami
6550 << ": inconsistent clone_overlap found for oid "
6551 << soid
<< " clone " << *clone_iter
;
6555 const interval_set
<uint64_t> &o
= coi
->second
;
6556 ci
.overlap
.reserve(o
.num_intervals());
6557 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
6558 r
!= o
.end(); ++r
) {
6559 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
6563 map
<snapid_t
, uint64_t>::const_iterator si
;
6564 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
6565 if (si
== ssc
->snapset
.clone_size
.end()) {
6566 osd
->clog
->error() << "osd." << osd
->whoami
6567 << ": inconsistent clone_size found for oid "
6568 << soid
<< " clone " << *clone_iter
;
6572 ci
.size
= si
->second
;
6574 resp
.clones
.push_back(ci
);
6579 if (!ctx
->obc
->obs
.oi
.is_whiteout()) {
6580 ceph_assert(obs
.exists
);
6582 ci
.cloneid
= CEPH_NOSNAP
;
6584 //Size for HEAD is oi.size
6587 resp
.clones
.push_back(ci
);
6589 resp
.seq
= ssc
->snapset
.seq
;
6591 resp
.encode(osd_op
.outdata
);
6594 ctx
->delta_stats
.num_rd
++;
6598 case CEPH_OSD_OP_NOTIFY
:
6605 uint32_t ver
; // obsolete
6607 decode(timeout
, bp
);
6609 } catch (const ceph::buffer::error
&e
) {
6612 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
6614 timeout
= cct
->_conf
->osd_default_notify_timeout
;
6617 n
.timeout
= timeout
;
6618 n
.notify_id
= osd
->get_next_id(get_osdmap_epoch());
6619 n
.cookie
= op
.notify
.cookie
;
6621 ctx
->notifies
.push_back(n
);
6623 // return our unique notify id to the client
6624 encode(n
.notify_id
, osd_op
.outdata
);
6628 case CEPH_OSD_OP_NOTIFY_ACK
:
6632 uint64_t notify_id
= 0;
6633 uint64_t watch_cookie
= 0;
6634 decode(notify_id
, bp
);
6635 decode(watch_cookie
, bp
);
6636 bufferlist reply_bl
;
6638 decode(reply_bl
, bp
);
6640 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
6641 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
6642 ctx
->notify_acks
.push_back(ack
);
6643 } catch (const ceph::buffer::error
&e
) {
6644 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
6645 OpContext::NotifyAck
ack(
6646 // op.watch.cookie is actually the notify_id for historical reasons
6649 ctx
->notify_acks
.push_back(ack
);
6654 case CEPH_OSD_OP_SETALLOCHINT
:
6658 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
6659 maybe_create_new_object(ctx
);
6660 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
6661 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
6662 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
6663 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
6664 op
.alloc_hint
.expected_write_size
,
6665 op
.alloc_hint
.flags
);
6672 // -- object data --
6674 case CEPH_OSD_OP_WRITE
:
6678 __u32 seq
= oi
.truncate_seq
;
6679 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6680 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6685 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6686 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6688 if (pool
.info
.requires_aligned_append() &&
6689 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
6690 result
= -EOPNOTSUPP
;
6695 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
6696 result
= -EOPNOTSUPP
;
6699 } else if (op
.extent
.offset
!= oi
.size
&&
6700 pool
.info
.requires_aligned_append()) {
6701 result
= -EOPNOTSUPP
;
6705 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
6706 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
6707 // old write, arrived after trimtrunc
6708 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
6709 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
6710 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
6712 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
6713 osd_op
.indata
.swap(t
);
6715 if (op
.extent
.truncate_seq
> seq
) {
6716 // write arrives before trimtrunc
6717 if (obs
.exists
&& !oi
.is_whiteout()) {
6718 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6719 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
6720 t
->truncate(soid
, op
.extent
.truncate_size
);
6721 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6722 oi
.truncate_size
= op
.extent
.truncate_size
;
6723 if (oi
.size
> op
.extent
.truncate_size
) {
6724 interval_set
<uint64_t> trim
;
6725 trim
.insert(op
.extent
.truncate_size
,
6726 oi
.size
- op
.extent
.truncate_size
);
6727 ctx
->modified_ranges
.union_of(trim
);
6728 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.truncate_size
, oi
.size
- op
.extent
.truncate_size
);
6729 oi
.clear_data_digest();
6731 if (op
.extent
.truncate_size
!= oi
.size
) {
6732 truncate_update_size_and_usage(ctx
->delta_stats
,
6734 op
.extent
.truncate_size
);
6737 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
6738 << ", but object is new" << dendl
;
6739 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6740 oi
.truncate_size
= op
.extent
.truncate_size
;
6743 result
= check_offset_and_length(
6744 op
.extent
.offset
, op
.extent
.length
,
6745 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6749 maybe_create_new_object(ctx
);
6751 if (op
.extent
.length
== 0) {
6752 if (op
.extent
.offset
> oi
.size
) {
6754 soid
, op
.extent
.offset
);
6755 truncate_update_size_and_usage(ctx
->delta_stats
, oi
,
6762 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6765 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
6766 && !skip_data_digest
) {
6767 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6768 } else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest()) {
6769 if (skip_data_digest
) {
6770 obs
.oi
.clear_data_digest();
6772 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
6775 obs
.oi
.clear_data_digest();
6777 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6778 op
.extent
.offset
, op
.extent
.length
);
6779 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6780 dout(10) << "clean_regions modified" << ctx
->clean_regions
<< dendl
;
6784 case CEPH_OSD_OP_WRITEFULL
:
6787 { // write full object
6788 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
6790 if (op
.extent
.length
!= osd_op
.indata
.length()) {
6794 result
= check_offset_and_length(
6795 0, op
.extent
.length
,
6796 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6800 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
6801 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
6803 maybe_create_new_object(ctx
);
6804 if (pool
.info
.is_erasure()) {
6805 t
->truncate(soid
, 0);
6806 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
6807 t
->truncate(soid
, op
.extent
.length
);
6809 if (op
.extent
.length
) {
6810 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
6812 if (!skip_data_digest
) {
6813 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
6815 obs
.oi
.clear_data_digest();
6817 ctx
->clean_regions
.mark_data_region_dirty(0,
6818 std::max((uint64_t)op
.extent
.length
, oi
.size
));
6819 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
6820 0, op
.extent
.length
, true);
6824 case CEPH_OSD_OP_WRITESAME
:
6826 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
6827 result
= do_writesame(ctx
, osd_op
);
6830 case CEPH_OSD_OP_ROLLBACK
:
6832 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6833 result
= _rollback_to(ctx
, osd_op
);
6836 case CEPH_OSD_OP_ZERO
:
6837 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
6838 if (pool
.info
.requires_aligned_append()) {
6839 result
= -EOPNOTSUPP
;
6844 result
= check_offset_and_length(
6845 op
.extent
.offset
, op
.extent
.length
,
6846 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6850 if (op
.extent
.length
&& obs
.exists
&& !oi
.is_whiteout()) {
6851 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
6852 interval_set
<uint64_t> ch
;
6853 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
6854 ctx
->modified_ranges
.union_of(ch
);
6855 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, op
.extent
.length
);
6856 ctx
->delta_stats
.num_wr
++;
6857 oi
.clear_data_digest();
6863 case CEPH_OSD_OP_CREATE
:
6867 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6868 if (obs
.exists
&& !oi
.is_whiteout() &&
6869 (op
.flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
6870 result
= -EEXIST
; /* this is an exclusive create */
6872 if (osd_op
.indata
.length()) {
6873 auto p
= osd_op
.indata
.cbegin();
6876 decode(category
, p
);
6878 catch (ceph::buffer::error
& e
) {
6882 // category is no longer implemented.
6884 maybe_create_new_object(ctx
);
6890 case CEPH_OSD_OP_TRIMTRUNC
:
6891 op
.extent
.offset
= op
.extent
.truncate_size
;
6894 case CEPH_OSD_OP_TRUNCATE
:
6895 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6896 if (pool
.info
.requires_aligned_append()) {
6897 result
= -EOPNOTSUPP
;
6904 if (!obs
.exists
|| oi
.is_whiteout()) {
6905 dout(10) << " object dne, truncate is a no-op" << dendl
;
6909 result
= check_offset_and_length(
6910 op
.extent
.offset
, op
.extent
.length
,
6911 static_cast<Option::size_t>(osd
->osd_max_object_size
), get_dpp());
6915 if (op
.extent
.truncate_seq
) {
6916 ceph_assert(op
.extent
.offset
== op
.extent
.truncate_size
);
6917 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
6918 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
6919 << ", no-op" << dendl
;
6922 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
6923 << ", truncating" << dendl
;
6924 oi
.truncate_seq
= op
.extent
.truncate_seq
;
6925 oi
.truncate_size
= op
.extent
.truncate_size
;
6928 maybe_create_new_object(ctx
);
6929 t
->truncate(soid
, op
.extent
.offset
);
6930 if (oi
.size
> op
.extent
.offset
) {
6931 interval_set
<uint64_t> trim
;
6932 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
6933 ctx
->modified_ranges
.union_of(trim
);
6934 ctx
->clean_regions
.mark_data_region_dirty(op
.extent
.offset
, oi
.size
- op
.extent
.offset
);
6935 } else if (oi
.size
< op
.extent
.offset
) {
6936 ctx
->clean_regions
.mark_data_region_dirty(oi
.size
, op
.extent
.offset
- oi
.size
);
6938 if (op
.extent
.offset
!= oi
.size
) {
6939 truncate_update_size_and_usage(ctx
->delta_stats
,
6943 ctx
->delta_stats
.num_wr
++;
6944 // do not set exists, or we will break above DELETE -> TRUNCATE munging.
6946 oi
.clear_data_digest();
6950 case CEPH_OSD_OP_DELETE
:
6953 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6955 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
6959 case CEPH_OSD_OP_WATCH
:
6963 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
6964 op
.watch
.cookie
, op
.watch
.op
);
6970 uint64_t cookie
= op
.watch
.cookie
;
6971 entity_name_t entity
= ctx
->reqid
.name
;
6972 ObjectContextRef obc
= ctx
->obc
;
6974 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
6975 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
6976 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
6977 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
6978 dout(10) << "watch: peer_addr="
6979 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
6981 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
6982 if (op
.watch
.timeout
!= 0) {
6983 timeout
= op
.watch
.timeout
;
6986 watch_info_t
w(cookie
, timeout
,
6987 ctx
->op
->get_req()->get_connection()->get_peer_addr());
6988 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
6989 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
6990 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
6991 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
6993 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
6994 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
6995 t
->nop(soid
); // make sure update the object_info on disk!
6997 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
6998 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
6999 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
7000 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
7004 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
7005 ctx
->watch_connects
.push_back(make_pair(w
, true));
7006 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
7007 /* Note: WATCH with PING doesn't cause may_write() to return true,
7008 * so if there is nothing else in the transaction, this is going
7009 * to run do_osd_op_effects, but not write out a log entry */
7010 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
7014 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
7015 obc
->watchers
.find(make_pair(cookie
, entity
));
7016 if (p
== obc
->watchers
.end() ||
7017 !p
->second
->is_connected()) {
7018 // client needs to reconnect
7019 result
= -ETIMEDOUT
;
7022 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
7023 p
->second
->got_ping(ceph_clock_now());
7025 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
7026 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
7027 oi
.watchers
.find(make_pair(cookie
, entity
));
7028 if (oi_iter
!= oi
.watchers
.end()) {
7029 dout(10) << " removed watch " << oi_iter
->second
<< " by "
7031 oi
.watchers
.erase(oi_iter
);
7032 t
->nop(soid
); // update oi on disk
7033 ctx
->watch_disconnects
.push_back(
7034 watch_disconnect_t(cookie
, entity
, false));
7036 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
7042 case CEPH_OSD_OP_CACHE_PIN
:
7043 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7044 if ((!pool
.info
.is_tier() ||
7045 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
7047 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
7053 if (!obs
.exists
|| oi
.is_whiteout()) {
7058 if (!oi
.is_cache_pinned()) {
7059 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
7061 ctx
->delta_stats
.num_objects_pinned
++;
7062 ctx
->delta_stats
.num_wr
++;
7067 case CEPH_OSD_OP_CACHE_UNPIN
:
7068 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7069 if ((!pool
.info
.is_tier() ||
7070 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
7072 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
7078 if (!obs
.exists
|| oi
.is_whiteout()) {
7083 if (oi
.is_cache_pinned()) {
7084 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
7086 ctx
->delta_stats
.num_objects_pinned
--;
7087 ctx
->delta_stats
.num_wr
++;
7092 case CEPH_OSD_OP_SET_REDIRECT
:
7096 if (pool
.info
.is_tier()) {
7104 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7105 result
= -EOPNOTSUPP
;
7109 object_t target_name
;
7110 object_locator_t target_oloc
;
7111 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
7112 version_t target_version
= op
.copy_from
.src_version
;
7114 decode(target_name
, bp
);
7115 decode(target_oloc
, bp
);
7117 catch (ceph::buffer::error
& e
) {
7122 result
= get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
7124 dout(5) << " pool information is invalid: " << result
<< dendl
;
7127 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
7128 raw_pg
.ps(), raw_pg
.pool(),
7129 target_oloc
.nspace
);
7130 if (target
== soid
) {
7131 dout(20) << " set-redirect self is invalid" << dendl
;
7136 bool need_reference
= (osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
);
7137 bool has_reference
= (oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
7138 if (has_reference
) {
7140 dout(5) << " the object is already a manifest " << dendl
;
7143 if (op_finisher
== nullptr && need_reference
) {
7145 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7146 new SetManifestFinisher(osd_op
));
7147 ManifestOpRef mop
= std::make_shared
<ManifestOp
>(ctx
->obc
, new RefCountCallback(ctx
, osd_op
));
7148 auto* fin
= new C_SetManifestRefCountDone(this, soid
, 0);
7149 ceph_tid_t tid
= refcount_manifest(soid
, target
,
7150 refcount_t::INCREMENT_REF
, fin
, std::nullopt
);
7154 manifest_ops
[soid
] = mop
;
7155 ctx
->obc
->start_block();
7156 result
= -EINPROGRESS
;
7160 result
= op_finisher
->execute();
7161 ceph_assert(result
== 0);
7164 if (!oi
.has_manifest() && !oi
.manifest
.is_redirect())
7165 ctx
->delta_stats
.num_objects_manifest
++;
7167 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7168 oi
.manifest
.redirect_target
= target
;
7169 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
7170 t
->truncate(soid
, 0);
7171 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
7172 if (oi
.is_omap() && pool
.info
.supports_omap()) {
7173 t
->omap_clear(soid
);
7174 obs
.oi
.clear_omap_digest();
7175 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7176 ctx
->clean_regions
.mark_omap_dirty();
7178 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
7180 ctx
->delta_stats
.num_bytes
-= oi
.size
;
7183 oi
.user_version
= target_version
;
7184 ctx
->user_at_version
= target_version
;
7186 map
<string
,bufferlist
,less
<>> rmattrs
;
7187 result
= getattrs_maybe_cache(ctx
->obc
, &rmattrs
);
7189 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
7192 map
<string
, bufferlist
>::iterator iter
;
7193 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
7194 const string
& name
= iter
->first
;
7195 t
->rmattr(soid
, name
);
7197 if (!has_reference
&& need_reference
) {
7198 oi
.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
7200 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
7202 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7209 case CEPH_OSD_OP_SET_CHUNK
:
7213 if (pool
.info
.is_tier()) {
7221 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7222 result
= -EOPNOTSUPP
;
7225 if (oi
.manifest
.is_redirect()) {
7230 object_locator_t tgt_oloc
;
7231 uint64_t src_offset
, src_length
, tgt_offset
;
7234 decode(src_offset
, bp
);
7235 decode(src_length
, bp
);
7236 decode(tgt_oloc
, bp
);
7237 decode(tgt_name
, bp
);
7238 decode(tgt_offset
, bp
);
7240 catch (ceph::buffer::error
& e
) {
7249 if (src_offset
+ src_length
> oi
.size
) {
7253 if (!(osd_op
.op
.flags
& CEPH_OSD_OP_FLAG_WITH_REFERENCE
)) {
7254 result
= -EOPNOTSUPP
;
7257 if (pool
.info
.is_erasure()) {
7258 result
= -EOPNOTSUPP
;
7262 for (auto &p
: oi
.manifest
.chunk_map
) {
7263 interval_set
<uint64_t> chunk
;
7264 chunk
.insert(p
.first
, p
.second
.length
);
7265 if (chunk
.intersects(src_offset
, src_length
)) {
7266 dout(20) << __func__
<< " overlapped !! offset: " << src_offset
<< " length: " << src_length
7267 << " chunk_info: " << p
<< dendl
;
7268 result
= -EOPNOTSUPP
;
7274 chunk_info_t chunk_info
;
7275 result
= get_osdmap()->object_locator_to_pg(tgt_name
, tgt_oloc
, raw_pg
);
7277 dout(5) << " pool information is invalid: " << result
<< dendl
;
7280 hobject_t
target(tgt_name
, tgt_oloc
.key
, snapid_t(),
7281 raw_pg
.ps(), raw_pg
.pool(),
7283 bool has_reference
= (oi
.manifest
.chunk_map
.find(src_offset
) != oi
.manifest
.chunk_map
.end()) &&
7284 (oi
.manifest
.chunk_map
[src_offset
].test_flag(chunk_info_t::FLAG_HAS_REFERENCE
));
7285 if (has_reference
) {
7287 dout(5) << " the object is already a manifest " << dendl
;
7290 chunk_info
.oid
= target
;
7291 chunk_info
.offset
= tgt_offset
;
7292 chunk_info
.length
= src_length
;
7293 if (op_finisher
== nullptr) {
7295 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7296 new SetManifestFinisher(osd_op
));
7297 object_manifest_t set_chunk
;
7298 bool need_inc_ref
= false;
7299 set_chunk
.chunk_map
[src_offset
] = chunk_info
;
7300 need_inc_ref
= inc_refcount_by_set(ctx
, set_chunk
, osd_op
);
7302 result
= -EINPROGRESS
;
7307 result
= op_finisher
->execute();
7308 ceph_assert(result
== 0);
7311 oi
.manifest
.chunk_map
[src_offset
] = chunk_info
;
7312 if (!oi
.has_manifest() && !oi
.manifest
.is_chunked())
7313 ctx
->delta_stats
.num_objects_manifest
++;
7314 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
7315 oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
7316 if (!has_reference
) {
7317 oi
.manifest
.chunk_map
[src_offset
].set_flag(chunk_info_t::FLAG_HAS_REFERENCE
);
7320 ctx
->cache_operation
= true;
7322 dout(10) << "set-chunked oid:" << oi
.soid
<< " user_version: " << oi
.user_version
7323 << " chunk_info: " << chunk_info
<< dendl
;
7325 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7331 case CEPH_OSD_OP_TIER_PROMOTE
:
7335 if (pool
.info
.is_tier()) {
7343 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7344 result
= -EOPNOTSUPP
;
7347 if (!obs
.oi
.has_manifest()) {
7352 if (op_finisher
== nullptr) {
7353 PromoteManifestCallback
*cb
;
7354 object_locator_t my_oloc
;
7357 if (obs
.oi
.manifest
.is_chunked()) {
7358 src_hoid
= obs
.oi
.soid
;
7359 } else if (obs
.oi
.manifest
.is_redirect()) {
7360 object_locator_t
src_oloc(obs
.oi
.manifest
.redirect_target
);
7362 src_hoid
= obs
.oi
.manifest
.redirect_target
;
7364 ceph_abort_msg("unrecognized manifest type");
7366 cb
= new PromoteManifestCallback(ctx
->obc
, this, ctx
);
7367 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
7368 new PromoteFinisher(cb
));
7369 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
7370 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
7371 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
7372 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
7373 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
7374 start_copy(cb
, ctx
->obc
, src_hoid
, my_oloc
, 0, flags
,
7375 obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
7376 src_fadvise_flags
, 0);
7378 dout(10) << "tier-promote oid:" << oi
.soid
<< " manifest: " << obs
.oi
.manifest
<< dendl
;
7379 result
= -EINPROGRESS
;
7381 result
= op_finisher
->execute();
7382 ceph_assert(result
== 0);
7383 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
7389 case CEPH_OSD_OP_TIER_FLUSH
:
7393 if (pool
.info
.is_tier()) {
7401 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7402 result
= -EOPNOTSUPP
;
7406 if (oi
.is_dirty() || !obs
.oi
.has_manifest()) {
7407 result
= start_flush(ctx
->op
, ctx
->obc
, true, NULL
, std::nullopt
, true);
7408 if (result
== -EINPROGRESS
)
7417 case CEPH_OSD_OP_TIER_EVICT
:
7421 if (pool
.info
.is_tier()) {
7429 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
7430 result
= -EOPNOTSUPP
;
7433 if (!obs
.oi
.has_manifest()) {
7438 // The chunks already has a reference, so it is just enough to invoke truncate if necessary
7439 for (auto &p
: obs
.oi
.manifest
.chunk_map
) {
7440 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
7442 t
->zero(soid
, p
.first
, p
.second
.length
);
7443 interval_set
<uint64_t> ch
;
7444 ch
.insert(p
.first
, p
.second
.length
);
7445 ctx
->modified_ranges
.union_of(ch
);
7446 ctx
->clean_regions
.mark_data_region_dirty(p
.first
, p
.second
.length
);
7448 oi
.clear_data_digest();
7449 ctx
->delta_stats
.num_wr
++;
7450 ctx
->cache_operation
= true;
7451 ctx
->undirty
= true;
7452 osd
->logger
->inc(l_osd_tier_evict
);
7457 case CEPH_OSD_OP_UNSET_MANIFEST
:
7461 if (pool
.info
.is_tier()) {
7469 if (!oi
.has_manifest()) {
7470 result
= -EOPNOTSUPP
;
7473 if (get_osdmap()->require_osd_release
< ceph_release_t::luminous
) {
7474 result
= -EOPNOTSUPP
;
7478 dec_all_refcount_manifest(oi
, ctx
);
7480 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
7481 oi
.manifest
= object_manifest_t();
7482 ctx
->delta_stats
.num_objects_manifest
--;
7483 ctx
->delta_stats
.num_wr
++;
7489 // -- object attrs --
7491 case CEPH_OSD_OP_SETXATTR
:
7495 if (cct
->_conf
->osd_max_attr_size
> 0 &&
7496 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
7497 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7501 unsigned max_name_len
=
7502 std::min
<uint64_t>(osd
->store
->get_max_attr_name_length(),
7503 cct
->_conf
->osd_max_attr_name_len
);
7504 if (op
.xattr
.name_len
> max_name_len
) {
7505 result
= -ENAMETOOLONG
;
7508 maybe_create_new_object(ctx
);
7510 bp
.copy(op
.xattr
.name_len
, aname
);
7511 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7512 string name
= "_" + aname
;
7514 bp
.copy(op
.xattr
.value_len
, bl
);
7515 t
->setattr(soid
, name
, bl
);
7516 ctx
->delta_stats
.num_wr
++;
7520 case CEPH_OSD_OP_RMXATTR
:
7525 bp
.copy(op
.xattr
.name_len
, aname
);
7526 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
7527 if (!obs
.exists
|| oi
.is_whiteout()) {
7531 string name
= "_" + aname
;
7532 t
->rmattr(soid
, name
);
7533 ctx
->delta_stats
.num_wr
++;
7538 // -- fancy writers --
7539 case CEPH_OSD_OP_APPEND
:
7541 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
7542 // just do it inline; this works because we are happy to execute
7543 // fancy op on replicas as well.
7544 vector
<OSDOp
> nops(1);
7545 OSDOp
& newop
= nops
[0];
7546 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
7547 newop
.op
.extent
.offset
= oi
.size
;
7548 newop
.op
.extent
.length
= op
.extent
.length
;
7549 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
7550 newop
.indata
= osd_op
.indata
;
7551 result
= do_osd_ops(ctx
, nops
);
7552 osd_op
.outdata
= std::move(newop
.outdata
);
7556 case CEPH_OSD_OP_STARTSYNC
:
7561 // -- trivial map --
7562 case CEPH_OSD_OP_TMAPGET
:
7563 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7564 if (pool
.info
.is_erasure()) {
7565 result
= -EOPNOTSUPP
;
7569 vector
<OSDOp
> nops(1);
7570 OSDOp
& newop
= nops
[0];
7571 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
7572 newop
.op
.extent
.offset
= 0;
7573 newop
.op
.extent
.length
= 0;
7574 result
= do_osd_ops(ctx
, nops
);
7575 osd_op
.outdata
= std::move(newop
.outdata
);
7579 case CEPH_OSD_OP_TMAPPUT
:
7580 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7581 if (pool
.info
.is_erasure()) {
7582 result
= -EOPNOTSUPP
;
7586 //_dout_lock.Lock();
7587 //osd_op.data.hexdump(*_dout);
7588 //_dout_lock.Unlock();
7590 // verify sort order
7591 bool unsorted
= false;
7601 dout(10) << "tmapput key " << key
<< dendl
;
7604 if (key
< last_key
) {
7605 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
7614 vector
<OSDOp
> nops(1);
7615 OSDOp
& newop
= nops
[0];
7616 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
7617 newop
.op
.extent
.offset
= 0;
7618 newop
.op
.extent
.length
= osd_op
.indata
.length();
7619 newop
.indata
= osd_op
.indata
;
7622 bp
= osd_op
.indata
.begin();
7624 map
<string
, bufferlist
> m
;
7627 ceph_assert(bp
.end());
7629 encode(header
, newbl
);
7631 newop
.indata
= newbl
;
7633 result
= do_osd_ops(ctx
, nops
);
7634 ceph_assert(result
== 0);
7638 case CEPH_OSD_OP_TMAPUP
:
7639 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7640 if (pool
.info
.is_erasure()) {
7641 result
= -EOPNOTSUPP
;
7645 result
= do_tmapup(ctx
, bp
, osd_op
);
7648 case CEPH_OSD_OP_TMAP2OMAP
:
7650 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7651 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
7655 case CEPH_OSD_OP_OMAPGETKEYS
:
7659 uint64_t max_return
;
7661 decode(start_after
, bp
);
7662 decode(max_return
, bp
);
7664 catch (ceph::buffer::error
& e
) {
7666 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
7669 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7670 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7672 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
7676 bool truncated
= false;
7678 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7679 ch
, ghobject_t(soid
)
7682 iter
->upper_bound(start_after
);
7683 for (num
= 0; iter
->valid(); ++num
, iter
->next()) {
7684 if (num
>= max_return
||
7685 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7689 encode(iter
->key(), bl
);
7691 } // else return empty out_set
7692 encode(num
, osd_op
.outdata
);
7693 osd_op
.outdata
.claim_append(bl
);
7694 encode(truncated
, osd_op
.outdata
);
7695 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7696 ctx
->delta_stats
.num_rd
++;
7700 case CEPH_OSD_OP_OMAPGETVALS
:
7704 uint64_t max_return
;
7705 string filter_prefix
;
7707 decode(start_after
, bp
);
7708 decode(max_return
, bp
);
7709 decode(filter_prefix
, bp
);
7711 catch (ceph::buffer::error
& e
) {
7713 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
7716 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
7717 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
7719 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
7722 bool truncated
= false;
7725 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
7726 ch
, ghobject_t(soid
)
7732 iter
->upper_bound(start_after
);
7733 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
7736 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
7737 ++num
, iter
->next()) {
7738 dout(20) << "Found key " << iter
->key() << dendl
;
7739 if (num
>= max_return
||
7740 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
7744 encode(iter
->key(), bl
);
7745 encode(iter
->value(), bl
);
7747 } // else return empty out_set
7748 encode(num
, osd_op
.outdata
);
7749 osd_op
.outdata
.claim_append(bl
);
7750 encode(truncated
, osd_op
.outdata
);
7751 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7752 ctx
->delta_stats
.num_rd
++;
7756 case CEPH_OSD_OP_OMAPGETHEADER
:
7757 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7758 if (!oi
.is_omap()) {
7759 // return empty header
7764 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
7765 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7766 ctx
->delta_stats
.num_rd
++;
7770 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
7773 set
<string
> keys_to_get
;
7775 decode(keys_to_get
, bp
);
7777 catch (ceph::buffer::error
& e
) {
7779 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7782 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
7783 map
<string
, bufferlist
> out
;
7785 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
7786 } // else return empty omap entries
7787 encode(out
, osd_op
.outdata
);
7788 ctx
->delta_stats
.num_rd_kb
+= shift_round_up(osd_op
.outdata
.length(), 10);
7789 ctx
->delta_stats
.num_rd
++;
7793 case CEPH_OSD_OP_OMAP_CMP
:
7796 if (!obs
.exists
|| oi
.is_whiteout()) {
7798 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7801 map
<string
, pair
<bufferlist
, int> > assertions
;
7803 decode(assertions
, bp
);
7805 catch (ceph::buffer::error
& e
) {
7807 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
7810 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
7812 map
<string
, bufferlist
> out
;
7816 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7817 i
!= assertions
.end();
7819 to_get
.insert(i
->first
);
7820 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
7826 } // else leave out empty
7828 //Should set num_rd_kb based on encode length of map
7829 ctx
->delta_stats
.num_rd
++;
7833 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
7834 i
!= assertions
.end();
7836 auto out_entry
= out
.find(i
->first
);
7837 bufferlist
&bl
= (out_entry
!= out
.end()) ?
7838 out_entry
->second
: empty
;
7839 switch (i
->second
.second
) {
7840 case CEPH_OSD_CMPXATTR_OP_EQ
:
7841 if (!(bl
== i
->second
.first
)) {
7845 case CEPH_OSD_CMPXATTR_OP_LT
:
7846 if (!(bl
< i
->second
.first
)) {
7850 case CEPH_OSD_CMPXATTR_OP_GT
:
7851 if (!(bl
> i
->second
.first
)) {
7869 case CEPH_OSD_OP_OMAPSETVALS
:
7870 if (!pool
.info
.supports_omap()) {
7871 result
= -EOPNOTSUPP
;
7872 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7878 maybe_create_new_object(ctx
);
7879 bufferlist to_set_bl
;
7881 decode_str_str_map_to_bl(bp
, &to_set_bl
);
7883 catch (ceph::buffer::error
& e
) {
7885 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7888 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7889 if (cct
->_conf
->subsys
.should_gather
<dout_subsys
, 20>()) {
7890 dout(20) << "setting vals: " << dendl
;
7891 map
<string
,bufferlist
> to_set
;
7892 bufferlist::const_iterator pt
= to_set_bl
.begin();
7894 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
7897 dout(20) << "\t" << i
->first
<< dendl
;
7900 t
->omap_setkeys(soid
, to_set_bl
);
7901 ctx
->clean_regions
.mark_omap_dirty();
7902 ctx
->delta_stats
.num_wr
++;
7903 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(to_set_bl
.length(), 10);
7905 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7906 obs
.oi
.clear_omap_digest();
7909 case CEPH_OSD_OP_OMAPSETHEADER
:
7910 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7911 if (!pool
.info
.supports_omap()) {
7912 result
= -EOPNOTSUPP
;
7918 maybe_create_new_object(ctx
);
7919 t
->omap_setheader(soid
, osd_op
.indata
);
7920 ctx
->clean_regions
.mark_omap_dirty();
7921 ctx
->delta_stats
.num_wr
++;
7923 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
7924 obs
.oi
.clear_omap_digest();
7927 case CEPH_OSD_OP_OMAPCLEAR
:
7928 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7929 if (!pool
.info
.supports_omap()) {
7930 result
= -EOPNOTSUPP
;
7936 if (!obs
.exists
|| oi
.is_whiteout()) {
7941 t
->omap_clear(soid
);
7942 ctx
->clean_regions
.mark_omap_dirty();
7943 ctx
->delta_stats
.num_wr
++;
7944 obs
.oi
.clear_omap_digest();
7945 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
7950 case CEPH_OSD_OP_OMAPRMKEYS
:
7951 if (!pool
.info
.supports_omap()) {
7952 result
= -EOPNOTSUPP
;
7953 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7959 if (!obs
.exists
|| oi
.is_whiteout()) {
7961 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7964 bufferlist to_rm_bl
;
7966 decode_str_set_to_bl(bp
, &to_rm_bl
);
7968 catch (ceph::buffer::error
& e
) {
7970 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7973 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7974 t
->omap_rmkeys(soid
, to_rm_bl
);
7975 ctx
->clean_regions
.mark_omap_dirty();
7976 ctx
->delta_stats
.num_wr
++;
7978 obs
.oi
.clear_omap_digest();
7981 case CEPH_OSD_OP_OMAPRMKEYRANGE
:
7982 tracepoint(osd
, do_osd_op_pre_omaprmkeyrange
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
7983 if (!pool
.info
.supports_omap()) {
7984 result
= -EOPNOTSUPP
;
7990 if (!obs
.exists
|| oi
.is_whiteout()) {
7994 std::string key_begin
, key_end
;
7996 decode(key_begin
, bp
);
7997 decode(key_end
, bp
);
7998 } catch (ceph::buffer::error
& e
) {
8002 t
->omap_rmkeyrange(soid
, key_begin
, key_end
);
8003 ctx
->clean_regions
.mark_omap_dirty();
8004 ctx
->delta_stats
.num_wr
++;
8006 obs
.oi
.clear_omap_digest();
8009 case CEPH_OSD_OP_COPY_GET
:
8011 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(),
8013 if (op_finisher
== nullptr) {
8014 result
= do_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
8016 result
= op_finisher
->execute();
8020 case CEPH_OSD_OP_COPY_FROM
:
8021 case CEPH_OSD_OP_COPY_FROM2
:
8026 object_locator_t src_oloc
;
8027 uint32_t truncate_seq
= 0;
8028 uint64_t truncate_size
= 0;
8029 bool have_truncate
= false;
8030 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
8031 version_t src_version
= op
.copy_from
.src_version
;
8033 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
8034 (op
.copy_from
.flags
& ~CEPH_OSD_COPY_FROM_FLAGS
)) {
8035 dout(20) << "invalid copy-from2 flags 0x"
8036 << std::hex
<< (int)op
.copy_from
.flags
<< std::dec
<< dendl
;
8041 decode(src_name
, bp
);
8042 decode(src_oloc
, bp
);
8043 // check if client sent us truncate_seq and truncate_size
8044 if ((op
.op
== CEPH_OSD_OP_COPY_FROM2
) &&
8045 (op
.copy_from
.flags
& CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ
)) {
8046 decode(truncate_seq
, bp
);
8047 decode(truncate_size
, bp
);
8048 have_truncate
= true;
8051 catch (ceph::buffer::error
& e
) {
8054 do_osd_op_pre_copy_from
,
8055 soid
.oid
.name
.c_str(),
8067 do_osd_op_pre_copy_from
,
8068 soid
.oid
.name
.c_str(),
8070 src_name
.name
.c_str(),
8072 src_oloc
.key
.c_str(),
8073 src_oloc
.nspace
.c_str(),
8077 if (op_finisher
== nullptr) {
8080 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
8081 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
8082 raw_pg
.ps(), raw_pg
.pool(),
8085 dout(20) << " copy from self is invalid" << dendl
;
8089 CopyFromCallback
*cb
= new CopyFromCallback(ctx
, osd_op
);
8091 cb
->set_truncate(truncate_seq
, truncate_size
);
8092 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8093 new CopyFromFinisher(cb
));
8094 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
8097 op
.copy_from
.src_fadvise_flags
,
8099 result
= -EINPROGRESS
;
8102 result
= op_finisher
->execute();
8103 ceph_assert(result
== 0);
8105 // COPY_FROM cannot be executed multiple times -- it must restart
8106 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
8112 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
8113 dout(1) << "unrecognized osd op " << op
.op
8114 << " " << ceph_osd_op_name(op
.op
)
8116 result
= -EOPNOTSUPP
;
8120 osd_op
.rval
= result
;
8121 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
8122 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
) &&
8123 result
!= -EAGAIN
&& result
!= -EINPROGRESS
)
8130 dout(10) << __func__
<< " error: " << cpp_strerror(result
) << dendl
;
// _get_tmap: read the object's legacy TMAP blob by issuing an internal
// CEPH_OSD_OP_TMAPGET sub-op via do_osd_ops(), then split the sub-op's
// outdata into *header and *vals.
// NOTE(review): this region of the file is a garbled extraction; several
// interior source lines (returns, the header decode) are missing, so the
// comments below describe only what the visible fragments show.
8135 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
// A zero-sized object cannot carry a tmap; log and take the early-out
// path (the return statement itself is not visible in this extraction).
8137 if (ctx
->new_obs
.oi
.size
== 0) {
8138 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
// Build a one-element op vector holding a TMAPGET and execute it locally.
8141 vector
<OSDOp
> nops(1);
8142 OSDOp
&newop
= nops
[0];
8143 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
8144 do_osd_ops(ctx
, nops
);
// Walk the sub-op's output. Presumably *header is decoded from outdata
// here (that line is missing from this extraction — TODO confirm against
// the original file); *vals then receives the remaining undecoded bytes
// verbatim via substr_of().
8146 bufferlist::const_iterator i
= newop
.outdata
.begin();
8148 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
// Both the failure and success outcomes of the decode are logged.
8150 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
8154 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
// _verify_no_head_clones: before a head object may be evicted, confirm
// that every clone recorded in the SnapSet is already absent locally and
// has no pending copy/promote; otherwise eviction must be refused.
// NOTE(review): garbled extraction — some interior lines (return
// statements, the loop increment) are missing; comments reflect only the
// visible fragments.
8159 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
8162 // verify that all clones have been evicted
8163 dout(20) << __func__
<< " verifying clones are absent "
// Walk each clone snapid listed in the snapset and form the clone's oid.
8165 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
8166 p
!= ss
.clones
.end();
8168 hobject_t clone_oid
= soid
;
8169 clone_oid
.snap
= *p
;
// A clone the PG knows to be missing is handled by the (not visible)
// branch body — TODO confirm against the original file.
8170 if (is_missing_object(clone_oid
))
// If the clone object still exists locally, the head cannot be evicted
// before it.
8172 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
8173 if (clone_obc
&& clone_obc
->obs
.exists
) {
8174 dout(10) << __func__
<< " cannot evict head before clone "
8175 << clone_oid
<< dendl
;
// A pending copy/promote targeting the clone also blocks head eviction.
8178 if (copy_ops
.count(clone_oid
)) {
8179 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
8180 << clone_oid
<< dendl
;
// _delete_oid: delete — or, on a cache tier, whiteout — the object
// described by ctx->new_obs inside the in-flight transaction, updating
// ctx->delta_stats and disconnecting any watchers.
//   no_whiteout:     never whiteout, no matter what.
//   try_no_whiteout: prefer a real delete when clones permit it.
// NOTE(review): garbled extraction — many interior lines (e.g. the
// t->remove()/return statements and several branch bodies) are missing;
// the comments below describe only the visible fragments.
8187 inline int PrimaryLogPG::_delete_oid(
8189 bool no_whiteout
, // no whiteouts, no matter what.
8190 bool try_no_whiteout
) // try not to whiteout
// Aliases into the op context: the new snapset/object state being built,
// and the transaction the delete is recorded into.
8192 SnapSet
& snapset
= ctx
->new_snapset
;
8193 ObjectState
& obs
= ctx
->new_obs
;
8194 object_info_t
& oi
= obs
.oi
;
8195 const hobject_t
& soid
= oi
.soid
;
8196 PGTransaction
* t
= ctx
->op_t
.get();
8198 // cache: set whiteout on delete?
8199 bool whiteout
= false;
// Whiteouts are only considered when this pool runs a cache mode and the
// caller did not ask to avoid them.
8200 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
8202 && !try_no_whiteout
) {
8206 // in luminous or later, we can't delete the head if there are
8207 // clones. we trust the caller passing no_whiteout has already
8208 // verified they don't exist.
8209 if (!snapset
.clones
.empty() ||
8210 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
8212 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
8215 dout(20) << __func__
<< " has or will have clones; will whiteout"
// Trace the final whiteout decision alongside the caller's flags.
8220 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
8221 << " no_whiteout=" << (int)no_whiteout
8222 << " try_no_whiteout=" << (int)try_no_whiteout
// Nothing to do if the object is already gone, or is already a whiteout
// and we would only whiteout it again.
8224 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
// Account the object's whole extent as modified and dirty so replicas /
// clients see the delete's effect on the data range.
8230 interval_set
<uint64_t> ch
;
8231 ch
.insert(0, oi
.size
);
8232 ctx
->modified_ranges
.union_of(ch
);
8233 ctx
->clean_regions
.mark_data_region_dirty(0, oi
.size
);
8236 ctx
->clean_regions
.mark_omap_dirty();
8237 ctx
->delta_stats
.num_wr
++;
// Byte accounting differs for snap clones (clone_overlap-based bytes)
// versus heads (plain oi.size).
8238 if (soid
.is_snap()) {
8239 ceph_assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
8240 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
8242 ctx
->delta_stats
.num_bytes
-= oi
.size
;
8247 // disconnect all watchers
8248 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
8249 oi
.watchers
.begin();
8250 p
!= oi
.watchers
.end();
8252 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
8253 ctx
->watch_disconnects
.push_back(
8254 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
8256 oi
.watchers
.clear();
// Whiteout path: flag the object instead of removing it and bump the
// whiteout counters (the branch-selection lines are not visible here).
8259 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
8260 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
8261 ctx
->delta_stats
.num_whiteouts
++;
8263 osd
->logger
->inc(l_osd_tier_whiteout
);
// Dropping a manifest object must also release its chunk/redirect refs.
8267 if (oi
.has_manifest()) {
8268 ctx
->delta_stats
.num_objects_manifest
--;
8269 dec_all_refcount_manifest(oi
, ctx
);
// Real-delete path: object (and, where applicable, clone / whiteout /
// cache-pin) counters all go down.
8273 ctx
->delta_stats
.num_objects
--;
8275 ctx
->delta_stats
.num_object_clones
--;
8276 if (oi
.is_whiteout()) {
8277 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
8278 ctx
->delta_stats
.num_whiteouts
--;
8279 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
8281 if (oi
.is_cache_pinned()) {
8282 ctx
->delta_stats
.num_objects_pinned
--;
// _rollback_to: handle a snapshot-rollback op. Look up the clone that
// matches the requested snapid, then either block the write (clone
// degraded / cache busy / promote pending), delete the head (no such
// snapshot), or clone the snapshot back over the head via
// _do_rollback_to().
// NOTE(review): garbled extraction — interior lines (returns, the
// argument lists of maybe_handle_manifest_detail()/maybe_handle_cache_
// detail(), several branch bodies) are missing; the comments below
// describe only the visible fragments.
8288 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, OSDOp
& op
)
// Aliases into the op context plus the snapid the client asked for.
8290 ObjectState
& obs
= ctx
->new_obs
;
8291 object_info_t
& oi
= obs
.oi
;
8292 const hobject_t
& soid
= oi
.soid
;
8293 snapid_t snapid
= (uint64_t)op
.op
.snap
.snapid
;
8294 hobject_t missing_oid
;
8296 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
8298 ObjectContextRef rollback_to
;
// Locate the object context for (oid, requested snapid).
8300 int ret
= find_object_context(
8301 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
8302 soid
.get_namespace()),
8303 &rollback_to
, false, false, &missing_oid
);
// -EAGAIN: the target clone exists but is missing/backfilling locally;
// block the write until it has been recovered.
8304 if (ret
== -EAGAIN
) {
8305 /* clone must be missing */
8306 ceph_assert(is_degraded_or_backfilling_object(missing_oid
) || is_degraded_on_async_recovery_target(missing_oid
));
8307 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
8308 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
8309 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
// Tiering/manifest handling before the rollback proper.
8313 ObjectContextRef promote_obc
;
8314 cache_result_t tier_mode_result
;
8315 if (obs
.exists
&& obs
.oi
.has_manifest()) {
8317 * In the case of manifest object, the object_info exists on the base tier at all time,
8318 * so promote_obc should be equal to rollback_to
8320 promote_obc
= rollback_to
;
// NOTE(review): the argument lists of these two calls were dropped by
// the extraction; only the call sites remain visible.
8322 maybe_handle_manifest_detail(
8328 maybe_handle_cache_detail(
// React to the tiering decision: proceed, block behind a promote or a
// full cache, or abort on states that cannot occur on the primary here.
8338 switch (tier_mode_result
) {
8339 case cache_result_t::NOOP
:
8341 case cache_result_t::BLOCKED_PROMOTE
:
8342 ceph_assert(promote_obc
);
8343 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
8345 case cache_result_t::BLOCKED_FULL
:
8346 block_write_on_full_cache(soid
, ctx
->op
);
8348 case cache_result_t::REPLIED_WITH_EAGAIN
:
8349 ceph_abort_msg("this can't happen, no rollback on replica");
8351 ceph_abort_msg("must promote was set, other values are not valid");
8356 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
8357 // there's no snapshot here, or there's no object.
8358 // if there's no snapshot, we delete the object; otherwise, do nothing.
8359 dout(20) << "_rollback_to deleting head on " << soid
.oid
8360 << " because got ENOENT|whiteout on find_object_context" << dendl
;
8361 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
8362 // Cannot delete an object with watchers
8365 _delete_oid(ctx
, false, false);
8369 // ummm....huh? It *can't* return anything else at time of writing.
8370 ceph_abort_msg("unexpected error code in _rollback_to");
8371 } else { //we got our context, let's use it to do the rollback!
8372 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
// Can't roll back onto a degraded clone; block until it is recovered.
8373 if (is_degraded_or_backfilling_object(rollback_to_sobject
) ||
8374 is_degraded_on_async_recovery_target(rollback_to_sobject
)) {
8375 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8376 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
8377 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
8379 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
8380 // rolling back to the head; we just need to clone it.
// Chunked-manifest clone: the head must first take references on the
// clone's chunks (may complete asynchronously via a SetManifestFinisher).
8383 if (rollback_to
->obs
.oi
.has_manifest() && rollback_to
->obs
.oi
.manifest
.is_chunked()) {
8385 * looking at the following case, the foo head needs the reference of chunk4 and chunk5
8386 * in case snap[1] is removed.
8388 * Before rollback to snap[1]:
8390 * foo snap[1]: [chunk4] [chunk5]
8391 * foo snap[0]: [ chunk2 ]
8392 * foo head : [chunk1] [chunk3]
8396 * foo snap[1]: [chunk4] [chunk5]
8397 * foo snap[0]: [ chunk2 ]
8398 * foo head : [chunk4] [chunk5]
// Re-entrancy: a finisher stored under this subop number means we are
// resuming after the async refcount step completed.
8401 OpFinisher
* op_finisher
= nullptr;
8402 auto op_finisher_it
= ctx
->op_finishers
.find(ctx
->current_osd_subop_num
);
8403 if (op_finisher_it
!= ctx
->op_finishers
.end()) {
8404 op_finisher
= op_finisher_it
->second
.get();
8407 bool need_inc_ref
= inc_refcount_by_set(ctx
, rollback_to
->obs
.oi
.manifest
, op
);
// First pass and refs still pending: park a finisher and report
// -EINPROGRESS so the op is retried when the refcount completes.
8409 ceph_assert(op_finisher_it
== ctx
->op_finishers
.end());
8410 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
8411 new SetManifestFinisher(op
));
8412 return -EINPROGRESS
;
8415 op_finisher
->execute();
8416 ctx
->op_finishers
.erase(ctx
->current_osd_subop_num
);
// Finally clone the snapshot's content back over the head.
8419 _do_rollback_to(ctx
, rollback_to
, op
);
// Perform the actual rollback of a head object to the clone 'rollback_to':
// deletes/overwrites the current head via t->clone(), recomputes clone
// overlaps, and copies size/digests/manifest/omap state from the clone.
// NOTE(review): this excerpt is elided (original line numbers jump); the
// third parameter (the OSDOp 'op' used below) and several braces are not
// visible here — confirm against the full source before editing.
8425 void PrimaryLogPG::_do_rollback_to(OpContext
*ctx
, ObjectContextRef rollback_to
,
// Aliases into the op context: the snapset/object state being built.
8428 SnapSet
& snapset
= ctx
->new_snapset
;
8429 ObjectState
& obs
= ctx
->new_obs
;
8430 object_info_t
& oi
= obs
.oi
;
8431 const hobject_t
& soid
= oi
.soid
;
8432 PGTransaction
* t
= ctx
->op_t
.get();
// Requested snapid comes from the client op (presumably the ROLLBACK op).
8433 snapid_t snapid
= (uint64_t)op
.op
.snap
.snapid
;
8434 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
8436 /* 1) Delete current head
8437 * 2) Clone correct snapshot into head
8438 * 3) Calculate clone_overlaps by following overlaps
8439 * forward from rollback snapshot */
8440 dout(10) << "_do_rollback_to deleting " << soid
.oid
8441 << " and rolling back to old snap" << dendl
;
// If the head currently carries a manifest, drop all chunk references
// before it is replaced by the clone's contents.
8445 if (obs
.oi
.has_manifest()) {
8446 dec_all_refcount_manifest(obs
.oi
, ctx
);
8447 oi
.manifest
.clear();
8448 oi
.manifest
.type
= object_manifest_t::TYPE_NONE
;
8449 oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
8450 ctx
->delta_stats
.num_objects_manifest
--;
8451 ctx
->cache_operation
= true; // do not trigger to call ref function to calculate refcount
// Clone the rollback target over the head in the transaction.
8454 t
->clone(soid
, rollback_to_sobject
);
8455 t
->add_obc(rollback_to
);
// Walk clone_overlap forward from the rollback snap, intersecting overlaps
// to find the ranges unchanged between the clone and the head.
8457 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
8458 snapset
.clone_overlap
.lower_bound(snapid
);
8459 ceph_assert(iter
!= snapset
.clone_overlap
.end());
8460 interval_set
<uint64_t> overlaps
= iter
->second
;
8462 iter
!= snapset
.clone_overlap
.end();
8464 overlaps
.intersection_of(iter
->second
);
// Everything in [0, head size) not covered by the overlap is modified.
8466 if (obs
.oi
.size
> 0) {
8467 interval_set
<uint64_t> modified
;
8468 modified
.insert(0, obs
.oi
.size
);
8469 overlaps
.intersection_of(modified
);
8470 modified
.subtract(overlaps
);
8471 ctx
->modified_ranges
.union_of(modified
);
8474 // Adjust the cached objectcontext
8475 maybe_create_new_object(ctx
, true);
// Account the size change and mark the whole larger extent dirty.
8476 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
8477 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
8478 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, rollback_to
->obs
.oi
.size
));
8479 ctx
->clean_regions
.mark_omap_dirty();
8480 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
// Digests are only valid if the clone carried them; otherwise clear.
8481 if (rollback_to
->obs
.oi
.is_data_digest())
8482 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
8484 obs
.oi
.clear_data_digest();
8485 if (rollback_to
->obs
.oi
.is_omap_digest())
8486 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
8488 obs
.oi
.clear_omap_digest();
// Chunked-manifest clones: the head inherits the clone's chunk map.
8490 if (rollback_to
->obs
.oi
.has_manifest() && rollback_to
->obs
.oi
.manifest
.is_chunked()) {
8491 obs
.oi
.set_flag(object_info_t::FLAG_MANIFEST
);
8492 obs
.oi
.manifest
.type
= rollback_to
->obs
.oi
.manifest
.type
;
8493 obs
.oi
.manifest
.chunk_map
= rollback_to
->obs
.oi
.manifest
.chunk_map
;
8494 ctx
->cache_operation
= true;
8495 ctx
->delta_stats
.num_objects_manifest
++;
// Propagate (or clear) the OMAP flag to match the rollback target.
8498 if (rollback_to
->obs
.oi
.is_omap()) {
8499 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
8500 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8502 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
8503 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
// Record a clone of 'head' into 'coid' in the transaction and persist the
// clone's object_info_t (encoded with OSD feature bits) as its OI attr,
// removing any stale snapset attr on the clone.
// NOTE(review): several parameters (ctx, transaction t, poi) are elided
// from this excerpt — confirm the full signature in the original source.
8507 void PrimaryLogPG::_make_clone(
8510 ObjectContextRef clone_obc
,
8511 const hobject_t
& head
, const hobject_t
& coid
,
// Encode the clone's object info for storage as OI_ATTR.
8515 encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8517 t
->clone(coid
, head
);
8518 setattr_maybe_cache(clone_obc
, t
, OI_ATTR
, bv
);
// Clones never carry their own snapset attribute.
8519 rmattr_maybe_cache(clone_obc
, t
, SS_ATTR
);
// Prepare the head object for a write under the op's SnapContext: maintain
// DIRTY/OMAP accounting, create a clone of the head if there are newer snaps
// than the snapset has seen, update clone_overlap/clone_size/clone_snaps and
// the pg log, and advance the snapset seq.
// NOTE(review): excerpt is elided (original line numbers jump); some loop
// bodies, braces and else-branches are not visible here.
8522 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
8524 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8525 SnapContext
& snapc
= ctx
->snapc
;
// Only head objects are made writeable; clones are never written directly.
8528 ceph_assert(soid
.snap
== CEPH_NOSNAP
);
8529 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
8530 << " snapc=" << snapc
<< dendl
;
// --- cache-tier DIRTY flag accounting ---
8532 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
8533 if (ctx
->new_obs
.exists
) {
8534 // we will mark the object dirty
8535 if (ctx
->undirty
&& was_dirty
) {
8536 dout(20) << " clearing DIRTY flag" << dendl
;
8537 ceph_assert(ctx
->new_obs
.oi
.is_dirty());
8538 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8539 --ctx
->delta_stats
.num_objects_dirty
;
8540 osd
->logger
->inc(l_osd_tier_clean
);
8541 } else if (!was_dirty
&& !ctx
->undirty
) {
8542 dout(20) << " setting DIRTY flag" << dendl
;
8543 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
8544 ++ctx
->delta_stats
.num_objects_dirty
;
8545 osd
->logger
->inc(l_osd_tier_dirty
);
8549 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
8550 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8551 --ctx
->delta_stats
.num_objects_dirty
;
// --- num_objects_omap accounting: object gained or lost omap data ---
8555 if ((ctx
->new_obs
.exists
&&
8556 ctx
->new_obs
.oi
.is_omap()) &&
8557 (!ctx
->obc
->obs
.exists
||
8558 !ctx
->obc
->obs
.oi
.is_omap())) {
8559 ++ctx
->delta_stats
.num_objects_omap
;
8561 if ((!ctx
->new_obs
.exists
||
8562 !ctx
->new_obs
.oi
.is_omap()) &&
8563 (ctx
->obc
->obs
.exists
&&
8564 ctx
->obc
->obs
.oi
.is_omap())) {
8565 --ctx
->delta_stats
.num_objects_omap
;
8568 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
8569 dout(10) << " op snapset is old" << dendl
;
// Clone is needed when the head exists (and isn't a whiteout), the snap
// context carries snaps, this isn't a cache-internal op, and the newest
// snap is newer than anything the snapset has recorded.
8572 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
8573 snapc
.snaps
.size() && // there are snaps
8574 !ctx
->cache_operation
&&
8575 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
8577 hobject_t coid
= soid
;
8578 coid
.snap
= snapc
.seq
;
// Collect the prefix of snapc.snaps newer than the snapset seq; those are
// the snaps this new clone will represent.
8580 const auto snaps
= [&] {
8581 auto last
= find_if_not(
8582 begin(snapc
.snaps
), end(snapc
.snaps
),
8583 [&](snapid_t snap_id
) { return snap_id
> ctx
->new_snapset
.seq
; });
8584 return vector
<snapid_t
>{begin(snapc
.snaps
), last
};
8588 object_info_t
static_snap_oi(coid
);
8589 object_info_t
*snap_oi
;
// Build a clone obc mirroring the head's state and snapset context.
8591 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
8592 ctx
->clone_obc
->destructor_callback
=
8593 new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
8594 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
8595 ctx
->clone_obc
->obs
.exists
= true;
8596 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
8597 ctx
->clone_obc
->ssc
->ref
++;
8598 if (pool
.info
.is_erasure())
8599 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
8600 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
// Manifest heads: the clone inherits the redirect target or chunk map.
8601 if (ctx
->obc
->obs
.oi
.has_manifest()) {
8602 if ((ctx
->obc
->obs
.oi
.flags
& object_info_t::FLAG_REDIRECT_HAS_REFERENCE
) &&
8603 ctx
->obc
->obs
.oi
.manifest
.is_redirect()) {
8604 snap_oi
->set_flag(object_info_t::FLAG_MANIFEST
);
8605 snap_oi
->manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
8606 snap_oi
->manifest
.redirect_target
= ctx
->obc
->obs
.oi
.manifest
.redirect_target
;
8607 } else if (ctx
->obc
->obs
.oi
.manifest
.is_chunked()) {
8608 snap_oi
->set_flag(object_info_t::FLAG_MANIFEST
);
8609 snap_oi
->manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
8610 snap_oi
->manifest
.chunk_map
= ctx
->obc
->obs
.oi
.manifest
.chunk_map
;
8612 ceph_abort_msg("unrecognized manifest type");
// Take a greedy write lock on the clone obc for this op.
8615 bool got
= ctx
->lock_manager
.get_write_greedy(
8620 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
8622 snap_oi
= &static_snap_oi
;
// Version/user-bits for the clone come from the pre-op head state.
8624 snap_oi
->version
= ctx
->at_version
;
8625 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
8626 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
8628 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
// Stat deltas for the newly created clone object.
8630 ctx
->delta_stats
.num_objects
++;
8631 if (snap_oi
->is_dirty()) {
8632 ctx
->delta_stats
.num_objects_dirty
++;
8633 osd
->logger
->inc(l_osd_tier_dirty
);
8635 if (snap_oi
->is_omap())
8636 ctx
->delta_stats
.num_objects_omap
++;
8637 if (snap_oi
->is_cache_pinned())
8638 ctx
->delta_stats
.num_objects_pinned
++;
8639 if (snap_oi
->has_manifest())
8640 ctx
->delta_stats
.num_objects_manifest
++;
8641 ctx
->delta_stats
.num_object_clones
++;
// Register the clone in the snapset.
8642 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
8643 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
8644 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
8646 // clone_overlap should contain an entry for each clone
8647 // (an empty interval_set if there is no overlap)
8648 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
8649 if (ctx
->obs
->oi
.size
) {
8650 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
8654 dout(10) << " cloning v " << ctx
->obs
->oi
.version
8655 << " to " << coid
<< " v " << ctx
->at_version
8656 << " snaps=" << snaps
8657 << " snapset=" << ctx
->new_snapset
<< dendl
;
// Log the CLONE and consume a version number for it.
8658 ctx
->log
.push_back(pg_log_entry_t(
8659 pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
8660 ctx
->obs
->oi
.version
,
8661 ctx
->obs
->oi
.user_version
,
8662 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
8663 encode(snaps
, ctx
->log
.back().snaps
);
8665 ctx
->at_version
.version
++;
8668 // update most recent clone_overlap and usage stats
8669 if (ctx
->new_snapset
.clones
.size() > 0) {
8670 // the clone_overlap is difference of range between head and clones.
8671 // we need to check whether the most recent clone exists, if it's
8672 // been evicted, it's not included in the stats, but the clone_overlap
8673 // is still exist in the snapset, so we should update the
8674 // clone_overlap to make it sense.
8675 hobject_t last_clone_oid
= soid
;
8676 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
8677 interval_set
<uint64_t> &newest_overlap
=
8678 ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
8679 ctx
->modified_ranges
.intersection_of(newest_overlap
);
8680 if (is_present_clone(last_clone_oid
)) {
8681 // modified_ranges is still in use by the clone
8682 ctx
->delta_stats
.num_bytes
+= ctx
->modified_ranges
.size();
8684 newest_overlap
.subtract(ctx
->modified_ranges
);
8687 if (snapc
.seq
> ctx
->new_snapset
.seq
) {
8688 // update snapset with latest snap context
8689 ctx
->new_snapset
.seq
= snapc
.seq
;
// Pre-octopus OSDs still persist the snaps list in the snapset.
8690 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
8691 ctx
->new_snapset
.snaps
= snapc
.snaps
;
8693 ctx
->new_snapset
.snaps
.clear();
8696 dout(20) << "make_writeable " << soid
8697 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
// Fold a write of [offset, offset+length) into the modified-range set and
// update byte/write-op stats; grows the accounted size when the write
// extends past the current object size.
// NOTE(review): the conditions guarding the two ch.insert() calls and the
// size-change branch are partially elided in this excerpt (write_full vs
// extending write) — confirm against the full source.
8701 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
8702 interval_set
<uint64_t>& modified
, uint64_t offset
,
8703 uint64_t length
, bool write_full
)
8705 interval_set
<uint64_t> ch
;
8708 ch
.insert(0, oi
.size
);
8710 ch
.insert(offset
, length
);
8711 modified
.union_of(ch
);
// Extending write: swap old size for the new end offset in num_bytes.
8713 (offset
+ length
> oi
.size
&& length
)) {
8714 uint64_t new_size
= offset
+ length
;
8715 delta_stats
.num_bytes
-= oi
.size
;
8716 delta_stats
.num_bytes
+= new_size
;
// Every call counts one write op and its KiB (rounded up).
8720 delta_stats
.num_wr
++;
8721 delta_stats
.num_wr_kb
+= shift_round_up(length
, 10);
// Update byte accounting and the object's recorded size for a truncate.
// No-op when the size is unchanged.
// NOTE(review): the object_info_t& oi parameter is elided from this
// excerpt (it is referenced in the body) — confirm the full signature.
8724 void PrimaryLogPG::truncate_update_size_and_usage(
8725 object_stat_sum_t
& delta_stats
,
8727 uint64_t truncate_size
)
8729 if (oi
.size
!= truncate_size
) {
8730 delta_stats
.num_bytes
-= oi
.size
;
8731 delta_stats
.num_bytes
+= truncate_size
;
8732 oi
.size
= truncate_size
;
// Remove each watcher listed in 'to_disconnect' from the object context
// and tear down the Watch (optionally sending a disconnect notification).
// Watchers no longer present are only logged.
8736 void PrimaryLogPG::complete_disconnect_watches(
8737 ObjectContextRef obc
,
8738 const list
<watch_disconnect_t
> &to_disconnect
)
8740 for (list
<watch_disconnect_t
>::const_iterator i
=
8741 to_disconnect
.begin();
8742 i
!= to_disconnect
.end();
// Watchers are keyed by (cookie, entity name).
8744 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
8745 auto watchers_entry
= obc
->watchers
.find(watcher
);
8746 if (watchers_entry
!= obc
->watchers
.end()) {
8747 WatchRef watch
= watchers_entry
->second
;
8748 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
8749 obc
->watchers
.erase(watcher
);
8750 watch
->remove(i
->send_disconnect
);
8752 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8753 << watcher
<< dendl
;
// Apply the watch/notify side effects accumulated on the op context after
// the op committed: process disconnects, new watch connects, notifies to
// all watchers, and notify-acks addressed to this entity's watches.
// NOTE(review): excerpt is elided; loop increments, the notif construction
// arguments and some braces are not visible here.
8758 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
8760 entity_name_t entity
= ctx
->reqid
.name
;
8761 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
8763 // disconnects first
8764 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
8768 auto session
= conn
->get_priv();
// --- watch connects: reuse an existing Watch or create a new one ---
8772 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
8773 i
!= ctx
->watch_connects
.end();
8775 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
8776 dout(15) << "do_osd_op_effects applying watch connect on session "
8777 << session
.get() << " watcher " << watcher
<< dendl
;
8779 if (ctx
->obc
->watchers
.count(watcher
)) {
8780 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8782 watch
= ctx
->obc
->watchers
[watcher
];
8784 dout(15) << "do_osd_op_effects new watcher " << watcher
8786 watch
= Watch::makeWatchRef(
8787 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
8788 i
->first
.cookie
, entity
, conn
->get_peer_addr());
8789 ctx
->obc
->watchers
.insert(
8794 watch
->connect(conn
, i
->second
);
// --- notifies: start a Notify on every current watcher of the object ---
8797 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
8798 p
!= ctx
->notifies
.end();
8800 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
8801 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
8803 Notify::makeNotifyRef(
8805 ctx
->reqid
.name
.num(),
8810 ctx
->obc
->obs
.oi
.user_version
,
8812 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8813 ctx
->obc
->watchers
.begin();
8814 i
!= ctx
->obc
->watchers
.end();
8816 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
8817 i
->second
->start_notify(notif
);
// --- notify acks: deliver to this entity's matching watches ---
8822 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
8823 p
!= ctx
->notify_acks
.end();
8825 if (p
->watch_cookie
)
8826 dout(10) << "notify_ack " << make_pair(*(p
->watch_cookie
), p
->notify_id
) << dendl
;
8828 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
8829 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
8830 ctx
->obc
->watchers
.begin();
8831 i
!= ctx
->obc
->watchers
.end();
// Ack only watches owned by this entity, and only the cookie the ack
// names (an absent cookie acks all of the entity's watches).
8833 if (i
->first
.second
!= entity
) continue;
8834 if (p
->watch_cookie
&&
8835 *(p
->watch_cookie
) != i
->first
.first
) continue;
8836 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
8837 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
// Build a unique temp object name for 'target' from the pgid, role, the
// monitor-assigned global id, and a per-PG sequence counter.
// NOTE(review): the declaration of the stream 'ss' is elided here.
8842 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
8845 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
8846 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
8847 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8848 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Build a deterministic temp object name used while recovering 'target';
// uniqueness comes from pgid + version + interval + snapid (see comment
// below). Additional parameters (e.g. the version) are elided from view.
8852 hobject_t
PrimaryLogPG::get_temp_recovery_object(
8853 const hobject_t
& target
,
8857 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
8859 << "_" << info
.history
.same_interval_since
8860 << "_" << target
.snap
;
8861 // pgid + version + interval + snapid is unique, and short
8862 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
8863 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Validate and execute the ops on the context, enforce pool-full policy,
// clone the head if required (make_writeable) and queue the resulting
// MODIFY/DELETE log entry. Returns an error code or the do_osd_ops result.
// NOTE(review): excerpt is elided; several early-return paths and the
// finish_ctx call that consumes the MODIFY/DELETE choice are not visible.
8867 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
8869 ceph_assert(!ctx
->ops
->empty());
8871 // valid snap context?
8872 if (!ctx
->snapc
.is_valid()) {
8873 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
8877 // prepare the actual mutation
8878 int result
= do_osd_ops(ctx
, *ctx
->ops
);
// Since kraken, failed writes are still logged (update_log_only) so
// duplicate ops can be detected later.
8880 if (ctx
->op
->may_write() &&
8881 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8882 // need to save the error code in the pg log, to detect dup ops,
8883 // but do nothing else
8884 ctx
->update_log_only
= true;
8889 // read-op? write-op noop? done?
8890 if (ctx
->op_t
->empty() && !ctx
->modify
) {
8891 if (ctx
->pending_async_reads
.empty())
8892 unstable_stats
.add(ctx
->delta_stats
);
8893 if (ctx
->op
->may_write() &&
8894 get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
8895 ctx
->update_log_only
= true;
// Pool-full policy: MDS and FULL_FORCE ops proceed, FULL_TRY gets an
// explicit error, anything else is dropped as a misbehaving client.
8901 if ((ctx
->delta_stats
.num_bytes
> 0 ||
8902 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
8903 pool
.info
.has_flag(pg_pool_t::FLAG_FULL
)) {
8904 auto m
= ctx
->op
->get_req
<MOSDOp
>();
8905 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
8906 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
8907 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
8909 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
8910 // they tried, they failed.
8911 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
8912 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL_QUOTA
) ? -EDQUOT
: -ENOSPC
;
8915 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
8920 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8921 // clone, if necessary
8922 if (soid
.snap
== CEPH_NOSNAP
)
8923 make_writeable(ctx
);
// Log entry type depends on whether the object survives the op.
8926 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
8927 pg_log_entry_t::DELETE
,
// Finalize a mutation: manifest reference bookkeeping for dirtied chunks,
// user_version advancement, object_info/snapset attribute encoding, pg log
// entry construction (including returnvec and clean_regions), and applying
// the new object state to the cached obc/ssc.
// NOTE(review): excerpt is elided; some branches (e.g. the !new_obs.exists
// path, log push_back wrapper) are not fully visible here.
8933 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
, int result
)
8935 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
8936 dout(20) << __func__
<< " " << soid
<< " " << ctx
8937 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
8939 utime_t now
= ceph_clock_now();
8942 // Drop the reference if deduped chunk is modified
8943 if (ctx
->new_obs
.oi
.is_dirty() &&
8944 (ctx
->obs
->oi
.has_manifest() && ctx
->obs
->oi
.manifest
.is_chunked()) &&
8945 !ctx
->cache_operation
&&
8946 log_op_type
!= pg_log_entry_t::PROMOTE
) {
8947 update_chunk_map_by_dirty(ctx
);
8948 // If a clone is creating, ignore dropping the reference for manifest object
8949 if (!ctx
->delta_stats
.num_object_clones
) {
8950 dec_refcount_by_dirty(ctx
);
8954 // finish and log the op.
8955 if (ctx
->user_modify
) {
8956 // update the user_version for any modify ops, except for the watch op
8957 ctx
->user_at_version
= std::max(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
8958 /* In order for new clients and old clients to interoperate properly
8959 * when exchanging versions, we need to lower bound the user_version
8960 * (which our new clients pay proper attention to)
8961 * by the at_version (which is all the old clients can ever see). */
8962 if (ctx
->at_version
.version
> ctx
->user_at_version
)
8963 ctx
->user_at_version
= ctx
->at_version
.version
;
8964 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
8966 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
// If the object still exists, stamp versions/reqid/mtime on its oi and
// persist the encoded oi (and, for heads, the snapset) as attrs.
8968 if (ctx
->new_obs
.exists
) {
8969 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
8970 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
8971 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
8972 if (ctx
->mtime
!= utime_t()) {
8973 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
8974 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8975 ctx
->new_obs
.oi
.local_mtime
= now
;
8977 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
8981 map
<string
, bufferlist
, less
<>> attrs
;
8982 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
8983 encode(ctx
->new_obs
.oi
, bv
,
8984 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
8985 attrs
[OI_ATTR
] = std::move(bv
);
8988 if (soid
.snap
== CEPH_NOSNAP
) {
8989 dout(10) << " final snapset " << ctx
->new_snapset
8990 << " in " << soid
<< dendl
;
8992 encode(ctx
->new_snapset
, bss
);
8993 attrs
[SS_ATTR
] = std::move(bss
);
8995 dout(10) << " no snapset (this is a clone)" << dendl
;
8997 ctx
->op_t
->setattrs(soid
, attrs
);
// Object deleted: reset the cached oi to a blank one for this soid.
9000 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
// Append the log entry; the per-op result is recorded only for clients
// that understand return vectors.
9005 pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
9006 ctx
->obs
->oi
.version
,
9007 ctx
->user_at_version
, ctx
->reqid
,
9009 (ctx
->op
&& ctx
->op
->allows_returnvec()) ? result
: 0));
9010 if (ctx
->op
&& ctx
->op
->allows_returnvec()) {
9011 // also the per-op values
9012 ctx
->log
.back().set_op_returns(*ctx
->ops
);
9013 dout(20) << __func__
<< " op_returns " << ctx
->log
.back().op_returns
9017 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
9018 dout(20) << __func__
<< " object " << soid
<< " marks clean_regions " << ctx
->log
.back().clean_regions
<< dendl
;
// Clone-targeted modify-type ops also carry the clone's snaps in the log.
9020 if (soid
.snap
< CEPH_NOSNAP
) {
9021 switch (log_op_type
) {
9022 case pg_log_entry_t::MODIFY
:
9023 case pg_log_entry_t::PROMOTE
:
9024 case pg_log_entry_t::CLEAN
:
9025 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
9027 encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
9034 if (!ctx
->extra_reqids
.empty()) {
9035 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< " "
9036 << ctx
->extra_reqid_return_codes
<< dendl
;
9037 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
9038 ctx
->log
.back().extra_reqid_return_codes
.swap(ctx
->extra_reqid_return_codes
);
9041 // apply new object state.
9042 ctx
->obc
->obs
= ctx
->new_obs
;
9044 if (soid
.is_head() && !ctx
->obc
->obs
.exists
) {
9045 ctx
->obc
->ssc
->exists
= false;
9046 ctx
->obc
->ssc
->snapset
= SnapSet();
9048 ctx
->obc
->ssc
->exists
= true;
9049 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
// Apply an op's stat delta to recovery state, queue it for backfill
// targets whose last_backfill hasn't yet passed this object, and inform
// the scrubber.
9053 void PrimaryLogPG::apply_stats(
9054 const hobject_t
&soid
,
9055 const object_stat_sum_t
&delta_stats
) {
9057 recovery_state
.apply_op_stats(soid
, delta_stats
);
9058 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
9059 i
!= get_backfill_targets().end();
9062 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
// Only objects past the peer's backfill horizon but already started
// locally need their stats replayed to that peer.
9063 if (soid
> pinfo
.last_backfill
&& soid
<= last_backfill_started
) {
9064 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
9068 m_scrubber
->stats_of_handled_objects(delta_stats
, soid
);
// Finish a read-only op: aggregate per-op results/bytes, fill in reply
// versions (normal, by-user-version, or ENOENT floor), and send the
// MOSDOpReply back to the client.
// NOTE(review): excerpt is elided; some branch conditions around the
// version-reply choice are not visible here.
9071 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
9073 auto m
= ctx
->op
->get_req
<MOSDOp
>();
9074 ceph_assert(ctx
->async_reads_complete());
// Tally bytes read; stop at the first per-op failure not marked FAILOK.
9076 for (auto p
= ctx
->ops
->begin();
9077 p
!= ctx
->ops
->end() && result
>= 0; ++p
) {
9078 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
9082 ctx
->bytes_read
+= p
->outdata
.length();
9084 ctx
->reply
->get_header().data_off
= (ctx
->data_off
? *ctx
->data_off
: 0);
// Take ownership of the reply out of the context before sending.
9086 MOSDOpReply
*reply
= ctx
->reply
;
9087 ctx
->reply
= nullptr;
9090 if (!ctx
->ignore_log_op_stats
) {
9091 log_op_stats(*ctx
->op
, ctx
->bytes_written
, ctx
->bytes_read
);
9093 publish_stats_to_osd();
9096 // on read, return the current object version
9098 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
9100 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
9102 } else if (result
== -ENOENT
) {
9103 // on ENOENT, set a floor for what the next user version will be.
9104 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
9107 reply
->set_result(result
);
9108 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
9109 osd
->send_message_osd_client(reply
, m
->get_connection());
9113 // ========================================================================
// Objecter completion for one copy-from chunk: under the PG lock, forwards
// to process_copy_chunk() unless the op was canceled or the PG has gone
// through a peering reset since the read was issued.
9116 struct C_Copyfrom
: public Context
{
9119 epoch_t last_peering_reset
;
9121 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
9122 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
9123 const PrimaryLogPG::CopyOpRef
& c
)
9124 : pg(p
), oid(o
), last_peering_reset(lpr
),
9127 void finish(int r
) override
{
9128 if (r
== -ECANCELED
)
9130 std::scoped_lock l
{*pg
};
// Stale completions (from before a peering reset) are dropped.
9131 if (last_peering_reset
== pg
->get_last_peering_reset()) {
9132 pg
->process_copy_chunk(oid
, tid
, r
);
// Async-read completion used on EC pools by do_copy_get(): trims the read
// buffer to the actual length and encodes the accumulated copy reply into
// the OSDOp's outdata with the client's feature bits.
9138 struct C_CopyFrom_AsyncReadCb
: public Context
{
9140 object_copy_data_t reply_obj
;
9143 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
9144 osd_op(osd_op
), features(features
), len(0) {}
9145 void finish(int r
) override
{
9151 ceph_assert(len
> 0);
9152 ceph_assert(len
<= reply_obj
.data
.length());
// Keep only the first 'len' bytes of the read data.
9154 bl
.substr_of(reply_obj
.data
, 0, len
);
9155 reply_obj
.data
.swap(bl
);
9156 encode(reply_obj
, osd_op
->outdata
, features
);
// Objecter completion for one manifest (chunked) copy read: like
// C_Copyfrom but carries the chunk offset and dispatches to
// process_copy_chunk_manifest() under the PG lock.
9160 struct C_CopyChunk
: public Context
{
9163 epoch_t last_peering_reset
;
9165 PrimaryLogPG::CopyOpRef cop
; // used for keeping the cop alive
9166 uint64_t offset
= 0;
9167 C_CopyChunk(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
9168 const PrimaryLogPG::CopyOpRef
& c
)
9169 : pg(p
), oid(o
), last_peering_reset(lpr
),
9172 void finish(int r
) override
{
9173 if (r
== -ECANCELED
)
9175 std::scoped_lock l
{*pg
};
// Drop completions that predate the most recent peering reset.
9176 if (last_peering_reset
== pg
->get_last_peering_reset()) {
9177 pg
->process_copy_chunk_manifest(oid
, tid
, r
, offset
);
// Serve one COPY_GET step: fill an object_copy_data_t with metadata,
// attrs, up to 'out_max' bytes of data, then omap header/keys, advancing
// the client-supplied cursor; reqids are included only once complete.
// On EC pools the data read is asynchronous and finished by
// C_CopyFrom_AsyncReadCb. Returns -EINPROGRESS for pending async reads.
// NOTE(review): excerpt is elided; cursor decode, some error paths and
// loop/brace structure are not fully visible here.
9183 int PrimaryLogPG::do_copy_get(OpContext
*ctx
, bufferlist::const_iterator
& bp
,
9184 OSDOp
& osd_op
, ObjectContextRef
&obc
)
9186 object_info_t
& oi
= obc
->obs
.oi
;
9187 hobject_t
& soid
= oi
.soid
;
9189 object_copy_cursor_t cursor
;
9193 decode(out_max
, bp
);
9195 catch (ceph::buffer::error
& e
) {
9200 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
9201 uint64_t features
= op
->get_features();
9203 bool async_read_started
= false;
9204 object_copy_data_t _reply_obj
;
9205 C_CopyFrom_AsyncReadCb
*cb
= nullptr;
// EC pools read asynchronously; the callback owns the reply object then.
9206 if (pool
.info
.is_erasure()) {
9207 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
9209 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
// --- base metadata: size, mtime, snaps, digests, truncate info ---
9211 reply_obj
.size
= oi
.size
;
9212 reply_obj
.mtime
= oi
.mtime
;
9213 ceph_assert(obc
->ssc
);
9214 if (soid
.snap
< CEPH_NOSNAP
) {
9215 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
9216 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
9217 reply_obj
.snaps
= p
->second
;
9219 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
9221 if (oi
.is_data_digest()) {
9222 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
9223 reply_obj
.data_digest
= oi
.data_digest
;
9225 if (oi
.is_omap_digest()) {
9226 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
9227 reply_obj
.omap_digest
= oi
.omap_digest
;
9229 reply_obj
.truncate_seq
= oi
.truncate_seq
;
9230 reply_obj
.truncate_size
= oi
.truncate_size
;
// --- attrs (first cursor stage) ---
9233 map
<string
,bufferlist
,less
<>>& out_attrs
= reply_obj
.attrs
;
9234 if (!cursor
.attr_complete
) {
9235 result
= getattrs_maybe_cache(
9244 cursor
.attr_complete
= true;
9245 dout(20) << " got attrs" << dendl
;
9248 int64_t left
= out_max
- osd_op
.outdata
.length();
// --- data (second cursor stage), bounded by remaining budget ---
9251 bufferlist
& bl
= reply_obj
.data
;
9252 if (left
> 0 && !cursor
.data_complete
) {
9253 if (cursor
.data_offset
< oi
.size
) {
9254 uint64_t max_read
= std::min(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
9256 async_read_started
= true;
9257 ctx
->pending_async_reads
.push_back(
9259 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
9260 make_pair(&bl
, cb
)));
9263 ctx
->op_finishers
[ctx
->current_osd_subop_num
].reset(
9264 new ReadFinisher(osd_op
));
9265 result
= -EINPROGRESS
;
9267 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
// Replicated pools read synchronously through the backend.
9269 result
= pgbackend
->objects_read_sync(
9270 oi
.soid
, cursor
.data_offset
, max_read
, osd_op
.op
.flags
, &bl
);
9275 cursor
.data_offset
+= max_read
;
9277 if (cursor
.data_offset
== oi
.size
) {
9278 cursor
.data_complete
= true;
9279 dout(20) << " got data" << dendl
;
9281 ceph_assert(cursor
.data_offset
<= oi
.size
);
// --- omap (third cursor stage) ---
9285 uint32_t omap_keys
= 0;
9286 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
9287 cursor
.omap_complete
= true;
9289 if (left
> 0 && !cursor
.omap_complete
) {
9290 ceph_assert(cursor
.data_complete
);
9291 if (cursor
.omap_offset
.empty()) {
9292 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
9293 &reply_obj
.omap_header
);
9295 bufferlist omap_data
;
9296 ObjectMap::ObjectMapIterator iter
=
9297 osd
->store
->get_omap_iterator(ch
, ghobject_t(oi
.soid
));
9299 iter
->upper_bound(cursor
.omap_offset
);
9300 for (; iter
->valid(); iter
->next()) {
9302 encode(iter
->key(), omap_data
);
9303 encode(iter
->value(), omap_data
);
// Budget accounting approximates encoded key/value framing overhead.
9304 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
9309 encode(omap_keys
, reply_obj
.omap_data
);
9310 reply_obj
.omap_data
.claim_append(omap_data
);
// Resume point for the next COPY_GET round, or mark omap done.
9312 if (iter
->valid()) {
9313 cursor
.omap_offset
= iter
->key();
9315 cursor
.omap_complete
= true;
9316 dout(20) << " got omap" << dendl
;
9321 if (cursor
.is_complete()) {
9322 // include reqids only in the final step. this is a bit fragile
9324 recovery_state
.get_pg_log().get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10,
9326 &reply_obj
.reqid_return_codes
);
9327 dout(20) << " got reqids" << dendl
;
9330 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
9331 << " " << out_attrs
.size() << " attrs"
9332 << " " << bl
.length() << " bytes"
9333 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
9334 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
9335 << omap_keys
<< " keys"
9336 << " " << reply_obj
.reqids
.size() << " reqids"
9338 reply_obj
.cursor
= cursor
;
// Synchronous path encodes the reply now; async encodes in the callback.
9339 if (!async_read_started
) {
9340 encode(reply_obj
, osd_op
.outdata
, features
);
9342 if (cb
&& !async_read_started
) {
// Reply to a COPY_GET on a nonexistent object: encode an (empty) copy
// payload carrying only the object's recent reqids from the pg log so the
// copier can still detect dup ops, then send an -ENOENT reply.
// NOTE(review): the OSDOp& osd_op parameter is elided from this excerpt.
9352 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
9355 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
9356 uint64_t features
= m
->get_features();
9357 object_copy_data_t reply_obj
;
9359 recovery_state
.get_pg_log().get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
,
9360 &reply_obj
.reqid_return_codes
);
9361 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
9362 encode(reply_obj
, osd_op
.outdata
, features
);
9363 osd_op
.rval
= -ENOENT
;
9364 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap_epoch(), 0, false);
9365 reply
->set_result(-ENOENT
);
9366 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
9367 osd
->send_message_osd_client(reply
, m
->get_connection());
// Begin copying 'src' into the object described by 'obc': cancel any
// in-flight copy to the same destination, register a new CopyOp, and kick
// off the first chunk (plain, redirect-manifest, or chunked-manifest).
9370 void PrimaryLogPG::start_copy(CopyCallback
*cb
, ObjectContextRef obc
,
9371 hobject_t src
, object_locator_t oloc
,
9372 version_t version
, unsigned flags
,
9373 bool mirror_snapset
,
9374 unsigned src_obj_fadvise_flags
,
9375 unsigned dest_obj_fadvise_flags
)
9377 const hobject_t
& dest
= obc
->obs
.oi
.soid
;
9378 dout(10) << __func__
<< " " << dest
9379 << " from " << src
<< " " << oloc
<< " v" << version
9380 << " flags " << flags
9381 << (mirror_snapset
? " mirror_snapset" : "")
// Mirroring the snapset only makes sense when copying a head object.
9384 ceph_assert(!mirror_snapset
|| src
.snap
== CEPH_NOSNAP
);
9386 // cancel a previous in-progress copy?
9387 if (copy_ops
.count(dest
)) {
9388 // FIXME: if the src etc match, we could avoid restarting from the
9390 CopyOpRef cop
= copy_ops
[dest
];
9391 vector
<ceph_tid_t
> tids
;
9392 cancel_copy(cop
, false, &tids
);
9393 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9396 CopyOpRef
cop(std::make_shared
<CopyOp
>(cb
, obc
, src
, oloc
, version
, flags
,
9397 mirror_snapset
, src_obj_fadvise_flags
,
9398 dest_obj_fadvise_flags
));
9399 copy_ops
[dest
] = cop
;
9400 dout(20) << fmt::format("{}: blocking {}", __func__
, dest
) << dendl
;
// Dispatch on the source's manifest type.
9403 if (!obc
->obs
.oi
.has_manifest()) {
9404 _copy_some(obc
, cop
);
9406 if (obc
->obs
.oi
.manifest
.is_redirect()) {
9407 _copy_some(obc
, cop
);
9408 } else if (obc
->obs
.oi
.manifest
.is_chunked()) {
9409 auto p
= obc
->obs
.oi
.manifest
.chunk_map
.begin();
9410 _copy_some_manifest(obc
, cop
, p
->first
);
9412 ceph_abort_msg("unrecognized manifest type");
// Issue the next objecter read for a (non-chunked) copy: translate the
// COPY_FROM flags into OSD op flags, optionally list_snaps on the first
// round when mirroring the snapset, then send a copy_get for the next
// chunk, finishing into C_Copyfrom.
// NOTE(review): excerpt is elided; the ObjectOperation declaration and
// some copy_get arguments/read arguments are not visible here.
9417 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
9419 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
// Map CEPH_OSD_COPY_FROM_FLAG_* onto the objecter read flags.
9422 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9423 flags
|= CEPH_OSD_FLAG_FLUSH
;
9424 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9425 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9426 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9427 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9428 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9429 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9430 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9431 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9433 C_GatherBuilder
gather(cct
);
// First round with mirror_snapset: fetch the source's snapset too.
9435 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
9437 ceph_assert(cop
->src
.snap
== CEPH_NOSNAP
);
9439 op
.list_snaps(&cop
->results
.snapset
, NULL
);
9440 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9442 flags
, gather
.new_sub(), NULL
);
9443 cop
->objecter_tid2
= tid
;
// After the first chunk we know the source version; pin it so the copy
// aborts if the source changes mid-copy.
9447 if (cop
->results
.user_version
) {
9448 op
.assert_version(cop
->results
.user_version
);
9450 // we should learn the version after the first chunk, if we didn't know
9452 ceph_assert(cop
->cursor
.is_initial());
9454 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
9455 &cop
->results
.object_size
, &cop
->results
.mtime
,
9456 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
9457 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
9458 &cop
->results
.flags
,
9459 &cop
->results
.source_data_digest
,
9460 &cop
->results
.source_omap_digest
,
9461 &cop
->results
.reqids
,
9462 &cop
->results
.reqid_return_codes
,
9463 &cop
->results
.truncate_seq
,
9464 &cop
->results
.truncate_size
,
9466 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
// Completion runs C_Copyfrom on the objecter finisher for this shard.
9468 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
9469 get_last_peering_reset(), cop
);
9470 gather
.set_finisher(new C_OnFinisher(fin
,
9471 osd
->get_objecter_finisher(get_pg_shard())));
9473 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
9474 cop
->src
.snap
, NULL
,
9477 // discover the object version if we don't know it yet
9478 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
9480 cop
->objecter_tid
= tid
;
9484 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc
, CopyOpRef cop
, uint64_t start_offset
)
9486 dout(10) << __func__
<< " " << *obc
<< " " << cop
<< dendl
;
9489 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
9490 flags
|= CEPH_OSD_FLAG_FLUSH
;
9491 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
9492 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
9493 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
9494 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
9495 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
9496 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
9497 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
9498 flags
|= CEPH_OSD_FLAG_RWORDERED
;
9501 uint64_t last_offset
= 0, chunks_size
= 0;
9502 object_manifest_t
*manifest
= &obc
->obs
.oi
.manifest
;
9503 map
<uint64_t, chunk_info_t
>::iterator iter
= manifest
->chunk_map
.find(start_offset
);
9504 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9506 chunks_size
+= iter
->second
.length
;
9507 last_offset
= iter
->first
;
9508 if (get_copy_chunk_size() < chunks_size
) {
9513 cop
->num_chunk
= num_chunks
;
9514 cop
->start_offset
= start_offset
;
9515 cop
->last_offset
= last_offset
;
9516 dout(20) << __func__
<< " oid " << obc
->obs
.oi
.soid
<< " num_chunks: " << num_chunks
9517 << " start_offset: " << start_offset
<< " chunks_size: " << chunks_size
9518 << " last_offset: " << last_offset
<< dendl
;
9520 iter
= manifest
->chunk_map
.find(start_offset
);
9521 for (;iter
!= manifest
->chunk_map
.end(); ++iter
) {
9522 uint64_t obj_offset
= iter
->first
;
9523 uint64_t length
= manifest
->chunk_map
[iter
->first
].length
;
9524 hobject_t soid
= manifest
->chunk_map
[iter
->first
].oid
;
9525 object_locator_t
oloc(soid
);
9526 CopyCallback
* cb
= NULL
;
9527 CopyOpRef
sub_cop(std::make_shared
<CopyOp
>(cb
, ObjectContextRef(), cop
->src
, oloc
,
9528 cop
->results
.user_version
, cop
->flags
, cop
->mirror_snapset
,
9529 cop
->src_obj_fadvise_flags
, cop
->dest_obj_fadvise_flags
));
9530 sub_cop
->cursor
.data_offset
= obj_offset
;
9531 cop
->chunk_cops
[obj_offset
] = sub_cop
;
9533 int s
= sub_cop
->chunk_ops
.size();
9534 sub_cop
->chunk_ops
.resize(s
+1);
9535 sub_cop
->chunk_ops
[s
].op
.op
= CEPH_OSD_OP_READ
;
9536 sub_cop
->chunk_ops
[s
].op
.extent
.offset
= manifest
->chunk_map
[iter
->first
].offset
;
9537 sub_cop
->chunk_ops
[s
].op
.extent
.length
= length
;
9540 op
.dup(sub_cop
->chunk_ops
);
9542 if (cop
->results
.user_version
) {
9543 op
.assert_version(cop
->results
.user_version
);
9545 // we should learn the version after the first chunk, if we didn't know
9547 ceph_assert(cop
->cursor
.is_initial());
9549 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
9551 C_CopyChunk
*fin
= new C_CopyChunk(this, obc
->obs
.oi
.soid
,
9552 get_last_peering_reset(), cop
);
9553 fin
->offset
= obj_offset
;
9555 ceph_tid_t tid
= osd
->objecter
->read(
9557 sub_cop
->src
.snap
, NULL
,
9559 new C_OnFinisher(fin
, osd
->get_objecter_finisher(get_pg_shard())),
9560 // discover the object version if we don't know it yet
9561 sub_cop
->results
.user_version
? NULL
: &sub_cop
->results
.user_version
);
9563 sub_cop
->objecter_tid
= tid
;
9565 dout(20) << __func__
<< " tgt_oid: " << soid
.oid
<< " tgt_offset: "
9566 << manifest
->chunk_map
[iter
->first
].offset
9567 << " length: " << length
<< " pool id: " << oloc
.pool
9568 << " tid: " << tid
<< dendl
;
9570 if (last_offset
<= iter
->first
) {
9576 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
9578 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9579 << " " << cpp_strerror(r
) << dendl
;
9580 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9581 if (p
== copy_ops
.end()) {
9582 dout(10) << __func__
<< " no copy_op found" << dendl
;
9585 CopyOpRef cop
= p
->second
;
9586 if (tid
!= cop
->objecter_tid
) {
9587 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
9588 << " tid " << cop
->objecter_tid
<< dendl
;
9592 if (cop
->omap_data
.length() || cop
->omap_header
.length())
9593 cop
->results
.has_omap
= true;
9595 if (r
>= 0 && !pool
.info
.supports_omap() &&
9596 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
9599 cop
->objecter_tid
= 0;
9600 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9601 ObjectContextRef
& cobc
= cop
->obc
;
9606 ceph_assert(cop
->rval
>= 0);
9608 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
9609 // verify snap hasn't been deleted
9610 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
9611 while (p
!= cop
->results
.snaps
.end()) {
9612 // make best effort to sanitize snaps/clones.
9613 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), *p
)) {
9614 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
9616 for (vector
<snapid_t
>::iterator q
= p
+ 1;
9617 q
!= cop
->results
.snaps
.end();
9620 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
9625 if (cop
->results
.snaps
.empty()) {
9626 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
9632 ceph_assert(cop
->rval
>= 0);
9634 if (!cop
->temp_cursor
.data_complete
) {
9635 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
9637 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
9638 if (cop
->omap_header
.length()) {
9639 cop
->results
.omap_digest
=
9640 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
9642 if (cop
->omap_data
.length()) {
9644 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
9645 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
9649 if (!cop
->temp_cursor
.attr_complete
) {
9650 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
9651 p
!= cop
->attrs
.end();
9653 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
9658 if (!cop
->cursor
.is_complete()) {
9659 // write out what we have so far
9660 if (cop
->temp_cursor
.is_initial()) {
9661 ceph_assert(!cop
->results
.started_temp_obj
);
9662 cop
->results
.started_temp_obj
= true;
9663 cop
->results
.temp_oid
= generate_temp_object(oid
);
9664 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
9666 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9667 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9668 if (cop
->temp_cursor
.is_initial()) {
9669 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
9671 _write_copy_chunk(cop
, ctx
->op_t
.get());
9672 simple_opc_submit(std::move(ctx
));
9673 dout(10) << __func__
<< " fetching more" << dendl
;
9674 _copy_some(cobc
, cop
);
9679 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
9680 dout(20) << __func__
<< std::hex
9681 << " got digest: rx data 0x" << cop
->results
.data_digest
9682 << " omap 0x" << cop
->results
.omap_digest
9683 << ", source: data 0x" << cop
->results
.source_data_digest
9684 << " omap 0x" << cop
->results
.source_omap_digest
9686 << " flags " << cop
->results
.flags
9689 if (cop
->results
.is_data_digest() &&
9690 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
9691 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
9692 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
9694 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9695 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9696 << " data digest 0x" << cop
->results
.data_digest
9697 << " != source 0x" << cop
->results
.source_data_digest
9702 if (cop
->results
.is_omap_digest() &&
9703 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
9704 derr
<< __func__
<< std::hex
9705 << " omap digest 0x" << cop
->results
.omap_digest
9706 << " != source 0x" << cop
->results
.source_omap_digest
9707 << std::dec
<< dendl
;
9708 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
9709 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
9710 << " omap digest 0x" << cop
->results
.omap_digest
9711 << " != source 0x" << cop
->results
.source_omap_digest
9716 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
9717 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
9722 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
9723 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
9724 ObjectState
& obs
= cop
->obc
->obs
;
9725 if (cop
->temp_cursor
.is_initial()) {
9726 dout(20) << "fill_in_final_tx: writing "
9727 << "directly to final object" << dendl
;
9728 // write directly to final object
9729 cop
->results
.temp_oid
= obs
.oi
.soid
;
9730 _write_copy_chunk(cop
, t
);
9732 // finish writing to temp object, then move into place
9733 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
9734 if (obs
.oi
.has_manifest() && obs
.oi
.manifest
.is_redirect() && obs
.exists
) {
9735 /* In redirect manifest case, the object exists in the upper tier.
9736 * So, to avoid a conflict when rename() is called, remove existing
9739 t
->remove(obs
.oi
.soid
);
9741 _write_copy_chunk(cop
, t
);
9742 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
9744 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
9747 dout(20) << __func__
<< " success; committing" << dendl
;
9750 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9751 CopyCallbackResults
results(r
, &cop
->results
);
9752 cop
->cb
->complete(results
);
9754 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9757 if (r
< 0 && cop
->results
.started_temp_obj
) {
9758 dout(10) << __func__
<< " deleting partial temp object "
9759 << cop
->results
.temp_oid
<< dendl
;
9760 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
9761 OpContextUPtr ctx
= simple_opc_create(tempobc
);
9762 ctx
->op_t
->remove(cop
->results
.temp_oid
);
9763 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
9764 simple_opc_submit(std::move(ctx
));
9767 // cancel and requeue proxy ops on this object
9769 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9772 kick_object_context_blocked(cobc
);
9775 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid
, ceph_tid_t tid
, int r
, uint64_t offset
)
9777 dout(10) << __func__
<< " " << oid
<< " tid " << tid
9778 << " " << cpp_strerror(r
) << dendl
;
9779 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
9780 if (p
== copy_ops
.end()) {
9781 dout(10) << __func__
<< " no copy_op found" << dendl
;
9784 CopyOpRef obj_cop
= p
->second
;
9785 CopyOpRef chunk_cop
= obj_cop
->chunk_cops
[offset
];
9787 if (tid
!= chunk_cop
->objecter_tid
) {
9788 dout(10) << __func__
<< " tid " << tid
<< " != cop " << chunk_cop
9789 << " tid " << chunk_cop
->objecter_tid
<< dendl
;
9793 if (chunk_cop
->omap_data
.length() || chunk_cop
->omap_header
.length()) {
9797 chunk_cop
->objecter_tid
= 0;
9798 chunk_cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
9799 ObjectContextRef
& cobc
= obj_cop
->obc
;
9800 OSDOp
&chunk_data
= chunk_cop
->chunk_ops
[0];
9803 obj_cop
->failed
= true;
9807 if (obj_cop
->failed
) {
9810 if (!chunk_data
.outdata
.length()) {
9812 obj_cop
->failed
= true;
9816 obj_cop
->num_chunk
--;
9818 /* check all of the copyop are completed */
9819 if (obj_cop
->num_chunk
) {
9820 dout(20) << __func__
<< " num_chunk: " << obj_cop
->num_chunk
<< dendl
;
9825 OpContextUPtr ctx
= simple_opc_create(obj_cop
->obc
);
9826 if (!ctx
->lock_manager
.take_write_lock(
9827 obj_cop
->obc
->obs
.oi
.soid
,
9829 // recovery op can take read lock.
9830 // so need to wait for recovery completion
9832 obj_cop
->failed
= true;
9833 close_op_ctx(ctx
.release());
9836 dout(20) << __func__
<< " took lock on obc, " << obj_cop
->obc
->rwstate
<< dendl
;
9838 PGTransaction
*t
= ctx
->op_t
.get();
9839 ObjectState
& obs
= ctx
->new_obs
;
9840 for (auto p
: obj_cop
->chunk_cops
) {
9841 OSDOp
&sub_chunk
= p
.second
->chunk_ops
[0];
9842 t
->write(cobc
->obs
.oi
.soid
,
9843 p
.second
->cursor
.data_offset
,
9844 sub_chunk
.outdata
.length(),
9846 p
.second
->dest_obj_fadvise_flags
);
9847 dout(20) << __func__
<< " offset: " << p
.second
->cursor
.data_offset
9848 << " length: " << sub_chunk
.outdata
.length() << dendl
;
9849 write_update_size_and_usage(ctx
->delta_stats
, obs
.oi
, ctx
->modified_ranges
,
9850 p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9851 obs
.oi
.manifest
.chunk_map
[p
.second
->cursor
.data_offset
].clear_flag(chunk_info_t::FLAG_MISSING
);
9852 ctx
->clean_regions
.mark_data_region_dirty(p
.second
->cursor
.data_offset
, sub_chunk
.outdata
.length());
9853 sub_chunk
.outdata
.clear();
9855 obs
.oi
.clear_data_digest();
9856 ctx
->at_version
= get_next_version();
9857 finish_ctx(ctx
.get(), pg_log_entry_t::PROMOTE
);
9858 simple_opc_submit(std::move(ctx
));
9859 obj_cop
->chunk_cops
.clear();
9861 auto p
= cobc
->obs
.oi
.manifest
.chunk_map
.rbegin();
9862 /* check remaining work */
9863 if (p
!= cobc
->obs
.oi
.manifest
.chunk_map
.rend()) {
9864 if (obj_cop
->last_offset
< p
->first
) {
9865 for (auto &en
: cobc
->obs
.oi
.manifest
.chunk_map
) {
9866 if (obj_cop
->last_offset
< en
.first
) {
9867 _copy_some_manifest(cobc
, obj_cop
, en
.first
);
9876 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
9877 CopyCallbackResults
results(r
, &obj_cop
->results
);
9878 obj_cop
->cb
->complete(results
);
9880 copy_ops
.erase(cobc
->obs
.oi
.soid
);
9883 // cancel and requeue proxy ops on this object
9885 cancel_and_requeue_proxy_ops(cobc
->obs
.oi
.soid
);
9888 kick_object_context_blocked(cobc
);
9891 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid
) {
9892 vector
<ceph_tid_t
> tids
;
9893 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
9894 it
!= proxyread_ops
.end();) {
9895 if (it
->second
->soid
== oid
) {
9896 cancel_proxy_read((it
++)->second
, &tids
);
9901 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
9902 it
!= proxywrite_ops
.end();) {
9903 if (it
->second
->soid
== oid
) {
9904 cancel_proxy_write((it
++)->second
, &tids
);
9909 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
9910 kick_proxy_ops_blocked(oid
);
9913 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop
, PGTransaction
*t
)
9915 dout(20) << __func__
<< " " << cop
9916 << " " << cop
->attrs
.size() << " attrs"
9917 << " " << cop
->data
.length() << " bytes"
9918 << " " << cop
->omap_header
.length() << " omap header bytes"
9919 << " " << cop
->omap_data
.length() << " omap data bytes"
9921 if (!cop
->temp_cursor
.attr_complete
) {
9922 t
->create(cop
->results
.temp_oid
);
9924 if (!cop
->temp_cursor
.data_complete
) {
9925 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9926 cop
->cursor
.data_offset
);
9927 if (pool
.info
.required_alignment() &&
9928 !cop
->cursor
.data_complete
) {
9930 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9931 * to pick it up on the next pass.
9933 ceph_assert(cop
->temp_cursor
.data_offset
%
9934 pool
.info
.required_alignment() == 0);
9935 if (cop
->data
.length() % pool
.info
.required_alignment() != 0) {
9937 cop
->data
.length() % pool
.info
.required_alignment();
9939 bl
.substr_of(cop
->data
, 0, cop
->data
.length() - to_trim
);
9941 cop
->cursor
.data_offset
-= to_trim
;
9942 ceph_assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
9943 cop
->cursor
.data_offset
);
9946 if (cop
->data
.length()) {
9948 cop
->results
.temp_oid
,
9949 cop
->temp_cursor
.data_offset
,
9952 cop
->dest_obj_fadvise_flags
);
9956 if (pool
.info
.supports_omap()) {
9957 if (!cop
->temp_cursor
.omap_complete
) {
9958 if (cop
->omap_header
.length()) {
9960 cop
->results
.temp_oid
,
9962 cop
->omap_header
.clear();
9964 if (cop
->omap_data
.length()) {
9965 map
<string
,bufferlist
> omap
;
9966 bufferlist::const_iterator p
= cop
->omap_data
.begin();
9968 t
->omap_setkeys(cop
->results
.temp_oid
, omap
);
9969 cop
->omap_data
.clear();
9973 ceph_assert(cop
->omap_header
.length() == 0);
9974 ceph_assert(cop
->omap_data
.length() == 0);
9976 cop
->temp_cursor
= cop
->cursor
;
9979 void PrimaryLogPG::finish_copyfrom(CopyFromCallback
*cb
)
9981 OpContext
*ctx
= cb
->ctx
;
9982 dout(20) << "finish_copyfrom on " << ctx
->obs
->oi
.soid
<< dendl
;
9984 ObjectState
& obs
= ctx
->new_obs
;
9986 dout(20) << __func__
<< ": exists, removing" << dendl
;
9987 ctx
->op_t
->remove(obs
.oi
.soid
);
9989 ctx
->delta_stats
.num_objects
++;
9992 if (cb
->is_temp_obj_used()) {
9993 ctx
->discard_temp_oid
= cb
->results
->temp_oid
;
9995 cb
->results
->fill_in_final_tx(ctx
->op_t
.get());
9997 // CopyFromCallback fills this in for us
9998 obs
.oi
.user_version
= ctx
->user_at_version
;
10000 if (cb
->results
->is_data_digest()) {
10001 obs
.oi
.set_data_digest(cb
->results
->data_digest
);
10003 obs
.oi
.clear_data_digest();
10005 if (cb
->results
->is_omap_digest()) {
10006 obs
.oi
.set_omap_digest(cb
->results
->omap_digest
);
10008 obs
.oi
.clear_omap_digest();
10011 obs
.oi
.truncate_seq
= cb
->truncate_seq
;
10012 obs
.oi
.truncate_size
= cb
->truncate_size
;
10014 obs
.oi
.mtime
= ceph::real_clock::to_timespec(cb
->results
->mtime
);
10015 ctx
->mtime
= utime_t();
10017 ctx
->extra_reqids
= cb
->results
->reqids
;
10018 ctx
->extra_reqid_return_codes
= cb
->results
->reqid_return_codes
;
10020 // cache: clear whiteout?
10021 if (obs
.oi
.is_whiteout()) {
10022 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
10023 obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
10024 --ctx
->delta_stats
.num_whiteouts
;
10027 if (cb
->results
->has_omap
) {
10028 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
10029 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
10030 ctx
->clean_regions
.mark_omap_dirty();
10032 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
10033 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
10036 interval_set
<uint64_t> ch
;
10037 if (obs
.oi
.size
> 0)
10038 ch
.insert(0, obs
.oi
.size
);
10039 ctx
->modified_ranges
.union_of(ch
);
10040 ctx
->clean_regions
.mark_data_region_dirty(0, std::max(obs
.oi
.size
, cb
->get_data_size()));
10042 if (cb
->get_data_size() != obs
.oi
.size
) {
10043 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
10044 obs
.oi
.size
= cb
->get_data_size();
10045 ctx
->delta_stats
.num_bytes
+= obs
.oi
.size
;
10047 ctx
->delta_stats
.num_wr
++;
10048 ctx
->delta_stats
.num_wr_kb
+= shift_round_up(obs
.oi
.size
, 10);
10050 osd
->logger
->inc(l_osd_copyfrom
);
10053 void PrimaryLogPG::finish_promote(int r
, CopyResults
*results
,
10054 ObjectContextRef obc
)
10056 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
10057 dout(10) << __func__
<< " " << soid
<< " r=" << r
10058 << " uv" << results
->user_version
<< dendl
;
10060 if (r
== -ECANCELED
) {
10064 if (r
!= -ENOENT
&& soid
.is_snap()) {
10065 if (results
->snaps
.empty()) {
10066 // we must have read "snap" content from the head object in the
10067 // base pool. use snap_seq to construct what snaps should be
10068 // for this clone (what is was before we evicted the clean clone
10069 // from this pool, and what it will be when we flush and the
10070 // clone eventually happens in the base pool). we want to use
10071 // snaps in (results->snap_seq,soid.snap]
10072 SnapSet
& snapset
= obc
->ssc
->snapset
;
10073 for (auto p
= snapset
.clone_snaps
.rbegin();
10074 p
!= snapset
.clone_snaps
.rend();
10076 for (auto snap
: p
->second
) {
10077 if (snap
> soid
.snap
) {
10080 if (snap
<= results
->snap_seq
) {
10083 results
->snaps
.push_back(snap
);
10088 dout(20) << __func__
<< " snaps " << results
->snaps
<< dendl
;
10089 filter_snapc(results
->snaps
);
10091 dout(20) << __func__
<< " filtered snaps " << results
->snaps
<< dendl
;
10092 if (results
->snaps
.empty()) {
10093 dout(20) << __func__
10094 << " snaps are empty, clone is invalid,"
10095 << " setting r to ENOENT" << dendl
;
10100 if (r
< 0 && results
->started_temp_obj
) {
10101 dout(10) << __func__
<< " abort; will clean up partial work" << dendl
;
10102 ObjectContextRef tempobc
= get_object_context(results
->temp_oid
, false);
10103 ceph_assert(tempobc
);
10104 OpContextUPtr ctx
= simple_opc_create(tempobc
);
10105 ctx
->op_t
->remove(results
->temp_oid
);
10106 simple_opc_submit(std::move(ctx
));
10107 results
->started_temp_obj
= false;
10110 if (r
== -ENOENT
&& soid
.is_snap()) {
10111 dout(10) << __func__
10112 << ": enoent while trying to promote clone, " << soid
10113 << " must have been trimmed, removing from snapset"
10115 hobject_t
head(soid
.get_head());
10116 ObjectContextRef obc
= get_object_context(head
, false);
10119 OpContextUPtr tctx
= simple_opc_create(obc
);
10120 tctx
->at_version
= get_next_version();
10121 if (get_osdmap()->require_osd_release
< ceph_release_t::octopus
) {
10122 filter_snapc(tctx
->new_snapset
.snaps
);
10124 tctx
->new_snapset
.snaps
.clear();
10126 vector
<snapid_t
> new_clones
;
10127 map
<snapid_t
, vector
<snapid_t
>> new_clone_snaps
;
10128 for (vector
<snapid_t
>::iterator i
= tctx
->new_snapset
.clones
.begin();
10129 i
!= tctx
->new_snapset
.clones
.end();
10131 if (*i
!= soid
.snap
) {
10132 new_clones
.push_back(*i
);
10133 auto p
= tctx
->new_snapset
.clone_snaps
.find(*i
);
10134 if (p
!= tctx
->new_snapset
.clone_snaps
.end()) {
10135 new_clone_snaps
[*i
] = p
->second
;
10139 tctx
->new_snapset
.clones
.swap(new_clones
);
10140 tctx
->new_snapset
.clone_overlap
.erase(soid
.snap
);
10141 tctx
->new_snapset
.clone_size
.erase(soid
.snap
);
10142 tctx
->new_snapset
.clone_snaps
.swap(new_clone_snaps
);
10144 // take RWWRITE lock for duration of our local write. ignore starvation.
10145 if (!tctx
->lock_manager
.take_write_lock(
10148 ceph_abort_msg("problem!");
10150 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
10152 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
10154 simple_opc_submit(std::move(tctx
));
10158 bool whiteout
= false;
10159 if (r
== -ENOENT
) {
10160 ceph_assert(soid
.snap
== CEPH_NOSNAP
); // snap case is above
10161 dout(10) << __func__
<< " whiteout " << soid
<< dendl
;
10165 if (r
< 0 && !whiteout
) {
10166 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
10167 // pass error to everyone blocked on this object
10168 // FIXME: this is pretty sloppy, but at this point we got
10169 // something unexpected and don't have many other options.
10170 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
10171 waiting_for_blocked_object
.find(soid
);
10172 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
10173 while (!blocked_iter
->second
.empty()) {
10174 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
10175 blocked_iter
->second
.pop_front();
10177 waiting_for_blocked_object
.erase(blocked_iter
);
10182 osd
->promote_finish(results
->object_size
);
10184 OpContextUPtr tctx
= simple_opc_create(obc
);
10185 tctx
->at_version
= get_next_version();
10187 if (!obc
->obs
.oi
.has_manifest()) {
10188 ++tctx
->delta_stats
.num_objects
;
10190 if (soid
.snap
< CEPH_NOSNAP
)
10191 ++tctx
->delta_stats
.num_object_clones
;
10192 tctx
->new_obs
.exists
= true;
10194 tctx
->extra_reqids
= results
->reqids
;
10195 tctx
->extra_reqid_return_codes
= results
->reqid_return_codes
;
10197 if (obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_redirect()) {
10198 tctx
->new_obs
.oi
.manifest
.type
= object_manifest_t::TYPE_NONE
;
10199 tctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
);
10200 tctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_MANIFEST
);
10201 tctx
->new_obs
.oi
.manifest
.redirect_target
= hobject_t();
10202 tctx
->delta_stats
.num_objects_manifest
--;
10203 if (obc
->obs
.oi
.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE
)) {
10204 dec_all_refcount_manifest(obc
->obs
.oi
, tctx
.get());
10209 // create a whiteout
10210 tctx
->op_t
->create(soid
);
10211 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
10212 ++tctx
->delta_stats
.num_whiteouts
;
10213 dout(20) << __func__
<< " creating whiteout on " << soid
<< dendl
;
10214 osd
->logger
->inc(l_osd_tier_whiteout
);
10216 if (results
->has_omap
) {
10217 dout(10) << __func__
<< " setting omap flag on " << soid
<< dendl
;
10218 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
10219 ++tctx
->delta_stats
.num_objects_omap
;
10222 results
->fill_in_final_tx(tctx
->op_t
.get());
10223 if (results
->started_temp_obj
) {
10224 tctx
->discard_temp_oid
= results
->temp_oid
;
10226 tctx
->new_obs
.oi
.size
= results
->object_size
;
10227 tctx
->new_obs
.oi
.user_version
= results
->user_version
;
10228 tctx
->new_obs
.oi
.mtime
= ceph::real_clock::to_timespec(results
->mtime
);
10229 tctx
->mtime
= utime_t();
10230 if (results
->is_data_digest()) {
10231 tctx
->new_obs
.oi
.set_data_digest(results
->data_digest
);
10233 tctx
->new_obs
.oi
.clear_data_digest();
10235 if (results
->object_size
)
10236 tctx
->clean_regions
.mark_data_region_dirty(0, results
->object_size
);
10237 if (results
->is_omap_digest()) {
10238 tctx
->new_obs
.oi
.set_omap_digest(results
->omap_digest
);
10240 tctx
->new_obs
.oi
.clear_omap_digest();
10242 if (results
->has_omap
)
10243 tctx
->clean_regions
.mark_omap_dirty();
10244 tctx
->new_obs
.oi
.truncate_seq
= results
->truncate_seq
;
10245 tctx
->new_obs
.oi
.truncate_size
= results
->truncate_size
;
10247 if (soid
.snap
!= CEPH_NOSNAP
) {
10248 ceph_assert(obc
->ssc
->snapset
.clone_snaps
.count(soid
.snap
));
10249 ceph_assert(obc
->ssc
->snapset
.clone_size
.count(soid
.snap
));
10250 ceph_assert(obc
->ssc
->snapset
.clone_size
[soid
.snap
] ==
10251 results
->object_size
);
10252 ceph_assert(obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
10254 tctx
->delta_stats
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
10256 tctx
->delta_stats
.num_bytes
+= results
->object_size
;
10260 if (results
->mirror_snapset
) {
10261 ceph_assert(tctx
->new_obs
.oi
.soid
.snap
== CEPH_NOSNAP
);
10262 tctx
->new_snapset
.from_snap_set(
10264 get_osdmap()->require_osd_release
< ceph_release_t::luminous
);
10266 dout(20) << __func__
<< " new_snapset " << tctx
->new_snapset
<< dendl
;
10268 // take RWWRITE lock for duration of our local write. ignore starvation.
10269 if (!tctx
->lock_manager
.take_write_lock(
10272 ceph_abort_msg("problem!");
10274 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
10276 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
10278 simple_opc_submit(std::move(tctx
));
10280 osd
->logger
->inc(l_osd_tier_promote
);
10283 agent_state
->is_idle())
10284 agent_choose_mode();
10287 void PrimaryLogPG::finish_promote_manifest(int r
, CopyResults
*results
,
10288 ObjectContextRef obc
)
10290 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
10291 dout(10) << __func__
<< " " << soid
<< " r=" << r
10292 << " uv" << results
->user_version
<< dendl
;
10294 if (r
== -ECANCELED
|| r
== -EAGAIN
) {
10299 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
10300 // pass error to everyone blocked on this object
10301 // FIXME: this is pretty sloppy, but at this point we got
10302 // something unexpected and don't have many other options.
10303 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
10304 waiting_for_blocked_object
.find(soid
);
10305 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
10306 while (!blocked_iter
->second
.empty()) {
10307 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
10308 blocked_iter
->second
.pop_front();
10310 waiting_for_blocked_object
.erase(blocked_iter
);
10315 osd
->promote_finish(results
->object_size
);
10316 osd
->logger
->inc(l_osd_tier_promote
);
10319 agent_state
->is_idle())
10320 agent_choose_mode();
10323 void PrimaryLogPG::cancel_copy(CopyOpRef cop
, bool requeue
,
10324 vector
<ceph_tid_t
> *tids
)
10326 dout(10) << __func__
<< " " << cop
->obc
->obs
.oi
.soid
10327 << " from " << cop
->src
<< " " << cop
->oloc
10328 << " v" << cop
->results
.user_version
<< dendl
;
10330 // cancel objecter op, if we can
10331 if (cop
->objecter_tid
) {
10332 tids
->push_back(cop
->objecter_tid
);
10333 cop
->objecter_tid
= 0;
10334 if (cop
->objecter_tid2
) {
10335 tids
->push_back(cop
->objecter_tid2
);
10336 cop
->objecter_tid2
= 0;
10340 copy_ops
.erase(cop
->obc
->obs
.oi
.soid
);
10341 cop
->obc
->stop_block();
10343 kick_object_context_blocked(cop
->obc
);
10344 cop
->results
.should_requeue
= requeue
;
10345 CopyCallbackResults
result(-ECANCELED
, &cop
->results
);
10346 cop
->cb
->complete(result
);
10348 // There may still be an objecter callback referencing this copy op.
10349 // That callback will not need the obc since it's been canceled, and
10350 // we need the obc reference to go away prior to flush.
10351 cop
->obc
= ObjectContextRef();
10354 void PrimaryLogPG::cancel_copy_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
10356 dout(10) << __func__
<< dendl
;
10357 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.begin();
10358 while (p
!= copy_ops
.end()) {
10359 // requeue this op? can I queue up all of them?
10360 cancel_copy((p
++)->second
, requeue
, tids
);
10364 struct C_gather
: public Context
{
10365 PrimaryLogPGRef pg
;
10367 epoch_t last_peering_reset
;
10369 C_gather(PrimaryLogPG
*pg_
, hobject_t oid_
, epoch_t lpr_
, OSDOp
*osd_op_
) :
10370 pg(pg_
), oid(oid_
), last_peering_reset(lpr_
), osd_op(osd_op_
) {}
10371 void finish(int r
) override
{
10372 if (r
== -ECANCELED
)
10374 std::scoped_lock locker
{*pg
};
10375 auto p
= pg
->cls_gather_ops
.find(oid
);
10376 if (p
== pg
->cls_gather_ops
.end()) {
10377 // op was cancelled
10380 if (last_peering_reset
!= pg
->get_last_peering_reset()) {
10384 PrimaryLogPG::OpContext
*ctx
= p
->second
.ctx
;
10385 pg
->cls_gather_ops
.erase(p
);
10386 pg
->execute_ctx(ctx
);
10390 int PrimaryLogPG::start_cls_gather(OpContext
*ctx
, std::map
<std::string
, bufferlist
> *src_obj_buffs
, const std::string
& pool
,
10391 const char *cls
, const char *method
, bufferlist
& inbl
)
10393 OpRequestRef op
= ctx
->op
;
10394 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
10396 auto pool_id
= osd
->objecter
->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name
), pool
);
10397 object_locator_t
oloc(pool_id
);
10399 ObjectState
& obs
= ctx
->new_obs
;
10400 object_info_t
& oi
= obs
.oi
;
10401 const hobject_t
& soid
= oi
.soid
;
10403 ObjectContextRef obc
= get_object_context(soid
, false);
10404 C_GatherBuilder
gather(cct
);
10406 auto [iter
, inserted
] = cls_gather_ops
.emplace(soid
, CLSGatherOp(ctx
, obc
, op
));
10407 ceph_assert(inserted
);
10408 auto &cgop
= iter
->second
;
10409 for (std::map
<std::string
, bufferlist
>::iterator it
= src_obj_buffs
->begin(); it
!= src_obj_buffs
->end(); it
++) {
10410 std::string oid
= it
->first
;
10411 ObjectOperation obj_op
;
10412 obj_op
.call(cls
, method
, inbl
);
10413 uint32_t flags
= 0;
10414 ceph_tid_t tid
= osd
->objecter
->read(
10415 object_t(oid
), oloc
, obj_op
,
10416 m
->get_snapid(), &it
->second
,
10417 flags
, gather
.new_sub());
10418 cgop
.objecter_tids
.push_back(tid
);
10419 dout(10) << __func__
<< " src=" << oid
<< ", tgt=" << soid
<< dendl
;
10422 C_gather
*fin
= new C_gather(this, soid
, get_last_peering_reset(), &(*ctx
->ops
)[ctx
->current_osd_subop_num
]);
10423 gather
.set_finisher(new C_OnFinisher(fin
,
10424 osd
->get_objecter_finisher(get_pg_shard())));
10427 return -EINPROGRESS
;
10430 // ========================================================================
10433 // Flush a dirty object in the cache tier by writing it back to the
10434 // base tier. The sequence looks like:
10436 // * send a copy-from operation to the base tier to copy the current
10437 // version of the object
10438 // * base tier will pull the object via (perhaps multiple) copy-get(s)
10439 // * on completion, we check if the object has been modified. if so,
10440 // just reply with -EAGAIN.
10441 // * try to take a write lock so we can clear the dirty flag. if this
10442 // fails, wait and retry
10443 // * start a repop that clears the bit.
10445 // If we have to wait, we will retry by coming back through the
10446 // start_flush method. We check if a flush is already in progress
10447 // and, if so, try to finish it by rechecking the version and trying
10448 // to clear the dirty bit.
10450 // In order for the cache-flush (a write op) to not block the copy-get
10451 // from reading the object, the client *must* set the SKIPRWLOCKS
10454 // NOTE: normally writes are strictly ordered for the client, but
10455 // flushes are special in that they can be reordered with respect to
10456 // other writes. In particular, we can't have a flush request block
10457 // an update to the cache pool object!
10459 struct C_Flush
: public Context
{
10460 PrimaryLogPGRef pg
;
10462 epoch_t last_peering_reset
;
10465 C_Flush(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
)
10466 : pg(p
), oid(o
), last_peering_reset(lpr
),
10467 tid(0), start(ceph_clock_now())
10469 void finish(int r
) override
{
10470 if (r
== -ECANCELED
)
10472 std::scoped_lock locker
{*pg
};
10473 if (last_peering_reset
== pg
->get_last_peering_reset()) {
10474 pg
->finish_flush(oid
, tid
, r
);
10475 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
10480 int PrimaryLogPG::start_dedup(OpRequestRef op
, ObjectContextRef obc
)
10482 const object_info_t
& oi
= obc
->obs
.oi
;
10483 const hobject_t
& soid
= oi
.soid
;
10485 ceph_assert(obc
->is_blocked());
10486 if (oi
.size
== 0) {
10490 if (pool
.info
.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE
) {
10491 dout(0) << " fingerprint algorithm is not set " << dendl
;
10494 if (pool
.info
.get_dedup_tier() <= 0) {
10495 dout(10) << " dedup tier is not set " << dendl
;
10500 * The operations to make dedup chunks are tracked by a ManifestOp.
10501 * This op will be finished if all the operations are completed.
10503 ManifestOpRef
mop(std::make_shared
<ManifestOp
>(obc
, nullptr));
10506 std::map
<uint64_t, bufferlist
> chunks
;
10507 int r
= do_cdc(oi
, mop
->new_manifest
.chunk_map
, chunks
);
10511 if (!chunks
.size()) {
10515 // chunks issued here are different with chunk_map newly generated
10516 // because the same chunks in previous snap will not be issued
10517 // So, we need two data structures; the first is the issued chunk list to track
10518 // issued operations, and the second is the new chunk_map to update chunk_map after
10519 // all operations are finished
10520 object_ref_delta_t refs
;
10521 ObjectContextRef obc_l
, obc_g
;
10522 get_adjacent_clones(obc
, obc_l
, obc_g
);
10523 // skip if the same content exits in prev snap at same offset
10524 mop
->new_manifest
.calc_refs_to_inc_on_set(
10525 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
10526 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
10529 for (auto p
: chunks
) {
10530 hobject_t target
= mop
->new_manifest
.chunk_map
[p
.first
].oid
;
10531 if (refs
.find(target
) == refs
.end()) {
10534 C_SetDedupChunks
*fin
= new C_SetDedupChunks(this, soid
, get_last_peering_reset(), p
.first
);
10535 ceph_tid_t tid
= refcount_manifest(soid
, target
, refcount_t::CREATE_OR_GET_REF
,
10536 fin
, std::move(chunks
[p
.first
]));
10537 mop
->chunks
[target
] = make_pair(p
.first
, p
.second
.length());
10539 mop
->tids
[p
.first
] = tid
;
10541 dout(10) << __func__
<< " oid: " << soid
<< " tid: " << tid
10542 << " target: " << target
<< " offset: " << p
.first
10543 << " length: " << p
.second
.length() << dendl
;
10546 if (mop
->tids
.size()) {
10547 manifest_ops
[soid
] = mop
;
10548 manifest_ops
[soid
]->op
= op
;
10554 return -EINPROGRESS
;
10557 int PrimaryLogPG::do_cdc(const object_info_t
& oi
,
10558 std::map
<uint64_t, chunk_info_t
>& chunk_map
,
10559 std::map
<uint64_t, bufferlist
>& chunks
)
10561 string chunk_algo
= pool
.info
.get_dedup_chunk_algorithm_name();
10562 int64_t chunk_size
= pool
.info
.get_dedup_cdc_chunk_size();
10563 uint64_t total_length
= 0;
10565 std::unique_ptr
<CDC
> cdc
= CDC::create(chunk_algo
, cbits(chunk_size
)-1);
10567 dout(0) << __func__
<< " unrecognized chunk-algorithm " << dendl
;
10573 * We disable EC pool as a base tier of distributed dedup.
10574 * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync().
10575 * Therefore, we should change the current implementation totally to make EC pool compatible.
10576 * As s result, we leave this as a future work.
10578 int r
= pgbackend
->objects_read_sync(
10579 oi
.soid
, 0, oi
.size
, 0, &bl
);
10581 dout(0) << __func__
<< " read fail " << oi
.soid
10582 << " len: " << oi
.size
<< " r: " << r
<< dendl
;
10585 if (bl
.length() != oi
.size
) {
10586 dout(0) << __func__
<< " bl.length: " << bl
.length() << " != oi.size: "
10587 << oi
.size
<< " during chunking " << dendl
;
10591 dout(10) << __func__
<< " oid: " << oi
.soid
<< " len: " << bl
.length()
10592 << " oi.size: " << oi
.size
10593 << " chunk_size: " << chunk_size
<< dendl
;
10595 vector
<pair
<uint64_t, uint64_t>> cdc_chunks
;
10596 cdc
->calc_chunks(bl
, &cdc_chunks
);
10599 for (auto p
: cdc_chunks
) {
10601 chunk
.substr_of(bl
, p
.first
, p
.second
);
10602 auto [ret
, target
] = get_fpoid_from_chunk(oi
.soid
, chunk
);
10606 chunks
[p
.first
] = std::move(chunk
);
10607 chunk_map
[p
.first
] = chunk_info_t(0, p
.second
, target
);
10608 total_length
+= p
.second
;
10610 return total_length
;
10613 std::pair
<int, hobject_t
> PrimaryLogPG::get_fpoid_from_chunk(
10614 const hobject_t soid
, bufferlist
& chunk
)
10616 pg_pool_t::fingerprint_t fp_algo
= pool
.info
.get_fingerprint_type();
10617 if (fp_algo
== pg_pool_t::TYPE_FINGERPRINT_NONE
) {
10618 return make_pair(-EINVAL
, hobject_t());
10620 object_t fp_oid
= [&fp_algo
, &chunk
]() -> string
{
10622 case pg_pool_t::TYPE_FINGERPRINT_SHA1
:
10623 return ceph::crypto::digest
<ceph::crypto::SHA1
>(chunk
).to_str();
10624 case pg_pool_t::TYPE_FINGERPRINT_SHA256
:
10625 return ceph::crypto::digest
<ceph::crypto::SHA256
>(chunk
).to_str();
10626 case pg_pool_t::TYPE_FINGERPRINT_SHA512
:
10627 return ceph::crypto::digest
<ceph::crypto::SHA512
>(chunk
).to_str();
10629 assert(0 == "unrecognized fingerprint type");
10635 object_locator_t
oloc(soid
);
10636 oloc
.pool
= pool
.info
.get_dedup_tier();
10637 // check if dedup_tier isn't set
10638 ceph_assert(oloc
.pool
> 0);
10639 int ret
= get_osdmap()->object_locator_to_pg(fp_oid
, oloc
, raw_pg
);
10641 return make_pair(ret
, hobject_t());
10643 hobject_t
target(fp_oid
, oloc
.key
, snapid_t(),
10644 raw_pg
.ps(), raw_pg
.pool(),
10646 return make_pair(0, target
);
10649 int PrimaryLogPG::finish_set_dedup(hobject_t oid
, int r
, ceph_tid_t tid
, uint64_t offset
)
10651 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10652 << " " << cpp_strerror(r
) << dendl
;
10653 map
<hobject_t
,ManifestOpRef
>::iterator p
= manifest_ops
.find(oid
);
10654 if (p
== manifest_ops
.end()) {
10655 dout(10) << __func__
<< " no manifest_op found" << dendl
;
10658 ManifestOpRef mop
= p
->second
;
10659 mop
->results
[offset
] = r
;
10661 // if any failure occurs, put a mark on the results to recognize the failure
10662 mop
->results
[0] = r
;
10664 if (mop
->num_chunks
!= mop
->results
.size()) {
10665 // there are on-going works
10666 return -EINPROGRESS
;
10668 ObjectContextRef obc
= mop
->obc
;
10670 ceph_assert(obc
->is_blocked());
10672 kick_object_context_blocked(obc
);
10673 if (mop
->results
[0] < 0) {
10674 // check if the previous op returns fail
10675 ceph_assert(mop
->num_chunks
== mop
->results
.size());
10676 manifest_ops
.erase(oid
);
10677 osd
->reply_op_error(mop
->op
, mop
->results
[0]);
10681 if (mop
->chunks
.size()) {
10682 OpContextUPtr ctx
= simple_opc_create(obc
);
10684 if (ctx
->lock_manager
.get_lock_type(
10689 dout(20) << __func__
<< " took write lock" << dendl
;
10690 } else if (mop
->op
) {
10691 dout(10) << __func__
<< " waiting on write lock " << mop
->op
<< dendl
;
10692 close_op_ctx(ctx
.release());
10696 ctx
->at_version
= get_next_version();
10697 ctx
->new_obs
= obc
->obs
;
10698 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
10699 --ctx
->delta_stats
.num_objects_dirty
;
10700 if (!ctx
->obs
->oi
.has_manifest()) {
10701 ctx
->delta_stats
.num_objects_manifest
++;
10702 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_MANIFEST
);
10703 ctx
->new_obs
.oi
.manifest
.type
= object_manifest_t::TYPE_CHUNKED
;
10707 * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
10708 * head: [0, 2) aaa <-- tier_flush()
10709 * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10711 * In this case, if the new chunk_map is as follows,
10712 * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10713 * we should drop aaa from head by using calc_refs_to_drop_on_removal().
10714 * So, the precedure is
10715 * 1. calc_refs_to_drop_on_removal()
10716 * 2. register old references to drop after tier_flush() is committed
10717 * 3. update new chunk_map
10720 ObjectCleanRegions c_regions
= ctx
->clean_regions
;
10721 ObjectContextRef cobc
= get_prev_clone_obc(obc
);
10722 c_regions
.mark_fully_dirty();
10723 // CDC was done on entire range of manifest object,
10724 // so the first thing we should do here is to drop the reference to old chunks
10725 ObjectContextRef obc_l
, obc_g
;
10726 get_adjacent_clones(obc
, obc_l
, obc_g
);
10727 // clear all old references
10728 object_ref_delta_t refs
;
10729 ctx
->obs
->oi
.manifest
.calc_refs_to_drop_on_removal(
10730 obc_l
? &(obc_l
->obs
.oi
.manifest
) : nullptr,
10731 obc_g
? &(obc_g
->obs
.oi
.manifest
) : nullptr,
10733 if (!refs
.is_empty()) {
10734 ctx
->register_on_commit(
10735 [oid
, this, refs
](){
10736 dec_refcount(oid
, refs
);
10740 // set new references
10741 ctx
->new_obs
.oi
.manifest
.chunk_map
= mop
->new_manifest
.chunk_map
;
10743 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
10744 simple_opc_submit(std::move(ctx
));
10747 osd
->reply_op_error(mop
->op
, r
);
10749 manifest_ops
.erase(oid
);
10753 int PrimaryLogPG::finish_set_manifest_refcount(hobject_t oid
, int r
, ceph_tid_t tid
, uint64_t offset
)
10755 dout(10) << __func__
<< " " << oid
<< " tid " << tid
10756 << " " << cpp_strerror(r
) << dendl
;
10757 map
<hobject_t
,ManifestOpRef
>::iterator p
= manifest_ops
.find(oid
);
10758 if (p
== manifest_ops
.end()) {
10759 dout(10) << __func__
<< " no manifest_op found" << dendl
;
10762 ManifestOpRef mop
= p
->second
;
10763 mop
->results
[offset
] = r
;
10765 // if any failure occurs, put a mark on the results to recognize the failure
10766 mop
->results
[0] = r
;
10768 if (mop
->num_chunks
!= mop
->results
.size()) {
10769 // there are on-going works
10770 return -EINPROGRESS
;
10774 mop
->cb
->complete(r
);
10777 manifest_ops
.erase(p
);
10783 int PrimaryLogPG::start_flush(
10784 OpRequestRef op
, ObjectContextRef obc
,
10785 bool blocking
, hobject_t
*pmissing
,
10786 std::optional
<std::function
<void()>> &&on_flush
,
10789 const object_info_t
& oi
= obc
->obs
.oi
;
10790 const hobject_t
& soid
= oi
.soid
;
10791 dout(10) << __func__
<< " " << soid
10792 << " v" << oi
.version
10793 << " uv" << oi
.user_version
10794 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
10797 bool preoctopus_compat
=
10798 get_osdmap()->require_osd_release
< ceph_release_t::octopus
;
10800 if (preoctopus_compat
) {
10801 // for pre-octopus compatibility, filter SnapSet::snaps. not
10802 // certain we need this, but let's be conservative.
10803 snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
10805 // NOTE: change this to a const ref when we remove this compat code
10806 snapset
= obc
->ssc
->snapset
;
10809 if ((obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked())
10811 // current dedup tier only supports blocking operation
10813 return -EOPNOTSUPP
;
10817 // verify there are no (older) check for dirty clones
10819 dout(20) << " snapset " << snapset
<< dendl
;
10820 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
10821 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
10823 if (p
!= snapset
.clones
.rend()) {
10824 hobject_t next
= soid
;
10826 ceph_assert(next
.snap
< soid
.snap
);
10827 if (recovery_state
.get_pg_log().get_missing().is_missing(next
)) {
10828 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
10833 ObjectContextRef older_obc
= get_object_context(next
, false);
10835 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
10837 if (older_obc
->obs
.oi
.is_dirty()) {
10838 dout(10) << __func__
<< " next oldest clone is dirty: "
10839 << older_obc
->obs
.oi
<< dendl
;
10843 dout(20) << __func__
<< " next oldest clone " << next
10844 << " is not present; implicitly clean" << dendl
;
10847 dout(20) << __func__
<< " no older clones" << dendl
;
10852 dout(20) << fmt::format("{}: blocking {}", __func__
, soid
) << dendl
;
10853 obc
->start_block();
10856 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
10857 if (p
!= flush_ops
.end()) {
10858 FlushOpRef fop
= p
->second
;
10859 if (fop
->op
== op
) {
10860 // we couldn't take the write lock on a cache-try-flush before;
10861 // now we are trying again for the lock.
10862 return try_flush_mark_clean(fop
);
10864 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
10865 (fop
->blocking
|| !blocking
)) {
10866 // nonblocking can join anything
10867 // blocking can only join a blocking flush
10868 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
10870 fop
->dup_ops
.push_back(op
);
10871 return -EAGAIN
; // clean up this ctx; op will retry later
10874 // cancel current flush since it will fail anyway, or because we
10875 // are blocking and the existing flush is nonblocking.
10876 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
10878 osd
->reply_op_error(fop
->op
, -EBUSY
);
10879 while (!fop
->dup_ops
.empty()) {
10880 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
10881 fop
->dup_ops
.pop_front();
10883 vector
<ceph_tid_t
> tids
;
10884 cancel_flush(fop
, false, &tids
);
10885 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
10888 if ((obc
->obs
.oi
.has_manifest() && obc
->obs
.oi
.manifest
.is_chunked())
10890 int r
= start_dedup(op
, obc
);
10891 if (r
!= -EINPROGRESS
) {
10899 * In general, we need to send a delete and a copyfrom.
10900 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10901 * where 4 is marked as clean. To flush 10, we have to:
10902 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10903 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10905 * There is a complicating case. Supposed there had been a clone 7
10906 * for snaps [7, 6] which has been trimmed since they no longer exist.
10907 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10908 * the delete, the snap will be promoted to 5, and the head will become
10909 * a whiteout. When the copy-from goes through, we'll end up with
10910 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10912 * Another complication is the case where there is an interval change
10913 * after doing the delete and the flush but before marking the object
10914 * clean. We'll happily delete head and then recreate it at the same
10915 * sequence number, which works out ok.
10918 SnapContext snapc
, dsnapc
;
10919 if (snapset
.seq
!= 0) {
10920 if (soid
.snap
== CEPH_NOSNAP
) {
10921 snapc
= snapset
.get_ssc_as_of(snapset
.seq
);
10923 snapid_t min_included_snap
;
10924 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
10925 ceph_assert(p
!= snapset
.clone_snaps
.end());
10926 min_included_snap
= p
->second
.back();
10927 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
10930 snapid_t prev_snapc
= 0;
10931 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
10932 citer
!= snapset
.clones
.rend();
10934 if (*citer
< soid
.snap
) {
10935 prev_snapc
= *citer
;
10940 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
10943 object_locator_t
base_oloc(soid
);
10944 base_oloc
.pool
= pool
.info
.tier_of
;
10946 if (dsnapc
.seq
< snapc
.seq
) {
10949 osd
->objecter
->mutate(
10954 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10955 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
10956 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
10957 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
10960 FlushOpRef
fop(std::make_shared
<FlushOp
>());
10962 fop
->flushed_version
= oi
.user_version
;
10963 fop
->blocking
= blocking
;
10964 fop
->on_flush
= std::move(on_flush
);
10968 if (oi
.is_whiteout()) {
10969 fop
->removal
= true;
10972 object_locator_t
oloc(soid
);
10973 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
10974 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
10975 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
10976 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
10977 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
10978 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
10980 //mean the base tier don't cache data after this
10981 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
10982 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
10984 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
10986 ceph_tid_t tid
= osd
->objecter
->mutate(
10987 soid
.oid
, base_oloc
, o
, snapc
,
10988 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
10989 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
10990 new C_OnFinisher(fin
,
10991 osd
->get_objecter_finisher(get_pg_shard())));
10992 /* we're under the pg lock and fin->finish() is grabbing that */
10994 fop
->objecter_tid
= tid
;
10996 flush_ops
[soid
] = fop
;
10998 recovery_state
.update_stats(
10999 [&oi
](auto &history
, auto &stats
) {
11000 stats
.stats
.sum
.num_flush
++;
11001 stats
.stats
.sum
.num_flush_kb
+= shift_round_up(oi
.size
, 10);
11004 return -EINPROGRESS
;
11007 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
11009 dout(10) << __func__
<< " " << oid
<< " tid " << tid
11010 << " " << cpp_strerror(r
) << dendl
;
11011 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
11012 if (p
== flush_ops
.end()) {
11013 dout(10) << __func__
<< " no flush_op found" << dendl
;
11016 FlushOpRef fop
= p
->second
;
11017 if (tid
!= fop
->objecter_tid
&& !fop
->obc
->obs
.oi
.has_manifest()) {
11018 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
11019 << " tid " << fop
->objecter_tid
<< dendl
;
11022 ObjectContextRef obc
= fop
->obc
;
11023 fop
->objecter_tid
= 0;
11025 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
11027 osd
->reply_op_error(fop
->op
, -EBUSY
);
11028 if (fop
->blocking
) {
11030 kick_object_context_blocked(obc
);
11033 if (!fop
->dup_ops
.empty()) {
11034 dout(20) << __func__
<< " requeueing dups" << dendl
;
11035 requeue_ops(fop
->dup_ops
);
11037 if (fop
->on_flush
) {
11038 (*(fop
->on_flush
))();
11039 fop
->on_flush
= std::nullopt
;
11041 flush_ops
.erase(oid
);
11045 r
= try_flush_mark_clean(fop
);
11046 if (r
== -EBUSY
&& fop
->op
) {
11047 osd
->reply_op_error(fop
->op
, r
);
11051 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
11053 ObjectContextRef obc
= fop
->obc
;
11054 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
11056 if (fop
->blocking
) {
11058 kick_object_context_blocked(obc
);
11061 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
11062 !obc
->obs
.exists
) {
11063 if (obc
->obs
.exists
)
11064 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
11065 << " != current " << obc
->obs
.oi
.user_version
11068 dout(10) << __func__
<< " object no longer exists" << dendl
;
11070 if (!fop
->dup_ops
.empty()) {
11071 dout(20) << __func__
<< " requeueing dups" << dendl
;
11072 requeue_ops(fop
->dup_ops
);
11074 if (fop
->on_flush
) {
11075 (*(fop
->on_flush
))();
11076 fop
->on_flush
= std::nullopt
;
11078 flush_ops
.erase(oid
);
11080 osd
->logger
->inc(l_osd_tier_flush_fail
);
11082 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11086 if (!fop
->blocking
&&
11087 m_scrubber
->write_blocked_by_scrub(oid
)) {
11089 dout(10) << __func__
<< " blocked by scrub" << dendl
;
11090 requeue_op(fop
->op
);
11091 requeue_ops(fop
->dup_ops
);
11092 return -EAGAIN
; // will retry
11094 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11095 vector
<ceph_tid_t
> tids
;
11096 cancel_flush(fop
, false, &tids
);
11097 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11102 // successfully flushed, can we evict this object?
11103 if (!obc
->obs
.oi
.has_manifest() && !fop
->op
&&
11104 agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
11105 agent_maybe_evict(obc
, true)) {
11106 osd
->logger
->inc(l_osd_tier_clean
);
11107 if (fop
->on_flush
) {
11108 (*(fop
->on_flush
))();
11109 fop
->on_flush
= std::nullopt
;
11111 flush_ops
.erase(oid
);
11115 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
11116 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
11118 // successfully flushed; can we clear the dirty bit?
11119 // try to take the lock manually, since we don't
11121 if (ctx
->lock_manager
.get_lock_type(
11126 dout(20) << __func__
<< " took write lock" << dendl
;
11127 } else if (fop
->op
) {
11128 dout(10) << __func__
<< " waiting on write lock " << fop
->op
<< " "
11129 << fop
->dup_ops
<< dendl
;
11130 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
11131 for (auto op
: fop
->dup_ops
) {
11132 bool locked
= ctx
->lock_manager
.get_lock_type(
11137 ceph_assert(!locked
);
11139 close_op_ctx(ctx
.release());
11140 return -EAGAIN
; // will retry
11142 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
11143 close_op_ctx(ctx
.release());
11144 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
11145 vector
<ceph_tid_t
> tids
;
11146 cancel_flush(fop
, false, &tids
);
11147 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
11151 if (fop
->on_flush
) {
11152 ctx
->register_on_finish(*(fop
->on_flush
));
11153 fop
->on_flush
= std::nullopt
;
11156 ctx
->at_version
= get_next_version();
11158 ctx
->new_obs
= obc
->obs
;
11159 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
11160 --ctx
->delta_stats
.num_objects_dirty
;
11161 if (fop
->obc
->obs
.oi
.has_manifest()) {
11162 ceph_assert(obc
->obs
.oi
.manifest
.is_chunked());
11163 PGTransaction
* t
= ctx
->op_t
.get();
11164 uint64_t chunks_size
= 0;
11165 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11166 chunks_size
+= p
.second
.length
;
11168 if (ctx
->new_obs
.oi
.is_omap() && pool
.info
.supports_omap()) {
11169 t
->omap_clear(oid
);
11170 ctx
->new_obs
.oi
.clear_omap_digest();
11171 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
11172 ctx
->clean_regions
.mark_omap_dirty();
11174 if (obc
->obs
.oi
.size
== chunks_size
) {
11175 t
->truncate(oid
, 0);
11176 interval_set
<uint64_t> trim
;
11177 trim
.insert(0, ctx
->new_obs
.oi
.size
);
11178 ctx
->modified_ranges
.union_of(trim
);
11179 truncate_update_size_and_usage(ctx
->delta_stats
,
11182 ctx
->clean_regions
.mark_data_region_dirty(0, ctx
->new_obs
.oi
.size
);
11183 ctx
->new_obs
.oi
.new_object();
11184 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11185 p
.second
.set_flag(chunk_info_t::FLAG_MISSING
);
11188 for (auto &p
: ctx
->new_obs
.oi
.manifest
.chunk_map
) {
11189 dout(20) << __func__
<< " offset: " << p
.second
.offset
11190 << " length: " << p
.second
.length
<< dendl
;
11191 p
.second
.clear_flag(chunk_info_t::FLAG_MISSING
); // CLEAN
11196 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
11198 osd
->logger
->inc(l_osd_tier_clean
);
11200 if (!fop
->dup_ops
.empty() || fop
->op
) {
11201 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
11202 list
<OpRequestRef
> ls
;
11204 ls
.push_back(fop
->op
);
11205 ls
.splice(ls
.end(), fop
->dup_ops
);
11209 simple_opc_submit(std::move(ctx
));
11211 flush_ops
.erase(oid
);
11214 osd
->logger
->inc(l_osd_tier_flush
);
11216 osd
->logger
->inc(l_osd_tier_try_flush
);
11218 return -EINPROGRESS
;
11221 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
,
11222 vector
<ceph_tid_t
> *tids
)
11224 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
11225 << fop
->objecter_tid
<< dendl
;
11226 if (fop
->objecter_tid
) {
11227 tids
->push_back(fop
->objecter_tid
);
11228 fop
->objecter_tid
= 0;
11230 if (fop
->io_tids
.size()) {
11231 for (auto &p
: fop
->io_tids
) {
11232 tids
->push_back(p
.second
);
11236 if (fop
->blocking
&& fop
->obc
->is_blocked()) {
11237 fop
->obc
->stop_block();
11238 kick_object_context_blocked(fop
->obc
);
11242 requeue_op(fop
->op
);
11243 requeue_ops(fop
->dup_ops
);
11245 if (fop
->on_flush
) {
11246 (*(fop
->on_flush
))();
11247 fop
->on_flush
= std::nullopt
;
11249 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
11252 void PrimaryLogPG::cancel_flush_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
11254 dout(10) << __func__
<< dendl
;
11255 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
11256 while (p
!= flush_ops
.end()) {
11257 cancel_flush((p
++)->second
, requeue
, tids
);
11261 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
11263 if (!pool
.info
.allow_incomplete_clones())
11265 if (is_missing_object(coid
))
11267 ObjectContextRef obc
= get_object_context(coid
, false);
11268 return obc
&& obc
->obs
.exists
;
11271 // ========================================================================
11275 void PrimaryLogPG::cancel_cls_gather(map
<hobject_t
,CLSGatherOp
>::iterator iter
, bool requeue
,
11276 vector
<ceph_tid_t
> *tids
)
11278 auto &cgop
= iter
->second
;
11279 for (std::vector
<ceph_tid_t
>::iterator p
= cgop
.objecter_tids
.begin(); p
!= cgop
.objecter_tids
.end(); p
++) {
11280 tids
->push_back(*p
);
11281 dout(10) << __func__
<< " " << cgop
.obc
->obs
.oi
.soid
<< " tid " << *p
<< dendl
;
11283 cgop
.objecter_tids
.clear();
11284 close_op_ctx(cgop
.ctx
);
11288 requeue_op(cgop
.op
);
11290 cls_gather_ops
.erase(iter
);
11293 void PrimaryLogPG::cancel_cls_gather_ops(bool requeue
, vector
<ceph_tid_t
> *tids
)
11295 dout(10) << __func__
<< dendl
;
11296 map
<hobject_t
,CLSGatherOp
>::iterator p
= cls_gather_ops
.begin();
11297 while (p
!= cls_gather_ops
.end()) {
11298 cancel_cls_gather(p
++, requeue
, tids
);
11302 // ========================================================================
11305 class C_OSD_RepopCommit
: public Context
{
11306 PrimaryLogPGRef pg
;
11307 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
11309 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
11310 : pg(pg
), repop(repop
) {}
11311 void finish(int) override
{
11312 pg
->repop_all_committed(repop
.get());
11316 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
11318 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
11320 repop
->all_committed
= true;
11321 if (!repop
->rep_aborted
) {
11322 if (repop
->v
!= eversion_t()) {
11323 recovery_state
.complete_write(repop
->v
, repop
->pg_local_last_complete
);
11329 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
11331 dout(10) << "op_applied version " << applied_version
<< dendl
;
11332 ceph_assert(applied_version
!= eversion_t());
11333 ceph_assert(applied_version
<= info
.last_update
);
11334 recovery_state
.local_write_applied(applied_version
);
11336 if (is_primary() && m_scrubber
) {
11337 // if there's a scrub operation waiting for the selected chunk to be fully updated -
11338 // allow it to continue
11339 m_scrubber
->on_applied_when_primary(recovery_state
.get_last_update_applied());
11343 void PrimaryLogPG::eval_repop(RepGather
*repop
)
11345 dout(10) << "eval_repop " << *repop
11346 << (repop
->op
&& repop
->op
->get_req
<MOSDOp
>() ? "" : " (no op)") << dendl
;
11349 if (repop
->all_committed
) {
11350 dout(10) << " commit: " << *repop
<< dendl
;
11351 for (auto p
= repop
->on_committed
.begin();
11352 p
!= repop
->on_committed
.end();
11353 repop
->on_committed
.erase(p
++)) {
11356 // send dup commits, in order
11357 auto it
= waiting_for_ondisk
.find(repop
->v
);
11358 if (it
!= waiting_for_ondisk
.end()) {
11359 ceph_assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
11360 for (auto& i
: it
->second
) {
11361 int return_code
= repop
->r
;
11362 if (return_code
>= 0) {
11363 return_code
= std::get
<2>(i
);
11365 osd
->reply_op_error(std::get
<0>(i
), return_code
, repop
->v
,
11366 std::get
<1>(i
), std::get
<3>(i
));
11368 waiting_for_ondisk
.erase(it
);
11371 publish_stats_to_osd();
11373 dout(10) << " removing " << *repop
<< dendl
;
11374 ceph_assert(!repop_queue
.empty());
11375 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
11376 if (repop_queue
.front() == repop
) {
11377 RepGather
*to_remove
= nullptr;
11378 while (!repop_queue
.empty() &&
11379 (to_remove
= repop_queue
.front())->all_committed
) {
11380 repop_queue
.pop_front();
11381 for (auto p
= to_remove
->on_success
.begin();
11382 p
!= to_remove
->on_success
.end();
11383 to_remove
->on_success
.erase(p
++)) {
11386 remove_repop(to_remove
);
11392 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
11395 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
11396 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
11401 repop
->v
= ctx
->at_version
;
11403 ctx
->op_t
->add_obc(ctx
->obc
);
11404 if (ctx
->clone_obc
) {
11405 ctx
->op_t
->add_obc(ctx
->clone_obc
);
11407 if (ctx
->head_obc
) {
11408 ctx
->op_t
->add_obc(ctx
->head_obc
);
11411 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
11412 if (!(ctx
->log
.empty())) {
11413 ceph_assert(ctx
->at_version
>= projected_last_update
);
11414 projected_last_update
= ctx
->at_version
;
11416 for (auto &&entry
: ctx
->log
) {
11417 projected_log
.add(entry
);
11420 recovery_state
.pre_submit_op(
11424 pgbackend
->submit_transaction(
11428 std::move(ctx
->op_t
),
11429 recovery_state
.get_pg_trim_to(),
11430 recovery_state
.get_min_last_complete_ondisk(),
11431 std::move(ctx
->log
),
11432 ctx
->updated_hset_history
,
11439 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
11441 ceph_tid_t rep_tid
)
11444 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
11446 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
11448 RepGather
*repop
= new RepGather(
11449 ctx
, rep_tid
, info
.last_complete
);
11451 repop
->start
= ceph_clock_now();
11453 repop_queue
.push_back(&repop
->queue_item
);
11456 osd
->logger
->inc(l_osd_op_wip
);
11458 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11462 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
11463 eversion_t version
,
11465 ObcLockManager
&&manager
,
11467 std::optional
<std::function
<void(void)> > &&on_complete
)
11469 RepGather
*repop
= new RepGather(
11470 std::move(manager
),
11472 std::move(on_complete
),
11474 info
.last_complete
,
11476 repop
->v
= version
;
11478 repop
->start
= ceph_clock_now();
11480 repop_queue
.push_back(&repop
->queue_item
);
11482 osd
->logger
->inc(l_osd_op_wip
);
11484 dout(10) << __func__
<< ": " << *repop
<< dendl
;
11485 return boost::intrusive_ptr
<RepGather
>(repop
);
11488 void PrimaryLogPG::remove_repop(RepGather
*repop
)
11490 dout(20) << __func__
<< " " << *repop
<< dendl
;
11492 for (auto p
= repop
->on_finish
.begin();
11493 p
!= repop
->on_finish
.end();
11494 repop
->on_finish
.erase(p
++)) {
11498 release_object_locks(
11499 repop
->lock_manager
);
11502 osd
->logger
->dec(l_osd_op_wip
);
11505 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
11507 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
11508 ceph_tid_t rep_tid
= osd
->get_tid();
11509 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
11510 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, nullptr, obc
, this));
11511 ctx
->op_t
.reset(new PGTransaction());
11512 ctx
->mtime
= ceph_clock_now();
11516 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
11518 RepGather
*repop
= new_repop(ctx
.get(), ctx
->reqid
.tid
);
11519 dout(20) << __func__
<< " " << repop
<< dendl
;
11520 issue_repop(repop
, ctx
.get());
11522 recovery_state
.update_trim_to();
11527 void PrimaryLogPG::submit_log_entries(
11528 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
11529 ObcLockManager
&&manager
,
11530 std::optional
<std::function
<void(void)> > &&_on_complete
,
11534 dout(10) << __func__
<< " " << entries
<< dendl
;
11535 ceph_assert(is_primary());
11537 eversion_t version
;
11538 if (!entries
.empty()) {
11539 ceph_assert(entries
.rbegin()->version
>= projected_last_update
);
11540 version
= projected_last_update
= entries
.rbegin()->version
;
11543 boost::intrusive_ptr
<RepGather
> repop
;
11544 std::optional
<std::function
<void(void)> > on_complete
;
11545 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11549 std::move(manager
),
11551 std::move(_on_complete
));
11553 on_complete
= std::move(_on_complete
);
11556 pgbackend
->call_write_ordered(
11557 [this, entries
, repop
, on_complete
]() {
11558 ObjectStore::Transaction t
;
11559 eversion_t old_last_update
= info
.last_update
;
11560 recovery_state
.merge_new_log_entries(
11561 entries
, t
, recovery_state
.get_pg_trim_to(),
11562 recovery_state
.get_min_last_complete_ondisk());
11564 set
<pg_shard_t
> waiting_on
;
11565 for (set
<pg_shard_t
>::const_iterator i
= get_acting_recovery_backfill().begin();
11566 i
!= get_acting_recovery_backfill().end();
11568 pg_shard_t
peer(*i
);
11569 if (peer
== pg_whoami
) continue;
11570 ceph_assert(recovery_state
.get_peer_missing().count(peer
));
11571 ceph_assert(recovery_state
.has_peer_info(peer
));
11572 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
11573 ceph_assert(repop
);
11574 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
11576 spg_t(info
.pgid
.pgid
, i
->shard
),
11578 get_osdmap_epoch(),
11579 get_last_peering_reset(),
11581 recovery_state
.get_pg_trim_to(),
11582 recovery_state
.get_min_last_complete_ondisk());
11583 osd
->send_message_osd_cluster(
11584 peer
.osd
, m
, get_osdmap_epoch());
11585 waiting_on
.insert(peer
);
11587 MOSDPGLog
*m
= new MOSDPGLog(
11588 peer
.shard
, pg_whoami
.shard
,
11589 info
.last_update
.epoch
,
11590 info
, get_last_peering_reset());
11591 m
->log
.log
= entries
;
11592 m
->log
.tail
= old_last_update
;
11593 m
->log
.head
= info
.last_update
;
11594 osd
->send_message_osd_cluster(
11595 peer
.osd
, m
, get_osdmap_epoch());
11598 ceph_tid_t rep_tid
= repop
->rep_tid
;
11599 waiting_on
.insert(pg_whoami
);
11600 log_entry_update_waiting_on
.insert(
11603 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
11605 struct OnComplete
: public Context
{
11606 PrimaryLogPGRef pg
;
11607 ceph_tid_t rep_tid
;
11610 PrimaryLogPGRef pg
,
11611 ceph_tid_t rep_tid
,
11613 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
11614 void finish(int) override
{
11615 std::scoped_lock l
{*pg
};
11616 if (!pg
->pg_has_reset_since(epoch
)) {
11617 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
11618 ceph_assert(it
!= pg
->log_entry_update_waiting_on
.end());
11619 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
11620 ceph_assert(it2
!= it
->second
.waiting_on
.end());
11621 it
->second
.waiting_on
.erase(it2
);
11622 if (it
->second
.waiting_on
.empty()) {
11623 pg
->repop_all_committed(it
->second
.repop
.get());
11624 pg
->log_entry_update_waiting_on
.erase(it
);
11629 t
.register_on_commit(
11630 new OnComplete
{this, rep_tid
, get_osdmap_epoch()});
11631 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), NULL
);
11632 ceph_assert(r
== 0);
11633 op_applied(info
.last_update
);
11636 recovery_state
.update_trim_to();
11639 void PrimaryLogPG::cancel_log_updates()
11641 // get rid of all the LogUpdateCtx so their references to repops are
11643 log_entry_update_waiting_on
.clear();
11646 // -------------------------------------------------------
11648 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> *ls
)
11650 std::scoped_lock l
{*this};
11651 pair
<hobject_t
, ObjectContextRef
> i
;
11652 while (object_contexts
.get_next(i
.first
, &i
)) {
11653 ObjectContextRef
obc(i
.second
);
11654 get_obc_watchers(obc
, *ls
);
11658 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
11660 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11661 obc
->watchers
.begin();
11662 j
!= obc
->watchers
.end();
11664 obj_watch_item_t owi
;
11666 owi
.obj
= obc
->obs
.oi
.soid
;
11667 owi
.wi
.addr
= j
->second
->get_peer_addr();
11668 owi
.wi
.name
= j
->second
->get_entity();
11669 owi
.wi
.cookie
= j
->second
->get_cookie();
11670 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
11672 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
11673 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
11675 pg_watchers
.push_back(owi
);
11679 void PrimaryLogPG::check_blocklisted_watchers()
11681 dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl
;
11682 pair
<hobject_t
, ObjectContextRef
> i
;
11683 while (object_contexts
.get_next(i
.first
, &i
))
11684 check_blocklisted_obc_watchers(i
.second
);
11687 void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc
)
11689 dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
11690 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
11691 obc
->watchers
.begin();
11692 k
!= obc
->watchers
.end();
11694 //Advance iterator now so handle_watch_timeout() can erase element
11695 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
11696 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
11697 entity_addr_t ea
= j
->second
->get_peer_addr();
11698 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
11699 if (get_osdmap()->is_blocklisted(ea
)) {
11700 dout(10) << "watch: Found blocklisted watcher for " << ea
<< dendl
;
11701 ceph_assert(j
->second
->get_pg() == this);
11702 j
->second
->unregister_cb();
11703 handle_watch_timeout(j
->second
);
11708 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
11710 ceph_assert(is_primary() && is_active());
11711 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(obc
->obs
.oi
.soid
);
11712 ceph_assert((recovering
.count(obc
->obs
.oi
.soid
) ||
11713 !is_missing_object(obc
->obs
.oi
.soid
)) ||
11714 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() && // or this is a revert... see recover_primary()
11715 it_objects
->second
->op
==
11716 pg_log_entry_t::LOST_REVERT
&&
11717 it_objects
->second
->reverting_to
==
11718 obc
->obs
.oi
.version
));
11720 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
11721 ceph_assert(obc
->watchers
.empty());
11722 // populate unconnected_watchers
11723 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
11724 obc
->obs
.oi
.watchers
.begin();
11725 p
!= obc
->obs
.oi
.watchers
.end();
11727 utime_t expire
= info
.stats
.last_became_active
;
11728 expire
+= p
->second
.timeout_seconds
;
11729 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
11731 Watch::makeWatchRef(
11732 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
11733 p
->first
.second
, p
->second
.addr
));
11734 watch
->disconnect();
11735 obc
->watchers
.insert(
11737 make_pair(p
->first
.first
, p
->first
.second
),
11740 // Look for watchers from blocklisted clients and drop
11741 check_blocklisted_obc_watchers(obc
);
11744 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
11746 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
11747 dout(10) << "handle_watch_timeout obc " << *obc
<< dendl
;
11749 if (!is_active()) {
11750 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
11753 if (!obc
->obs
.exists
) {
11754 dout(10) << __func__
<< " object " << obc
->obs
.oi
.soid
<< " dne" << dendl
;
11757 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
11758 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
11759 watch
->get_delayed_cb()
11761 dout(10) << "handle_watch_timeout waiting for degraded on obj "
11762 << obc
->obs
.oi
.soid
11767 if (m_scrubber
->write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
11768 dout(10) << "handle_watch_timeout waiting for scrub on obj "
11769 << obc
->obs
.oi
.soid
11771 m_scrubber
->add_callback(
11772 watch
->get_delayed_cb() // This callback!
11777 OpContextUPtr ctx
= simple_opc_create(obc
);
11778 ctx
->at_version
= get_next_version();
11780 object_info_t
& oi
= ctx
->new_obs
.oi
;
11781 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
11782 watch
->get_entity()));
11784 list
<watch_disconnect_t
> watch_disconnects
= {
11785 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
11787 ctx
->register_on_success(
11788 [this, obc
, watch_disconnects
]() {
11789 complete_disconnect_watches(obc
, watch_disconnects
);
11793 PGTransaction
*t
= ctx
->op_t
.get();
11794 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
11798 osd_reqid_t(), ctx
->mtime
, 0));
11800 oi
.prior_version
= obc
->obs
.oi
.version
;
11801 oi
.version
= ctx
->at_version
;
11803 encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
11804 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
11806 // apply new object state.
11807 ctx
->obc
->obs
= ctx
->new_obs
;
11809 // no ctx->delta_stats
11810 simple_opc_submit(std::move(ctx
));
11813 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
11814 SnapSetContext
*ssc
)
11816 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
11817 ceph_assert(obc
->destructor_callback
== NULL
);
11818 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11820 obc
->obs
.exists
= false;
11823 register_snapset_context(ssc
);
11824 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
11826 populate_obc_watchers(obc
);
11830 ObjectContextRef
PrimaryLogPG::get_object_context(
11831 const hobject_t
& soid
,
11833 const map
<string
, bufferlist
, less
<>> *attrs
)
11835 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(soid
);
11837 attrs
|| !recovery_state
.get_pg_log().get_missing().is_missing(soid
) ||
11838 // or this is a revert... see recover_primary()
11839 (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end() &&
11840 it_objects
->second
->op
==
11841 pg_log_entry_t::LOST_REVERT
));
11842 ObjectContextRef obc
= object_contexts
.lookup(soid
);
11843 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
11845 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
11846 dout(10) << __func__
<< ": found obc in cache: " << *obc
11849 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
11853 auto it_oi
= attrs
->find(OI_ATTR
);
11854 ceph_assert(it_oi
!= attrs
->end());
11855 bv
= it_oi
->second
;
11857 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
11860 dout(10) << __func__
<< ": no obc for soid "
11861 << soid
<< " and !can_create"
11863 return ObjectContextRef(); // -ENOENT!
11866 dout(10) << __func__
<< ": no obc for soid "
11867 << soid
<< " but can_create"
11870 object_info_t
oi(soid
);
11871 SnapSetContext
*ssc
= get_snapset_context(
11872 soid
, true, 0, false);
11874 obc
= create_object_context(oi
, ssc
);
11875 dout(10) << __func__
<< ": " << *obc
11876 << " oi: " << obc
->obs
.oi
11877 << " " << *obc
->ssc
<< dendl
;
11884 bufferlist::const_iterator bliter
= bv
.begin();
11885 decode(oi
, bliter
);
11887 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
11888 return ObjectContextRef(); // -ENOENT!
11891 ceph_assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
11893 obc
= object_contexts
.lookup_or_create(oi
.soid
);
11894 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
11896 obc
->obs
.exists
= true;
11898 obc
->ssc
= get_snapset_context(
11900 soid
.has_snapset() ? attrs
: 0);
11902 if (is_primary() && is_active())
11903 populate_obc_watchers(obc
);
11905 if (pool
.info
.is_erasure()) {
11907 obc
->attr_cache
= *attrs
;
11909 int r
= pgbackend
->objects_get_attrs(
11912 ceph_assert(r
== 0);
11916 dout(10) << __func__
<< ": creating obc from disk: " << *obc
11920 // XXX: Caller doesn't expect this
11921 if (obc
->ssc
== NULL
) {
11922 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
11923 return ObjectContextRef(); // -ENOENT!
11926 dout(10) << __func__
<< ": " << *obc
11927 << " oi: " << obc
->obs
.oi
11928 << " exists: " << (int)obc
->obs
.exists
11929 << " " << *obc
->ssc
<< dendl
;
11933 void PrimaryLogPG::context_registry_on_change()
11935 pair
<hobject_t
, ObjectContextRef
> i
;
11936 while (object_contexts
.get_next(i
.first
, &i
)) {
11937 ObjectContextRef
obc(i
.second
);
11939 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
11940 obc
->watchers
.begin();
11941 j
!= obc
->watchers
.end();
11942 obc
->watchers
.erase(j
++)) {
11943 j
->second
->discard();
11951 * If we return an error, and set *pmissing, then promoting that
11954 * If we return -EAGAIN, we will always set *pmissing to the missing
11955 * object to wait for.
11957 * If we return an error but do not set *pmissing, then we know the
11958 * object does not exist.
11960 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
11961 ObjectContextRef
*pobc
,
11963 bool map_snapid_to_clone
,
11964 hobject_t
*pmissing
)
11967 ceph_assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
11969 if (oid
.snap
== CEPH_NOSNAP
) {
11970 ObjectContextRef obc
= get_object_context(oid
, can_create
);
11976 dout(10) << __func__
<< " " << oid
11977 << " @" << oid
.snap
11978 << " oi=" << obc
->obs
.oi
11987 hobject_t head
= oid
.get_head();
11988 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
11989 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
11990 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
11992 *pmissing
= head
; // start by getting the head
11994 put_snapset_context(ssc
);
11998 if (map_snapid_to_clone
) {
11999 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12000 << " snapset " << ssc
->snapset
12001 << " map_snapid_to_clone=true" << dendl
;
12002 if (oid
.snap
> ssc
->snapset
.seq
) {
12003 // already must be readable
12004 ObjectContextRef obc
= get_object_context(head
, false);
12005 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12006 << " snapset " << ssc
->snapset
12007 << " maps to head" << dendl
;
12009 put_snapset_context(ssc
);
12010 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
12012 vector
<snapid_t
>::const_iterator citer
= std::find(
12013 ssc
->snapset
.clones
.begin(),
12014 ssc
->snapset
.clones
.end(),
12016 if (citer
== ssc
->snapset
.clones
.end()) {
12017 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12018 << " snapset " << ssc
->snapset
12019 << " maps to nothing" << dendl
;
12020 put_snapset_context(ssc
);
12024 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12025 << " snapset " << ssc
->snapset
12026 << " maps to " << oid
<< dendl
;
12028 if (recovery_state
.get_pg_log().get_missing().is_missing(oid
)) {
12029 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12030 << " snapset " << ssc
->snapset
12031 << " " << oid
<< " is missing" << dendl
;
12034 put_snapset_context(ssc
);
12038 ObjectContextRef obc
= get_object_context(oid
, false);
12039 if (!obc
|| !obc
->obs
.exists
) {
12040 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12041 << " snapset " << ssc
->snapset
12042 << " " << oid
<< " is not present" << dendl
;
12045 put_snapset_context(ssc
);
12048 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12049 << " snapset " << ssc
->snapset
12050 << " " << oid
<< " HIT" << dendl
;
12052 put_snapset_context(ssc
);
12055 ceph_abort(); //unreachable
12058 dout(10) << __func__
<< " " << oid
<< " @" << oid
.snap
12059 << " snapset " << ssc
->snapset
<< dendl
;
12062 if (oid
.snap
> ssc
->snapset
.seq
) {
12063 ObjectContextRef obc
= get_object_context(head
, false);
12064 dout(10) << __func__
<< " " << head
12065 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
12066 << " -- HIT " << obc
->obs
12071 ceph_assert(ssc
== obc
->ssc
);
12072 put_snapset_context(ssc
);
12078 // which clone would it be?
12080 while (k
< ssc
->snapset
.clones
.size() &&
12081 ssc
->snapset
.clones
[k
] < oid
.snap
)
12083 if (k
== ssc
->snapset
.clones
.size()) {
12084 dout(10) << __func__
<< " no clones with last >= oid.snap "
12085 << oid
.snap
<< " -- DNE" << dendl
;
12086 put_snapset_context(ssc
);
12089 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
12090 info
.pgid
.pool(), oid
.get_namespace());
12092 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
12093 dout(20) << __func__
<< " " << soid
<< " missing, try again later"
12097 put_snapset_context(ssc
);
12101 ObjectContextRef obc
= get_object_context(soid
, false);
12102 if (!obc
|| !obc
->obs
.exists
) {
12105 put_snapset_context(ssc
);
12106 if (is_primary()) {
12107 if (is_degraded_or_backfilling_object(soid
)) {
12108 dout(20) << __func__
<< " clone is degraded or backfilling " << soid
<< dendl
;
12110 } else if (is_degraded_on_async_recovery_target(soid
)) {
12111 dout(20) << __func__
<< " clone is recovering " << soid
<< dendl
;
12114 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
12118 dout(20) << __func__
<< " replica missing clone" << soid
<< dendl
;
12126 ceph_assert(obc
->ssc
== ssc
);
12127 put_snapset_context(ssc
);
12132 dout(20) << __func__
<< " " << soid
12133 << " snapset " << obc
->ssc
->snapset
12135 snapid_t first
, last
;
12136 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
12137 ceph_assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
12138 if (p
->second
.empty()) {
12139 dout(1) << __func__
<< " " << soid
<< " empty snapset -- DNE" << dendl
;
12140 ceph_assert(!cct
->_conf
->osd_debug_verify_snaps
);
12143 if (std::find(p
->second
.begin(), p
->second
.end(), oid
.snap
) ==
12145 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
12146 << " does not contain " << oid
.snap
<< " -- DNE" << dendl
;
12149 if (get_osdmap()->in_removed_snaps_queue(info
.pgid
.pgid
.pool(), oid
.snap
)) {
12150 dout(20) << __func__
<< " " << soid
<< " snap " << oid
.snap
12151 << " in removed_snaps_queue" << " -- DNE" << dendl
;
12154 dout(20) << __func__
<< " " << soid
<< " clone_snaps " << p
->second
12155 << " contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
12160 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
12163 put_snapset_context(obc
->ssc
);
12166 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
12168 object_info_t
& oi
= obc
->obs
.oi
;
12170 dout(10) << __func__
<< " " << oi
.soid
<< dendl
;
12171 ceph_assert(!oi
.soid
.is_snapdir());
12173 object_stat_sum_t stat
;
12174 stat
.num_objects
++;
12176 stat
.num_objects_dirty
++;
12177 if (oi
.is_whiteout())
12178 stat
.num_whiteouts
++;
12180 stat
.num_objects_omap
++;
12181 if (oi
.is_cache_pinned())
12182 stat
.num_objects_pinned
++;
12183 if (oi
.has_manifest())
12184 stat
.num_objects_manifest
++;
12186 if (oi
.soid
.is_snap()) {
12187 stat
.num_object_clones
++;
12190 obc
->ssc
= get_snapset_context(oi
.soid
, false);
12191 ceph_assert(obc
->ssc
);
12192 stat
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(oi
.soid
.snap
);
12194 stat
.num_bytes
+= oi
.size
;
12198 pgstat
->stats
.sum
.add(stat
);
12201 void PrimaryLogPG::requeue_op_blocked_by_object(const hobject_t
&soid
) {
12202 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
12203 if (p
!= waiting_for_blocked_object
.end()) {
12204 list
<OpRequestRef
>& ls
= p
->second
;
12205 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
12207 waiting_for_blocked_object
.erase(p
);
12211 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
12213 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
12214 if (obc
->is_blocked()) {
12215 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
12219 requeue_op_blocked_by_object(soid
);
12221 map
<hobject_t
, ObjectContextRef
>::iterator i
=
12222 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
12223 if (i
!= objects_blocked_on_snap_promotion
.end()) {
12224 ceph_assert(i
->second
== obc
);
12225 ObjectContextRef head_obc
= get_object_context(i
->first
, false);
12226 head_obc
->stop_block();
12227 // kick blocked ops (head)
12228 requeue_op_blocked_by_object(i
->first
);
12229 objects_blocked_on_snap_promotion
.erase(i
);
12232 if (obc
->requeue_scrub_on_unblock
) {
12234 obc
->requeue_scrub_on_unblock
= false;
12236 dout(20) << __func__
<< " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl
;
12238 // only requeue if we are still active: we may be unblocking
12239 // because we are resetting for a new peering interval
12241 osd
->queue_scrub_unblocking(this, is_scrub_blocking_ops());
12246 SnapSetContext
*PrimaryLogPG::get_snapset_context(
12247 const hobject_t
& oid
,
12249 const map
<string
, bufferlist
, less
<>> *attrs
,
12252 std::lock_guard
l(snapset_contexts_lock
);
12253 SnapSetContext
*ssc
;
12254 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
12255 oid
.get_snapdir());
12256 if (p
!= snapset_contexts
.end()) {
12257 if (can_create
|| p
->second
->exists
) {
12266 if (!(oid
.is_head() && !oid_existed
)) {
12267 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
12269 if (r
< 0 && !can_create
)
12272 auto it_ss
= attrs
->find(SS_ATTR
);
12273 ceph_assert(it_ss
!= attrs
->end());
12274 bv
= it_ss
->second
;
12276 ssc
= new SnapSetContext(oid
.get_snapdir());
12277 _register_snapset_context(ssc
);
12279 bufferlist::const_iterator bvp
= bv
.begin();
12281 ssc
->snapset
.decode(bvp
);
12282 } catch (const ceph::buffer::error
& e
) {
12283 dout(0) << __func__
<< " Can't decode snapset: " << e
.what() << dendl
;
12286 ssc
->exists
= true;
12288 ssc
->exists
= false;
12296 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
12298 std::lock_guard
l(snapset_contexts_lock
);
12300 if (ssc
->ref
== 0) {
12301 if (ssc
->registered
)
12302 snapset_contexts
.erase(ssc
->oid
);
12309 * NONE - didn't pull anything
12310 * YES - pulled what the caller wanted
12311 * HEAD - needed to pull head first
12313 enum { PULL_NONE
, PULL_HEAD
, PULL_YES
};
12315 int PrimaryLogPG::recover_missing(
12316 const hobject_t
&soid
, eversion_t v
,
12318 PGBackend::RecoveryHandle
*h
)
12320 dout(10) << __func__
<< " sar: " << scrub_after_recovery
<< dendl
;
12322 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
12323 dout(7) << __func__
<< " " << soid
12325 << " but it is unfound" << dendl
;
12329 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
12330 start_recovery_op(soid
);
12331 ceph_assert(!recovering
.count(soid
));
12332 recovering
.insert(make_pair(soid
, ObjectContextRef()));
12333 epoch_t cur_epoch
= get_osdmap_epoch();
12334 remove_missing_object(soid
, v
, new LambdaContext(
12336 std::scoped_lock locker
{*this};
12337 if (!pg_has_reset_since(cur_epoch
)) {
12338 bool object_missing
= false;
12339 for (const auto& shard
: get_acting_recovery_backfill()) {
12340 if (shard
== pg_whoami
)
12342 if (recovery_state
.get_peer_missing(shard
).is_missing(soid
)) {
12343 dout(20) << __func__
<< ": soid " << soid
<< " needs to be deleted from replica " << shard
<< dendl
;
12344 object_missing
= true;
12348 if (!object_missing
) {
12349 object_stat_sum_t stat_diff
;
12350 stat_diff
.num_objects_recovered
= 1;
12351 if (scrub_after_recovery
)
12352 stat_diff
.num_objects_repaired
= 1;
12353 on_global_recover(soid
, stat_diff
, true);
12355 auto recovery_handle
= pgbackend
->open_recovery_op();
12356 pgbackend
->recover_delete_object(soid
, v
, recovery_handle
);
12357 pgbackend
->run_recovery_op(recovery_handle
, priority
);
12364 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
12365 ObjectContextRef obc
;
12366 ObjectContextRef head_obc
;
12367 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
12368 // do we have the head?
12369 hobject_t head
= soid
.get_head();
12370 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
12371 if (recovering
.count(head
)) {
12372 dout(10) << " missing but already recovering head " << head
<< dendl
;
12375 int r
= recover_missing(
12376 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
, priority
,
12378 if (r
!= PULL_NONE
)
12383 head_obc
= get_object_context(
12387 ceph_assert(head_obc
);
12389 start_recovery_op(soid
);
12390 ceph_assert(!recovering
.count(soid
));
12391 recovering
.insert(make_pair(soid
, obc
));
12392 int r
= pgbackend
->recover_object(
12398 // This is only a pull which shouldn't return an error
12399 ceph_assert(r
>= 0);
12403 void PrimaryLogPG::remove_missing_object(const hobject_t
&soid
,
12404 eversion_t v
, Context
*on_complete
)
12406 dout(20) << __func__
<< " " << soid
<< " " << v
<< dendl
;
12407 ceph_assert(on_complete
!= nullptr);
12409 ObjectStore::Transaction t
;
12410 remove_snap_mapped_object(t
, soid
);
12412 ObjectRecoveryInfo recovery_info
;
12413 recovery_info
.soid
= soid
;
12414 recovery_info
.version
= v
;
12416 epoch_t cur_epoch
= get_osdmap_epoch();
12417 t
.register_on_complete(new LambdaContext(
12419 std::unique_lock locker
{*this};
12420 if (!pg_has_reset_since(cur_epoch
)) {
12421 ObjectStore::Transaction t2
;
12422 on_local_recover(soid
, recovery_info
, ObjectContextRef(), true, &t2
);
12423 t2
.register_on_complete(on_complete
);
12424 int r
= osd
->store
->queue_transaction(ch
, std::move(t2
), nullptr);
12425 ceph_assert(r
== 0);
12429 on_complete
->complete(-EAGAIN
);
12432 int r
= osd
->store
->queue_transaction(ch
, std::move(t
), nullptr);
12433 ceph_assert(r
== 0);
12436 void PrimaryLogPG::finish_degraded_object(const hobject_t oid
)
12438 dout(10) << __func__
<< " " << oid
<< dendl
;
12439 if (callbacks_for_degraded_object
.count(oid
)) {
12440 list
<Context
*> contexts
;
12441 contexts
.swap(callbacks_for_degraded_object
[oid
]);
12442 callbacks_for_degraded_object
.erase(oid
);
12443 for (list
<Context
*>::iterator i
= contexts
.begin();
12444 i
!= contexts
.end();
12449 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
12451 if (i
!= objects_blocked_on_degraded_snap
.end() &&
12452 i
->second
== oid
.snap
)
12453 objects_blocked_on_degraded_snap
.erase(i
);
12456 void PrimaryLogPG::_committed_pushed_object(
12457 epoch_t epoch
, eversion_t last_complete
)
12459 std::scoped_lock locker
{*this};
12460 if (!pg_has_reset_since(epoch
)) {
12461 recovery_state
.recovery_committed_to(last_complete
);
12463 dout(10) << __func__
12464 << " pg has changed, not touching last_complete_ondisk" << dendl
;
12468 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
12470 dout(20) << __func__
<< dendl
;
12472 dout(20) << "obc = " << *obc
<< dendl
;
12474 ceph_assert(active_pushes
>= 1);
12477 // requeue an active chunky scrub waiting on recovery ops
12478 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12479 is_scrub_active()) {
12481 osd
->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
12485 void PrimaryLogPG::_applied_recovered_object_replica()
12487 dout(20) << __func__
<< dendl
;
12488 ceph_assert(active_pushes
>= 1);
12491 // requeue an active scrub waiting on recovery ops
12492 if (!recovery_state
.is_deleting() && active_pushes
== 0 &&
12493 is_scrub_active()) {
12495 osd
->queue_scrub_replica_pushes(this, m_scrubber
->replica_op_priority());
12499 void PrimaryLogPG::on_failed_pull(
12500 const set
<pg_shard_t
> &from
,
12501 const hobject_t
&soid
,
12502 const eversion_t
&v
)
12504 dout(20) << __func__
<< ": " << soid
<< dendl
;
12505 ceph_assert(recovering
.count(soid
));
12506 auto obc
= recovering
[soid
];
12508 list
<OpRequestRef
> blocked_ops
;
12509 obc
->drop_recovery_read(&blocked_ops
);
12510 requeue_ops(blocked_ops
);
12512 recovering
.erase(soid
);
12513 for (auto&& i
: from
) {
12514 if (i
!= pg_whoami
) { // we'll get it below in primary_error
12515 recovery_state
.force_object_missing(i
, soid
, v
);
12519 dout(0) << __func__
<< " " << soid
<< " from shard " << from
12520 << ", reps on " << recovery_state
.get_missing_loc().get_locations(soid
)
12521 << " unfound? " << recovery_state
.get_missing_loc().is_unfound(soid
)
12523 finish_recovery_op(soid
); // close out this attempt,
12524 finish_degraded_object(soid
);
12526 if (from
.count(pg_whoami
)) {
12527 dout(0) << " primary missing oid " << soid
<< " version " << v
<< dendl
;
12528 primary_error(soid
, v
);
12529 backfills_in_flight
.erase(soid
);
12533 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
12536 pg_missing_item pmi
;
12537 bool is_missing
= recovery_state
.get_pg_log().get_missing().is_missing(oid
, &pmi
);
12538 ceph_assert(is_missing
);
12540 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
12542 ceph_assert(!get_acting_recovery_backfill().empty());
12543 for (set
<pg_shard_t
>::iterator i
= get_acting_recovery_backfill().begin();
12544 i
!= get_acting_recovery_backfill().end();
12546 if (*i
== get_primary()) continue;
12547 pg_shard_t peer
= *i
;
12548 if (!recovery_state
.get_peer_missing(peer
).is_missing(oid
)) {
12551 eversion_t h
= recovery_state
.get_peer_missing(peer
).get_items().at(oid
).have
;
12552 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
12557 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
12561 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
12563 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
12565 ceph_assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
12566 ObjectStore::Transaction t
;
12567 std::optional
<eversion_t
> op_trim_to
, op_roll_forward_to
;
12568 if (m
->pg_trim_to
!= eversion_t())
12569 op_trim_to
= m
->pg_trim_to
;
12570 if (m
->pg_roll_forward_to
!= eversion_t())
12571 op_roll_forward_to
= m
->pg_roll_forward_to
;
12573 dout(20) << __func__
12574 << " op_trim_to = " << op_trim_to
<< " op_roll_forward_to = " << op_roll_forward_to
<< dendl
;
12576 recovery_state
.append_log_entries_update_missing(
12577 m
->entries
, t
, op_trim_to
, op_roll_forward_to
);
12578 eversion_t new_lcod
= info
.last_complete
;
12580 Context
*complete
= new LambdaContext(
12582 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
12584 std::scoped_lock locker
{*this};
12585 if (!pg_has_reset_since(msg
->get_epoch())) {
12586 update_last_complete_ondisk(new_lcod
);
12587 MOSDPGUpdateLogMissingReply
*reply
=
12588 new MOSDPGUpdateLogMissingReply(
12589 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
12595 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
12596 msg
->get_connection()->send_message(reply
);
12600 if (get_osdmap()->require_osd_release
>= ceph_release_t::kraken
) {
12601 t
.register_on_commit(complete
);
12603 /* Hack to work around the fact that ReplicatedBackend sends
12604 * ack+commit if commit happens first
12606 * This behavior is no longer necessary, but we preserve it so old
12607 * primaries can keep their repops in order */
12608 if (pool
.info
.is_erasure()) {
12609 t
.register_on_complete(complete
);
12611 t
.register_on_commit(complete
);
12614 int tr
= osd
->store
->queue_transaction(
12618 ceph_assert(tr
== 0);
12619 op_applied(info
.last_update
);
12622 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
12624 const MOSDPGUpdateLogMissingReply
*m
=
12625 static_cast<const MOSDPGUpdateLogMissingReply
*>(
12627 dout(20) << __func__
<< " got reply from "
12628 << m
->get_from() << dendl
;
12630 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
12631 if (it
!= log_entry_update_waiting_on
.end()) {
12632 if (it
->second
.waiting_on
.count(m
->get_from())) {
12633 it
->second
.waiting_on
.erase(m
->get_from());
12634 if (m
->last_complete_ondisk
!= eversion_t()) {
12635 update_peer_last_complete_ondisk(m
->get_from(), m
->last_complete_ondisk
);
12639 << info
.pgid
<< " got reply "
12640 << *m
<< " from shard we are not waiting for "
12644 if (it
->second
.waiting_on
.empty()) {
12645 repop_all_committed(it
->second
.repop
.get());
12646 log_entry_update_waiting_on
.erase(it
);
12650 << info
.pgid
<< " got reply "
12651 << *m
<< " on unknown tid " << m
->get_tid();
12655 /* Mark all unfound objects as lost.
12657 void PrimaryLogPG::mark_all_unfound_lost(
12659 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
)
12661 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
12662 list
<hobject_t
> oids
;
12664 dout(30) << __func__
<< ": log before:\n";
12665 recovery_state
.get_pg_log().get_log().print(*_dout
);
12668 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
12670 utime_t mtime
= ceph_clock_now();
12671 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
12672 recovery_state
.get_missing_loc().get_needs_recovery().begin();
12673 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
12674 recovery_state
.get_missing_loc().get_needs_recovery().end();
12676 ObcLockManager manager
;
12677 eversion_t v
= get_next_version();
12678 v
.epoch
= get_osdmap_epoch();
12679 uint64_t num_unfound
= recovery_state
.get_missing_loc().num_unfound();
12680 while (m
!= mend
) {
12681 const hobject_t
&oid(m
->first
);
12682 if (!recovery_state
.get_missing_loc().is_unfound(oid
)) {
12683 // We only care about unfound objects
12688 ObjectContextRef obc
;
12692 case pg_log_entry_t::LOST_MARK
:
12693 ceph_abort_msg("actually, not implemented yet!");
12696 case pg_log_entry_t::LOST_REVERT
:
12697 prev
= pick_newest_available(oid
);
12698 if (prev
> eversion_t()) {
12701 pg_log_entry_t::LOST_REVERT
, oid
, v
,
12702 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
12703 e
.reverting_to
= prev
;
12704 e
.mark_unrollbackable();
12705 log_entries
.push_back(e
);
12706 dout(10) << e
<< dendl
;
12708 // we are now missing the new version; recovery code will sort it out.
12714 case pg_log_entry_t::LOST_DELETE
:
12716 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
12717 0, osd_reqid_t(), mtime
, 0);
12718 if (get_osdmap()->require_osd_release
>= ceph_release_t::jewel
) {
12719 if (pool
.info
.require_rollback()) {
12720 e
.mod_desc
.try_rmobject(v
.version
);
12722 e
.mark_unrollbackable();
12724 } // otherwise, just do what we used to do
12725 dout(10) << e
<< dendl
;
12726 log_entries
.push_back(e
);
12727 oids
.push_back(oid
);
12729 // If context found mark object as deleted in case
12730 // of racing with new creation. This can happen if
12731 // object lost and EIO at primary.
12732 obc
= object_contexts
.lookup(oid
);
12734 obc
->obs
.exists
= false;
12746 recovery_state
.update_stats(
12747 [](auto &history
, auto &stats
) {
12748 stats
.stats_invalid
= true;
12752 submit_log_entries(
12754 std::move(manager
),
12755 std::optional
<std::function
<void(void)> >(
12756 [this, oids
, num_unfound
, on_finish
]() {
12757 if (recovery_state
.perform_deletes_during_peering()) {
12758 for (auto oid
: oids
) {
12759 // clear old locations - merge_new_log_entries will have
12760 // handled rebuilding missing_loc for each of these
12761 // objects if we have the RECOVERY_DELETES flag
12762 recovery_state
.object_recovered(oid
, object_stat_sum_t());
12766 if (is_recovery_unfound()) {
12767 queue_peering_event(
12769 std::make_shared
<PGPeeringEvent
>(
12770 get_osdmap_epoch(),
12771 get_osdmap_epoch(),
12772 PeeringState::DoRecovery())));
12773 } else if (is_backfill_unfound()) {
12774 queue_peering_event(
12776 std::make_shared
<PGPeeringEvent
>(
12777 get_osdmap_epoch(),
12778 get_osdmap_epoch(),
12779 PeeringState::RequestBackfill())));
12785 ss
<< "pg has " << num_unfound
12786 << " objects unfound and apparently lost marking";
12787 string rs
= ss
.str();
12788 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
12789 osd
->clog
->info() << rs
;
12791 on_finish(0, rs
, empty
);
12796 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
12798 ceph_assert(repop_queue
.empty());
12802 * pg status change notification
12805 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
12807 list
<OpRequestRef
> rq
;
12809 // apply all repops
12810 while (!repop_queue
.empty()) {
12811 RepGather
*repop
= repop_queue
.front();
12812 repop_queue
.pop_front();
12813 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
12814 repop
->rep_aborted
= true;
12815 repop
->on_committed
.clear();
12816 repop
->on_success
.clear();
12820 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
12821 rq
.push_back(repop
->op
);
12822 repop
->op
= OpRequestRef();
12825 // also requeue any dups, interleaved into position
12826 auto p
= waiting_for_ondisk
.find(repop
->v
);
12827 if (p
!= waiting_for_ondisk
.end()) {
12828 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
12829 for (auto& i
: p
->second
) {
12830 rq
.push_back(std::get
<0>(i
));
12832 waiting_for_ondisk
.erase(p
);
12836 remove_repop(repop
);
12839 ceph_assert(repop_queue
.empty());
12843 if (!waiting_for_ondisk
.empty()) {
12844 for (auto& i
: waiting_for_ondisk
) {
12845 for (auto& j
: i
.second
) {
12846 derr
<< __func__
<< ": op " << *(std::get
<0>(j
)->get_req())
12847 << " waiting on " << i
.first
<< dendl
;
12850 ceph_assert(waiting_for_ondisk
.empty());
12854 waiting_for_ondisk
.clear();
12857 void PrimaryLogPG::on_flushed()
12859 requeue_ops(waiting_for_flush
);
12860 if (!is_peered() || !is_primary()) {
12861 pair
<hobject_t
, ObjectContextRef
> i
;
12862 while (object_contexts
.get_next(i
.first
, &i
)) {
12863 derr
<< __func__
<< ": object " << i
.first
<< " obc still alive" << dendl
;
12865 ceph_assert(object_contexts
.empty());
12869 void PrimaryLogPG::on_removal(ObjectStore::Transaction
&t
)
12871 dout(10) << __func__
<< dendl
;
12875 t
.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12878 void PrimaryLogPG::clear_async_reads()
12880 dout(10) << __func__
<< dendl
;
12881 for(auto& i
: in_progress_async_reads
) {
12882 dout(10) << "clear ctx: "
12883 << "OpRequestRef " << i
.first
12884 << " OpContext " << i
.second
12886 close_op_ctx(i
.second
);
12890 void PrimaryLogPG::clear_cache()
12892 object_contexts
.clear();
12895 void PrimaryLogPG::on_shutdown()
12897 dout(10) << __func__
<< dendl
;
12899 if (recovery_queued
) {
12900 recovery_queued
= false;
12901 osd
->clear_queued_recovery(this);
12904 m_scrubber
->scrub_clear_state();
12905 m_scrubber
->rm_from_osd_scrubbing();
12907 vector
<ceph_tid_t
> tids
;
12908 cancel_copy_ops(false, &tids
);
12909 cancel_flush_ops(false, &tids
);
12910 cancel_proxy_ops(false, &tids
);
12911 cancel_manifest_ops(false, &tids
);
12912 cancel_cls_gather_ops(false, &tids
);
12913 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
12915 apply_and_flush_repops(false);
12916 cancel_log_updates();
12917 // we must remove PGRefs, so do this this prior to release_backoffs() callers
12919 // clean up snap trim references
12920 snap_trimmer_machine
.process_event(Reset());
12922 pgbackend
->on_change();
12924 context_registry_on_change();
12925 object_contexts
.clear();
12927 clear_async_reads();
12929 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
12930 osd
->local_reserver
.cancel_reservation(info
.pgid
);
12932 clear_primary_state();
12935 if (is_primary()) {
12936 osd
->clear_ready_to_merge(this);
12940 void PrimaryLogPG::on_activate_complete()
12944 if (!recovery_state
.needs_flush()) {
12945 requeue_ops(waiting_for_peered
);
12946 } else if (!waiting_for_peered
.empty()) {
12947 dout(10) << __func__
<< " flushes in progress, moving "
12948 << waiting_for_peered
.size()
12949 << " items to waiting_for_flush"
12951 ceph_assert(waiting_for_flush
.empty());
12952 waiting_for_flush
.swap(waiting_for_peered
);
12957 if (needs_recovery()) {
12958 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
12959 queue_peering_event(
12961 std::make_shared
<PGPeeringEvent
>(
12962 get_osdmap_epoch(),
12963 get_osdmap_epoch(),
12964 PeeringState::DoRecovery())));
12965 } else if (needs_backfill()) {
12966 dout(10) << "activate queueing backfill" << dendl
;
12967 queue_peering_event(
12969 std::make_shared
<PGPeeringEvent
>(
12970 get_osdmap_epoch(),
12971 get_osdmap_epoch(),
12972 PeeringState::RequestBackfill())));
12974 dout(10) << "activate all replicas clean, no recovery" << dendl
;
12975 queue_peering_event(
12977 std::make_shared
<PGPeeringEvent
>(
12978 get_osdmap_epoch(),
12979 get_osdmap_epoch(),
12980 PeeringState::AllReplicasRecovered())));
12983 publish_stats_to_osd();
12985 if (get_backfill_targets().size()) {
12986 last_backfill_started
= recovery_state
.earliest_backfill();
12987 new_backfill
= true;
12988 ceph_assert(!last_backfill_started
.is_max());
12989 dout(5) << __func__
<< ": bft=" << get_backfill_targets()
12990 << " from " << last_backfill_started
<< dendl
;
12991 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
12992 i
!= get_backfill_targets().end();
12994 dout(5) << "target shard " << *i
12995 << " from " << recovery_state
.get_peer_info(*i
).last_backfill
13004 void PrimaryLogPG::on_change(ObjectStore::Transaction
&t
)
13006 dout(10) << __func__
<< dendl
;
13008 if (hit_set
&& hit_set
->insert_count() == 0) {
13009 dout(20) << " discarding empty hit_set" << dendl
;
13013 if (recovery_queued
) {
13014 recovery_queued
= false;
13015 osd
->clear_queued_recovery(this);
13018 // requeue everything in the reverse order they should be
13020 requeue_ops(waiting_for_peered
);
13021 requeue_ops(waiting_for_flush
);
13022 requeue_ops(waiting_for_active
);
13023 requeue_ops(waiting_for_readable
);
13025 vector
<ceph_tid_t
> tids
;
13026 cancel_copy_ops(is_primary(), &tids
);
13027 cancel_flush_ops(is_primary(), &tids
);
13028 cancel_proxy_ops(is_primary(), &tids
);
13029 cancel_manifest_ops(is_primary(), &tids
);
13030 cancel_cls_gather_ops(is_primary(), &tids
);
13031 osd
->objecter
->op_cancel(tids
, -ECANCELED
);
13033 // requeue object waiters
13034 for (auto& p
: waiting_for_unreadable_object
) {
13035 release_backoffs(p
.first
);
13037 if (is_primary()) {
13038 requeue_object_waiters(waiting_for_unreadable_object
);
13040 waiting_for_unreadable_object
.clear();
13042 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
13043 p
!= waiting_for_degraded_object
.end();
13044 waiting_for_degraded_object
.erase(p
++)) {
13045 release_backoffs(p
->first
);
13047 requeue_ops(p
->second
);
13050 finish_degraded_object(p
->first
);
13053 // requeues waiting_for_scrub
13054 m_scrubber
->scrub_clear_state();
13056 for (auto p
= waiting_for_blocked_object
.begin();
13057 p
!= waiting_for_blocked_object
.end();
13058 waiting_for_blocked_object
.erase(p
++)) {
13060 requeue_ops(p
->second
);
13064 for (auto i
= callbacks_for_degraded_object
.begin();
13065 i
!= callbacks_for_degraded_object
.end();
13067 finish_degraded_object((i
++)->first
);
13069 ceph_assert(callbacks_for_degraded_object
.empty());
13071 if (is_primary()) {
13072 requeue_ops(waiting_for_cache_not_full
);
13074 waiting_for_cache_not_full
.clear();
13076 objects_blocked_on_cache_full
.clear();
13078 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
13079 in_progress_async_reads
.begin();
13080 i
!= in_progress_async_reads
.end();
13081 in_progress_async_reads
.erase(i
++)) {
13082 close_op_ctx(i
->second
);
13084 requeue_op(i
->first
);
13087 // this will requeue ops we were working on but didn't finish, and
13089 apply_and_flush_repops(is_primary());
13090 cancel_log_updates();
13092 // do this *after* apply_and_flush_repops so that we catch any newly
13093 // registered watches.
13094 context_registry_on_change();
13096 pgbackend
->on_change_cleanup(&t
);
13097 m_scrubber
->cleanup_store(&t
);
13098 pgbackend
->on_change();
13100 // clear snap_trimmer state
13101 snap_trimmer_machine
.process_event(Reset());
13103 debug_op_order
.clear();
13104 unstable_stats
.clear();
13106 // we don't want to cache object_contexts through the interval change
13107 // NOTE: we actually assert that all currently live references are dead
13108 // by the time the flush for the next interval completes.
13109 object_contexts
.clear();
13111 // should have been cleared above by finishing all of the degraded objects
13112 ceph_assert(objects_blocked_on_degraded_snap
.empty());
13115 void PrimaryLogPG::plpg_on_role_change()
13117 dout(10) << __func__
<< dendl
;
13118 if (get_role() != 0 && hit_set
) {
13119 dout(10) << " clearing hit set" << dendl
;
13124 void PrimaryLogPG::plpg_on_pool_change()
13126 dout(10) << __func__
<< dendl
;
13127 // requeue cache full waiters just in case the cache_mode is
13128 // changing away from writeback mode. note that if we are not
13129 // active the normal requeuing machinery is sufficient (and properly
13132 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
13133 !waiting_for_cache_not_full
.empty()) {
13134 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
13136 requeue_ops(waiting_for_cache_not_full
);
13137 objects_blocked_on_cache_full
.clear();
13143 // clear state. called on recovery completion AND cancellation.
13144 void PrimaryLogPG::_clear_recovery_state()
13146 #ifdef DEBUG_RECOVERY_OIDS
13147 recovering_oids
.clear();
13149 dout(15) << __func__
<< " flags: " << m_planned_scrub
<< dendl
;
13151 last_backfill_started
= hobject_t();
13152 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
13153 while (i
!= backfills_in_flight
.end()) {
13154 backfills_in_flight
.erase(i
++);
13157 list
<OpRequestRef
> blocked_ops
;
13158 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
13159 i
!= recovering
.end();
13160 recovering
.erase(i
++)) {
13162 i
->second
->drop_recovery_read(&blocked_ops
);
13163 requeue_ops(blocked_ops
);
13166 ceph_assert(backfills_in_flight
.empty());
13167 pending_backfill_updates
.clear();
13168 ceph_assert(recovering
.empty());
13169 pgbackend
->clear_recovery_state();
13172 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
13174 dout(20) << __func__
<< ": " << soid
<< dendl
;
13175 ceph_assert(recovering
.count(soid
));
13176 ObjectContextRef obc
= recovering
[soid
];
13178 list
<OpRequestRef
> blocked_ops
;
13179 obc
->drop_recovery_read(&blocked_ops
);
13180 requeue_ops(blocked_ops
);
13182 recovering
.erase(soid
);
13183 finish_recovery_op(soid
);
13184 release_backoffs(soid
);
13185 if (waiting_for_degraded_object
.count(soid
)) {
13186 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
13187 requeue_ops(waiting_for_degraded_object
[soid
]);
13188 waiting_for_degraded_object
.erase(soid
);
13190 if (waiting_for_unreadable_object
.count(soid
)) {
13191 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
13192 requeue_ops(waiting_for_unreadable_object
[soid
]);
13193 waiting_for_unreadable_object
.erase(soid
);
13195 if (is_missing_object(soid
))
13196 recovery_state
.set_last_requested(0);
13197 finish_degraded_object(soid
);
13200 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
13202 pgbackend
->check_recovery_sources(osdmap
);
13205 bool PrimaryLogPG::start_recovery_ops(
13207 ThreadPool::TPHandle
&handle
,
13208 uint64_t *ops_started
)
13210 uint64_t& started
= *ops_started
;
13212 bool work_in_progress
= false;
13213 bool recovery_started
= false;
13214 ceph_assert(is_primary());
13215 ceph_assert(is_peered());
13216 ceph_assert(!recovery_state
.is_deleting());
13218 ceph_assert(recovery_queued
);
13219 recovery_queued
= false;
13221 if (!state_test(PG_STATE_RECOVERING
) &&
13222 !state_test(PG_STATE_BACKFILLING
)) {
13223 /* TODO: I think this case is broken and will make do_recovery()
13224 * unhappy since we're returning false */
13225 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
13226 return have_unfound();
13229 const auto &missing
= recovery_state
.get_pg_log().get_missing();
13231 uint64_t num_unfound
= get_num_unfound();
13233 if (!recovery_state
.have_missing()) {
13234 recovery_state
.local_recovery_complete();
13237 if (!missing
.have_missing() || // Primary does not have missing
13238 // or all of the missing objects are unfound.
13239 recovery_state
.all_missing_unfound()) {
13240 // Recover the replicas.
13241 started
= recover_replicas(max
, handle
, &recovery_started
);
13244 // We still have missing objects that we should grab from replicas.
13245 started
+= recover_primary(max
, handle
);
13247 if (!started
&& num_unfound
!= get_num_unfound()) {
13248 // second chance to recovery replicas
13249 started
= recover_replicas(max
, handle
, &recovery_started
);
13252 if (started
|| recovery_started
)
13253 work_in_progress
= true;
13255 bool deferred_backfill
= false;
13256 if (recovering
.empty() &&
13257 state_test(PG_STATE_BACKFILLING
) &&
13258 !get_backfill_targets().empty() && started
< max
&&
13259 missing
.num_missing() == 0 &&
13260 waiting_on_backfill
.empty()) {
13261 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
13262 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
13263 deferred_backfill
= true;
13264 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
13266 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
13267 deferred_backfill
= true;
13268 } else if (!recovery_state
.is_backfill_reserved()) {
13269 /* DNMNOTE I think this branch is dead */
13270 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
13271 if (!backfill_reserving
) {
13272 dout(10) << "queueing RequestBackfill" << dendl
;
13273 backfill_reserving
= true;
13274 queue_peering_event(
13276 std::make_shared
<PGPeeringEvent
>(
13277 get_osdmap_epoch(),
13278 get_osdmap_epoch(),
13279 PeeringState::RequestBackfill())));
13281 deferred_backfill
= true;
13283 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
13287 dout(10) << " started " << started
<< dendl
;
13288 osd
->logger
->inc(l_osd_rop
, started
);
13290 if (!recovering
.empty() ||
13291 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
13292 return !work_in_progress
&& have_unfound();
13294 ceph_assert(recovering
.empty());
13295 ceph_assert(recovery_ops_active
== 0);
13297 dout(10) << __func__
<< " needs_recovery: "
13298 << recovery_state
.get_missing_loc().get_needs_recovery()
13300 dout(10) << __func__
<< " missing_loc: "
13301 << recovery_state
.get_missing_loc().get_missing_locs()
13303 int unfound
= get_num_unfound();
13305 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
13309 if (missing
.num_missing() > 0) {
13310 // this shouldn't happen!
13311 osd
->clog
->error() << info
.pgid
<< " Unexpected Error: recovery ending with "
13312 << missing
.num_missing() << ": " << missing
.get_items();
13316 if (needs_recovery()) {
13317 // this shouldn't happen!
13318 // We already checked num_missing() so we must have missing replicas
13319 osd
->clog
->error() << info
.pgid
13320 << " Unexpected Error: recovery ending with missing replicas";
13324 if (state_test(PG_STATE_RECOVERING
)) {
13325 state_clear(PG_STATE_RECOVERING
);
13326 state_clear(PG_STATE_FORCED_RECOVERY
);
13327 if (needs_backfill()) {
13328 dout(10) << "recovery done, queuing backfill" << dendl
;
13329 queue_peering_event(
13331 std::make_shared
<PGPeeringEvent
>(
13332 get_osdmap_epoch(),
13333 get_osdmap_epoch(),
13334 PeeringState::RequestBackfill())));
13336 dout(10) << "recovery done, no backfill" << dendl
;
13337 state_clear(PG_STATE_FORCED_BACKFILL
);
13338 queue_peering_event(
13340 std::make_shared
<PGPeeringEvent
>(
13341 get_osdmap_epoch(),
13342 get_osdmap_epoch(),
13343 PeeringState::AllReplicasRecovered())));
13345 } else { // backfilling
13346 state_clear(PG_STATE_BACKFILLING
);
13347 state_clear(PG_STATE_FORCED_BACKFILL
);
13348 state_clear(PG_STATE_FORCED_RECOVERY
);
13349 dout(10) << "recovery done, backfill done" << dendl
;
13350 queue_peering_event(
13352 std::make_shared
<PGPeeringEvent
>(
13353 get_osdmap_epoch(),
13354 get_osdmap_epoch(),
13355 PeeringState::Backfilled())));
13362 * do one recovery op.
13363 * return true if done, false if nothing left to do.
13365 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
13367 ceph_assert(is_primary());
13369 const auto &missing
= recovery_state
.get_pg_log().get_missing();
13371 dout(10) << __func__
<< " recovering " << recovering
.size()
13373 << " missing " << missing
<< dendl
;
13375 dout(25) << __func__
<< " " << missing
.get_items() << dendl
;
13378 pg_log_entry_t
*latest
= 0;
13379 unsigned started
= 0;
13382 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13383 map
<version_t
, hobject_t
>::const_iterator p
=
13384 missing
.get_rmissing().lower_bound(recovery_state
.get_pg_log().get_log().last_requested
);
13385 while (p
!= missing
.get_rmissing().end()) {
13386 handle
.reset_tp_timeout();
13388 version_t v
= p
->first
;
13390 auto it_objects
= recovery_state
.get_pg_log().get_log().objects
.find(p
->second
);
13391 if (it_objects
!= recovery_state
.get_pg_log().get_log().objects
.end()) {
13392 latest
= it_objects
->second
;
13393 ceph_assert(latest
->is_update() || latest
->is_delete());
13394 soid
= latest
->soid
;
13399 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
13402 hobject_t head
= soid
.get_head();
13404 eversion_t need
= item
.need
;
13406 dout(10) << __func__
<< " "
13407 << soid
<< " " << item
.need
13408 << (missing
.is_missing(soid
) ? " (missing)":"")
13409 << (missing
.is_missing(head
) ? " (missing head)":"")
13410 << (recovering
.count(soid
) ? " (recovering)":"")
13411 << (recovering
.count(head
) ? " (recovering head)":"")
13415 switch (latest
->op
) {
13416 case pg_log_entry_t::CLONE
:
13418 * Handling for this special case removed for now, until we
13419 * can correctly construct an accurate SnapSet from the old
13424 case pg_log_entry_t::LOST_REVERT
:
13426 if (item
.have
== latest
->reverting_to
) {
13427 ObjectContextRef obc
= get_object_context(soid
, true);
13429 if (obc
->obs
.oi
.version
== latest
->version
) {
13430 // I'm already reverting
13431 dout(10) << " already reverting " << soid
<< dendl
;
13433 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
13434 obc
->obs
.oi
.version
= latest
->version
;
13436 ObjectStore::Transaction t
;
13438 obc
->obs
.oi
.encode(
13440 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
13441 ceph_assert(!pool
.info
.require_rollback());
13442 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
13444 recovery_state
.recover_got(
13452 t
.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
13453 t
.register_on_commit(new C_OSD_CommittedPushedObject(
13455 get_osdmap_epoch(),
13456 info
.last_complete
));
13457 osd
->store
->queue_transaction(ch
, std::move(t
));
13462 * Pull the old version of the object. Update missing_loc here to have the location
13463 * of the version we want.
13465 * This doesn't use the usual missing_loc paths, but that's okay:
13466 * - if we have it locally, we hit the case above, and go from there.
13467 * - if we don't, we always pass through this case during recovery and set up the location
13469 * - this way we don't need to mangle the missing code to be general about needing an old
13472 eversion_t alternate_need
= latest
->reverting_to
;
13473 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
13475 set
<pg_shard_t
> good_peers
;
13476 for (auto p
= recovery_state
.get_peer_missing().begin();
13477 p
!= recovery_state
.get_peer_missing().end();
13479 if (p
->second
.is_missing(soid
, need
) &&
13480 p
->second
.get_items().at(soid
).have
== alternate_need
) {
13481 good_peers
.insert(p
->first
);
13484 recovery_state
.set_revert_with_targets(
13487 dout(10) << " will pull " << alternate_need
<< " or " << need
13489 << recovery_state
.get_missing_loc().get_locations(soid
)
13497 if (!recovering
.count(soid
)) {
13498 if (recovering
.count(head
)) {
13501 int r
= recover_missing(
13502 soid
, need
, recovery_state
.get_recovery_op_priority(), h
);
13515 if (started
>= max
)
13520 // only advance last_requested if we haven't skipped anything
13522 recovery_state
.set_last_requested(v
);
13525 pgbackend
->run_recovery_op(h
, recovery_state
.get_recovery_op_priority());
13529 bool PrimaryLogPG::primary_error(
13530 const hobject_t
& soid
, eversion_t v
)
13532 recovery_state
.force_object_missing(pg_whoami
, soid
, v
);
13533 bool uhoh
= recovery_state
.get_missing_loc().is_unfound(soid
);
13535 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13536 << soid
<< ", unfound";
13538 osd
->clog
->error() << info
.pgid
<< " missing primary copy of "
13540 << ", will try copies on "
13541 << recovery_state
.get_missing_loc().get_locations(soid
);
13545 int PrimaryLogPG::prep_object_replica_deletes(
13546 const hobject_t
& soid
, eversion_t v
,
13547 PGBackend::RecoveryHandle
*h
,
13548 bool *work_started
)
13550 ceph_assert(is_primary());
13551 dout(10) << __func__
<< ": on " << soid
<< dendl
;
13553 ObjectContextRef obc
= get_object_context(soid
, false);
13555 if (!obc
->get_recovery_read()) {
13556 dout(20) << "replica delete delayed on " << soid
13557 << "; could not get rw_manager lock" << dendl
;
13558 *work_started
= true;
13561 dout(20) << "replica delete got recovery read lock on " << soid
13566 start_recovery_op(soid
);
13567 ceph_assert(!recovering
.count(soid
));
13569 recovering
.insert(make_pair(soid
, ObjectContextRef()));
13571 recovering
.insert(make_pair(soid
, obc
));
13573 pgbackend
->recover_delete_object(soid
, v
, h
);
13577 int PrimaryLogPG::prep_object_replica_pushes(
13578 const hobject_t
& soid
, eversion_t v
,
13579 PGBackend::RecoveryHandle
*h
,
13580 bool *work_started
)
13582 ceph_assert(is_primary());
13583 dout(10) << __func__
<< ": on " << soid
<< dendl
;
13585 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
13586 // do we have the head and/or snapdir?
13587 hobject_t head
= soid
.get_head();
13588 if (recovery_state
.get_pg_log().get_missing().is_missing(head
)) {
13589 if (recovering
.count(head
)) {
13590 dout(10) << " missing but already recovering head " << head
<< dendl
;
13593 int r
= recover_missing(
13594 head
, recovery_state
.get_pg_log().get_missing().get_items().find(head
)->second
.need
,
13595 recovery_state
.get_recovery_op_priority(), h
);
13596 if (r
!= PULL_NONE
)
13603 // NOTE: we know we will get a valid oloc off of disk here.
13604 ObjectContextRef obc
= get_object_context(soid
, false);
13606 primary_error(soid
, v
);
13610 if (!obc
->get_recovery_read()) {
13611 dout(20) << "recovery delayed on " << soid
13612 << "; could not get rw_manager lock" << dendl
;
13613 *work_started
= true;
13616 dout(20) << "recovery got recovery read lock on " << soid
13620 start_recovery_op(soid
);
13621 ceph_assert(!recovering
.count(soid
));
13622 recovering
.insert(make_pair(soid
, obc
));
13624 int r
= pgbackend
->recover_object(
13627 ObjectContextRef(),
13628 obc
, // has snapset context
13631 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
13632 on_failed_pull({ pg_whoami
}, soid
, v
);
13638 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
,
13639 bool *work_started
)
13641 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
13642 uint64_t started
= 0;
13644 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13646 // this is FAR from an optimal recovery order. pretty lame, really.
13647 ceph_assert(!get_acting_recovery_backfill().empty());
13648 // choose replicas to recover, replica has the shortest missing list first
13649 // so we can bring it back to normal ASAP
13650 std::vector
<std::pair
<unsigned int, pg_shard_t
>> replicas_by_num_missing
,
13651 async_by_num_missing
;
13652 replicas_by_num_missing
.reserve(get_acting_recovery_backfill().size() - 1);
13653 for (auto &p
: get_acting_recovery_backfill()) {
13654 if (p
== get_primary()) {
13657 auto pm
= recovery_state
.get_peer_missing().find(p
);
13658 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13659 auto nm
= pm
->second
.num_missing();
13661 if (is_async_recovery_target(p
)) {
13662 async_by_num_missing
.push_back(make_pair(nm
, p
));
13664 replicas_by_num_missing
.push_back(make_pair(nm
, p
));
13668 // sort by number of missing objects, in ascending order.
13669 auto func
= [](const std::pair
<unsigned int, pg_shard_t
> &lhs
,
13670 const std::pair
<unsigned int, pg_shard_t
> &rhs
) {
13671 return lhs
.first
< rhs
.first
;
13673 // acting goes first
13674 std::sort(replicas_by_num_missing
.begin(), replicas_by_num_missing
.end(), func
);
13675 // then async_recovery_targets
13676 std::sort(async_by_num_missing
.begin(), async_by_num_missing
.end(), func
);
13677 replicas_by_num_missing
.insert(replicas_by_num_missing
.end(),
13678 async_by_num_missing
.begin(), async_by_num_missing
.end());
13679 for (auto &replica
: replicas_by_num_missing
) {
13680 pg_shard_t
&peer
= replica
.second
;
13681 ceph_assert(peer
!= get_primary());
13682 auto pm
= recovery_state
.get_peer_missing().find(peer
);
13683 ceph_assert(pm
!= recovery_state
.get_peer_missing().end());
13684 size_t m_sz
= pm
->second
.num_missing();
13686 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
13687 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
13690 const pg_missing_t
&m(pm
->second
);
13691 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
13692 p
!= m
.get_rmissing().end() && started
< max
;
13694 handle
.reset_tp_timeout();
13695 const hobject_t
soid(p
->second
);
13697 if (recovery_state
.get_missing_loc().is_unfound(soid
)) {
13698 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
13702 const pg_info_t
&pi
= recovery_state
.get_peer_info(peer
);
13703 if (soid
> pi
.last_backfill
) {
13704 if (!recovering
.count(soid
)) {
13705 derr
<< __func__
<< ": object " << soid
<< " last_backfill "
13706 << pi
.last_backfill
<< dendl
;
13707 derr
<< __func__
<< ": object added to missing set for backfill, but "
13708 << "is not in recovering, error!" << dendl
;
13714 if (recovering
.count(soid
)) {
13715 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
13719 if (recovery_state
.get_missing_loc().is_deleted(soid
)) {
13720 dout(10) << __func__
<< ": " << soid
<< " is a delete, removing" << dendl
;
13721 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13722 started
+= prep_object_replica_deletes(soid
, r
->second
.need
, h
, work_started
);
13726 if (soid
.is_snap() &&
13727 recovery_state
.get_pg_log().get_missing().is_missing(
13728 soid
.get_head())) {
13729 dout(10) << __func__
<< ": " << soid
.get_head()
13730 << " still missing on primary" << dendl
;
13734 if (recovery_state
.get_pg_log().get_missing().is_missing(soid
)) {
13735 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
13739 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
13740 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
13741 started
+= prep_object_replica_pushes(soid
, r
->second
.need
, h
, work_started
);
13745 pgbackend
->run_recovery_op(h
, recovery_state
.get_recovery_op_priority());
13749 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
13751 hobject_t e
= hobject_t::get_max();
13752 for (const pg_shard_t
& peer
: get_backfill_targets()) {
13753 const auto iter
= peer_backfill_info
.find(peer
);
13754 ceph_assert(iter
!= peer_backfill_info
.end());
13755 e
= std::min(e
, iter
->second
.begin
);
13760 bool PrimaryLogPG::all_peer_done() const
13762 // Primary hasn't got any more objects
13763 ceph_assert(backfill_info
.empty());
13765 for (const pg_shard_t
& bt
: get_backfill_targets()) {
13766 const auto piter
= peer_backfill_info
.find(bt
);
13767 ceph_assert(piter
!= peer_backfill_info
.end());
13768 const BackfillInterval
& pbi
= piter
->second
;
13769 // See if peer has more to process
13770 if (!pbi
.extends_to_end() || !pbi
.empty())
13781 * backfilled: fully pushed to replica or present in replica's missing set (both
13782 * our copy and theirs).
13784 * All objects on a backfill_target in
13785 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13786 * objects have been actually deleted and all logically-valid objects are replicated.
13787 * There may be PG objects in this interval yet to be backfilled.
13789 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13790 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13792 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13793 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13794 * interval remain on the backfill target.
13796 * For a backfill target, all objects <= peer_info[target].last_backfill
13797 * have been backfilled to target
13799 * There *MAY* be missing/outdated objects between last_backfill_started and
13800 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13801 * io created objects since the last scan. For this reason, we call
13802 * update_range() again before continuing backfill.
13804 uint64_t PrimaryLogPG::recover_backfill(
13806 ThreadPool::TPHandle
&handle
, bool *work_started
)
13808 dout(10) << __func__
<< " (" << max
<< ")"
13809 << " bft=" << get_backfill_targets()
13810 << " last_backfill_started " << last_backfill_started
13811 << (new_backfill
? " new_backfill":"")
13813 ceph_assert(!get_backfill_targets().empty());
13815 // Initialize from prior backfill state
13816 if (new_backfill
) {
13817 // on_activate() was called prior to getting here
13818 ceph_assert(last_backfill_started
== recovery_state
.earliest_backfill());
13819 new_backfill
= false;
13821 // initialize BackfillIntervals
13822 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13823 i
!= get_backfill_targets().end();
13825 peer_backfill_info
[*i
].reset(
13826 recovery_state
.get_peer_info(*i
).last_backfill
);
13828 backfill_info
.reset(last_backfill_started
);
13830 backfills_in_flight
.clear();
13831 pending_backfill_updates
.clear();
13834 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13835 i
!= get_backfill_targets().end();
13837 dout(10) << "peer osd." << *i
13838 << " info " << recovery_state
.get_peer_info(*i
)
13839 << " interval " << peer_backfill_info
[*i
].begin
13840 << "-" << peer_backfill_info
[*i
].end
13841 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
13845 // update our local interval to cope with recent changes
13846 backfill_info
.begin
= last_backfill_started
;
13847 update_range(&backfill_info
, handle
);
13850 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
13851 set
<hobject_t
> add_to_stat
;
13853 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13854 i
!= get_backfill_targets().end();
13856 peer_backfill_info
[*i
].trim_to(
13858 recovery_state
.get_peer_info(*i
).last_backfill
,
13859 last_backfill_started
));
13861 backfill_info
.trim_to(last_backfill_started
);
13863 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
13864 while (ops
< max
) {
13865 if (backfill_info
.begin
<= earliest_peer_backfill() &&
13866 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
13867 hobject_t next
= backfill_info
.end
;
13868 backfill_info
.reset(next
);
13869 backfill_info
.end
= hobject_t::get_max();
13870 update_range(&backfill_info
, handle
);
13871 backfill_info
.trim();
13874 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
13876 bool sent_scan
= false;
13877 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13878 i
!= get_backfill_targets().end();
13880 pg_shard_t bt
= *i
;
13881 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13883 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
13884 if (pbi
.begin
<= backfill_info
.begin
&&
13885 !pbi
.extends_to_end() && pbi
.empty()) {
13886 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
13887 epoch_t e
= get_osdmap_epoch();
13888 MOSDPGScan
*m
= new MOSDPGScan(
13889 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, get_last_peering_reset(),
13890 spg_t(info
.pgid
.pgid
, bt
.shard
),
13891 pbi
.end
, hobject_t());
13893 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
13894 /* This guard preserves legacy WeightedPriorityQueue behavior for
13895 * now, but should be removed after Reef */
13896 m
->set_priority(recovery_state
.get_recovery_op_priority());
13898 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
13899 ceph_assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
13900 waiting_on_backfill
.insert(bt
);
13905 // Count simultaneous scans as a single op and let those complete
13908 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13912 if (backfill_info
.empty() && all_peer_done()) {
13913 dout(10) << " reached end for both local and all peers" << dendl
;
13917 // Get object within set of peers to operate on and
13918 // the set of targets for which that object applies.
13919 hobject_t check
= earliest_peer_backfill();
13921 if (check
< backfill_info
.begin
) {
13923 set
<pg_shard_t
> check_targets
;
13924 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13925 i
!= get_backfill_targets().end();
13927 pg_shard_t bt
= *i
;
13928 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13929 if (pbi
.begin
== check
)
13930 check_targets
.insert(bt
);
13932 ceph_assert(!check_targets
.empty());
13934 dout(20) << " BACKFILL removing " << check
13935 << " from peers " << check_targets
<< dendl
;
13936 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
13937 i
!= check_targets
.end();
13939 pg_shard_t bt
= *i
;
13940 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13941 ceph_assert(pbi
.begin
== check
);
13943 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
13947 last_backfill_started
= check
;
13949 // Don't increment ops here because deletions
13950 // are cheap and not replied to unlike real recovery_ops,
13951 // and we can't increment ops without requeueing ourself
13954 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
13956 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
13957 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
13958 i
!= get_backfill_targets().end();
13960 pg_shard_t bt
= *i
;
13961 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
13962 // Find all check peers that have the wrong version
13963 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
13964 if (pbi
.objects
.begin()->second
!= obj_v
) {
13965 need_ver_targs
.push_back(bt
);
13967 keep_ver_targs
.push_back(bt
);
13970 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
13972 // Only include peers that we've caught up to their backfill line
13973 // otherwise, they only appear to be missing this object
13974 // because their pbi.begin > backfill_info.begin.
13975 if (backfill_info
.begin
> pinfo
.last_backfill
)
13976 missing_targs
.push_back(bt
);
13978 skip_targs
.push_back(bt
);
13982 if (!keep_ver_targs
.empty()) {
13983 // These peers have version obj_v
13984 dout(20) << " BACKFILL keeping " << check
13985 << " with ver " << obj_v
13986 << " on peers " << keep_ver_targs
<< dendl
;
13987 //assert(!waiting_for_degraded_object.count(check));
13989 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
13990 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
13992 if (obc
->get_recovery_read()) {
13993 if (!need_ver_targs
.empty()) {
13994 dout(20) << " BACKFILL replacing " << check
13995 << " with ver " << obj_v
13996 << " to peers " << need_ver_targs
<< dendl
;
13998 if (!missing_targs
.empty()) {
13999 dout(20) << " BACKFILL pushing " << backfill_info
.begin
14000 << " with ver " << obj_v
14001 << " to peers " << missing_targs
<< dendl
;
14003 vector
<pg_shard_t
> all_push
= need_ver_targs
;
14004 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
14006 handle
.reset_tp_timeout();
14007 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
14009 *work_started
= true;
14010 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
14015 *work_started
= true;
14016 dout(20) << "backfill blocking on " << backfill_info
.begin
14017 << "; could not get rw_manager lock" << dendl
;
14021 dout(20) << "need_ver_targs=" << need_ver_targs
14022 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
14023 dout(20) << "backfill_targets=" << get_backfill_targets()
14024 << " missing_targs=" << missing_targs
14025 << " skip_targs=" << skip_targs
<< dendl
;
14027 last_backfill_started
= backfill_info
.begin
;
14028 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
14029 backfill_info
.pop_front();
14030 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
14031 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
14032 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
14033 i
!= check_targets
.end();
14035 pg_shard_t bt
= *i
;
14036 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
14042 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
14043 i
!= add_to_stat
.end();
14045 ObjectContextRef obc
= get_object_context(*i
, false);
14048 add_object_context_to_pg_stat(obc
, &stat
);
14049 pending_backfill_updates
[*i
] = stat
;
14051 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
14052 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
14053 handle
.reset_tp_timeout();
14054 const hobject_t
& oid
= to_remove
[i
].get
<0>();
14055 eversion_t v
= to_remove
[i
].get
<1>();
14056 pg_shard_t peer
= to_remove
[i
].get
<2>();
14057 MOSDPGBackfillRemove
*m
;
14058 auto it
= reqs
.find(peer
);
14059 if (it
!= reqs
.end()) {
14062 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
14063 spg_t(info
.pgid
.pgid
, peer
.shard
),
14064 get_osdmap_epoch());
14065 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
14066 /* This guard preserves legacy WeightedPriorityQueue behavior for
14067 * now, but should be removed after Reef */
14068 m
->set_priority(recovery_state
.get_recovery_op_priority());
14071 m
->ls
.push_back(make_pair(oid
, v
));
14073 if (oid
<= last_backfill_started
)
14074 pending_backfill_updates
[oid
]; // add empty stat!
14076 for (auto p
: reqs
) {
14077 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
14078 get_osdmap_epoch());
14081 pgbackend
->run_recovery_op(h
, recovery_state
.get_recovery_op_priority());
14083 hobject_t backfill_pos
=
14084 std::min(backfill_info
.begin
, earliest_peer_backfill());
14085 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
14086 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
14087 i
!= backfills_in_flight
.end();
14089 dout(20) << *i
<< " is still in flight" << dendl
;
14092 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
14093 backfill_pos
: *(backfills_in_flight
.begin());
14094 hobject_t new_last_backfill
= recovery_state
.earliest_backfill();
14095 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
14096 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
14097 pending_backfill_updates
.begin();
14098 i
!= pending_backfill_updates
.end() &&
14099 i
->first
< next_backfill_to_complete
;
14100 pending_backfill_updates
.erase(i
++)) {
14101 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
14102 ceph_assert(i
->first
> new_last_backfill
);
14103 // carried from a previous round – if we are here, then we had to
14104 // be requeued (by e.g. on_global_recover()) and those operations
14106 recovery_state
.update_complete_backfill_object_stats(
14109 new_last_backfill
= i
->first
;
14111 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
14113 ceph_assert(!pending_backfill_updates
.empty() ||
14114 new_last_backfill
== last_backfill_started
);
14115 if (pending_backfill_updates
.empty() &&
14116 backfill_pos
.is_max()) {
14117 ceph_assert(backfills_in_flight
.empty());
14118 new_last_backfill
= backfill_pos
;
14119 last_backfill_started
= backfill_pos
;
14121 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
14123 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
14124 // all the backfill targets. Otherwise, we will move last_backfill up on
14125 // those targets need it and send OP_BACKFILL_PROGRESS to them.
14126 for (set
<pg_shard_t
>::const_iterator i
= get_backfill_targets().begin();
14127 i
!= get_backfill_targets().end();
14129 pg_shard_t bt
= *i
;
14130 const pg_info_t
& pinfo
= recovery_state
.get_peer_info(bt
);
14132 if (new_last_backfill
> pinfo
.last_backfill
) {
14133 recovery_state
.update_peer_last_backfill(bt
, new_last_backfill
);
14134 epoch_t e
= get_osdmap_epoch();
14135 MOSDPGBackfill
*m
= NULL
;
14136 if (pinfo
.last_backfill
.is_max()) {
14137 m
= new MOSDPGBackfill(
14138 MOSDPGBackfill::OP_BACKFILL_FINISH
,
14140 get_last_peering_reset(),
14141 spg_t(info
.pgid
.pgid
, bt
.shard
));
14142 // Use default priority here, must match sub_op priority
14143 start_recovery_op(hobject_t::get_max());
14145 m
= new MOSDPGBackfill(
14146 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
14148 get_last_peering_reset(),
14149 spg_t(info
.pgid
.pgid
, bt
.shard
));
14150 // Use default priority here, must match sub_op priority
14152 m
->last_backfill
= pinfo
.last_backfill
;
14153 m
->stats
= pinfo
.stats
;
14155 if (cct
->_conf
->osd_op_queue
== "mclock_scheduler") {
14156 /* This guard preserves legacy WeightedPriorityQueue behavior for
14157 * now, but should be removed after Reef */
14158 m
->set_priority(recovery_state
.get_recovery_op_priority());
14161 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap_epoch());
14162 dout(10) << " peer " << bt
14163 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
14164 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
14169 *work_started
= true;
14173 int PrimaryLogPG::prep_backfill_object_push(
14174 hobject_t oid
, eversion_t v
,
14175 ObjectContextRef obc
,
14176 vector
<pg_shard_t
> peers
,
14177 PGBackend::RecoveryHandle
*h
)
14179 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
14180 ceph_assert(!peers
.empty());
14182 backfills_in_flight
.insert(oid
);
14183 recovery_state
.prepare_backfill_for_missing(oid
, v
, peers
);
14185 ceph_assert(!recovering
.count(oid
));
14187 start_recovery_op(oid
);
14188 recovering
.insert(make_pair(oid
, obc
));
14190 int r
= pgbackend
->recover_object(
14193 ObjectContextRef(),
14197 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
14198 on_failed_pull({ pg_whoami
}, oid
, v
);
14203 void PrimaryLogPG::update_range(
14204 BackfillInterval
*bi
,
14205 ThreadPool::TPHandle
&handle
)
14207 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
14208 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
14210 if (bi
->version
< info
.log_tail
) {
14211 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
14213 bi
->version
= info
.last_update
;
14214 scan_range(local_min
, local_max
, bi
, handle
);
14217 if (bi
->version
>= projected_last_update
) {
14218 dout(10) << __func__
<< ": bi is current " << dendl
;
14219 ceph_assert(bi
->version
== projected_last_update
);
14220 } else if (bi
->version
>= info
.log_tail
) {
14221 if (recovery_state
.get_pg_log().get_log().empty() && projected_log
.empty()) {
14222 /* Because we don't move log_tail on split, the log might be
14223 * empty even if log_tail != last_update. However, the only
14224 * way to get here with an empty log is if log_tail is actually
14225 * eversion_t(), because otherwise the entry which changed
14226 * last_update since the last scan would have to be present.
14228 ceph_assert(bi
->version
== eversion_t());
14232 dout(10) << __func__
<< ": bi is old, (" << bi
->version
14233 << ") can be updated with log to projected_last_update "
14234 << projected_last_update
<< dendl
;
14236 auto func
= [&](const pg_log_entry_t
&e
) {
14237 dout(10) << __func__
<< ": updating from version " << e
.version
14239 const hobject_t
&soid
= e
.soid
;
14240 if (soid
>= bi
->begin
&&
14242 if (e
.is_update()) {
14243 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
14244 << e
.version
<< dendl
;
14245 bi
->objects
.erase(e
.soid
);
14246 bi
->objects
.insert(
14250 } else if (e
.is_delete()) {
14251 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
14252 bi
->objects
.erase(e
.soid
);
14256 dout(10) << "scanning pg log first" << dendl
;
14257 recovery_state
.get_pg_log().get_log().scan_log_after(bi
->version
, func
);
14258 dout(10) << "scanning projected log" << dendl
;
14259 projected_log
.scan_log_after(bi
->version
, func
);
14260 bi
->version
= projected_last_update
;
14262 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
14266 void PrimaryLogPG::scan_range(
14267 int min
, int max
, BackfillInterval
*bi
,
14268 ThreadPool::TPHandle
&handle
)
14270 ceph_assert(is_locked());
14271 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
14272 bi
->clear_objects();
14274 vector
<hobject_t
> ls
;
14276 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
14277 ceph_assert(r
>= 0);
14278 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
14279 dout(20) << ls
<< dendl
;
14281 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
14282 handle
.reset_tp_timeout();
14283 ObjectContextRef obc
;
14285 obc
= object_contexts
.lookup(*p
);
14287 if (!obc
->obs
.exists
) {
14288 /* If the object does not exist here, it must have been removed
14289 * between the collection_list_partial and here. This can happen
14290 * for the first item in the range, which is usually last_backfill.
14294 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
14295 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
14298 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
14299 /* If the object does not exist here, it must have been removed
14300 * between the collection_list_partial and here. This can happen
14301 * for the first item in the range, which is usually last_backfill.
14306 ceph_assert(r
>= 0);
14307 object_info_t
oi(bl
);
14308 bi
->objects
[*p
] = oi
.version
;
14309 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
14317 * verifies that stray objects have been deleted
14319 void PrimaryLogPG::check_local()
14321 dout(10) << __func__
<< dendl
;
14324 info
.last_update
>=
14325 recovery_state
.get_pg_log().get_tail()); // otherwise we need some help!
14327 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
14330 // just scan the log.
14331 set
<hobject_t
> did
;
14332 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= recovery_state
.get_pg_log().get_log().log
.rbegin();
14333 p
!= recovery_state
.get_pg_log().get_log().log
.rend();
14335 if (did
.count(p
->soid
))
14337 did
.insert(p
->soid
);
14339 if (p
->is_delete() && !is_missing_object(p
->soid
)) {
14340 dout(10) << " checking " << p
->soid
14341 << " at " << p
->version
<< dendl
;
14343 int r
= osd
->store
->stat(
14345 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
14347 if (r
!= -ENOENT
) {
14348 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
14349 << "deleted" << dendl
;
14350 ceph_abort_msg("erroneously present object");
14353 // ignore old(+missing) objects
14360 // ===========================
14363 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
14366 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
14367 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
14368 info
.pgid
.ps(), info
.pgid
.pool(),
14369 cct
->_conf
->osd_hit_set_namespace
);
14370 dout(20) << __func__
<< " " << hoid
<< dendl
;
14374 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
14379 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
14381 start
.gmtime(ss
, true /* legacy pre-octopus form */) << "_";
14382 end
.gmtime(ss
, true /* legacy pre-octopus form */);
14384 start
.localtime(ss
, true /* legacy pre-octopus form */) << "_";
14385 end
.localtime(ss
, true /* legacy pre-octopus form */);
14387 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
14388 info
.pgid
.ps(), info
.pgid
.pool(),
14389 cct
->_conf
->osd_hit_set_namespace
);
14390 dout(20) << __func__
<< " " << hoid
<< dendl
;
14394 void PrimaryLogPG::hit_set_clear()
14396 dout(20) << __func__
<< dendl
;
14398 hit_set_start_stamp
= utime_t();
14401 void PrimaryLogPG::hit_set_setup()
14403 if (!is_active() ||
14409 if (is_active() && is_primary() &&
14410 (!pool
.info
.hit_set_count
||
14411 !pool
.info
.hit_set_period
||
14412 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
14415 // only primary is allowed to remove all the hit set objects
14416 hit_set_remove_all();
14420 // FIXME: discard any previous data for now
14423 // include any writes we know about from the pg log. this doesn't
14424 // capture reads, but it is better than nothing!
14425 hit_set_apply_log();
14428 void PrimaryLogPG::hit_set_remove_all()
14430 // If any archives are degraded we skip this
14431 for (auto p
= info
.hit_set
.history
.begin();
14432 p
!= info
.hit_set
.history
.end();
14434 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14436 // Once we hit a degraded object just skip
14437 if (is_degraded_or_backfilling_object(aoid
))
14439 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14443 if (!info
.hit_set
.history
.empty()) {
14444 auto p
= info
.hit_set
.history
.rbegin();
14445 ceph_assert(p
!= info
.hit_set
.history
.rend());
14446 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14447 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14448 ObjectContextRef obc
= get_object_context(oid
, false);
14451 OpContextUPtr ctx
= simple_opc_create(obc
);
14452 ctx
->at_version
= get_next_version();
14453 ctx
->updated_hset_history
= info
.hit_set
;
14454 utime_t now
= ceph_clock_now();
14456 hit_set_trim(ctx
, 0);
14457 simple_opc_submit(std::move(ctx
));
14460 recovery_state
.update_hset(pg_hit_set_history_t());
14462 agent_state
->discard_hit_sets();
14466 void PrimaryLogPG::hit_set_create()
14468 utime_t now
= ceph_clock_now();
14469 // make a copy of the params to modify
14470 HitSet::Params
params(pool
.info
.hit_set_params
);
14472 dout(20) << __func__
<< " " << params
<< dendl
;
14473 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
14474 BloomHitSet::Params
*p
=
14475 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
14477 // convert false positive rate so it holds up across the full period
14478 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
14479 if (p
->get_fpp() <= 0.0)
14480 p
->set_fpp(.01); // fpp cannot be zero!
14482 // if we don't have specified size, estimate target size based on the
14484 if (p
->target_size
== 0 && hit_set
) {
14485 utime_t dur
= now
- hit_set_start_stamp
;
14486 unsigned unique
= hit_set
->approx_unique_insert_count();
14487 dout(20) << __func__
<< " previous set had approx " << unique
14488 << " unique items over " << dur
<< " seconds" << dendl
;
14489 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
14492 if (p
->target_size
<
14493 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
14494 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
14497 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
14498 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
14500 p
->seed
= now
.sec();
14502 dout(10) << __func__
<< " target_size " << p
->target_size
14503 << " fpp " << p
->get_fpp() << dendl
;
14505 hit_set
.reset(new HitSet(params
));
14506 hit_set_start_stamp
= now
;
14510 * apply log entries to set
14512 * this would only happen after peering, to at least capture writes
14513 * during an interval that was potentially lost.
14515 bool PrimaryLogPG::hit_set_apply_log()
14520 eversion_t to
= info
.last_update
;
14521 eversion_t from
= info
.hit_set
.current_last_update
;
14523 dout(20) << __func__
<< " no update" << dendl
;
14527 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
14528 list
<pg_log_entry_t
>::const_reverse_iterator p
=
14529 recovery_state
.get_pg_log().get_log().log
.rbegin();
14530 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> to
)
14532 while (p
!= recovery_state
.get_pg_log().get_log().log
.rend() && p
->version
> from
) {
14533 hit_set
->insert(p
->soid
);
14540 void PrimaryLogPG::hit_set_persist()
14542 dout(10) << __func__
<< dendl
;
14544 unsigned max
= pool
.info
.hit_set_count
;
14546 utime_t now
= ceph_clock_now();
14549 // If any archives are degraded we skip this persist request
14550 // account for the additional entry being added below
14551 for (auto p
= info
.hit_set
.history
.begin();
14552 p
!= info
.hit_set
.history
.end();
14554 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14556 // Once we hit a degraded object just skip further trim
14557 if (is_degraded_or_backfilling_object(aoid
))
14559 if (m_scrubber
->write_blocked_by_scrub(aoid
))
14563 // If backfill is in progress and we could possibly overlap with the
14564 // hit_set_* objects, back off. Since these all have
14565 // hobject_t::hash set to pgid.ps(), and those sort first, we can
14566 // look just at that. This is necessary because our transactions
14567 // may include a modify of the new hit_set *and* a delete of the
14568 // old one, and this may span the backfill boundary.
14569 for (set
<pg_shard_t
>::const_iterator p
= get_backfill_targets().begin();
14570 p
!= get_backfill_targets().end();
14572 const pg_info_t
& pi
= recovery_state
.get_peer_info(*p
);
14573 if (pi
.last_backfill
== hobject_t() ||
14574 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
14575 dout(10) << __func__
<< " backfill target osd." << *p
14576 << " last_backfill has not progressed past pgid ps"
14583 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
14584 new_hset
.begin
= hit_set_start_stamp
;
14585 new_hset
.end
= now
;
14586 oid
= get_hit_set_archive_object(
14589 new_hset
.using_gmt
);
14591 // If the current object is degraded we skip this persist request
14592 if (m_scrubber
->write_blocked_by_scrub(oid
))
14596 encode(*hit_set
, bl
);
14597 dout(20) << __func__
<< " archive " << oid
<< dendl
;
14600 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
14601 uint32_t size
= agent_state
->hit_set_map
.size();
14602 if (size
>= pool
.info
.hit_set_count
) {
14603 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
14605 hit_set_in_memory_trim(size
);
14608 ObjectContextRef obc
= get_object_context(oid
, true);
14609 OpContextUPtr ctx
= simple_opc_create(obc
);
14611 ctx
->at_version
= get_next_version();
14612 ctx
->updated_hset_history
= info
.hit_set
;
14613 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
14615 updated_hit_set_hist
.current_last_update
= info
.last_update
;
14616 new_hset
.version
= ctx
->at_version
;
14618 updated_hit_set_hist
.history
.push_back(new_hset
);
14621 // fabricate an object_info_t and SnapSet
14622 obc
->obs
.oi
.version
= ctx
->at_version
;
14623 obc
->obs
.oi
.mtime
= now
;
14624 obc
->obs
.oi
.size
= bl
.length();
14625 obc
->obs
.exists
= true;
14626 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
14628 ctx
->new_obs
= obc
->obs
;
14630 ctx
->new_snapset
= obc
->ssc
->snapset
;
14632 ctx
->delta_stats
.num_objects
++;
14633 ctx
->delta_stats
.num_objects_hit_set_archive
++;
14635 ctx
->delta_stats
.num_bytes
+= bl
.length();
14636 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
14639 encode(ctx
->new_snapset
, bss
);
14640 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
14641 encode(ctx
->new_obs
.oi
, boi
,
14642 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
14644 ctx
->op_t
->create(oid
);
14646 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
14647 write_update_size_and_usage(ctx
->delta_stats
, obc
->obs
.oi
, ctx
->modified_ranges
,
14649 ctx
->clean_regions
.mark_data_region_dirty(0, bl
.length());
14651 map
<string
, bufferlist
, std::less
<>> attrs
= {
14652 {OI_ATTR
, std::move(boi
)},
14653 {SS_ATTR
, std::move(bss
)}
14655 setattrs_maybe_cache(ctx
->obc
, ctx
->op_t
.get(), attrs
);
14656 ctx
->log
.push_back(
14658 pg_log_entry_t::MODIFY
,
14667 ctx
->log
.back().clean_regions
= ctx
->clean_regions
;
14669 hit_set_trim(ctx
, max
);
14671 simple_opc_submit(std::move(ctx
));
14674 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
14676 ceph_assert(ctx
->updated_hset_history
);
14677 pg_hit_set_history_t
&updated_hit_set_hist
=
14678 *(ctx
->updated_hset_history
);
14679 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
14680 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
14681 ceph_assert(p
!= updated_hit_set_hist
.history
.end());
14682 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14684 ceph_assert(!is_degraded_or_backfilling_object(oid
));
14686 dout(20) << __func__
<< " removing " << oid
<< dendl
;
14687 ++ctx
->at_version
.version
;
14688 ctx
->log
.push_back(
14689 pg_log_entry_t(pg_log_entry_t::DELETE
,
14698 ctx
->op_t
->remove(oid
);
14699 updated_hit_set_hist
.history
.pop_front();
14701 ObjectContextRef obc
= get_object_context(oid
, false);
14703 --ctx
->delta_stats
.num_objects
;
14704 --ctx
->delta_stats
.num_objects_hit_set_archive
;
14705 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
14706 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
14710 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
14712 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
14713 agent_state
->remove_oldest_hit_set();
14718 // =======================================
14721 void PrimaryLogPG::agent_setup()
14723 ceph_assert(is_locked());
14724 if (!is_active() ||
14726 state_test(PG_STATE_PREMERGE
) ||
14727 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
14728 pool
.info
.tier_of
< 0 ||
14729 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
14733 if (!agent_state
) {
14734 agent_state
.reset(new TierAgentState
);
14736 // choose random starting position
14737 agent_state
->position
= hobject_t();
14738 agent_state
->position
.pool
= info
.pgid
.pool();
14739 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
14742 agent_state
->start
= agent_state
->position
;
14744 dout(10) << __func__
<< " allocated new state, position "
14745 << agent_state
->position
<< dendl
;
14747 dout(10) << __func__
<< " keeping existing state" << dendl
;
14750 if (info
.stats
.stats_invalid
) {
14751 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
14754 agent_choose_mode();
14757 void PrimaryLogPG::agent_clear()
14760 agent_state
.reset(NULL
);
14763 // Return false if no objects operated on since start of object hash space
14764 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
14766 std::scoped_lock locker
{*this};
14767 if (!agent_state
) {
14768 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
14772 ceph_assert(!recovery_state
.is_deleting());
14774 if (agent_state
->is_idle()) {
14775 dout(10) << __func__
<< " idle, stopping" << dendl
;
14779 osd
->logger
->inc(l_osd_agent_wake
);
14781 dout(10) << __func__
14782 << " max " << start_max
14783 << ", flush " << agent_state
->get_flush_mode_name()
14784 << ", evict " << agent_state
->get_evict_mode_name()
14785 << ", pos " << agent_state
->position
14787 ceph_assert(is_primary());
14788 ceph_assert(is_active());
14790 agent_load_hit_sets();
14792 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
14793 ceph_assert(base_pool
);
14796 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
14798 // list some objects. this conveniently lists clones (oldest to
14799 // newest) before heads... the same order we want to flush in.
14801 // NOTE: do not flush the Sequencer. we will assume that the
14802 // listing we get back is imprecise.
14803 vector
<hobject_t
> ls
;
14805 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
14807 ceph_assert(r
>= 0);
14808 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
14810 for (vector
<hobject_t
>::iterator p
= ls
.begin();
14813 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
14814 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
14815 osd
->logger
->inc(l_osd_agent_skip
);
14818 if (is_degraded_or_backfilling_object(*p
)) {
14819 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
14820 osd
->logger
->inc(l_osd_agent_skip
);
14823 if (is_missing_object(p
->get_head())) {
14824 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
14825 osd
->logger
->inc(l_osd_agent_skip
);
14828 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
14830 // we didn't flush; we may miss something here.
14831 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
14832 osd
->logger
->inc(l_osd_agent_skip
);
14835 if (!obc
->obs
.exists
) {
14836 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
14837 osd
->logger
->inc(l_osd_agent_skip
);
14840 if (m_scrubber
->range_intersects_scrub(obc
->obs
.oi
.soid
,
14841 obc
->obs
.oi
.soid
.get_head())) {
14842 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
14843 osd
->logger
->inc(l_osd_agent_skip
);
14846 if (obc
->is_blocked()) {
14847 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
14848 osd
->logger
->inc(l_osd_agent_skip
);
14851 if (obc
->is_request_pending()) {
14852 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
14853 osd
->logger
->inc(l_osd_agent_skip
);
14857 // be careful flushing omap to an EC pool.
14858 if (!base_pool
->supports_omap() &&
14859 obc
->obs
.oi
.is_omap()) {
14860 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
14861 osd
->logger
->inc(l_osd_agent_skip
);
14865 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
14866 agent_maybe_evict(obc
, false))
14868 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
14869 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
14871 --agent_flush_quota
;
14873 if (started
>= start_max
) {
14874 // If finishing early, set "next" to the next object
14875 if (++p
!= ls
.end())
14881 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
14882 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
14883 agent_state
->hist_age
= 0;
14884 agent_state
->temp_hist
.decay();
14887 // Total objects operated on so far
14888 int total_started
= agent_state
->started
+ started
;
14889 bool need_delay
= false;
14891 dout(20) << __func__
<< " start pos " << agent_state
->position
14892 << " next start pos " << next
14893 << " started " << total_started
<< dendl
;
14895 // See if we've made a full pass over the object hash space
14896 // This might check at most ls_max objects a second time to notice that
14897 // we've checked every objects at least once.
14898 if (agent_state
->position
< agent_state
->start
&&
14899 next
>= agent_state
->start
) {
14900 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
14901 if (total_started
== 0)
14905 agent_state
->start
= next
;
14907 agent_state
->started
= total_started
;
14909 // See if we are starting from beginning
14911 agent_state
->position
= hobject_t();
14913 agent_state
->position
= next
;
14915 // Discard old in memory HitSets
14916 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
14919 ceph_assert(agent_state
->delaying
== false);
14923 agent_choose_mode();
14927 void PrimaryLogPG::agent_load_hit_sets()
14929 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
14933 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
14934 dout(10) << __func__
<< dendl
;
14935 for (auto p
= info
.hit_set
.history
.begin();
14936 p
!= info
.hit_set
.history
.end(); ++p
) {
14937 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
14938 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
14939 << p
->end
<< dendl
;
14940 if (!pool
.info
.is_replicated()) {
14941 // FIXME: EC not supported here yet
14942 derr
<< __func__
<< " on non-replicated pool" << dendl
;
14946 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
14947 if (is_unreadable_object(oid
)) {
14948 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
14952 ObjectContextRef obc
= get_object_context(oid
, false);
14954 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
14960 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
14961 ceph_assert(r
>= 0);
14963 HitSetRef
hs(new HitSet
);
14964 bufferlist::const_iterator pbl
= bl
.begin();
14966 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
14972 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
14974 if (!obc
->obs
.oi
.is_dirty()) {
14975 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
14976 osd
->logger
->inc(l_osd_agent_skip
);
14979 if (obc
->obs
.oi
.is_cache_pinned()) {
14980 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
14981 osd
->logger
->inc(l_osd_agent_skip
);
14985 utime_t now
= ceph_clock_now();
14986 utime_t ob_local_mtime
;
14987 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
14988 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
14990 ob_local_mtime
= obc
->obs
.oi
.mtime
;
14992 bool evict_mode_full
=
14993 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
14994 if (!evict_mode_full
&&
14995 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
14996 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
14997 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
14998 osd
->logger
->inc(l_osd_agent_skip
);
15002 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
15003 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
15004 osd
->logger
->inc(l_osd_agent_skip
);
15008 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
15010 // FIXME: flush anything dirty, regardless of what distribution of
15013 hobject_t oid
= obc
->obs
.oi
.soid
;
15014 osd
->agent_start_op(oid
);
15015 // no need to capture a pg ref, can't outlive fop or ctx
15016 std::function
<void()> on_flush
= [this, oid
]() {
15017 osd
->agent_finish_op(oid
);
15020 int result
= start_flush(
15021 OpRequestRef(), obc
, false, NULL
,
15023 if (result
!= -EINPROGRESS
) {
15025 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
15026 << " with " << result
<< dendl
;
15027 osd
->logger
->inc(l_osd_agent_skip
);
15031 osd
->logger
->inc(l_osd_agent_flush
);
15035 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
15037 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
15038 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
15039 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
15042 // This is already checked by agent_work() which passes after_flush = false
15043 if (after_flush
&& m_scrubber
->range_intersects_scrub(soid
, soid
.get_head())) {
15044 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
15047 if (!obc
->obs
.oi
.watchers
.empty()) {
15048 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
15051 if (obc
->is_blocked()) {
15052 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
15055 if (obc
->obs
.oi
.is_cache_pinned()) {
15056 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
15060 if (soid
.snap
== CEPH_NOSNAP
) {
15061 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
15063 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
15068 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
15069 // is this object old than cache_min_evict_age?
15070 utime_t now
= ceph_clock_now();
15071 utime_t ob_local_mtime
;
15072 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
15073 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
15075 ob_local_mtime
= obc
->obs
.oi
.mtime
;
15077 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
15078 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
15079 osd
->logger
->inc(l_osd_agent_skip
);
15082 // is this object old and/or cold enough?
15084 uint64_t temp_upper
= 0, temp_lower
= 0;
15086 agent_estimate_temp(soid
, &temp
);
15087 agent_state
->temp_hist
.add(temp
);
15088 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
15090 dout(20) << __func__
15091 << " temp " << temp
15092 << " pos " << temp_lower
<< "-" << temp_upper
15093 << ", evict_effort " << agent_state
->evict_effort
15095 dout(30) << "agent_state:\n";
15096 auto f
= Formatter::create_unique("");
15097 f
->open_object_section("agent_state");
15098 agent_state
->dump(f
.get());
15099 f
->close_section();
15103 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
15107 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
15108 OpContextUPtr ctx
= simple_opc_create(obc
);
15110 auto null_op_req
= OpRequestRef();
15111 if (!ctx
->lock_manager
.get_lock_type(
15116 close_op_ctx(ctx
.release());
15117 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
15121 osd
->agent_start_evict_op();
15122 ctx
->register_on_finish(
15124 osd
->agent_finish_evict_op();
15127 ctx
->at_version
= get_next_version();
15128 ceph_assert(ctx
->new_obs
.exists
);
15129 int r
= _delete_oid(ctx
.get(), true, false);
15130 if (obc
->obs
.oi
.is_omap())
15131 ctx
->delta_stats
.num_objects_omap
--;
15132 ctx
->delta_stats
.num_evict
++;
15133 ctx
->delta_stats
.num_evict_kb
+= shift_round_up(obc
->obs
.oi
.size
, 10);
15134 if (obc
->obs
.oi
.is_dirty())
15135 --ctx
->delta_stats
.num_objects_dirty
;
15136 ceph_assert(r
== 0);
15137 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
);
15138 simple_opc_submit(std::move(ctx
));
15139 osd
->logger
->inc(l_osd_tier_evict
);
15140 osd
->logger
->inc(l_osd_agent_evict
);
15144 void PrimaryLogPG::agent_stop()
15146 dout(20) << __func__
<< dendl
;
15147 if (agent_state
&& !agent_state
->is_idle()) {
15148 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
15149 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
15150 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
15154 void PrimaryLogPG::agent_delay()
15156 dout(20) << __func__
<< dendl
;
15157 if (agent_state
&& !agent_state
->is_idle()) {
15158 ceph_assert(agent_state
->delaying
== false);
15159 agent_state
->delaying
= true;
15160 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
15164 void PrimaryLogPG::agent_choose_mode_restart()
15166 dout(20) << __func__
<< dendl
;
15167 std::scoped_lock locker
{*this};
15168 if (agent_state
&& agent_state
->delaying
) {
15169 agent_state
->delaying
= false;
15170 agent_choose_mode(true);
15174 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
15176 bool requeued
= false;
15177 // Let delay play out
15178 if (agent_state
->delaying
) {
15179 dout(20) << __func__
<< " " << this << " delaying, ignored" << dendl
;
15183 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
15184 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
15185 unsigned evict_effort
= 0;
15187 if (info
.stats
.stats_invalid
) {
15188 // idle; stats can't be trusted until we scrub.
15189 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
15194 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
15195 ceph_assert(divisor
> 0);
15197 // adjust (effective) user objects down based on the number
15198 // of HitSet objects, which should not count toward our total since
15199 // they cannot be flushed.
15200 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
15202 // also exclude omap objects if ec backing pool
15203 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
15204 ceph_assert(base_pool
);
15205 if (!base_pool
->supports_omap())
15206 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
15208 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
15209 if (num_user_objects
> unflushable
)
15210 num_user_objects
-= unflushable
;
15212 num_user_objects
= 0;
15214 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
15215 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
15216 num_user_bytes
-= unflushable_bytes
;
15217 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
15218 num_user_bytes
+= num_overhead_bytes
;
15220 // also reduce the num_dirty by num_objects_omap
15221 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
15222 if (!base_pool
->supports_omap()) {
15223 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
15224 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
15229 dout(10) << __func__
15231 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
15233 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
15234 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
15235 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
15236 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
15237 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
15238 << " num_dirty: " << num_dirty
15239 << " num_user_objects: " << num_user_objects
15240 << " num_user_bytes: " << num_user_bytes
15241 << " num_overhead_bytes: " << num_overhead_bytes
15242 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
15243 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
15246 // get dirty, full ratios
15247 uint64_t dirty_micro
= 0;
15248 uint64_t full_micro
= 0;
15249 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
15250 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
15252 num_dirty
* avg_size
* 1000000 /
15253 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
15255 num_user_objects
* avg_size
* 1000000 /
15256 std::max
<uint64_t>(pool
.info
.target_max_bytes
/ divisor
, 1);
15258 if (pool
.info
.target_max_objects
> 0) {
15259 uint64_t dirty_objects_micro
=
15260 num_dirty
* 1000000 /
15261 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
15262 if (dirty_objects_micro
> dirty_micro
)
15263 dirty_micro
= dirty_objects_micro
;
15264 uint64_t full_objects_micro
=
15265 num_user_objects
* 1000000 /
15266 std::max
<uint64_t>(pool
.info
.target_max_objects
/ divisor
, 1);
15267 if (full_objects_micro
> full_micro
)
15268 full_micro
= full_objects_micro
;
15270 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
15271 << " full " << ((float)full_micro
/ 1000000.0)
15275 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
15276 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
15277 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
15278 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
15279 flush_target
+= flush_slop
;
15280 flush_high_target
+= flush_slop
;
15282 flush_target
-= std::min(flush_target
, flush_slop
);
15283 flush_high_target
-= std::min(flush_high_target
, flush_slop
);
15286 if (dirty_micro
> flush_high_target
) {
15287 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
15288 } else if (dirty_micro
> flush_target
|| (!flush_target
&& num_dirty
> 0)) {
15289 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
15293 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
15294 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
15295 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
15296 evict_target
+= evict_slop
;
15298 evict_target
-= std::min(evict_target
, evict_slop
);
15300 if (full_micro
> 1000000) {
15301 // evict anything clean
15302 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
15303 evict_effort
= 1000000;
15304 } else if (full_micro
> evict_target
) {
15305 // set effort in [0..1] range based on where we are between
15306 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
15307 uint64_t over
= full_micro
- evict_target
;
15308 uint64_t span
= 1000000 - evict_target
;
15309 evict_effort
= std::max(over
* 1000000 / span
,
15310 uint64_t(1000000.0 *
15311 cct
->_conf
->osd_agent_min_evict_effort
));
15313 // quantize effort to avoid too much reordering in the agent_queue.
15314 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
15315 ceph_assert(inc
> 0);
15316 uint64_t was
= evict_effort
;
15317 evict_effort
-= evict_effort
% inc
;
15318 if (evict_effort
< inc
)
15319 evict_effort
= inc
;
15320 ceph_assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
15321 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
15326 bool old_idle
= agent_state
->is_idle();
15327 if (flush_mode
!= agent_state
->flush_mode
) {
15328 dout(5) << __func__
<< " flush_mode "
15329 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
15331 << TierAgentState::get_flush_mode_name(flush_mode
)
15333 recovery_state
.update_stats(
15334 [=, this](auto &history
, auto &stats
) {
15335 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
15336 osd
->agent_inc_high_count();
15337 stats
.stats
.sum
.num_flush_mode_high
= 1;
15338 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
15339 stats
.stats
.sum
.num_flush_mode_low
= 1;
15341 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
15342 osd
->agent_dec_high_count();
15343 stats
.stats
.sum
.num_flush_mode_high
= 0;
15344 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
15345 stats
.stats
.sum
.num_flush_mode_low
= 0;
15349 agent_state
->flush_mode
= flush_mode
;
15351 if (evict_mode
!= agent_state
->evict_mode
) {
15352 dout(5) << __func__
<< " evict_mode "
15353 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
15355 << TierAgentState::get_evict_mode_name(evict_mode
)
15357 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
15361 requeue_ops(waiting_for_flush
);
15362 requeue_ops(waiting_for_active
);
15363 requeue_ops(waiting_for_readable
);
15364 requeue_ops(waiting_for_scrub
);
15365 requeue_ops(waiting_for_cache_not_full
);
15366 objects_blocked_on_cache_full
.clear();
15369 recovery_state
.update_stats(
15370 [=, this](auto &history
, auto &stats
) {
15371 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
15372 stats
.stats
.sum
.num_evict_mode_some
= 1;
15373 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
15374 stats
.stats
.sum
.num_evict_mode_full
= 1;
15376 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
15377 stats
.stats
.sum
.num_evict_mode_some
= 0;
15378 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
15379 stats
.stats
.sum
.num_evict_mode_full
= 0;
15383 agent_state
->evict_mode
= evict_mode
;
15385 uint64_t old_effort
= agent_state
->evict_effort
;
15386 if (evict_effort
!= agent_state
->evict_effort
) {
15387 dout(5) << __func__
<< " evict_effort "
15388 << ((float)agent_state
->evict_effort
/ 1000000.0)
15390 << ((float)evict_effort
/ 1000000.0)
15392 agent_state
->evict_effort
= evict_effort
;
15395 // NOTE: we are using evict_effort as a proxy for *all* agent effort
15396 // (including flush). This is probably fine (they should be
15397 // correlated) but it is not precisely correct.
15398 if (agent_state
->is_idle()) {
15399 if (!restart
&& !old_idle
) {
15400 osd
->agent_disable_pg(this, old_effort
);
15403 if (restart
|| old_idle
) {
15404 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
15405 } else if (old_effort
!= agent_state
->evict_effort
) {
15406 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
15412 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
15414 ceph_assert(hit_set
);
15417 if (hit_set
->contains(oid
))
15420 int last_n
= pool
.info
.hit_set_search_last_n
;
15421 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
15422 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
15423 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
15424 if (p
->second
->contains(oid
)) {
15425 *temp
+= pool
.info
.get_grade(i
);
15431 // Dup op detection
15433 bool PrimaryLogPG::already_complete(eversion_t v
)
15435 dout(20) << __func__
<< ": " << v
<< dendl
;
15436 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
15439 dout(20) << __func__
<< ": " << **i
<< dendl
;
15440 // skip copy from temp object ops
15441 if ((*i
)->v
== eversion_t()) {
15442 dout(20) << __func__
<< ": " << **i
15443 << " version is empty" << dendl
;
15447 dout(20) << __func__
<< ": " << **i
15448 << " (*i)->v past v" << dendl
;
15451 if (!(*i
)->all_committed
) {
15452 dout(20) << __func__
<< ": " << **i
15453 << " not committed, returning false"
15458 dout(20) << __func__
<< ": returning true" << dendl
;
15463 // ==========================================================================================
15466 void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op
)
15468 dout(15) << __func__
<< " is scrub active? " << is_scrub_active() << dendl
;
15469 op
->mark_started();
15471 if (!is_scrub_active()) {
15472 dout(10) << __func__
<< " scrub isn't active" << dendl
;
15475 m_scrubber
->map_from_replica(op
);
15478 bool PrimaryLogPG::_range_available_for_scrub(const hobject_t
& begin
,
15479 const hobject_t
& end
)
15481 pair
<hobject_t
, ObjectContextRef
> next
;
15482 next
.second
= object_contexts
.lookup(begin
);
15483 next
.first
= begin
;
15485 while (more
&& next
.first
< end
) {
15486 if (next
.second
&& next
.second
->is_blocked()) {
15487 next
.second
->requeue_scrub_on_unblock
= true;
15488 dout(10) << __func__
<< ": scrub delayed, "
15489 << next
.first
<< " is blocked"
15493 more
= object_contexts
.get_next(next
.first
, &next
);
15499 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpContext
*ctx
)
15501 OpRequestRef op
= ctx
->op
;
15502 // Only supports replicated pools
15503 ceph_assert(!pool
.info
.is_erasure());
15504 ceph_assert(is_primary());
15506 dout(10) << __func__
<< " " << soid
15507 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl
;
15510 block_for_clean(soid
, op
);
15514 ceph_assert(!recovery_state
.get_pg_log().get_missing().is_missing(soid
));
15515 auto& oi
= ctx
->new_obs
.oi
;
15516 eversion_t v
= oi
.version
;
15518 if (primary_error(soid
, v
)) {
15519 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
15520 // XXX: If we knew that there is no down osd which could include this
15521 // object, it would be nice if we could return EIO here.
15522 // If a "never fail" flag was available, that could be used
15523 // for rbd to NOT return EIO until object marked lost.
15525 // Drop through to save this op in case an osd comes up with the object.
15528 // Restart the op after object becomes readable again
15529 waiting_for_unreadable_object
[soid
].push_back(op
);
15530 op
->mark_delayed("waiting for missing object");
15532 ceph_assert(is_clean());
15533 state_set(PG_STATE_REPAIR
);
15534 state_clear(PG_STATE_CLEAN
);
15535 queue_peering_event(
15537 std::make_shared
<PGPeeringEvent
>(
15538 get_osdmap_epoch(),
15539 get_osdmap_epoch(),
15540 PeeringState::DoRecovery())));
15545 /*---SnapTrimmer Logging---*/
15547 #define dout_prefix pg->gen_prefix(*_dout)
15549 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
15551 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
15554 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
15556 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
15559 bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15562 !pg
->is_scrub_queued_or_active() &&
15563 !pg
->snap_trimq
.empty();
15566 /*---SnapTrimmer states---*/
15568 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15569 << "SnapTrimmer state<" << get_state_name() << ">: ")
15572 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
15574 NamedState(nullptr, "NotTrimming")
15576 context
< SnapTrimmer
>().log_enter(state_name
);
15579 void PrimaryLogPG::NotTrimming::exit()
15581 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
15584 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
15586 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15587 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
15589 if (!(pg
->is_primary() && pg
->is_active())) {
15590 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
15591 return discard_event();
15593 if (!pg
->is_clean() ||
15594 pg
->snap_trimq
.empty()) {
15595 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
15596 return discard_event();
15598 if (pg
->is_scrub_queued_or_active()) {
15599 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
15600 return transit
< WaitScrub
>();
15602 return transit
< Trimming
>();
15606 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
15608 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
15609 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
15612 if (!context
< SnapTrimmer
>().can_trim()) {
15613 post_event(KickTrim());
15614 return transit
< NotTrimming
>();
15617 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
15618 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
15619 << pg
->snap_trimq
.range_start()
15621 return transit
< AwaitAsyncWork
>();
15624 /* AwaitAsyncWork */
15625 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
15627 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15629 auto *pg
= context
< SnapTrimmer
>().pg
;
15630 context
< SnapTrimmer
>().log_enter(state_name
);
15631 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
15632 pg
->state_set(PG_STATE_SNAPTRIM
);
15633 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
15634 pg
->publish_stats_to_osd();
15637 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
15639 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
15640 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
15641 auto &in_flight
= context
<Trimming
>().in_flight
;
15642 ceph_assert(in_flight
.empty());
15644 ceph_assert(pg
->is_primary() && pg
->is_active());
15645 if (!context
< SnapTrimmer
>().can_trim()) {
15646 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
15647 post_event(KickTrim());
15648 return transit
< NotTrimming
>();
15651 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
15653 vector
<hobject_t
> to_trim
;
15654 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
15655 // we need to look for at least 1 snaptrim, otherwise we'll misinterpret
15656 // the ENOENT below and erase snap_to_trim.
15657 ceph_assert(max
> 0);
15658 to_trim
.reserve(max
);
15659 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
15663 if (r
!= 0 && r
!= -ENOENT
) {
15664 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
15665 << cpp_strerror(r
) << dendl
;
15666 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15667 } else if (r
== -ENOENT
) {
15669 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
15671 pg
->snap_trimq
.erase(snap_to_trim
);
15673 if (pg
->snap_trimq_repeat
.count(snap_to_trim
)) {
15674 ldout(pg
->cct
, 10) << " removing from snap_trimq_repeat" << dendl
;
15675 pg
->snap_trimq_repeat
.erase(snap_to_trim
);
15677 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
15678 << " to purged_snaps"
15680 ObjectStore::Transaction t
;
15681 pg
->recovery_state
.adjust_purged_snaps(
15682 [snap_to_trim
](auto &purged_snaps
) {
15683 purged_snaps
.insert(snap_to_trim
);
15685 pg
->write_if_dirty(t
);
15687 ldout(pg
->cct
, 10) << "purged_snaps now "
15688 << pg
->info
.purged_snaps
<< ", snap_trimq now "
15689 << pg
->snap_trimq
<< dendl
;
15691 int tr
= pg
->osd
->store
->queue_transaction(pg
->ch
, std::move(t
), NULL
);
15692 ceph_assert(tr
== 0);
15694 pg
->recovery_state
.share_pg_info();
15696 post_event(KickTrim());
15697 pg
->set_snaptrim_duration();
15698 return transit
< NotTrimming
>();
15700 ceph_assert(!to_trim
.empty());
15702 for (auto &&object
: to_trim
) {
15704 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
15706 int error
= pg
->trim_object(in_flight
.empty(), object
, snap_to_trim
, &ctx
);
15708 if (error
== -ENOLCK
) {
15709 ldout(pg
->cct
, 10) << "could not get write lock on obj "
15710 << object
<< dendl
;
15712 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
15713 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
15715 if (!in_flight
.empty()) {
15716 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
15717 return transit
< WaitRepops
>();
15719 if (error
== -ENOLCK
) {
15720 ldout(pg
->cct
, 10) << "waiting for it to clear"
15722 return transit
< WaitRWLock
>();
15724 return transit
< NotTrimming
>();
15728 in_flight
.insert(object
);
15729 ctx
->register_on_success(
15730 [pg
, object
, &in_flight
]() {
15731 ceph_assert(in_flight
.find(object
) != in_flight
.end());
15732 in_flight
.erase(object
);
15733 if (in_flight
.empty()) {
15734 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
15735 pg
->snap_trimmer_machine
.process_event(Reset());
15737 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
15742 pg
->simple_opc_submit(std::move(ctx
));
15745 return transit
< WaitRepops
>();
15748 void PrimaryLogPG::setattr_maybe_cache(
15749 ObjectContextRef obc
,
15754 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
15757 void PrimaryLogPG::setattrs_maybe_cache(
15758 ObjectContextRef obc
,
15760 map
<string
, bufferlist
, less
<>> &attrs
)
15762 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
15765 void PrimaryLogPG::rmattr_maybe_cache(
15766 ObjectContextRef obc
,
15770 t
->rmattr(obc
->obs
.oi
.soid
, key
);
15773 int PrimaryLogPG::getattr_maybe_cache(
15774 ObjectContextRef obc
,
15778 if (pool
.info
.is_erasure()) {
15779 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
15780 if (i
!= obc
->attr_cache
.end()) {
15785 if (obc
->obs
.exists
) {
15792 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
15795 int PrimaryLogPG::getattrs_maybe_cache(
15796 ObjectContextRef obc
,
15797 map
<string
, bufferlist
, less
<>> *out
)
15801 if (pool
.info
.is_erasure()) {
15802 *out
= obc
->attr_cache
;
15804 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
15806 map
<string
, bufferlist
, less
<>> tmp
;
15807 for (auto& [key
, val
]: *out
) {
15808 if (key
.size() > 1 && key
[0] == '_') {
15809 tmp
[key
.substr(1, key
.size())] = std::move(val
);
// Delegate to the OSD-wide failsafe-full check, passing this PG's
// debug prefix provider so any logging is attributed to this PG.
bool PrimaryLogPG::check_failsafe_full() {
  return osd->check_failsafe_full(get_dpp());
}
// Returns true when the scrubber currently blocks writes to this object,
// in which case (per the name) the replica scrub should presumably be
// preempted in favor of the write — confirm against callers.
bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
{
  return m_scrubber->write_blocked_by_scrub(oid);
}
// boost::intrusive_ptr customization points for PrimaryLogPG: refcount
// through the PG get()/put() hooks, tagged "intptr" so these references
// are identifiable in ref-tracking output.
void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
#ifdef PG_DEBUG_REFS
// Debug-only ref-tracking variants: get_with_id() hands out a unique id
// per acquired reference so a leaked ref can be traced back to its
// acquisition; put_with_id() releases the matching reference.
uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
// Fix: the matching #endif for PG_DEBUG_REFS is missing in the visible
// text, which would leave all following definitions conditionally
// compiled out (or fail the translation unit); restore it here.
#endif
// boost::intrusive_ptr customization points for in-flight repops:
// RepGather carries its own refcount via get()/put().
void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }
->put(); }