1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
8 *
9 * Author: Loic Dachary <loic@dachary.org>
10 *
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
15 *
16 */
17 #include "PrimaryLogPG.h"
18
19 #include <errno.h>
20
21 #include <charconv>
22 #include <sstream>
23 #include <utility>
24
25 #include <boost/intrusive_ptr.hpp>
26 #include <boost/tuple/tuple.hpp>
27
28 #include "PrimaryLogPG.h"
29
30 #include "cls/cas/cls_cas_ops.h"
31 #include "common/CDC.h"
32 #include "common/EventTrace.h"
33 #include "common/ceph_crypto.h"
34 #include "common/config.h"
35 #include "common/errno.h"
36 #include "common/perf_counters.h"
37 #include "common/scrub_types.h"
38 #include "include/compat.h"
39 #include "json_spirit/json_spirit_reader.h"
40 #include "json_spirit/json_spirit_value.h"
41 #include "messages/MCommandReply.h"
42 #include "messages/MOSDBackoff.h"
43 #include "messages/MOSDOp.h"
44 #include "messages/MOSDPGBackfill.h"
45 #include "messages/MOSDPGBackfillRemove.h"
46 #include "messages/MOSDPGLog.h"
47 #include "messages/MOSDPGScan.h"
48 #include "messages/MOSDPGTrim.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDRepScrub.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "mon/MonClient.h"
54 #include "objclass/objclass.h"
55 #include "osd/ClassHandler.h"
56 #include "osdc/Objecter.h"
57 #include "osd/scrubber/PrimaryLogScrub.h"
58 #include "osd/scrubber/ScrubStore.h"
59 #include "osd/scrubber/pg_scrubber.h"
60
61 #include "OSD.h"
62 #include "OpRequest.h"
63 #include "PG.h"
64 #include "Session.h"
65
66 // required includes order:
67 #include "json_spirit/json_spirit_value.h"
68 #include "json_spirit/json_spirit_reader.h"
69 #include "include/ceph_assert.h" // json_spirit clobbers it
70 #include "include/rados/rados_types.hpp"
71
72 #ifdef WITH_LTTNG
73 #include "tracing/osd.h"
74 #else
75 #define tracepoint(...)
76 #endif
77
78 #define dout_context cct
79 #define dout_subsys ceph_subsys_osd
80 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
81 #undef dout_prefix
82 #define dout_prefix _prefix(_dout, this)
83
84 #include "osd_tracer.h"
85
86 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
87
88 using std::less;
89 using std::list;
90 using std::ostream;
91 using std::pair;
92 using std::make_pair;
93 using std::make_unique;
94 using std::map;
95 using std::ostringstream;
96 using std::set;
97 using std::string;
98 using std::string_view;
99 using std::stringstream;
100 using std::unique_ptr;
101 using std::vector;
102
103 using ceph::bufferlist;
104 using ceph::bufferptr;
105 using ceph::Formatter;
106 using ceph::decode;
107 using ceph::decode_noclear;
108 using ceph::encode;
109 using ceph::encode_destructively;
110
111 using namespace ceph::osd::scheduler;
112 using TOPNSPC::common::cmd_getval;
113 using TOPNSPC::common::cmd_getval_or;
114
115 template <typename T>
116 static ostream& _prefix(std::ostream *_dout, T *pg) {
117 return pg->gen_prefix(*_dout);
118 }
119
120 /**
121 * The CopyCallback class defines an interface for completions to the
122 * copy_start code. Users of the copy infrastructure must implement
123 * one and give an instance of the class to start_copy.
124 *
125 * The implementer is responsible for making sure that the CopyCallback
126 * can associate itself with the correct copy operation.
127 */
128 class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
129 protected:
130 CopyCallback() {}
131 /**
132 * results.get<0>() is the return code: 0 for success; -ECANCELED if
133 * the operation was cancelled by the local OSD; -errno for other issues.
134 * results.get<1>() is a pointer to a CopyResults object, which you are
135 * responsible for deleting.
136 */
137 void finish(CopyCallbackResults results_) override = 0;
138
139 public:
140 /// Virtual destructor: implementations are destroyed via this base class
141 ~CopyCallback() override {}
142 };
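// A minimal sketch of a CopyCallback implementation, assuming only the
// contract documented above: finish() receives (return code, CopyResults*).
// The class name and body are hypothetical; compare CopyFromCallback below
// for the real in-tree user.
class ExampleCopyCallback : public PrimaryLogPG::CopyCallback {
  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    int r = results_.get<0>();
    PrimaryLogPG::CopyResults *res = results_.get<1>();
    if (r == 0) {
      // copy finished; consume fields of *res (e.g. object_size)
    } else if (r == -ECANCELED) {
      // cancelled by the local OSD; callers typically requeue or drop
    }
    (void)res; // silence unused-variable warnings in this sketch
  }
};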
143
144 template <typename T>
145 class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
146 PrimaryLogPGRef pg;
147 unique_ptr<GenContext<T>> c;
148 epoch_t e;
149 public:
150 BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
151 : pg(pg), c(c), e(e) {}
152 void finish(T t) override {
153 std::scoped_lock locker{*pg};
154 if (pg->pg_has_reset_since(e))
155 c.reset();
156 else
157 c.release()->complete(t);
158 }
159 bool sync_finish(T t) {
160 // we assume here all blessed/wrapped Contexts can complete synchronously.
161 c.release()->complete(t);
162 return true;
163 }
164 };
165
166 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
167 GenContext<ThreadPool::TPHandle&> *c) {
168 return new BlessedGenContext<ThreadPool::TPHandle&>(
169 this, c, get_osdmap_epoch());
170 }
171
172 template <typename T>
173 class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
174 PrimaryLogPGRef pg;
175 unique_ptr<GenContext<T>> c;
176 epoch_t e;
177 public:
178 UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
179 : pg(pg), c(c), e(e) {}
180 void finish(T t) override {
181 if (pg->pg_has_reset_since(e))
182 c.reset();
183 else
184 c.release()->complete(t);
185 }
186 bool sync_finish(T t) {
187 // we assume here all blessed/wrapped Contexts can complete synchronously.
188 c.release()->complete(t);
189 return true;
190 }
191 };
192
193 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
194 GenContext<ThreadPool::TPHandle&> *c) {
195 return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
196 this, c, get_osdmap_epoch());
197 }
198
199 class PrimaryLogPG::BlessedContext : public Context {
200 PrimaryLogPGRef pg;
201 unique_ptr<Context> c;
202 epoch_t e;
203 public:
204 BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
205 : pg(pg), c(c), e(e) {}
206 void finish(int r) override {
207 std::scoped_lock locker{*pg};
208 if (pg->pg_has_reset_since(e))
209 c.reset();
210 else
211 c.release()->complete(r);
212 }
213 bool sync_finish(int r) override {
214 // we assume here all blessed/wrapped Contexts can complete synchronously.
215 c.release()->complete(r);
216 return true;
217 }
218 };
219
220 Context *PrimaryLogPG::bless_context(Context *c) {
221 return new BlessedContext(this, c, get_osdmap_epoch());
222 }
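// A minimal sketch of how the blessing wrappers are meant to be used:
// wrap any completion that could fire after a PG reset, so a stale
// callback is dropped (c.reset()) rather than run against stale state.
// The helper name and callback body are hypothetical. BlessedContext
// takes the PG lock in finish(); the "Unlocked" GenContext variant above
// is for completions that must not take it.
[[maybe_unused]] static void example_register_blessed_commit_cb(
  PrimaryLogPG *pg, ObjectStore::Transaction *t)
{
  Context *cb = new LambdaContext([](int /* result */) {
    // runs only if the PG has not been reset since blessing
  });
  t->register_on_commit(pg->bless_context(cb));
}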
223
224 class PrimaryLogPG::C_PG_ObjectContext : public Context {
225 PrimaryLogPGRef pg;
226 ObjectContext *obc;
227 public:
228 C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
229 pg(p), obc(o) {}
230 void finish(int r) override {
231 pg->object_context_destructor_callback(obc);
232 }
233 };
234
235 struct OnReadComplete : public Context {
236 PrimaryLogPG *pg;
237 PrimaryLogPG::OpContext *opcontext;
238 OnReadComplete(
239 PrimaryLogPG *pg,
240 PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
241 void finish(int r) override {
242 opcontext->finish_read(pg);
243 }
244 ~OnReadComplete() override {}
245 };
246
247 class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
248 PrimaryLogPGRef pg;
249 ObjectContextRef obc;
250 public:
251 C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
252 pg(p), obc(o) {}
253 bool sync_finish(int r) override {
254 pg->_applied_recovered_object(obc);
255 return true;
256 }
257 void finish(int r) override {
258 std::scoped_lock locker{*pg};
259 pg->_applied_recovered_object(obc);
260 }
261 };
262
263 class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
264 PrimaryLogPGRef pg;
265 epoch_t epoch;
266 eversion_t last_complete;
267 public:
268 C_OSD_CommittedPushedObject(
269 PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
270 pg(p), epoch(epoch), last_complete(lc) {
271 }
272 void finish(int r) override {
273 pg->_committed_pushed_object(epoch, last_complete);
274 }
275 };
276
277 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
278 PrimaryLogPGRef pg;
279 public:
280 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
281 pg(p) {}
282 bool sync_finish(int r) override {
283 pg->_applied_recovered_object_replica();
284 return true;
285 }
286 void finish(int r) override {
287 std::scoped_lock locker{*pg};
288 pg->_applied_recovered_object_replica();
289 }
290 };
291
292 // OpContext
293 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
294 {
295 inflightreads = 1;
296 list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
297 pair<bufferlist*, Context*> > > in;
298 in.swap(pending_async_reads);
299 pg->pgbackend->objects_read_async(
300 obc->obs.oi.soid,
301 in,
302 new OnReadComplete(pg, this), pg->get_pool().fast_read);
303 }
304 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
305 {
306 ceph_assert(inflightreads > 0);
307 --inflightreads;
308 if (async_reads_complete()) {
309 ceph_assert(pg->in_progress_async_reads.size());
310 ceph_assert(pg->in_progress_async_reads.front().second == this);
311 pg->in_progress_async_reads.pop_front();
312
313 // Restart the op context now that all reads have been
314 // completed. Read failures will be handled by the op finisher
315 pg->execute_ctx(this);
316 }
317 }
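// A minimal sketch of staging a read for the async path above, assuming a
// caller that already holds an OpContext: each pending_async_reads entry
// pairs an (offset, length, op_flags) extent with a destination bufferlist
// and a per-extent completion. The helper name and literal values are
// hypothetical; the types mirror start_async_reads().
[[maybe_unused]] static void example_stage_async_read(
  PrimaryLogPG::OpContext *ctx, bufferlist *out, Context *on_extent)
{
  ctx->pending_async_reads.push_back(
    make_pair(
      boost::tuple<uint64_t, uint64_t, unsigned>(0, 4096, 0), // off, len, flags
      make_pair(out, on_extent)));
}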
318
319 class CopyFromCallback : public PrimaryLogPG::CopyCallback {
320 public:
321 PrimaryLogPG::CopyResults *results = nullptr;
322 PrimaryLogPG::OpContext *ctx;
323 OSDOp &osd_op;
324 uint32_t truncate_seq;
325 uint64_t truncate_size;
326 bool have_truncate = false;
327
328 CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
329 : ctx(ctx), osd_op(osd_op) {
330 }
331 ~CopyFromCallback() override {}
332
333 void finish(PrimaryLogPG::CopyCallbackResults results_) override {
334 results = results_.get<1>();
335 int r = results_.get<0>();
336
337 // Only use truncate_{seq,size} from the original object if the client
338 // did not send us these parameters
339 if (!have_truncate) {
340 truncate_seq = results->truncate_seq;
341 truncate_size = results->truncate_size;
342 }
343
344 // for finish_copyfrom
345 ctx->user_at_version = results->user_version;
346
347 if (r >= 0) {
348 ctx->pg->execute_ctx(ctx);
349 } else {
350 if (r != -ECANCELED) { // on cancel just toss it out; client resends
351 if (ctx->op)
352 ctx->pg->osd->reply_op_error(ctx->op, r);
353 } else if (results->should_requeue) {
354 if (ctx->op)
355 ctx->pg->requeue_op(ctx->op);
356 }
357 ctx->pg->close_op_ctx(ctx);
358 }
359 }
360
361 bool is_temp_obj_used() {
362 return results->started_temp_obj;
363 }
364 uint64_t get_data_size() {
365 return results->object_size;
366 }
367 void set_truncate(uint32_t seq, uint64_t size) {
368 truncate_seq = seq;
369 truncate_size = size;
370 have_truncate = true;
371 }
372 };
373
374 struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
375 CopyFromCallback *copy_from_callback;
376
377 explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
378 : copy_from_callback(copy_from_callback) {
379 }
380
381 int execute() override {
382 // instance will be destructed after this method completes
383 copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
384 return 0;
385 }
386 };
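// Note on the two-phase flow above: CopyFromCallback::finish() runs once the
// copy data has arrived and re-enters execute_ctx(); the CopyFromFinisher
// registered for the op then runs execute(), which applies the buffered data
// via finish_copyfrom(). On -ECANCELED the client op is requeued or dropped
// instead, and the op context is closed.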
387
388 // ======================
389 // PGBackend::Listener
390
391 void PrimaryLogPG::on_local_recover(
392 const hobject_t &hoid,
393 const ObjectRecoveryInfo &_recovery_info,
394 ObjectContextRef obc,
395 bool is_delete,
396 ObjectStore::Transaction *t
397 )
398 {
399 dout(10) << __func__ << ": " << hoid << dendl;
400
401 ObjectRecoveryInfo recovery_info(_recovery_info);
402 clear_object_snap_mapping(t, hoid);
403 if (!is_delete && recovery_info.soid.is_snap()) {
404 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
405 set<snapid_t> snaps;
406 dout(20) << " snapset " << recovery_info.ss << dendl;
407 auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
408 if (p != recovery_info.ss.clone_snaps.end()) {
409 snaps.insert(p->second.begin(), p->second.end());
410 dout(20) << " snaps " << snaps << dendl;
411 snap_mapper.add_oid(
412 recovery_info.soid,
413 snaps,
414 &_t);
415 } else {
416 derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
417 }
418 }
419 if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
420 recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
421 ceph_assert(is_primary());
422 const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
423 if (latest->op == pg_log_entry_t::LOST_REVERT &&
424 latest->reverting_to == recovery_info.version) {
425 dout(10) << " got old revert version " << recovery_info.version
426 << " for " << *latest << dendl;
427 recovery_info.version = latest->version;
428 // update the attr to the revert event version
429 recovery_info.oi.prior_version = recovery_info.oi.version;
430 recovery_info.oi.version = latest->version;
431 bufferlist bl;
432 encode(recovery_info.oi, bl,
433 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
434 ceph_assert(!pool.info.is_erasure());
435 t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
436 if (obc)
437 obc->attr_cache[OI_ATTR] = bl;
438 }
439 }
440
441 // keep track of active pushes for scrub
442 ++active_pushes;
443
444 recovery_state.recover_got(
445 recovery_info.soid,
446 recovery_info.version,
447 is_delete,
448 *t);
449
450 if (is_primary()) {
451 if (!is_delete) {
452 obc->obs.exists = true;
453
454 bool got = obc->get_recovery_read();
455 ceph_assert(got);
456
457 ceph_assert(recovering.count(obc->obs.oi.soid));
458 recovering[obc->obs.oi.soid] = obc;
459 obc->obs.oi = recovery_info.oi; // may have been updated above
460 }
461
462 t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
463
464 publish_stats_to_osd();
465 release_backoffs(hoid);
466 if (!is_unreadable_object(hoid)) {
467 auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
468 if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
469 dout(20) << " kicking unreadable waiters on " << hoid << dendl;
470 requeue_ops(unreadable_object_entry->second);
471 waiting_for_unreadable_object.erase(unreadable_object_entry);
472 }
473 }
474 } else {
475 t->register_on_applied(
476 new C_OSD_AppliedRecoveredObjectReplica(this));
477
478 }
479
480 t->register_on_commit(
481 new C_OSD_CommittedPushedObject(
482 this,
483 get_osdmap_epoch(),
484 info.last_complete));
485 }
486
487 void PrimaryLogPG::on_global_recover(
488 const hobject_t &soid,
489 const object_stat_sum_t &stat_diff,
490 bool is_delete)
491 {
492 recovery_state.object_recovered(soid, stat_diff);
493 publish_stats_to_osd();
494 dout(10) << "pushed " << soid << " to all replicas" << dendl;
495 auto i = recovering.find(soid);
496 ceph_assert(i != recovering.end());
497
498 if (i->second && i->second->rwstate.recovery_read_marker) {
499 // a recover of a missing object won't have had an obc, but one gets
500 // filled in during on_local_recover
501 ceph_assert(i->second);
502 list<OpRequestRef> requeue_list;
503 i->second->drop_recovery_read(&requeue_list);
504 requeue_ops(requeue_list);
505 }
506
507 backfills_in_flight.erase(soid);
508
509 recovering.erase(i);
510 finish_recovery_op(soid);
511 release_backoffs(soid);
512 auto degraded_object_entry = waiting_for_degraded_object.find(soid);
513 if (degraded_object_entry != waiting_for_degraded_object.end()) {
514 dout(20) << " kicking degraded waiters on " << soid << dendl;
515 requeue_ops(degraded_object_entry->second);
516 waiting_for_degraded_object.erase(degraded_object_entry);
517 }
518 auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
519 if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
520 dout(20) << " kicking unreadable waiters on " << soid << dendl;
521 requeue_ops(unreadable_object_entry->second);
522 waiting_for_unreadable_object.erase(unreadable_object_entry);
523 }
524 finish_degraded_object(soid);
525 }
526
527 void PrimaryLogPG::schedule_recovery_work(
528 GenContext<ThreadPool::TPHandle&> *c)
529 {
530 osd->queue_recovery_context(this, c);
531 }
532
533 void PrimaryLogPG::replica_clear_repop_obc(
534 const vector<pg_log_entry_t> &logv,
535 ObjectStore::Transaction &t)
536 {
537 for (auto &&e: logv) {
538 /* Have to blast all clones, they share a snapset */
539 object_contexts.clear_range(
540 e.soid.get_object_boundary(), e.soid.get_head());
541 ceph_assert(
542 snapset_contexts.find(e.soid.get_head()) ==
543 snapset_contexts.end());
544 }
545 }
546
547 bool PrimaryLogPG::should_send_op(
548 pg_shard_t peer,
549 const hobject_t &hoid) {
550 if (peer == get_primary())
551 return true;
552 ceph_assert(recovery_state.has_peer_info(peer));
553 bool should_send =
554 hoid.pool != (int64_t)info.pgid.pool() ||
555 hoid <= last_backfill_started ||
556 hoid <= recovery_state.get_peer_info(peer).last_backfill;
557 if (!should_send) {
558 ceph_assert(is_backfill_target(peer));
559 dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
560 << ", object " << hoid
561 << " beyond std::max(last_backfill_started "
562 << ", peer_info[peer].last_backfill "
563 << recovery_state.get_peer_info(peer).last_backfill
564 << ")" << dendl;
565 return should_send;
566 }
567 if (is_async_recovery_target(peer) &&
568 recovery_state.get_peer_missing(peer).is_missing(hoid)) {
569 should_send = false;
570 dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
571 << ", object " << hoid
572 << " which is pending recovery in async_recovery_targets" << dendl;
573 }
574 return should_send;
575 }
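// Worked example with hypothetical values: let the peer's last_backfill be B
// and last_backfill_started be S. A repop on object H is sent in full iff
// H <= S, H <= B, or H belongs to a different pool; otherwise the peer
// cannot have H yet and the caller ships an empty repop. An object that is
// merely pending async recovery on the peer likewise suppresses the full op.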
576
577
578 ConnectionRef PrimaryLogPG::get_con_osd_cluster(
579 int peer, epoch_t from_epoch)
580 {
581 return osd->get_con_osd_cluster(peer, from_epoch);
582 }
583
584 PerfCounters *PrimaryLogPG::get_logger()
585 {
586 return osd->logger;
587 }
588
589
590 // ====================
591 // missing objects
592
593 bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
594 {
595 return recovery_state.get_pg_log().get_missing().get_items().count(soid);
596 }
597
598 void PrimaryLogPG::maybe_kick_recovery(
599 const hobject_t &soid)
600 {
601 eversion_t v;
602 bool work_started = false;
603 if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
604 return;
605
606 map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
607 if (p != recovering.end()) {
608 dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
609 } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
610 dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
611 } else {
612 dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
613 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
614 if (is_missing_object(soid)) {
615 recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
616 } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
617 prep_object_replica_deletes(soid, v, h, &work_started);
618 } else {
619 prep_object_replica_pushes(soid, v, h, &work_started);
620 }
621 pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
622 }
623 }
624
625 void PrimaryLogPG::wait_for_unreadable_object(
626 const hobject_t& soid, OpRequestRef op)
627 {
628 ceph_assert(is_unreadable_object(soid));
629 maybe_kick_recovery(soid);
630 waiting_for_unreadable_object[soid].push_back(op);
631 op->mark_delayed("waiting for missing object");
632 }
633
634 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
635 {
636 /* The conditions below may clear (on_local_recover, before we queue
637 * the transaction) before we actually requeue the degraded waiters
638 * in on_global_recover after the transaction completes.
639 */
640 if (waiting_for_degraded_object.count(soid))
641 return true;
642 if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
643 return true;
644 ceph_assert(!get_acting_recovery_backfill().empty());
645 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
646 i != get_acting_recovery_backfill().end();
647 ++i) {
648 if (*i == get_primary()) continue;
649 pg_shard_t peer = *i;
650 auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
651 // If an object is missing on an async_recovery_target, return false.
652 // This will not block the op and the object is async recovered later.
653 if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
654 peer_missing_entry->second.get_items().count(soid)) {
655 if (is_async_recovery_target(peer))
656 continue;
657 else
658 return true;
659 }
660 // Object is degraded if after last_backfill AND
661 // we are backfilling it
662 if (is_backfill_target(peer) &&
663 recovery_state.get_peer_info(peer).last_backfill <= soid &&
664 last_backfill_started >= soid &&
665 backfills_in_flight.count(soid))
666 return true;
667 }
668 return false;
669 }
670
671 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
672 {
673 for (auto &i: get_async_recovery_targets()) {
674 auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
675 if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
676 peer_missing_entry->second.get_items().count(soid)) {
677 dout(30) << __func__ << " " << soid << dendl;
678 return true;
679 }
680 }
681 return false;
682 }
683
684 void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
685 {
686 ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));
687
688 maybe_kick_recovery(soid);
689 waiting_for_degraded_object[soid].push_back(op);
690 op->mark_delayed("waiting for degraded object");
691 }
692
693 void PrimaryLogPG::block_write_on_full_cache(
694 const hobject_t& _oid, OpRequestRef op)
695 {
696 const hobject_t oid = _oid.get_head();
697 dout(20) << __func__ << ": blocking object " << oid
698 << " on full cache" << dendl;
699 objects_blocked_on_cache_full.insert(oid);
700 waiting_for_cache_not_full.push_back(op);
701 op->mark_delayed("waiting for cache not full");
702 }
703
704 void PrimaryLogPG::block_for_clean(
705 const hobject_t& oid, OpRequestRef op)
706 {
707 dout(20) << __func__ << ": blocking object " << oid
708 << " on primary repair" << dendl;
709 waiting_for_clean_to_primary_repair.push_back(op);
710 op->mark_delayed("waiting for clean to repair");
711 }
712
713 void PrimaryLogPG::block_write_on_snap_rollback(
714 const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
715 {
716 dout(20) << __func__ << ": blocking object " << oid.get_head()
717 << " on snap promotion " << obc->obs.oi.soid << dendl;
718 // otherwise, we'd have blocked in do_op
719 ceph_assert(oid.is_head());
720 ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
721 /*
722 * We block the head object here.
723 *
724 * Suppose a read races with a rollback of the head object.
725 * Because two different ops can trigger promote_object() with the same
726 * source, they can keep cancelling each other, producing an infinite loop.
727 * To avoid this, we block the head object during the rollback,
728 * so the racing read is blocked until the rollback completes.
729 * see also: https://tracker.ceph.com/issues/49726
730 */
731 ObjectContextRef head_obc = get_object_context(oid, false);
732 head_obc->start_block();
733 objects_blocked_on_snap_promotion[oid] = obc;
734 wait_for_blocked_object(obc->obs.oi.soid, op);
735 }
736
737 void PrimaryLogPG::block_write_on_degraded_snap(
738 const hobject_t& snap, OpRequestRef op)
739 {
740 dout(20) << __func__ << ": blocking object " << snap.get_head()
741 << " on degraded snap " << snap << dendl;
742 // otherwise, we'd have blocked in do_op
743 ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
744 objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
745 wait_for_degraded_object(snap, op);
746 }
747
748 bool PrimaryLogPG::maybe_await_blocked_head(
749 const hobject_t &hoid,
750 OpRequestRef op)
751 {
752 ObjectContextRef obc;
753 obc = object_contexts.lookup(hoid.get_head());
754 if (obc) {
755 if (obc->is_blocked()) {
756 wait_for_blocked_object(obc->obs.oi.soid, op);
757 return true;
758 } else {
759 return false;
760 }
761 }
762 return false;
763 }
764
765 void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
766 {
767 dout(10) << __func__ << " " << soid << " " << op << dendl;
768 waiting_for_blocked_object[soid].push_back(op);
769 op->mark_delayed("waiting for blocked object");
770 }
771
772 void PrimaryLogPG::maybe_force_recovery()
773 {
774 // no force if not in degraded/recovery/backfill states
775 if (!is_degraded() &&
776 !state_test(PG_STATE_RECOVERING |
777 PG_STATE_RECOVERY_WAIT |
778 PG_STATE_BACKFILLING |
779 PG_STATE_BACKFILL_WAIT |
780 PG_STATE_BACKFILL_TOOFULL))
781 return;
782
783 if (recovery_state.get_pg_log().get_log().approx_size() <
784 cct->_conf->osd_max_pg_log_entries *
785 cct->_conf->osd_force_recovery_pg_log_entries_factor)
786 return;
787
788 // find the oldest missing object
789 version_t min_version = recovery_state.get_pg_log().get_log().head.version;
790 hobject_t soid;
791 if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
792 min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
793 soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
794 }
795 ceph_assert(!get_acting_recovery_backfill().empty());
796 for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
797 it != get_acting_recovery_backfill().end();
798 ++it) {
799 if (*it == get_primary()) continue;
800 pg_shard_t peer = *it;
801 auto it_missing = recovery_state.get_peer_missing().find(peer);
802 if (it_missing != recovery_state.get_peer_missing().end() &&
803 !it_missing->second.get_rmissing().empty()) {
804 const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
805 dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
806 << " oid " << min_obj->second << dendl;
807 if (min_version > min_obj->first) {
808 min_version = min_obj->first;
809 soid = min_obj->second;
810 }
811 }
812 }
813
814 // recover it
815 if (soid != hobject_t())
816 maybe_kick_recovery(soid);
817 }
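// Worked example for the threshold above, assuming the common defaults of
// osd_max_pg_log_entries = 10000 and
// osd_force_recovery_pg_log_entries_factor = 1.3 (verify for your release):
// once the PG log grows past roughly 13000 entries, the oldest missing
// object across the primary and its peers is kicked into recovery.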
818
819 bool PrimaryLogPG::check_laggy(OpRequestRef& op)
820 {
821 assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(),
822 SERVER_OCTOPUS));
823 if (state_test(PG_STATE_WAIT)) {
824 dout(10) << __func__ << " PG is WAIT state" << dendl;
825 } else if (!state_test(PG_STATE_LAGGY)) {
826 auto mnow = osd->get_mnow();
827 auto ru = recovery_state.get_readable_until();
828 if (mnow <= ru) {
829 // not laggy
830 return true;
831 }
832 dout(10) << __func__
833 << " mnow " << mnow
834 << " > readable_until " << ru << dendl;
835
836 if (!is_primary()) {
837 osd->reply_op_error(op, -EAGAIN);
838 return false;
839 }
840
841 // go to laggy state
842 state_set(PG_STATE_LAGGY);
843 publish_stats_to_osd();
844 }
845 dout(10) << __func__ << " not readable" << dendl;
846 waiting_for_readable.push_back(op);
847 op->mark_delayed("waiting for readable");
848 return false;
849 }
850
851 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
852 {
853 assert(HAVE_FEATURE(recovery_state.get_min_upacting_features(),
854 SERVER_OCTOPUS));
855 if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
856 return true; // not laggy
857 }
858 dout(10) << __func__ << " not readable" << dendl;
859 waiting_for_readable.push_front(op);
860 op->mark_delayed("waiting for readable");
861 return false;
862 }
863
864 void PrimaryLogPG::recheck_readable()
865 {
866 if (!is_wait() && !is_laggy()) {
867 dout(20) << __func__ << " wasn't wait or laggy" << dendl;
868 return;
869 }
870 auto mnow = osd->get_mnow();
871 bool pub = false;
872 if (is_wait()) {
873 auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
874 if (mnow < prior_readable_until_ub) {
875 dout(10) << __func__ << " still wait (mnow " << mnow
876 << " < prior_readable_until_ub " << prior_readable_until_ub
877 << ")" << dendl;
878 } else {
879 dout(10) << __func__ << " no longer wait (mnow " << mnow
880 << " >= prior_readable_until_ub " << prior_readable_until_ub
881 << ")" << dendl;
882 state_clear(PG_STATE_WAIT);
883 recovery_state.clear_prior_readable_until_ub();
884 pub = true;
885 }
886 }
887 if (is_laggy()) {
888 auto ru = recovery_state.get_readable_until();
889 if (ru == ceph::signedspan::zero()) {
890 dout(10) << __func__ << " still laggy (mnow " << mnow
891 << ", readable_until zero)" << dendl;
892 } else if (mnow >= ru) {
893 dout(10) << __func__ << " still laggy (mnow " << mnow
894 << " >= readable_until " << ru << ")" << dendl;
895 } else {
896 dout(10) << __func__ << " no longer laggy (mnow " << mnow
897 << " < readable_until " << ru << ")" << dendl;
898 state_clear(PG_STATE_LAGGY);
899 pub = true;
900 }
901 }
902 if (pub) {
903 publish_stats_to_osd();
904 }
905 if (!is_laggy() && !is_wait()) {
906 requeue_ops(waiting_for_readable);
907 }
908 }
909
910 bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
911 {
912 bufferlist bl;
913
914 // If filter has expressed an interest in an xattr, load it.
915 if (!filter.get_xattr().empty()) {
916 int ret = pgbackend->objects_get_attr(
917 sobj,
918 filter.get_xattr(),
919 &bl);
920 dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
921 if (ret < 0) {
922 if (ret != -ENODATA || filter.reject_empty_xattr()) {
923 return false;
924 }
925 }
926 }
927
928 return filter.filter(sobj, bl);
929 }
930
931 std::pair<int, std::unique_ptr<const PGLSFilter>>
932 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
933 {
934 string type;
935 // storing non-const PGLSFilter for the sake of ::init()
936 std::unique_ptr<PGLSFilter> filter;
937
938 try {
939 decode(type, iter);
940 }
941 catch (ceph::buffer::error& e) {
942 return { -EINVAL, nullptr };
943 }
944
945 if (type.compare("plain") == 0) {
946 filter = std::make_unique<PGLSPlainFilter>();
947 } else {
948 std::size_t dot = type.find('.');
949 if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
950 return { -EINVAL, nullptr };
951 }
952
953 const std::string class_name = type.substr(0, dot);
954 const std::string filter_name = type.substr(dot + 1);
955 ClassHandler::ClassData *cls = NULL;
956 int r = ClassHandler::get_instance().open_class(class_name, &cls);
957 if (r != 0) {
958 derr << "Error opening class '" << class_name << "': "
959 << cpp_strerror(r) << dendl;
960 if (r != -EPERM) // propagate permission error
961 r = -EINVAL;
962 return { r, nullptr };
963 } else {
964 ceph_assert(cls);
965 }
966
967 ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
968 if (class_filter == NULL) {
969 derr << "Error finding filter '" << filter_name << "' in class "
970 << class_name << dendl;
971 return { -EINVAL, nullptr };
972 }
973 filter.reset(class_filter->fn());
974 if (!filter) {
975 // Object classes are obliged to return us something, but let's
976 // give an error rather than asserting out.
977 derr << "Buggy class " << class_name << " failed to construct "
978 "filter " << filter_name << dendl;
979 return { -EINVAL, nullptr };
980 }
981 }
982
983 ceph_assert(filter);
984 int r = filter->init(iter);
985 if (r < 0) {
986 derr << "Error initializing filter " << type << ": "
987 << cpp_strerror(r) << dendl;
988 return { -EINVAL, nullptr };
989 } else {
990 // Successfully constructed and initialized, return it.
991 return std::make_pair(0, std::move(filter));
992 }
993 }
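// A minimal sketch of the client-side payload decoded above: a type string
// followed by parameters consumed by filter->init(). That the "plain" filter
// expects an xattr name plus a value to match is an assumption inferred from
// the PGLSFilter accessors used in pgls_filter(), not a documented wire
// format; the xattr name and value below are hypothetical.
[[maybe_unused]] static bufferlist example_build_pgls_filter_payload()
{
  bufferlist bl;
  encode(std::string("plain"), bl);      // filter type
  encode(std::string("_category"), bl);  // xattr to inspect (hypothetical)
  bufferlist val;
  val.append("gold");
  encode(val, bl);                       // value to match (hypothetical)
  return bl;
}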
994
995
996 // ==========================================================
997
998 void PrimaryLogPG::do_command(
999 const string_view& orig_prefix,
1000 const cmdmap_t& cmdmap,
1001 const bufferlist& idata,
1002 std::function<void(int,const std::string&,bufferlist&)> on_finish)
1003 {
1004 string format;
1005 cmd_getval(cmdmap, "format", format);
1006 std::unique_ptr<Formatter> f(Formatter::create(
1007 format, "json-pretty", "json-pretty"));
1008 int ret = 0;
1009 stringstream ss; // stderr error message stream
1010 bufferlist outbl; // if empty at end, we'll dump formatter as output
1011
1012 // get final prefix:
1013 // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
1014 // - ceph tell <pgid> foo -> prefix=foo
1015 string prefix(orig_prefix);
1016 string command;
1017 cmd_getval(cmdmap, "cmd", command);
1018 if (command.size()) {
1019 prefix = command;
1020 }
1021
1022 if (prefix == "query") {
1023 f->open_object_section("pg");
1024 f->dump_stream("snap_trimq") << snap_trimq;
1025 f->dump_unsigned("snap_trimq_len", snap_trimq.size());
1026 recovery_state.dump_peering_state(f.get());
1027
1028 f->open_array_section("recovery_state");
1029 handle_query_state(f.get());
1030 f->close_section();
1031
1032 if (is_primary() && is_active() && m_scrubber) {
1033 m_scrubber->dump_scrubber(f.get(), m_planned_scrub);
1034 }
1035
1036 f->open_object_section("agent_state");
1037 if (agent_state)
1038 agent_state->dump(f.get());
1039 f->close_section();
1040
1041 f->close_section();
1042 }
1043
1044 else if (prefix == "mark_unfound_lost") {
1045 string mulcmd;
1046 cmd_getval(cmdmap, "mulcmd", mulcmd);
1047 int mode = -1;
1048 if (mulcmd == "revert") {
1049 if (pool.info.is_erasure()) {
1050 ss << "mode must be 'delete' for ec pool";
1051 ret = -EINVAL;
1052 goto out;
1053 }
1054 mode = pg_log_entry_t::LOST_REVERT;
1055 } else if (mulcmd == "delete") {
1056 mode = pg_log_entry_t::LOST_DELETE;
1057 } else {
1058 ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
1059 ret = -EINVAL;
1060 goto out;
1061 }
1062 ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
1063 mode == pg_log_entry_t::LOST_DELETE);
1064
1065 if (!is_primary()) {
1066 ss << "not primary";
1067 ret = -EROFS;
1068 goto out;
1069 }
1070
1071 uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
1072 if (!unfound) {
1073 ss << "pg has no unfound objects";
1074 goto out; // make command idempotent
1075 }
1076
1077 if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
1078 ss << "pg has " << unfound
1079 << " unfound objects but we haven't probed all sources, not marking lost";
1080 ret = -EINVAL;
1081 goto out;
1082 }
1083
1084 mark_all_unfound_lost(mode, on_finish);
1085 return;
1086 }
1087
1088 else if (prefix == "list_unfound") {
1089 hobject_t offset;
1090 string offset_json;
1091 bool show_offset = false;
1092 if (cmd_getval(cmdmap, "offset", offset_json)) {
1093 json_spirit::Value v;
1094 try {
1095 if (!json_spirit::read(offset_json, v))
1096 throw std::runtime_error("bad json");
1097 offset.decode(v);
1098 } catch (std::runtime_error& e) {
1099 ss << "error parsing offset: " << e.what();
1100 ret = -EINVAL;
1101 goto out;
1102 }
1103 show_offset = true;
1104 }
1105 f->open_object_section("missing");
1106 if (show_offset) {
1107 f->open_object_section("offset");
1108 offset.dump(f.get());
1109 f->close_section();
1110 }
1111 auto &needs_recovery_map = recovery_state.get_missing_loc()
1112 .get_needs_recovery();
1113 f->dump_int("num_missing", needs_recovery_map.size());
1114 f->dump_int("num_unfound", get_num_unfound());
1115 map<hobject_t, pg_missing_item>::const_iterator p =
1116 needs_recovery_map.upper_bound(offset);
1117 {
1118 f->open_array_section("objects");
1119 int32_t num = 0;
1120 for (; p != needs_recovery_map.end() &&
1121 num < cct->_conf->osd_command_max_records;
1122 ++p) {
1123 if (recovery_state.get_missing_loc().is_unfound(p->first)) {
1124 f->open_object_section("object");
1125 {
1126 f->open_object_section("oid");
1127 p->first.dump(f.get());
1128 f->close_section();
1129 }
1130 p->second.dump(f.get()); // have, need keys
1131 {
1132 f->open_array_section("locations");
1133 for (auto &&r : recovery_state.get_missing_loc().get_locations(
1134 p->first)) {
1135 f->dump_stream("shard") << r;
1136 }
1137 f->close_section();
1138 }
1139 f->close_section();
1140 num++;
1141 }
1142 }
1143 f->close_section();
1144 }
1145 // Get possible locations of missing objects from pg information
1146 PeeringState::QueryUnfound q(f.get());
1147 recovery_state.handle_event(q, 0);
1148 f->dump_bool("more", p != needs_recovery_map.end());
1149 f->close_section();
1150 }
1151
1152 else if (prefix == "scrub" ||
1153 prefix == "deep_scrub") {
1154 bool deep = (prefix == "deep_scrub");
1155 int64_t time = cmd_getval_or<int64_t>(cmdmap, "time", 0);
1156
1157 if (is_primary()) {
1158 const pg_pool_t *p = &pool.info;
1159 double pool_scrub_max_interval = 0;
1160 double scrub_max_interval;
1161 if (deep) {
1162 p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
1163 scrub_max_interval = pool_scrub_max_interval > 0 ?
1164 pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
1165 } else {
1166 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
1167 scrub_max_interval = pool_scrub_max_interval > 0 ?
1168 pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
1169 }
1170 // Instead of marking must_scrub, force a scheduled scrub
1171 utime_t stamp = ceph_clock_now();
1172 if (time == 0)
1173 stamp -= scrub_max_interval;
1174 else
1175 stamp -= (float)time;
1176 stamp -= 100.0; // push back last scrub more for good measure
1177 if (deep) {
1178 set_last_deep_scrub_stamp(stamp);
1179 }
1180 set_last_scrub_stamp(stamp); // for 'deep' as well, as we use this value to order scrubs
1181 f->open_object_section("result");
1182 f->dump_bool("deep", deep);
1183 f->dump_stream("stamp") << stamp;
1184 f->close_section();
1185 } else {
1186 ss << "Not primary";
1187 ret = -EPERM;
1188 }
1189 outbl.append(ss.str());
1190 }
1191
1192 else if (prefix == "block" || prefix == "unblock" || prefix == "set" ||
1193 prefix == "unset") {
1194 string value;
1195 cmd_getval(cmdmap, "value", value);
1196
1197 if (is_primary()) {
1198 ret = m_scrubber->asok_debug(prefix, value, f.get(), ss);
1199 f->open_object_section("result");
1200 f->dump_bool("success", true);
1201 f->close_section();
1202 } else {
1203 ss << "Not primary";
1204 ret = -EPERM;
1205 }
1206 outbl.append(ss.str());
1207 }
1208 else {
1209 ret = -ENOSYS;
1210 ss << "prefix '" << prefix << "' not implemented";
1211 }
1212
1213 out:
1214 if (ret >= 0 && outbl.length() == 0) {
1215 f->flush(outbl);
1216 }
1217 on_finish(ret, ss.str(), outbl);
1218 }
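// Example invocations that land in do_command() (the pgid 1.2f is
// illustrative):
//   ceph tell 1.2f query
//   ceph pg 1.2f list_unfound
//   ceph pg 1.2f mark_unfound_lost revert
//   ceph tell 1.2f deep_scrub
// The reply is the flushed formatter (JSON by default) unless the handler
// wrote plain text into outbl, as the scrub branches above do.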
1219
1220
1221 // ==========================================================
1222
1223 void PrimaryLogPG::do_pg_op(OpRequestRef op)
1224 {
1225 const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
1226 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1227 dout(10) << "do_pg_op " << *m << dendl;
1228
1229 op->mark_started();
1230
1231 int result = 0;
1232 string cname, mname;
1233
1234 snapid_t snapid = m->get_snapid();
1235
1236 vector<OSDOp> ops = m->ops;
1237
1238 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
1239 std::unique_ptr<const PGLSFilter> filter;
1240 OSDOp& osd_op = *p;
1241 auto bp = p->indata.cbegin();
1242 switch (p->op.op) {
1243 case CEPH_OSD_OP_PGNLS_FILTER:
1244 try {
1245 decode(cname, bp);
1246 decode(mname, bp);
1247 }
1248 catch (const ceph::buffer::error& e) {
1249 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1250 result = -EINVAL;
1251 break;
1252 }
1253 std::tie(result, filter) = get_pgls_filter(bp);
1254 if (result < 0)
1255 break;
1256
1257 ceph_assert(filter);
1258
1259 // fall through
1260
1261 case CEPH_OSD_OP_PGNLS:
1262 if (snapid != CEPH_NOSNAP) {
1263 result = -EINVAL;
1264 break;
1265 }
1266 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1267 dout(10) << " pgnls pg=" << m->get_pg()
1268 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1269 << " != " << info.pgid << dendl;
1270 result = 0; // hmm?
1271 } else {
1272 unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1273 p->op.pgls.count);
1274
1275 dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
1276 << dendl;
1277 // read into a buffer
1278 vector<hobject_t> sentries;
1279 pg_nls_response_t response;
1280 try {
1281 decode(response.handle, bp);
1282 }
1283 catch (const ceph::buffer::error& e) {
1284 dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
1285 result = -EINVAL;
1286 break;
1287 }
1288
1289 hobject_t next;
1290 hobject_t lower_bound = response.handle;
1291 hobject_t pg_start = info.pgid.pgid.get_hobj_start();
1292 hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1293 dout(10) << " pgnls lower_bound " << lower_bound
1294 << " pg_end " << pg_end << dendl;
1295 if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
1296 (lower_bound != hobject_t() && lower_bound < pg_start))) {
1297 // this should only happen with a buggy client.
1298 dout(10) << "outside of PG bounds " << pg_start << " .. "
1299 << pg_end << dendl;
1300 result = -EINVAL;
1301 break;
1302 }
1303
1304 hobject_t current = lower_bound;
1305 int r = pgbackend->objects_list_partial(
1306 current,
1307 list_size,
1308 list_size,
1309 &sentries,
1310 &next);
1311 if (r != 0) {
1312 result = -EINVAL;
1313 break;
1314 }
1315
1316 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1317 recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
1318 vector<hobject_t>::iterator ls_iter = sentries.begin();
1319 hobject_t _max = hobject_t::get_max();
1320 while (1) {
1321 const hobject_t &mcand =
1322 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
1323 _max :
1324 missing_iter->first;
1325 const hobject_t &lcand =
1326 ls_iter == sentries.end() ?
1327 _max :
1328 *ls_iter;
1329
1330 hobject_t candidate;
1331 if (mcand == lcand) {
1332 candidate = mcand;
1333 if (!mcand.is_max()) {
1334 ++ls_iter;
1335 ++missing_iter;
1336 }
1337 } else if (mcand < lcand) {
1338 candidate = mcand;
1339 ceph_assert(!mcand.is_max());
1340 ++missing_iter;
1341 } else {
1342 candidate = lcand;
1343 ceph_assert(!lcand.is_max());
1344 ++ls_iter;
1345 }
1346
1347 dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
1348 << " vs lower bound 0x" << lower_bound.get_hash()
1349 << std::dec << dendl;
1350
1351 if (candidate >= next) {
1352 break;
1353 }
1354
1355 if (response.entries.size() == list_size) {
1356 next = candidate;
1357 break;
1358 }
1359
1360 if (candidate.snap != CEPH_NOSNAP)
1361 continue;
1362
1363 // skip internal namespace
1364 if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
1365 continue;
1366
1367 if (recovery_state.get_missing_loc().is_deleted(candidate))
1368 continue;
1369
1370 // skip wrong namespace
1371 if (m->get_hobj().nspace != librados::all_nspaces &&
1372 candidate.get_namespace() != m->get_hobj().nspace)
1373 continue;
1374
1375 if (filter && !pgls_filter(*filter, candidate))
1376 continue;
1377
1378 dout(20) << "pgnls item 0x" << std::hex
1379 << candidate.get_hash()
1380 << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
1381 << std::dec << " "
1382 << candidate.oid.name << dendl;
1383
1384 librados::ListObjectImpl item;
1385 item.nspace = candidate.get_namespace();
1386 item.oid = candidate.oid.name;
1387 item.locator = candidate.get_key();
1388 response.entries.push_back(item);
1389 }
1390
1391 if (next.is_max() &&
1392 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
1393 ls_iter == sentries.end()) {
1394 result = 1;
1395
1396 // Set response.handle to the start of the next PG according
1397 // to the object sort order.
1398 response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1399 } else {
1400 response.handle = next;
1401 }
1402 dout(10) << "pgnls handle=" << response.handle << dendl;
1403 encode(response, osd_op.outdata);
1404 dout(10) << " pgnls result=" << result << " outdata.length()="
1405 << osd_op.outdata.length() << dendl;
1406 }
1407 break;
1408
1409 case CEPH_OSD_OP_PGLS_FILTER:
1410 try {
1411 decode(cname, bp);
1412 decode(mname, bp);
1413 }
1414 catch (const ceph::buffer::error& e) {
1415 dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1416 result = -EINVAL;
1417 break;
1418 }
1419 std::tie(result, filter) = get_pgls_filter(bp);
1420 if (result < 0)
1421 break;
1422
1423 ceph_assert(filter);
1424
1425 // fall through
1426
1427 case CEPH_OSD_OP_PGLS:
1428 if (snapid != CEPH_NOSNAP) {
1429 result = -EINVAL;
1430 break;
1431 }
1432 if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1433 dout(10) << " pgls pg=" << m->get_pg()
1434 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1435 << " != " << info.pgid << dendl;
1436 result = 0; // hmm?
1437 } else {
1438 unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1439 p->op.pgls.count);
1440
1441 dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
1442 // read into a buffer
1443 vector<hobject_t> sentries;
1444 pg_ls_response_t response;
1445 try {
1446 decode(response.handle, bp);
1447 }
1448 catch (const ceph::buffer::error& e) {
1449 dout(0) << "unable to decode PGLS handle in " << *m << dendl;
1450 result = -EINVAL;
1451 break;
1452 }
1453
1454 hobject_t next;
1455 hobject_t current = response.handle;
1456 int r = pgbackend->objects_list_partial(
1457 current,
1458 list_size,
1459 list_size,
1460 &sentries,
1461 &next);
1462 if (r != 0) {
1463 result = -EINVAL;
1464 break;
1465 }
1466
1467 ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());
1468
1469 map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1470 recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
1471 vector<hobject_t>::iterator ls_iter = sentries.begin();
1472 hobject_t _max = hobject_t::get_max();
1473 while (1) {
1474 const hobject_t &mcand =
1475 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
1476 _max :
1477 missing_iter->first;
1478 const hobject_t &lcand =
1479 ls_iter == sentries.end() ?
1480 _max :
1481 *ls_iter;
1482
1483 hobject_t candidate;
1484 if (mcand == lcand) {
1485 candidate = mcand;
1486 if (!mcand.is_max()) {
1487 ++ls_iter;
1488 ++missing_iter;
1489 }
1490 } else if (mcand < lcand) {
1491 candidate = mcand;
1492 ceph_assert(!mcand.is_max());
1493 ++missing_iter;
1494 } else {
1495 candidate = lcand;
1496 ceph_assert(!lcand.is_max());
1497 ++ls_iter;
1498 }
1499
1500 if (candidate >= next) {
1501 break;
1502 }
1503
1504 if (response.entries.size() == list_size) {
1505 next = candidate;
1506 break;
1507 }
1508
1509 if (candidate.snap != CEPH_NOSNAP)
1510 continue;
1511
1512 // skip wrong namespace
1513 if (candidate.get_namespace() != m->get_hobj().nspace)
1514 continue;
1515
1516 if (recovery_state.get_missing_loc().is_deleted(candidate))
1517 continue;
1518
1519 if (filter && !pgls_filter(*filter, candidate))
1520 continue;
1521
1522 response.entries.push_back(make_pair(candidate.oid,
1523 candidate.get_key()));
1524 }
1525 if (next.is_max() &&
1526 missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
1527 ls_iter == sentries.end()) {
1528 result = 1;
1529 }
1530 response.handle = next;
1531 encode(response, osd_op.outdata);
1532 dout(10) << " pgls result=" << result << " outdata.length()="
1533 << osd_op.outdata.length() << dendl;
1534 }
1535 break;
1536
1537 case CEPH_OSD_OP_PG_HITSET_LS:
1538 {
1539 list< pair<utime_t,utime_t> > ls;
1540 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1541 p != info.hit_set.history.end();
1542 ++p)
1543 ls.push_back(make_pair(p->begin, p->end));
1544 if (hit_set)
1545 ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
1546 encode(ls, osd_op.outdata);
1547 }
1548 break;
1549
1550 case CEPH_OSD_OP_PG_HITSET_GET:
1551 {
1552 utime_t stamp(osd_op.op.hit_set_get.stamp);
1553 if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
1554 // read the current in-memory HitSet, not the version we've
1555 // checkpointed.
1556 if (!hit_set) {
1557 result = -ENOENT;
1558 break;
1559 }
1560 encode(*hit_set, osd_op.outdata);
1561 result = osd_op.outdata.length();
1562 } else {
1563 // read an archived HitSet.
1564 hobject_t oid;
1565 for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1566 p != info.hit_set.history.end();
1567 ++p) {
1568 if (stamp >= p->begin && stamp <= p->end) {
1569 oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
1570 break;
1571 }
1572 }
1573 if (oid == hobject_t()) {
1574 result = -ENOENT;
1575 break;
1576 }
1577 if (!pool.info.is_replicated()) {
1578 // FIXME: EC not supported yet
1579 result = -EOPNOTSUPP;
1580 break;
1581 }
1582 if (is_unreadable_object(oid)) {
1583 wait_for_unreadable_object(oid, op);
1584 return;
1585 }
1586 result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
1587 }
1588 }
1589 break;
1590
1591 case CEPH_OSD_OP_SCRUBLS:
1592 result = do_scrub_ls(m, &osd_op);
1593 break;
1594
1595 default:
1596 result = -EINVAL;
1597 break;
1598 }
1599
1600 if (result < 0)
1601 break;
1602 }
1603
1604 // reply
1605 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
1606 CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
1607 false);
1608 reply->claim_op_out_data(ops);
1609 reply->set_result(result);
1610 reply->set_reply_versions(info.last_update, info.last_user_version);
1611 osd->send_message_osd_client(reply, m->get_connection());
1612 }
1613
1614 int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
1615 {
1616 if (m->get_pg() != info.pgid.pgid) {
1617 dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1618 return -EINVAL; // hmm?
1619 }
1620 auto bp = osd_op->indata.cbegin();
1621 scrub_ls_arg_t arg;
1622 try {
1623 arg.decode(bp);
1624 } catch (ceph::buffer::error&) {
1625 dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1626 return -EINVAL;
1627 }
1628
1629 int r = 0;
1630 scrub_ls_result_t result = {.interval = info.history.same_interval_since};
1631
1632 if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1633 r = -EAGAIN;
1634 } else {
1635 bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result);
1636 if (store_queried) {
1637 encode(result, osd_op->outdata);
1638 } else {
1639 // the scrubber's store is not initialized
1640 r = -ENOENT;
1641 }
1642 }
1643
1644 return r;
1645 }
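// do_scrub_ls() backs 'rados list-inconsistent-obj <pgid>' and
// 'rados list-inconsistent-snapset <pgid>'; the -EAGAIN above tells the
// client its cached interval is stale and the listing must be restarted.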
1646
1647 /**
1648 * Grabs locks for OpContext, should be cleaned up in close_op_ctx
1649 *
1650 * @param ctx [in,out] ctx to get locks for
1651 * @return true on success, false if we are queued
1652 */
1653 bool PrimaryLogPG::get_rw_locks(bool write_ordered, OpContext *ctx)
1654 {
1655 /* If head_obc is set, then !obc->obs.exists, and we will always take the
1656 * snapdir lock *before* the head lock. Since all callers (read or write)
1657 * acquire the locks in this order, getting the first guarantees we will
1658 * get the second.
1659 */
1660 if (write_ordered && ctx->op->may_read()) {
1661 ctx->lock_type = RWState::RWEXCL;
1662 } else if (write_ordered) {
1663 ctx->lock_type = RWState::RWWRITE;
1664 } else {
1665 ceph_assert(ctx->op->may_read());
1666 ctx->lock_type = RWState::RWREAD;
1667 }
1668
1669 if (ctx->head_obc) {
1670 ceph_assert(!ctx->obc->obs.exists);
1671 if (!ctx->lock_manager.get_lock_type(
1672 ctx->lock_type,
1673 ctx->head_obc->obs.oi.soid,
1674 ctx->head_obc,
1675 ctx->op)) {
1676 ctx->lock_type = RWState::RWNONE;
1677 return false;
1678 }
1679 }
1680 if (ctx->lock_manager.get_lock_type(
1681 ctx->lock_type,
1682 ctx->obc->obs.oi.soid,
1683 ctx->obc,
1684 ctx->op)) {
1685 return true;
1686 } else {
1687 ceph_assert(!ctx->head_obc);
1688 ctx->lock_type = RWState::RWNONE;
1689 return false;
1690 }
1691 }
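// Lock type selected above, summarized:
//   write-ordered op that also reads -> RWState::RWEXCL (read-modify-write)
//   write-ordered only               -> RWState::RWWRITE
//   read-only                        -> RWState::RWREAD
// On failure the op has already been queued by the lock manager and will be
// requeued when the current holder releases (see release_object_locks()).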
1692
1693 /**
1694 * Releases locks
1695 *
1696 * @param manager [in] manager with locks to release
1697 */
1698 void PrimaryLogPG::release_object_locks(
1699 ObcLockManager &lock_manager) {
1700 std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
1701 bool requeue_recovery = false;
1702 bool requeue_snaptrim = false;
1703 lock_manager.put_locks(
1704 &to_req,
1705 &requeue_recovery,
1706 &requeue_snaptrim);
1707 if (requeue_recovery)
1708 queue_recovery();
1709 if (requeue_snaptrim)
1710 snap_trimmer_machine.process_event(TrimWriteUnblocked());
1711
1712 if (!to_req.empty()) {
1713 // requeue at front of scrub blocking queue if we are blocked by scrub
1714 for (auto &&p: to_req) {
1715 if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
1716 for (auto& op : p.second) {
1717 op->mark_delayed("waiting for scrub");
1718 }
1719
1720 waiting_for_scrub.splice(
1721 waiting_for_scrub.begin(),
1722 p.second,
1723 p.second.begin(),
1724 p.second.end());
1725 } else if (is_laggy()) {
1726 for (auto& op : p.second) {
1727 op->mark_delayed("waiting for readable");
1728 }
1729 waiting_for_readable.splice(
1730 waiting_for_readable.begin(),
1731 p.second,
1732 p.second.begin(),
1733 p.second.end());
1734 } else {
1735 requeue_ops(p.second);
1736 }
1737 }
1738 }
1739 }
1740
1741 PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
1742 const PGPool &_pool,
1743 const map<string,string>& ec_profile, spg_t p) :
1744 PG(o, curmap, _pool, p),
1745 pgbackend(
1746 PGBackend::build_pg_backend(
1747 _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
1748 object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
1749 new_backfill(false),
1750 temp_seq(0),
1751 snap_trimmer_machine(this)
1752 {
1753 recovery_state.set_backend_predicates(
1754 pgbackend->get_is_readable_predicate(),
1755 pgbackend->get_is_recoverable_predicate());
1756 snap_trimmer_machine.initiate();
1757
1758 m_scrubber = make_unique<PrimaryLogScrub>(this);
1759 }
1760
1761 PrimaryLogPG::~PrimaryLogPG()
1762 {
1763 m_scrubber.reset();
1764 }
1765
1766 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1767 {
1768 src_oloc = oloc;
1769 if (oloc.key.empty())
1770 src_oloc.key = oid.name;
1771 }
1772
1773 void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1774 {
1775 auto m = op->get_req<MOSDBackoff>();
1776 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
1777 if (!session)
1778 return; // drop it.
1779 hobject_t begin = info.pgid.pgid.get_hobj_start();
1780 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1781 if (begin < m->begin) {
1782 begin = m->begin;
1783 }
1784 if (end > m->end) {
1785 end = m->end;
1786 }
1787 dout(10) << __func__ << " backoff ack id " << m->id
1788 << " [" << begin << "," << end << ")" << dendl;
1789 session->ack_backoff(cct, m->pgid, m->id, begin, end);
1790 }
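// Worked example with hypothetical ranges: if this PG now spans [P0, P1) and
// the client acks a backoff on [B0, B1), the ack is clamped to the
// intersection [max(P0, B0), min(P1, B1)), so after a PG split we only
// release the portion of the range this PG still owns.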
1791
1792 void PrimaryLogPG::do_request(
1793 OpRequestRef& op,
1794 ThreadPool::TPHandle &handle)
1795 {
1796 if (op->osd_trace) {
1797 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1798 op->pg_trace.event("do request");
1799 }
1800
1801 [[maybe_unused]] auto span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
1802
1803 // make sure we have a new enough map
1804 auto p = waiting_for_map.find(op->get_source());
1805 if (p != waiting_for_map.end()) {
1806 // preserve ordering
1807 dout(20) << __func__ << " waiting_for_map "
1808 << p->first << " not empty, queueing" << dendl;
1809 p->second.push_back(op);
1810 op->mark_delayed("waiting_for_map not empty");
1811 return;
1812 }
1813 if (!have_same_or_newer_map(op->min_epoch)) {
1814 dout(20) << __func__ << " min " << op->min_epoch
1815 << ", queue on waiting_for_map " << op->get_source() << dendl;
1816 waiting_for_map[op->get_source()].push_back(op);
1817 op->mark_delayed("op must wait for map");
1818 osd->request_osdmap_update(op->min_epoch);
1819 return;
1820 }
1821
1822 if (can_discard_request(op)) {
1823 return;
1824 }
1825
1826 // pg-wide backoffs
1827 const Message *m = op->get_req();
1828 int msg_type = m->get_type();
1829 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1830 auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
1831 if (!session)
1832 return; // drop it.
1833 if (msg_type == CEPH_MSG_OSD_OP) {
1834 if (session->check_backoff(cct, info.pgid,
1835 info.pgid.pgid.get_hobj_start(), m)) {
1836 return;
1837 }
1838
1839 bool backoff =
1840 is_down() ||
1841 is_incomplete() ||
1842 (!is_active() && is_peered());
1843 if (g_conf()->osd_backoff_on_peering && !backoff) {
1844 if (is_peering()) {
1845 backoff = true;
1846 }
1847 }
1848 if (backoff) {
1849 add_pg_backoff(session);
1850 return;
1851 }
1852 }
1853 // pg backoff acks at pg-level
1854 if (msg_type == CEPH_MSG_OSD_BACKOFF) {
1855 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1856 if (ba->begin != ba->end) {
1857 handle_backoff(op);
1858 return;
1859 }
1860 }
1861 }
1862
1863 if (!is_peered()) {
1864 // Delay unless PGBackend says it's ok
1865 if (pgbackend->can_handle_while_inactive(op)) {
1866 bool handled = pgbackend->handle_message(op);
1867 ceph_assert(handled);
1868 return;
1869 } else {
1870 waiting_for_peered.push_back(op);
1871 op->mark_delayed("waiting for peered");
1872 return;
1873 }
1874 }
1875
1876 if (recovery_state.needs_flush()) {
1877 dout(20) << "waiting for flush on " << op << dendl;
1878 waiting_for_flush.push_back(op);
1879 op->mark_delayed("waiting for flush");
1880 return;
1881 }
1882
1883 ceph_assert(is_peered() && !recovery_state.needs_flush());
1884 if (pgbackend->handle_message(op))
1885 return;
1886
1887 switch (msg_type) {
1888 case CEPH_MSG_OSD_OP:
1889 case CEPH_MSG_OSD_BACKOFF:
1890 if (!is_active()) {
1891 dout(20) << " peered, not active, waiting for active on " << op << dendl;
1892 waiting_for_active.push_back(op);
1893 op->mark_delayed("waiting for active");
1894 return;
1895 }
1896 switch (msg_type) {
1897 case CEPH_MSG_OSD_OP:
1898 // verify client features
1899 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1900 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1901 osd->reply_op_error(op, -EOPNOTSUPP);
1902 return;
1903 }
1904 do_op(op);
1905 break;
1906 case CEPH_MSG_OSD_BACKOFF:
1907 // object-level backoff acks handled in osdop context
1908 handle_backoff(op);
1909 break;
1910 }
1911 break;
1912
1913 case MSG_OSD_PG_SCAN:
1914 do_scan(op, handle);
1915 break;
1916
1917 case MSG_OSD_PG_BACKFILL:
1918 do_backfill(op);
1919 break;
1920
1921 case MSG_OSD_PG_BACKFILL_REMOVE:
1922 do_backfill_remove(op);
1923 break;
1924
1925 case MSG_OSD_SCRUB_RESERVE:
1926 {
1927 if (!m_scrubber) {
1928 osd->reply_op_error(op, -EAGAIN);
1929 return;
1930 }
1931 auto m = op->get_req<MOSDScrubReserve>();
1932 switch (m->type) {
1933 case MOSDScrubReserve::REQUEST:
1934 m_scrubber->handle_scrub_reserve_request(op);
1935 break;
1936 case MOSDScrubReserve::GRANT:
1937 m_scrubber->handle_scrub_reserve_grant(op, m->from);
1938 break;
1939 case MOSDScrubReserve::REJECT:
1940 m_scrubber->handle_scrub_reserve_reject(op, m->from);
1941 break;
1942 case MOSDScrubReserve::RELEASE:
1943 m_scrubber->handle_scrub_reserve_release(op);
1944 break;
1945 }
1946 }
1947 break;
1948
1949 case MSG_OSD_REP_SCRUB:
1950 replica_scrub(op, handle);
1951 break;
1952
1953 case MSG_OSD_REP_SCRUBMAP:
1954 do_replica_scrub_map(op);
1955 break;
1956
1957 case MSG_OSD_PG_UPDATE_LOG_MISSING:
1958 do_update_log_missing(op);
1959 break;
1960
1961 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1962 do_update_log_missing_reply(op);
1963 break;
1964
1965 default:
1966 ceph_abort_msg("bad message type in do_request");
1967 }
1968 }
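
// Sketch of the gating rule do_request() applies before anything else: once
// one op from a source is parked waiting for a newer OSDMap, every later op
// from that same source queues behind it, even if it could be served with the
// current map. All names below are illustrative stand-ins, not OSD types.
#if 0
#include <deque>
#include <map>
#include <string>

struct Op { unsigned min_epoch; };

struct MapGate {
  unsigned cur_epoch = 0;
  std::map<std::string, std::deque<Op>> waiting;  // per-source FIFO

  bool admit(const std::string& source, const Op& op) {
    auto p = waiting.find(source);
    if (p != waiting.end()) {        // preserve per-source ordering
      p->second.push_back(op);
      return false;
    }
    if (op.min_epoch > cur_epoch) {  // need a newer map first
      waiting[source].push_back(op);
      return false;
    }
    return true;                     // safe to process now
  }
};
#endif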
1969
1970 /** do_op - do an op
1971 * pg lock will be held (if multithreaded)
1972 * osd_lock NOT held.
1973 */
1974 void PrimaryLogPG::do_op(OpRequestRef& op)
1975 {
1976 FUNCTRACE(cct);
1977 // NOTE: take a non-const pointer here; we must be careful not to
1978 // change anything that will break other reads on m (operator<<).
1979 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1980 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1981 if (m->finish_decode()) {
1982 op->reset_desc(); // for TrackedOp
1983 m->clear_payload();
1984 }
1985
1986 dout(20) << __func__ << ": op " << *m << dendl;
1987
1988 const hobject_t head = m->get_hobj().get_head();
1989
1990 if (!info.pgid.pgid.contains(
1991 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1992 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1993 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1994 << std::hex << head.get_hash() << std::dec << dendl;
1995 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1996 << " op " << *m;
1997 ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
1998 return;
1999 }
2000
2001 bool can_backoff =
2002 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
2003 ceph::ref_t<Session> session;
2004 if (can_backoff) {
2005 session = static_cast<Session*>(m->get_connection()->get_priv().get());
2006 if (!session.get()) {
2007 dout(10) << __func__ << " no session" << dendl;
2008 return;
2009 }
2010
2011 if (session->check_backoff(cct, info.pgid, head, m)) {
2012 return;
2013 }
2014 }
2015
2016 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
2017 // not implemented.
2018 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
2019 osd->reply_op_error(op, -EINVAL);
2020 return;
2021 }
2022
2023 {
2024 int r = op->maybe_init_op_info(*get_osdmap());
2025 if (r) {
2026 osd->reply_op_error(op, r);
2027 return;
2028 }
2029 }
2030
2031 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
2032 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
2033 op->may_read() &&
2034 !(op->may_write() || op->may_cache())) {
2035 // balanced reads; any replica will do
2036 if (!(is_primary() || is_nonprimary())) {
2037 osd->handle_misdirected_op(this, op);
2038 return;
2039 }
2040 } else {
2041 // normal case; must be primary
2042 if (!is_primary()) {
2043 osd->handle_misdirected_op(this, op);
2044 return;
2045 }
2046 }
2047
2048 if (!check_laggy(op)) {
2049 return;
2050 }
2051
2052 if (!op_has_sufficient_caps(op)) {
2053 osd->reply_op_error(op, -EPERM);
2054 return;
2055 }
2056
2057 if (op->includes_pg_op()) {
2058 return do_pg_op(op);
2059 }
2060
2061 // object name too long?
2062 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
2063 dout(4) << "do_op name is longer than "
2064 << cct->_conf->osd_max_object_name_len
2065 << " bytes" << dendl;
2066 osd->reply_op_error(op, -ENAMETOOLONG);
2067 return;
2068 }
2069 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
2070 dout(4) << "do_op locator is longer than "
2071 << cct->_conf->osd_max_object_name_len
2072 << " bytes" << dendl;
2073 osd->reply_op_error(op, -ENAMETOOLONG);
2074 return;
2075 }
2076 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
2077 dout(4) << "do_op namespace is longer than "
2078 << cct->_conf->osd_max_object_namespace_len
2079 << " bytes" << dendl;
2080 osd->reply_op_error(op, -ENAMETOOLONG);
2081 return;
2082 }
2083 if (m->get_hobj().oid.name.empty()) {
2084 dout(4) << "do_op empty oid name is not allowed" << dendl;
2085 osd->reply_op_error(op, -EINVAL);
2086 return;
2087 }
2088
2089 if (int r = osd->store->validate_hobject_key(head)) {
2090 dout(4) << "do_op object " << head << " invalid for backing store: "
2091 << r << dendl;
2092 osd->reply_op_error(op, r);
2093 return;
2094 }
2095
2096 // blocklisted?
2097 if (get_osdmap()->is_blocklisted(m->get_source_addr())) {
2098 dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl;
2099 osd->reply_op_error(op, -EBLOCKLISTED);
2100 return;
2101 }
2102
2103 // order this op as a write?
2104 bool write_ordered = op->rwordered();
2105
2106 // discard due to cluster full transition? (we discard any op that
2107 // originates before the cluster or pool is marked full; the client
2108 // will resend after the full flag is removed or if they expect the
2109 // op to succeed despite being full). The exception is FULL_FORCE and
2110 // FULL_TRY ops, which we have no reason to discard because they
2111 // bypass all full checks anyway. If this op isn't write-ordered,
2112 // we skip the check.
2113 // FIXME: we exclude mds writes for now.
2114 if (write_ordered && !(m->get_source().is_mds() ||
2115 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
2116 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
2117 info.history.last_epoch_marked_full > m->get_map_epoch()) {
2118 dout(10) << __func__ << " discarding op sent before full " << m << " "
2119 << *m << dendl;
2120 return;
2121 }
2122 // The MDS should have stopped writing before this point.
2123 // We can't allow the OSD to become non-startable even if the
2124 // MDS could still be writing as part of file removals.
2125 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
2126 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
2127 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
2128 return;
2129 }
2130 int64_t poolid = get_pgid().pool();
2131 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2132 if (!pi) {
2133 return;
2134 }
2135 if (pi->has_flag(pg_pool_t::FLAG_EIO)) {
2136 // drop op on the floor; the client will handle returning EIO
2137 if (m->has_flag(CEPH_OSD_FLAG_SUPPORTSPOOLEIO)) {
2138 dout(10) << __func__ << " discarding op due to pool EIO flag" << dendl;
2139 } else {
2140 dout(10) << __func__ << " replying EIO due to pool EIO flag" << dendl;
2141 osd->reply_op_error(op, -EIO);
2142 }
2143 return;
2144 }
2145 if (op->may_write()) {
2146
2147 // invalid?
2148 if (m->get_snapid() != CEPH_NOSNAP) {
2149 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2150 osd->reply_op_error(op, -EINVAL);
2151 return;
2152 }
2153
2154 // too big?
2155 if (cct->_conf->osd_max_write_size &&
2156 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2157 // journal can't hold commit!
2158 derr << "do_op msg data len " << m->get_data_len()
2159 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2160 << " on " << *m << dendl;
2161 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2162 return;
2163 }
2164 }
2165
2166 dout(10) << "do_op " << *m
2167 << (op->may_write() ? " may_write" : "")
2168 << (op->may_read() ? " may_read" : "")
2169 << (op->may_cache() ? " may_cache" : "")
2170 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2171 << " flags " << ceph_osd_flag_string(m->get_flags())
2172 << dendl;
2173
2174 [[maybe_unused]] auto span = tracing::osd::tracer.add_span(__func__, op->osd_parent_span);
2175
2176 // missing object?
2177 if (is_unreadable_object(head)) {
2178 if (!is_primary()) {
2179 osd->reply_op_error(op, -EAGAIN);
2180 return;
2181 }
2182 if (can_backoff &&
2183 (g_conf()->osd_backoff_on_degraded ||
2184 (g_conf()->osd_backoff_on_unfound &&
2185 recovery_state.get_missing_loc().is_unfound(head)))) {
2186 add_backoff(session, head, head);
2187 maybe_kick_recovery(head);
2188 } else {
2189 wait_for_unreadable_object(head, op);
2190 }
2191 return;
2192 }
2193
2194 if (write_ordered) {
2195 // degraded object?
2196 if (is_degraded_or_backfilling_object(head)) {
2197 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2198 add_backoff(session, head, head);
2199 maybe_kick_recovery(head);
2200 } else {
2201 wait_for_degraded_object(head, op);
2202 }
2203 return;
2204 }
2205
2206 if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
2207 dout(20) << __func__ << ": waiting for scrub" << dendl;
2208 waiting_for_scrub.push_back(op);
2209 op->mark_delayed("waiting for scrub");
2210 return;
2211 }
2212 if (!check_laggy_requeue(op)) {
2213 return;
2214 }
2215
2216 // blocked on snap?
2217 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2218 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2219 hobject_t to_wait_on(head);
2220 to_wait_on.snap = blocked_iter->second;
2221 wait_for_degraded_object(to_wait_on, op);
2222 return;
2223 }
2224 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2225 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2226 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2227 return;
2228 }
2229 if (objects_blocked_on_cache_full.count(head)) {
2230 block_write_on_full_cache(head, op);
2231 return;
2232 }
2233 }
2234
2235 // dup/resent?
2236 if (op->may_write() || op->may_cache()) {
2237 // warning: we will get back *a* request for this reqid, but not
2238 // necessarily the most recent. This happens with flush and
2239 // promote ops, but we can't possibly have both in our log while
2240 // the original request is still not stable on disk, so for our
2241 // purposes here it doesn't matter which one we get.
2242 eversion_t version;
2243 version_t user_version;
2244 int return_code = 0;
2245 vector<pg_log_op_return_item_t> op_returns;
2246 bool got = check_in_progress_op(
2247 m->get_reqid(), &version, &user_version, &return_code, &op_returns);
2248 if (got) {
2249 dout(3) << __func__ << " dup " << m->get_reqid()
2250 << " version " << version << dendl;
2251 if (already_complete(version)) {
2252 osd->reply_op_error(op, return_code, version, user_version, op_returns);
2253 } else {
2254 dout(10) << " waiting for " << version << " to commit" << dendl;
2255 // always queue ondisk waiters, so that we can requeue if needed
2256 waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
2257 op_returns);
2258 op->mark_delayed("waiting for ondisk");
2259 }
2260 return;
2261 }
2262 }
2263
2264 ObjectContextRef obc;
2265 bool can_create = op->may_write();
2266 hobject_t missing_oid;
2267
2268 // kludge: LIST_SNAPS arrives with snapid CEPH_SNAPDIR; map it to the head
2269 const hobject_t& oid =
2270 m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
2271
2272 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2273 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2274 OSDOp& osd_op = *p;
2275
2276 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2277 if (m->get_snapid() != CEPH_SNAPDIR) {
2278 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2279 osd->reply_op_error(op, -EINVAL);
2280 return;
2281 }
2282 } else {
2283 if (m->get_snapid() == CEPH_SNAPDIR) {
2284 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2285 osd->reply_op_error(op, -EINVAL);
2286 return;
2287 }
2288 }
2289 }
2290
2291 // io blocked on obc?
2292 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2293 maybe_await_blocked_head(oid, op)) {
2294 return;
2295 }
2296
2297 if (!is_primary()) {
2298 if (!recovery_state.can_serve_replica_read(oid)) {
2299 dout(20) << __func__
2300 << ": unstable write on replica, bouncing to primary "
2301 << *m << dendl;
2302 osd->reply_op_error(op, -EAGAIN);
2303 return;
2304 }
2305 dout(20) << __func__ << ": serving replica read on oid " << oid
2306 << dendl;
2307 }
2308
2309 int r = find_object_context(
2310 oid, &obc, can_create,
2311 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2312 &missing_oid);
2313
2314 // LIST_SNAPS needs the ssc too
2315 if (obc &&
2316 m->get_snapid() == CEPH_SNAPDIR &&
2317 !obc->ssc) {
2318 obc->ssc = get_snapset_context(oid, true);
2319 }
2320
2321 if (r == -EAGAIN) {
2322 // If we're not the primary for this PG, we just reply -EAGAIN (below).
2323 // Otherwise, we have to wait for the object.
2324 if (is_primary()) {
2325 // missing the specific snap we need; requeue and wait.
2326 ceph_assert(!op->may_write()); // only happens on a read/cache
2327 wait_for_unreadable_object(missing_oid, op);
2328 return;
2329 }
2330 } else if (r == 0) {
2331 if (is_unreadable_object(obc->obs.oi.soid)) {
2332 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2333 << " is unreadable, waiting" << dendl;
2334 wait_for_unreadable_object(obc->obs.oi.soid, op);
2335 return;
2336 }
2337
2338 // degraded object? (the check above was for head; this could be a clone)
2339 if (write_ordered &&
2340 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2341 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2342 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2343 << " is degraded, waiting" << dendl;
2344 wait_for_degraded_object(obc->obs.oi.soid, op);
2345 return;
2346 }
2347 }
2348
2349 bool in_hit_set = false;
2350 if (hit_set) {
2351 if (obc.get()) {
2352 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2353 in_hit_set = true;
2354 } else {
2355 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2356 in_hit_set = true;
2357 }
2358 if (!op->hitset_inserted) {
2359 hit_set->insert(oid);
2360 op->hitset_inserted = true;
2361 if (hit_set->is_full() ||
2362 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2363 hit_set_persist();
2364 }
2365 }
2366 }
2367
2368 if (agent_state) {
2369 if (agent_choose_mode(false, op))
2370 return;
2371 }
2372
2373 if (obc.get() && obc->obs.exists) {
2374 if (recover_adjacent_clones(obc, op)) {
2375 return;
2376 }
2377 if (maybe_handle_manifest(op,
2378 write_ordered,
2379 obc))
2380 return;
2381 }
2382
2383 if (maybe_handle_cache(op,
2384 write_ordered,
2385 obc,
2386 r,
2387 missing_oid,
2388 false,
2389 in_hit_set))
2390 return;
2391
2392 if (r && (r != -ENOENT || !obc)) {
2393 // copy the reqids for copy get on ENOENT
2394 if (r == -ENOENT &&
2395 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2396 fill_in_copy_get_noent(op, oid, m->ops[0]);
2397 return;
2398 }
2399 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2400 if (op->may_write() &&
2401 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2402 record_write_error(op, oid, nullptr, r);
2403 } else {
2404 osd->reply_op_error(op, r);
2405 }
2406 return;
2407 }
2408
2409 // make sure locator is consistent
2410 object_locator_t oloc(obc->obs.oi.soid);
2411 if (m->get_object_locator() != oloc) {
2412 dout(10) << " provided locator " << m->get_object_locator()
2413 << " != object's " << obc->obs.oi.soid << dendl;
2414 osd->clog->warn() << "bad locator " << m->get_object_locator()
2415 << " on object " << oloc
2416 << " op " << *m;
2417 }
2418
2419 // io blocked on obc?
2420 if (obc->is_blocked() &&
2421 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2422 wait_for_blocked_object(obc->obs.oi.soid, op);
2423 return;
2424 }
2425
2426 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2427
2428 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2429
2430 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2431 dout(20) << __func__ << ": skipping rw locks" << dendl;
2432 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2433 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2434
2435 // verify there is in fact a flush in progress
2436 // FIXME: we could make this a stronger test.
2437 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2438 if (p == flush_ops.end()) {
2439 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2440 reply_ctx(ctx, -EINVAL);
2441 return;
2442 }
2443 } else if (!get_rw_locks(write_ordered, ctx)) {
2444 dout(20) << __func__ << " waiting for rw locks " << dendl;
2445 op->mark_delayed("waiting for rw locks");
2446 close_op_ctx(ctx);
2447 return;
2448 }
2449 dout(20) << __func__ << " obc " << *obc << dendl;
2450
2451 if (r) {
2452 dout(20) << __func__ << " returned an error: " << r << dendl;
2453 if (op->may_write() &&
2454 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2455 record_write_error(op, oid, nullptr, r,
2456 ctx->op->allows_returnvec() ? ctx : nullptr);
2457 } else {
2458 osd->reply_op_error(op, r);
2459 }
2460 close_op_ctx(ctx);
2461 return;
2462 }
2463
2464 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2465 ctx->ignore_cache = true;
2466 }
2467
2468 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2469 // This object is lost. Reading from it returns an error.
2470 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2471 << " is lost" << dendl;
2472 reply_ctx(ctx, -ENFILE);
2473 return;
2474 }
2475 if (!op->may_write() &&
2476 !op->may_cache() &&
2477 (!obc->obs.exists ||
2478 ((m->get_snapid() != CEPH_SNAPDIR) &&
2479 obc->obs.oi.is_whiteout()))) {
2480 // copy the reqids for copy get on ENOENT
2481 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2482 fill_in_copy_get_noent(op, oid, m->ops[0]);
2483 close_op_ctx(ctx);
2484 return;
2485 }
2486 reply_ctx(ctx, -ENOENT);
2487 return;
2488 }
2489
2490 op->mark_started();
2491
2492 execute_ctx(ctx);
2493 utime_t prepare_latency = ceph_clock_now();
2494 prepare_latency -= op->get_dequeued_time();
2495 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2496 if (op->may_read() && op->may_write()) {
2497 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2498 } else if (op->may_read()) {
2499 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2500 } else if (op->may_write() || op->may_cache()) {
2501 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2502 }
2503
2504 // force recovery of the oldest missing object if there are too many log entries
2505 maybe_force_recovery();
2506 }
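
// Sketch of the dup/resent handling inside do_op() above: a resent write
// whose reqid is already known is either answered immediately (if its version
// has committed) or parked on a per-version waiter list until the commit
// lands. The types below are illustrative stand-ins.
#if 0
#include <map>
#include <vector>

using Version = unsigned;
struct Waiter { int client; };

struct DupTable {
  Version last_committed = 0;
  std::map<Version, std::vector<Waiter>> waiting_for_ondisk;

  // returns true if the caller should reply with the saved result now
  bool handle_dup(Version v, const Waiter& w) {
    if (v <= last_committed)
      return true;                       // already durable
    waiting_for_ondisk[v].push_back(w);  // reply later, when v commits
    return false;
  }
};
#endif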
2507
2508 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2509 OpRequestRef op,
2510 bool write_ordered,
2511 ObjectContextRef obc)
2512 {
2513 if (!obc) {
2514 dout(20) << __func__ << ": no obc " << dendl;
2515 return cache_result_t::NOOP;
2516 }
2517
2518 if (!obc->obs.oi.has_manifest()) {
2519 dout(20) << __func__ << ": " << obc->obs.oi.soid
2520 << " is not manifest object " << dendl;
2521 return cache_result_t::NOOP;
2522 }
2523 if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2524 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2525 return cache_result_t::NOOP;
2526 }
2527
2528 // if it is write-ordered and blocked, stop now
2529 if (obc->is_blocked() && write_ordered) {
2530 // we're already doing something with this object
2531 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2532 return cache_result_t::NOOP;
2533 }
2534
2535 vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
2536 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2537 OSDOp& osd_op = *p;
2538 ceph_osd_op& op = osd_op.op;
2539 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2540 op.op == CEPH_OSD_OP_SET_CHUNK ||
2541 op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
2542 op.op == CEPH_OSD_OP_TIER_PROMOTE ||
2543 op.op == CEPH_OSD_OP_TIER_FLUSH ||
2544 op.op == CEPH_OSD_OP_TIER_EVICT ||
2545 op.op == CEPH_OSD_OP_ISDIRTY) {
2546 return cache_result_t::NOOP;
2547 }
2548 }
2549
2550 switch (obc->obs.oi.manifest.type) {
2551 case object_manifest_t::TYPE_REDIRECT:
2552 if (op->may_write() || write_ordered) {
2553 do_proxy_write(op, obc);
2554 } else {
2555 // the object has already been promoted (it has local data); serve it here
2556 if (obc->obs.oi.size != 0) {
2557 return cache_result_t::NOOP;
2558 }
2559 do_proxy_read(op, obc);
2560 }
2561 return cache_result_t::HANDLED_PROXY;
2562 case object_manifest_t::TYPE_CHUNKED:
2563 {
2564 if (can_proxy_chunked_read(op, obc)) {
2565 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2566 if (p != flush_ops.end()) {
2567 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2568 return cache_result_t::HANDLED_PROXY;
2569 }
2570 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2571 return cache_result_t::HANDLED_PROXY;
2572 }
2573
2574 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2575 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2576 hobject_t head = m->get_hobj();
2577
2578 if (is_degraded_or_backfilling_object(head)) {
2579 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2580 wait_for_degraded_object(head, op);
2581 return cache_result_t::BLOCKED_RECOVERY;
2582 }
2583
2584 if (m_scrubber->write_blocked_by_scrub(head)) {
2585 dout(20) << __func__ << ": waiting for scrub" << dendl;
2586 waiting_for_scrub.push_back(op);
2587 op->mark_delayed("waiting for scrub");
2588 return cache_result_t::BLOCKED_RECOVERY;
2589 }
2590 if (!check_laggy_requeue(op)) {
2591 return cache_result_t::BLOCKED_RECOVERY;
2592 }
2593
2594 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2595 if (p.second.is_missing()) {
2596 auto m = op->get_req<MOSDOp>();
2597 const object_locator_t oloc = m->get_object_locator();
2598 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2599 return cache_result_t::BLOCKED_PROMOTE;
2600 }
2601 }
2602 return cache_result_t::NOOP;
2603 }
2604 default:
2605 ceph_abort_msg("unrecognized manifest type");
2606 }
2607
2608 return cache_result_t::NOOP;
2609 }
2610
2611 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2612 MOSDOpReply *orig_reply, int r,
2613 OpContext *ctx_for_op_returns)
2614 {
2615 dout(20) << __func__ << " r=" << r << dendl;
2616 ceph_assert(op->may_write());
2617 const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
2618 mempool::osd_pglog::list<pg_log_entry_t> entries;
2619 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2620 get_next_version(), eversion_t(), 0,
2621 reqid, utime_t(), r));
2622 if (ctx_for_op_returns) {
2623 entries.back().set_op_returns(*ctx_for_op_returns->ops);
2624 dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
2625 }
2626
2627 struct OnComplete {
2628 PrimaryLogPG *pg;
2629 OpRequestRef op;
2630 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2631 int r;
2632 OnComplete(
2633 PrimaryLogPG *pg,
2634 OpRequestRef op,
2635 MOSDOpReply *orig_reply,
2636 int r)
2637 : pg(pg), op(op),
2638 orig_reply(orig_reply, false /* take over ref */), r(r)
2639 {}
2640 void operator()() {
2641 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2642 auto m = op->get_req<MOSDOp>();
2643 MOSDOpReply *reply = orig_reply.detach();
2644 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2645 pg->osd->send_message_osd_client(reply, m->get_connection());
2646 }
2647 };
2648
2649 ObcLockManager lock_manager;
2650 submit_log_entries(
2651 entries,
2652 std::move(lock_manager),
2653 std::optional<std::function<void(void)> >(
2654 OnComplete(this, op, orig_reply, r)),
2655 op,
2656 r);
2657 }
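
// Sketch of the ref-adoption idiom OnComplete uses above:
// boost::intrusive_ptr<T>(p, /*add_ref=*/false) takes over a reference the
// caller already holds instead of bumping the count again. Msg is a
// hypothetical refcounted type.
#if 0
#include <boost/intrusive_ptr.hpp>

struct Msg { int nref = 1; };  // a new Msg starts with one ref
inline void intrusive_ptr_add_ref(Msg* m) { ++m->nref; }
inline void intrusive_ptr_release(Msg* m) { if (--m->nref == 0) delete m; }

void adopt(Msg* raw) {
  boost::intrusive_ptr<Msg> p(raw, false);  // adopt: nref stays 1
  // when p goes out of scope, the adopted ref is dropped and raw is freed
}
#endif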
2658
2659 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2660 OpRequestRef op,
2661 bool write_ordered,
2662 ObjectContextRef obc,
2663 int r, hobject_t missing_oid,
2664 bool must_promote,
2665 bool in_hit_set,
2666 ObjectContextRef *promote_obc)
2667 {
2668 // return quickly if caching is not enabled
2669 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2670 return cache_result_t::NOOP;
2671
2672 if (op &&
2673 op->get_req() &&
2674 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2675 (op->get_req<MOSDOp>()->get_flags() &
2676 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2677 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2678 return cache_result_t::NOOP;
2679 }
2680
2681 must_promote = must_promote || op->need_promote();
2682
2683 if (obc)
2684 dout(25) << __func__ << " " << obc->obs.oi << " "
2685 << (obc->obs.exists ? "exists" : "DNE")
2686 << " missing_oid " << missing_oid
2687 << " must_promote " << (int)must_promote
2688 << " in_hit_set " << (int)in_hit_set
2689 << dendl;
2690 else
2691 dout(25) << __func__ << " (no obc)"
2692 << " missing_oid " << missing_oid
2693 << " must_promote " << (int)must_promote
2694 << " in_hit_set " << (int)in_hit_set
2695 << dendl;
2696
2697 // if it is write-ordered and blocked, stop now
2698 if (obc.get() && obc->is_blocked() && write_ordered) {
2699 // we're already doing something with this object
2700 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2701 return cache_result_t::NOOP;
2702 }
2703
2704 if (r == -ENOENT && missing_oid == hobject_t()) {
2705 // we know this object is logically absent (e.g., an undefined clone)
2706 return cache_result_t::NOOP;
2707 }
2708
2709 if (obc.get() && obc->obs.exists) {
2710 osd->logger->inc(l_osd_op_cache_hit);
2711 return cache_result_t::NOOP;
2712 }
2713 if (!is_primary()) {
2714 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2715 osd->reply_op_error(op, -EAGAIN);
2716 return cache_result_t::REPLIED_WITH_EAGAIN;
2717 }
2718
2719 if (missing_oid == hobject_t() && obc.get()) {
2720 missing_oid = obc->obs.oi.soid;
2721 }
2722
2723 auto m = op->get_req<MOSDOp>();
2724 const object_locator_t oloc = m->get_object_locator();
2725
2726 if (op->need_skip_handle_cache()) {
2727 return cache_result_t::NOOP;
2728 }
2729
2730 OpRequestRef promote_op;
2731
2732 switch (pool.info.cache_mode) {
2733 case pg_pool_t::CACHEMODE_WRITEBACK:
2734 if (agent_state &&
2735 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2736 if (!op->may_write() && !op->may_cache() &&
2737 !write_ordered && !must_promote) {
2738 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2739 do_proxy_read(op);
2740 return cache_result_t::HANDLED_PROXY;
2741 }
2742 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2743 block_write_on_full_cache(missing_oid, op);
2744 return cache_result_t::BLOCKED_FULL;
2745 }
2746
2747 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2748 promote_object(obc, missing_oid, oloc, op, promote_obc);
2749 return cache_result_t::BLOCKED_PROMOTE;
2750 }
2751
2752 if (op->may_write() || op->may_cache()) {
2753 do_proxy_write(op);
2754
2755 // Promote too?
2756 if (!op->need_skip_promote() &&
2757 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2758 pool.info.min_write_recency_for_promote,
2759 OpRequestRef(),
2760 promote_obc)) {
2761 return cache_result_t::BLOCKED_PROMOTE;
2762 }
2763 return cache_result_t::HANDLED_PROXY;
2764 } else {
2765 do_proxy_read(op);
2766
2767 // Avoid duplicate promotion
2768 if (obc.get() && obc->is_blocked()) {
2769 if (promote_obc)
2770 *promote_obc = obc;
2771 return cache_result_t::BLOCKED_PROMOTE;
2772 }
2773
2774 // Promote too?
2775 if (!op->need_skip_promote()) {
2776 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2777 pool.info.min_read_recency_for_promote,
2778 promote_op, promote_obc);
2779 }
2780
2781 return cache_result_t::HANDLED_PROXY;
2782 }
2783 ceph_abort_msg("unreachable");
2784 return cache_result_t::NOOP;
2785
2786 case pg_pool_t::CACHEMODE_READONLY:
2787 // TODO: clean this case up
2788 if (!obc.get() && r == -ENOENT) {
2789 // we don't have the object and op's a read
2790 promote_object(obc, missing_oid, oloc, op, promote_obc);
2791 return cache_result_t::BLOCKED_PROMOTE;
2792 }
2793 if (!r) { // it must be a write
2794 do_cache_redirect(op);
2795 return cache_result_t::HANDLED_REDIRECT;
2796 }
2797 // crap, there was a failure of some kind
2798 return cache_result_t::NOOP;
2799
2800 case pg_pool_t::CACHEMODE_FORWARD:
2801 // this mode is deprecated; proxy instead
2802 case pg_pool_t::CACHEMODE_PROXY:
2803 if (!must_promote) {
2804 if (op->may_write() || op->may_cache() || write_ordered) {
2805 do_proxy_write(op);
2806 return cache_result_t::HANDLED_PROXY;
2807 } else {
2808 do_proxy_read(op);
2809 return cache_result_t::HANDLED_PROXY;
2810 }
2811 }
2812 // ugh, we're forced to promote.
2813 if (agent_state &&
2814 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2815 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2816 block_write_on_full_cache(missing_oid, op);
2817 return cache_result_t::BLOCKED_FULL;
2818 }
2819 promote_object(obc, missing_oid, oloc, op, promote_obc);
2820 return cache_result_t::BLOCKED_PROMOTE;
2821
2822 case pg_pool_t::CACHEMODE_READFORWARD:
2823 // this mode is deprecated; proxy instead
2824 case pg_pool_t::CACHEMODE_READPROXY:
2825 // Do writeback to the cache tier for writes
2826 if (op->may_write() || write_ordered || must_promote) {
2827 if (agent_state &&
2828 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2829 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2830 block_write_on_full_cache(missing_oid, op);
2831 return cache_result_t::BLOCKED_FULL;
2832 }
2833 promote_object(obc, missing_oid, oloc, op, promote_obc);
2834 return cache_result_t::BLOCKED_PROMOTE;
2835 }
2836
2837 // If it is a read, we can serve it by proxying it to the base tier
2838 do_proxy_read(op);
2839 return cache_result_t::HANDLED_PROXY;
2840
2841 default:
2842 ceph_abort_msg("unrecognized cache_mode");
2843 }
2844 return cache_result_t::NOOP;
2845 }
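
// Condensed, approximate sketch of the decision table above. The real
// function also handles promotion recency, a full cache, throttling, and
// blocked objects; this only maps (mode, read/write, local hit) to the basic
// action, with hypothetical enum names.
#if 0
enum class Mode { WRITEBACK, READONLY, PROXY, READPROXY };
enum class Action { LOCAL, PROXY_READ, PROXY_WRITE, PROMOTE, REDIRECT };

Action decide(Mode mode, bool is_write, bool hit_locally) {
  if (hit_locally)
    return Action::LOCAL;  // object is in the cache tier: serve it here
  switch (mode) {
  case Mode::WRITEBACK:   // proxy now, possibly promote in the background
    return is_write ? Action::PROXY_WRITE : Action::PROXY_READ;
  case Mode::READONLY:    // writes go straight to the base pool
    return is_write ? Action::REDIRECT : Action::PROMOTE;
  case Mode::PROXY:       // promote only when forced
    return is_write ? Action::PROXY_WRITE : Action::PROXY_READ;
  case Mode::READPROXY:   // writes must be promoted into the cache tier
    return is_write ? Action::PROMOTE : Action::PROXY_READ;
  }
  return Action::LOCAL;   // not reached
}
#endif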
2846
2847 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2848 const hobject_t& missing_oid,
2849 const object_locator_t& oloc,
2850 bool in_hit_set,
2851 uint32_t recency,
2852 OpRequestRef promote_op,
2853 ObjectContextRef *promote_obc)
2854 {
2855 dout(20) << __func__ << " missing_oid " << missing_oid
2856 << " in_hit_set " << in_hit_set << dendl;
2857
2858 switch (recency) {
2859 case 0:
2860 break;
2861 case 1:
2862 // Check if in the current hit set
2863 if (in_hit_set) {
2864 break;
2865 } else {
2866 // not promoting
2867 return false;
2868 }
2869 break;
2870 default:
2871 {
2872 unsigned count = in_hit_set ? 1 : 0;
2873 if (count) {
2874 // Check if in other hit sets
2875 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2876 for (map<time_t,HitSetRef>::reverse_iterator itor =
2877 agent_state->hit_set_map.rbegin();
2878 itor != agent_state->hit_set_map.rend();
2879 ++itor) {
2880 if (!itor->second->contains(oid)) {
2881 break;
2882 }
2883 ++count;
2884 if (count >= recency) {
2885 break;
2886 }
2887 }
2888 }
2889 if (count >= recency) {
2890 break;
2891 }
2892 return false; // not promoting
2893 }
2894 break;
2895 }
2896
2897 if (osd->promote_throttle()) {
2898 dout(10) << __func__ << " promote throttled" << dendl;
2899 return false;
2900 }
2901 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2902 return true;
2903 }
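
// Self-contained sketch of the recency test above: promote only if the
// object is in the current hit set and in the next (recency - 1) most recent
// archived hit sets, scanned newest to oldest; one miss ends the streak.
// std::set stands in for the real HitSet here.
#if 0
#include <ctime>
#include <map>
#include <set>
#include <string>

bool recent_enough(const std::string& oid,
                   bool in_current_hit_set,
                   unsigned recency,
                   const std::map<time_t, std::set<std::string>>& archived) {
  if (recency == 0)
    return true;                     // recency 0: always promote
  unsigned count = in_current_hit_set ? 1 : 0;
  if (!count)
    return false;
  for (auto it = archived.rbegin();
       it != archived.rend() && count < recency; ++it) {
    if (!it->second.count(oid))
      break;                         // streak broken: stop counting
    ++count;
  }
  return count >= recency;
}
#endif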
2904
2905 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2906 {
2907 auto m = op->get_req<MOSDOp>();
2908 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2909 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2910 flags, false);
2911 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2912 reply->set_redirect(redir);
2913 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2914 << op << dendl;
2915 m->get_connection()->send_message(reply);
2916 return;
2917 }
2918
2919 struct C_ProxyRead : public Context {
2920 PrimaryLogPGRef pg;
2921 hobject_t oid;
2922 epoch_t last_peering_reset;
2923 ceph_tid_t tid;
2924 PrimaryLogPG::ProxyReadOpRef prdop;
2925 utime_t start;
2926 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2927 const PrimaryLogPG::ProxyReadOpRef& prd)
2928 : pg(p), oid(o), last_peering_reset(lpr),
2929 tid(0), prdop(prd), start(ceph_clock_now())
2930 {}
2931 void finish(int r) override {
2932 if (prdop->canceled)
2933 return;
2934 std::scoped_lock locker{*pg};
2935 if (prdop->canceled) {
2936 return;
2937 }
2938 if (last_peering_reset == pg->get_last_peering_reset()) {
2939 pg->finish_proxy_read(oid, tid, r);
2940 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2941 }
2942 }
2943 };
2944
2945 struct C_ProxyChunkRead : public Context {
2946 PrimaryLogPGRef pg;
2947 hobject_t oid;
2948 epoch_t last_peering_reset;
2949 ceph_tid_t tid;
2950 PrimaryLogPG::ProxyReadOpRef prdop;
2951 utime_t start;
2952 ObjectOperation *obj_op;
2953 int op_index = 0;
2954 uint64_t req_offset = 0;
2955 ObjectContextRef obc;
2956 uint64_t req_total_len = 0;
2957 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2958 const PrimaryLogPG::ProxyReadOpRef& prd)
2959 : pg(p), oid(o), last_peering_reset(lpr),
2960 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
2961 {}
2962 void finish(int r) override {
2963 if (prdop->canceled)
2964 return;
2965 std::scoped_lock locker{*pg};
2966 if (prdop->canceled) {
2967 return;
2968 }
2969 if (last_peering_reset == pg->get_last_peering_reset()) {
2970 if (r >= 0) {
2971 if (!prdop->ops[op_index].outdata.length()) {
2972 ceph_assert(req_total_len);
2973 bufferlist list;
2974 bufferptr bptr(req_total_len);
2975 list.push_back(std::move(bptr));
2976 prdop->ops[op_index].outdata.append(list);
2977 }
2978 ceph_assert(obj_op);
2979 uint64_t copy_offset;
2980 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
2981 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
2982 } else {
2983 copy_offset = 0;
2984 }
2985 prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
2986 obj_op->ops[0].outdata.length(),
2987 obj_op->ops[0].outdata.c_str());
2988 }
2989
2990 pg->finish_proxy_read(oid, tid, r);
2991 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2992 if (obj_op) {
2993 delete obj_op;
2994 }
2995 }
2996 }
2997 };
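
// Sketch of the completion guard both proxy-read contexts above follow:
// check the canceled flag once without the lock (cheap fast path), re-check
// it under the PG lock, and only deliver the result if no peering reset has
// happened since the op was issued. Names are illustrative.
#if 0
#include <mutex>

struct ProxyOp { bool canceled = false; };

struct Pg {
  std::mutex lock;
  unsigned last_peering_reset = 0;
  void deliver(int /*r*/) {}
};

void on_finish(Pg& pg, ProxyOp& op, unsigned issued_epoch, int r) {
  if (op.canceled)
    return;                            // racy fast path, no lock taken
  std::scoped_lock locker{pg.lock};
  if (op.canceled)
    return;                            // authoritative re-check under lock
  if (issued_epoch == pg.last_peering_reset)
    pg.deliver(r);                     // still in the same interval
}
#endif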
2998
2999 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
3000 {
3001 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3002 // stash the result in the request's OSDOp vector
3003 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3004 object_locator_t oloc;
3005 hobject_t soid;
3006 /* extensible tier */
3007 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3008 switch (obc->obs.oi.manifest.type) {
3009 case object_manifest_t::TYPE_REDIRECT:
3010 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3011 soid = obc->obs.oi.manifest.redirect_target;
3012 break;
3013 default:
3014 ceph_abort_msg("unrecognized manifest type");
3015 }
3016 } else {
3017 /* proxy */
3018 soid = m->get_hobj();
3019 oloc = object_locator_t(m->get_object_locator());
3020 oloc.pool = pool.info.tier_of;
3021 }
3022 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3023
3024 // pass through some original flags that make sense.
3025 // - leave out redirection and balancing flags since we are
3026 // already proxying through the primary
3027 // - leave off read/write/exec flags that are derived from the op
3028 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3029 CEPH_OSD_FLAG_ORDERSNAP |
3030 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3031 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3032
3033 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3034
3035 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3036
3037 ObjectOperation obj_op;
3038 obj_op.dup(prdop->ops);
3039
3040 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3041 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3042 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3043 ceph_osd_op& op = obj_op.ops[i].op; // take a reference so the flag update below sticks
3044 switch (op.op) {
3045 case CEPH_OSD_OP_READ:
3046 case CEPH_OSD_OP_SYNC_READ:
3047 case CEPH_OSD_OP_SPARSE_READ:
3048 case CEPH_OSD_OP_CHECKSUM:
3049 case CEPH_OSD_OP_CMPEXT:
3050 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3051 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3052 }
3053 }
3054 }
3055
3056 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3057 prdop);
3058 ceph_tid_t tid = osd->objecter->read(
3059 soid.oid, oloc, obj_op,
3060 m->get_snapid(), NULL,
3061 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3062 &prdop->user_version,
3063 &prdop->data_offset,
3064 m->get_features());
3065 fin->tid = tid;
3066 prdop->objecter_tid = tid;
3067 proxyread_ops[tid] = prdop;
3068 in_progress_proxy_ops[soid].push_back(op);
3069 }
3070
3071 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
3072 {
3073 dout(10) << __func__ << " " << oid << " tid " << tid
3074 << " " << cpp_strerror(r) << dendl;
3075
3076 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
3077 if (p == proxyread_ops.end()) {
3078 dout(10) << __func__ << " no proxyread_op found" << dendl;
3079 return;
3080 }
3081 ProxyReadOpRef prdop = p->second;
3082 if (tid != prdop->objecter_tid) {
3083 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
3084 << " tid " << prdop->objecter_tid << dendl;
3085 return;
3086 }
3087 if (oid != prdop->soid) {
3088 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
3089 << " soid " << prdop->soid << dendl;
3090 return;
3091 }
3092 proxyread_ops.erase(tid);
3093
3094 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3095 if (q == in_progress_proxy_ops.end()) {
3096 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3097 return;
3098 }
3099 ceph_assert(q->second.size());
3100 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3101 q->second.end(),
3102 prdop->op);
3103 ceph_assert(it != q->second.end());
3104 OpRequestRef op = *it;
3105 q->second.erase(it);
3106 if (q->second.size() == 0) {
3107 in_progress_proxy_ops.erase(oid);
3108 } else if (std::find(q->second.begin(),
3109 q->second.end(),
3110 prdop->op) != q->second.end()) {
3111 /* multiple read case */
3112 dout(20) << __func__ << " " << oid << " is not completed " << dendl;
3113 return;
3114 }
3115
3116 osd->logger->inc(l_osd_tier_proxy_read);
3117
3118 auto m = op->get_req<MOSDOp>();
3119 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
3120 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3121 ctx->user_at_version = prdop->user_version;
3122 ctx->data_off = prdop->data_offset;
3123 ctx->ignore_log_op_stats = true;
3124 complete_read_ctx(r, ctx);
3125 }
3126
3127 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3128 {
3129 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3130 if (p == in_progress_proxy_ops.end())
3131 return;
3132
3133 list<OpRequestRef>& ls = p->second;
3134 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3135 requeue_ops(ls);
3136 in_progress_proxy_ops.erase(p);
3137 }
3138
3139 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3140 vector<ceph_tid_t> *tids)
3141 {
3142 dout(10) << __func__ << " " << prdop->soid << dendl;
3143 prdop->canceled = true;
3144
3145 // cancel objecter op, if we can
3146 if (prdop->objecter_tid) {
3147 tids->push_back(prdop->objecter_tid);
3148 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3149 prdop->ops[i].outdata.clear();
3150 }
3151 proxyread_ops.erase(prdop->objecter_tid);
3152 prdop->objecter_tid = 0;
3153 }
3154 }
3155
3156 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
3157 {
3158 dout(10) << __func__ << dendl;
3159
3160 // cancel proxy reads
3161 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3162 while (p != proxyread_ops.end()) {
3163 cancel_proxy_read((p++)->second, tids);
3164 }
3165
3166 // cancel proxy writes
3167 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3168 while (q != proxywrite_ops.end()) {
3169 cancel_proxy_write((q++)->second, tids);
3170 }
3171
3172 if (requeue) {
3173 map<hobject_t, list<OpRequestRef>>::iterator p =
3174 in_progress_proxy_ops.begin();
3175 while (p != in_progress_proxy_ops.end()) {
3176 list<OpRequestRef>& ls = p->second;
3177 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3178 << " requests" << dendl;
3179 requeue_ops(ls);
3180 in_progress_proxy_ops.erase(p++);
3181 }
3182 } else {
3183 in_progress_proxy_ops.clear();
3184 }
3185 }
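
// Sketch of the erase-while-iterating idiom used above: post-increment the
// iterator inside erase() so it advances before its element disappears.
// Since C++11, `p = m.erase(p)` is an equivalent alternative.
#if 0
#include <map>

void drop_all(std::map<int, int>& m) {
  auto p = m.begin();
  while (p != m.end()) {
    // p++ hands the current position to erase() and steps forward first
    m.erase(p++);
  }
}
#endif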
3186
3187 struct C_ProxyWrite_Commit : public Context {
3188 PrimaryLogPGRef pg;
3189 hobject_t oid;
3190 epoch_t last_peering_reset;
3191 ceph_tid_t tid;
3192 PrimaryLogPG::ProxyWriteOpRef pwop;
3193 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3194 const PrimaryLogPG::ProxyWriteOpRef& pw)
3195 : pg(p), oid(o), last_peering_reset(lpr),
3196 tid(0), pwop(pw)
3197 {}
3198 void finish(int r) override {
3199 if (pwop->canceled)
3200 return;
3201 std::scoped_lock locker{*pg};
3202 if (pwop->canceled) {
3203 return;
3204 }
3205 if (last_peering_reset == pg->get_last_peering_reset()) {
3206 pg->finish_proxy_write(oid, tid, r);
3207 }
3208 }
3209 };
3210
3211 void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
3212 {
3213 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3214 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3215 object_locator_t oloc;
3216 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
3217 hobject_t soid;
3218 /* extensible tier */
3219 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3220 switch (obc->obs.oi.manifest.type) {
3221 case object_manifest_t::TYPE_REDIRECT:
3222 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3223 soid = obc->obs.oi.manifest.redirect_target;
3224 break;
3225 default:
3226 ceph_abort_msg("unrecognized manifest type");
3227 }
3228 } else {
3229 /* proxy */
3230 soid = m->get_hobj();
3231 oloc = object_locator_t(m->get_object_locator());
3232 oloc.pool = pool.info.tier_of;
3233 }
3234
3235 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3236 if (!(op->may_write() || op->may_cache())) {
3237 flags |= CEPH_OSD_FLAG_RWORDERED;
3238 }
3239 if (op->allows_returnvec()) {
3240 flags |= CEPH_OSD_FLAG_RETURNVEC;
3241 }
3242
3243 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3244
3245 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3246 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3247 pwop->mtime = m->get_mtime();
3248
3249 ObjectOperation obj_op;
3250 obj_op.dup(pwop->ops);
3251
3252 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3253 this, soid, get_last_peering_reset(), pwop);
3254 ceph_tid_t tid = osd->objecter->mutate(
3255 soid.oid, oloc, obj_op, snapc,
3256 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3257 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3258 &pwop->user_version, pwop->reqid);
3259 fin->tid = tid;
3260 pwop->objecter_tid = tid;
3261 proxywrite_ops[tid] = pwop;
3262 in_progress_proxy_ops[soid].push_back(op);
3263 }
3264
3265 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3266 ObjectContextRef obc, bool write_ordered)
3267 {
3268 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3269 OSDOp *osd_op = NULL;
3270 for (unsigned int i = 0; i < m->ops.size(); i++) {
3271 osd_op = &m->ops[i];
3272 uint64_t cursor = osd_op->op.extent.offset;
3273 uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3274 uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3275 object_manifest_t *manifest = &obc->obs.oi.manifest;
3276 map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3277
3278 while (cursor < op_length) {
3279 chunk_index = 0;
3280 chunk_length = 0;
3281 /* find the right chunk position for cursor */
3282 for (auto &p : manifest->chunk_map) {
3283 if (p.first <= cursor && p.first + p.second.length > cursor) {
3284 chunk_length = p.second.length;
3285 chunk_index = p.first;
3286 break;
3287 }
3288 }
3289 /* no index */
3290 if (!chunk_index && !chunk_length) {
3291 if (cursor == osd_op->op.extent.offset) {
3292 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3293 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3294 ctx->data_off = osd_op->op.extent.offset;
3295 ctx->ignore_log_op_stats = true;
3296 complete_read_ctx(0, ctx);
3297 }
3298 break;
3299 }
3300 uint64_t next_length = chunk_length;
3301 /* the size to read -> | op length | */
3302 /* | a chunk | */
3303 if (cursor + next_length > op_length) {
3304 next_length = op_length - cursor;
3305 }
3306 /* the size to read -> | op length | */
3307 /* | a chunk | */
3308 if (cursor + next_length > chunk_index + chunk_length) {
3309 next_length = chunk_index + chunk_length - cursor;
3310 }
3311
3312 chunk_read[cursor] = {{chunk_index, next_length}};
3313 cursor += next_length;
3314 }
3315
3316 req_len = cursor - osd_op->op.extent.offset;
3317 for (auto &p : chunk_read) {
3318 auto chunks = p.second.begin();
3319 dout(20) << __func__ << " chunk_index: " << chunks->first
3320 << " next_length: " << chunks->second << " cursor: "
3321 << p.first << dendl;
3322 do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3323 }
3324 }
3325 }
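
// Self-contained sketch of the extent arithmetic above: walk a cursor across
// [offset, offset + length) and clip each sub-read to both the end of the
// request and the end of the chunk containing the cursor. Here chunk_map
// maps chunk start offset -> chunk length; the real code also issues a
// proxied read per extent.
#if 0
#include <algorithm>
#include <cstdint>
#include <map>
#include <vector>

struct Extent { uint64_t chunk_start, read_off, read_len; };

std::vector<Extent> split(const std::map<uint64_t, uint64_t>& chunk_map,
                          uint64_t offset, uint64_t length) {
  std::vector<Extent> out;
  uint64_t cursor = offset, end = offset + length;
  while (cursor < end) {
    auto p = chunk_map.upper_bound(cursor);   // first chunk starting past cursor
    if (p == chunk_map.begin())
      break;                                  // no chunk covers the cursor
    --p;
    if (p->first + p->second <= cursor)
      break;                                  // gap in the chunk map
    // clip to the end of the request and the end of this chunk
    uint64_t next = std::min(end, p->first + p->second) - cursor;
    out.push_back({p->first, cursor, next});
    cursor += next;
  }
  return out;
}
#endif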
3326
3327 struct RefCountCallback : public Context {
3328 public:
3329 PrimaryLogPG::OpContext *ctx;
3330 OSDOp& osd_op;
3331 bool requeue = false;
3332
3333 RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
3334 : ctx(ctx), osd_op(osd_op) {}
3335 void finish(int r) override {
3336 // NB: caller must already have pg->lock held
3337 ctx->obc->stop_block();
3338 ctx->pg->kick_object_context_blocked(ctx->obc);
3339 if (r >= 0) {
3340 osd_op.rval = 0;
3341 ctx->pg->execute_ctx(ctx);
3342 } else {
3343 // on cancel simply toss op out,
3344 // or requeue as requested
3345 if (r != -ECANCELED) {
3346 if (ctx->op)
3347 ctx->pg->osd->reply_op_error(ctx->op, r);
3348 } else if (requeue) {
3349 if (ctx->op)
3350 ctx->pg->requeue_op(ctx->op);
3351 }
3352 ctx->pg->close_op_ctx(ctx);
3353 }
3354 }
3355 void set_requeue(bool rq) {
3356 requeue = rq;
3357 }
3358 };
3359
3360 struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3361 OSDOp& osd_op;
3362
3363 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3364 }
3365
3366 int execute() override {
3367 return osd_op.rval;
3368 }
3369 };
3370
3371 struct C_SetManifestRefCountDone : public Context {
3372 PrimaryLogPGRef pg;
3373 hobject_t soid;
3374 uint64_t offset;
3375 ceph_tid_t tid = 0;
3376 C_SetManifestRefCountDone(PrimaryLogPG *p,
3377 hobject_t soid, uint64_t offset) :
3378 pg(p), soid(soid), offset(offset) {}
3379 void finish(int r) override {
3380 if (r == -ECANCELED)
3381 return;
3382 std::scoped_lock locker{*pg};
3383 pg->finish_set_manifest_refcount(soid, r, tid, offset);
3384 }
3385 };
3386
3387 struct C_SetDedupChunks : public Context {
3388 PrimaryLogPGRef pg;
3389 hobject_t oid;
3390 epoch_t last_peering_reset;
3391 ceph_tid_t tid;
3392 uint64_t offset;
3393
3394 C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset)
3395 : pg(p), oid(o), last_peering_reset(lpr),
3396 tid(0), offset(offset)
3397 {}
3398 void finish(int r) override {
3399 if (r == -ECANCELED)
3400 return;
3401 std::scoped_lock locker{*pg};
3402 if (last_peering_reset != pg->get_last_peering_reset()) {
3403 return;
3404 }
3405 pg->finish_set_dedup(oid, r, tid, offset);
3406 }
3407 };
3408
3409 void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
3410 {
3411 dout(10) << __func__ << dendl;
3412 auto p = manifest_ops.begin();
3413 while (p != manifest_ops.end()) {
3414 auto mop = p->second;
3415 // cancel objecter op, if we can
3416 if (mop->objecter_tid) {
3417 tids->push_back(mop->objecter_tid);
3418 mop->objecter_tid = 0;
3419 } else if (!mop->tids.empty()) {
3420 for (auto &p : mop->tids) {
3421 tids->push_back(p.second);
3422 }
3423 }
3424 if (mop->cb) {
3425 mop->cb->set_requeue(requeue);
3426 mop->cb->complete(-ECANCELED);
3427 }
3428 manifest_ops.erase(p++);
3429 }
3430 }
3431
3432 int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid, OpRequestRef op)
3433 {
3434 int cnt = 0;
3435 // head
3436 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3437 if (p.second.oid.oid.name == fp_oid) {
3438 cnt++;
3439 }
3440 }
3441 // snap
3442 SnapSet& ss = obc->ssc->snapset;
3443 const OSDMapRef& osdmap = get_osdmap();
3444 for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin();
3445 p != ss.clones.rend();
3446 ++p) {
3447 object_ref_delta_t refs;
3448 ObjectContextRef obc_l = nullptr;
3449 ObjectContextRef obc_g = nullptr;
3450 hobject_t clone_oid = obc->obs.oi.soid;
3451 clone_oid.snap = *p;
3452 if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
3453 return -EBUSY;
3454 }
3455 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
3456 if (!clone_obc) {
3457 break;
3458 }
3459 if (recover_adjacent_clones(clone_obc, op)) {
3460 return -EAGAIN;
3461 }
3462 get_adjacent_clones(clone_obc, obc_l, obc_g);
3463 clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set(
3464 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3465 nullptr,
3466 refs);
3467 for (auto p = refs.begin(); p != refs.end(); ++p) {
3468 if (p->first.oid.name == fp_oid && p->second > 0) {
3469 cnt += p->second;
3470 }
3471 }
3472 }
3473
3474 return cnt;
3475 }
3476
3477 bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
3478 {
3479 if (!obc->ssc || !obc->ssc->snapset.clones.size()) {
3480 return false;
3481 }
3482 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3483 bool has_manifest_op = std::any_of(
3484 begin(m->ops),
3485 end(m->ops),
3486 [](const auto& osd_op) {
3487 return osd_op.op.op == CEPH_OSD_OP_SET_CHUNK;
3488 });
3489 if (!obc->obs.oi.manifest.is_chunked() && !has_manifest_op) {
3490 return false;
3491 }
3492 ceph_assert(op);
3493
3494 const SnapSet& snapset = obc->ssc->snapset;
3495 auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
3496 auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
3497 hobject_t cid = obc->obs.oi.soid;
3498 cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
3499 if (is_unreadable_object(cid)) {
3500 dout(10) << __func__ << ": clone " << cid
3501 << " is unreadable, waiting" << dendl;
3502 wait_for_unreadable_object(cid, op);
3503 return true;
3504 }
3505 return false;
3506 };
3507 if (s != snapset.clones.begin()) {
3508 if (is_unreadable_snap(s - 1)) {
3509 return true;
3510 }
3511 }
3512 if (s != snapset.clones.end()) {
3513 if (is_unreadable_snap(s + 1)) {
3514 return true;
3515 }
3516 }
3517 return false;
3518 }
3519
3520 ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
3521 {
3522 auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(),
3523 obc->obs.oi.soid.snap);
3524 if (s != obc->ssc->snapset.clones.begin()) {
3525 auto s_iter = s - 1;
3526 hobject_t cid = obc->obs.oi.soid;
3527 object_ref_delta_t refs;
3528 cid.snap = *s_iter;
3529 ObjectContextRef cobc = get_object_context(cid, false, NULL);
3530 ceph_assert(cobc);
3531 return cobc;
3532 }
3533 return nullptr;
3534 }
3535
3536 void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs)
3537 {
3538 for (auto p = refs.begin(); p != refs.end(); ++p) {
3539 int dec_ref_count = p->second;
3540 ceph_assert(dec_ref_count < 0);
3541 while (dec_ref_count < 0) {
3542 dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl;
3543 refcount_manifest(soid, p->first,
3544 refcount_t::DECREMENT_REF, NULL, std::nullopt);
3545 dec_ref_count++;
3546 }
3547 }
3548 }
3549
3550
3551 void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
3552 ObjectContextRef& _l, ObjectContextRef& _g)
3553 {
3554 const SnapSet& snapset = src_obc->ssc->snapset;
3555 const object_info_t& oi = src_obc->obs.oi;
3556
3557 auto get_context = [this, &oi, &snapset](auto iter)
3558 -> ObjectContextRef {
3559 hobject_t cid = oi.soid;
3560 cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
3561 ObjectContextRef obc = get_object_context(cid, false, NULL);
3562 ceph_assert(obc);
3563 return obc;
3564 };
3565
3566 // check adjacent clones
3567 auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
3568
3569 // We *must* find the clone iff it's not head,
3570 // let s == snapset.clones.end() mean head
3571 ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
3572
3573 if (s != snapset.clones.begin()) {
3574 _l = get_context(s - 1);
3575 }
3576
3577 if (s != snapset.clones.end()) {
3578 _g = get_context(s + 1);
3579 }
3580 }
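
// Sketch of the neighbor lookup above: clones are kept sorted, the end()
// position stands for the head object, and the adjacent objects are simply
// the entries at s - 1 and s + 1 (with head one past the newest clone).
// Snap/HEAD are hypothetical stand-ins for snapid_t/CEPH_NOSNAP; assumes a
// non-empty clone list when called for head.
#if 0
#include <algorithm>
#include <optional>
#include <utility>
#include <vector>

using Snap = unsigned;
constexpr Snap HEAD = ~0u;

std::pair<std::optional<Snap>, std::optional<Snap>>
neighbors(const std::vector<Snap>& clones, Snap snap) {
  auto s = (snap == HEAD) ? clones.end()
                          : std::find(clones.begin(), clones.end(), snap);
  std::optional<Snap> older, newer;
  if (s != clones.begin())
    older = *(s - 1);                                  // next-older clone
  if (s != clones.end())
    newer = (s + 1 == clones.end()) ? HEAD : *(s + 1); // next-newer, or head
  return {older, newer};
}
#endif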
3581
3582 bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
3583 OSDOp& osd_op)
3584 {
3585 object_ref_delta_t refs;
3586 ObjectContextRef obc_l, obc_g;
3587 get_adjacent_clones(ctx->obc, obc_l, obc_g);
3588 set_chunk.calc_refs_to_inc_on_set(
3589 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
3590 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3591 refs);
3592 bool need_inc_ref = false;
3593 if (!refs.is_empty()) {
3594 ManifestOpRef mop(std::make_shared<ManifestOp>());
3595 for (auto c : set_chunk.chunk_map) {
3596 auto p = refs.find(c.second.oid);
3597 if (p == refs.end()) {
3598 continue;
3599 }
3600
3601 int inc_ref_count = p->second;
3602 if (inc_ref_count > 0) {
3603 /*
3604 * In the set-chunk case, the first thing we should do is increment
3605 * the reference count on the target object before updating
3606 * object_manifest in object_info_t, so call refcount_manifest directly.
3607 */
3608 auto target_oid = p->first;
3609 auto offset = c.first;
3610 auto length = c.second.length;
3611 auto* fin = new C_SetManifestRefCountDone(this, ctx->obs->oi.soid, offset);
3612 ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, target_oid,
3613 refcount_t::INCREMENT_REF, fin, std::nullopt);
3614 fin->tid = tid;
3615 mop->chunks[target_oid] = make_pair(offset, length);
3616 mop->num_chunks++;
3617 mop->tids[offset] = tid;
3618
3619 if (!ctx->obc->is_blocked()) {
3620 ctx->obc->start_block();
3621 }
3622 need_inc_ref = true;
3623 } else if (inc_ref_count < 0) {
3624 hobject_t src = ctx->obs->oi.soid;
3625 hobject_t tgt = p->first;
3626 ctx->register_on_commit(
3627 [src, tgt, this](){
3628 refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
3629 });
3630 }
3631 }
3632 if (mop->tids.size()) {
3633 mop->cb = new RefCountCallback(ctx, osd_op);
3634 manifest_ops[ctx->obs->oi.soid] = mop;
3635 manifest_ops[ctx->obs->oi.soid]->op = ctx->op;
3636 }
3637 }
3638
3639 return need_inc_ref;
3640 }
3641
3642 void PrimaryLogPG::update_chunk_map_by_dirty(OpContext* ctx) {
3643 /*
3644 * We should consider two cases here:
3645 * 1) plain modification: this created dirty regions, but didn't update chunk_map.
3646 * 2) rollback: in a rollback, head will be converted to the clone the rollback targets,
3647 * and the rollback has already updated chunk_map.
3648 * So, what we should do here is check whether chunk_map was updated and whether the clean_regions have dirty regions.
3649 * In the rollback case, chunk_map doesn't need to be cleared.
3650 */
3651 for (auto &p : ctx->obs->oi.manifest.chunk_map) {
3652 if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
3653 ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
3654 if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
3655 ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
3656 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
3657 ctx->delta_stats.num_objects_manifest--;
3658 }
3659 }
3660 }
3661 }
3662
3663 void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
3664 {
3665 object_ref_delta_t refs;
3666 ObjectContextRef cobc = nullptr;
3667 ObjectContextRef obc = ctx->obc;
3668 // Look at the previous snapshot, then figure out whether the updated chunk needs to be deleted
3669 cobc = get_prev_clone_obc(obc);
3670 obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
3671 cobc ? &cobc->obs.oi.manifest : nullptr,
3672 ctx->clean_regions,
3673 refs);
3674 if (!refs.is_empty()) {
3675 hobject_t soid = obc->obs.oi.soid;
3676 ctx->register_on_commit(
3677 [soid, this, refs](){
3678 dec_refcount(soid, refs);
3679 });
3680 }
3681 }
3682
3683 void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
3684 {
3685 ceph_assert(oi.has_manifest());
3686 ceph_assert(ctx->obc->ssc);
3687
3688 if (oi.manifest.is_chunked()) {
3689 object_ref_delta_t refs;
3690 ObjectContextRef obc_l, obc_g, obc;
3691 /* in trim_object, oi and ctx can have different oid */
3692 obc = get_object_context(oi.soid, false, NULL);
3693 ceph_assert(obc);
3694 get_adjacent_clones(obc, obc_l, obc_g);
3695 oi.manifest.calc_refs_to_drop_on_removal(
3696 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
3697 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3698 refs);
3699
3700 if (!refs.is_empty()) {
3701 /* dec_refcount will use head object anyway */
3702 hobject_t soid = ctx->obc->obs.oi.soid;
3703 ctx->register_on_commit(
3704 [soid, this, refs](){
3705 dec_refcount(soid, refs);
3706 });
3707 }
3708 } else if (oi.manifest.is_redirect() &&
3709 oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
3710 ctx->register_on_commit(
3711 [oi, this](){
3712 refcount_manifest(oi.soid, oi.manifest.redirect_target,
3713 refcount_t::DECREMENT_REF, NULL, std::nullopt);
3714 });
3715 }
3716 }
3717
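/*
 * Submit a cls_cas refcount operation against tgt_soid via the Objecter:
 *   INCREMENT_REF     -> cas.chunk_get_ref
 *   DECREMENT_REF     -> cas.chunk_put_ref
 *   CREATE_OR_GET_REF -> cas.chunk_create_or_get_ref (also carries the
 *                        chunk payload, so 'chunk' must be provided)
 * src_soid's head is recorded as the reference source in each call. An
 * optional callback is wrapped in C_OnFinisher so it completes on the
 * objecter finisher; the returned tid lets callers track or cancel the
 * in-flight mutation.
 */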
3718 ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
3719 Context *cb, std::optional<bufferlist> chunk)
3720 {
3721 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3722 CEPH_OSD_FLAG_RWORDERED;
3723
3724 dout(10) << __func__ << " Start refcount from " << src_soid
3725 << " to " << tgt_soid << dendl;
3726
3727 ObjectOperation obj_op;
3728 bufferlist in;
3729 if (type == refcount_t::INCREMENT_REF) {
3730 cls_cas_chunk_get_ref_op call;
3731 call.source = src_soid.get_head();
3732 ::encode(call, in);
3733 obj_op.call("cas", "chunk_get_ref", in);
3734 } else if (type == refcount_t::DECREMENT_REF) {
3735 cls_cas_chunk_put_ref_op call;
3736 call.source = src_soid.get_head();
3737 ::encode(call, in);
3738 obj_op.call("cas", "chunk_put_ref", in);
3739 } else if (type == refcount_t::CREATE_OR_GET_REF) {
3740 cls_cas_chunk_create_or_get_ref_op get_call;
3741 get_call.source = src_soid.get_head();
3742 ceph_assert(chunk);
3743 get_call.data = std::move(*chunk);
3744 ::encode(get_call, in);
3745 obj_op.call("cas", "chunk_create_or_get_ref", in);
3746 } else {
3747 ceph_assert(0 == "unrecognized type");
3748 }
3749
3750 Context *c = nullptr;
3751 if (cb) {
3752 c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
3753 }
3754
3755 object_locator_t oloc(tgt_soid);
3756 ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
3757 ceph_assert(src_obc);
3758 auto tid = osd->objecter->mutate(
3759 tgt_soid.oid, oloc, obj_op, SnapContext(),
3760 ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
3761 flags, c);
3762 return tid;
3763 }
3764
3765 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3766 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3767 uint64_t req_total_len, bool write_ordered)
3768 {
3769 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3770 object_manifest_t *manifest = &obc->obs.oi.manifest;
3771 if (!manifest->chunk_map.count(chunk_index)) {
3772 return;
3773 }
3774 uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3775 hobject_t soid = manifest->chunk_map[chunk_index].oid;
3776 hobject_t ori_soid = m->get_hobj();
3777 object_locator_t oloc(soid);
3778 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3779 if (write_ordered) {
3780 flags |= CEPH_OSD_FLAG_RWORDERED;
3781 }
3782
3783 if (!chunk_length || soid == hobject_t()) {
3784 return;
3785 }
3786
3787 /* same as do_proxy_read() */
3788 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3789 CEPH_OSD_FLAG_ORDERSNAP |
3790 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3791 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3792
3793 dout(10) << __func__ << " Starting chunk proxy read for " << *m
3794 << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3795 << " req_length: " << req_length << dendl;
3796
3797 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3798
3799 ObjectOperation *pobj_op = new ObjectOperation;
3800 OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3801
3802 if (chunk_index <= req_offset) {
3803 osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3804 } else {
3805 ceph_abort_msg("chunk_index > req_offset");
3806 }
3807 osd_op.op.extent.length = req_length;
3808
3809 ObjectOperation obj_op;
3810 obj_op.dup(pobj_op->ops);
3811
3812 C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3813 prdop);
3814 fin->obj_op = pobj_op;
3815 fin->op_index = op_index;
3816 fin->req_offset = req_offset;
3817 fin->obc = obc;
3818 fin->req_total_len = req_total_len;
3819
3820 ceph_tid_t tid = osd->objecter->read(
3821 soid.oid, oloc, obj_op,
3822 m->get_snapid(), NULL,
3823 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3824 &prdop->user_version,
3825 &prdop->data_offset,
3826 m->get_features());
3827 fin->tid = tid;
3828 prdop->objecter_tid = tid;
3829 proxyread_ops[tid] = prdop;
3830 in_progress_proxy_ops[ori_soid].push_back(op);
3831 }
3832
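/*
 * A chunked read can be proxied only when every op in the message is a
 * READ/SYNC_READ whose requested range is fully covered by chunk_map
 * entries that are still in the missing state; a range backed by an
 * already-present chunk, a gap in the map, or any non-read op makes this
 * return false.
 */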
3833 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3834 {
3835 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3836 OSDOp *osd_op = NULL;
3837 bool ret = true;
3838 for (unsigned int i = 0; i < m->ops.size(); i++) {
3839 osd_op = &m->ops[i];
3840 ceph_osd_op op = osd_op->op;
3841 switch (op.op) {
3842 case CEPH_OSD_OP_READ:
3843 case CEPH_OSD_OP_SYNC_READ: {
3844 uint64_t cursor = osd_op->op.extent.offset;
3845 uint64_t remain = osd_op->op.extent.length;
3846
3847 /* do the requested chunks exist in chunk_map? */
3848 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3849 if (p.first <= cursor && p.first + p.second.length > cursor) {
3850 if (!p.second.is_missing()) {
3851 return false;
3852 }
3853 if (p.second.length >= remain) {
3854 remain = 0;
3855 break;
3856 } else {
3857 remain = remain - p.second.length;
3858 }
3859 cursor += p.second.length;
3860 }
3861 }
3862
3863 if (remain) {
3864 dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3865 return false;
3866 }
3867 continue;
3868 }
3869 default:
3870 return false;
3871 }
3872 }
3873 return ret;
3874 }
3875
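/*
 * Completion path for a proxied write: look up and retire the
 * ProxyWriteOp, drop the op from in_progress_proxy_ops, and, unless a
 * duplicate of the same op is still pending in that list, send the client
 * an ACK|ONDISK reply that claims the out-data of the proxied ops.
 */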
3876 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3877 {
3878 dout(10) << __func__ << " " << oid << " tid " << tid
3879 << " " << cpp_strerror(r) << dendl;
3880
3881 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3882 if (p == proxywrite_ops.end()) {
3883 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3884 return;
3885 }
3886 ProxyWriteOpRef pwop = p->second;
3887 ceph_assert(tid == pwop->objecter_tid);
3888 ceph_assert(oid == pwop->soid);
3889
3890 proxywrite_ops.erase(tid);
3891
3892 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3893 if (q == in_progress_proxy_ops.end()) {
3894 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3895 delete pwop->ctx;
3896 pwop->ctx = NULL;
3897 return;
3898 }
3899 list<OpRequestRef>& in_progress_op = q->second;
3900 ceph_assert(in_progress_op.size());
3901 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3902 in_progress_op.end(),
3903 pwop->op);
3904 ceph_assert(it != in_progress_op.end());
3905 in_progress_op.erase(it);
3906 if (in_progress_op.size() == 0) {
3907 in_progress_proxy_ops.erase(oid);
3908 } else if (std::find(in_progress_op.begin(),
3909 in_progress_op.end(),
3910 pwop->op) != in_progress_op.end()) {
3911 if (pwop->ctx)
3912 delete pwop->ctx;
3913 pwop->ctx = NULL;
3914 dout(20) << __func__ << " " << oid << " tid " << tid
3915 << " in_progress_op size: "
3916 << in_progress_op.size() << dendl;
3917 return;
3918 }
3919
3920 osd->logger->inc(l_osd_tier_proxy_write);
3921
3922 auto m = pwop->op->get_req<MOSDOp>();
3923 ceph_assert(m != NULL);
3924
3925 if (!pwop->sent_reply) {
3926 // send commit.
3927 ceph_assert(pwop->ctx->reply == nullptr);
3928 MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
3929 true /* we claim it below */);
3930 reply->set_reply_versions(eversion_t(), pwop->user_version);
3931 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3932 reply->claim_op_out_data(pwop->ops);
3933 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3934 osd->send_message_osd_client(reply, m->get_connection());
3935 pwop->sent_reply = true;
3936 pwop->ctx->op->mark_commit_sent();
3937 }
3938
3939 delete pwop->ctx;
3940 pwop->ctx = NULL;
3941 }
3942
3943 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3944 vector<ceph_tid_t> *tids)
3945 {
3946 dout(10) << __func__ << " " << pwop->soid << dendl;
3947 pwop->canceled = true;
3948
3949 // cancel objecter op, if we can
3950 if (pwop->objecter_tid) {
3951 tids->push_back(pwop->objecter_tid);
3952 delete pwop->ctx;
3953 pwop->ctx = NULL;
3954 proxywrite_ops.erase(pwop->objecter_tid);
3955 pwop->objecter_tid = 0;
3956 }
3957 }
3958
3959 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3960 ObjectContextRef obc;
3961 PrimaryLogPG *pg;
3962 utime_t start;
3963 public:
3964 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3965 : obc(obc_),
3966 pg(pg_),
3967 start(ceph_clock_now()) {}
3968
3969 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3970 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3971 int r = results.get<0>();
3972 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
3973 pg->finish_promote_manifest(r, results_data, obc);
3974 } else {
3975 pg->finish_promote(r, results_data, obc);
3976 }
3977 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3978 }
3979 };
3980
3981 class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3982 ObjectContextRef obc;
3983 PrimaryLogPG *pg;
3984 utime_t start;
3985 PrimaryLogPG::OpContext *ctx;
3986 PrimaryLogPG::CopyCallbackResults promote_results;
3987 public:
3988 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx)
3989 : obc(obc_),
3990 pg(pg_),
3991 start(ceph_clock_now()), ctx(ctx) {}
3992
3993 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3994 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3995 int r = results.get<0>();
3996 promote_results = results;
3997 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
3998 ctx->user_at_version = results_data->user_version;
3999 }
4000 if (r >= 0) {
4001 ctx->pg->execute_ctx(ctx);
4002 } else {
4003 if (r != -ECANCELED) {
4004 if (ctx->op)
4005 ctx->pg->osd->reply_op_error(ctx->op, r);
4006 } else if (results_data->should_requeue) {
4007 if (ctx->op)
4008 ctx->pg->requeue_op(ctx->op);
4009 }
4010 ctx->pg->close_op_ctx(ctx);
4011 }
4012 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
4013 }
4014 friend struct PromoteFinisher;
4015 };
4016
4017 struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
4018 PromoteManifestCallback *promote_callback;
4019
4020 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
4021 : promote_callback(promote_callback) {
4022 }
4023
4024 int execute() override {
4025 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
4026 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
4027 promote_callback->promote_results.get<1>(),
4028 promote_callback->obc);
4029 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
4030 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
4031 promote_callback->promote_results.get<1>(),
4032 promote_callback->obc);
4033 } else {
4034 ceph_abort_msg("unrecognized manifest type");
4035 }
4036 return 0;
4037 }
4038 };
4039
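/*
 * Kick off a promotion into this pool. Plain objects are copied from the
 * base tier (pool.info.tier_of); redirect manifests are copied from their
 * redirect_target; chunked manifests go through the manifest-aware
 * finish_promote_manifest() path. The op (if any) is parked while the
 * copy runs: it is delayed when scrub blocks writes to the head, and
 * otherwise waits on the blocked obc until the PromoteCallback completes.
 * DONTNEED is applied to the source read only when no proxy-reads are in
 * flight for the object.
 */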
4040 void PrimaryLogPG::promote_object(ObjectContextRef obc,
4041 const hobject_t& missing_oid,
4042 const object_locator_t& oloc,
4043 OpRequestRef op,
4044 ObjectContextRef *promote_obc)
4045 {
4046 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
4047 ceph_assert(hoid != hobject_t());
4048 if (m_scrubber->write_blocked_by_scrub(hoid)) {
4049 dout(10) << __func__ << " " << hoid
4050 << " blocked by scrub" << dendl;
4051 if (op) {
4052 waiting_for_scrub.push_back(op);
4053 op->mark_delayed("waiting for scrub");
4054 dout(10) << __func__ << " " << hoid
4055 << " placing op in waiting_for_scrub" << dendl;
4056 } else {
4057 dout(10) << __func__ << " " << hoid
4058 << " no op, dropping on the floor" << dendl;
4059 }
4060 return;
4061 }
4062 if (op && !check_laggy_requeue(op)) {
4063 return;
4064 }
4065 if (!obc) { // we need to create an ObjectContext
4066 ceph_assert(missing_oid != hobject_t());
4067 obc = get_object_context(missing_oid, true);
4068 }
4069 if (promote_obc)
4070 *promote_obc = obc;
4071
4072 /*
4073 * If there are proxy-reads in flight for the object before the promote
4074 * completes, we don't use DONTNEED.
4075 */
4076 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
4077 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
4078 if (q == in_progress_proxy_ops.end()) {
4079 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
4080 }
4081
4082 CopyCallback *cb;
4083 object_locator_t my_oloc;
4084 hobject_t src_hoid;
4085 if (!obc->obs.oi.has_manifest()) {
4086 my_oloc = oloc;
4087 my_oloc.pool = pool.info.tier_of;
4088 src_hoid = obc->obs.oi.soid;
4089 cb = new PromoteCallback(obc, this);
4090 } else {
4091 if (obc->obs.oi.manifest.is_chunked()) {
4092 src_hoid = obc->obs.oi.soid;
4093 cb = new PromoteCallback(obc, this);
4094 } else if (obc->obs.oi.manifest.is_redirect()) {
4095 object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
4096 my_oloc = src_oloc;
4097 src_hoid = obc->obs.oi.manifest.redirect_target;
4098 cb = new PromoteCallback(obc, this);
4099 } else {
4100 ceph_abort_msg("unrecognized manifest type");
4101 }
4102 }
4103
4104 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
4105 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
4106 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
4107 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
4108 start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
4109 obc->obs.oi.soid.snap == CEPH_NOSNAP,
4110 src_fadvise_flags, 0);
4111
4112 ceph_assert(obc->is_blocked());
4113
4114 if (op)
4115 wait_for_blocked_object(obc->obs.oi.soid, op);
4116
4117 recovery_state.update_stats(
4118 [](auto &history, auto &stats) {
4119 stats.stats.sum.num_promote++;
4120 return false;
4121 });
4122 }
4123
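/*
 * Drive a prepared OpContext to completion. This must stay idempotent:
 * finish_copyfrom() may re-run it, so the object state, transaction and
 * update_log_only flag are reset on entry. Reads and errors are answered
 * inline via complete_read_ctx(); log-only updates (e.g. write errors)
 * are recorded with record_write_error(); real writes pick a snap
 * context, take a new version, and are submitted as a RepGather through
 * issue_repop(), with the client reply deferred to the on-commit callback.
 */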
4124 void PrimaryLogPG::execute_ctx(OpContext *ctx)
4125 {
4126 FUNCTRACE(cct);
4127 dout(10) << __func__ << " " << ctx << dendl;
4128 ctx->reset_obs(ctx->obc);
4129 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
4130 OpRequestRef op = ctx->op;
4131 auto m = op->get_req<MOSDOp>();
4132 ObjectContextRef obc = ctx->obc;
4133 const hobject_t& soid = obc->obs.oi.soid;
4134
4135 // this method must be idempotent since we may call it several times
4136 // before we finally apply the resulting transaction.
4137 ctx->op_t.reset(new PGTransaction);
4138
4139 if (op->may_write() || op->may_cache()) {
4140 // snap
4141 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
4142 pool.info.is_pool_snaps_mode()) {
4143 // use pool's snapc
4144 ctx->snapc = pool.snapc;
4145 } else {
4146 // client specified snapc
4147 ctx->snapc.seq = m->get_snap_seq();
4148 ctx->snapc.snaps = m->get_snaps();
4149 filter_snapc(ctx->snapc.snaps);
4150 }
4151 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
4152 ctx->snapc.seq < obc->ssc->snapset.seq) {
4153 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
4154 << " < snapset seq " << obc->ssc->snapset.seq
4155 << " on " << obc->obs.oi.soid << dendl;
4156 reply_ctx(ctx, -EOLDSNAPC);
4157 return;
4158 }
4159
4160 // version
4161 ctx->at_version = get_next_version();
4162 ctx->mtime = m->get_mtime();
4163
4164 dout(10) << __func__ << " " << soid << " " << *ctx->ops
4165 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
4166 << " snapc " << ctx->snapc
4167 << " snapset " << obc->ssc->snapset
4168 << dendl;
4169 } else {
4170 dout(10) << __func__ << " " << soid << " " << *ctx->ops
4171 << " ov " << obc->obs.oi.version
4172 << dendl;
4173 }
4174
4175 if (!ctx->user_at_version)
4176 ctx->user_at_version = obc->obs.oi.user_version;
4177 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
4178
4179 {
4180 #ifdef WITH_LTTNG
4181 osd_reqid_t reqid = ctx->op->get_reqid();
4182 #endif
4183 tracepoint(osd, prepare_tx_enter, reqid.name._type,
4184 reqid.name._num, reqid.tid, reqid.inc);
4185 }
4186
4187 [[maybe_unused]] auto span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
4188
4189 int result = prepare_transaction(ctx);
4190
4191 {
4192 #ifdef WITH_LTTNG
4193 osd_reqid_t reqid = ctx->op->get_reqid();
4194 #endif
4195 tracepoint(osd, prepare_tx_exit, reqid.name._type,
4196 reqid.name._num, reqid.tid, reqid.inc);
4197 }
4198
4199 bool pending_async_reads = !ctx->pending_async_reads.empty();
4200 if (result == -EINPROGRESS || pending_async_reads) {
4201 // come back later.
4202 if (pending_async_reads) {
4203 ceph_assert(pool.info.is_erasure());
4204 in_progress_async_reads.push_back(make_pair(op, ctx));
4205 ctx->start_async_reads(this);
4206 }
4207 return;
4208 }
4209
4210 if (result == -EAGAIN) {
4211 // clean up after the ctx
4212 close_op_ctx(ctx);
4213 return;
4214 }
4215
4216 bool ignore_out_data = false;
4217 if (!ctx->op_t->empty() &&
4218 op->may_write() &&
4219 result >= 0) {
4220 // successful update
4221 if (ctx->op->allows_returnvec()) {
4222 // enforce reasonable bound on the return buffer sizes
4223 for (auto& i : *ctx->ops) {
4224 if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
4225 dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
4226 result = -EOVERFLOW; // overall result is overflow
4227 i.rval = -EOVERFLOW;
4228 i.outdata.clear();
4229 }
4230 }
4231 } else {
4232 // legacy behavior -- zero result and return data etc.
4233 ignore_out_data = true;
4234 result = 0;
4235 }
4236 }
4237
4238 // prepare the reply
4239 ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
4240 ignore_out_data);
4241 dout(20) << __func__ << " alloc reply " << ctx->reply
4242 << " result " << result << dendl;
4243
4244 // read or error?
4245 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
4246 // finish side-effects
4247 if (result >= 0)
4248 do_osd_op_effects(ctx, m->get_connection());
4249
4250 complete_read_ctx(result, ctx);
4251 return;
4252 }
4253
4254 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
4255
4256 ceph_assert(op->may_write() || op->may_cache());
4257
4258 // trim log?
4259 recovery_state.update_trim_to();
4260
4261 // verify that we are doing this in order?
4262 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
4263 !pool.info.is_tier() && !pool.info.has_tiers()) {
4264 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
4265 ceph_tid_t t = m->get_tid();
4266 client_t n = m->get_source().num();
4267 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
4268 if (p == cm.end()) {
4269 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
4270 cm[n] = t;
4271 } else {
4272 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
4273 if (p->second > t) {
4274 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
4275 ceph_abort_msg("out of order op");
4276 }
4277 p->second = t;
4278 }
4279 }
4280
4281 if (ctx->update_log_only) {
4282 if (result >= 0)
4283 do_osd_op_effects(ctx, m->get_connection());
4284
4285 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
4286 // save just what we need from ctx
4287 MOSDOpReply *reply = ctx->reply;
4288 ctx->reply = nullptr;
4289 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
4290
4291 if (result == -ENOENT) {
4292 reply->set_enoent_reply_versions(info.last_update,
4293 info.last_user_version);
4294 }
4295 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4296 // append to pg log for dup detection - don't save buffers for now
4297 record_write_error(op, soid, reply, result,
4298 ctx->op->allows_returnvec() ? ctx : nullptr);
4299 close_op_ctx(ctx);
4300 return;
4301 }
4302
4303 // no need to capture PG ref, repop cancel will handle that
4304 // Can capture the ctx by pointer, it's owned by the repop
4305 ctx->register_on_commit(
4306 [m, ctx, this](){
4307 if (ctx->op)
4308 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4309
4310 if (m && !ctx->sent_reply) {
4311 MOSDOpReply *reply = ctx->reply;
4312 ctx->reply = nullptr;
4313 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4314 dout(10) << " sending reply on " << *m << " " << reply << dendl;
4315 osd->send_message_osd_client(reply, m->get_connection());
4316 ctx->sent_reply = true;
4317 ctx->op->mark_commit_sent();
4318 }
4319 });
4320 ctx->register_on_success(
4321 [ctx, this]() {
4322 do_osd_op_effects(
4323 ctx,
4324 ctx->op ? ctx->op->get_req()->get_connection() :
4325 ConnectionRef());
4326 });
4327 ctx->register_on_finish(
4328 [ctx]() {
4329 delete ctx;
4330 });
4331
4332 // issue replica writes
4333 ceph_tid_t rep_tid = osd->get_tid();
4334
4335 RepGather *repop = new_repop(ctx, rep_tid);
4336
4337 issue_repop(repop, ctx);
4338 eval_repop(repop);
4339 repop->put();
4340 }
4341
4342 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4343 release_object_locks(ctx->lock_manager);
4344
4345 ctx->op_t.reset();
4346
4347 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4348 ctx->on_finish.erase(p++)) {
4349 (*p)();
4350 }
4351 delete ctx;
4352 }
4353
4354 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4355 {
4356 if (ctx->op)
4357 osd->reply_op_error(ctx->op, r);
4358 close_op_ctx(ctx);
4359 }
4360
4361 void PrimaryLogPG::log_op_stats(const OpRequest& op,
4362 const uint64_t inb,
4363 const uint64_t outb)
4364 {
4365 auto m = op.get_req<MOSDOp>();
4366 const utime_t now = ceph_clock_now();
4367
4368 const utime_t latency = now - m->get_recv_stamp();
4369 const utime_t process_latency = now - op.get_dequeued_time();
4370
4371 osd->logger->inc(l_osd_op);
4372
4373 osd->logger->inc(l_osd_op_outb, outb);
4374 osd->logger->inc(l_osd_op_inb, inb);
4375 osd->logger->tinc(l_osd_op_lat, latency);
4376 osd->logger->tinc(l_osd_op_process_lat, process_latency);
4377
4378 if (op.may_read() && op.may_write()) {
4379 osd->logger->inc(l_osd_op_rw);
4380 osd->logger->inc(l_osd_op_rw_inb, inb);
4381 osd->logger->inc(l_osd_op_rw_outb, outb);
4382 osd->logger->tinc(l_osd_op_rw_lat, latency);
4383 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4384 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4385 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4386 } else if (op.may_read()) {
4387 osd->logger->inc(l_osd_op_r);
4388 osd->logger->inc(l_osd_op_r_outb, outb);
4389 osd->logger->tinc(l_osd_op_r_lat, latency);
4390 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4391 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4392 } else if (op.may_write() || op.may_cache()) {
4393 osd->logger->inc(l_osd_op_w);
4394 osd->logger->inc(l_osd_op_w_inb, inb);
4395 osd->logger->tinc(l_osd_op_w_lat, latency);
4396 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4397 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4398 } else {
4399 ceph_abort();
4400 }
4401
4402 dout(15) << "log_op_stats " << *m
4403 << " inb " << inb
4404 << " outb " << outb
4405 << " lat " << latency << dendl;
4406
4407 if (m_dynamic_perf_stats.is_enabled()) {
4408 m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4409 }
4410 }
4411
4412 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4413 const std::list<OSDPerfMetricQuery> &queries)
4414 {
4415 m_dynamic_perf_stats.set_queries(queries);
4416 }
4417
4418 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4419 {
4420 std::swap(m_dynamic_perf_stats, *stats);
4421 }
4422
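/*
 * Handle MOSDPGScan traffic for backfill. OP_SCAN_GET_DIGEST runs on the
 * backfill target: unless the OSD is too full (which cancels backfill),
 * it scans a bounded range of local objects (osd_backfill_scan_min..max)
 * and replies with an OP_SCAN_DIGEST carrying the interval and its object
 * versions. OP_SCAN_DIGEST runs on the primary: it records the peer's
 * interval in peer_backfill_info and, once every backfill target has
 * answered, finishes the outstanding recovery op.
 */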
4423 void PrimaryLogPG::do_scan(
4424 OpRequestRef op,
4425 ThreadPool::TPHandle &handle)
4426 {
4427 auto m = op->get_req<MOSDPGScan>();
4428 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4429 dout(10) << "do_scan " << *m << dendl;
4430
4431 op->mark_started();
4432
4433 switch (m->op) {
4434 case MOSDPGScan::OP_SCAN_GET_DIGEST:
4435 {
4436 auto dpp = get_dpp();
4437 if (osd->check_backfill_full(dpp)) {
4438 dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4439 queue_peering_event(
4440 PGPeeringEventRef(
4441 std::make_shared<PGPeeringEvent>(
4442 get_osdmap_epoch(),
4443 get_osdmap_epoch(),
4444 PeeringState::BackfillTooFull())));
4445 return;
4446 }
4447
4448 BackfillInterval bi;
4449 bi.begin = m->begin;
4450 // No need to flush; there won't be any in-progress writes occurring
4451 // past m->begin
4452 scan_range(
4453 cct->_conf->osd_backfill_scan_min,
4454 cct->_conf->osd_backfill_scan_max,
4455 &bi,
4456 handle);
4457 MOSDPGScan *reply = new MOSDPGScan(
4458 MOSDPGScan::OP_SCAN_DIGEST,
4459 pg_whoami,
4460 get_osdmap_epoch(), m->query_epoch,
4461 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4462 encode(bi.objects, reply->get_data());
4463 osd->send_message_osd_cluster(reply, m->get_connection());
4464 }
4465 break;
4466
4467 case MOSDPGScan::OP_SCAN_DIGEST:
4468 {
4469 pg_shard_t from = m->from;
4470
4471 // Check that from is in backfill_targets vector
4472 ceph_assert(is_backfill_target(from));
4473
4474 BackfillInterval& bi = peer_backfill_info[from];
4475 bi.begin = m->begin;
4476 bi.end = m->end;
4477 auto p = m->get_data().cbegin();
4478
4479 // take care to preserve ordering!
4480 bi.clear_objects();
4481 decode_noclear(bi.objects, p);
4482 dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
4483 << " bi.objects.size()=" << bi.objects.size() << dendl;
4484
4485 if (waiting_on_backfill.erase(from)) {
4486 if (waiting_on_backfill.empty()) {
4487 ceph_assert(
4488 peer_backfill_info.size() ==
4489 get_backfill_targets().size());
4490 finish_recovery_op(hobject_t::get_max());
4491 }
4492 } else {
4493 // we canceled backfill for a while due to a too-full condition, and this
4494 // is an extra response from a non-too-full peer
4495 dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4496 }
4497 }
4498 break;
4499 }
4500 }
4501
4502 void PrimaryLogPG::do_backfill(OpRequestRef op)
4503 {
4504 auto m = op->get_req<MOSDPGBackfill>();
4505 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4506 dout(10) << "do_backfill " << *m << dendl;
4507
4508 op->mark_started();
4509
4510 switch (m->op) {
4511 case MOSDPGBackfill::OP_BACKFILL_FINISH:
4512 {
4513 ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4514
4515 MOSDPGBackfill *reply = new MOSDPGBackfill(
4516 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4517 get_osdmap_epoch(),
4518 m->query_epoch,
4519 spg_t(info.pgid.pgid, get_primary().shard));
4520 reply->set_priority(get_recovery_op_priority());
4521 osd->send_message_osd_cluster(reply, m->get_connection());
4522 queue_peering_event(
4523 PGPeeringEventRef(
4524 std::make_shared<PGPeeringEvent>(
4525 get_osdmap_epoch(),
4526 get_osdmap_epoch(),
4527 RecoveryDone())));
4528 }
4529 // fall-thru
4530
4531 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4532 {
4533 ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4534
4535 ObjectStore::Transaction t;
4536 recovery_state.update_backfill_progress(
4537 m->last_backfill,
4538 m->stats,
4539 m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
4540 t);
4541
4542 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4543 ceph_assert(tr == 0);
4544 }
4545 break;
4546
4547 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4548 {
4549 ceph_assert(is_primary());
4550 ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4551 finish_recovery_op(hobject_t::get_max());
4552 }
4553 break;
4554 }
4555 }
4556
4557 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4558 {
4559 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4560 op->get_req());
4561 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4562 dout(7) << __func__ << " " << m->ls << dendl;
4563
4564 op->mark_started();
4565
4566 ObjectStore::Transaction t;
4567 for (auto& p : m->ls) {
4568 if (is_remote_backfilling()) {
4569 struct stat st;
4570 int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4571 pg_whoami.shard), &st);
4572 if (r == 0) {
4573 sub_local_num_bytes(st.st_size);
4574 int64_t usersize;
4575 if (pool.info.is_erasure()) {
4576 bufferlist bv;
4577 int r = osd->store->getattr(
4578 ch,
4579 ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4580 OI_ATTR,
4581 bv);
4582 if (r >= 0) {
4583 object_info_t oi(bv);
4584 usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4585 } else {
4586 dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4587 << " can't get object info" << dendl;
4588 usersize = 0;
4589 }
4590 } else {
4591 usersize = st.st_size;
4592 }
4593 sub_num_bytes(usersize);
4594 dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4595 << " sub actual data by " << st.st_size
4596 << " sub num_bytes by " << usersize
4597 << dendl;
4598 }
4599 }
4600 remove_snap_mapped_object(t, p.first);
4601 }
4602 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4603 ceph_assert(r == 0);
4604 }
4605
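/*
 * Build (but do not submit) the transaction that trims snap_to_trim from
 * the clone coid. new_snaps is the clone's snap set minus the snap being
 * trimmed and anything already in the OSDMap's removed_snaps_queue. If
 * new_snaps ends up empty, the clone is deleted and its clone_overlap is
 * intersected into the next-older clone; otherwise the clone's
 * object_info and clone_snaps are rewritten in place. The head object is
 * then updated with the new SnapSet, or removed outright when no clones
 * remain and it is only a whiteout. Returns -ENOENT on missing or corrupt
 * snap metadata (with a clog error), -ENOLCK when the snaptrimmer write
 * locks cannot be taken, and 0 with *ctxp holding the assembled OpContext
 * on success.
 */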
4606 int PrimaryLogPG::trim_object(
4607 bool first, const hobject_t &coid, snapid_t snap_to_trim,
4608 PrimaryLogPG::OpContextUPtr *ctxp)
4609 {
4610 *ctxp = NULL;
4611
4612 // load clone info
4613 bufferlist bl;
4614 ObjectContextRef obc = get_object_context(coid, false, NULL);
4615 if (!obc || !obc->ssc || !obc->ssc->exists) {
4616 osd->clog->error() << __func__ << ": Cannot trim " << coid
4617 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4618 return -ENOENT;
4619 }
4620
4621 hobject_t head_oid = coid.get_head();
4622 ObjectContextRef head_obc = get_object_context(head_oid, false);
4623 if (!head_obc) {
4624 osd->clog->error() << __func__ << ": Cannot trim " << coid
4625 << " repair needed, no snapset obc for " << head_oid;
4626 return -ENOENT;
4627 }
4628
4629 SnapSet& snapset = obc->ssc->snapset;
4630
4631 object_info_t &coi = obc->obs.oi;
4632 auto citer = snapset.clone_snaps.find(coid.snap);
4633 if (citer == snapset.clone_snaps.end()) {
4634 osd->clog->error() << "No clone_snaps in snapset " << snapset
4635 << " for object " << coid << "\n";
4636 return -ENOENT;
4637 }
4638 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4639 if (old_snaps.empty()) {
4640 osd->clog->error() << "No object info snaps for object " << coid;
4641 return -ENOENT;
4642 }
4643
4644 dout(10) << coid << " old_snaps " << old_snaps
4645 << " old snapset " << snapset << dendl;
4646 if (snapset.seq == 0) {
4647 osd->clog->error() << "No snapset.seq for object " << coid;
4648 return -ENOENT;
4649 }
4650
4651 set<snapid_t> new_snaps;
4652 const OSDMapRef& osdmap = get_osdmap();
4653 for (set<snapid_t>::iterator i = old_snaps.begin();
4654 i != old_snaps.end();
4655 ++i) {
4656 if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
4657 *i != snap_to_trim) {
4658 new_snaps.insert(*i);
4659 }
4660 }
4661
4662 vector<snapid_t>::iterator p = snapset.clones.end();
4663
4664 if (new_snaps.empty()) {
4665 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4666 if (p == snapset.clones.end()) {
4667 osd->clog->error() << "Snap " << coid.snap << " not in clones";
4668 return -ENOENT;
4669 }
4670 }
4671
4672 OpContextUPtr ctx = simple_opc_create(obc);
4673 ctx->head_obc = head_obc;
4674
4675 if (!ctx->lock_manager.get_snaptrimmer_write(
4676 coid,
4677 obc,
4678 first)) {
4679 close_op_ctx(ctx.release());
4680 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
4681 return -ENOLCK;
4682 }
4683
4684 if (!ctx->lock_manager.get_snaptrimmer_write(
4685 head_oid,
4686 head_obc,
4687 first)) {
4688 close_op_ctx(ctx.release());
4689 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4690 return -ENOLCK;
4691 }
4692
4693 ctx->at_version = get_next_version();
4694
4695 PGTransaction *t = ctx->op_t.get();
4696
4697 if (new_snaps.empty()) {
4698 // remove clone
4699 dout(10) << coid << " snaps " << old_snaps << " -> "
4700 << new_snaps << " ... deleting" << dendl;
4701
4702 // ...from snapset
4703 ceph_assert(p != snapset.clones.end());
4704
4705 snapid_t last = coid.snap;
4706 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4707
4708 if (p != snapset.clones.begin()) {
4709 // not the oldest... merge overlap into next older clone
4710 vector<snapid_t>::iterator n = p - 1;
4711 hobject_t prev_coid = coid;
4712 prev_coid.snap = *n;
4713 bool adjust_prev_bytes = is_present_clone(prev_coid);
4714
4715 if (adjust_prev_bytes)
4716 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4717
4718 snapset.clone_overlap[*n].intersection_of(
4719 snapset.clone_overlap[*p]);
4720
4721 if (adjust_prev_bytes)
4722 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4723 }
4724 ctx->delta_stats.num_objects--;
4725 if (coi.is_dirty())
4726 ctx->delta_stats.num_objects_dirty--;
4727 if (coi.is_omap())
4728 ctx->delta_stats.num_objects_omap--;
4729 if (coi.is_whiteout()) {
4730 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4731 ctx->delta_stats.num_whiteouts--;
4732 }
4733 ctx->delta_stats.num_object_clones--;
4734 if (coi.is_cache_pinned())
4735 ctx->delta_stats.num_objects_pinned--;
4736 if (coi.has_manifest()) {
4737 dec_all_refcount_manifest(coi, ctx.get());
4738 ctx->delta_stats.num_objects_manifest--;
4739 }
4740 obc->obs.exists = false;
4741
4742 snapset.clones.erase(p);
4743 snapset.clone_overlap.erase(last);
4744 snapset.clone_size.erase(last);
4745 snapset.clone_snaps.erase(last);
4746
4747 ctx->log.push_back(
4748 pg_log_entry_t(
4749 pg_log_entry_t::DELETE,
4750 coid,
4751 ctx->at_version,
4752 ctx->obs->oi.version,
4753 0,
4754 osd_reqid_t(),
4755 ctx->mtime,
4756 0)
4757 );
4758 t->remove(coid);
4759 t->update_snaps(
4760 coid,
4761 old_snaps,
4762 new_snaps);
4763
4764 coi = object_info_t(coid);
4765
4766 ctx->at_version.version++;
4767 } else {
4768 // save adjusted snaps for this object
4769 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4770 snapset.clone_snaps[coid.snap] =
4771 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4772 // we still do a 'modify' event on this object just to trigger a
4773 // snapmapper.update ... :(
4774
4775 coi.prior_version = coi.version;
4776 coi.version = ctx->at_version;
4777 bl.clear();
4778 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4779 t->setattr(coid, OI_ATTR, bl);
4780
4781 ctx->log.push_back(
4782 pg_log_entry_t(
4783 pg_log_entry_t::MODIFY,
4784 coid,
4785 coi.version,
4786 coi.prior_version,
4787 0,
4788 osd_reqid_t(),
4789 ctx->mtime,
4790 0)
4791 );
4792 ctx->at_version.version++;
4793
4794 t->update_snaps(
4795 coid,
4796 old_snaps,
4797 new_snaps);
4798 }
4799
4800 // save head snapset
4801 dout(10) << coid << " new snapset " << snapset << " on "
4802 << head_obc->obs.oi << dendl;
4803 if (snapset.clones.empty() &&
4804 (head_obc->obs.oi.is_whiteout() &&
4805 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4806 !head_obc->obs.oi.is_cache_pinned())) {
4807 // NOTE: this arguably constitutes minor interference with the
4808 // tiering agent if this is a cache tier since a snap trim event
4809 // is effectively evicting a whiteout we might otherwise want to
4810 // keep around.
4811 dout(10) << coid << " removing " << head_oid << dendl;
4812 ctx->log.push_back(
4813 pg_log_entry_t(
4814 pg_log_entry_t::DELETE,
4815 head_oid,
4816 ctx->at_version,
4817 head_obc->obs.oi.version,
4818 0,
4819 osd_reqid_t(),
4820 ctx->mtime,
4821 0)
4822 );
4823 dout(10) << "removing snap head" << dendl;
4824 object_info_t& oi = head_obc->obs.oi;
4825 ctx->delta_stats.num_objects--;
4826 if (oi.is_dirty()) {
4827 ctx->delta_stats.num_objects_dirty--;
4828 }
4829 if (oi.is_omap())
4830 ctx->delta_stats.num_objects_omap--;
4831 if (oi.is_whiteout()) {
4832 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4833 ctx->delta_stats.num_whiteouts--;
4834 }
4835 if (oi.is_cache_pinned()) {
4836 ctx->delta_stats.num_objects_pinned--;
4837 }
4838 if (oi.has_manifest()) {
4839 ctx->delta_stats.num_objects_manifest--;
4840 dec_all_refcount_manifest(oi, ctx.get());
4841 }
4842 head_obc->obs.exists = false;
4843 head_obc->obs.oi = object_info_t(head_oid);
4844 t->remove(head_oid);
4845 } else {
4846 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
4847 // filter SnapSet::snaps for the benefit of pre-octopus
4848 // peers. This is perhaps overly conservative in that I'm not
4849 // certain they need this, but let's be conservative here.
4850 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4851 snapset.filter(pool.info);
4852 } else {
4853 snapset.snaps.clear();
4854 }
4855 dout(10) << coid << " writing updated snapset on " << head_oid
4856 << ", snapset is " << snapset << dendl;
4857 ctx->log.push_back(
4858 pg_log_entry_t(
4859 pg_log_entry_t::MODIFY,
4860 head_oid,
4861 ctx->at_version,
4862 head_obc->obs.oi.version,
4863 0,
4864 osd_reqid_t(),
4865 ctx->mtime,
4866 0)
4867 );
4868
4869 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4870 head_obc->obs.oi.version = ctx->at_version;
4871
4872 map <string, bufferlist, less<>> attrs;
4873 bl.clear();
4874 encode(snapset, bl);
4875 attrs[SS_ATTR] = std::move(bl);
4876
4877 bl.clear();
4878 encode(head_obc->obs.oi, bl,
4879 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4880 attrs[OI_ATTR] = std::move(bl);
4881 t->setattrs(head_oid, attrs);
4882 }
4883
4884 *ctxp = std::move(ctx);
4885 return 0;
4886 }
4887
4888 void PrimaryLogPG::kick_snap_trim()
4889 {
4890 ceph_assert(is_active());
4891 ceph_assert(is_primary());
4892 if (is_clean() &&
4893 !state_test(PG_STATE_PREMERGE) &&
4894 !snap_trimq.empty()) {
4895 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4896 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4897 } else {
4898 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4899 snap_trimmer_machine.process_event(KickTrim());
4900 }
4901 }
4902 }
4903
4904 void PrimaryLogPG::snap_trimmer_scrub_complete()
4905 {
4906 if (is_primary() && is_active() && is_clean()) {
4907 ceph_assert(!snap_trimq.empty());
4908 snap_trimmer_machine.process_event(ScrubComplete());
4909 }
4910 }
4911
4912 void PrimaryLogPG::snap_trimmer(epoch_t queued)
4913 {
4914 if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
4915 return;
4916 }
4917
4918 ceph_assert(is_primary());
4919
4920 dout(10) << "snap_trimmer posting" << dendl;
4921 snap_trimmer_machine.process_event(DoSnapWork());
4922 dout(10) << "snap_trimmer complete" << dendl;
4923 return;
4924 }
4925
4926 namespace {
4927
4928 template<typename U, typename V>
4929 int do_cmp_xattr(int op, const U& lhs, const V& rhs)
4930 {
4931 switch (op) {
4932 case CEPH_OSD_CMPXATTR_OP_EQ:
4933 return lhs == rhs;
4934 case CEPH_OSD_CMPXATTR_OP_NE:
4935 return lhs != rhs;
4936 case CEPH_OSD_CMPXATTR_OP_GT:
4937 return lhs > rhs;
4938 case CEPH_OSD_CMPXATTR_OP_GTE:
4939 return lhs >= rhs;
4940 case CEPH_OSD_CMPXATTR_OP_LT:
4941 return lhs < rhs;
4942 case CEPH_OSD_CMPXATTR_OP_LTE:
4943 return lhs <= rhs;
4944 default:
4945 return -EINVAL;
4946 }
4947 }
4948
4949 } // anonymous namespace
4950
4951 int PrimaryLogPG::do_xattr_cmp_u64(int op, uint64_t v1, bufferlist& xattr)
4952 {
4953 uint64_t v2;
4954
4955 if (xattr.length()) {
4956 const char* first = xattr.c_str();
4957 if (auto [p, ec] = std::from_chars(first, first + xattr.length(), v2);
4958 ec != std::errc()) {
4959 return -EINVAL;
4960 }
4961 } else {
4962 v2 = 0;
4963 }
4964 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4965 return do_cmp_xattr(op, v1, v2);
4966 }
4967
4968 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4969 {
4970 string_view v2s(xattr.c_str(), xattr.length());
4971 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4972 return do_cmp_xattr(op, v1s, v2s);
4973 }
4974
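/*
 * Expand a WRITESAME into a plain CEPH_OSD_OP_WRITE: the pattern in
 * osd_op.indata (which must be exactly data_length bytes, with the total
 * length a multiple of it) is repeated until it covers the requested
 * extent, then handed to do_osd_ops(). For example, length = 6 with the
 * 2-byte pattern "ab" becomes a single 6-byte write of "ababab"; a
 * zero-length request is a no-op.
 */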
4975 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4976 {
4977 ceph_osd_op& op = osd_op.op;
4978 vector<OSDOp> write_ops(1);
4979 OSDOp& write_op = write_ops[0];
4980 uint64_t write_length = op.writesame.length;
4981 int result = 0;
4982
4983 if (!write_length)
4984 return 0;
4985
4986 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4987 return -EINVAL;
4988
4989 if (op.writesame.data_length != osd_op.indata.length()) {
4990 derr << "invalid writesame: data_length " << op.writesame.data_length << " != indata length " << osd_op.indata.length() << dendl;
4991 return -EINVAL;
4992 }
4993
4994 while (write_length) {
4995 write_op.indata.append(osd_op.indata);
4996 write_length -= op.writesame.data_length;
4997 }
4998
4999 write_op.op.op = CEPH_OSD_OP_WRITE;
5000 write_op.op.extent.offset = op.writesame.offset;
5001 write_op.op.extent.length = op.writesame.length;
5002 result = do_osd_ops(ctx, write_ops);
5003 if (result < 0)
5004 derr << "do_writesame do_osd_ops failed " << result << dendl;
5005
5006 return result;
5007 }
5008
5009 // ========================================================================
5010 // low level osd ops
5011
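/*
 * Convert a legacy TMAP object to OMAP in place: read the tmap header and
 * key/value payload with _get_tmap(), then replay them as TRUNCATE(0) +
 * OMAPSETHEADER + OMAPSETVALS through do_osd_ops(). With
 * CEPH_OSD_TMAP2OMAP_NULLOK, a missing tmap (-ENODATA) is treated as
 * success.
 */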
5012 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
5013 {
5014 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
5015 bufferlist header, vals;
5016 int r = _get_tmap(ctx, &header, &vals);
5017 if (r < 0) {
5018 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
5019 r = 0;
5020 return r;
5021 }
5022
5023 vector<OSDOp> ops(3);
5024
5025 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
5026 ops[0].op.extent.offset = 0;
5027 ops[0].op.extent.length = 0;
5028
5029 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
5030 ops[1].indata = std::move(header);
5031
5032 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
5033 ops[2].indata = std::move(vals);
5034
5035 return do_osd_ops(ctx, ops);
5036 }
5037
5038 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
5039 OSDOp& osd_op, bufferlist& bl)
5040 {
5041 // decode
5042 bufferlist header;
5043 map<string, bufferlist> m;
5044 if (bl.length()) {
5045 auto p = bl.cbegin();
5046 decode(header, p);
5047 decode(m, p);
5048 ceph_assert(p.end());
5049 }
5050
5051 // do the update(s)
5052 while (!bp.end()) {
5053 __u8 op;
5054 string key;
5055 decode(op, bp);
5056
5057 switch (op) {
5058 case CEPH_OSD_TMAP_SET: // insert key
5059 {
5060 decode(key, bp);
5061 bufferlist data;
5062 decode(data, bp);
5063 m[key] = data;
5064 }
5065 break;
5066 case CEPH_OSD_TMAP_RM: // remove key
5067 decode(key, bp);
5068 if (!m.count(key)) {
5069 return -ENOENT;
5070 }
5071 m.erase(key);
5072 break;
5073 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
5074 decode(key, bp);
5075 m.erase(key);
5076 break;
5077 case CEPH_OSD_TMAP_HDR: // update header
5078 {
5079 decode(header, bp);
5080 }
5081 break;
5082 default:
5083 return -EINVAL;
5084 }
5085 }
5086
5087 // reencode
5088 bufferlist obl;
5089 encode(header, obl);
5090 encode(m, obl);
5091
5092 // write it out
5093 vector<OSDOp> nops(1);
5094 OSDOp& newop = nops[0];
5095 newop.op.op = CEPH_OSD_OP_WRITEFULL;
5096 newop.op.extent.offset = 0;
5097 newop.op.extent.length = obl.length();
5098 newop.indata = obl;
5099 do_osd_ops(ctx, nops);
5100 return 0;
5101 }
5102
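/*
 * Apply a TMAPUP update stream in a single merge pass: the existing
 * (sorted) tmap is read in full, then walked in lockstep with the update
 * ops, copying untouched keys through and applying SET/CREATE/RM/RMSLOPPY
 * as each key is reached. If the update keys turn out not to be sorted,
 * we rewind bp and fall back to do_tmapup_slow(), which decodes the whole
 * map, mutates it, and re-encodes. The result is written back with a
 * WRITEFULL.
 */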
5103 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
5104 {
5105 bufferlist::const_iterator orig_bp = bp;
5106 int result = 0;
5107 if (bp.end()) {
5108 dout(10) << "tmapup is a no-op" << dendl;
5109 } else {
5110 // read the whole object
5111 vector<OSDOp> nops(1);
5112 OSDOp& newop = nops[0];
5113 newop.op.op = CEPH_OSD_OP_READ;
5114 newop.op.extent.offset = 0;
5115 newop.op.extent.length = 0;
5116 result = do_osd_ops(ctx, nops);
5117
5118 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
5119
5120 dout(30) << " starting is \n";
5121 newop.outdata.hexdump(*_dout);
5122 *_dout << dendl;
5123
5124 auto ip = newop.outdata.cbegin();
5125 bufferlist obl;
5126
5127 dout(30) << "the update command is: \n";
5128 osd_op.indata.hexdump(*_dout);
5129 *_dout << dendl;
5130
5131 // header
5132 bufferlist header;
5133 __u32 nkeys = 0;
5134 if (newop.outdata.length()) {
5135 decode(header, ip);
5136 decode(nkeys, ip);
5137 }
5138 dout(10) << "tmapup header " << header.length() << dendl;
5139
5140 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
5141 ++bp;
5142 decode(header, bp);
5143 dout(10) << "tmapup new header " << header.length() << dendl;
5144 }
5145
5146 encode(header, obl);
5147
5148 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
5149
5150 // update keys
5151 bufferlist newkeydata;
5152 string nextkey, last_in_key;
5153 bufferlist nextval;
5154 bool have_next = false;
5155 if (!ip.end()) {
5156 have_next = true;
5157 decode(nextkey, ip);
5158 decode(nextval, ip);
5159 }
5160 while (!bp.end() && !result) {
5161 __u8 op;
5162 string key;
5163 try {
5164 decode(op, bp);
5165 decode(key, bp);
5166 }
5167 catch (ceph::buffer::error& e) {
5168 return -EINVAL;
5169 }
5170 if (key < last_in_key) {
5171 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
5172 << "', falling back to an inefficient (unsorted) update" << dendl;
5173 bp = orig_bp;
5174 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
5175 }
5176 last_in_key = key;
5177
5178 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
5179
5180 // skip existing intervening keys
5181 bool key_exists = false;
5182 while (have_next && !key_exists) {
5183 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
5184 if (nextkey > key)
5185 break;
5186 if (nextkey < key) {
5187 // copy untouched.
5188 encode(nextkey, newkeydata);
5189 encode(nextval, newkeydata);
5190 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5191 } else {
5192 // don't copy; discard old value. and stop.
5193 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
5194 key_exists = true;
5195 nkeys--;
5196 }
5197 if (!ip.end()) {
5198 decode(nextkey, ip);
5199 decode(nextval, ip);
5200 } else {
5201 have_next = false;
5202 }
5203 }
5204
5205 if (op == CEPH_OSD_TMAP_SET) {
5206 bufferlist val;
5207 try {
5208 decode(val, bp);
5209 }
5210 catch (ceph::buffer::error& e) {
5211 return -EINVAL;
5212 }
5213 encode(key, newkeydata);
5214 encode(val, newkeydata);
5215 dout(20) << " set " << key << " " << val.length() << dendl;
5216 nkeys++;
5217 } else if (op == CEPH_OSD_TMAP_CREATE) {
5218 if (key_exists) {
5219 return -EEXIST;
5220 }
5221 bufferlist val;
5222 try {
5223 decode(val, bp);
5224 }
5225 catch (ceph::buffer::error& e) {
5226 return -EINVAL;
5227 }
5228 encode(key, newkeydata);
5229 encode(val, newkeydata);
5230 dout(20) << " create " << key << " " << val.length() << dendl;
5231 nkeys++;
5232 } else if (op == CEPH_OSD_TMAP_RM) {
5233 // do nothing.
5234 if (!key_exists) {
5235 return -ENOENT;
5236 }
5237 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
5238 // do nothing
5239 } else {
5240 dout(10) << " invalid tmap op " << (int)op << dendl;
5241 return -EINVAL;
5242 }
5243 }
5244
5245 // copy remaining
5246 if (have_next) {
5247 encode(nextkey, newkeydata);
5248 encode(nextval, newkeydata);
5249 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5250 }
5251 if (!ip.end()) {
5252 bufferlist rest;
5253 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
5254 dout(20) << " keep trailing " << rest.length()
5255 << " at " << newkeydata.length() << dendl;
5256 newkeydata.claim_append(rest);
5257 }
5258
5259 // encode final key count + key data
5260 dout(20) << "tmapup final nkeys " << nkeys << dendl;
5261 encode(nkeys, obl);
5262 obl.claim_append(newkeydata);
5263
5264 if (0) {
5265 dout(30) << " final is \n";
5266 obl.hexdump(*_dout);
5267 *_dout << dendl;
5268
5269 // sanity check
5270 auto tp = obl.cbegin();
5271 bufferlist h;
5272 decode(h, tp);
5273 map<string,bufferlist> d;
5274 decode(d, tp);
5275 ceph_assert(tp.end());
5276 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
5277 }
5278
5279 // write it out
5280 if (!result) {
5281 dout(20) << "tmapput write " << obl.length() << dendl;
5282 newop.op.op = CEPH_OSD_OP_WRITEFULL;
5283 newop.op.extent.offset = 0;
5284 newop.op.extent.length = obl.length();
5285 newop.indata = obl;
5286 do_osd_ops(ctx, nops);
5287 }
5288 }
5289 return result;
5290 }
5291
5292 static int check_offset_and_length(uint64_t offset, uint64_t length,
5293 uint64_t max, DoutPrefixProvider *dpp)
5294 {
5295 if (offset >= max ||
5296 length > max ||
5297 offset + length > max) {
5298 ldpp_dout(dpp, 10) << __func__ << " "
5299 << "osd_max_object_size: " << max
5300 << "; Hard limit of object size is 4GB." << dendl;
5301 return -EFBIG;
5302 }
5303
5304 return 0;
5305 }
5306
5307 struct FillInVerifyExtent : public Context {
5308 ceph_le64 *r;
5309 int32_t *rval;
5310 bufferlist *outdatap;
5311 std::optional<uint32_t> maybe_crc;
5312 uint64_t size;
5313 OSDService *osd;
5314 hobject_t soid;
5315 uint32_t flags;
5316 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5317 std::optional<uint32_t> mc, uint64_t size,
5318 OSDService *osd, hobject_t soid, uint32_t flags) :
5319 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5320 size(size), osd(osd), soid(soid), flags(flags) {}
5321 void finish(int len) override {
5322 if (len < 0) {
5323 *rval = len;
5324 return;
5325 }
5326 *r = len;
5327 *rval = 0;
5328
5329 // whole object? can we verify the checksum?
5330 if (maybe_crc && *r == size) {
5331 uint32_t crc = outdatap->crc32c(-1);
5332 if (maybe_crc != crc) {
5333 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5334 << " != expected 0x" << *maybe_crc
5335 << std::dec << " on " << soid;
5336 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5337 *rval = -EIO;
5338 *r = 0;
5339 }
5340 }
5341 }
5342 }
5343 };
5344
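/*
 * Completion adapter that rewraps a plain contiguous read in the
 * sparse-read wire format: on success the payload becomes a single-extent
 * map {data_offset: length} followed by the data itself.
 */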
5345 struct ToSparseReadResult : public Context {
5346 int* result;
5347 bufferlist* data_bl;
5348 uint64_t data_offset;
5349 ceph_le64* len;
5350 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5351 ceph_le64* len)
5352 : result(result), data_bl(bl), data_offset(offset),len(len) {}
5353 void finish(int r) override {
5354 if (r < 0) {
5355 *result = r;
5356 return;
5357 }
5358 *result = 0;
5359 *len = r;
5360 bufferlist outdata;
5361 map<uint64_t, uint64_t> extents = {{data_offset, r}};
5362 encode(extents, outdata);
5363 encode_destructively(*data_bl, outdata);
5364 data_bl->swap(outdata);
5365 }
5366 };
5367
5368 template<typename V>
5369 static string list_keys(const map<string, V>& m) {
5370 string s;
5371 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5372 if (!s.empty()) {
5373 s.push_back(',');
5374 }
5375 s.append(itr->first);
5376 }
5377 return s;
5378 }
5379
5380 template<typename T>
5381 static string list_entries(const T& m) {
5382 string s;
5383 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5384 if (!s.empty()) {
5385 s.push_back(',');
5386 }
5387 s.append(*itr);
5388 }
5389 return s;
5390 }
5391
5392 void PrimaryLogPG::maybe_create_new_object(
5393 OpContext *ctx,
5394 bool ignore_transaction)
5395 {
5396 ObjectState& obs = ctx->new_obs;
5397 if (!obs.exists) {
5398 ctx->delta_stats.num_objects++;
5399 obs.exists = true;
5400 ceph_assert(!obs.oi.is_whiteout());
5401 obs.oi.new_object();
5402 if (!ignore_transaction)
5403 ctx->op_t->create(obs.oi.soid);
5404 } else if (obs.oi.is_whiteout()) {
5405 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5406 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5407 --ctx->delta_stats.num_whiteouts;
5408 }
5409 }
5410
5411 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5412 OSDOp& osd_op;
5413
5414 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5415 }
5416
5417 int execute() override {
5418 return osd_op.rval;
5419 }
5420 };
5421
5422 struct C_ChecksumRead : public Context {
5423 PrimaryLogPG *primary_log_pg;
5424 OSDOp &osd_op;
5425 Checksummer::CSumType csum_type;
5426 bufferlist init_value_bl;
5427 ceph_le64 read_length;
5428 bufferlist read_bl;
5429 Context *fill_extent_ctx;
5430
5431 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5432 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5433 std::optional<uint32_t> maybe_crc, uint64_t size,
5434 OSDService *osd, hobject_t soid, uint32_t flags)
5435 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5436 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5437 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5438 &read_bl, maybe_crc, size,
5439 osd, soid, flags)) {
5440 }
5441 ~C_ChecksumRead() override {
5442 delete fill_extent_ctx;
5443 }
5444
5445 void finish(int r) override {
5446 fill_extent_ctx->complete(r);
5447 fill_extent_ctx = nullptr;
5448
5449 if (osd_op.rval >= 0) {
5450 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5451 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5452 &init_value_bl_it, read_bl);
5453 }
5454 }
5455 };
5456
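/*
 * CHECKSUM handler: validate the chunk_size/length combination, default a
 * zeroed offset+length to "whole object", and trim the range to the
 * object size (the trimming branch manipulates op.extent, which overlays
 * the same offset/length fields as op.checksum in the ceph_osd_op union).
 * On erasure-coded pools the read is queued asynchronously and
 * finish_checksum() runs from C_ChecksumRead, returning -EINPROGRESS
 * here; replicated pools do a synchronous inline read instead. When the
 * read covers the entire object and a data digest is recorded, the digest
 * is passed down so FillInVerifyExtent can verify the data read.
 */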
5457 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
5458 bufferlist::const_iterator *bl_it)
5459 {
5460 dout(20) << __func__ << dendl;
5461
5462 auto& op = osd_op.op;
5463 if (op.checksum.chunk_size > 0) {
5464 if (op.checksum.length == 0) {
5465 dout(10) << __func__ << ": length required when chunk size provided"
5466 << dendl;
5467 return -EINVAL;
5468 }
5469 if (op.checksum.length % op.checksum.chunk_size != 0) {
5470 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5471 return -EINVAL;
5472 }
5473 }
5474
5475 auto& oi = ctx->new_obs.oi;
5476 if (op.checksum.offset == 0 && op.checksum.length == 0) {
5477 // zeroed offset+length implies checksum whole object
5478 op.checksum.length = oi.size;
5479 } else if (op.checksum.offset >= oi.size) {
5480 // read size was trimmed to zero, do nothing
5481 // see PrimaryLogPG::do_read
5482 return 0;
5483 } else if (op.extent.offset + op.extent.length > oi.size) {
5484 op.extent.length = oi.size - op.extent.offset;
5485 if (op.checksum.chunk_size > 0 &&
5486 op.checksum.length % op.checksum.chunk_size != 0) {
5487 dout(10) << __func__ << ": length (trimmed to 0x"
5488 << std::hex << op.checksum.length
5489 << ") not aligned to chunk size 0x"
5490 << op.checksum.chunk_size << std::dec
5491 << dendl;
5492 return -EINVAL;
5493 }
5494 }
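// Worked example of the trimming above: for an object of size 10, a request
// with offset=4 and length=16 is trimmed to length=6; if chunk_size=4 was
// also supplied, 6 % 4 != 0 and the trimmed request fails with -EINVAL.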
5495
5496 Checksummer::CSumType csum_type;
5497 switch (op.checksum.type) {
5498 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5499 csum_type = Checksummer::CSUM_XXHASH32;
5500 break;
5501 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5502 csum_type = Checksummer::CSUM_XXHASH64;
5503 break;
5504 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5505 csum_type = Checksummer::CSUM_CRC32C;
5506 break;
5507 default:
5508 dout(10) << __func__ << ": unknown crc type ("
5509 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5510 return -EINVAL;
5511 }
5512
5513 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5514 if (bl_it->get_remaining() < csum_init_value_size) {
5515 dout(10) << __func__ << ": init value not provided" << dendl;
5516 return -EINVAL;
5517 }
5518
5519 bufferlist init_value_bl;
5520 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5521 csum_init_value_size);
5522 *bl_it += csum_init_value_size;
5523
5524 if (pool.info.is_erasure() && op.checksum.length > 0) {
5525 // If there is a data digest and it is possible we are reading the
5526 // entire object, pass the digest.
5527 std::optional<uint32_t> maybe_crc;
5528 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5529 op.checksum.length >= oi.size) {
5530 maybe_crc = oi.data_digest;
5531 }
5532
5533 // async read
5534 auto& soid = oi.soid;
5535 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5536 std::move(init_value_bl), maybe_crc,
5537 oi.size, osd, soid, op.flags);
5538
5539 ctx->pending_async_reads.push_back({
5540 {op.checksum.offset, op.checksum.length, op.flags},
5541 {&checksum_ctx->read_bl, checksum_ctx}});
5542
5543 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5544 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5545 new ReadFinisher(osd_op));
5546 return -EINPROGRESS;
5547 }
5548
5549 // sync read
5550 std::vector<OSDOp> read_ops(1);
5551 auto& read_op = read_ops[0];
5552 if (op.checksum.length > 0) {
5553 read_op.op.op = CEPH_OSD_OP_READ;
5554 read_op.op.flags = op.flags;
5555 read_op.op.extent.offset = op.checksum.offset;
5556 read_op.op.extent.length = op.checksum.length;
5557 read_op.op.extent.truncate_size = 0;
5558 read_op.op.extent.truncate_seq = 0;
5559
5560 int r = do_osd_ops(ctx, read_ops);
5561 if (r < 0) {
5562 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5563 return r;
5564 }
5565 }
5566
5567 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5568 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5569 read_op.outdata);
5570 }
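// A rough client-side sketch of driving this op through the public librados
// C++ API (ioctx and oid are hypothetical; offset=length=0 requests a
// whole-object checksum, per the handling above):
//
//   librados::ObjectReadOperation rop;
//   ceph::bufferlist init_bl, csum_bl;
//   int rval = 0;
//   ceph::encode(uint32_t(-1), init_bl);              // CRC32C init value
//   rop.checksum(LIBRADOS_CHECKSUM_TYPE_CRC32C, init_bl,
//                0, 0, 0, &csum_bl, &rval);
//   ioctx.operate(oid, &rop, nullptr);
//
// With chunk_size == 0 the reply carries a single checksum; a non-zero
// chunk_size yields one checksum per chunk (see finish_checksum() below).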
5571
5572 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5573 Checksummer::CSumType csum_type,
5574 bufferlist::const_iterator *init_value_bl_it,
5575 const bufferlist &read_bl) {
5576 dout(20) << __func__ << dendl;
5577
5578 auto& op = osd_op.op;
5579
5580 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5581 derr << __func__ << ": bytes read " << read_bl.length() << " != "
5582 << op.checksum.length << dendl;
5583 return -EINVAL;
5584 }
5585
5586 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5587 op.checksum.chunk_size : read_bl.length());
5588 uint32_t csum_count = (csum_chunk_size > 0 ?
5589 read_bl.length() / csum_chunk_size : 0);
5590
5591 bufferlist csum;
5592 bufferptr csum_data;
5593 if (csum_count > 0) {
5594 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
5595 csum_data = ceph::buffer::create(csum_value_size * csum_count);
5596 csum_data.zero();
5597 csum.append(csum_data);
5598
5599 switch (csum_type) {
5600 case Checksummer::CSUM_XXHASH32:
5601 {
5602 Checksummer::xxhash32::init_value_t init_value;
5603 decode(init_value, *init_value_bl_it);
5604 Checksummer::calculate<Checksummer::xxhash32>(
5605 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5606 &csum_data);
5607 }
5608 break;
5609 case Checksummer::CSUM_XXHASH64:
5610 {
5611 Checksummer::xxhash64::init_value_t init_value;
5612 decode(init_value, *init_value_bl_it);
5613 Checksummer::calculate<Checksummer::xxhash64>(
5614 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5615 &csum_data);
5616 }
5617 break;
5618 case Checksummer::CSUM_CRC32C:
5619 {
5620 Checksummer::crc32c::init_value_t init_value;
5621 decode(init_value, *init_value_bl_it);
5622 Checksummer::calculate<Checksummer::crc32c>(
5623 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5624 &csum_data);
5625 }
5626 break;
5627 default:
5628 break;
5629 }
5630 }
5631
5632 encode(csum_count, osd_op.outdata);
5633 osd_op.outdata.claim_append(csum);
5634 return 0;
5635 }
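// The reply payload built above is a uint32_t count followed by `count`
// fixed-size checksum values. A client-side decode sketch (assuming CRC32C,
// i.e. 4-byte values):
//
//   auto it = csum_bl.cbegin();
//   uint32_t count;
//   ceph::decode(count, it);
//   std::vector<uint32_t> csums(count);
//   for (auto& c : csums) {
//     ceph::decode(c, it);
//   }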
5636
5637 struct C_ExtentCmpRead : public Context {
5638 PrimaryLogPG *primary_log_pg;
5639 OSDOp &osd_op;
5640 ceph_le64 read_length{};
5641 bufferlist read_bl;
5642 Context *fill_extent_ctx;
5643
5644 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5645 std::optional<uint32_t> maybe_crc, uint64_t size,
5646 OSDService *osd, hobject_t soid, uint32_t flags)
5647 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5648 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5649 &read_bl, maybe_crc, size,
5650 osd, soid, flags)) {
5651 }
5652 ~C_ExtentCmpRead() override {
5653 delete fill_extent_ctx;
5654 }
5655
5656 void finish(int r) override {
5657 if (r == -ENOENT) {
5658 osd_op.rval = 0;
5659 read_bl.clear();
5660 delete fill_extent_ctx;
5661 } else {
5662 fill_extent_ctx->complete(r);
5663 }
5664 fill_extent_ctx = nullptr;
5665
5666 if (osd_op.rval >= 0) {
5667 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5668 }
5669 }
5670 };
5671
5672 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5673 {
5674 dout(20) << __func__ << dendl;
5675 ceph_osd_op& op = osd_op.op;
5676
5677 auto& oi = ctx->new_obs.oi;
5678 uint64_t size = oi.size;
5679 if ((oi.truncate_seq < op.extent.truncate_seq) &&
5680 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5681 size = op.extent.truncate_size;
5682 }
5683
5684 if (op.extent.offset >= size) {
5685 op.extent.length = 0;
5686 } else if (op.extent.offset + op.extent.length > size) {
5687 op.extent.length = size - op.extent.offset;
5688 }
5689
5690 if (op.extent.length == 0) {
5691 dout(20) << __func__ << " zero length extent" << dendl;
5692 return finish_extent_cmp(osd_op, bufferlist{});
5693 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
5694 dout(20) << __func__ << " object DNE" << dendl;
5695 return finish_extent_cmp(osd_op, {});
5696 } else if (pool.info.is_erasure()) {
5697 // If there is a data digest and it is possible we are reading the
5698 // entire object, pass the digest.
5699 std::optional<uint32_t> maybe_crc;
5700 if (oi.is_data_digest() && op.extent.offset == 0 &&
5701 op.extent.length >= oi.size) {
5702 maybe_crc = oi.data_digest;
5703 }
5704
5705 // async read
5706 auto& soid = oi.soid;
5707 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5708 osd, soid, op.flags);
5709 ctx->pending_async_reads.push_back({
5710 {op.extent.offset, op.extent.length, op.flags},
5711 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5712
5713 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5714
5715 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5716 new ReadFinisher(osd_op));
5717 return -EINPROGRESS;
5718 }
5719
5720 // sync read
5721 vector<OSDOp> read_ops(1);
5722 OSDOp& read_op = read_ops[0];
5723
5724 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5725 read_op.op.extent.offset = op.extent.offset;
5726 read_op.op.extent.length = op.extent.length;
5727 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5728 read_op.op.extent.truncate_size = op.extent.truncate_size;
5729
5730 int result = do_osd_ops(ctx, read_ops);
5731 if (result < 0) {
5732 derr << __func__ << " failed " << result << dendl;
5733 return result;
5734 }
5735 return finish_extent_cmp(osd_op, read_op.outdata);
5736 }
5737
5738 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5739 {
5740 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5741 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5742 if (osd_op.indata[idx] != read_byte) {
5743 return (-MAX_ERRNO - idx);
5744 }
5745 }
5746
5747 return 0;
5748 }
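// On a mismatch the negative return value encodes the offset of the first
// differing byte: rval = -MAX_ERRNO - idx. A caller can recover the offset
// with
//
//   uint64_t mismatch_off = static_cast<uint64_t>(-rval) - MAX_ERRNO;
//
// e.g. comparing "abc" against on-disk "abd" yields rval = -MAX_ERRNO - 2.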
5749
5750 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5751 dout(20) << __func__ << dendl;
5752 auto& op = osd_op.op;
5753 auto& oi = ctx->new_obs.oi;
5754 auto& soid = oi.soid;
5755 __u32 seq = oi.truncate_seq;
5756 uint64_t size = oi.size;
5757 bool trimmed_read = false;
5758
5759 dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5760 dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5761 dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5762 dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5763
5764 // are we beyond truncate_size?
5765 if ( (seq < op.extent.truncate_seq) &&
5766 (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5767 (size > op.extent.truncate_size) )
5768 size = op.extent.truncate_size;
5769
5770 if (op.extent.length == 0) // a length of zero means read the whole object
5771 op.extent.length = size;
5772
5773 if (op.extent.offset >= size) {
5774 op.extent.length = 0;
5775 trimmed_read = true;
5776 } else if (op.extent.offset + op.extent.length > size) {
5777 op.extent.length = size - op.extent.offset;
5778 trimmed_read = true;
5779 }
5780
5781 dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
5782
5783 // read into a buffer
5784 int result = 0;
5785 if (trimmed_read && op.extent.length == 0) {
5786 // the read was trimmed to zero bytes and is expected to do nothing.
5787 // an explicit read of 0 bytes does *not* do nothing, which is why
5788 // the trimmed_read flag is needed
5789 } else if (pool.info.is_erasure()) {
5790 // The initialisation below is required to silence a false positive
5791 // -Wmaybe-uninitialized warning
5792 std::optional<uint32_t> maybe_crc;
5793 // If there is a data digest and it is possible we are reading the
5794 // entire object, pass the digest. FillInVerifyExtent will
5795 // check the oi.size again.
5796 if (oi.is_data_digest() && op.extent.offset == 0 &&
5797 op.extent.length >= oi.size)
5798 maybe_crc = oi.data_digest;
5799 ctx->pending_async_reads.push_back(
5800 make_pair(
5801 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5802 make_pair(&osd_op.outdata,
5803 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5804 &osd_op.outdata, maybe_crc, oi.size,
5805 osd, soid, op.flags))));
5806 dout(10) << " async_read noted for " << soid << dendl;
5807
5808 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5809 new ReadFinisher(osd_op));
5810 } else {
5811 int r = pgbackend->objects_read_sync(
5812 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5813 // whole object? can we verify the checksum?
5814 if (r >= 0 && op.extent.offset == 0 &&
5815 (uint64_t)r == oi.size && oi.is_data_digest()) {
5816 uint32_t crc = osd_op.outdata.crc32c(-1);
5817 if (oi.data_digest != crc) {
5818 osd->clog->error() << info.pgid << std::hex
5819 << " full-object read crc 0x" << crc
5820 << " != expected 0x" << oi.data_digest
5821 << std::dec << " on " << soid;
5822 r = -EIO; // try repair later
5823 }
5824 }
5825 if (r == -EIO) {
5826 r = rep_repair_primary_object(soid, ctx);
5827 }
5828 if (r >= 0)
5829 op.extent.length = r;
5830 else if (r == -EAGAIN) {
5831 result = -EAGAIN;
5832 } else {
5833 result = r;
5834 op.extent.length = 0;
5835 }
5836 dout(10) << " read got " << r << " / " << op.extent.length
5837 << " bytes from obj " << soid << dendl;
5838 }
5839 if (result >= 0) {
5840 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5841 ctx->delta_stats.num_rd++;
5842 }
5843 return result;
5844 }
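// Trimming examples for the logic above: with oi.size = 100, a read of
// offset=80 length=50 is trimmed to length=20; a read at offset >= 100 is
// trimmed to zero and returns success with no data. Note the full-object
// fast path: when the whole object is returned from offset 0, the stored
// data digest (if any) is verified via outdata.crc32c(-1) and a mismatch
// triggers read repair.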
5845
5846 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
5847 dout(20) << __func__ << dendl;
5848 auto& op = osd_op.op;
5849 auto& oi = ctx->new_obs.oi;
5850 auto& soid = oi.soid;
5851
5852 if (op.extent.truncate_seq) {
5853 dout(0) << "sparse_read does not support truncation sequence " << dendl;
5854 return -EINVAL;
5855 }
5856
5857 ++ctx->num_read;
5858 if (pool.info.is_erasure()) {
5859 // translate sparse read to a normal one if not supported
5860 uint64_t offset = op.extent.offset;
5861 uint64_t length = op.extent.length;
5862 if (offset > oi.size) {
5863 length = 0;
5864 } else if (offset + length > oi.size) {
5865 length = oi.size - offset;
5866 }
5867
5868 if (length > 0) {
5869 ctx->pending_async_reads.push_back(
5870 make_pair(
5871 boost::make_tuple(offset, length, op.flags),
5872 make_pair(
5873 &osd_op.outdata,
5874 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
5875 &op.extent.length))));
5876 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
5877
5878 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5879 new ReadFinisher(osd_op));
5880 } else {
5881 dout(10) << " sparse read ended up empty for " << soid << dendl;
5882 map<uint64_t, uint64_t> extents;
5883 encode(extents, osd_op.outdata);
5884 }
5885 } else {
5886 // read into a buffer
5887 map<uint64_t, uint64_t> m;
5888 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5889 info.pgid.shard),
5890 op.extent.offset, op.extent.length, m);
5891 if (r < 0) {
5892 return r;
5893 }
5894
5895 bufferlist data_bl;
5896 r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
5897 if (r == -EIO) {
5898 r = rep_repair_primary_object(soid, ctx);
5899 }
5900 if (r < 0) {
5901 return r;
5902 }
5903
5904 // Why does SPARSE_READ need a checksum? In practice librbd always uses
5905 // sparse-read. At first there may not be many whole objects, but with
5906 // continued use more and more objects become whole, so from that point
5907 // verifying the digest on full-object sparse reads makes sense.
5908 if ((uint64_t)r == oi.size && oi.is_data_digest()) {
5909 uint32_t crc = data_bl.crc32c(-1);
5910 if (oi.data_digest != crc) {
5911 osd->clog->error() << info.pgid << std::hex
5912 << " full-object read crc 0x" << crc
5913 << " != expected 0x" << oi.data_digest
5914 << std::dec << " on " << soid;
5915 r = rep_repair_primary_object(soid, ctx);
5916 if (r < 0) {
5917 return r;
5918 }
5919 }
5920 }
5921
5922 op.extent.length = r;
5923
5924 encode(m, osd_op.outdata); // re-encode since it might be modified
5925 ::encode_destructively(data_bl, osd_op.outdata);
5926
5927 dout(10) << " sparse_read got " << r << " bytes from object "
5928 << soid << dendl;
5929 }
5930
5931 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5932 ctx->delta_stats.num_rd++;
5933 return 0;
5934 }
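// The sparse-read reply is an encoded map<uint64_t, uint64_t> of
// {offset -> length} extents followed by the concatenated extent data.
// A client-side decode sketch over the returned payload:
//
//   auto it = reply_bl.cbegin();
//   std::map<uint64_t, uint64_t> extents;
//   ceph::bufferlist data;
//   ceph::decode(extents, it);
//   ceph::decode(data, it);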
5935
5936 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5937 {
5938 int result = 0;
5939 SnapSetContext *ssc = ctx->obc->ssc;
5940 ObjectState& obs = ctx->new_obs;
5941 object_info_t& oi = obs.oi;
5942 const hobject_t& soid = oi.soid;
5943 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5944 osd->osd_skip_data_digest;
5945
5946 PGTransaction* t = ctx->op_t.get();
5947
5948 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5949
5950 jspan span;
5951 if (ctx->op) {
5952 span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
5953 }
5954 ctx->current_osd_subop_num = 0;
5955 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5956 OSDOp& osd_op = *p;
5957 ceph_osd_op& op = osd_op.op;
5958
5959 OpFinisher* op_finisher = nullptr;
5960 {
5961 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5962 if (op_finisher_it != ctx->op_finishers.end()) {
5963 op_finisher = op_finisher_it->second.get();
5964 }
5965 }
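// Two-phase execution: a handler that needs an async read returns
// -EINPROGRESS after queueing the read and registering an OpFinisher under
// the current subop number. When the ops are re-executed after the read
// completes, the finisher found here short-circuits the handler and its
// execute() supplies the final result (see ReadFinisher above).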
5966
5967 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5968 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5969 // but the code in this function seems to treat them as native-endian. What should the
5970 // tracepoints do?
5971 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5972
5973 dout(10) << "do_osd_op " << osd_op << dendl;
5974
5975 auto bp = osd_op.indata.cbegin();
5976
5977 // user-visible modification?
5978 switch (op.op) {
5979 // non user-visible modifications
5980 case CEPH_OSD_OP_WATCH:
5981 case CEPH_OSD_OP_CACHE_EVICT:
5982 case CEPH_OSD_OP_CACHE_FLUSH:
5983 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5984 case CEPH_OSD_OP_UNDIRTY:
5985 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5986 case CEPH_OSD_OP_COPY_FROM2:
5987 case CEPH_OSD_OP_CACHE_PIN:
5988 case CEPH_OSD_OP_CACHE_UNPIN:
5989 case CEPH_OSD_OP_SET_REDIRECT:
5990 case CEPH_OSD_OP_SET_CHUNK:
5991 case CEPH_OSD_OP_TIER_PROMOTE:
5992 case CEPH_OSD_OP_TIER_FLUSH:
5993 case CEPH_OSD_OP_TIER_EVICT:
5994 break;
5995 default:
5996 if (op.op & CEPH_OSD_OP_MODE_WR)
5997 ctx->user_modify = true;
5998 }
5999
6000 // munge -1 truncate to 0 truncate
6001 if (ceph_osd_op_uses_extent(op.op) &&
6002 op.extent.truncate_seq == 1 &&
6003 op.extent.truncate_size == (-1ULL)) {
6004 op.extent.truncate_size = 0;
6005 op.extent.truncate_seq = 0;
6006 }
6007
6008 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
6009 if (op.op == CEPH_OSD_OP_ZERO &&
6010 obs.exists &&
6011 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
6012 op.extent.length >= 1 &&
6013 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
6014 op.extent.offset + op.extent.length >= oi.size) {
6015 if (op.extent.offset >= oi.size) {
6016 // no-op
6017 goto fail;
6018 }
6019 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
6020 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
6021 op.op = CEPH_OSD_OP_TRUNCATE;
6022 }
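// Munge example: with oi.size = 100, ZERO offset=40 length=60 covers the
// object's tail and becomes TRUNCATE(40); a ZERO starting at or beyond
// oi.size is skipped as a no-op.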
6023
6024 switch (op.op) {
6025
6026 // --- READS ---
6027
6028 case CEPH_OSD_OP_CMPEXT:
6029 ++ctx->num_read;
6030 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
6031 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6032 op.extent.length, op.extent.truncate_size,
6033 op.extent.truncate_seq);
6034
6035 if (op_finisher == nullptr) {
6036 result = do_extent_cmp(ctx, osd_op);
6037 } else {
6038 result = op_finisher->execute();
6039 }
6040 break;
6041
6042 case CEPH_OSD_OP_SYNC_READ:
6043 if (pool.info.is_erasure()) {
6044 result = -EOPNOTSUPP;
6045 break;
6046 }
6047 // fall through
6048 case CEPH_OSD_OP_READ:
6049 ++ctx->num_read;
6050 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
6051 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6052 op.extent.length, op.extent.truncate_size,
6053 op.extent.truncate_seq);
6054 if (op_finisher == nullptr) {
6055 if (!ctx->data_off) {
6056 ctx->data_off = op.extent.offset;
6057 }
6058 result = do_read(ctx, osd_op);
6059 } else {
6060 result = op_finisher->execute();
6061 }
6062 break;
6063
6064 case CEPH_OSD_OP_CHECKSUM:
6065 ++ctx->num_read;
6066 {
6067 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
6068 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
6069 op.checksum.offset, op.checksum.length,
6070 op.checksum.chunk_size);
6071
6072 if (op_finisher == nullptr) {
6073 result = do_checksum(ctx, osd_op, &bp);
6074 } else {
6075 result = op_finisher->execute();
6076 }
6077 }
6078 break;
6079
6080 /* map extents */
6081 case CEPH_OSD_OP_MAPEXT:
6082 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6083 if (pool.info.is_erasure()) {
6084 result = -EOPNOTSUPP;
6085 break;
6086 }
6087 ++ctx->num_read;
6088 {
6089 // read into a buffer
6090 bufferlist bl;
6091 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
6092 info.pgid.shard),
6093 op.extent.offset, op.extent.length, bl);
6094 osd_op.outdata = std::move(bl);
6095 if (r < 0)
6096 result = r;
6097 else
6098 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10); // bl was moved-from above
6099 ctx->delta_stats.num_rd++;
6100 dout(10) << " map_extents done on object " << soid << dendl;
6101 }
6102 break;
6103
6104 /* map extents */
6105 case CEPH_OSD_OP_SPARSE_READ:
6106 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
6107 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
6108 op.extent.length, op.extent.truncate_size,
6109 op.extent.truncate_seq);
6110 if (op_finisher == nullptr) {
6111 result = do_sparse_read(ctx, osd_op);
6112 } else {
6113 result = op_finisher->execute();
6114 }
6115 break;
6116
6117 case CEPH_OSD_OP_CALL:
6118 {
6119 string cname, mname;
6120 bufferlist indata;
6121 try {
6122 bp.copy(op.cls.class_len, cname);
6123 bp.copy(op.cls.method_len, mname);
6124 bp.copy(op.cls.indata_len, indata);
6125 } catch (ceph::buffer::error& e) {
6126 dout(10) << "call unable to decode class + method + indata" << dendl;
6127 dout(30) << "in dump: ";
6128 osd_op.indata.hexdump(*_dout);
6129 *_dout << dendl;
6130 result = -EINVAL;
6131 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
6132 break;
6133 }
6134 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
6135
6136 ClassHandler::ClassData *cls;
6137 result = ClassHandler::get_instance().open_class(cname, &cls);
6138 ceph_assert(result == 0); // init_op_flags() already verified this works.
6139
6140 ClassHandler::ClassMethod *method = cls->get_method(mname);
6141 if (!method) {
6142 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
6143 result = -EOPNOTSUPP;
6144 break;
6145 }
6146
6147 int flags = method->get_flags();
6148 if (flags & CLS_METHOD_WR)
6149 ctx->user_modify = true;
6150
6151 bufferlist outdata;
6152 dout(10) << "call method " << cname << "." << mname << dendl;
6153 int prev_rd = ctx->num_read;
6154 int prev_wr = ctx->num_write;
6155 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
6156
6157 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
6158 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
6159 result = -EIO;
6160 break;
6161 }
6162 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
6163 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
6164 result = -EIO;
6165 break;
6166 }
6167
6168 dout(10) << "method called response length=" << outdata.length() << dendl;
6169 op.extent.length = outdata.length();
6170 osd_op.outdata.claim_append(outdata);
6171 dout(30) << "out dump: ";
6172 osd_op.outdata.hexdump(*_dout);
6173 *_dout << dendl;
6174 }
6175 break;
6176
6177 case CEPH_OSD_OP_STAT:
6178 // note: stat does not require RD
6179 {
6180 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
6181
6182 if (obs.exists && !oi.is_whiteout()) {
6183 encode(oi.size, osd_op.outdata);
6184 encode(oi.mtime, osd_op.outdata);
6185 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
6186 } else {
6187 result = -ENOENT;
6188 dout(10) << "stat oi object does not exist" << dendl;
6189 }
6190
6191 ctx->delta_stats.num_rd++;
6192 }
6193 break;
6194
6195 case CEPH_OSD_OP_ISDIRTY:
6196 ++ctx->num_read;
6197 {
6198 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
6199 bool is_dirty = obs.oi.is_dirty();
6200 encode(is_dirty, osd_op.outdata);
6201 ctx->delta_stats.num_rd++;
6202 result = 0;
6203 }
6204 break;
6205
6206 case CEPH_OSD_OP_UNDIRTY:
6207 ++ctx->num_write;
6208 result = 0;
6209 {
6210 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
6211 if (oi.is_dirty()) {
6212 ctx->undirty = true; // see make_writeable()
6213 ctx->modify = true;
6214 ctx->delta_stats.num_wr++;
6215 }
6216 }
6217 break;
6218
6219 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6220 ++ctx->num_write;
6221 result = 0;
6222 {
6223 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
6224 if (ctx->lock_type != RWState::RWNONE) {
6225 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
6226 result = -EINVAL;
6227 break;
6228 }
6229 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6230 result = -EINVAL;
6231 break;
6232 }
6233 if (!obs.exists) {
6234 result = 0;
6235 break;
6236 }
6237 if (oi.is_cache_pinned()) {
6238 dout(10) << "cache-try-flush on a pinned object, consider unpinning it first" << dendl;
6239 result = -EPERM;
6240 break;
6241 }
6242 if (oi.is_dirty()) {
6243 result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
6244 if (result == -EINPROGRESS)
6245 result = -EAGAIN;
6246 } else {
6247 result = 0;
6248 }
6249 }
6250 break;
6251
6252 case CEPH_OSD_OP_CACHE_FLUSH:
6253 ++ctx->num_write;
6254 result = 0;
6255 {
6256 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
6257 if (ctx->lock_type == RWState::RWNONE) {
6258 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
6259 result = -EINVAL;
6260 break;
6261 }
6262 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6263 result = -EINVAL;
6264 break;
6265 }
6266 if (!obs.exists) {
6267 result = 0;
6268 break;
6269 }
6270 if (oi.is_cache_pinned()) {
6271 dout(10) << "cache-flush on a pinned object, consider unpinning it first" << dendl;
6272 result = -EPERM;
6273 break;
6274 }
6275 hobject_t missing;
6276 if (oi.is_dirty()) {
6277 result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
6278 if (result == -EINPROGRESS)
6279 result = -EAGAIN;
6280 } else {
6281 result = 0;
6282 }
6283 // Check special return value which has set missing_return
6284 if (result == -ENOENT) {
6285 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
6286 ceph_assert(!missing.is_min());
6287 wait_for_unreadable_object(missing, ctx->op);
6288 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6289 result = -EAGAIN;
6290 }
6291 }
6292 break;
6293
6294 case CEPH_OSD_OP_CACHE_EVICT:
6295 ++ctx->num_write;
6296 result = 0;
6297 {
6298 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
6299 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6300 result = -EINVAL;
6301 break;
6302 }
6303 if (!obs.exists) {
6304 result = 0;
6305 break;
6306 }
6307 if (oi.is_cache_pinned()) {
6308 dout(10) << "cache-evict on a pinned object, consider unpinning it first" << dendl;
6309 result = -EPERM;
6310 break;
6311 }
6312 if (oi.is_dirty()) {
6313 result = -EBUSY;
6314 break;
6315 }
6316 if (!oi.watchers.empty()) {
6317 result = -EBUSY;
6318 break;
6319 }
6320 if (soid.snap == CEPH_NOSNAP) {
6321 result = _verify_no_head_clones(soid, ssc->snapset);
6322 if (result < 0)
6323 break;
6324 }
6325 result = _delete_oid(ctx, true, false);
6326 if (result >= 0) {
6327 // mark that this is a cache eviction to avoid triggering normal
6328 // make_writeable() clone creation in finish_ctx()
6329 ctx->cache_operation = true;
6330 }
6331 osd->logger->inc(l_osd_tier_evict);
6332 }
6333 break;
6334
6335 case CEPH_OSD_OP_GETXATTR:
6336 ++ctx->num_read;
6337 {
6338 string aname;
6339 bp.copy(op.xattr.name_len, aname);
6340 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6341 string name = "_" + aname;
6342 int r = getattr_maybe_cache(
6343 ctx->obc,
6344 name,
6345 &(osd_op.outdata));
6346 if (r >= 0) {
6347 op.xattr.value_len = osd_op.outdata.length();
6348 result = 0;
6349 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
6350 } else
6351 result = r;
6352
6353 ctx->delta_stats.num_rd++;
6354 }
6355 break;
6356
6357 case CEPH_OSD_OP_GETXATTRS:
6358 ++ctx->num_read;
6359 {
6360 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6361 map<string, bufferlist,less<>> out;
6362 result = getattrs_maybe_cache(
6363 ctx->obc,
6364 &out);
6365
6366 bufferlist bl;
6367 encode(out, bl);
6368 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
6369 ctx->delta_stats.num_rd++;
6370 osd_op.outdata.claim_append(bl);
6371 }
6372 break;
6373
6374 case CEPH_OSD_OP_CMPXATTR:
6375 ++ctx->num_read;
6376 {
6377 string aname;
6378 bp.copy(op.xattr.name_len, aname);
6379 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6380 string name = "_" + aname;
6381 name[op.xattr.name_len + 1] = 0;
6382
6383 bufferlist xattr;
6384 result = getattr_maybe_cache(
6385 ctx->obc,
6386 name,
6387 &xattr);
6388 if (result < 0 && result != -EEXIST && result != -ENODATA)
6389 break;
6390
6391 ctx->delta_stats.num_rd++;
6392 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
6393
6394 switch (op.xattr.cmp_mode) {
6395 case CEPH_OSD_CMPXATTR_MODE_STRING:
6396 {
6397 string val;
6398 bp.copy(op.xattr.value_len, val);
6399 val[op.xattr.value_len] = 0;
6400 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6401 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6402 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6403 }
6404 break;
6405
6406 case CEPH_OSD_CMPXATTR_MODE_U64:
6407 {
6408 uint64_t u64val;
6409 try {
6410 decode(u64val, bp);
6411 }
6412 catch (ceph::buffer::error& e) {
6413 result = -EINVAL;
6414 goto fail;
6415 }
6416 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6417 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6418 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6419 }
6420 break;
6421
6422 default:
6423 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6424 result = -EINVAL;
6425 }
6426
6427 if (!result) {
6428 dout(10) << "comparison returned false" << dendl;
6429 result = -ECANCELED;
6430 break;
6431 }
6432 if (result < 0) {
6433 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6434 break;
6435 }
6436
6437 dout(10) << "comparison returned true" << dendl;
6438 }
6439 break;
6440
6441 case CEPH_OSD_OP_ASSERT_VER:
6442 ++ctx->num_read;
6443 {
6444 uint64_t ver = op.assert_ver.ver;
6445 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6446 if (!ver)
6447 result = -EINVAL;
6448 else if (ver < oi.user_version)
6449 result = -ERANGE;
6450 else if (ver > oi.user_version)
6451 result = -EOVERFLOW;
6452 }
6453 break;
6454
6455 case CEPH_OSD_OP_LIST_WATCHERS:
6456 ++ctx->num_read;
6457 {
6458 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6459 obj_list_watch_response_t resp;
6460
6461 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6462 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6463 ++oi_iter) {
6464 dout(20) << "key cookie=" << oi_iter->first.first
6465 << " entity=" << oi_iter->first.second << " "
6466 << oi_iter->second << dendl;
6467 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6468 ceph_assert(oi_iter->first.second.is_client());
6469
6470 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6471 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6472 resp.entries.push_back(wi);
6473 }
6474
6475 resp.encode(osd_op.outdata, ctx->get_features());
6476 result = 0;
6477
6478 ctx->delta_stats.num_rd++;
6479 break;
6480 }
6481
6482 case CEPH_OSD_OP_LIST_SNAPS:
6483 ++ctx->num_read;
6484 {
6485 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6486 obj_list_snap_response_t resp;
6487
6488 if (!ssc) {
6489 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6490 }
6491 ceph_assert(ssc);
6492 dout(20) << " snapset " << ssc->snapset << dendl;
6493
6494 int clonecount = ssc->snapset.clones.size();
6495 clonecount++; // for head
6496 resp.clones.reserve(clonecount);
6497 for (auto clone_iter = ssc->snapset.clones.begin();
6498 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6499 clone_info ci;
6500 ci.cloneid = *clone_iter;
6501
6502 hobject_t clone_oid = soid;
6503 clone_oid.snap = *clone_iter;
6504
6505 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6506 if (p == ssc->snapset.clone_snaps.end()) {
6507 osd->clog->error() << "osd." << osd->whoami
6508 << ": inconsistent clone_snaps found for oid "
6509 << soid << " clone " << *clone_iter
6510 << " snapset " << ssc->snapset;
6511 result = -EINVAL;
6512 break;
6513 }
6514 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6515 ci.snaps.push_back(*q);
6516 }
6517
6518 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6519
6520 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6521 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6522 if (coi == ssc->snapset.clone_overlap.end()) {
6523 osd->clog->error() << "osd." << osd->whoami
6524 << ": inconsistent clone_overlap found for oid "
6525 << soid << " clone " << *clone_iter;
6526 result = -EINVAL;
6527 break;
6528 }
6529 const interval_set<uint64_t> &o = coi->second;
6530 ci.overlap.reserve(o.num_intervals());
6531 for (interval_set<uint64_t>::const_iterator r = o.begin();
6532 r != o.end(); ++r) {
6533 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6534 r.get_len()));
6535 }
6536
6537 map<snapid_t, uint64_t>::const_iterator si;
6538 si = ssc->snapset.clone_size.find(ci.cloneid);
6539 if (si == ssc->snapset.clone_size.end()) {
6540 osd->clog->error() << "osd." << osd->whoami
6541 << ": inconsistent clone_size found for oid "
6542 << soid << " clone " << *clone_iter;
6543 result = -EINVAL;
6544 break;
6545 }
6546 ci.size = si->second;
6547
6548 resp.clones.push_back(ci);
6549 }
6550 if (result < 0) {
6551 break;
6552 }
6553 if (!ctx->obc->obs.oi.is_whiteout()) {
6554 ceph_assert(obs.exists);
6555 clone_info ci;
6556 ci.cloneid = CEPH_NOSNAP;
6557
6558 // Size for HEAD is oi.size
6559 ci.size = oi.size;
6560
6561 resp.clones.push_back(ci);
6562 }
6563 resp.seq = ssc->snapset.seq;
6564
6565 resp.encode(osd_op.outdata);
6566 result = 0;
6567
6568 ctx->delta_stats.num_rd++;
6569 break;
6570 }
6571
6572 case CEPH_OSD_OP_NOTIFY:
6573 ++ctx->num_read;
6574 {
6575 uint32_t timeout;
6576 bufferlist bl;
6577
6578 try {
6579 uint32_t ver; // obsolete
6580 decode(ver, bp);
6581 decode(timeout, bp);
6582 decode(bl, bp);
6583 } catch (const ceph::buffer::error &e) {
6584 timeout = 0;
6585 }
6586 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6587 if (!timeout)
6588 timeout = cct->_conf->osd_default_notify_timeout;
6589
6590 notify_info_t n;
6591 n.timeout = timeout;
6592 n.notify_id = osd->get_next_id(get_osdmap_epoch());
6593 n.cookie = op.notify.cookie;
6594 n.bl = bl;
6595 ctx->notifies.push_back(n);
6596
6597 // return our unique notify id to the client
6598 encode(n.notify_id, osd_op.outdata);
6599 }
6600 break;
6601
6602 case CEPH_OSD_OP_NOTIFY_ACK:
6603 ++ctx->num_read;
6604 {
6605 try {
6606 uint64_t notify_id = 0;
6607 uint64_t watch_cookie = 0;
6608 decode(notify_id, bp);
6609 decode(watch_cookie, bp);
6610 bufferlist reply_bl;
6611 if (!bp.end()) {
6612 decode(reply_bl, bp);
6613 }
6614 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6615 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6616 ctx->notify_acks.push_back(ack);
6617 } catch (const ceph::buffer::error &e) {
6618 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6619 OpContext::NotifyAck ack(
6620 // op.watch.cookie is actually the notify_id for historical reasons
6621 op.watch.cookie
6622 );
6623 ctx->notify_acks.push_back(ack);
6624 }
6625 }
6626 break;
6627
6628 case CEPH_OSD_OP_SETALLOCHINT:
6629 ++ctx->num_write;
6630 result = 0;
6631 {
6632 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6633 maybe_create_new_object(ctx);
6634 oi.expected_object_size = op.alloc_hint.expected_object_size;
6635 oi.expected_write_size = op.alloc_hint.expected_write_size;
6636 oi.alloc_hint_flags = op.alloc_hint.flags;
6637 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6638 op.alloc_hint.expected_write_size,
6639 op.alloc_hint.flags);
6640 }
6641 break;
6642
6643
6644 // --- WRITES ---
6645
6646 // -- object data --
6647
6648 case CEPH_OSD_OP_WRITE:
6649 ++ctx->num_write;
6650 result = 0;
6651 { // write
6652 __u32 seq = oi.truncate_seq;
6653 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6654 if (op.extent.length != osd_op.indata.length()) {
6655 result = -EINVAL;
6656 break;
6657 }
6658
6659 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6660 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6661
6662 if (pool.info.requires_aligned_append() &&
6663 (op.extent.offset % pool.info.required_alignment() != 0)) {
6664 result = -EOPNOTSUPP;
6665 break;
6666 }
6667
6668 if (!obs.exists) {
6669 if (pool.info.requires_aligned_append() && op.extent.offset) {
6670 result = -EOPNOTSUPP;
6671 break;
6672 }
6673 } else if (op.extent.offset != oi.size &&
6674 pool.info.requires_aligned_append()) {
6675 result = -EOPNOTSUPP;
6676 break;
6677 }
6678
6679 if (seq && (seq > op.extent.truncate_seq) &&
6680 (op.extent.offset + op.extent.length > oi.size)) {
6681 // old write, arrived after trimtrunc
6682 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6683 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6684 << ", adjusting write length to " << op.extent.length << dendl;
6685 bufferlist t;
6686 t.substr_of(osd_op.indata, 0, op.extent.length);
6687 osd_op.indata.swap(t);
6688 }
6689 if (op.extent.truncate_seq > seq) {
6690 // write arrives before trimtrunc
6691 if (obs.exists && !oi.is_whiteout()) {
6692 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6693 << ", truncating to " << op.extent.truncate_size << dendl;
6694 t->truncate(soid, op.extent.truncate_size);
6695 oi.truncate_seq = op.extent.truncate_seq;
6696 oi.truncate_size = op.extent.truncate_size;
6697 if (oi.size > op.extent.truncate_size) {
6698 interval_set<uint64_t> trim;
6699 trim.insert(op.extent.truncate_size,
6700 oi.size - op.extent.truncate_size);
6701 ctx->modified_ranges.union_of(trim);
6702 ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
6703 oi.clear_data_digest();
6704 }
6705 if (op.extent.truncate_size != oi.size) {
6706 truncate_update_size_and_usage(ctx->delta_stats,
6707 oi,
6708 op.extent.truncate_size);
6709 }
6710 } else {
6711 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6712 << ", but object is new" << dendl;
6713 oi.truncate_seq = op.extent.truncate_seq;
6714 oi.truncate_size = op.extent.truncate_size;
6715 }
6716 }
6717 result = check_offset_and_length(
6718 op.extent.offset, op.extent.length,
6719 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6720 if (result < 0)
6721 break;
6722
6723 maybe_create_new_object(ctx);
6724
6725 if (op.extent.length == 0) {
6726 if (op.extent.offset > oi.size) {
6727 t->truncate(
6728 soid, op.extent.offset);
6729 truncate_update_size_and_usage(ctx->delta_stats, oi,
6730 op.extent.offset);
6731 } else {
6732 t->nop(soid);
6733 }
6734 } else {
6735 t->write(
6736 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6737 }
6738
6739 if (op.extent.offset == 0 && op.extent.length >= oi.size
6740 && !skip_data_digest) {
6741 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6742 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6743 if (skip_data_digest) {
6744 obs.oi.clear_data_digest();
6745 } else {
6746 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6747 }
6748 } else {
6749 obs.oi.clear_data_digest();
6750 }
6751 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6752 op.extent.offset, op.extent.length);
6753 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6754 dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
6755 }
6756 break;
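// truncate_seq bookkeeping example for the write path above: a trimtrunc
// carries a sequence number. A write tagged truncate_seq=1 that arrives
// after the object already applied a trimtrunc with seq=2 has the portion
// beyond the current size dropped; a write tagged truncate_seq=2 that
// arrives before that trimtrunc applies the truncate first, then writes.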
6757
6758 case CEPH_OSD_OP_WRITEFULL:
6759 ++ctx->num_write;
6760 result = 0;
6761 { // write full object
6762 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6763
6764 if (op.extent.length != osd_op.indata.length()) {
6765 result = -EINVAL;
6766 break;
6767 }
6768 result = check_offset_and_length(
6769 0, op.extent.length,
6770 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6771 if (result < 0)
6772 break;
6773
6774 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6775 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6776
6777 maybe_create_new_object(ctx);
6778 if (pool.info.is_erasure()) {
6779 t->truncate(soid, 0);
6780 } else if (obs.exists && op.extent.length < oi.size) {
6781 t->truncate(soid, op.extent.length);
6782 }
6783 if (op.extent.length) {
6784 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6785 }
6786 if (!skip_data_digest) {
6787 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6788 } else {
6789 obs.oi.clear_data_digest();
6790 }
6791 ctx->clean_regions.mark_data_region_dirty(0,
6792 std::max((uint64_t)op.extent.length, oi.size));
6793 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6794 0, op.extent.length, true);
6795 }
6796 break;
6797
6798 case CEPH_OSD_OP_WRITESAME:
6799 ++ctx->num_write;
6800 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6801 result = do_writesame(ctx, osd_op);
6802 break;
6803
6804 case CEPH_OSD_OP_ROLLBACK :
6805 ++ctx->num_write;
6806 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6807 result = _rollback_to(ctx, osd_op);
6808 break;
6809
6810 case CEPH_OSD_OP_ZERO:
6811 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6812 if (pool.info.requires_aligned_append()) {
6813 result = -EOPNOTSUPP;
6814 break;
6815 }
6816 ++ctx->num_write;
6817 { // zero
6818 result = check_offset_and_length(
6819 op.extent.offset, op.extent.length,
6820 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6821 if (result < 0)
6822 break;
6823
6824 if (op.extent.length && obs.exists && !oi.is_whiteout()) {
6825 t->zero(soid, op.extent.offset, op.extent.length);
6826 interval_set<uint64_t> ch;
6827 ch.insert(op.extent.offset, op.extent.length);
6828 ctx->modified_ranges.union_of(ch);
6829 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6830 ctx->delta_stats.num_wr++;
6831 oi.clear_data_digest();
6832 } else {
6833 // no-op
6834 }
6835 }
6836 break;
6837 case CEPH_OSD_OP_CREATE:
6838 ++ctx->num_write;
6839 result = 0;
6840 {
6841 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
6842 if (obs.exists && !oi.is_whiteout() &&
6843 (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
6844 result = -EEXIST; /* this is an exclusive create */
6845 } else {
6846 if (osd_op.indata.length()) {
6847 auto p = osd_op.indata.cbegin();
6848 string category;
6849 try {
6850 decode(category, p);
6851 }
6852 catch (ceph::buffer::error& e) {
6853 result = -EINVAL;
6854 goto fail;
6855 }
6856 // category is no longer implemented.
6857 }
6858 maybe_create_new_object(ctx);
6859 t->nop(soid);
6860 }
6861 }
6862 break;
6863
6864 case CEPH_OSD_OP_TRIMTRUNC:
6865 op.extent.offset = op.extent.truncate_size;
6866 // fall through
6867
6868 case CEPH_OSD_OP_TRUNCATE:
6869 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6870 if (pool.info.requires_aligned_append()) {
6871 result = -EOPNOTSUPP;
6872 break;
6873 }
6874 ++ctx->num_write;
6875 result = 0;
6876 {
6877 // truncate
6878 if (!obs.exists || oi.is_whiteout()) {
6879 dout(10) << " object dne, truncate is a no-op" << dendl;
6880 break;
6881 }
6882
6883 result = check_offset_and_length(
6884 op.extent.offset, op.extent.length,
6885 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6886 if (result < 0)
6887 break;
6888
6889 if (op.extent.truncate_seq) {
6890 ceph_assert(op.extent.offset == op.extent.truncate_size);
6891 if (op.extent.truncate_seq <= oi.truncate_seq) {
6892 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6893 << ", no-op" << dendl;
6894 break; // old
6895 }
6896 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6897 << ", truncating" << dendl;
6898 oi.truncate_seq = op.extent.truncate_seq;
6899 oi.truncate_size = op.extent.truncate_size;
6900 }
6901
6902 maybe_create_new_object(ctx);
6903 t->truncate(soid, op.extent.offset);
6904 if (oi.size > op.extent.offset) {
6905 interval_set<uint64_t> trim;
6906 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6907 ctx->modified_ranges.union_of(trim);
6908 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
6909 } else if (oi.size < op.extent.offset) {
6910 ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
6911 }
6912 if (op.extent.offset != oi.size) {
6913 truncate_update_size_and_usage(ctx->delta_stats,
6914 oi,
6915 op.extent.offset);
6916 }
6917 ctx->delta_stats.num_wr++;
6918 // do not set exists, or we will break the ZERO -> TRUNCATE munging above.
6919
6920 oi.clear_data_digest();
6921 }
6922 break;
6923
6924 case CEPH_OSD_OP_DELETE:
6925 ++ctx->num_write;
6926 result = 0;
6927 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6928 {
6929 result = _delete_oid(ctx, false, ctx->ignore_cache);
6930 }
6931 break;
6932
6933 case CEPH_OSD_OP_WATCH:
6934 ++ctx->num_write;
6935 result = 0;
6936 {
6937 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6938 op.watch.cookie, op.watch.op);
6939 if (!obs.exists) {
6940 result = -ENOENT;
6941 break;
6942 }
6943 result = 0;
6944 uint64_t cookie = op.watch.cookie;
6945 entity_name_t entity = ctx->reqid.name;
6946 ObjectContextRef obc = ctx->obc;
6947
6948 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6949 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6950 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6951 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6952 dout(10) << "watch: peer_addr="
6953 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6954
6955 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6956 if (op.watch.timeout != 0) {
6957 timeout = op.watch.timeout;
6958 }
6959
6960 watch_info_t w(cookie, timeout,
6961 ctx->op->get_req()->get_connection()->get_peer_addr());
6962 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6963 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6964 if (oi.watchers.count(make_pair(cookie, entity))) {
6965 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6966 } else {
6967 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6968 oi.watchers[make_pair(cookie, entity)] = w;
6969 t->nop(soid); // make sure we update the object_info on disk!
6970 }
6971 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6972 ctx->watch_connects.push_back(make_pair(w, will_ping));
6973 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6974 if (!oi.watchers.count(make_pair(cookie, entity))) {
6975 result = -ENOTCONN;
6976 break;
6977 }
6978 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6979 ctx->watch_connects.push_back(make_pair(w, true));
6980 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6981 /* Note: WATCH with PING doesn't cause may_write() to return true,
6982 * so if there is nothing else in the transaction, this is going
6983 * to run do_osd_op_effects, but not write out a log entry */
6984 if (!oi.watchers.count(make_pair(cookie, entity))) {
6985 result = -ENOTCONN;
6986 break;
6987 }
6988 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6989 obc->watchers.find(make_pair(cookie, entity));
6990 if (p == obc->watchers.end() ||
6991 !p->second->is_connected()) {
6992 // client needs to reconnect
6993 result = -ETIMEDOUT;
6994 break;
6995 }
6996 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6997 p->second->got_ping(ceph_clock_now());
6998 result = 0;
6999 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
7000 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
7001 oi.watchers.find(make_pair(cookie, entity));
7002 if (oi_iter != oi.watchers.end()) {
7003 dout(10) << " removed watch " << oi_iter->second << " by "
7004 << entity << dendl;
7005 oi.watchers.erase(oi_iter);
7006 t->nop(soid); // update oi on disk
7007 ctx->watch_disconnects.push_back(
7008 watch_disconnect_t(cookie, entity, false));
7009 } else {
7010 dout(10) << " can't remove: no watch by " << entity << dendl;
7011 }
7012 }
7013 }
7014 break;
7015
7016 case CEPH_OSD_OP_CACHE_PIN:
7017 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
7018 if ((!pool.info.is_tier() ||
7019 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
7020 result = -EINVAL;
7021 dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
7022 break;
7023 }
7024 ++ctx->num_write;
7025 result = 0;
7026 {
7027 if (!obs.exists || oi.is_whiteout()) {
7028 result = -ENOENT;
7029 break;
7030 }
7031
7032 if (!oi.is_cache_pinned()) {
7033 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
7034 ctx->modify = true;
7035 ctx->delta_stats.num_objects_pinned++;
7036 ctx->delta_stats.num_wr++;
7037 }
7038 }
7039 break;
7040
7041 case CEPH_OSD_OP_CACHE_UNPIN:
7042 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
7043 if ((!pool.info.is_tier() ||
7044 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
7045 result = -EINVAL;
7046 dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
7047 break;
7048 }
7049 ++ctx->num_write;
7050 result = 0;
7051 {
7052 if (!obs.exists || oi.is_whiteout()) {
7053 result = -ENOENT;
7054 break;
7055 }
7056
7057 if (oi.is_cache_pinned()) {
7058 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
7059 ctx->modify = true;
7060 ctx->delta_stats.num_objects_pinned--;
7061 ctx->delta_stats.num_wr++;
7062 }
7063 }
7064 break;
7065
7066 case CEPH_OSD_OP_SET_REDIRECT:
7067 ++ctx->num_write;
7068 result = 0;
7069 {
7070 if (pool.info.is_tier()) {
7071 result = -EINVAL;
7072 break;
7073 }
7074 if (!obs.exists) {
7075 result = -ENOENT;
7076 break;
7077 }
7078 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7079 result = -EOPNOTSUPP;
7080 break;
7081 }
7082
7083 object_t target_name;
7084 object_locator_t target_oloc;
7085 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
7086 version_t target_version = op.copy_from.src_version;
7087 try {
7088 decode(target_name, bp);
7089 decode(target_oloc, bp);
7090 }
7091 catch (ceph::buffer::error& e) {
7092 result = -EINVAL;
7093 goto fail;
7094 }
7095 pg_t raw_pg;
7096 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
7097 hobject_t target(target_name, target_oloc.key, target_snapid,
7098 raw_pg.ps(), raw_pg.pool(),
7099 target_oloc.nspace);
7100 if (target == soid) {
7101 dout(20) << " set-redirect self is invalid" << dendl;
7102 result = -EINVAL;
7103 break;
7104 }
7105
7106 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
7107 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7108 if (has_reference) {
7109 result = -EINVAL;
7110 dout(5) << " the object is already a manifest " << dendl;
7111 break;
7112 }
7113 if (op_finisher == nullptr && need_reference) {
7114 // start
7115 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7116 new SetManifestFinisher(osd_op));
7117 ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
7118 auto* fin = new C_SetManifestRefCountDone(this, soid, 0);
7119 ceph_tid_t tid = refcount_manifest(soid, target,
7120 refcount_t::INCREMENT_REF, fin, std::nullopt);
7121 fin->tid = tid;
7122 mop->num_chunks++;
7123 mop->tids[0] = tid;
7124 manifest_ops[soid] = mop;
7125 ctx->obc->start_block();
7126 result = -EINPROGRESS;
7127 } else {
7128 // finish
7129 if (op_finisher) {
7130 result = op_finisher->execute();
7131 ceph_assert(result == 0);
7132 }
7133
7134 if (!oi.has_manifest() && !oi.manifest.is_redirect())
7135 ctx->delta_stats.num_objects_manifest++;
7136
7137 oi.set_flag(object_info_t::FLAG_MANIFEST);
7138 oi.manifest.redirect_target = target;
7139 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
7140 t->truncate(soid, 0);
7141 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
7142 if (oi.is_omap() && pool.info.supports_omap()) {
7143 t->omap_clear(soid);
7144 obs.oi.clear_omap_digest();
7145 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7146 ctx->clean_regions.mark_omap_dirty();
7147 }
7148 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
7149 0, oi.size, false);
7150 ctx->delta_stats.num_bytes -= oi.size;
7151 oi.size = 0;
7152 oi.new_object();
7153 oi.user_version = target_version;
7154 ctx->user_at_version = target_version;
7155 /* rm_attrs */
7156 map<string,bufferlist,less<>> rmattrs;
7157 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
7158 if (result < 0) {
7159 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7160 return result;
7161 }
7162 map<string, bufferlist>::iterator iter;
7163 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
7164 const string& name = iter->first;
7165 t->rmattr(soid, name);
7166 }
7167 if (!has_reference && need_reference) {
7168 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7169 }
7170 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
7171 if (op_finisher) {
7172 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7173 }
7174 }
7175 }
7176
7177 break;
7178
7179 case CEPH_OSD_OP_SET_CHUNK:
7180 ++ctx->num_write;
7181 result = 0;
7182 {
7183 if (pool.info.is_tier()) {
7184 result = -EINVAL;
7185 break;
7186 }
7187 if (!obs.exists) {
7188 result = -ENOENT;
7189 break;
7190 }
7191 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7192 result = -EOPNOTSUPP;
7193 break;
7194 }
7195 if (oi.manifest.is_redirect()) {
7196 result = -EINVAL;
7197 goto fail;
7198 }
7199
7200 object_locator_t tgt_oloc;
7201 uint64_t src_offset, src_length, tgt_offset;
7202 object_t tgt_name;
7203 try {
7204 decode(src_offset, bp);
7205 decode(src_length, bp);
7206 decode(tgt_oloc, bp);
7207 decode(tgt_name, bp);
7208 decode(tgt_offset, bp);
7209 }
7210 catch (ceph::buffer::error& e) {
7211 result = -EINVAL;
7212 goto fail;
7213 }
7214
7215 if (!src_length) {
7216 result = -EINVAL;
7217 goto fail;
7218 }
7219 if (src_offset + src_length > oi.size) {
7220 result = -ERANGE;
7221 goto fail;
7222 }
7223 if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) {
7224 result = -EOPNOTSUPP;
7225 break;
7226 }
7227 if (pool.info.is_erasure()) {
7228 result = -EOPNOTSUPP;
7229 break;
7230 }
7231
7232 for (auto &p : oi.manifest.chunk_map) {
7233 interval_set<uint64_t> chunk;
7234 chunk.insert(p.first, p.second.length);
7235 if (chunk.intersects(src_offset, src_length)) {
7236 dout(20) << __func__ << " overlapped! offset: " << src_offset << " length: " << src_length
7237 << " chunk_info: " << p << dendl;
7238 result = -EOPNOTSUPP;
7239 goto fail;
7240 }
7241 }
7242
7243 pg_t raw_pg;
7244 chunk_info_t chunk_info;
7245 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
7246 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
7247 raw_pg.ps(), raw_pg.pool(),
7248 tgt_oloc.nspace);
7249 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
7250 (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE));
7251 if (has_reference) {
7252 result = -EINVAL;
7253 dout(5) << " the object is already a manifest " << dendl;
7254 break;
7255 }
7256 chunk_info.oid = target;
7257 chunk_info.offset = tgt_offset;
7258 chunk_info.length = src_length;
7259 if (op_finisher == nullptr) {
7260 // start
7261 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7262 new SetManifestFinisher(osd_op));
7263 object_manifest_t set_chunk;
7264 bool need_inc_ref = false;
7265 set_chunk.chunk_map[src_offset] = chunk_info;
7266 need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op);
7267 if (need_inc_ref) {
7268 result = -EINPROGRESS;
7269 break;
7270 }
7271 }
7272 if (op_finisher) {
7273 result = op_finisher->execute();
7274 ceph_assert(result == 0);
7275 }
7276
7277 oi.manifest.chunk_map[src_offset] = chunk_info;
7278 if (!oi.has_manifest() && !oi.manifest.is_chunked())
7279 ctx->delta_stats.num_objects_manifest++;
7280 oi.set_flag(object_info_t::FLAG_MANIFEST);
7281 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
7282 if (!has_reference) {
7283 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
7284 }
7285 ctx->modify = true;
7286 ctx->cache_operation = true;
7287
7288 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
7289 << " chunk_info: " << chunk_info << dendl;
7290 if (op_finisher) {
7291 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7292 }
7293 }
7294
7295 break;
7296
7297 case CEPH_OSD_OP_TIER_PROMOTE:
7298 ++ctx->num_write;
7299 result = 0;
7300 {
7301 if (pool.info.is_tier()) {
7302 result = -EINVAL;
7303 break;
7304 }
7305 if (!obs.exists) {
7306 result = -ENOENT;
7307 break;
7308 }
7309 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7310 result = -EOPNOTSUPP;
7311 break;
7312 }
7313 if (!obs.oi.has_manifest()) {
7314 result = 0;
7315 break;
7316 }
7317
7318 if (op_finisher == nullptr) {
7319 PromoteManifestCallback *cb;
7320 object_locator_t my_oloc;
7321 hobject_t src_hoid;
7322
7323 if (obs.oi.manifest.is_chunked()) {
7324 src_hoid = obs.oi.soid;
7325 } else if (obs.oi.manifest.is_redirect()) {
7326 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7327 my_oloc = src_oloc;
7328 src_hoid = obs.oi.manifest.redirect_target;
7329 } else {
7330 ceph_abort_msg("unrecognized manifest type");
7331 }
7332 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7333 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7334 new PromoteFinisher(cb));
7335 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7336 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7337 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7338 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7339 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7340 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7341 obs.oi.soid.snap == CEPH_NOSNAP,
7342 src_fadvise_flags, 0);
7343
7344 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7345 result = -EINPROGRESS;
7346 } else {
7347 result = op_finisher->execute();
7348 ceph_assert(result == 0);
7349 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7350 }
7351 }
7352
7353 break;
7354
7355 case CEPH_OSD_OP_TIER_FLUSH:
7356 ++ctx->num_write;
7357 result = 0;
7358 {
7359 if (pool.info.is_tier()) {
7360 result = -EINVAL;
7361 break;
7362 }
7363 if (!obs.exists) {
7364 result = -ENOENT;
7365 break;
7366 }
7367 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7368 result = -EOPNOTSUPP;
7369 break;
7370 }
7371 if (!obs.oi.has_manifest()) {
7372 result = 0;
7373 break;
7374 }
7375
7376 if (oi.is_dirty()) {
7377 result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
7378 if (result == -EINPROGRESS)
7379 result = -EAGAIN;
7380 } else {
7381 result = 0;
7382 }
7383 }
7384
7385 break;
7386
7387 case CEPH_OSD_OP_TIER_EVICT:
7388 ++ctx->num_write;
7389 result = 0;
7390 {
7391 if (pool.info.is_tier()) {
7392 result = -EINVAL;
7393 break;
7394 }
7395 if (!obs.exists) {
7396 result = -ENOENT;
7397 break;
7398 }
7399 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7400 result = -EOPNOTSUPP;
7401 break;
7402 }
7403 if (!obs.oi.has_manifest()) {
7404 result = -EINVAL;
7405 break;
7406 }
7407
7408 // The chunks already hold references, so it is enough to mark them missing and punch holes locally
7409 for (auto &p : obs.oi.manifest.chunk_map) {
7410 p.second.set_flag(chunk_info_t::FLAG_MISSING);
7411 // punch hole
7412 t->zero(soid, p.first, p.second.length);
7413 }
7414 oi.clear_data_digest();
7415 ctx->delta_stats.num_wr++;
7416 ctx->cache_operation = true;
7417 osd->logger->inc(l_osd_tier_evict);
7418 }
7419
7420 break;
7421
7422 case CEPH_OSD_OP_UNSET_MANIFEST:
7423 ++ctx->num_write;
7424 result = 0;
7425 {
7426 if (pool.info.is_tier()) {
7427 result = -EINVAL;
7428 break;
7429 }
7430 if (!obs.exists) {
7431 result = -ENOENT;
7432 break;
7433 }
7434 if (!oi.has_manifest()) {
7435 result = -EOPNOTSUPP;
7436 break;
7437 }
7438 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7439 result = -EOPNOTSUPP;
7440 break;
7441 }
7442
7443 dec_all_refcount_manifest(oi, ctx);
7444
7445 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7446 oi.manifest = object_manifest_t();
7447 ctx->delta_stats.num_objects_manifest--;
7448 ctx->delta_stats.num_wr++;
7449 ctx->modify = true;
7450 }
7451
7452 break;
7453
7454 // -- object attrs --
7455
7456 case CEPH_OSD_OP_SETXATTR:
7457 ++ctx->num_write;
7458 result = 0;
7459 {
7460 if (cct->_conf->osd_max_attr_size > 0 &&
7461 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7462 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7463 result = -EFBIG;
7464 break;
7465 }
7466 unsigned max_name_len =
7467 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7468 cct->_conf->osd_max_attr_name_len);
7469 if (op.xattr.name_len > max_name_len) {
7470 result = -ENAMETOOLONG;
7471 break;
7472 }
7473 maybe_create_new_object(ctx);
7474 string aname;
7475 bp.copy(op.xattr.name_len, aname);
7476 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7477 string name = "_" + aname;
7478 bufferlist bl;
7479 bp.copy(op.xattr.value_len, bl);
7480 t->setattr(soid, name, bl);
7481 ctx->delta_stats.num_wr++;
7482 }
7483 break;
7484
7485 case CEPH_OSD_OP_RMXATTR:
7486 ++ctx->num_write;
7487 result = 0;
7488 {
7489 string aname;
7490 bp.copy(op.xattr.name_len, aname);
7491 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7492 if (!obs.exists || oi.is_whiteout()) {
7493 result = -ENOENT;
7494 break;
7495 }
7496 string name = "_" + aname;
7497 t->rmattr(soid, name);
7498 ctx->delta_stats.num_wr++;
7499 }
7500 break;
7501
7502
7503 // -- fancy writers --
7504 case CEPH_OSD_OP_APPEND:
7505 {
7506 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7507 // just do it inline; this works because we are happy to execute
7508 // fancy ops on replicas as well.
7509 vector<OSDOp> nops(1);
7510 OSDOp& newop = nops[0];
7511 newop.op.op = CEPH_OSD_OP_WRITE;
7512 newop.op.extent.offset = oi.size;
7513 newop.op.extent.length = op.extent.length;
7514 newop.op.extent.truncate_seq = oi.truncate_seq;
7515 newop.indata = osd_op.indata;
7516 result = do_osd_ops(ctx, nops);
7517 osd_op.outdata = std::move(newop.outdata);
7518 }
7519 break;
7520
7521 case CEPH_OSD_OP_STARTSYNC:
7522 result = 0;
7523 t->nop(soid);
7524 break;
7525
7526 // -- trivial map --
7527 case CEPH_OSD_OP_TMAPGET:
7528 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
7529 if (pool.info.is_erasure()) {
7530 result = -EOPNOTSUPP;
7531 break;
7532 }
7533 {
7534 vector<OSDOp> nops(1);
7535 OSDOp& newop = nops[0];
7536 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7537 newop.op.extent.offset = 0;
7538 newop.op.extent.length = 0;
7539 result = do_osd_ops(ctx, nops);
7540 osd_op.outdata = std::move(newop.outdata);
7541 }
7542 break;
7543
7544 case CEPH_OSD_OP_TMAPPUT:
7545 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
7546 if (pool.info.is_erasure()) {
7547 result = -EOPNOTSUPP;
7548 break;
7549 }
7550 {
7551 //_dout_lock.Lock();
7552 //osd_op.data.hexdump(*_dout);
7553 //_dout_lock.Unlock();
7554
7555 // verify sort order
7556 bool unsorted = false;
7557 if (true) {
7558 bufferlist header;
7559 decode(header, bp);
7560 uint32_t n;
7561 decode(n, bp);
7562 string last_key;
7563 while (n--) {
7564 string key;
7565 decode(key, bp);
7566 dout(10) << "tmapput key " << key << dendl;
7567 bufferlist val;
7568 decode(val, bp);
7569 if (key < last_key) {
7570 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7571 unsorted = true;
7572 break;
7573 }
7574 last_key = key;
7575 }
7576 }
7577
7578 // write it
7579 vector<OSDOp> nops(1);
7580 OSDOp& newop = nops[0];
7581 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7582 newop.op.extent.offset = 0;
7583 newop.op.extent.length = osd_op.indata.length();
7584 newop.indata = osd_op.indata;
7585
7586 if (unsorted) {
7587 bp = osd_op.indata.begin();
7588 bufferlist header;
7589 map<string, bufferlist> m;
7590 decode(header, bp);
7591 decode(m, bp);
7592 ceph_assert(bp.end());
7593 bufferlist newbl;
7594 encode(header, newbl);
7595 encode(m, newbl);
7596 newop.indata = newbl;
7597 }
7598 result = do_osd_ops(ctx, nops);
7599 ceph_assert(result == 0);
7600 }
7601 break;
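// The TMAPPUT payload verified above is, in order: a header bufferlist, a
// uint32_t entry count, then that many (key, value) pairs which must arrive
// sorted by key. A sketch of producing a well-formed payload (illustrative,
// assuming the standard encoders used in this file):
//
//   bufferlist header, payload;
//   std::map<std::string, bufferlist> kv;  // std::map iterates in key order
//   encode(header, payload);
//   encode(kv, payload);  // u32 count followed by key/value pairs
//
// The resort path above rebuilds exactly this layout when the pairs arrive
// unsorted.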
7602
7603 case CEPH_OSD_OP_TMAPUP:
7604 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
7605 if (pool.info.is_erasure()) {
7606 result = -EOPNOTSUPP;
7607 break;
7608 }
7609 ++ctx->num_write;
7610 result = do_tmapup(ctx, bp, osd_op);
7611 break;
7612
7613 case CEPH_OSD_OP_TMAP2OMAP:
7614 ++ctx->num_write;
7615 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7616 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7617 break;
7618
7619 // OMAP Read ops
7620 case CEPH_OSD_OP_OMAPGETKEYS:
7621 ++ctx->num_read;
7622 {
7623 string start_after;
7624 uint64_t max_return;
7625 try {
7626 decode(start_after, bp);
7627 decode(max_return, bp);
7628 }
7629 catch (ceph::buffer::error& e) {
7630 result = -EINVAL;
7631 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7632 goto fail;
7633 }
7634 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7635 max_return = cct->_conf->osd_max_omap_entries_per_request;
7636 }
7637 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7638
7639 bufferlist bl;
7640 uint32_t num = 0;
7641 bool truncated = false;
7642 if (oi.is_omap()) {
7643 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7644 ch, ghobject_t(soid)
7645 );
7646 ceph_assert(iter);
7647 iter->upper_bound(start_after);
7648 for (num = 0; iter->valid(); ++num, iter->next()) {
7649 if (num >= max_return ||
7650 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7651 truncated = true;
7652 break;
7653 }
7654 encode(iter->key(), bl);
7655 }
7656 } // else return empty out_set
7657 encode(num, osd_op.outdata);
7658 osd_op.outdata.claim_append(bl);
7659 encode(truncated, osd_op.outdata);
7660 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7661 ctx->delta_stats.num_rd++;
7662 }
7663 break;
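// The OMAPGETKEYS reply assembled above is: u32 num, then num encoded keys,
// then a bool truncated flag. A client-side decode sketch (illustrative):
//
//   auto p = osd_op.outdata.cbegin();
//   uint32_t n;
//   decode(n, p);
//   for (uint32_t i = 0; i < n; ++i) { std::string k; decode(k, p); }
//   bool truncated;
//   decode(truncated, p);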
7664
7665 case CEPH_OSD_OP_OMAPGETVALS:
7666 ++ctx->num_read;
7667 {
7668 string start_after;
7669 uint64_t max_return;
7670 string filter_prefix;
7671 try {
7672 decode(start_after, bp);
7673 decode(max_return, bp);
7674 decode(filter_prefix, bp);
7675 }
7676 catch (ceph::buffer::error& e) {
7677 result = -EINVAL;
7678 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7679 goto fail;
7680 }
7681 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7682 max_return = cct->_conf->osd_max_omap_entries_per_request;
7683 }
7684 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7685
7686 uint32_t num = 0;
7687 bool truncated = false;
7688 bufferlist bl;
7689 if (oi.is_omap()) {
7690 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7691 ch, ghobject_t(soid)
7692 );
7693 if (!iter) {
7694 result = -ENOENT;
7695 goto fail;
7696 }
7697 iter->upper_bound(start_after);
7698 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7699 for (num = 0;
7700 iter->valid() &&
7701 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
7702 ++num, iter->next()) {
7703 dout(20) << "Found key " << iter->key() << dendl;
7704 if (num >= max_return ||
7705 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7706 truncated = true;
7707 break;
7708 }
7709 encode(iter->key(), bl);
7710 encode(iter->value(), bl);
7711 }
7712 } // else return empty out_set
7713 encode(num, osd_op.outdata);
7714 osd_op.outdata.claim_append(bl);
7715 encode(truncated, osd_op.outdata);
7716 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7717 ctx->delta_stats.num_rd++;
7718 }
7719 break;
7720
7721 case CEPH_OSD_OP_OMAPGETHEADER:
7722 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7723 if (!oi.is_omap()) {
7724 // return empty header
7725 break;
7726 }
7727 ++ctx->num_read;
7728 {
7729 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
7730 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7731 ctx->delta_stats.num_rd++;
7732 }
7733 break;
7734
7735 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7736 ++ctx->num_read;
7737 {
7738 set<string> keys_to_get;
7739 try {
7740 decode(keys_to_get, bp);
7741 }
7742 catch (ceph::buffer::error& e) {
7743 result = -EINVAL;
7744 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7745 goto fail;
7746 }
7747 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7748 map<string, bufferlist> out;
7749 if (oi.is_omap()) {
7750 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7751 } // else return empty omap entries
7752 encode(out, osd_op.outdata);
7753 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7754 ctx->delta_stats.num_rd++;
7755 }
7756 break;
7757
7758 case CEPH_OSD_OP_OMAP_CMP:
7759 ++ctx->num_read;
7760 {
7761 if (!obs.exists || oi.is_whiteout()) {
7762 result = -ENOENT;
7763 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7764 break;
7765 }
7766 map<string, pair<bufferlist, int> > assertions;
7767 try {
7768 decode(assertions, bp);
7769 }
7770 catch (ceph::buffer::error& e) {
7771 result = -EINVAL;
7772 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7773 goto fail;
7774 }
7775 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7776
7777 map<string, bufferlist> out;
7778
7779 if (oi.is_omap()) {
7780 set<string> to_get;
7781 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7782 i != assertions.end();
7783 ++i)
7784 to_get.insert(i->first);
7785 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7786 to_get, &out);
7787 if (r < 0) {
7788 result = r;
7789 break;
7790 }
7791 } // else leave out empty
7792
7793 // Should set num_rd_kb based on the encoded length of the map
7794 ctx->delta_stats.num_rd++;
7795
7796 int r = 0;
7797 bufferlist empty;
7798 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7799 i != assertions.end();
7800 ++i) {
7801 auto out_entry = out.find(i->first);
7802 bufferlist &bl = (out_entry != out.end()) ?
7803 out_entry->second : empty;
7804 switch (i->second.second) {
7805 case CEPH_OSD_CMPXATTR_OP_EQ:
7806 if (!(bl == i->second.first)) {
7807 r = -ECANCELED;
7808 }
7809 break;
7810 case CEPH_OSD_CMPXATTR_OP_LT:
7811 if (!(bl < i->second.first)) {
7812 r = -ECANCELED;
7813 }
7814 break;
7815 case CEPH_OSD_CMPXATTR_OP_GT:
7816 if (!(bl > i->second.first)) {
7817 r = -ECANCELED;
7818 }
7819 break;
7820 default:
7821 r = -EINVAL;
7822 break;
7823 }
7824 if (r < 0)
7825 break;
7826 }
7827 if (r < 0) {
7828 result = r;
7829 }
7830 }
7831 break;
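// OMAP_CMP semantics as implemented above: each assertion maps a key to
// (expected value, op); a key missing from the object compares as an empty
// bufferlist, EQ/LT/GT compare the stored value against the expected one,
// and the first failing assertion fails the whole op with -ECANCELED. For
// example, asserting EQ against an empty bufferlist succeeds only if the
// key is absent or stored with an empty value.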
7832
7833 // OMAP Write ops
7834 case CEPH_OSD_OP_OMAPSETVALS:
7835 if (!pool.info.supports_omap()) {
7836 result = -EOPNOTSUPP;
7837 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7838 break;
7839 }
7840 ++ctx->num_write;
7841 result = 0;
7842 {
7843 maybe_create_new_object(ctx);
7844 bufferlist to_set_bl;
7845 try {
7846 decode_str_str_map_to_bl(bp, &to_set_bl);
7847 }
7848 catch (ceph::buffer::error& e) {
7849 result = -EINVAL;
7850 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7851 goto fail;
7852 }
7853 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7854 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7855 dout(20) << "setting vals: " << dendl;
7856 map<string,bufferlist> to_set;
7857 bufferlist::const_iterator pt = to_set_bl.begin();
7858 decode(to_set, pt);
7859 for (map<string, bufferlist>::iterator i = to_set.begin();
7860 i != to_set.end();
7861 ++i) {
7862 dout(20) << "\t" << i->first << dendl;
7863 }
7864 }
7865 t->omap_setkeys(soid, to_set_bl);
7866 ctx->clean_regions.mark_omap_dirty();
7867 ctx->delta_stats.num_wr++;
7868 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7869 }
7870 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7871 obs.oi.clear_omap_digest();
7872 break;
7873
7874 case CEPH_OSD_OP_OMAPSETHEADER:
7875 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7876 if (!pool.info.supports_omap()) {
7877 result = -EOPNOTSUPP;
7878 break;
7879 }
7880 ++ctx->num_write;
7881 result = 0;
7882 {
7883 maybe_create_new_object(ctx);
7884 t->omap_setheader(soid, osd_op.indata);
7885 ctx->clean_regions.mark_omap_dirty();
7886 ctx->delta_stats.num_wr++;
7887 }
7888 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7889 obs.oi.clear_omap_digest();
7890 break;
7891
7892 case CEPH_OSD_OP_OMAPCLEAR:
7893 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7894 if (!pool.info.supports_omap()) {
7895 result = -EOPNOTSUPP;
7896 break;
7897 }
7898 ++ctx->num_write;
7899 result = 0;
7900 {
7901 if (!obs.exists || oi.is_whiteout()) {
7902 result = -ENOENT;
7903 break;
7904 }
7905 if (oi.is_omap()) {
7906 t->omap_clear(soid);
7907 ctx->clean_regions.mark_omap_dirty();
7908 ctx->delta_stats.num_wr++;
7909 obs.oi.clear_omap_digest();
7910 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7911 }
7912 }
7913 break;
7914
7915 case CEPH_OSD_OP_OMAPRMKEYS:
7916 if (!pool.info.supports_omap()) {
7917 result = -EOPNOTSUPP;
7918 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7919 break;
7920 }
7921 ++ctx->num_write;
7922 result = 0;
7923 {
7924 if (!obs.exists || oi.is_whiteout()) {
7925 result = -ENOENT;
7926 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7927 break;
7928 }
7929 bufferlist to_rm_bl;
7930 try {
7931 decode_str_set_to_bl(bp, &to_rm_bl);
7932 }
7933 catch (ceph::buffer::error& e) {
7934 result = -EINVAL;
7935 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7936 goto fail;
7937 }
7938 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7939 t->omap_rmkeys(soid, to_rm_bl);
7940 ctx->clean_regions.mark_omap_dirty();
7941 ctx->delta_stats.num_wr++;
7942 }
7943 obs.oi.clear_omap_digest();
7944 break;
7945
7946 case CEPH_OSD_OP_OMAPRMKEYRANGE:
7947 tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7948 if (!pool.info.supports_omap()) {
7949 result = -EOPNOTSUPP;
7950 break;
7951 }
7952 ++ctx->num_write;
7953 result = 0;
7954 {
7955 if (!obs.exists || oi.is_whiteout()) {
7956 result = -ENOENT;
7957 break;
7958 }
7959 std::string key_begin, key_end;
7960 try {
7961 decode(key_begin, bp);
7962 decode(key_end, bp);
7963 } catch (ceph::buffer::error& e) {
7964 result = -EINVAL;
7965 goto fail;
7966 }
7967 t->omap_rmkeyrange(soid, key_begin, key_end);
7968 ctx->delta_stats.num_wr++;
7969 }
7970 obs.oi.clear_omap_digest();
7971 break;
7972
7973 case CEPH_OSD_OP_COPY_GET:
7974 ++ctx->num_read;
7975 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7976 soid.snap.val);
7977 if (op_finisher == nullptr) {
7978 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7979 } else {
7980 result = op_finisher->execute();
7981 }
7982 break;
7983
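// COPY_FROM, like SET_CHUNK and TIER_PROMOTE above, is a two-phase op: the
// first pass registers an OpFinisher and returns -EINPROGRESS while the copy
// runs asynchronously; when the op is re-executed after the copy completes,
// the finisher applies the result and is erased so the op cannot run again.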
7984 case CEPH_OSD_OP_COPY_FROM:
7985 case CEPH_OSD_OP_COPY_FROM2:
7986 ++ctx->num_write;
7987 result = 0;
7988 {
7989 object_t src_name;
7990 object_locator_t src_oloc;
7991 uint32_t truncate_seq = 0;
7992 uint64_t truncate_size = 0;
7993 bool have_truncate = false;
7994 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7995 version_t src_version = op.copy_from.src_version;
7996
7997 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7998 (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
7999 dout(20) << "invalid copy-from2 flags 0x"
8000 << std::hex << (int)op.copy_from.flags << std::dec << dendl;
8001 result = -EINVAL;
8002 break;
8003 }
8004 try {
8005 decode(src_name, bp);
8006 decode(src_oloc, bp);
8007 // check if client sent us truncate_seq and truncate_size
8008 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
8009 (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
8010 decode(truncate_seq, bp);
8011 decode(truncate_size, bp);
8012 have_truncate = true;
8013 }
8014 }
8015 catch (ceph::buffer::error& e) {
8016 result = -EINVAL;
8017 tracepoint(osd,
8018 do_osd_op_pre_copy_from,
8019 soid.oid.name.c_str(),
8020 soid.snap.val,
8021 "???",
8022 0,
8023 "???",
8024 "???",
8025 0,
8026 src_snapid,
8027 src_version);
8028 goto fail;
8029 }
8030 tracepoint(osd,
8031 do_osd_op_pre_copy_from,
8032 soid.oid.name.c_str(),
8033 soid.snap.val,
8034 src_name.name.c_str(),
8035 src_oloc.pool,
8036 src_oloc.key.c_str(),
8037 src_oloc.nspace.c_str(),
8038 src_oloc.hash,
8039 src_snapid,
8040 src_version);
8041 if (op_finisher == nullptr) {
8042 // start
8043 pg_t raw_pg;
8044 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
8045 hobject_t src(src_name, src_oloc.key, src_snapid,
8046 raw_pg.ps(), raw_pg.pool(),
8047 src_oloc.nspace);
8048 if (src == soid) {
8049 dout(20) << " copy from self is invalid" << dendl;
8050 result = -EINVAL;
8051 break;
8052 }
8053 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
8054 if (have_truncate)
8055 cb->set_truncate(truncate_seq, truncate_size);
8056 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8057 new CopyFromFinisher(cb));
8058 start_copy(cb, ctx->obc, src, src_oloc, src_version,
8059 op.copy_from.flags,
8060 false,
8061 op.copy_from.src_fadvise_flags,
8062 op.flags);
8063 result = -EINPROGRESS;
8064 } else {
8065 // finish
8066 result = op_finisher->execute();
8067 ceph_assert(result == 0);
8068
8069 // COPY_FROM cannot be executed multiple times -- it must restart
8070 ctx->op_finishers.erase(ctx->current_osd_subop_num);
8071 }
8072 }
8073 break;
8074
8075 default:
8076 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
8077 dout(1) << "unrecognized osd op " << op.op
8078 << " " << ceph_osd_op_name(op.op)
8079 << dendl;
8080 result = -EOPNOTSUPP;
8081 }
8082
8083 fail:
8084 osd_op.rval = result;
8085 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
8086 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
8087 result != -EAGAIN && result != -EINPROGRESS)
8088 result = 0;
8089
8090 if (result < 0)
8091 break;
8092 }
8093 if (result < 0) {
8094 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
8095 }
8096 return result;
8097 }
8098
8099 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
8100 {
8101 if (ctx->new_obs.oi.size == 0) {
8102 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
8103 return -ENODATA;
8104 }
8105 vector<OSDOp> nops(1);
8106 OSDOp &newop = nops[0];
8107 newop.op.op = CEPH_OSD_OP_TMAPGET;
8108 do_osd_ops(ctx, nops);
8109 try {
8110 bufferlist::const_iterator i = newop.outdata.begin();
8111 decode(*header, i);
8112 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
8113 } catch (...) {
8114 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
8115 << dendl;
8116 return -EINVAL;
8117 }
8118 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
8119 << dendl;
8120 return 0;
8121 }
8122
8123 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
8124 const SnapSet& ss)
8125 {
8126 // verify that all clones have been evicted
8127 dout(20) << __func__ << " verifying clones are absent "
8128 << ss << dendl;
8129 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
8130 p != ss.clones.end();
8131 ++p) {
8132 hobject_t clone_oid = soid;
8133 clone_oid.snap = *p;
8134 if (is_missing_object(clone_oid))
8135 return -EBUSY;
8136 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
8137 if (clone_obc && clone_obc->obs.exists) {
8138 dout(10) << __func__ << " cannot evict head before clone "
8139 << clone_oid << dendl;
8140 return -EBUSY;
8141 }
8142 if (copy_ops.count(clone_oid)) {
8143 dout(10) << __func__ << " cannot evict head, pending promote on clone "
8144 << clone_oid << dendl;
8145 return -EBUSY;
8146 }
8147 }
8148 return 0;
8149 }
8150
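// Decide whether deleting an object really removes it or leaves a whiteout:
// a whiteout is preferred when the pool is a cache tier (unless no_whiteout
// or try_no_whiteout asks otherwise) and is forced whenever clones exist or
// the snap context implies one will be created, since the head cannot simply
// disappear out from under them. Returns -ENOENT if there is nothing to
// delete, 0 otherwise.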
8151 inline int PrimaryLogPG::_delete_oid(
8152 OpContext *ctx,
8153 bool no_whiteout, // no whiteouts, no matter what.
8154 bool try_no_whiteout) // try not to whiteout
8155 {
8156 SnapSet& snapset = ctx->new_snapset;
8157 ObjectState& obs = ctx->new_obs;
8158 object_info_t& oi = obs.oi;
8159 const hobject_t& soid = oi.soid;
8160 PGTransaction* t = ctx->op_t.get();
8161
8162 // cache: set whiteout on delete?
8163 bool whiteout = false;
8164 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
8165 && !no_whiteout
8166 && !try_no_whiteout) {
8167 whiteout = true;
8168 }
8169
8170 // in luminous or later, we can't delete the head if there are
8171 // clones. we trust the caller passing no_whiteout has already
8172 // verified they don't exist.
8173 if (!snapset.clones.empty() ||
8174 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
8175 if (no_whiteout) {
8176 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
8177 << dendl;
8178 } else {
8179 dout(20) << __func__ << " has or will have clones; will whiteout"
8180 << dendl;
8181 whiteout = true;
8182 }
8183 }
8184 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
8185 << " no_whiteout=" << (int)no_whiteout
8186 << " try_no_whiteout=" << (int)try_no_whiteout
8187 << dendl;
8188 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
8189 return -ENOENT;
8190
8191 t->remove(soid);
8192
8193 if (oi.size > 0) {
8194 interval_set<uint64_t> ch;
8195 ch.insert(0, oi.size);
8196 ctx->modified_ranges.union_of(ch);
8197 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
8198 }
8199
8200 ctx->clean_regions.mark_omap_dirty();
8201 ctx->delta_stats.num_wr++;
8202 if (soid.is_snap()) {
8203 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
8204 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
8205 } else {
8206 ctx->delta_stats.num_bytes -= oi.size;
8207 }
8208 oi.size = 0;
8209 oi.new_object();
8210
8211 // disconnect all watchers
8212 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
8213 oi.watchers.begin();
8214 p != oi.watchers.end();
8215 ++p) {
8216 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
8217 ctx->watch_disconnects.push_back(
8218 watch_disconnect_t(p->first.first, p->first.second, true));
8219 }
8220 oi.watchers.clear();
8221
8222 if (whiteout) {
8223 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
8224 oi.set_flag(object_info_t::FLAG_WHITEOUT);
8225 ctx->delta_stats.num_whiteouts++;
8226 t->create(soid);
8227 osd->logger->inc(l_osd_tier_whiteout);
8228 return 0;
8229 }
8230
8231 if (oi.has_manifest()) {
8232 ctx->delta_stats.num_objects_manifest--;
8233 dec_all_refcount_manifest(oi, ctx);
8234 }
8235
8236 // delete the head
8237 ctx->delta_stats.num_objects--;
8238 if (soid.is_snap())
8239 ctx->delta_stats.num_object_clones--;
8240 if (oi.is_whiteout()) {
8241 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
8242 ctx->delta_stats.num_whiteouts--;
8243 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8244 }
8245 if (oi.is_cache_pinned()) {
8246 ctx->delta_stats.num_objects_pinned--;
8247 }
8248 obs.exists = false;
8249 return 0;
8250 }
8251
8252 int PrimaryLogPG::_rollback_to(OpContext *ctx, OSDOp& op)
8253 {
8254 ObjectState& obs = ctx->new_obs;
8255 object_info_t& oi = obs.oi;
8256 const hobject_t& soid = oi.soid;
8257 snapid_t snapid = (uint64_t)op.op.snap.snapid;
8258 hobject_t missing_oid;
8259
8260 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
8261
8262 ObjectContextRef rollback_to;
8263
8264 int ret = find_object_context(
8265 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
8266 soid.get_namespace()),
8267 &rollback_to, false, false, &missing_oid);
8268 if (ret == -EAGAIN) {
8269 /* clone must be missing */
8270 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
8271 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
8272 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
8273 block_write_on_degraded_snap(missing_oid, ctx->op);
8274 return ret;
8275 }
8276 {
8277 ObjectContextRef promote_obc;
8278 cache_result_t tier_mode_result;
8279 if (obs.exists && obs.oi.has_manifest()) {
8280 /*
8281 * In the case of a manifest object, the object_info exists on the base tier at all times,
8282 * so promote_obc should be equal to rollback_to.
8283 */
8284 promote_obc = rollback_to;
8285 tier_mode_result =
8286 maybe_handle_manifest_detail(
8287 ctx->op,
8288 true,
8289 rollback_to);
8290 } else {
8291 tier_mode_result =
8292 maybe_handle_cache_detail(
8293 ctx->op,
8294 true,
8295 rollback_to,
8296 ret,
8297 missing_oid,
8298 true,
8299 false,
8300 &promote_obc);
8301 }
8302 switch (tier_mode_result) {
8303 case cache_result_t::NOOP:
8304 break;
8305 case cache_result_t::BLOCKED_PROMOTE:
8306 ceph_assert(promote_obc);
8307 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8308 return -EAGAIN;
8309 case cache_result_t::BLOCKED_FULL:
8310 block_write_on_full_cache(soid, ctx->op);
8311 return -EAGAIN;
8312 case cache_result_t::REPLIED_WITH_EAGAIN:
8313 ceph_abort_msg("this can't happen, no rollback on replica");
8314 default:
8315 ceph_abort_msg("must promote was set, other values are not valid");
8316 return -EAGAIN;
8317 }
8318 }
8319
8320 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8321 // there's no snapshot here, or there's no object.
8322 // if there's no snapshot, we delete the object; otherwise, do nothing.
8323 dout(20) << "_rollback_to deleting head on " << soid.oid
8324 << " because got ENOENT|whiteout on find_object_context" << dendl;
8325 if (ctx->obc->obs.oi.watchers.size()) {
8326 // Cannot delete an object with watchers
8327 ret = -EBUSY;
8328 } else {
8329 _delete_oid(ctx, false, false);
8330 ret = 0;
8331 }
8332 } else if (ret) {
8333 // ummm....huh? It *can't* return anything else at time of writing.
8334 ceph_abort_msg("unexpected error code in _rollback_to");
8335 } else { //we got our context, let's use it to do the rollback!
8336 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8337 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8338 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8339 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8340 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
8341 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8342 ret = -EAGAIN;
8343 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8344 // rolling back to the head; we just need to clone it.
8345 ctx->modify = true;
8346 } else {
8347 if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) {
8348 /*
8349 * looking at the following case, the foo head needs the reference of chunk4 and chunk5
8350 * in case snap[1] is removed.
8351 *
8352 * Before rollback to snap[1]:
8353 *
8354 * foo snap[1]: [chunk4] [chunk5]
8355 * foo snap[0]: [ chunk2 ]
8356 * foo head : [chunk1] [chunk3]
8357 *
8358 * After:
8359 *
8360 * foo snap[1]: [chunk4] [chunk5]
8361 * foo snap[0]: [ chunk2 ]
8362 * foo head : [chunk4] [chunk5]
8363 *
8364 */
8365 OpFinisher* op_finisher = nullptr;
8366 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
8367 if (op_finisher_it != ctx->op_finishers.end()) {
8368 op_finisher = op_finisher_it->second.get();
8369 }
8370 if (!op_finisher) {
8371 bool need_inc_ref = inc_refcount_by_set(ctx, rollback_to->obs.oi.manifest, op);
8372 if (need_inc_ref) {
8373 ceph_assert(op_finisher_it == ctx->op_finishers.end());
8374 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8375 new SetManifestFinisher(op));
8376 return -EINPROGRESS;
8377 }
8378 } else {
8379 op_finisher->execute();
8380 ctx->op_finishers.erase(ctx->current_osd_subop_num);
8381 }
8382 }
8383 _do_rollback_to(ctx, rollback_to, op);
8384 }
8385 }
8386 return ret;
8387 }
8388
8389 void PrimaryLogPG::_do_rollback_to(OpContext *ctx, ObjectContextRef rollback_to,
8390 OSDOp& op)
8391 {
8392 SnapSet& snapset = ctx->new_snapset;
8393 ObjectState& obs = ctx->new_obs;
8394 object_info_t& oi = obs.oi;
8395 const hobject_t& soid = oi.soid;
8396 PGTransaction* t = ctx->op_t.get();
8397 snapid_t snapid = (uint64_t)op.op.snap.snapid;
8398 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8399
8400 /* 1) Delete current head
8401 * 2) Clone correct snapshot into head
8402 * 3) Calculate clone_overlaps by following overlaps
8403 * forward from rollback snapshot */
8404 dout(10) << "_do_rollback_to deleting " << soid.oid
8405 << " and rolling back to old snap" << dendl;
8406
8407 if (obs.exists) {
8408 t->remove(soid);
8409 if (obs.oi.has_manifest()) {
8410 dec_all_refcount_manifest(obs.oi, ctx);
8411 oi.manifest.clear();
8412 oi.manifest.type = object_manifest_t::TYPE_NONE;
8413 oi.clear_flag(object_info_t::FLAG_MANIFEST);
8414 ctx->delta_stats.num_objects_manifest--;
8415 ctx->cache_operation = true; // do not trigger the refcount-calculation path for manifest objects
8416 }
8417 }
8418 t->clone(soid, rollback_to_sobject);
8419 t->add_obc(rollback_to);
8420
8421 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8422 snapset.clone_overlap.lower_bound(snapid);
8423 ceph_assert(iter != snapset.clone_overlap.end());
8424 interval_set<uint64_t> overlaps = iter->second;
8425 for ( ;
8426 iter != snapset.clone_overlap.end();
8427 ++iter)
8428 overlaps.intersection_of(iter->second);
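// Walking forward from the rollback snap and intersecting narrows
// 'overlaps' to the byte ranges unchanged between the rollback source and
// the current head. Illustrative example: clone_overlap[5] = {0~4096} and
// clone_overlap[7] = {0~1024, 2048~2048} intersect to {0~1024, 2048~2048};
// only the complement of that range is treated as modified below.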
8429
8430 if (obs.oi.size > 0) {
8431 interval_set<uint64_t> modified;
8432 modified.insert(0, obs.oi.size);
8433 overlaps.intersection_of(modified);
8434 modified.subtract(overlaps);
8435 ctx->modified_ranges.union_of(modified);
8436 }
8437
8438 // Adjust the cached objectcontext
8439 maybe_create_new_object(ctx, true);
8440 ctx->delta_stats.num_bytes -= obs.oi.size;
8441 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8442 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
8443 ctx->clean_regions.mark_omap_dirty();
8444 obs.oi.size = rollback_to->obs.oi.size;
8445 if (rollback_to->obs.oi.is_data_digest())
8446 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8447 else
8448 obs.oi.clear_data_digest();
8449 if (rollback_to->obs.oi.is_omap_digest())
8450 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8451 else
8452 obs.oi.clear_omap_digest();
8453
8454 if (rollback_to->obs.oi.has_manifest() && rollback_to->obs.oi.manifest.is_chunked()) {
8455 obs.oi.set_flag(object_info_t::FLAG_MANIFEST);
8456 obs.oi.manifest.type = rollback_to->obs.oi.manifest.type;
8457 obs.oi.manifest.chunk_map = rollback_to->obs.oi.manifest.chunk_map;
8458 ctx->cache_operation = true;
8459 ctx->delta_stats.num_objects_manifest++;
8460 }
8461
8462 if (rollback_to->obs.oi.is_omap()) {
8463 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8464 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8465 } else {
8466 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8467 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8468 }
8469 }
8470
8471 void PrimaryLogPG::_make_clone(
8472 OpContext *ctx,
8473 PGTransaction* t,
8474 ObjectContextRef obc,
8475 const hobject_t& head, const hobject_t& coid,
8476 object_info_t *poi)
8477 {
8478 bufferlist bv;
8479 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8480
8481 t->clone(coid, head);
8482 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8483 rmattr_maybe_cache(obc, t, SS_ATTR);
8484 }
8485
8486 void PrimaryLogPG::make_writeable(OpContext *ctx)
8487 {
8488 const hobject_t& soid = ctx->obs->oi.soid;
8489 SnapContext& snapc = ctx->snapc;
8490
8491 // clone?
8492 ceph_assert(soid.snap == CEPH_NOSNAP);
8493 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8494 << " snapc=" << snapc << dendl;
8495
8496 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8497 if (ctx->new_obs.exists) {
8498 // we will mark the object dirty
8499 if (ctx->undirty && was_dirty) {
8500 dout(20) << " clearing DIRTY flag" << dendl;
8501 ceph_assert(ctx->new_obs.oi.is_dirty());
8502 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8503 --ctx->delta_stats.num_objects_dirty;
8504 osd->logger->inc(l_osd_tier_clean);
8505 } else if (!was_dirty && !ctx->undirty) {
8506 dout(20) << " setting DIRTY flag" << dendl;
8507 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8508 ++ctx->delta_stats.num_objects_dirty;
8509 osd->logger->inc(l_osd_tier_dirty);
8510 }
8511 } else {
8512 if (was_dirty) {
8513 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8514 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8515 --ctx->delta_stats.num_objects_dirty;
8516 }
8517 }
8518
8519 if ((ctx->new_obs.exists &&
8520 ctx->new_obs.oi.is_omap()) &&
8521 (!ctx->obc->obs.exists ||
8522 !ctx->obc->obs.oi.is_omap())) {
8523 ++ctx->delta_stats.num_objects_omap;
8524 }
8525 if ((!ctx->new_obs.exists ||
8526 !ctx->new_obs.oi.is_omap()) &&
8527 (ctx->obc->obs.exists &&
8528 ctx->obc->obs.oi.is_omap())) {
8529 --ctx->delta_stats.num_objects_omap;
8530 }
8531
8532 if (ctx->new_snapset.seq > snapc.seq) {
8533 dout(10) << " op snapset is old" << dendl;
8534 }
8535
8536 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8537 snapc.snaps.size() && // there are snaps
8538 !ctx->cache_operation &&
8539 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8540 // clone
8541 hobject_t coid = soid;
8542 coid.snap = snapc.seq;
8543
8544 unsigned l;
8545 for (l = 1;
8546 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8547 l++) ;
8548
8549 vector<snapid_t> snaps(l);
8550 for (unsigned i=0; i<l; i++)
8551 snaps[i] = snapc.snaps[i];
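// The loop above collects the snaps newer than the object's snapset seq,
// newest first. Illustrative example: snapc.snaps = [8, 7, 5, 2] with
// new_snapset.seq = 4 gives l = 3 and snaps = [8, 7, 5]; snap 2 predates
// the snapset and is excluded.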
8552
8553 // prepare clone
8554 object_info_t static_snap_oi(coid);
8555 object_info_t *snap_oi;
8556 if (is_primary()) {
8557 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8558 ctx->clone_obc->destructor_callback =
8559 new C_PG_ObjectContext(this, ctx->clone_obc.get());
8560 ctx->clone_obc->obs.oi = static_snap_oi;
8561 ctx->clone_obc->obs.exists = true;
8562 ctx->clone_obc->ssc = ctx->obc->ssc;
8563 ctx->clone_obc->ssc->ref++;
8564 if (pool.info.is_erasure())
8565 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8566 snap_oi = &ctx->clone_obc->obs.oi;
8567 if (ctx->obc->obs.oi.has_manifest()) {
8568 if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
8569 ctx->obc->obs.oi.manifest.is_redirect()) {
8570 snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8571 snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
8572 snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
8573 } else if (ctx->obc->obs.oi.manifest.is_chunked()) {
8574 snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8575 snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
8576 snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
8577 } else {
8578 ceph_abort_msg("unrecognized manifest type");
8579 }
8580 }
8581 bool got = ctx->lock_manager.get_write_greedy(
8582 coid,
8583 ctx->clone_obc,
8584 ctx->op);
8585 ceph_assert(got);
8586 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8587 } else {
8588 snap_oi = &static_snap_oi;
8589 }
8590 snap_oi->version = ctx->at_version;
8591 snap_oi->prior_version = ctx->obs->oi.version;
8592 snap_oi->copy_user_bits(ctx->obs->oi);
8593
8594 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8595
8596 ctx->delta_stats.num_objects++;
8597 if (snap_oi->is_dirty()) {
8598 ctx->delta_stats.num_objects_dirty++;
8599 osd->logger->inc(l_osd_tier_dirty);
8600 }
8601 if (snap_oi->is_omap())
8602 ctx->delta_stats.num_objects_omap++;
8603 if (snap_oi->is_cache_pinned())
8604 ctx->delta_stats.num_objects_pinned++;
8605 if (snap_oi->has_manifest())
8606 ctx->delta_stats.num_objects_manifest++;
8607 ctx->delta_stats.num_object_clones++;
8608 ctx->new_snapset.clones.push_back(coid.snap);
8609 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
8610 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8611
8612 // clone_overlap should contain an entry for each clone
8613 // (an empty interval_set if there is no overlap)
8614 ctx->new_snapset.clone_overlap[coid.snap];
8615 if (ctx->obs->oi.size)
8616 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8617
8618 // log clone
8619 dout(10) << " cloning v " << ctx->obs->oi.version
8620 << " to " << coid << " v " << ctx->at_version
8621 << " snaps=" << snaps
8622 << " snapset=" << ctx->new_snapset << dendl;
8623 ctx->log.push_back(pg_log_entry_t(
8624 pg_log_entry_t::CLONE, coid, ctx->at_version,
8625 ctx->obs->oi.version,
8626 ctx->obs->oi.user_version,
8627 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8628 encode(snaps, ctx->log.back().snaps);
8629
8630 ctx->at_version.version++;
8631 }
8632
8633 // update most recent clone_overlap and usage stats
8634 if (ctx->new_snapset.clones.size() > 0) {
8635 // clone_overlap is the difference in range between the head and clones.
8636 // we need to check whether the most recent clone exists; if it has
8637 // been evicted, it's not included in the stats, but the clone_overlap
8638 // still exists in the snapset, so we should update the
8639 // clone_overlap to keep it consistent.
8640 hobject_t last_clone_oid = soid;
8641 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8642 interval_set<uint64_t> &newest_overlap =
8643 ctx->new_snapset.clone_overlap.rbegin()->second;
8644 ctx->modified_ranges.intersection_of(newest_overlap);
8645 if (is_present_clone(last_clone_oid)) {
8646 // modified_ranges is still in use by the clone
8647 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
8648 }
8649 newest_overlap.subtract(ctx->modified_ranges);
8650 }
8651
8652 if (snapc.seq > ctx->new_snapset.seq) {
8653 // update snapset with latest snap context
8654 ctx->new_snapset.seq = snapc.seq;
8655 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
8656 ctx->new_snapset.snaps = snapc.snaps;
8657 } else {
8658 ctx->new_snapset.snaps.clear();
8659 }
8660 }
8661 dout(20) << "make_writeable " << soid
8662 << " done, snapset=" << ctx->new_snapset << dendl;
8663 }
8664
8665
8666 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8667 interval_set<uint64_t>& modified, uint64_t offset,
8668 uint64_t length, bool write_full)
8669 {
8670 interval_set<uint64_t> ch;
8671 if (write_full) {
8672 if (oi.size)
8673 ch.insert(0, oi.size);
8674 } else if (length)
8675 ch.insert(offset, length);
8676 modified.union_of(ch);
8677 if (write_full ||
8678 (offset + length > oi.size && length)) {
8679 uint64_t new_size = offset + length;
8680 delta_stats.num_bytes -= oi.size;
8681 delta_stats.num_bytes += new_size;
8682 oi.size = new_size;
8683 }
8684
8685 delta_stats.num_wr++;
8686 delta_stats.num_wr_kb += shift_round_up(length, 10);
8687 }
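// Worked example: an 8192-byte write at offset 4096 into an 8000-byte object
// extends it to 12288 bytes, so num_bytes changes by 12288 - 8000 = +4288;
// num_wr increments and num_wr_kb grows by shift_round_up(8192, 10) = 8.
// A write entirely within the existing size leaves num_bytes and oi.size
// unchanged.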
8688
8689 void PrimaryLogPG::truncate_update_size_and_usage(
8690 object_stat_sum_t& delta_stats,
8691 object_info_t& oi,
8692 uint64_t truncate_size)
8693 {
8694 if (oi.size != truncate_size) {
8695 delta_stats.num_bytes -= oi.size;
8696 delta_stats.num_bytes += truncate_size;
8697 oi.size = truncate_size;
8698 }
8699 }
8700
8701 void PrimaryLogPG::complete_disconnect_watches(
8702 ObjectContextRef obc,
8703 const list<watch_disconnect_t> &to_disconnect)
8704 {
8705 for (list<watch_disconnect_t>::const_iterator i =
8706 to_disconnect.begin();
8707 i != to_disconnect.end();
8708 ++i) {
8709 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8710 auto watchers_entry = obc->watchers.find(watcher);
8711 if (watchers_entry != obc->watchers.end()) {
8712 WatchRef watch = watchers_entry->second;
8713 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8714 obc->watchers.erase(watcher);
8715 watch->remove(i->send_disconnect);
8716 } else {
8717 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8718 << watcher << dendl;
8719 }
8720 }
8721 }
8722
8723 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8724 {
8725 entity_name_t entity = ctx->reqid.name;
8726 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8727
8728 // disconnects first
8729 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8730
8731 ceph_assert(conn);
8732
8733 auto session = conn->get_priv();
8734 if (!session)
8735 return;
8736
8737 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8738 i != ctx->watch_connects.end();
8739 ++i) {
8740 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8741 dout(15) << "do_osd_op_effects applying watch connect on session "
8742 << session.get() << " watcher " << watcher << dendl;
8743 WatchRef watch;
8744 if (ctx->obc->watchers.count(watcher)) {
8745 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8746 << dendl;
8747 watch = ctx->obc->watchers[watcher];
8748 } else {
8749 dout(15) << "do_osd_op_effects new watcher " << watcher
8750 << dendl;
8751 watch = Watch::makeWatchRef(
8752 this, osd, ctx->obc, i->first.timeout_seconds,
8753 i->first.cookie, entity, conn->get_peer_addr());
8754 ctx->obc->watchers.insert(
8755 make_pair(
8756 watcher,
8757 watch));
8758 }
8759 watch->connect(conn, i->second);
8760 }
8761
8762 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8763 p != ctx->notifies.end();
8764 ++p) {
8765 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8766 ConnectionRef conn(ctx->op->get_req()->get_connection());
8767 NotifyRef notif(
8768 Notify::makeNotifyRef(
8769 conn,
8770 ctx->reqid.name.num(),
8771 p->bl,
8772 p->timeout,
8773 p->cookie,
8774 p->notify_id,
8775 ctx->obc->obs.oi.user_version,
8776 osd));
8777 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8778 ctx->obc->watchers.begin();
8779 i != ctx->obc->watchers.end();
8780 ++i) {
8781 dout(10) << "starting notify on watch " << i->first << dendl;
8782 i->second->start_notify(notif);
8783 }
8784 notif->init();
8785 }
8786
8787 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8788 p != ctx->notify_acks.end();
8789 ++p) {
8790 if (p->watch_cookie)
8791 dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
8792 else
8793 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8794 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8795 ctx->obc->watchers.begin();
8796 i != ctx->obc->watchers.end();
8797 ++i) {
8798 if (i->first.second != entity) continue;
8799 if (p->watch_cookie &&
8800 *(p->watch_cookie) != i->first.first) continue;
8801 dout(10) << "acking notify on watch " << i->first << dendl;
8802 i->second->notify_ack(p->notify_id, p->reply_bl);
8803 }
8804 }
8805 }
8806
8807 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8808 {
8809 ostringstream ss;
8810 ss << "temp_" << info.pgid << "_" << get_role()
8811 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8812 hobject_t hoid = target.make_temp_hobject(ss.str());
8813 dout(20) << __func__ << " " << hoid << dendl;
8814 return hoid;
8815 }
8816
8817 hobject_t PrimaryLogPG::get_temp_recovery_object(
8818 const hobject_t& target,
8819 eversion_t version)
8820 {
8821 ostringstream ss;
8822 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8823 << "_" << version
8824 << "_" << info.history.same_interval_since
8825 << "_" << target.snap;
8826 // pgid + version + interval + snapid is unique, and short
8827 hobject_t hoid = target.make_temp_hobject(ss.str());
8828 dout(20) << __func__ << " " << hoid << dendl;
8829 return hoid;
8830 }
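// Illustrative name produced here (values hypothetical): with pgid 1.2,
// version 45'12, same_interval_since 300 and a head object this yields
// "temp_recovering_1.2_45'12_300_head", while generate_temp_object() above
// uses the shorter "temp_<pgid>_<role>_<global_id>_<seq>" scheme.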
8831
8832 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8833 {
8834 ceph_assert(!ctx->ops->empty());
8835
8836 // valid snap context?
8837 if (!ctx->snapc.is_valid()) {
8838 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8839 return -EINVAL;
8840 }
8841
8842 // prepare the actual mutation
8843 int result = do_osd_ops(ctx, *ctx->ops);
8844 if (result < 0) {
8845 if (ctx->op->may_write() &&
8846 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8847 // need to save the error code in the pg log, to detect dup ops,
8848 // but do nothing else
8849 ctx->update_log_only = true;
8850 }
8851 return result;
8852 }
8853
8854 // read-op? write-op noop? done?
8855 if (ctx->op_t->empty() && !ctx->modify) {
8856 if (ctx->pending_async_reads.empty())
8857 unstable_stats.add(ctx->delta_stats);
8858 if (ctx->op->may_write() &&
8859 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8860 ctx->update_log_only = true;
8861 }
8862 return result;
8863 }
8864
8865 // check for full
8866 if ((ctx->delta_stats.num_bytes > 0 ||
8867 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
8868 pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
8869 auto m = ctx->op->get_req<MOSDOp>();
8870 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8871 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8872 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8873 << dendl;
8874 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8875 // they tried, they failed.
8876 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
8877 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8878 } else {
8879 // drop request
8880 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8881 return -EAGAIN;
8882 }
8883 }
8884
8885 const hobject_t& soid = ctx->obs->oi.soid;
8886 // clone, if necessary
8887 if (soid.snap == CEPH_NOSNAP)
8888 make_writeable(ctx);
8889
8890 finish_ctx(ctx,
8891 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8892 pg_log_entry_t::DELETE,
8893 result);
8894
8895 return result;
8896 }
8897
8898 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
8899 {
8900 const hobject_t& soid = ctx->obs->oi.soid;
8901 dout(20) << __func__ << " " << soid << " " << ctx
8902 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8903 << dendl;
8904 utime_t now = ceph_clock_now();
8905
8906 jspan span;
8907 if (ctx->op) {
8908 span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
8909 }
8910
8911 // Drop the reference if a deduped chunk is modified
8912 if (ctx->new_obs.oi.is_dirty() &&
8913 (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
8914 !ctx->cache_operation &&
8915 log_op_type != pg_log_entry_t::PROMOTE) {
8916 update_chunk_map_by_dirty(ctx);
8917 // If a clone is being created, do not drop the reference for the manifest object
8918 if (!ctx->delta_stats.num_object_clones) {
8919 dec_refcount_by_dirty(ctx);
8920 }
8921 }
8922
8923 // finish and log the op.
8924 if (ctx->user_modify) {
8925 // update the user_version for any modify ops, except for the watch op
8926 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8927 /* In order for new clients and old clients to interoperate properly
8928 * when exchanging versions, we need to lower bound the user_version
8929 * (which our new clients pay proper attention to)
8930 * by the at_version (which is all the old clients can ever see). */
8931 if (ctx->at_version.version > ctx->user_at_version)
8932 ctx->user_at_version = ctx->at_version.version;
8933 ctx->new_obs.oi.user_version = ctx->user_at_version;
8934 }
8935 ctx->bytes_written = ctx->op_t->get_bytes_written();
8936
8937 if (ctx->new_obs.exists) {
8938 ctx->new_obs.oi.version = ctx->at_version;
8939 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8940 ctx->new_obs.oi.last_reqid = ctx->reqid;
8941 if (ctx->mtime != utime_t()) {
8942 ctx->new_obs.oi.mtime = ctx->mtime;
8943 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8944 ctx->new_obs.oi.local_mtime = now;
8945 } else {
8946 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8947 }
8948
8949 // object_info_t
8950 map <string, bufferlist, less<>> attrs;
8951 bufferlist bv(sizeof(ctx->new_obs.oi));
8952 encode(ctx->new_obs.oi, bv,
8953 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8954 attrs[OI_ATTR] = std::move(bv);
8955
8956 // snapset
8957 if (soid.snap == CEPH_NOSNAP) {
8958 dout(10) << " final snapset " << ctx->new_snapset
8959 << " in " << soid << dendl;
8960 bufferlist bss;
8961 encode(ctx->new_snapset, bss);
8962 attrs[SS_ATTR] = std::move(bss);
8963 } else {
8964 dout(10) << " no snapset (this is a clone)" << dendl;
8965 }
8966 ctx->op_t->setattrs(soid, attrs);
8967 } else {
8968 // reset cached oi
8969 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8970 }
8971
8972 // append to log
8973 ctx->log.push_back(
8974 pg_log_entry_t(log_op_type, soid, ctx->at_version,
8975 ctx->obs->oi.version,
8976 ctx->user_at_version, ctx->reqid,
8977 ctx->mtime,
8978 (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
8979 if (ctx->op && ctx->op->allows_returnvec()) {
8980 // also the per-op values
8981 ctx->log.back().set_op_returns(*ctx->ops);
8982 dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
8983 << dendl;
8984 }
8985
8986 ctx->log.back().clean_regions = ctx->clean_regions;
8987 dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
8988
8989 if (soid.snap < CEPH_NOSNAP) {
8990 switch (log_op_type) {
8991 case pg_log_entry_t::MODIFY:
8992 case pg_log_entry_t::PROMOTE:
8993 case pg_log_entry_t::CLEAN:
8994 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8995 << dendl;
8996 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
8997 break;
8998 default:
8999 break;
9000 }
9001 }
9002
9003 if (!ctx->extra_reqids.empty()) {
9004 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
9005 << ctx->extra_reqid_return_codes << dendl;
9006 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
9007 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
9008 }
9009
9010 // apply new object state.
9011 ctx->obc->obs = ctx->new_obs;
9012
9013 if (soid.is_head() && !ctx->obc->obs.exists) {
9014 ctx->obc->ssc->exists = false;
9015 ctx->obc->ssc->snapset = SnapSet();
9016 } else {
9017 ctx->obc->ssc->exists = true;
9018 ctx->obc->ssc->snapset = ctx->new_snapset;
9019 }
9020 }
9021
9022 void PrimaryLogPG::apply_stats(
9023 const hobject_t &soid,
9024 const object_stat_sum_t &delta_stats) {
9025
9026 recovery_state.apply_op_stats(soid, delta_stats);
9027 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
9028 i != get_backfill_targets().end();
9029 ++i) {
9030 pg_shard_t bt = *i;
9031 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
9032 if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
9033 pending_backfill_updates[soid].stats.add(delta_stats);
9034 }
9035 }
9036
9037 m_scrubber->stats_of_handled_objects(delta_stats, soid);
9038 }
9039
9040 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
9041 {
9042 auto m = ctx->op->get_req<MOSDOp>();
9043 ceph_assert(ctx->async_reads_complete());
9044
9045 for (auto p = ctx->ops->begin();
9046 p != ctx->ops->end() && result >= 0; ++p) {
9047 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
9048 result = p->rval;
9049 break;
9050 }
9051 ctx->bytes_read += p->outdata.length();
9052 }
9053 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
9054
9055 MOSDOpReply *reply = ctx->reply;
9056 ctx->reply = nullptr;
9057
9058 if (result >= 0) {
9059 if (!ctx->ignore_log_op_stats) {
9060 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
9061
9062 publish_stats_to_osd();
9063 }
9064
9065 // on read, return the current object version
9066 if (ctx->obs) {
9067 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
9068 } else {
9069 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
9070 }
9071 } else if (result == -ENOENT) {
9072 // on ENOENT, set a floor for what the next user version will be.
9073 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
9074 }
9075
9076 reply->set_result(result);
9077 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9078 osd->send_message_osd_client(reply, m->get_connection());
9079 close_op_ctx(ctx);
9080 }
9081
9082 // ========================================================================
9083 // copyfrom
9084
9085 struct C_Copyfrom : public Context {
9086 PrimaryLogPGRef pg;
9087 hobject_t oid;
9088 epoch_t last_peering_reset;
9089 ceph_tid_t tid;
9090 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
9091 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
9092 const PrimaryLogPG::CopyOpRef& c)
9093 : pg(p), oid(o), last_peering_reset(lpr),
9094 tid(0), cop(c)
9095 {}
9096 void finish(int r) override {
9097 if (r == -ECANCELED)
9098 return;
9099 std::scoped_lock l{*pg};
9100 if (last_peering_reset == pg->get_last_peering_reset()) {
9101 pg->process_copy_chunk(oid, tid, r);
9102 cop.reset();
9103 }
9104 }
9105 };
9106
9107 struct C_CopyFrom_AsyncReadCb : public Context {
9108 OSDOp *osd_op;
9109 object_copy_data_t reply_obj;
9110 uint64_t features;
9111 size_t len;
9112 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
9113 osd_op(osd_op), features(features), len(0) {}
9114 void finish(int r) override {
9115 osd_op->rval = r;
9116 if (r < 0) {
9117 return;
9118 }
9119
9120 ceph_assert(len > 0);
9121 ceph_assert(len <= reply_obj.data.length());
9122 bufferlist bl;
9123 bl.substr_of(reply_obj.data, 0, len);
9124 reply_obj.data.swap(bl);
9125 encode(reply_obj, osd_op->outdata, features);
9126 }
9127 };
9128
9129 struct C_CopyChunk : public Context {
9130 PrimaryLogPGRef pg;
9131 hobject_t oid;
9132 epoch_t last_peering_reset;
9133 ceph_tid_t tid;
9134 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
9135 uint64_t offset = 0;
9136 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
9137 const PrimaryLogPG::CopyOpRef& c)
9138 : pg(p), oid(o), last_peering_reset(lpr),
9139 tid(0), cop(c)
9140 {}
9141 void finish(int r) override {
9142 if (r == -ECANCELED)
9143 return;
9144 std::scoped_lock l{*pg};
9145 if (last_peering_reset == pg->get_last_peering_reset()) {
9146 pg->process_copy_chunk_manifest(oid, tid, r, offset);
9147 cop.reset();
9148 }
9149 }
9150 };
9151
9152 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
9153 OSDOp& osd_op, ObjectContextRef &obc)
9154 {
9155 object_info_t& oi = obc->obs.oi;
9156 hobject_t& soid = oi.soid;
9157 int result = 0;
9158 object_copy_cursor_t cursor;
9159 uint64_t out_max;
9160 try {
9161 decode(cursor, bp);
9162 decode(out_max, bp);
9163 }
9164 catch (ceph::buffer::error& e) {
9165 result = -EINVAL;
9166 return result;
9167 }
9168
9169 const MOSDOp *op = ctx->op->get_req<MOSDOp>();
9170 uint64_t features = op->get_features();
9171
9172 bool async_read_started = false;
9173 object_copy_data_t _reply_obj;
9174 C_CopyFrom_AsyncReadCb *cb = nullptr;
9175 if (pool.info.is_erasure()) {
9176 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
9177 }
9178 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
9179 // size, mtime
9180 reply_obj.size = oi.size;
9181 reply_obj.mtime = oi.mtime;
9182 ceph_assert(obc->ssc);
9183 if (soid.snap < CEPH_NOSNAP) {
9184 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
9185 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
9186 reply_obj.snaps = p->second;
9187 } else {
9188 reply_obj.snap_seq = obc->ssc->snapset.seq;
9189 }
9190 if (oi.is_data_digest()) {
9191 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
9192 reply_obj.data_digest = oi.data_digest;
9193 }
9194 if (oi.is_omap_digest()) {
9195 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
9196 reply_obj.omap_digest = oi.omap_digest;
9197 }
9198 reply_obj.truncate_seq = oi.truncate_seq;
9199 reply_obj.truncate_size = oi.truncate_size;
9200
9201 // attrs
9202 map<string,bufferlist,less<>>& out_attrs = reply_obj.attrs;
9203 if (!cursor.attr_complete) {
9204 result = getattrs_maybe_cache(
9205 ctx->obc,
9206 &out_attrs);
9207 if (result < 0) {
9208 if (cb) {
9209 delete cb;
9210 }
9211 return result;
9212 }
9213 cursor.attr_complete = true;
9214 dout(20) << " got attrs" << dendl;
9215 }
9216
9217 int64_t left = out_max - osd_op.outdata.length();
9218
9219 // data
9220 bufferlist& bl = reply_obj.data;
9221 if (left > 0 && !cursor.data_complete) {
9222 if (cursor.data_offset < oi.size) {
9223 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
9224 if (cb) {
9225 async_read_started = true;
9226 ctx->pending_async_reads.push_back(
9227 make_pair(
9228 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
9229 make_pair(&bl, cb)));
9230 cb->len = max_read;
9231
9232 ctx->op_finishers[ctx->current_osd_subop_num].reset(
9233 new ReadFinisher(osd_op));
9234 result = -EINPROGRESS;
9235
9236 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
9237 } else {
9238 result = pgbackend->objects_read_sync(
9239 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
9240 if (result < 0)
9241 return result;
9242 }
9243 left -= max_read;
9244 cursor.data_offset += max_read;
9245 }
9246 if (cursor.data_offset == oi.size) {
9247 cursor.data_complete = true;
9248 dout(20) << " got data" << dendl;
9249 }
9250 ceph_assert(cursor.data_offset <= oi.size);
9251 }
9252
9253 // omap
9254 uint32_t omap_keys = 0;
9255 if (!pool.info.supports_omap() || !oi.is_omap()) {
9256 cursor.omap_complete = true;
9257 } else {
9258 if (left > 0 && !cursor.omap_complete) {
9259 ceph_assert(cursor.data_complete);
9260 if (cursor.omap_offset.empty()) {
9261 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
9262 &reply_obj.omap_header);
9263 }
9264 bufferlist omap_data;
9265 ObjectMap::ObjectMapIterator iter =
9266 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
9267 ceph_assert(iter);
9268 iter->upper_bound(cursor.omap_offset);
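// note: each encoded key/value carries a 4-byte length prefix, so the
// budget accounting below charges the string lengths plus 2*4 bytes of
// framing per pair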
9269 for (; iter->valid(); iter->next()) {
9270 ++omap_keys;
9271 encode(iter->key(), omap_data);
9272 encode(iter->value(), omap_data);
9273 left -= iter->key().length() + 4 + iter->value().length() + 4;
9274 if (left <= 0)
9275 break;
9276 }
9277 if (omap_keys) {
9278 encode(omap_keys, reply_obj.omap_data);
9279 reply_obj.omap_data.claim_append(omap_data);
9280 }
9281 if (iter->valid()) {
9282 cursor.omap_offset = iter->key();
9283 } else {
9284 cursor.omap_complete = true;
9285 dout(20) << " got omap" << dendl;
9286 }
9287 }
9288 }
9289
9290 if (cursor.is_complete()) {
9291 // include reqids only in the final step. this is a bit fragile
9292 // but it works...
9293 recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
9294 &reply_obj.reqids,
9295 &reply_obj.reqid_return_codes);
9296 dout(20) << " got reqids" << dendl;
9297 }
9298
9299 dout(20) << " cursor.is_complete=" << cursor.is_complete()
9300 << " " << out_attrs.size() << " attrs"
9301 << " " << bl.length() << " bytes"
9302 << " " << reply_obj.omap_header.length() << " omap header bytes"
9303 << " " << reply_obj.omap_data.length() << " omap data bytes in "
9304 << omap_keys << " keys"
9305 << " " << reply_obj.reqids.size() << " reqids"
9306 << dendl;
9307 reply_obj.cursor = cursor;
9308 if (!async_read_started) {
9309 encode(reply_obj, osd_op.outdata, features);
9310 }
9311 if (cb && !async_read_started) {
9312 delete cb;
9313 }
9314
9315 if (result > 0) {
9316 result = 0;
9317 }
9318 return result;
9319 }
9320
9321 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
9322 OSDOp& osd_op)
9323 {
9324 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9325 uint64_t features = m->get_features();
9326 object_copy_data_t reply_obj;
9327
9328 recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
9329 &reply_obj.reqid_return_codes);
9330 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
9331 encode(reply_obj, osd_op.outdata, features);
9332 osd_op.rval = -ENOENT;
9333 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
9334 reply->set_result(-ENOENT);
9335 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9336 osd->send_message_osd_client(reply, m->get_connection());
9337 }
9338
9339 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
9340 hobject_t src, object_locator_t oloc,
9341 version_t version, unsigned flags,
9342 bool mirror_snapset,
9343 unsigned src_obj_fadvise_flags,
9344 unsigned dest_obj_fadvise_flags)
9345 {
9346 const hobject_t& dest = obc->obs.oi.soid;
9347 dout(10) << __func__ << " " << dest
9348 << " from " << src << " " << oloc << " v" << version
9349 << " flags " << flags
9350 << (mirror_snapset ? " mirror_snapset" : "")
9351 << dendl;
9352
9353 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
9354
9355 // cancel a previous in-progress copy?
9356 if (copy_ops.count(dest)) {
9357 // FIXME: if the src etc match, we could avoid restarting from the
9358 // beginning.
9359 CopyOpRef cop = copy_ops[dest];
9360 vector<ceph_tid_t> tids;
9361 cancel_copy(cop, false, &tids);
9362 osd->objecter->op_cancel(tids, -ECANCELED);
9363 }
9364
9365 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
9366 mirror_snapset, src_obj_fadvise_flags,
9367 dest_obj_fadvise_flags));
9368 copy_ops[dest] = cop;
9369 obc->start_block();
9370
9371 if (!obc->obs.oi.has_manifest()) {
9372 _copy_some(obc, cop);
9373 } else {
9374 if (obc->obs.oi.manifest.is_redirect()) {
9375 _copy_some(obc, cop);
9376 } else if (obc->obs.oi.manifest.is_chunked()) {
9377 auto p = obc->obs.oi.manifest.chunk_map.begin();
9378 _copy_some_manifest(obc, cop, p->first);
9379 } else {
9380 ceph_abort_msg("unrecognized manifest type");
9381 }
9382 }
9383 }
9384
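// _copy_some drives one round of the chunked copy protocol: it issues a
// single copy-get of up to get_copy_chunk_size() bytes (plus attrs/omap),
// resuming from cop->cursor. The completion (C_Copyfrom) lands in
// process_copy_chunk(), which either persists the partial chunk to a temp
// object and calls back in here for the next round, or finalizes the copy.
// Schematically (a summary, not literal code):
//
//   while (!cop->cursor.is_complete())
//     copy_get(cursor) -> process_copy_chunk() -> _copy_some()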
9385 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
9386 {
9387 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9388
9389 unsigned flags = 0;
9390 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9391 flags |= CEPH_OSD_FLAG_FLUSH;
9392 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9393 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9394 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9395 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9396 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9397 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9398 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9399 flags |= CEPH_OSD_FLAG_RWORDERED;
9400
9401 C_GatherBuilder gather(cct);
9402
9403 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9404 // list snaps too.
9405 ceph_assert(cop->src.snap == CEPH_NOSNAP);
9406 ObjectOperation op;
9407 op.list_snaps(&cop->results.snapset, NULL);
9408 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9409 CEPH_SNAPDIR, NULL,
9410 flags, gather.new_sub(), NULL);
9411 cop->objecter_tid2 = tid;
9412 }
9413
9414 ObjectOperation op;
9415 if (cop->results.user_version) {
9416 op.assert_version(cop->results.user_version);
9417 } else {
9418 // we should learn the version after the first chunk, if we didn't know
9419 // it already!
9420 ceph_assert(cop->cursor.is_initial());
9421 }
9422 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9423 &cop->results.object_size, &cop->results.mtime,
9424 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9425 &cop->results.snaps, &cop->results.snap_seq,
9426 &cop->results.flags,
9427 &cop->results.source_data_digest,
9428 &cop->results.source_omap_digest,
9429 &cop->results.reqids,
9430 &cop->results.reqid_return_codes,
9431 &cop->results.truncate_seq,
9432 &cop->results.truncate_size,
9433 &cop->rval);
9434 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9435
9436 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9437 get_last_peering_reset(), cop);
9438 gather.set_finisher(new C_OnFinisher(fin,
9439 osd->get_objecter_finisher(get_pg_shard())));
9440
9441 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9442 cop->src.snap, NULL,
9443 flags,
9444 gather.new_sub(),
9445 // discover the object version if we don't know it yet
9446 cop->results.user_version ? NULL : &cop->results.user_version);
9447 fin->tid = tid;
9448 cop->objecter_tid = tid;
9449 gather.activate();
9450 }
9451
9452 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9453 {
9454 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9455
9456 unsigned flags = 0;
9457 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9458 flags |= CEPH_OSD_FLAG_FLUSH;
9459 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9460 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9461 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9462 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9463 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9464 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9465 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9466 flags |= CEPH_OSD_FLAG_RWORDERED;
9467
9468 int num_chunks = 0;
9469 uint64_t last_offset = 0, chunks_size = 0;
9470 object_manifest_t *manifest = &obc->obs.oi.manifest;
9471 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9472 for (;iter != manifest->chunk_map.end(); ++iter) {
9473 num_chunks++;
9474 chunks_size += iter->second.length;
9475 last_offset = iter->first;
9476 if (get_copy_chunk_size() < chunks_size) {
9477 break;
9478 }
9479 }
9480
9481 cop->num_chunk = num_chunks;
9482 cop->start_offset = start_offset;
9483 cop->last_offset = last_offset;
9484 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9485 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9486 << " last_offset: " << last_offset << dendl;
9487
9488 iter = manifest->chunk_map.find(start_offset);
9489 for (;iter != manifest->chunk_map.end(); ++iter) {
9490 uint64_t obj_offset = iter->first;
9491 uint64_t length = manifest->chunk_map[iter->first].length;
9492 hobject_t soid = manifest->chunk_map[iter->first].oid;
9493 object_locator_t oloc(soid);
9494 CopyCallback * cb = NULL;
9495 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9496 cop->results.user_version, cop->flags, cop->mirror_snapset,
9497 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9498 sub_cop->cursor.data_offset = obj_offset;
9499 cop->chunk_cops[obj_offset] = sub_cop;
9500
9501 int s = sub_cop->chunk_ops.size();
9502 sub_cop->chunk_ops.resize(s+1);
9503 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9504 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9505 sub_cop->chunk_ops[s].op.extent.length = length;
9506
9507 ObjectOperation op;
9508 op.dup(sub_cop->chunk_ops);
9509
9510 if (cop->results.user_version) {
9511 op.assert_version(cop->results.user_version);
9512 } else {
9513 // we should learn the version after the first chunk, if we didn't know
9514 // it already!
9515 ceph_assert(cop->cursor.is_initial());
9516 }
9517 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9518
9519 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9520 get_last_peering_reset(), cop);
9521 fin->offset = obj_offset;
9522
9523 ceph_tid_t tid = osd->objecter->read(
9524 soid.oid, oloc, op,
9525 sub_cop->src.snap, NULL,
9526 flags,
9527 new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9528 // discover the object version if we don't know it yet
9529 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
9530 fin->tid = tid;
9531 sub_cop->objecter_tid = tid;
9532
9533 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9534 << manifest->chunk_map[iter->first].offset
9535 << " length: " << length << " pool id: " << oloc.pool
9536 << " tid: " << tid << dendl;
9537
9538 if (last_offset <= iter->first) {
9539 break;
9540 }
9541 }
9542 }
9543
9544 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9545 {
9546 dout(10) << __func__ << " " << oid << " tid " << tid
9547 << " " << cpp_strerror(r) << dendl;
9548 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9549 if (p == copy_ops.end()) {
9550 dout(10) << __func__ << " no copy_op found" << dendl;
9551 return;
9552 }
9553 CopyOpRef cop = p->second;
9554 if (tid != cop->objecter_tid) {
9555 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9556 << " tid " << cop->objecter_tid << dendl;
9557 return;
9558 }
9559
9560 if (cop->omap_data.length() || cop->omap_header.length())
9561 cop->results.has_omap = true;
9562
9563 if (r >= 0 && !pool.info.supports_omap() &&
9564 (cop->omap_data.length() || cop->omap_header.length())) {
9565 r = -EOPNOTSUPP;
9566 }
9567 cop->objecter_tid = 0;
9568 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9569 ObjectContextRef& cobc = cop->obc;
9570
9571 if (r < 0)
9572 goto out;
9573
9574 ceph_assert(cop->rval >= 0);
9575
9576 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9577 // verify snap hasn't been deleted
9578 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9579 while (p != cop->results.snaps.end()) {
9580 // make best effort to sanitize snaps/clones.
9581 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
9582 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9583 << dendl;
9584 for (vector<snapid_t>::iterator q = p + 1;
9585 q != cop->results.snaps.end();
9586 ++q)
9587 *(q - 1) = *q;
9588 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9589 } else {
9590 ++p;
9591 }
9592 }
9593 if (cop->results.snaps.empty()) {
9594 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9595 r = -ENOENT;
9596 goto out;
9597 }
9598 }
9599
9600 ceph_assert(cop->rval >= 0);
9601
9602 if (!cop->temp_cursor.data_complete) {
9603 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9604 }
9605 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9606 if (cop->omap_header.length()) {
9607 cop->results.omap_digest =
9608 cop->omap_header.crc32c(cop->results.omap_digest);
9609 }
9610 if (cop->omap_data.length()) {
9611 bufferlist keys;
9612 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9613 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9614 }
9615 }
9616
9617 if (!cop->temp_cursor.attr_complete) {
9618 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9619 p != cop->attrs.end();
9620 ++p) {
9621 cop->results.attrs[string("_") + p->first] = p->second;
9622 }
9623 cop->attrs.clear();
9624 }
9625
9626 if (!cop->cursor.is_complete()) {
9627 // write out what we have so far
9628 if (cop->temp_cursor.is_initial()) {
9629 ceph_assert(!cop->results.started_temp_obj);
9630 cop->results.started_temp_obj = true;
9631 cop->results.temp_oid = generate_temp_object(oid);
9632 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9633 }
9634 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9635 OpContextUPtr ctx = simple_opc_create(tempobc);
9636 if (cop->temp_cursor.is_initial()) {
9637 ctx->new_temp_oid = cop->results.temp_oid;
9638 }
9639 _write_copy_chunk(cop, ctx->op_t.get());
9640 simple_opc_submit(std::move(ctx));
9641 dout(10) << __func__ << " fetching more" << dendl;
9642 _copy_some(cobc, cop);
9643 return;
9644 }
9645
9646 // verify digests?
9647 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9648 dout(20) << __func__ << std::hex
9649 << " got digest: rx data 0x" << cop->results.data_digest
9650 << " omap 0x" << cop->results.omap_digest
9651 << ", source: data 0x" << cop->results.source_data_digest
9652 << " omap 0x" << cop->results.source_omap_digest
9653 << std::dec
9654 << " flags " << cop->results.flags
9655 << dendl;
9656 }
9657 if (cop->results.is_data_digest() &&
9658 cop->results.data_digest != cop->results.source_data_digest) {
9659 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9660 << " != source 0x" << cop->results.source_data_digest << std::dec
9661 << dendl;
9662 osd->clog->error() << info.pgid << " copy from " << cop->src
9663 << " to " << cop->obc->obs.oi.soid << std::hex
9664 << " data digest 0x" << cop->results.data_digest
9665 << " != source 0x" << cop->results.source_data_digest
9666 << std::dec;
9667 r = -EIO;
9668 goto out;
9669 }
9670 if (cop->results.is_omap_digest() &&
9671 cop->results.omap_digest != cop->results.source_omap_digest) {
9672 derr << __func__ << std::hex
9673 << " omap digest 0x" << cop->results.omap_digest
9674 << " != source 0x" << cop->results.source_omap_digest
9675 << std::dec << dendl;
9676 osd->clog->error() << info.pgid << " copy from " << cop->src
9677 << " to " << cop->obc->obs.oi.soid << std::hex
9678 << " omap digest 0x" << cop->results.omap_digest
9679 << " != source 0x" << cop->results.source_omap_digest
9680 << std::dec;
9681 r = -EIO;
9682 goto out;
9683 }
9684 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9685 derr << __func__ << " injecting copyfrom failure" << dendl;
9686 r = -EIO;
9687 goto out;
9688 }
9689
9690 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9691 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9692 ObjectState& obs = cop->obc->obs;
9693 if (cop->temp_cursor.is_initial()) {
9694 dout(20) << "fill_in_final_tx: writing "
9695 << "directly to final object" << dendl;
9696 // write directly to final object
9697 cop->results.temp_oid = obs.oi.soid;
9698 _write_copy_chunk(cop, t);
9699 } else {
9700 // finish writing to temp object, then move into place
9701 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9702 if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) {
9703 /* In redirect manifest case, the object exists in the upper tier.
9704 * So, to avoid a conflict when rename() is called, remove existing
9705 * object first
9706 */
9707 t->remove(obs.oi.soid);
9708 }
9709 _write_copy_chunk(cop, t);
9710 t->rename(obs.oi.soid, cop->results.temp_oid);
9711 }
9712 t->setattrs(obs.oi.soid, cop->results.attrs);
9713 });
9714
9715 dout(20) << __func__ << " success; committing" << dendl;
9716
9717 out:
9718 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9719 CopyCallbackResults results(r, &cop->results);
9720 cop->cb->complete(results);
9721
9722 copy_ops.erase(cobc->obs.oi.soid);
9723 cobc->stop_block();
9724
9725 if (r < 0 && cop->results.started_temp_obj) {
9726 dout(10) << __func__ << " deleting partial temp object "
9727 << cop->results.temp_oid << dendl;
9728 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9729 OpContextUPtr ctx = simple_opc_create(tempobc);
9730 ctx->op_t->remove(cop->results.temp_oid);
9731 ctx->discard_temp_oid = cop->results.temp_oid;
9732 simple_opc_submit(std::move(ctx));
9733 }
9734
9735 // cancel and requeue proxy ops on this object
9736 if (!r) {
9737 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9738 }
9739
9740 kick_object_context_blocked(cobc);
9741 }
9742
9743 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9744 {
9745 dout(10) << __func__ << " " << oid << " tid " << tid
9746 << " " << cpp_strerror(r) << dendl;
9747 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9748 if (p == copy_ops.end()) {
9749 dout(10) << __func__ << " no copy_op found" << dendl;
9750 return;
9751 }
9752 CopyOpRef obj_cop = p->second;
9753 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9754
9755 if (tid != chunk_cop->objecter_tid) {
9756 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9757 << " tid " << chunk_cop->objecter_tid << dendl;
9758 return;
9759 }
9760
9761 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9762 r = -EOPNOTSUPP;
9763 }
9764
9765 chunk_cop->objecter_tid = 0;
9766 chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9767 ObjectContextRef& cobc = obj_cop->obc;
9768 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9769
9770 if (r < 0) {
9771 obj_cop->failed = true;
9772 goto out;
9773 }
9774
9775 if (obj_cop->failed) {
9776 return;
9777 }
9778 if (!chunk_data.outdata.length()) {
9779 r = -EIO;
9780 obj_cop->failed = true;
9781 goto out;
9782 }
9783
9784 obj_cop->num_chunk--;
9785
9786 /* check whether all of the copy ops have completed */
9787 if (obj_cop->num_chunk) {
9788 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9789 return;
9790 }
9791
9792 {
9793 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9794 if (!ctx->lock_manager.take_write_lock(
9795 obj_cop->obc->obs.oi.soid,
9796 obj_cop->obc)) {
9797 // a recovery op can hold the read lock,
9798 // so we need to wait for recovery to complete
9799 r = -EAGAIN;
9800 obj_cop->failed = true;
9801 close_op_ctx(ctx.release());
9802 goto out;
9803 }
9804 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9805
9806 PGTransaction *t = ctx->op_t.get();
9807 ObjectState& obs = ctx->new_obs;
9808 for (auto p : obj_cop->chunk_cops) {
9809 OSDOp &sub_chunk = p.second->chunk_ops[0];
9810 t->write(cobc->obs.oi.soid,
9811 p.second->cursor.data_offset,
9812 sub_chunk.outdata.length(),
9813 sub_chunk.outdata,
9814 p.second->dest_obj_fadvise_flags);
9815 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9816 << " length: " << sub_chunk.outdata.length() << dendl;
9817 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9818 p.second->cursor.data_offset, sub_chunk.outdata.length());
9819 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9820 ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
9821 sub_chunk.outdata.clear();
9822 }
9823 obs.oi.clear_data_digest();
9824 ctx->at_version = get_next_version();
9825 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9826 simple_opc_submit(std::move(ctx));
9827 obj_cop->chunk_cops.clear();
9828
9829 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9830 /* check remaining work */
9831 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9832 if (obj_cop->last_offset < p->first) {
9833 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9834 if (obj_cop->last_offset < en.first) {
9835 _copy_some_manifest(cobc, obj_cop, en.first);
9836 return;
9837 }
9838 }
9839 }
9840 }
9841 }
9842
9843 out:
9844 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9845 CopyCallbackResults results(r, &obj_cop->results);
9846 obj_cop->cb->complete(results);
9847
9848 copy_ops.erase(cobc->obs.oi.soid);
9849 cobc->stop_block();
9850
9851 // cancel and requeue proxy ops on this object
9852 if (!r) {
9853 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9854 }
9855
9856 kick_object_context_blocked(cobc);
9857 }
9858
9859 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9860 vector<ceph_tid_t> tids;
9861 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9862 it != proxyread_ops.end();) {
9863 if (it->second->soid == oid) {
9864 cancel_proxy_read((it++)->second, &tids);
9865 } else {
9866 ++it;
9867 }
9868 }
9869 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9870 it != proxywrite_ops.end();) {
9871 if (it->second->soid == oid) {
9872 cancel_proxy_write((it++)->second, &tids);
9873 } else {
9874 ++it;
9875 }
9876 }
9877 osd->objecter->op_cancel(tids, -ECANCELED);
9878 kick_proxy_ops_blocked(oid);
9879 }
9880
9881 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9882 {
9883 dout(20) << __func__ << " " << cop
9884 << " " << cop->attrs.size() << " attrs"
9885 << " " << cop->data.length() << " bytes"
9886 << " " << cop->omap_header.length() << " omap header bytes"
9887 << " " << cop->omap_data.length() << " omap data bytes"
9888 << dendl;
9889 if (!cop->temp_cursor.attr_complete) {
9890 t->create(cop->results.temp_oid);
9891 }
9892 if (!cop->temp_cursor.data_complete) {
9893 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9894 cop->cursor.data_offset);
9895 if (pool.info.required_alignment() &&
9896 !cop->cursor.data_complete) {
9897 /**
9898 * Trim off the unaligned bit at the end; we'll adjust cursor.data_offset
9899 * to pick it up on the next pass.
9900 */
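// e.g. (illustrative) with required_alignment 4096 and 10000 buffered
// bytes: to_trim = 10000 % 4096 = 1808, so we write the aligned 8192
// bytes now and rewind cursor.data_offset by 1808 for the next pass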
9901 ceph_assert(cop->temp_cursor.data_offset %
9902 pool.info.required_alignment() == 0);
9903 if (cop->data.length() % pool.info.required_alignment() != 0) {
9904 uint64_t to_trim =
9905 cop->data.length() % pool.info.required_alignment();
9906 bufferlist bl;
9907 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9908 cop->data.swap(bl);
9909 cop->cursor.data_offset -= to_trim;
9910 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9911 cop->cursor.data_offset);
9912 }
9913 }
9914 if (cop->data.length()) {
9915 t->write(
9916 cop->results.temp_oid,
9917 cop->temp_cursor.data_offset,
9918 cop->data.length(),
9919 cop->data,
9920 cop->dest_obj_fadvise_flags);
9921 }
9922 cop->data.clear();
9923 }
9924 if (pool.info.supports_omap()) {
9925 if (!cop->temp_cursor.omap_complete) {
9926 if (cop->omap_header.length()) {
9927 t->omap_setheader(
9928 cop->results.temp_oid,
9929 cop->omap_header);
9930 cop->omap_header.clear();
9931 }
9932 if (cop->omap_data.length()) {
9933 map<string,bufferlist> omap;
9934 bufferlist::const_iterator p = cop->omap_data.begin();
9935 decode(omap, p);
9936 t->omap_setkeys(cop->results.temp_oid, omap);
9937 cop->omap_data.clear();
9938 }
9939 }
9940 } else {
9941 ceph_assert(cop->omap_header.length() == 0);
9942 ceph_assert(cop->omap_data.length() == 0);
9943 }
9944 cop->temp_cursor = cop->cursor;
9945 }
9946
9947 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
9948 {
9949 OpContext *ctx = cb->ctx;
9950 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
9951
9952 ObjectState& obs = ctx->new_obs;
9953 if (obs.exists) {
9954 dout(20) << __func__ << ": exists, removing" << dendl;
9955 ctx->op_t->remove(obs.oi.soid);
9956 } else {
9957 ctx->delta_stats.num_objects++;
9958 obs.exists = true;
9959 }
9960 if (cb->is_temp_obj_used()) {
9961 ctx->discard_temp_oid = cb->results->temp_oid;
9962 }
9963 cb->results->fill_in_final_tx(ctx->op_t.get());
9964
9965 // CopyFromCallback fills this in for us
9966 obs.oi.user_version = ctx->user_at_version;
9967
9968 if (cb->results->is_data_digest()) {
9969 obs.oi.set_data_digest(cb->results->data_digest);
9970 } else {
9971 obs.oi.clear_data_digest();
9972 }
9973 if (cb->results->is_omap_digest()) {
9974 obs.oi.set_omap_digest(cb->results->omap_digest);
9975 } else {
9976 obs.oi.clear_omap_digest();
9977 }
9978
9979 obs.oi.truncate_seq = cb->truncate_seq;
9980 obs.oi.truncate_size = cb->truncate_size;
9981
9982 obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
9983 ctx->mtime = utime_t();
9984
9985 ctx->extra_reqids = cb->results->reqids;
9986 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
9987
9988 // cache: clear whiteout?
9989 if (obs.oi.is_whiteout()) {
9990 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9991 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9992 --ctx->delta_stats.num_whiteouts;
9993 }
9994
9995 if (cb->results->has_omap) {
9996 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9997 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9998 ctx->clean_regions.mark_omap_dirty();
9999 } else {
10000 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
10001 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10002 }
10003
10004 interval_set<uint64_t> ch;
10005 if (obs.oi.size > 0)
10006 ch.insert(0, obs.oi.size);
10007 ctx->modified_ranges.union_of(ch);
10008 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
10009
10010 if (cb->get_data_size() != obs.oi.size) {
10011 ctx->delta_stats.num_bytes -= obs.oi.size;
10012 obs.oi.size = cb->get_data_size();
10013 ctx->delta_stats.num_bytes += obs.oi.size;
10014 }
10015 ctx->delta_stats.num_wr++;
10016 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
10017
10018 osd->logger->inc(l_osd_copyfrom);
10019 }
10020
10021 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
10022 ObjectContextRef obc)
10023 {
10024 const hobject_t& soid = obc->obs.oi.soid;
10025 dout(10) << __func__ << " " << soid << " r=" << r
10026 << " uv" << results->user_version << dendl;
10027
10028 if (r == -ECANCELED) {
10029 return;
10030 }
10031
10032 if (r != -ENOENT && soid.is_snap()) {
10033 if (results->snaps.empty()) {
10034 // we must have read "snap" content from the head object in the
10035 // base pool. use snap_seq to construct what snaps should be
10036 // for this clone (what it was before we evicted the clean clone
10037 // from this pool, and what it will be when we flush and the
10038 // clone eventually happens in the base pool). we want to use
10039 // snaps in (results->snap_seq,soid.snap]
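// e.g. (illustrative) with soid.snap == 4, results->snap_seq == 2, and
// clone_snaps {4: [4,3], 2: [2,1]}: clone 4 contributes snaps 4 and 3
// (both in (2,4]), clone 2 stops at snap 2 (<= snap_seq), leaving
// results->snaps == [4,3]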
10040 SnapSet& snapset = obc->ssc->snapset;
10041 for (auto p = snapset.clone_snaps.rbegin();
10042 p != snapset.clone_snaps.rend();
10043 ++p) {
10044 for (auto snap : p->second) {
10045 if (snap > soid.snap) {
10046 continue;
10047 }
10048 if (snap <= results->snap_seq) {
10049 break;
10050 }
10051 results->snaps.push_back(snap);
10052 }
10053 }
10054 }
10055
10056 dout(20) << __func__ << " snaps " << results->snaps << dendl;
10057 filter_snapc(results->snaps);
10058
10059 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
10060 if (results->snaps.empty()) {
10061 dout(20) << __func__
10062 << " snaps are empty, clone is invalid,"
10063 << " setting r to ENOENT" << dendl;
10064 r = -ENOENT;
10065 }
10066 }
10067
10068 if (r < 0 && results->started_temp_obj) {
10069 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
10070 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
10071 ceph_assert(tempobc);
10072 OpContextUPtr ctx = simple_opc_create(tempobc);
10073 ctx->op_t->remove(results->temp_oid);
10074 simple_opc_submit(std::move(ctx));
10075 results->started_temp_obj = false;
10076 }
10077
10078 if (r == -ENOENT && soid.is_snap()) {
10079 dout(10) << __func__
10080 << ": enoent while trying to promote clone, " << soid
10081 << " must have been trimmed, removing from snapset"
10082 << dendl;
10083 hobject_t head(soid.get_head());
10084 ObjectContextRef obc = get_object_context(head, false);
10085 ceph_assert(obc);
10086
10087 OpContextUPtr tctx = simple_opc_create(obc);
10088 tctx->at_version = get_next_version();
10089 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
10090 filter_snapc(tctx->new_snapset.snaps);
10091 } else {
10092 tctx->new_snapset.snaps.clear();
10093 }
10094 vector<snapid_t> new_clones;
10095 map<snapid_t, vector<snapid_t>> new_clone_snaps;
10096 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
10097 i != tctx->new_snapset.clones.end();
10098 ++i) {
10099 if (*i != soid.snap) {
10100 new_clones.push_back(*i);
10101 auto p = tctx->new_snapset.clone_snaps.find(*i);
10102 if (p != tctx->new_snapset.clone_snaps.end()) {
10103 new_clone_snaps[*i] = p->second;
10104 }
10105 }
10106 }
10107 tctx->new_snapset.clones.swap(new_clones);
10108 tctx->new_snapset.clone_overlap.erase(soid.snap);
10109 tctx->new_snapset.clone_size.erase(soid.snap);
10110 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
10111
10112 // take RWWRITE lock for duration of our local write. ignore starvation.
10113 if (!tctx->lock_manager.take_write_lock(
10114 head,
10115 obc)) {
10116 ceph_abort_msg("problem!");
10117 }
10118 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10119
10120 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10121
10122 simple_opc_submit(std::move(tctx));
10123 return;
10124 }
10125
10126 bool whiteout = false;
10127 if (r == -ENOENT) {
10128 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
10129 dout(10) << __func__ << " whiteout " << soid << dendl;
10130 whiteout = true;
10131 }
10132
10133 if (r < 0 && !whiteout) {
10134 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10135 // pass error to everyone blocked on this object
10136 // FIXME: this is pretty sloppy, but at this point we got
10137 // something unexpected and don't have many other options.
10138 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10139 waiting_for_blocked_object.find(soid);
10140 if (blocked_iter != waiting_for_blocked_object.end()) {
10141 while (!blocked_iter->second.empty()) {
10142 osd->reply_op_error(blocked_iter->second.front(), r);
10143 blocked_iter->second.pop_front();
10144 }
10145 waiting_for_blocked_object.erase(blocked_iter);
10146 }
10147 return;
10148 }
10149
10150 osd->promote_finish(results->object_size);
10151
10152 OpContextUPtr tctx = simple_opc_create(obc);
10153 tctx->at_version = get_next_version();
10154
10155 if (!obc->obs.oi.has_manifest()) {
10156 ++tctx->delta_stats.num_objects;
10157 }
10158 if (soid.snap < CEPH_NOSNAP)
10159 ++tctx->delta_stats.num_object_clones;
10160 tctx->new_obs.exists = true;
10161
10162 tctx->extra_reqids = results->reqids;
10163 tctx->extra_reqid_return_codes = results->reqid_return_codes;
10164
10165 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
10166 tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
10167 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
10168 tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
10169 tctx->new_obs.oi.manifest.redirect_target = hobject_t();
10170 tctx->delta_stats.num_objects_manifest--;
10171 if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
10172 dec_all_refcount_manifest(obc->obs.oi, tctx.get());
10173 }
10174 }
10175
10176 if (whiteout) {
10177 // create a whiteout
10178 tctx->op_t->create(soid);
10179 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
10180 ++tctx->delta_stats.num_whiteouts;
10181 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
10182 osd->logger->inc(l_osd_tier_whiteout);
10183 } else {
10184 if (results->has_omap) {
10185 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
10186 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
10187 ++tctx->delta_stats.num_objects_omap;
10188 }
10189
10190 results->fill_in_final_tx(tctx->op_t.get());
10191 if (results->started_temp_obj) {
10192 tctx->discard_temp_oid = results->temp_oid;
10193 }
10194 tctx->new_obs.oi.size = results->object_size;
10195 tctx->new_obs.oi.user_version = results->user_version;
10196 tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
10197 tctx->mtime = utime_t();
10198 if (results->is_data_digest()) {
10199 tctx->new_obs.oi.set_data_digest(results->data_digest);
10200 } else {
10201 tctx->new_obs.oi.clear_data_digest();
10202 }
10203 if (results->object_size)
10204 tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
10205 if (results->is_omap_digest()) {
10206 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
10207 } else {
10208 tctx->new_obs.oi.clear_omap_digest();
10209 }
10210 if (results->has_omap)
10211 tctx->clean_regions.mark_omap_dirty();
10212 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
10213 tctx->new_obs.oi.truncate_size = results->truncate_size;
10214
10215 if (soid.snap != CEPH_NOSNAP) {
10216 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
10217 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
10218 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
10219 results->object_size);
10220 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
10221
10222 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
10223 } else {
10224 tctx->delta_stats.num_bytes += results->object_size;
10225 }
10226 }
10227
10228 if (results->mirror_snapset) {
10229 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
10230 tctx->new_snapset.from_snap_set(
10231 results->snapset,
10232 get_osdmap()->require_osd_release < ceph_release_t::luminous);
10233 }
10234 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
10235
10236 // take RWWRITE lock for duration of our local write. ignore starvation.
10237 if (!tctx->lock_manager.take_write_lock(
10238 obc->obs.oi.soid,
10239 obc)) {
10240 ceph_abort_msg("problem!");
10241 }
10242 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10243
10244 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10245
10246 simple_opc_submit(std::move(tctx));
10247
10248 osd->logger->inc(l_osd_tier_promote);
10249
10250 if (agent_state &&
10251 agent_state->is_idle())
10252 agent_choose_mode();
10253 }
10254
10255 void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
10256 ObjectContextRef obc)
10257 {
10258 const hobject_t& soid = obc->obs.oi.soid;
10259 dout(10) << __func__ << " " << soid << " r=" << r
10260 << " uv" << results->user_version << dendl;
10261
10262 if (r == -ECANCELED || r == -EAGAIN) {
10263 return;
10264 }
10265
10266 if (r < 0) {
10267 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10268 // pass error to everyone blocked on this object
10269 // FIXME: this is pretty sloppy, but at this point we got
10270 // something unexpected and don't have many other options.
10271 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10272 waiting_for_blocked_object.find(soid);
10273 if (blocked_iter != waiting_for_blocked_object.end()) {
10274 while (!blocked_iter->second.empty()) {
10275 osd->reply_op_error(blocked_iter->second.front(), r);
10276 blocked_iter->second.pop_front();
10277 }
10278 waiting_for_blocked_object.erase(blocked_iter);
10279 }
10280 return;
10281 }
10282
10283 osd->promote_finish(results->object_size);
10284 osd->logger->inc(l_osd_tier_promote);
10285
10286 if (agent_state &&
10287 agent_state->is_idle())
10288 agent_choose_mode();
10289 }
10290
10291 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
10292 vector<ceph_tid_t> *tids)
10293 {
10294 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
10295 << " from " << cop->src << " " << cop->oloc
10296 << " v" << cop->results.user_version << dendl;
10297
10298 // cancel objecter op, if we can
10299 if (cop->objecter_tid) {
10300 tids->push_back(cop->objecter_tid);
10301 cop->objecter_tid = 0;
10302 if (cop->objecter_tid2) {
10303 tids->push_back(cop->objecter_tid2);
10304 cop->objecter_tid2 = 0;
10305 }
10306 }
10307
10308 copy_ops.erase(cop->obc->obs.oi.soid);
10309 cop->obc->stop_block();
10310
10311 kick_object_context_blocked(cop->obc);
10312 cop->results.should_requeue = requeue;
10313 CopyCallbackResults result(-ECANCELED, &cop->results);
10314 cop->cb->complete(result);
10315
10316 // There may still be an objecter callback referencing this copy op.
10317 // That callback will not need the obc since it's been canceled, and
10318 // we need the obc reference to go away prior to flush.
10319 cop->obc = ObjectContextRef();
10320 }
10321
10322 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
10323 {
10324 dout(10) << __func__ << dendl;
10325 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
10326 while (p != copy_ops.end()) {
10327 // requeue this op? can I queue up all of them?
10328 cancel_copy((p++)->second, requeue, tids);
10329 }
10330 }
10331
10332 struct C_gather : public Context {
10333 PrimaryLogPGRef pg;
10334 hobject_t oid;
10335 epoch_t last_peering_reset;
10336 OSDOp *osd_op;
10337 C_gather(PrimaryLogPG *pg_, hobject_t oid_, epoch_t lpr_, OSDOp *osd_op_) :
10338 pg(pg_), oid(oid_), last_peering_reset(lpr_), osd_op(osd_op_) {}
10339 void finish(int r) override {
10340 if (r == -ECANCELED)
10341 return;
10342 std::scoped_lock locker{*pg};
10343 auto p = pg->cls_gather_ops.find(oid);
10344 if (p == pg->cls_gather_ops.end()) {
10345 // op was cancelled
10346 return;
10347 }
10348 if (last_peering_reset != pg->get_last_peering_reset()) {
10349 return;
10350 }
10351 osd_op->rval = r;
10352 PrimaryLogPG::OpContext *ctx = p->second.ctx;
10353 pg->cls_gather_ops.erase(p);
10354 pg->execute_ctx(ctx);
10355 }
10356 };
10357
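// start_cls_gather fans a class-method call out to every source object in
// src_obj_buffs via the objecter, gathers each reply into the caller's
// bufferlist, and, once all sub-ops have completed, re-executes the
// original op context from C_gather::finish().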
10358 int PrimaryLogPG::start_cls_gather(OpContext *ctx, std::map<std::string, bufferlist> *src_obj_buffs, const std::string& pool,
10359 const char *cls, const char *method, bufferlist& inbl)
10360 {
10361 OpRequestRef op = ctx->op;
10362 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
10363
10364 auto pool_id = osd->objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name), pool);
10365 object_locator_t oloc(pool_id);
10366
10367 ObjectState& obs = ctx->new_obs;
10368 object_info_t& oi = obs.oi;
10369 const hobject_t& soid = oi.soid;
10370
10371 ObjectContextRef obc = get_object_context(soid, false);
10372 C_GatherBuilder gather(cct);
10373
10374 auto [iter, inserted] = cls_gather_ops.emplace(soid, CLSGatherOp(ctx, obc, op));
10375 ceph_assert(inserted);
10376 auto &cgop = iter->second;
10377 for (std::map<std::string, bufferlist>::iterator it = src_obj_buffs->begin(); it != src_obj_buffs->end(); it++) {
10378 std::string oid = it->first;
10379 ObjectOperation obj_op;
10380 obj_op.call(cls, method, inbl);
10381 uint32_t flags = 0;
10382 ceph_tid_t tid = osd->objecter->read(
10383 object_t(oid), oloc, obj_op,
10384 m->get_snapid(), &it->second,
10385 flags, gather.new_sub());
10386 cgop.objecter_tids.push_back(tid);
10387 dout(10) << __func__ << " src=" << oid << ", tgt=" << soid << dendl;
10388 }
10389
10390 C_gather *fin = new C_gather(this, soid, get_last_peering_reset(), &(*ctx->ops)[ctx->current_osd_subop_num]);
10391 gather.set_finisher(new C_OnFinisher(fin,
10392 osd->get_objecter_finisher(get_pg_shard())));
10393 gather.activate();
10394
10395 return -EINPROGRESS;
10396 }
10397
10398 // ========================================================================
10399 // flush
10400 //
10401 // Flush a dirty object in the cache tier by writing it back to the
10402 // base tier. The sequence looks like:
10403 //
10404 // * send a copy-from operation to the base tier to copy the current
10405 // version of the object
10406 // * base tier will pull the object via (perhaps multiple) copy-get(s)
10407 // * on completion, we check if the object has been modified. if so,
10408 // just reply with -EAGAIN.
10409 // * try to take a write lock so we can clear the dirty flag. if this
10410 // fails, wait and retry
10411 // * start a repop that clears the bit.
10412 //
10413 // If we have to wait, we will retry by coming back through the
10414 // start_flush method. We check if a flush is already in progress
10415 // and, if so, try to finish it by rechecking the version and trying
10416 // to clear the dirty bit.
10417 //
10418 // In order for the cache-flush (a write op) to not block the copy-get
10419 // from reading the object, the client *must* set the SKIPRWLOCKS
10420 // flag.
10421 //
10422 // NOTE: normally writes are strictly ordered for the client, but
10423 // flushes are special in that they can be reordered with respect to
10424 // other writes. In particular, we can't have a flush request block
10425 // an update to the cache pool object!
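//
// A condensed sketch of the op the flush path issues (see start_flush()
// below for the authoritative version; this is illustrative only):
//
//   ObjectOperation o;
//   o.copy_from(soid.oid.name, soid.snap, base_oloc, oi.user_version,
//               CEPH_OSD_COPY_FROM_FLAG_FLUSH |
//               CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
//               CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
//               CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
//               fadvise_flags);
//   osd->objecter->mutate(soid.oid, base_oloc, o, snapc, mtime, flags,
//                         new C_OnFinisher(new C_Flush(...), ...));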
10426
10427 struct C_Flush : public Context {
10428 PrimaryLogPGRef pg;
10429 hobject_t oid;
10430 epoch_t last_peering_reset;
10431 ceph_tid_t tid;
10432 utime_t start;
10433 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
10434 : pg(p), oid(o), last_peering_reset(lpr),
10435 tid(0), start(ceph_clock_now())
10436 {}
10437 void finish(int r) override {
10438 if (r == -ECANCELED)
10439 return;
10440 std::scoped_lock locker{*pg};
10441 if (last_peering_reset == pg->get_last_peering_reset()) {
10442 pg->finish_flush(oid, tid, r);
10443 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
10444 }
10445 }
10446 };
10447
10448 int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
10449 {
10450 const object_info_t& oi = obc->obs.oi;
10451 const hobject_t& soid = oi.soid;
10452
10453 ceph_assert(obc->is_blocked());
10454 if (oi.size == 0) {
10455 // evicted
10456 return 0;
10457 }
10458 if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10459 dout(0) << " fingerprint algorithm is not set " << dendl;
10460 return -EINVAL;
10461 }
10462
10463 /*
10464 * The operations that create dedup chunks are tracked by a ManifestOp,
10465 * which is finished once all of those operations have completed.
10466 */
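// For orientation, the dedup pipeline implemented by the helpers below is:
// 1. do_cdc() splits the object into content-defined chunks;
// 2. get_fpoid_from_chunk() maps each chunk to a fingerprint oid in the
// dedup tier;
// 3. refcount_manifest(..., CREATE_OR_GET_REF, ...) issues one op per new
// chunk, completing into C_SetDedupChunks;
// 4. finish_set_dedup() installs the new chunk_map and drops references to
// the old chunks.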
10467 ManifestOpRef mop(std::make_shared<ManifestOp>());
10468
10469 // cdc
10470 std::map<uint64_t, bufferlist> chunks;
10471 int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
10472 if (r < 0) {
10473 return r;
10474 }
10475 if (!chunks.size()) {
10476 return 0;
10477 }
10478
10479 // The chunks issued here differ from the newly generated chunk_map,
10480 // because chunks that already exist in the previous snap are not issued.
10481 // So we need two data structures: the issued chunk list, to track issued
10482 // operations, and the new chunk_map, with which to replace the old
10483 // chunk_map once all operations have finished.
10484 object_ref_delta_t refs;
10485 ObjectContextRef obc_l, obc_g;
10486 get_adjacent_clones(obc, obc_l, obc_g);
10487 // skip if the same content exists in the prev snap at the same offset
10488 mop->new_manifest.calc_refs_to_inc_on_set(
10489 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10490 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10491 refs);
10492
10493 for (auto p : chunks) {
10494 hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
10495 if (refs.find(target) == refs.end()) {
10496 continue;
10497 }
10498 C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
10499 ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
10500 fin, std::move(chunks[p.first]));
10501 mop->chunks[target] = make_pair(p.first, p.second.length());
10502 mop->num_chunks++;
10503 mop->tids[p.first] = tid;
10504 fin->tid = tid;
10505 dout(10) << __func__ << " oid: " << soid << " tid: " << tid
10506 << " target: " << target << " offset: " << p.first
10507 << " length: " << p.second.length() << dendl;
10508 }
10509
10510 if (mop->tids.size()) {
10511 manifest_ops[soid] = mop;
10512 manifest_ops[soid]->op = op;
10513 } else {
10514 // no chunk ops were issued; nothing to wait for
10515 return 0;
10516 }
10517
10518 return -EINPROGRESS;
10519 }
10520
10521 int PrimaryLogPG::do_cdc(const object_info_t& oi,
10522 std::map<uint64_t, chunk_info_t>& chunk_map,
10523 std::map<uint64_t, bufferlist>& chunks)
10524 {
10525 string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
10526 int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
10527 uint64_t total_length = 0;
10528
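// CDC::create() takes the target chunk size as a power-of-two exponent;
// for a power-of-two chunk_size, cbits(chunk_size) - 1 recovers that
// exponent, e.g. cbits(16384) - 1 == 14 (16384 == 2^14)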
10529 std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
10530 if (!cdc) {
10531 dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
10532 return -EINVAL;
10533 }
10534
10535 bufferlist bl;
10536 /**
10537 * We do not allow an EC pool as the base tier of distributed dedup,
10538 * because EC pools do not support objects_read_sync(); supporting them
10539 * would require substantial changes to the current implementation.
10540 * As a result, we leave this as future work.
10541 */
10542 int r = pgbackend->objects_read_sync(
10543 oi.soid, 0, oi.size, 0, &bl);
10544 if (r < 0) {
10545 dout(0) << __func__ << " read fail " << oi.soid
10546 << " len: " << oi.size << " r: " << r << dendl;
10547 return r;
10548 }
10549 if (bl.length() != oi.size) {
10550 dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
10551 << oi.size << " during chunking " << dendl;
10552 return -EIO;
10553 }
10554
10555 dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
10556 << " oi.size: " << oi.size
10557 << " chunk_size: " << chunk_size << dendl;
10558
10559 vector<pair<uint64_t, uint64_t>> cdc_chunks;
10560 cdc->calc_chunks(bl, &cdc_chunks);
10561
10562 // get fingerprint
10563 for (auto p : cdc_chunks) {
10564 bufferlist chunk;
10565 chunk.substr_of(bl, p.first, p.second);
10566 hobject_t target = get_fpoid_from_chunk(oi.soid, chunk);
10567 chunks[p.first] = std::move(chunk);
10568 chunk_map[p.first] = chunk_info_t(0, p.second, target);
10569 total_length += p.second;
10570 }
10571 return total_length;
10572 }
10573
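// The fingerprint oid is the hex digest of the chunk payload, so identical
// chunks from any object map to the same backing object in the dedup tier.
// For example (illustrative), a chunk whose SHA-1 digest is 2fd4e1c6...
// would be stored as object "2fd4e1c6..." in pool get_dedup_tier().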
10574 hobject_t PrimaryLogPG::get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk)
10575 {
10576 pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
10577 if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10578 return hobject_t();
10579 }
10580 object_t fp_oid = [&fp_algo, &chunk]() -> string {
10581 switch (fp_algo) {
10582 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
10583 return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
10584 case pg_pool_t::TYPE_FINGERPRINT_SHA256:
10585 return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
10586 case pg_pool_t::TYPE_FINGERPRINT_SHA512:
10587 return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
10588 default:
10589 assert(0 == "unrecognized fingerprint type");
10590 return {};
10591 }
10592 }();
10593
10594 pg_t raw_pg;
10595 object_locator_t oloc(soid);
10596 oloc.pool = pool.info.get_dedup_tier();
10597 get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
10598 hobject_t target(fp_oid, oloc.key, snapid_t(),
10599 raw_pg.ps(), raw_pg.pool(),
10600 oloc.nspace);
10601 return target;
10602 }
10603
10604 int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
10605 {
10606 dout(10) << __func__ << " " << oid << " tid " << tid
10607 << " " << cpp_strerror(r) << dendl;
10608 map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
10609 if (p == manifest_ops.end()) {
10610 dout(10) << __func__ << " no manifest_op found" << dendl;
10611 return -EINVAL;
10612 }
10613 ManifestOpRef mop = p->second;
10614 mop->results[offset] = r;
10615 if (r < 0) {
10616 // if any failure occurs, record it in results[0] so it is recognized later
10617 mop->results[0] = r;
10618 }
10619 if (mop->num_chunks != mop->results.size()) {
10620 // there is still work in flight
10621 return -EINPROGRESS;
10622 }
10623 ObjectContextRef obc = get_object_context(oid, false);
10624 if (!obc) {
10625 if (mop->op)
10626 osd->reply_op_error(mop->op, -EINVAL);
10627 return -EINVAL;
10628 }
10629 ceph_assert(obc->is_blocked());
10630 obc->stop_block();
10631 kick_object_context_blocked(obc);
10632 if (mop->results[0] < 0) {
10633 // a previous op failed
10634 ceph_assert(mop->num_chunks == mop->results.size());
10635 manifest_ops.erase(oid);
10636 osd->reply_op_error(mop->op, mop->results[0]);
10637 return -EIO;
10638 }
10639
10640 if (mop->chunks.size()) {
10641 OpContextUPtr ctx = simple_opc_create(obc);
10642 ceph_assert(ctx);
10643 if (ctx->lock_manager.get_lock_type(
10644 RWState::RWWRITE,
10645 oid,
10646 obc,
10647 mop->op)) {
10648 dout(20) << __func__ << " took write lock" << dendl;
10649 } else if (mop->op) {
10650 dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
10651 close_op_ctx(ctx.release());
10652 return -EAGAIN;
10653 }
10654
10655 ctx->at_version = get_next_version();
10656 ctx->new_obs = obc->obs;
10657 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10658 --ctx->delta_stats.num_objects_dirty;
10659
10660 /*
10661 * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
10662 * head: [0, 2) aaa <-- tier_flush()
10663 * 20: [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10664 *
10665 * In this case, if the new chunk_map is as follows,
10666 * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10667 * we should drop aaa from head by using calc_refs_to_drop_on_removal().
10668 * So, the procedure is
10669 * 1. calc_refs_to_drop_on_removal()
10670 * 2. register old references to drop after tier_flush() is committed
10671 * 3. update new chunk_map
10672 */
10673
10674 ObjectCleanRegions c_regions = ctx->clean_regions;
10675 ObjectContextRef cobc = get_prev_clone_obc(obc);
10676 c_regions.mark_fully_dirty();
10677 // CDC was run over the entire range of the manifest object,
10678 // so the first thing to do here is to drop the references to the old chunks
10679 ObjectContextRef obc_l, obc_g;
10680 get_adjacent_clones(obc, obc_l, obc_g);
10681 // clear all old references
10682 object_ref_delta_t refs;
10683 ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
10684 obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10685 obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10686 refs);
10687 if (!refs.is_empty()) {
10688 ctx->register_on_commit(
10689 [oid, this, refs](){
10690 dec_refcount(oid, refs);
10691 });
10692 }
10693
10694 // set new references
10695 ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;
10696
10697 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10698 simple_opc_submit(std::move(ctx));
10699 }
10700 if (mop->op)
10701 osd->reply_op_error(mop->op, r);
10702
10703 manifest_ops.erase(oid);
10704 return 0;
10705 }
10706
10707 int PrimaryLogPG::finish_set_manifest_refcount(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
10708 {
10709 dout(10) << __func__ << " " << oid << " tid " << tid
10710 << " " << cpp_strerror(r) << dendl;
10711 map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
10712 if (p == manifest_ops.end()) {
10713 dout(10) << __func__ << " no manifest_op found" << dendl;
10714 return -EINVAL;
10715 }
10716 ManifestOpRef mop = p->second;
10717 mop->results[offset] = r;
10718 if (r < 0) {
10719     // if any failure occurs, mark results[0] so the failure is recognized later
10720 mop->results[0] = r;
10721 }
10722 if (mop->num_chunks != mop->results.size()) {
10723     // there is still ongoing work
10724 return -EINPROGRESS;
10725 }
10726
10727 if (mop->cb) {
10728 mop->cb->complete(r);
10729 }
10730
10731 manifest_ops.erase(p);
10732 mop.reset();
10733
10734 return 0;
10735 }
10736
10737 int PrimaryLogPG::start_flush(
10738 OpRequestRef op, ObjectContextRef obc,
10739 bool blocking, hobject_t *pmissing,
10740 std::optional<std::function<void()>> &&on_flush)
10741 {
10742 const object_info_t& oi = obc->obs.oi;
10743 const hobject_t& soid = oi.soid;
10744 dout(10) << __func__ << " " << soid
10745 << " v" << oi.version
10746 << " uv" << oi.user_version
10747 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
10748 << dendl;
10749
10750 bool preoctopus_compat =
10751 get_osdmap()->require_osd_release < ceph_release_t::octopus;
10752 SnapSet snapset;
10753 if (preoctopus_compat) {
10754 // for pre-octopus compatibility, filter SnapSet::snaps. not
10755 // certain we need this, but let's be conservative.
10756 snapset = obc->ssc->snapset.get_filtered(pool.info);
10757 } else {
10758 // NOTE: change this to a const ref when we remove this compat code
10759 snapset = obc->ssc->snapset;
10760 }
10761
10762 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10763 // current dedup tier only supports blocking operation
10764 if (!blocking) {
10765 return -EOPNOTSUPP;
10766 }
10767 }
10768
10769   // verify there are no (older) dirty clones
10770 {
10771 dout(20) << " snapset " << snapset << dendl;
10772 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
10773 while (p != snapset.clones.rend() && *p >= soid.snap)
10774 ++p;
10775 if (p != snapset.clones.rend()) {
10776 hobject_t next = soid;
10777 next.snap = *p;
10778 ceph_assert(next.snap < soid.snap);
10779 if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
10780 dout(10) << __func__ << " missing clone is " << next << dendl;
10781 if (pmissing)
10782 *pmissing = next;
10783 return -ENOENT;
10784 }
10785 ObjectContextRef older_obc = get_object_context(next, false);
10786 if (older_obc) {
10787 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
10788 << dendl;
10789 if (older_obc->obs.oi.is_dirty()) {
10790 dout(10) << __func__ << " next oldest clone is dirty: "
10791 << older_obc->obs.oi << dendl;
10792 return -EBUSY;
10793 }
10794 } else {
10795 dout(20) << __func__ << " next oldest clone " << next
10796 << " is not present; implicitly clean" << dendl;
10797 }
10798 } else {
10799 dout(20) << __func__ << " no older clones" << dendl;
10800 }
10801 }
10802
10803 if (blocking)
10804 obc->start_block();
10805
10806 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10807 if (p != flush_ops.end()) {
10808 FlushOpRef fop = p->second;
10809 if (fop->op == op) {
10810 // we couldn't take the write lock on a cache-try-flush before;
10811 // now we are trying again for the lock.
10812 return try_flush_mark_clean(fop);
10813 }
10814 if (fop->flushed_version == obc->obs.oi.user_version &&
10815 (fop->blocking || !blocking)) {
10816 // nonblocking can join anything
10817 // blocking can only join a blocking flush
10818 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10819 if (op)
10820 fop->dup_ops.push_back(op);
10821 return -EAGAIN; // clean up this ctx; op will retry later
10822 }
10823
10824 // cancel current flush since it will fail anyway, or because we
10825 // are blocking and the existing flush is nonblocking.
10826 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10827 if (fop->op)
10828 osd->reply_op_error(fop->op, -EBUSY);
10829 while (!fop->dup_ops.empty()) {
10830 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10831 fop->dup_ops.pop_front();
10832 }
10833 vector<ceph_tid_t> tids;
10834 cancel_flush(fop, false, &tids);
10835 osd->objecter->op_cancel(tids, -ECANCELED);
10836 }
10837
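  // manifest (chunked) objects are flushed through the dedup machinery
  // rather than the cache-tier delete/copy-from path that follows.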
10838 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10839 int r = start_dedup(op, obc);
10840 if (r != -EINPROGRESS) {
10841 if (blocking)
10842 obc->stop_block();
10843 }
10844 return r;
10845 }
10846
10847 /**
10848 * In general, we need to send a delete and a copyfrom.
10849 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10850 * where 4 is marked as clean. To flush 10, we have to:
10851 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10852 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10853 *
10854    * There is a complicating case. Suppose there had been a clone 7
10855    * for snaps [7, 6], since trimmed because those snaps no longer exist.
10856 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10857 * the delete, the snap will be promoted to 5, and the head will become
10858 * a whiteout. When the copy-from goes through, we'll end up with
10859 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10860 *
10861 * Another complication is the case where there is an interval change
10862 * after doing the delete and the flush but before marking the object
10863 * clean. We'll happily delete head and then recreate it at the same
10864 * sequence number, which works out ok.
10865 */
10866
10867 SnapContext snapc, dsnapc;
10868 if (snapset.seq != 0) {
10869 if (soid.snap == CEPH_NOSNAP) {
10870 snapc = snapset.get_ssc_as_of(snapset.seq);
10871 } else {
10872 snapid_t min_included_snap;
10873 auto p = snapset.clone_snaps.find(soid.snap);
10874 ceph_assert(p != snapset.clone_snaps.end());
10875 min_included_snap = p->second.back();
10876 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
10877 }
10878
10879 snapid_t prev_snapc = 0;
10880 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
10881 citer != snapset.clones.rend();
10882 ++citer) {
10883 if (*citer < soid.snap) {
10884 prev_snapc = *citer;
10885 break;
10886 }
10887 }
10888
10889 dsnapc = snapset.get_ssc_as_of(prev_snapc);
10890 }
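  // Worked example, continuing the comment above: flushing clone 10
  // (covering snaps [10, 9]) gives min_included_snap = 9, so
  // snapc = as-of(8) = 8:[8,4,3,2]; the newest clone older than 10 is 4,
  // so dsnapc = as-of(4) = 4:[4,3,2].  dsnapc.seq < snapc.seq, so the
  // delete below is issued with dsnapc and the copy-from with snapc.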
10891
10892 object_locator_t base_oloc(soid);
10893 base_oloc.pool = pool.info.tier_of;
10894
10895 if (dsnapc.seq < snapc.seq) {
10896 ObjectOperation o;
10897 o.remove();
10898 osd->objecter->mutate(
10899 soid.oid,
10900 base_oloc,
10901 o,
10902 dsnapc,
10903 ceph::real_clock::from_ceph_timespec(oi.mtime),
10904 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
10905 CEPH_OSD_FLAG_ENFORCE_SNAPC),
10906 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
10907 }
10908
10909 FlushOpRef fop(std::make_shared<FlushOp>());
10910 fop->obc = obc;
10911 fop->flushed_version = oi.user_version;
10912 fop->blocking = blocking;
10913 fop->on_flush = std::move(on_flush);
10914 fop->op = op;
10915
10916 ObjectOperation o;
10917 if (oi.is_whiteout()) {
10918 fop->removal = true;
10919 o.remove();
10920 } else {
10921 object_locator_t oloc(soid);
10922 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
10923 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
10924 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
10925 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
10926 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
10927 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
10928
10929     // hint that the base tier need not cache the data after this flush
10930 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
10931 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
10932 }
10933 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
10934
10935 ceph_tid_t tid = osd->objecter->mutate(
10936 soid.oid, base_oloc, o, snapc,
10937 ceph::real_clock::from_ceph_timespec(oi.mtime),
10938 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
10939 new C_OnFinisher(fin,
10940 osd->get_objecter_finisher(get_pg_shard())));
10941   /* we're under the pg lock and fin->finish() grabs that same lock */
10942 fin->tid = tid;
10943 fop->objecter_tid = tid;
10944
10945 flush_ops[soid] = fop;
10946
10947 recovery_state.update_stats(
10948 [&oi](auto &history, auto &stats) {
10949 stats.stats.sum.num_flush++;
10950 stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
10951 return false;
10952 });
10953 return -EINPROGRESS;
10954 }
10955
10956 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
10957 {
10958 dout(10) << __func__ << " " << oid << " tid " << tid
10959 << " " << cpp_strerror(r) << dendl;
10960 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
10961 if (p == flush_ops.end()) {
10962 dout(10) << __func__ << " no flush_op found" << dendl;
10963 return;
10964 }
10965 FlushOpRef fop = p->second;
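  // a stale tid means this completion belongs to a flush we already
  // cancelled or replaced (cancel_flush() zeroes objecter_tid), so
  // ignore it; manifest flushes track multiple tids (fop->io_tids),
  // hence the single-tid check is skipped for them.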
10966 if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
10967 dout(10) << __func__ << " tid " << tid << " != fop " << fop
10968 << " tid " << fop->objecter_tid << dendl;
10969 return;
10970 }
10971 ObjectContextRef obc = fop->obc;
10972 fop->objecter_tid = 0;
10973
10974 if (r < 0 && !(r == -ENOENT && fop->removal)) {
10975 if (fop->op)
10976 osd->reply_op_error(fop->op, -EBUSY);
10977 if (fop->blocking) {
10978 obc->stop_block();
10979 kick_object_context_blocked(obc);
10980 }
10981
10982 if (!fop->dup_ops.empty()) {
10983 dout(20) << __func__ << " requeueing dups" << dendl;
10984 requeue_ops(fop->dup_ops);
10985 }
10986 if (fop->on_flush) {
10987 (*(fop->on_flush))();
10988 fop->on_flush = std::nullopt;
10989 }
10990 flush_ops.erase(oid);
10991 return;
10992 }
10993
10994 r = try_flush_mark_clean(fop);
10995 if (r == -EBUSY && fop->op) {
10996 osd->reply_op_error(fop->op, r);
10997 }
10998 }
10999
11000 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
11001 {
11002 ObjectContextRef obc = fop->obc;
11003 const hobject_t& oid = obc->obs.oi.soid;
11004
11005 if (fop->blocking) {
11006 obc->stop_block();
11007 kick_object_context_blocked(obc);
11008 }
11009
11010 if (fop->flushed_version != obc->obs.oi.user_version ||
11011 !obc->obs.exists) {
11012 if (obc->obs.exists)
11013 dout(10) << __func__ << " flushed_version " << fop->flushed_version
11014 << " != current " << obc->obs.oi.user_version
11015 << dendl;
11016 else
11017 dout(10) << __func__ << " object no longer exists" << dendl;
11018
11019 if (!fop->dup_ops.empty()) {
11020 dout(20) << __func__ << " requeueing dups" << dendl;
11021 requeue_ops(fop->dup_ops);
11022 }
11023 if (fop->on_flush) {
11024 (*(fop->on_flush))();
11025 fop->on_flush = std::nullopt;
11026 }
11027 flush_ops.erase(oid);
11028 if (fop->blocking)
11029 osd->logger->inc(l_osd_tier_flush_fail);
11030 else
11031 osd->logger->inc(l_osd_tier_try_flush_fail);
11032 return -EBUSY;
11033 }
11034
11035 if (!fop->blocking &&
11036 m_scrubber->write_blocked_by_scrub(oid)) {
11037 if (fop->op) {
11038 dout(10) << __func__ << " blocked by scrub" << dendl;
11039 requeue_op(fop->op);
11040 requeue_ops(fop->dup_ops);
11041 return -EAGAIN; // will retry
11042 } else {
11043 osd->logger->inc(l_osd_tier_try_flush_fail);
11044 vector<ceph_tid_t> tids;
11045 cancel_flush(fop, false, &tids);
11046 osd->objecter->op_cancel(tids, -ECANCELED);
11047 return -ECANCELED;
11048 }
11049 }
11050
11051 // successfully flushed, can we evict this object?
11052 if (!obc->obs.oi.has_manifest() && !fop->op &&
11053 agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
11054 agent_maybe_evict(obc, true)) {
11055 osd->logger->inc(l_osd_tier_clean);
11056 if (fop->on_flush) {
11057 (*(fop->on_flush))();
11058 fop->on_flush = std::nullopt;
11059 }
11060 flush_ops.erase(oid);
11061 return 0;
11062 }
11063
11064 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
11065 OpContextUPtr ctx = simple_opc_create(fop->obc);
11066
11067 // successfully flushed; can we clear the dirty bit?
11068 // try to take the lock manually, since we don't
11069 // have a ctx yet.
11070 if (ctx->lock_manager.get_lock_type(
11071 RWState::RWWRITE,
11072 oid,
11073 obc,
11074 fop->op)) {
11075 dout(20) << __func__ << " took write lock" << dendl;
11076 } else if (fop->op) {
11077 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
11078 << fop->dup_ops << dendl;
11079 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
11080 for (auto op : fop->dup_ops) {
11081 bool locked = ctx->lock_manager.get_lock_type(
11082 RWState::RWWRITE,
11083 oid,
11084 obc,
11085 op);
11086 ceph_assert(!locked);
11087 }
11088 close_op_ctx(ctx.release());
11089 return -EAGAIN; // will retry
11090 } else {
11091 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
11092 close_op_ctx(ctx.release());
11093 osd->logger->inc(l_osd_tier_try_flush_fail);
11094 vector<ceph_tid_t> tids;
11095 cancel_flush(fop, false, &tids);
11096 osd->objecter->op_cancel(tids, -ECANCELED);
11097 return -ECANCELED;
11098 }
11099
11100 if (fop->on_flush) {
11101 ctx->register_on_finish(*(fop->on_flush));
11102 fop->on_flush = std::nullopt;
11103 }
11104
11105 ctx->at_version = get_next_version();
11106
11107 ctx->new_obs = obc->obs;
11108 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
11109 --ctx->delta_stats.num_objects_dirty;
11110 if (fop->obc->obs.oi.has_manifest()) {
11111 ceph_assert(obc->obs.oi.manifest.is_chunked());
11112 PGTransaction* t = ctx->op_t.get();
11113 uint64_t chunks_size = 0;
11114 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
11115 chunks_size += p.second.length;
11116 }
11117 if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
11118 t->omap_clear(oid);
11119 ctx->new_obs.oi.clear_omap_digest();
11120 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
11121 ctx->clean_regions.mark_omap_dirty();
11122 }
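    // If the chunk map covers every byte of the object (size == sum of
    // chunk lengths), the local copy can be dropped entirely: truncate
    // to zero and flag all chunks MISSING, leaving the data only in the
    // dedup tier.  Otherwise keep the local data and just mark the
    // chunks CLEAN.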
11123 if (obc->obs.oi.size == chunks_size) {
11124 t->truncate(oid, 0);
11125 interval_set<uint64_t> trim;
11126 trim.insert(0, ctx->new_obs.oi.size);
11127 ctx->modified_ranges.union_of(trim);
11128 truncate_update_size_and_usage(ctx->delta_stats,
11129 ctx->new_obs.oi,
11130 0);
11131 ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
11132 ctx->new_obs.oi.new_object();
11133 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
11134 p.second.set_flag(chunk_info_t::FLAG_MISSING);
11135 }
11136 } else {
11137 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
11138 dout(20) << __func__ << " offset: " << p.second.offset
11139 << " length: " << p.second.length << dendl;
11140 p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
11141 }
11142 }
11143 }
11144
11145 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
11146
11147 osd->logger->inc(l_osd_tier_clean);
11148
11149 if (!fop->dup_ops.empty() || fop->op) {
11150 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
11151 list<OpRequestRef> ls;
11152 if (fop->op)
11153 ls.push_back(fop->op);
11154 ls.splice(ls.end(), fop->dup_ops);
11155 requeue_ops(ls);
11156 }
11157
11158 simple_opc_submit(std::move(ctx));
11159
11160 flush_ops.erase(oid);
11161
11162 if (fop->blocking)
11163 osd->logger->inc(l_osd_tier_flush);
11164 else
11165 osd->logger->inc(l_osd_tier_try_flush);
11166
11167 return -EINPROGRESS;
11168 }
11169
11170 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
11171 vector<ceph_tid_t> *tids)
11172 {
11173 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
11174 << fop->objecter_tid << dendl;
11175 if (fop->objecter_tid) {
11176 tids->push_back(fop->objecter_tid);
11177 fop->objecter_tid = 0;
11178 }
11179 if (fop->io_tids.size()) {
11180 for (auto &p : fop->io_tids) {
11181 tids->push_back(p.second);
11182 p.second = 0;
11183 }
11184 }
11185 if (fop->blocking && fop->obc->is_blocked()) {
11186 fop->obc->stop_block();
11187 kick_object_context_blocked(fop->obc);
11188 }
11189 if (requeue) {
11190 if (fop->op)
11191 requeue_op(fop->op);
11192 requeue_ops(fop->dup_ops);
11193 }
11194 if (fop->on_flush) {
11195 (*(fop->on_flush))();
11196 fop->on_flush = std::nullopt;
11197 }
11198 flush_ops.erase(fop->obc->obs.oi.soid);
11199 }
11200
11201 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
11202 {
11203 dout(10) << __func__ << dendl;
11204 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
11205 while (p != flush_ops.end()) {
11206 cancel_flush((p++)->second, requeue, tids);
11207 }
11208 }
11209
11210 bool PrimaryLogPG::is_present_clone(hobject_t coid)
11211 {
11212 if (!pool.info.allow_incomplete_clones())
11213 return true;
11214 if (is_missing_object(coid))
11215 return true;
11216 ObjectContextRef obc = get_object_context(coid, false);
11217 return obc && obc->obs.exists;
11218 }
11219
11220 // ========================================================================
11221 // cls gather
11222 //
11223
11224 void PrimaryLogPG::cancel_cls_gather(map<hobject_t,CLSGatherOp>::iterator iter, bool requeue,
11225 vector<ceph_tid_t> *tids)
11226 {
11227 auto &cgop = iter->second;
11228 for (std::vector<ceph_tid_t>::iterator p = cgop.objecter_tids.begin(); p != cgop.objecter_tids.end(); p++) {
11229 tids->push_back(*p);
11230 dout(10) << __func__ << " " << cgop.obc->obs.oi.soid << " tid " << *p << dendl;
11231 }
11232 cgop.objecter_tids.clear();
11233 close_op_ctx(cgop.ctx);
11234 cgop.ctx = NULL;
11235 if (requeue) {
11236 if (cgop.op)
11237 requeue_op(cgop.op);
11238 }
11239 cls_gather_ops.erase(iter);
11240 }
11241
11242 void PrimaryLogPG::cancel_cls_gather_ops(bool requeue, vector<ceph_tid_t> *tids)
11243 {
11244 dout(10) << __func__ << dendl;
11245 map<hobject_t,CLSGatherOp>::iterator p = cls_gather_ops.begin();
11246 while (p != cls_gather_ops.end()) {
11247 cancel_cls_gather(p++, requeue, tids);
11248 }
11249 }
11250
11251 // ========================================================================
11252 // rep op gather
11253
11254 class C_OSD_RepopCommit : public Context {
11255 PrimaryLogPGRef pg;
11256 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
11257 public:
11258 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
11259 : pg(pg), repop(repop) {}
11260 void finish(int) override {
11261 pg->repop_all_committed(repop.get());
11262 }
11263 };
11264
11265 void PrimaryLogPG::repop_all_committed(RepGather *repop)
11266 {
11267 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
11268 << dendl;
11269 repop->all_committed = true;
11270 if (!repop->rep_aborted) {
11271 if (repop->v != eversion_t()) {
11272 recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
11273 }
11274 eval_repop(repop);
11275 }
11276 }
11277
11278 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
11279 {
11280 dout(10) << "op_applied version " << applied_version << dendl;
11281 ceph_assert(applied_version != eversion_t());
11282 ceph_assert(applied_version <= info.last_update);
11283 recovery_state.local_write_applied(applied_version);
11284
11285 if (is_primary() && m_scrubber) {
11286 // if there's a scrub operation waiting for the selected chunk to be fully updated -
11287 // allow it to continue
11288 m_scrubber->on_applied_when_primary(recovery_state.get_last_update_applied());
11289 }
11290 }
11291
11292 void PrimaryLogPG::eval_repop(RepGather *repop)
11293 {
11294 jspan span;
11295 if (repop->op) {
11296 span = tracing::osd::tracer.add_span(__func__, repop->op->osd_parent_span);
11297 }
11298 dout(10) << "eval_repop " << *repop
11299 << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
11300
11301 // ondisk?
11302 if (repop->all_committed) {
11303 dout(10) << " commit: " << *repop << dendl;
11304 for (auto p = repop->on_committed.begin();
11305 p != repop->on_committed.end();
11306 repop->on_committed.erase(p++)) {
11307 (*p)();
11308 }
11309 // send dup commits, in order
11310 auto it = waiting_for_ondisk.find(repop->v);
11311 if (it != waiting_for_ondisk.end()) {
11312 ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
11313 for (auto& i : it->second) {
11314 int return_code = repop->r;
11315 if (return_code >= 0) {
11316 return_code = std::get<2>(i);
11317 }
11318 osd->reply_op_error(std::get<0>(i), return_code, repop->v,
11319 std::get<1>(i), std::get<3>(i));
11320 }
11321 waiting_for_ondisk.erase(it);
11322 }
11323
11324 publish_stats_to_osd();
11325
11326 dout(10) << " removing " << *repop << dendl;
11327 ceph_assert(!repop_queue.empty());
11328 dout(20) << " q front is " << *repop_queue.front() << dendl;
11329 if (repop_queue.front() == repop) {
11330 RepGather *to_remove = nullptr;
11331 while (!repop_queue.empty() &&
11332 (to_remove = repop_queue.front())->all_committed) {
11333 repop_queue.pop_front();
11334 for (auto p = to_remove->on_success.begin();
11335 p != to_remove->on_success.end();
11336 to_remove->on_success.erase(p++)) {
11337 (*p)();
11338 }
11339 remove_repop(to_remove);
11340 }
11341 }
11342 }
11343 }
11344
11345 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
11346 {
11347 FUNCTRACE(cct);
11348 const hobject_t& soid = ctx->obs->oi.soid;
11349 dout(7) << "issue_repop rep_tid " << repop->rep_tid
11350 << " o " << soid
11351 << dendl;
11352
11353 jspan span;
11354 if (ctx->op) {
11355 span = tracing::osd::tracer.add_span(__func__, ctx->op->osd_parent_span);
11356 }
11357
11358 repop->v = ctx->at_version;
11359
11360 ctx->op_t->add_obc(ctx->obc);
11361 if (ctx->clone_obc) {
11362 ctx->op_t->add_obc(ctx->clone_obc);
11363 }
11364 if (ctx->head_obc) {
11365 ctx->op_t->add_obc(ctx->head_obc);
11366 }
11367
11368 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
11369 if (!(ctx->log.empty())) {
11370 ceph_assert(ctx->at_version >= projected_last_update);
11371 projected_last_update = ctx->at_version;
11372 }
11373 for (auto &&entry: ctx->log) {
11374 projected_log.add(entry);
11375 }
11376
11377 recovery_state.pre_submit_op(
11378 soid,
11379 ctx->log,
11380 ctx->at_version);
11381 pgbackend->submit_transaction(
11382 soid,
11383 ctx->delta_stats,
11384 ctx->at_version,
11385 std::move(ctx->op_t),
11386 recovery_state.get_pg_trim_to(),
11387 recovery_state.get_min_last_complete_ondisk(),
11388 std::move(ctx->log),
11389 ctx->updated_hset_history,
11390 on_all_commit,
11391 repop->rep_tid,
11392 ctx->reqid,
11393 ctx->op);
11394 }
11395
11396 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
11397 OpContext *ctx,
11398 ceph_tid_t rep_tid)
11399 {
11400 if (ctx->op)
11401 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
11402 else
11403 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
11404
11405 RepGather *repop = new RepGather(
11406 ctx, rep_tid, info.last_complete);
11407
11408 repop->start = ceph_clock_now();
11409
11410 repop_queue.push_back(&repop->queue_item);
11411 repop->get();
11412
11413 osd->logger->inc(l_osd_op_wip);
11414
11415 dout(10) << __func__ << ": " << *repop << dendl;
11416 return repop;
11417 }
11418
11419 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
11420 eversion_t version,
11421 int r,
11422 ObcLockManager &&manager,
11423 OpRequestRef &&op,
11424 std::optional<std::function<void(void)> > &&on_complete)
11425 {
11426 RepGather *repop = new RepGather(
11427 std::move(manager),
11428 std::move(op),
11429 std::move(on_complete),
11430 osd->get_tid(),
11431 info.last_complete,
11432 r);
11433 repop->v = version;
11434
11435 repop->start = ceph_clock_now();
11436
11437 repop_queue.push_back(&repop->queue_item);
11438
11439 osd->logger->inc(l_osd_op_wip);
11440
11441 dout(10) << __func__ << ": " << *repop << dendl;
11442 return boost::intrusive_ptr<RepGather>(repop);
11443 }
11444
11445 void PrimaryLogPG::remove_repop(RepGather *repop)
11446 {
11447 dout(20) << __func__ << " " << *repop << dendl;
11448
11449 for (auto p = repop->on_finish.begin();
11450 p != repop->on_finish.end();
11451 repop->on_finish.erase(p++)) {
11452 (*p)();
11453 }
11454
11455 release_object_locks(
11456 repop->lock_manager);
11457 repop->put();
11458
11459 osd->logger->dec(l_osd_op_wip);
11460 }
11461
11462 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
11463 {
11464 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
11465 ceph_tid_t rep_tid = osd->get_tid();
11466 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
11467 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
11468 ctx->op_t.reset(new PGTransaction());
11469 ctx->mtime = ceph_clock_now();
11470 return ctx;
11471 }
11472
11473 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
11474 {
11475 RepGather *repop = new_repop(ctx.get(), ctx->reqid.tid);
11476 dout(20) << __func__ << " " << repop << dendl;
11477 issue_repop(repop, ctx.get());
11478 eval_repop(repop);
11479 recovery_state.update_trim_to();
11480 repop->put();
11481 }
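// Typical pattern for internal writes built on the two helpers above (a
// sketch; see try_flush_mark_clean() and handle_watch_timeout() in this
// file for real call sites):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   ctx->new_obs = obc->obs;                      // mutate object state
//   finish_ctx(ctx.get(), pg_log_entry_t::CLEAN); // build the log entry
//   simple_opc_submit(std::move(ctx));            // replicate and commit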
11482
11483
11484 void PrimaryLogPG::submit_log_entries(
11485 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
11486 ObcLockManager &&manager,
11487 std::optional<std::function<void(void)> > &&_on_complete,
11488 OpRequestRef op,
11489 int r)
11490 {
11491 dout(10) << __func__ << " " << entries << dendl;
11492 ceph_assert(is_primary());
11493
11494 eversion_t version;
11495 if (!entries.empty()) {
11496 ceph_assert(entries.rbegin()->version >= projected_last_update);
11497 version = projected_last_update = entries.rbegin()->version;
11498 }
11499
11500 boost::intrusive_ptr<RepGather> repop;
11501 std::optional<std::function<void(void)> > on_complete;
11502 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11503 repop = new_repop(
11504 version,
11505 r,
11506 std::move(manager),
11507 std::move(op),
11508 std::move(_on_complete));
11509 } else {
11510 on_complete = std::move(_on_complete);
11511 }
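  // (Peers running releases older than jewel can't handle
  // MOSDPGUpdateLogMissing; for them the lambda below sends a full
  // MOSDPGLog instead, and _on_complete is kept locally rather than
  // attached to a repop.)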
11512
11513 pgbackend->call_write_ordered(
11514 [this, entries, repop, on_complete]() {
11515 ObjectStore::Transaction t;
11516 eversion_t old_last_update = info.last_update;
11517 recovery_state.merge_new_log_entries(
11518 entries, t, recovery_state.get_pg_trim_to(),
11519 recovery_state.get_min_last_complete_ondisk());
11520
11521 set<pg_shard_t> waiting_on;
11522 for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
11523 i != get_acting_recovery_backfill().end();
11524 ++i) {
11525 pg_shard_t peer(*i);
11526 if (peer == pg_whoami) continue;
11527 ceph_assert(recovery_state.get_peer_missing().count(peer));
11528 ceph_assert(recovery_state.has_peer_info(peer));
11529 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11530 ceph_assert(repop);
11531 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
11532 entries,
11533 spg_t(info.pgid.pgid, i->shard),
11534 pg_whoami.shard,
11535 get_osdmap_epoch(),
11536 get_last_peering_reset(),
11537 repop->rep_tid,
11538 recovery_state.get_pg_trim_to(),
11539 recovery_state.get_min_last_complete_ondisk());
11540 osd->send_message_osd_cluster(
11541 peer.osd, m, get_osdmap_epoch());
11542 waiting_on.insert(peer);
11543 } else {
11544 MOSDPGLog *m = new MOSDPGLog(
11545 peer.shard, pg_whoami.shard,
11546 info.last_update.epoch,
11547 info, get_last_peering_reset());
11548 m->log.log = entries;
11549 m->log.tail = old_last_update;
11550 m->log.head = info.last_update;
11551 osd->send_message_osd_cluster(
11552 peer.osd, m, get_osdmap_epoch());
11553 }
11554 }
11555 ceph_tid_t rep_tid = repop->rep_tid;
11556 waiting_on.insert(pg_whoami);
11557 log_entry_update_waiting_on.insert(
11558 make_pair(
11559 rep_tid,
11560 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
11561 ));
11562 struct OnComplete : public Context {
11563 PrimaryLogPGRef pg;
11564 ceph_tid_t rep_tid;
11565 epoch_t epoch;
11566 OnComplete(
11567 PrimaryLogPGRef pg,
11568 ceph_tid_t rep_tid,
11569 epoch_t epoch)
11570 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
11571 void finish(int) override {
11572 std::scoped_lock l{*pg};
11573 if (!pg->pg_has_reset_since(epoch)) {
11574 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
11575 ceph_assert(it != pg->log_entry_update_waiting_on.end());
11576 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
11577 ceph_assert(it2 != it->second.waiting_on.end());
11578 it->second.waiting_on.erase(it2);
11579 if (it->second.waiting_on.empty()) {
11580 pg->repop_all_committed(it->second.repop.get());
11581 pg->log_entry_update_waiting_on.erase(it);
11582 }
11583 }
11584 }
11585 };
11586 t.register_on_commit(
11587 new OnComplete{this, rep_tid, get_osdmap_epoch()});
11588 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
11589 ceph_assert(r == 0);
11590 op_applied(info.last_update);
11591 });
11592
11593 recovery_state.update_trim_to();
11594 }
11595
11596 void PrimaryLogPG::cancel_log_updates()
11597 {
11598 // get rid of all the LogUpdateCtx so their references to repops are
11599 // dropped
11600 log_entry_update_waiting_on.clear();
11601 }
11602
11603 // -------------------------------------------------------
11604
11605 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
11606 {
11607 std::scoped_lock l{*this};
11608 pair<hobject_t, ObjectContextRef> i;
11609 while (object_contexts.get_next(i.first, &i)) {
11610 ObjectContextRef obc(i.second);
11611 get_obc_watchers(obc, *ls);
11612 }
11613 }
11614
11615 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
11616 {
11617 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11618 obc->watchers.begin();
11619 j != obc->watchers.end();
11620 ++j) {
11621 obj_watch_item_t owi;
11622
11623 owi.obj = obc->obs.oi.soid;
11624 owi.wi.addr = j->second->get_peer_addr();
11625 owi.wi.name = j->second->get_entity();
11626 owi.wi.cookie = j->second->get_cookie();
11627 owi.wi.timeout_seconds = j->second->get_timeout();
11628
11629 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
11630 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
11631
11632 pg_watchers.push_back(owi);
11633 }
11634 }
11635
11636 void PrimaryLogPG::check_blocklisted_watchers()
11637 {
11638 dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl;
11639 pair<hobject_t, ObjectContextRef> i;
11640 while (object_contexts.get_next(i.first, &i))
11641 check_blocklisted_obc_watchers(i.second);
11642 }
11643
11644 void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc)
11645 {
11646 dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
11647 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
11648 obc->watchers.begin();
11649 k != obc->watchers.end();
11650 ) {
11651     // advance the iterator now so handle_watch_timeout() can erase the element
11652 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
11653 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
11654 entity_addr_t ea = j->second->get_peer_addr();
11655 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
11656 if (get_osdmap()->is_blocklisted(ea)) {
11657 dout(10) << "watch: Found blocklisted watcher for " << ea << dendl;
11658 ceph_assert(j->second->get_pg() == this);
11659 j->second->unregister_cb();
11660 handle_watch_timeout(j->second);
11661 }
11662 }
11663 }
11664
11665 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
11666 {
11667 ceph_assert(is_primary() && is_active());
11668 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
11669 ceph_assert((recovering.count(obc->obs.oi.soid) ||
11670 !is_missing_object(obc->obs.oi.soid)) ||
11671 (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
11672 it_objects->second->op ==
11673 pg_log_entry_t::LOST_REVERT &&
11674 it_objects->second->reverting_to ==
11675 obc->obs.oi.version));
11676
11677 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
11678 ceph_assert(obc->watchers.empty());
11679 // populate unconnected_watchers
11680 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
11681 obc->obs.oi.watchers.begin();
11682 p != obc->obs.oi.watchers.end();
11683 ++p) {
11684 utime_t expire = info.stats.last_became_active;
11685 expire += p->second.timeout_seconds;
11686 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
11687 WatchRef watch(
11688 Watch::makeWatchRef(
11689 this, osd, obc, p->second.timeout_seconds, p->first.first,
11690 p->first.second, p->second.addr));
11691 watch->disconnect();
11692 obc->watchers.insert(
11693 make_pair(
11694 make_pair(p->first.first, p->first.second),
11695 watch));
11696 }
11697   // Look for watchers from blocklisted clients and drop them
11698 check_blocklisted_obc_watchers(obc);
11699 }
11700
11701 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
11702 {
11703 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
11704 dout(10) << "handle_watch_timeout obc " << obc << dendl;
11705
11706 if (!is_active()) {
11707 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
11708 return;
11709 }
11710 if (!obc->obs.exists) {
11711 dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
11712 return;
11713 }
11714 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
11715 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
11716 watch->get_delayed_cb()
11717 );
11718 dout(10) << "handle_watch_timeout waiting for degraded on obj "
11719 << obc->obs.oi.soid
11720 << dendl;
11721 return;
11722 }
11723
11724 if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
11725 dout(10) << "handle_watch_timeout waiting for scrub on obj "
11726 << obc->obs.oi.soid
11727 << dendl;
11728 m_scrubber->add_callback(
11729 watch->get_delayed_cb() // This callback!
11730 );
11731 return;
11732 }
11733
11734 OpContextUPtr ctx = simple_opc_create(obc);
11735 ctx->at_version = get_next_version();
11736
11737 object_info_t& oi = ctx->new_obs.oi;
11738 oi.watchers.erase(make_pair(watch->get_cookie(),
11739 watch->get_entity()));
11740
11741 list<watch_disconnect_t> watch_disconnects = {
11742 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
11743 };
11744 ctx->register_on_success(
11745 [this, obc, watch_disconnects]() {
11746 complete_disconnect_watches(obc, watch_disconnects);
11747 });
11748
11749
11750 PGTransaction *t = ctx->op_t.get();
11751 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
11752 ctx->at_version,
11753 oi.version,
11754 0,
11755 osd_reqid_t(), ctx->mtime, 0));
11756
11757 oi.prior_version = obc->obs.oi.version;
11758 oi.version = ctx->at_version;
11759 bufferlist bl;
11760 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11761 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
11762
11763 // apply new object state.
11764 ctx->obc->obs = ctx->new_obs;
11765
11766 // no ctx->delta_stats
11767 simple_opc_submit(std::move(ctx));
11768 }
11769
11770 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
11771 SnapSetContext *ssc)
11772 {
11773 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
11774 ceph_assert(obc->destructor_callback == NULL);
11775 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11776 obc->obs.oi = oi;
11777 obc->obs.exists = false;
11778 obc->ssc = ssc;
11779 if (ssc)
11780 register_snapset_context(ssc);
11781 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
11782 if (is_active())
11783 populate_obc_watchers(obc);
11784 return obc;
11785 }
11786
11787 ObjectContextRef PrimaryLogPG::get_object_context(
11788 const hobject_t& soid,
11789 bool can_create,
11790 const map<string, bufferlist, less<>> *attrs)
11791 {
11792 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
11793 ceph_assert(
11794 attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
11795 // or this is a revert... see recover_primary()
11796 (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
11797 it_objects->second->op ==
11798 pg_log_entry_t::LOST_REVERT));
11799 ObjectContextRef obc = object_contexts.lookup(soid);
11800 osd->logger->inc(l_osd_object_ctx_cache_total);
11801 if (obc) {
11802 osd->logger->inc(l_osd_object_ctx_cache_hit);
11803 dout(10) << __func__ << ": found obc in cache: " << obc
11804 << dendl;
11805 } else {
11806 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11807 // check disk
11808 bufferlist bv;
11809 if (attrs) {
11810 auto it_oi = attrs->find(OI_ATTR);
11811 ceph_assert(it_oi != attrs->end());
11812 bv = it_oi->second;
11813 } else {
11814 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11815 if (r < 0) {
11816 if (!can_create) {
11817 dout(10) << __func__ << ": no obc for soid "
11818 << soid << " and !can_create"
11819 << dendl;
11820 return ObjectContextRef(); // -ENOENT!
11821 }
11822
11823 dout(10) << __func__ << ": no obc for soid "
11824 << soid << " but can_create"
11825 << dendl;
11826 // new object.
11827 object_info_t oi(soid);
11828 SnapSetContext *ssc = get_snapset_context(
11829 soid, true, 0, false);
11830 ceph_assert(ssc);
11831 obc = create_object_context(oi, ssc);
11832 dout(10) << __func__ << ": " << obc << " " << soid
11833 << " " << obc->rwstate
11834 << " oi: " << obc->obs.oi
11835 << " ssc: " << obc->ssc
11836 << " snapset: " << obc->ssc->snapset << dendl;
11837 return obc;
11838 }
11839 }
11840
11841 object_info_t oi;
11842 try {
11843 bufferlist::const_iterator bliter = bv.begin();
11844 decode(oi, bliter);
11845 } catch (...) {
11846 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11847 return ObjectContextRef(); // -ENOENT!
11848 }
11849
11850 ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
11851
11852 obc = object_contexts.lookup_or_create(oi.soid);
11853 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11854 obc->obs.oi = oi;
11855 obc->obs.exists = true;
11856
11857 obc->ssc = get_snapset_context(
11858 soid, true,
11859 soid.has_snapset() ? attrs : 0);
11860
11861 if (is_primary() && is_active())
11862 populate_obc_watchers(obc);
11863
11864 if (pool.info.is_erasure()) {
11865 if (attrs) {
11866 obc->attr_cache = *attrs;
11867 } else {
11868 int r = pgbackend->objects_get_attrs(
11869 soid,
11870 &obc->attr_cache);
11871 ceph_assert(r == 0);
11872 }
11873 }
11874
11875 dout(10) << __func__ << ": creating obc from disk: " << obc
11876 << dendl;
11877 }
11878
11879 // XXX: Caller doesn't expect this
11880 if (obc->ssc == NULL) {
11881 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11882 return ObjectContextRef(); // -ENOENT!
11883 }
11884
11885 dout(10) << __func__ << ": " << obc << " " << soid
11886 << " " << obc->rwstate
11887 << " oi: " << obc->obs.oi
11888 << " exists: " << (int)obc->obs.exists
11889 << " ssc: " << obc->ssc
11890 << " snapset: " << obc->ssc->snapset << dendl;
11891 return obc;
11892 }
11893
11894 void PrimaryLogPG::context_registry_on_change()
11895 {
11896 pair<hobject_t, ObjectContextRef> i;
11897 while (object_contexts.get_next(i.first, &i)) {
11898 ObjectContextRef obc(i.second);
11899 if (obc) {
11900 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11901 obc->watchers.begin();
11902 j != obc->watchers.end();
11903 obc->watchers.erase(j++)) {
11904 j->second->discard();
11905 }
11906 }
11907 }
11908 }
11909
11910
11911 /*
11912 * If we return an error, and set *pmissing, then promoting that
11913 * object may help.
11914 *
11915 * If we return -EAGAIN, we will always set *pmissing to the missing
11916 * object to wait for.
11917 *
11918 * If we return an error but do not set *pmissing, then we know the
11919 * object does not exist.
11920 */
11921 int PrimaryLogPG::find_object_context(const hobject_t& oid,
11922 ObjectContextRef *pobc,
11923 bool can_create,
11924 bool map_snapid_to_clone,
11925 hobject_t *pmissing)
11926 {
11927 FUNCTRACE(cct);
11928 ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
11929 // want the head?
11930 if (oid.snap == CEPH_NOSNAP) {
11931 ObjectContextRef obc = get_object_context(oid, can_create);
11932 if (!obc) {
11933 if (pmissing)
11934 *pmissing = oid;
11935 return -ENOENT;
11936 }
11937 dout(10) << __func__ << " " << oid
11938 << " @" << oid.snap
11939 << " oi=" << obc->obs.oi
11940 << dendl;
11941 *pobc = obc;
11942
11943 return 0;
11944 }
11945
11946 // we want a snap
11947
11948 hobject_t head = oid.get_head();
11949 SnapSetContext *ssc = get_snapset_context(oid, can_create);
11950 if (!ssc || !(ssc->exists || can_create)) {
11951 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11952 if (pmissing)
11953 *pmissing = head; // start by getting the head
11954 if (ssc)
11955 put_snapset_context(ssc);
11956 return -ENOENT;
11957 }
11958
11959 if (map_snapid_to_clone) {
11960 dout(10) << __func__ << " " << oid << " @" << oid.snap
11961 << " snapset " << ssc->snapset
11962 << " map_snapid_to_clone=true" << dendl;
11963 if (oid.snap > ssc->snapset.seq) {
11964 // already must be readable
11965 ObjectContextRef obc = get_object_context(head, false);
11966 dout(10) << __func__ << " " << oid << " @" << oid.snap
11967 << " snapset " << ssc->snapset
11968 << " maps to head" << dendl;
11969 *pobc = obc;
11970 put_snapset_context(ssc);
11971 return (obc && obc->obs.exists) ? 0 : -ENOENT;
11972 } else {
11973 vector<snapid_t>::const_iterator citer = std::find(
11974 ssc->snapset.clones.begin(),
11975 ssc->snapset.clones.end(),
11976 oid.snap);
11977 if (citer == ssc->snapset.clones.end()) {
11978 dout(10) << __func__ << " " << oid << " @" << oid.snap
11979 << " snapset " << ssc->snapset
11980 << " maps to nothing" << dendl;
11981 put_snapset_context(ssc);
11982 return -ENOENT;
11983 }
11984
11985 dout(10) << __func__ << " " << oid << " @" << oid.snap
11986 << " snapset " << ssc->snapset
11987 << " maps to " << oid << dendl;
11988
11989 if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
11990 dout(10) << __func__ << " " << oid << " @" << oid.snap
11991 << " snapset " << ssc->snapset
11992 << " " << oid << " is missing" << dendl;
11993 if (pmissing)
11994 *pmissing = oid;
11995 put_snapset_context(ssc);
11996 return -EAGAIN;
11997 }
11998
11999 ObjectContextRef obc = get_object_context(oid, false);
12000 if (!obc || !obc->obs.exists) {
12001 dout(10) << __func__ << " " << oid << " @" << oid.snap
12002 << " snapset " << ssc->snapset
12003 << " " << oid << " is not present" << dendl;
12004 if (pmissing)
12005 *pmissing = oid;
12006 put_snapset_context(ssc);
12007 return -ENOENT;
12008 }
12009 dout(10) << __func__ << " " << oid << " @" << oid.snap
12010 << " snapset " << ssc->snapset
12011 << " " << oid << " HIT" << dendl;
12012 *pobc = obc;
12013 put_snapset_context(ssc);
12014 return 0;
12015 }
12016     ceph_abort(); // unreachable
12017 }
12018
12019 dout(10) << __func__ << " " << oid << " @" << oid.snap
12020 << " snapset " << ssc->snapset << dendl;
12021
12022 // head?
12023 if (oid.snap > ssc->snapset.seq) {
12024 ObjectContextRef obc = get_object_context(head, false);
12025 dout(10) << __func__ << " " << head
12026 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
12027 << " -- HIT " << obc->obs
12028 << dendl;
12029 if (!obc->ssc)
12030 obc->ssc = ssc;
12031 else {
12032 ceph_assert(ssc == obc->ssc);
12033 put_snapset_context(ssc);
12034 }
12035 *pobc = obc;
12036 return 0;
12037 }
12038
12039 // which clone would it be?
12040 unsigned k = 0;
12041 while (k < ssc->snapset.clones.size() &&
12042 ssc->snapset.clones[k] < oid.snap)
12043 k++;
12044 if (k == ssc->snapset.clones.size()) {
12045 dout(10) << __func__ << " no clones with last >= oid.snap "
12046 << oid.snap << " -- DNE" << dendl;
12047 put_snapset_context(ssc);
12048 return -ENOENT;
12049 }
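  // Example (hypothetical snapset): with clones [2, 5, 8] and
  // oid.snap = 4, the loop above selects k = 1, i.e. clone 5 -- the
  // oldest clone whose id is >= the requested snap; whether that clone
  // actually contains snap 4 is verified against clone_snaps below.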
12050 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
12051 info.pgid.pool(), oid.get_namespace());
12052
12053 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
12054 dout(20) << __func__ << " " << soid << " missing, try again later"
12055 << dendl;
12056 if (pmissing)
12057 *pmissing = soid;
12058 put_snapset_context(ssc);
12059 return -EAGAIN;
12060 }
12061
12062 ObjectContextRef obc = get_object_context(soid, false);
12063 if (!obc || !obc->obs.exists) {
12064 if (pmissing)
12065 *pmissing = soid;
12066 put_snapset_context(ssc);
12067 if (is_primary()) {
12068 if (is_degraded_or_backfilling_object(soid)) {
12069 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
12070 return -EAGAIN;
12071 } else if (is_degraded_on_async_recovery_target(soid)) {
12072 dout(20) << __func__ << " clone is recovering " << soid << dendl;
12073 return -EAGAIN;
12074 } else {
12075 dout(20) << __func__ << " missing clone " << soid << dendl;
12076 return -ENOENT;
12077 }
12078 } else {
12079       dout(20) << __func__ << " replica missing clone " << soid << dendl;
12080 return -ENOENT;
12081 }
12082 }
12083
12084 if (!obc->ssc) {
12085 obc->ssc = ssc;
12086 } else {
12087 ceph_assert(obc->ssc == ssc);
12088 put_snapset_context(ssc);
12089 }
12090 ssc = 0;
12091
12092 // clone
12093 dout(20) << __func__ << " " << soid
12094 << " snapset " << obc->ssc->snapset
12095 << dendl;
12096 snapid_t first, last;
12097 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
12098 ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
12099 if (p->second.empty()) {
12100 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
12101 ceph_assert(!cct->_conf->osd_debug_verify_snaps);
12102 return -ENOENT;
12103 }
12104 if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
12105 p->second.end()) {
12106 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
12107 << " does not contain " << oid.snap << " -- DNE" << dendl;
12108 return -ENOENT;
12109 }
12110 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
12111 dout(20) << __func__ << " " << soid << " snap " << oid.snap
12112 << " in removed_snaps_queue" << " -- DNE" << dendl;
12113 return -ENOENT;
12114 }
12115 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
12116 << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
12117 *pobc = obc;
12118 return 0;
12119 }
12120
12121 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
12122 {
12123 if (obc->ssc)
12124 put_snapset_context(obc->ssc);
12125 }
12126
12127 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
12128 {
12129 object_info_t& oi = obc->obs.oi;
12130
12131 dout(10) << __func__ << " " << oi.soid << dendl;
12132 ceph_assert(!oi.soid.is_snapdir());
12133
12134 object_stat_sum_t stat;
12135 stat.num_objects++;
12136 if (oi.is_dirty())
12137 stat.num_objects_dirty++;
12138 if (oi.is_whiteout())
12139 stat.num_whiteouts++;
12140 if (oi.is_omap())
12141 stat.num_objects_omap++;
12142 if (oi.is_cache_pinned())
12143 stat.num_objects_pinned++;
12144 if (oi.has_manifest())
12145 stat.num_objects_manifest++;
12146
12147 if (oi.soid.is_snap()) {
12148 stat.num_object_clones++;
12149
12150 if (!obc->ssc)
12151 obc->ssc = get_snapset_context(oi.soid, false);
12152 ceph_assert(obc->ssc);
12153 stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
12154 } else {
12155 stat.num_bytes += oi.size;
12156 }
12157
12158 // add it in
12159 pgstat->stats.sum.add(stat);
12160 }
12161
12162 void PrimaryLogPG::requeue_op_blocked_by_object(const hobject_t &soid) {
12163 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
12164 if (p != waiting_for_blocked_object.end()) {
12165 list<OpRequestRef>& ls = p->second;
12166 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
12167 requeue_ops(ls);
12168 waiting_for_blocked_object.erase(p);
12169 }
12170 }
12171
12172 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
12173 {
12174 const hobject_t& soid = obc->obs.oi.soid;
12175 if (obc->is_blocked()) {
12176 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
12177 return;
12178 }
12179
12180 requeue_op_blocked_by_object(soid);
12181
12182 map<hobject_t, ObjectContextRef>::iterator i =
12183 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
12184 if (i != objects_blocked_on_snap_promotion.end()) {
12185 ceph_assert(i->second == obc);
12186 ObjectContextRef head_obc = get_object_context(i->first, false);
12187 head_obc->stop_block();
12188 // kick blocked ops (head)
12189 requeue_op_blocked_by_object(i->first);
12190 objects_blocked_on_snap_promotion.erase(i);
12191 }
12192
12193 if (obc->requeue_scrub_on_unblock) {
12194
12195 obc->requeue_scrub_on_unblock = false;
12196
12197 dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
12198
12199 // only requeue if we are still active: we may be unblocking
12200 // because we are resetting for a new peering interval
12201 if (is_active()) {
12202 osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
12203 }
12204 }
12205 }
12206
12207 SnapSetContext *PrimaryLogPG::get_snapset_context(
12208 const hobject_t& oid,
12209 bool can_create,
12210 const map<string, bufferlist, less<>> *attrs,
12211 bool oid_existed)
12212 {
12213 std::lock_guard l(snapset_contexts_lock);
12214 SnapSetContext *ssc;
12215 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
12216 oid.get_snapdir());
12217 if (p != snapset_contexts.end()) {
12218 if (can_create || p->second->exists) {
12219 ssc = p->second;
12220 } else {
12221 return NULL;
12222 }
12223 } else {
12224 bufferlist bv;
12225 if (!attrs) {
12226 int r = -ENOENT;
12227 if (!(oid.is_head() && !oid_existed)) {
12228 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
12229 }
12230 if (r < 0 && !can_create)
12231 return NULL;
12232 } else {
12233 auto it_ss = attrs->find(SS_ATTR);
12234 ceph_assert(it_ss != attrs->end());
12235 bv = it_ss->second;
12236 }
12237 ssc = new SnapSetContext(oid.get_snapdir());
12238 _register_snapset_context(ssc);
12239 if (bv.length()) {
12240 bufferlist::const_iterator bvp = bv.begin();
12241 try {
12242 ssc->snapset.decode(bvp);
12243 } catch (const ceph::buffer::error& e) {
12244 dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl;
12245 return NULL;
12246 }
12247 ssc->exists = true;
12248 } else {
12249 ssc->exists = false;
12250 }
12251 }
12252 ceph_assert(ssc);
12253 ssc->ref++;
12254 return ssc;
12255 }
12256
12257 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
12258 {
12259 std::lock_guard l(snapset_contexts_lock);
12260 --ssc->ref;
12261 if (ssc->ref == 0) {
12262 if (ssc->registered)
12263 snapset_contexts.erase(ssc->oid);
12264 delete ssc;
12265 }
12266 }
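// SnapSetContexts are refcounted: every successful get_snapset_context()
// must be balanced by a put_snapset_context() (or the ref handed off to
// an obc, as find_object_context() does above); the final put deletes
// the context.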
12267
12268 /*
12269 * Return values:
12270 * NONE - didn't pull anything
12271 * YES - pulled what the caller wanted
12272 * HEAD - needed to pull head first
12273 */
12274 enum { PULL_NONE, PULL_HEAD, PULL_YES };
12275
12276 int PrimaryLogPG::recover_missing(
12277 const hobject_t &soid, eversion_t v,
12278 int priority,
12279 PGBackend::RecoveryHandle *h)
12280 {
12281 if (recovery_state.get_missing_loc().is_unfound(soid)) {
12282 dout(7) << __func__ << " " << soid
12283 << " v " << v
12284 << " but it is unfound" << dendl;
12285 return PULL_NONE;
12286 }
12287
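  // The object was deleted: remove it locally, then either declare
  // global recovery (no replica still lists it as missing) or propagate
  // the delete to the replicas that do -- see the lambda below.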
12288 if (recovery_state.get_missing_loc().is_deleted(soid)) {
12289 start_recovery_op(soid);
12290 ceph_assert(!recovering.count(soid));
12291 recovering.insert(make_pair(soid, ObjectContextRef()));
12292 epoch_t cur_epoch = get_osdmap_epoch();
12293 remove_missing_object(soid, v, new LambdaContext(
12294 [=](int) {
12295 std::scoped_lock locker{*this};
12296 if (!pg_has_reset_since(cur_epoch)) {
12297 bool object_missing = false;
12298 for (const auto& shard : get_acting_recovery_backfill()) {
12299 if (shard == pg_whoami)
12300 continue;
12301 if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
12302 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
12303 object_missing = true;
12304 break;
12305 }
12306 }
12307 if (!object_missing) {
12308 object_stat_sum_t stat_diff;
12309 stat_diff.num_objects_recovered = 1;
12310 if (scrub_after_recovery)
12311 stat_diff.num_objects_repaired = 1;
12312 on_global_recover(soid, stat_diff, true);
12313 } else {
12314 auto recovery_handle = pgbackend->open_recovery_op();
12315 pgbackend->recover_delete_object(soid, v, recovery_handle);
12316 pgbackend->run_recovery_op(recovery_handle, priority);
12317 }
12318 }
12319 }));
12320 return PULL_YES;
12321 }
12322
12323   // is this a snapped object? if so, consult the snapset... we may not need the entire object!
12324 ObjectContextRef obc;
12325 ObjectContextRef head_obc;
12326 if (soid.snap && soid.snap < CEPH_NOSNAP) {
12327 // do we have the head?
12328 hobject_t head = soid.get_head();
12329 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
12330 if (recovering.count(head)) {
12331 dout(10) << " missing but already recovering head " << head << dendl;
12332 return PULL_NONE;
12333 } else {
12334 int r = recover_missing(
12335 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
12336 h);
12337 if (r != PULL_NONE)
12338 return PULL_HEAD;
12339 return PULL_NONE;
12340 }
12341 }
12342 head_obc = get_object_context(
12343 head,
12344 false,
12345 0);
12346 ceph_assert(head_obc);
12347 }
12348 start_recovery_op(soid);
12349 ceph_assert(!recovering.count(soid));
12350 recovering.insert(make_pair(soid, obc));
12351 int r = pgbackend->recover_object(
12352 soid,
12353 v,
12354 head_obc,
12355 obc,
12356 h);
12357   // This is only a pull, which shouldn't return an error
12358 ceph_assert(r >= 0);
12359 return PULL_YES;
12360 }
12361
12362 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
12363 eversion_t v, Context *on_complete)
12364 {
12365 dout(20) << __func__ << " " << soid << " " << v << dendl;
12366 ceph_assert(on_complete != nullptr);
12367 // delete locally
12368 ObjectStore::Transaction t;
12369 remove_snap_mapped_object(t, soid);
12370
12371 ObjectRecoveryInfo recovery_info;
12372 recovery_info.soid = soid;
12373 recovery_info.version = v;
12374
12375 epoch_t cur_epoch = get_osdmap_epoch();
12376 t.register_on_complete(new LambdaContext(
12377 [=](int) {
12378 std::unique_lock locker{*this};
12379 if (!pg_has_reset_since(cur_epoch)) {
12380 ObjectStore::Transaction t2;
12381 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
12382 t2.register_on_complete(on_complete);
12383 int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
12384 ceph_assert(r == 0);
12385 locker.unlock();
12386 } else {
12387 locker.unlock();
12388 on_complete->complete(-EAGAIN);
12389 }
12390 }));
12391 int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
12392 ceph_assert(r == 0);
12393 }
12394
12395 void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
12396 {
12397 dout(10) << __func__ << " " << oid << dendl;
12398 if (callbacks_for_degraded_object.count(oid)) {
12399 list<Context*> contexts;
12400 contexts.swap(callbacks_for_degraded_object[oid]);
12401 callbacks_for_degraded_object.erase(oid);
12402 for (list<Context*>::iterator i = contexts.begin();
12403 i != contexts.end();
12404 ++i) {
12405 (*i)->complete(0);
12406 }
12407 }
12408 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
12409 oid.get_head());
12410 if (i != objects_blocked_on_degraded_snap.end() &&
12411 i->second == oid.snap)
12412 objects_blocked_on_degraded_snap.erase(i);
12413 }
12414
12415 void PrimaryLogPG::_committed_pushed_object(
12416 epoch_t epoch, eversion_t last_complete)
12417 {
12418 std::scoped_lock locker{*this};
12419 if (!pg_has_reset_since(epoch)) {
12420 recovery_state.recovery_committed_to(last_complete);
12421 } else {
12422 dout(10) << __func__
12423 << " pg has changed, not touching last_complete_ondisk" << dendl;
12424 }
12425 }
12426
12427 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
12428 {
12429 dout(20) << __func__ << dendl;
12430 if (obc) {
12431 dout(20) << "obc = " << *obc << dendl;
12432 }
12433 ceph_assert(active_pushes >= 1);
12434 --active_pushes;
12435
12436 // requeue an active chunky scrub waiting on recovery ops
12437 if (!recovery_state.is_deleting() && active_pushes == 0 &&
12438 is_scrub_active()) {
12439
12440 osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
12441 }
12442 }
12443
12444 void PrimaryLogPG::_applied_recovered_object_replica()
12445 {
12446 dout(20) << __func__ << dendl;
12447 ceph_assert(active_pushes >= 1);
12448 --active_pushes;
12449
12450 // requeue an active scrub waiting on recovery ops
12451 if (!recovery_state.is_deleting() && active_pushes == 0 &&
12452 is_scrub_active()) {
12453
12454 osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
12455 }
12456 }
12457
12458 void PrimaryLogPG::on_failed_pull(
12459 const set<pg_shard_t> &from,
12460 const hobject_t &soid,
12461 const eversion_t &v)
12462 {
12463 dout(20) << __func__ << ": " << soid << dendl;
12464 ceph_assert(recovering.count(soid));
12465 auto obc = recovering[soid];
12466 if (obc) {
12467 list<OpRequestRef> blocked_ops;
12468 obc->drop_recovery_read(&blocked_ops);
12469 requeue_ops(blocked_ops);
12470 }
12471 recovering.erase(soid);
12472 for (auto&& i : from) {
12473 if (i != pg_whoami) { // we'll get it below in primary_error
12474 recovery_state.force_object_missing(i, soid, v);
12475 }
12476 }
12477
12478 dout(0) << __func__ << " " << soid << " from shard " << from
12479 << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
12480 << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
12481 << dendl;
12482 finish_recovery_op(soid); // close out this attempt
12483 finish_degraded_object(soid);
12484
12485 if (from.count(pg_whoami)) {
12486 dout(0) << " primary missing oid " << soid << " version " << v << dendl;
12487 primary_error(soid, v);
12488 backfills_in_flight.erase(soid);
12489 }
12490 }
12491
12492 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
12493 {
12494 eversion_t v;
12495 pg_missing_item pmi;
12496 bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
12497 ceph_assert(is_missing);
12498 v = pmi.have;
12499 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
12500
12501 ceph_assert(!get_acting_recovery_backfill().empty());
12502 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
12503 i != get_acting_recovery_backfill().end();
12504 ++i) {
12505 if (*i == get_primary()) continue;
12506 pg_shard_t peer = *i;
12507 if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
12508 continue;
12509 }
12510 eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
12511 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
12512 if (h > v)
12513 v = h;
12514 }
12515
12516 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
12517 return v;
12518 }
12519
12520 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
12521 {
12522 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
12523 op->get_req());
12524 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
12525 ObjectStore::Transaction t;
12526 std::optional<eversion_t> op_trim_to, op_roll_forward_to;
12527 if (m->pg_trim_to != eversion_t())
12528 op_trim_to = m->pg_trim_to;
12529 if (m->pg_roll_forward_to != eversion_t())
12530 op_roll_forward_to = m->pg_roll_forward_to;
12531
12532 dout(20) << __func__
12533 << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
12534
12535 recovery_state.append_log_entries_update_missing(
12536 m->entries, t, op_trim_to, op_roll_forward_to);
12537 eversion_t new_lcod = info.last_complete;
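  // Sample last_complete before the transaction commits so the reply can
  // carry this replica's last_complete_ondisk back to the primary.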
12538
12539 Context *complete = new LambdaContext(
12540 [=](int) {
12541 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
12542 op->get_req());
12543 std::scoped_lock locker{*this};
12544 if (!pg_has_reset_since(msg->get_epoch())) {
12545 update_last_complete_ondisk(new_lcod);
12546 MOSDPGUpdateLogMissingReply *reply =
12547 new MOSDPGUpdateLogMissingReply(
12548 spg_t(info.pgid.pgid, primary_shard().shard),
12549 pg_whoami.shard,
12550 msg->get_epoch(),
12551 msg->min_epoch,
12552 msg->get_tid(),
12553 new_lcod);
12554 reply->set_priority(CEPH_MSG_PRIO_HIGH);
12555 msg->get_connection()->send_message(reply);
12556 }
12557 });
12558
12559 if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
12560 t.register_on_commit(complete);
12561 } else {
12562 /* Hack to work around the fact that ReplicatedBackend sends
12563 * ack+commit if commit happens first
12564 *
12565 * This behavior is no longer necessary, but we preserve it so old
12566 * primaries can keep their repops in order */
12567 if (pool.info.is_erasure()) {
12568 t.register_on_complete(complete);
12569 } else {
12570 t.register_on_commit(complete);
12571 }
12572 }
12573 int tr = osd->store->queue_transaction(
12574 ch,
12575 std::move(t),
12576 nullptr);
12577 ceph_assert(tr == 0);
12578 op_applied(info.last_update);
12579 }
12580
12581 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
12582 {
12583 const MOSDPGUpdateLogMissingReply *m =
12584 static_cast<const MOSDPGUpdateLogMissingReply*>(
12585 op->get_req());
12586 dout(20) << __func__ << " got reply from "
12587 << m->get_from() << dendl;
12588
12589 auto it = log_entry_update_waiting_on.find(m->get_tid());
12590 if (it != log_entry_update_waiting_on.end()) {
12591 if (it->second.waiting_on.count(m->get_from())) {
12592 it->second.waiting_on.erase(m->get_from());
12593 if (m->last_complete_ondisk != eversion_t()) {
12594 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
12595 }
12596 } else {
12597 osd->clog->error()
12598 << info.pgid << " got reply "
12599 << *m << " from shard we are not waiting for "
12600 << m->get_from();
12601 }
12602
12603 if (it->second.waiting_on.empty()) {
12604 repop_all_committed(it->second.repop.get());
12605 log_entry_update_waiting_on.erase(it);
12606 }
12607 } else {
12608 osd->clog->error()
12609 << info.pgid << " got reply "
12610 << *m << " on unknown tid " << m->get_tid();
12611 }
12612 }
12613
12614 /* Mark all unfound objects as lost: LOST_REVERT reverts each object to the
12615 * newest version still available on a shard; LOST_DELETE forgets it. (LOST_MARK: unimplemented.) */
12616 void PrimaryLogPG::mark_all_unfound_lost(
12617 int what,
12618 std::function<void(int,const std::string&,bufferlist&)> on_finish)
12619 {
12620 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
12621 list<hobject_t> oids;
12622
12623 dout(30) << __func__ << ": log before:\n";
12624 recovery_state.get_pg_log().get_log().print(*_dout);
12625 *_dout << dendl;
12626
12627 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
12628
12629 utime_t mtime = ceph_clock_now();
12630 map<hobject_t, pg_missing_item>::const_iterator m =
12631 recovery_state.get_missing_loc().get_needs_recovery().begin();
12632 map<hobject_t, pg_missing_item>::const_iterator mend =
12633 recovery_state.get_missing_loc().get_needs_recovery().end();
12634
12635 ObcLockManager manager;
12636 eversion_t v = get_next_version();
12637 v.epoch = get_osdmap_epoch();
12638 uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
12639 while (m != mend) {
12640 const hobject_t &oid(m->first);
12641 if (!recovery_state.get_missing_loc().is_unfound(oid)) {
12642 // We only care about unfound objects
12643 ++m;
12644 continue;
12645 }
12646
12647 ObjectContextRef obc;
12648 eversion_t prev;
12649
12650 switch (what) {
12651 case pg_log_entry_t::LOST_MARK:
12652 ceph_abort_msg("actually, not implemented yet!");
12653 break;
12654
12655 case pg_log_entry_t::LOST_REVERT:
12656 prev = pick_newest_available(oid);
12657 if (prev > eversion_t()) {
12658 // log it
12659 pg_log_entry_t e(
12660 pg_log_entry_t::LOST_REVERT, oid, v,
12661 m->second.need, 0, osd_reqid_t(), mtime, 0);
12662 e.reverting_to = prev;
12663 e.mark_unrollbackable();
12664 log_entries.push_back(e);
12665 dout(10) << e << dendl;
12666
12667 // we are now missing the new version; recovery code will sort it out.
12668 ++v.version;
12669 ++m;
12670 break;
12671 }
12672
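    // Deliberate fall-through: with no older version available to revert
    // to, the object is handled below as a LOST_DELETE.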
12673 case pg_log_entry_t::LOST_DELETE:
12674 {
12675 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
12676 0, osd_reqid_t(), mtime, 0);
12677 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
12678 if (pool.info.require_rollback()) {
12679 e.mod_desc.try_rmobject(v.version);
12680 } else {
12681 e.mark_unrollbackable();
12682 }
12683 } // otherwise, just do what we used to do
12684 dout(10) << e << dendl;
12685 log_entries.push_back(e);
12686 oids.push_back(oid);
12687
12688 // If a context is found, mark the object as deleted in case
12689 // we are racing with a new creation. This can happen if the
12690 // object was lost and the primary hit EIO.
12691 obc = object_contexts.lookup(oid);
12692 if (obc)
12693 obc->obs.exists = false;
12694
12695 ++v.version;
12696 ++m;
12697 }
12698 break;
12699
12700 default:
12701 ceph_abort();
12702 }
12703 }
12704
12705 recovery_state.update_stats(
12706 [](auto &history, auto &stats) {
12707 stats.stats_invalid = true;
12708 return false;
12709 });
12710
12711 submit_log_entries(
12712 log_entries,
12713 std::move(manager),
12714 std::optional<std::function<void(void)> >(
12715 [this, oids, num_unfound, on_finish]() {
12716 if (recovery_state.perform_deletes_during_peering()) {
12717 for (auto oid : oids) {
12718 // clear old locations - merge_new_log_entries will have
12719 // handled rebuilding missing_loc for each of these
12720 // objects if we have the RECOVERY_DELETES flag
12721 recovery_state.object_recovered(oid, object_stat_sum_t());
12722 }
12723 }
12724
12725 if (is_recovery_unfound()) {
12726 queue_peering_event(
12727 PGPeeringEventRef(
12728 std::make_shared<PGPeeringEvent>(
12729 get_osdmap_epoch(),
12730 get_osdmap_epoch(),
12731 PeeringState::DoRecovery())));
12732 } else if (is_backfill_unfound()) {
12733 queue_peering_event(
12734 PGPeeringEventRef(
12735 std::make_shared<PGPeeringEvent>(
12736 get_osdmap_epoch(),
12737 get_osdmap_epoch(),
12738 PeeringState::RequestBackfill())));
12739 } else {
12740 queue_recovery();
12741 }
12742
12743 stringstream ss;
12744 ss << "pg has " << num_unfound
12745 << " objects unfound and apparently lost; marking";
12746 string rs = ss.str();
12747 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
12748 osd->clog->info() << rs;
12749 bufferlist empty;
12750 on_finish(0, rs, empty);
12751 }),
12752 OpRequestRef());
12753 }
12754
12755 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
12756 {
12757 ceph_assert(repop_queue.empty());
12758 }
12759
12760 /*
12761 * pg status change notification
12762 */
12763
12764 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
12765 {
12766 list<OpRequestRef> rq;
12767
12768 // apply all repops
12769 while (!repop_queue.empty()) {
12770 RepGather *repop = repop_queue.front();
12771 repop_queue.pop_front();
12772 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
12773 repop->rep_aborted = true;
12774 repop->on_committed.clear();
12775 repop->on_success.clear();
12776
12777 if (requeue) {
12778 if (repop->op) {
12779 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
12780 rq.push_back(repop->op);
12781 repop->op = OpRequestRef();
12782 }
12783
12784 // also requeue any dups, interleaved into position
12785 auto p = waiting_for_ondisk.find(repop->v);
12786 if (p != waiting_for_ondisk.end()) {
12787 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
12788 for (auto& i : p->second) {
12789 rq.push_back(std::get<0>(i));
12790 }
12791 waiting_for_ondisk.erase(p);
12792 }
12793 }
12794
12795 remove_repop(repop);
12796 }
12797
12798 ceph_assert(repop_queue.empty());
12799
12800 if (requeue) {
12801 requeue_ops(rq);
12802 if (!waiting_for_ondisk.empty()) {
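      // Any waiter still present was never attached to a queued repop,
      // which indicates a bookkeeping bug; dump each one before the
      // assert below fires.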
12803 for (auto& i : waiting_for_ondisk) {
12804 for (auto& j : i.second) {
12805 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12806 << " waiting on " << i.first << dendl;
12807 }
12808 }
12809 ceph_assert(waiting_for_ondisk.empty());
12810 }
12811 }
12812
12813 waiting_for_ondisk.clear();
12814 }
12815
12816 void PrimaryLogPG::on_flushed()
12817 {
12818 requeue_ops(waiting_for_flush);
12819 if (!is_peered() || !is_primary()) {
12820 pair<hobject_t, ObjectContextRef> i;
12821 while (object_contexts.get_next(i.first, &i)) {
12822 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
12823 }
12824 ceph_assert(object_contexts.empty());
12825 }
12826 }
12827
12828 void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
12829 {
12830 dout(10) << __func__ << dendl;
12831
12832 on_shutdown();
12833
12834 t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12835 }
12836
12837 void PrimaryLogPG::clear_async_reads()
12838 {
12839 dout(10) << __func__ << dendl;
12840 for(auto& i : in_progress_async_reads) {
12841 dout(10) << "clear ctx: "
12842 << "OpRequestRef " << i.first
12843 << " OpContext " << i.second
12844 << dendl;
12845 close_op_ctx(i.second);
12846 }
12847 }
12848
12849 void PrimaryLogPG::clear_cache()
12850 {
12851 object_contexts.clear();
12852 }
12853
12854 void PrimaryLogPG::on_shutdown()
12855 {
12856 dout(10) << __func__ << dendl;
12857
12858 if (recovery_queued) {
12859 recovery_queued = false;
12860 osd->clear_queued_recovery(this);
12861 }
12862
12863 m_scrubber->scrub_clear_state();
12864 m_scrubber->rm_from_osd_scrubbing();
12865
12866 vector<ceph_tid_t> tids;
12867 cancel_copy_ops(false, &tids);
12868 cancel_flush_ops(false, &tids);
12869 cancel_proxy_ops(false, &tids);
12870 cancel_manifest_ops(false, &tids);
12871 cancel_cls_gather_ops(false, &tids);
12872 osd->objecter->op_cancel(tids, -ECANCELED);
12873
12874 apply_and_flush_repops(false);
12875 cancel_log_updates();
12876 // we must remove PGRefs, so do this prior to the release_backoffs() calls
12877 clear_backoffs();
12878 // clean up snap trim references
12879 snap_trimmer_machine.process_event(Reset());
12880
12881 pgbackend->on_change();
12882
12883 context_registry_on_change();
12884 object_contexts.clear();
12885
12886 clear_async_reads();
12887
12888 osd->remote_reserver.cancel_reservation(info.pgid);
12889 osd->local_reserver.cancel_reservation(info.pgid);
12890
12891 clear_primary_state();
12892 cancel_recovery();
12893
12894 if (is_primary()) {
12895 osd->clear_ready_to_merge(this);
12896 }
12897 }
12898
12899 void PrimaryLogPG::on_activate_complete()
12900 {
12901 check_local();
12902 // waiters
12903 if (!recovery_state.needs_flush()) {
12904 requeue_ops(waiting_for_peered);
12905 } else if (!waiting_for_peered.empty()) {
12906 dout(10) << __func__ << " flushes in progress, moving "
12907 << waiting_for_peered.size()
12908 << " items to waiting_for_flush"
12909 << dendl;
12910 ceph_assert(waiting_for_flush.empty());
12911 waiting_for_flush.swap(waiting_for_peered);
12912 }
12913
12914
12915 // all clean?
12916 if (needs_recovery()) {
12917 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12918 queue_peering_event(
12919 PGPeeringEventRef(
12920 std::make_shared<PGPeeringEvent>(
12921 get_osdmap_epoch(),
12922 get_osdmap_epoch(),
12923 PeeringState::DoRecovery())));
12924 } else if (needs_backfill()) {
12925 dout(10) << "activate queueing backfill" << dendl;
12926 queue_peering_event(
12927 PGPeeringEventRef(
12928 std::make_shared<PGPeeringEvent>(
12929 get_osdmap_epoch(),
12930 get_osdmap_epoch(),
12931 PeeringState::RequestBackfill())));
12932 } else {
12933 dout(10) << "activate all replicas clean, no recovery" << dendl;
12934 queue_peering_event(
12935 PGPeeringEventRef(
12936 std::make_shared<PGPeeringEvent>(
12937 get_osdmap_epoch(),
12938 get_osdmap_epoch(),
12939 PeeringState::AllReplicasRecovered())));
12940 }
12941
12942 publish_stats_to_osd();
12943
12944 if (get_backfill_targets().size()) {
12945 last_backfill_started = recovery_state.earliest_backfill();
12946 new_backfill = true;
12947 ceph_assert(!last_backfill_started.is_max());
12948 dout(5) << __func__ << ": bft=" << get_backfill_targets()
12949 << " from " << last_backfill_started << dendl;
12950 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12951 i != get_backfill_targets().end();
12952 ++i) {
12953 dout(5) << "target shard " << *i
12954 << " from " << recovery_state.get_peer_info(*i).last_backfill
12955 << dendl;
12956 }
12957 }
12958
12959 hit_set_setup();
12960 agent_setup();
12961 }
12962
12963 void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
12964 {
12965 dout(10) << __func__ << dendl;
12966
12967 if (hit_set && hit_set->insert_count() == 0) {
12968 dout(20) << " discarding empty hit_set" << dendl;
12969 hit_set_clear();
12970 }
12971
12972 if (recovery_queued) {
12973 recovery_queued = false;
12974 osd->clear_queued_recovery(this);
12975 }
12976
12977 // requeue everything in the reverse order they should be
12978 // reexamined.
12979 requeue_ops(waiting_for_peered);
12980 requeue_ops(waiting_for_flush);
12981 requeue_ops(waiting_for_active);
12982 requeue_ops(waiting_for_readable);
12983
12984 vector<ceph_tid_t> tids;
12985 cancel_copy_ops(is_primary(), &tids);
12986 cancel_flush_ops(is_primary(), &tids);
12987 cancel_proxy_ops(is_primary(), &tids);
12988 cancel_manifest_ops(is_primary(), &tids);
12989 cancel_cls_gather_ops(is_primary(), &tids);
12990 osd->objecter->op_cancel(tids, -ECANCELED);
12991
12992 // requeue object waiters
12993 for (auto& p : waiting_for_unreadable_object) {
12994 release_backoffs(p.first);
12995 }
12996 if (is_primary()) {
12997 requeue_object_waiters(waiting_for_unreadable_object);
12998 } else {
12999 waiting_for_unreadable_object.clear();
13000 }
13001 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
13002 p != waiting_for_degraded_object.end();
13003 waiting_for_degraded_object.erase(p++)) {
13004 release_backoffs(p->first);
13005 if (is_primary())
13006 requeue_ops(p->second);
13007 else
13008 p->second.clear();
13009 finish_degraded_object(p->first);
13010 }
13011
13012 // requeues waiting_for_scrub
13013 m_scrubber->scrub_clear_state();
13014
13015 for (auto p = waiting_for_blocked_object.begin();
13016 p != waiting_for_blocked_object.end();
13017 waiting_for_blocked_object.erase(p++)) {
13018 if (is_primary())
13019 requeue_ops(p->second);
13020 else
13021 p->second.clear();
13022 }
13023 for (auto i = callbacks_for_degraded_object.begin();
13024 i != callbacks_for_degraded_object.end();
13025 ) {
13026 finish_degraded_object((i++)->first);
13027 }
13028 ceph_assert(callbacks_for_degraded_object.empty());
13029
13030 if (is_primary()) {
13031 requeue_ops(waiting_for_cache_not_full);
13032 } else {
13033 waiting_for_cache_not_full.clear();
13034 }
13035 objects_blocked_on_cache_full.clear();
13036
13037 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
13038 in_progress_async_reads.begin();
13039 i != in_progress_async_reads.end();
13040 in_progress_async_reads.erase(i++)) {
13041 close_op_ctx(i->second);
13042 if (is_primary())
13043 requeue_op(i->first);
13044 }
13045
13046 // this will requeue ops we were working on but didn't finish, and
13047 // any dups
13048 apply_and_flush_repops(is_primary());
13049 cancel_log_updates();
13050
13051 // do this *after* apply_and_flush_repops so that we catch any newly
13052 // registered watches.
13053 context_registry_on_change();
13054
13055 pgbackend->on_change_cleanup(&t);
13056 m_scrubber->cleanup_store(&t);
13057 pgbackend->on_change();
13058
13059 // clear snap_trimmer state
13060 snap_trimmer_machine.process_event(Reset());
13061
13062 debug_op_order.clear();
13063 unstable_stats.clear();
13064
13065 // we don't want to cache object_contexts through the interval change
13066 // NOTE: we actually assert that all currently live references are dead
13067 // by the time the flush for the next interval completes.
13068 object_contexts.clear();
13069
13070 // should have been cleared above by finishing all of the degraded objects
13071 ceph_assert(objects_blocked_on_degraded_snap.empty());
13072 }
13073
13074 void PrimaryLogPG::plpg_on_role_change()
13075 {
13076 dout(10) << __func__ << dendl;
13077 if (get_role() != 0 && hit_set) {
13078 dout(10) << " clearing hit set" << dendl;
13079 hit_set_clear();
13080 }
13081 }
13082
13083 void PrimaryLogPG::plpg_on_pool_change()
13084 {
13085 dout(10) << __func__ << dendl;
13086 // requeue cache full waiters just in case the cache_mode is
13087 // changing away from writeback mode. note that if we are not
13088 // active the normal requeuing machinery is sufficient (and properly
13089 // ordered).
13090 if (is_active() &&
13091 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
13092 !waiting_for_cache_not_full.empty()) {
13093 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
13094 << dendl;
13095 requeue_ops(waiting_for_cache_not_full);
13096 objects_blocked_on_cache_full.clear();
13097 }
13098 hit_set_setup();
13099 agent_setup();
13100 }
13101
13102 // clear state. called on recovery completion AND cancellation.
13103 void PrimaryLogPG::_clear_recovery_state()
13104 {
13105 #ifdef DEBUG_RECOVERY_OIDS
13106 recovering_oids.clear();
13107 #endif
13108 dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
13109
13110 last_backfill_started = hobject_t();
13111 set<hobject_t>::iterator i = backfills_in_flight.begin();
13112 while (i != backfills_in_flight.end()) {
13113 backfills_in_flight.erase(i++);
13114 }
13115
13116 list<OpRequestRef> blocked_ops;
13117 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
13118 i != recovering.end();
13119 recovering.erase(i++)) {
13120 if (i->second) {
13121 i->second->drop_recovery_read(&blocked_ops);
13122 requeue_ops(blocked_ops);
13123 }
13124 }
13125 ceph_assert(backfills_in_flight.empty());
13126 pending_backfill_updates.clear();
13127 ceph_assert(recovering.empty());
13128 pgbackend->clear_recovery_state();
13129 }
13130
13131 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
13132 {
13133 dout(20) << __func__ << ": " << soid << dendl;
13134 ceph_assert(recovering.count(soid));
13135 ObjectContextRef obc = recovering[soid];
13136 if (obc) {
13137 list<OpRequestRef> blocked_ops;
13138 obc->drop_recovery_read(&blocked_ops);
13139 requeue_ops(blocked_ops);
13140 }
13141 recovering.erase(soid);
13142 finish_recovery_op(soid);
13143 release_backoffs(soid);
13144 if (waiting_for_degraded_object.count(soid)) {
13145 dout(20) << " kicking degraded waiters on " << soid << dendl;
13146 requeue_ops(waiting_for_degraded_object[soid]);
13147 waiting_for_degraded_object.erase(soid);
13148 }
13149 if (waiting_for_unreadable_object.count(soid)) {
13150 dout(20) << " kicking unreadable waiters on " << soid << dendl;
13151 requeue_ops(waiting_for_unreadable_object[soid]);
13152 waiting_for_unreadable_object.erase(soid);
13153 }
13154 if (is_missing_object(soid))
13155 recovery_state.set_last_requested(0);
13156 finish_degraded_object(soid);
13157 }
13158
13159 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
13160 {
13161 pgbackend->check_recovery_sources(osdmap);
13162 }
13163
13164 bool PrimaryLogPG::start_recovery_ops(
13165 uint64_t max,
13166 ThreadPool::TPHandle &handle,
13167 uint64_t *ops_started)
13168 {
13169 uint64_t& started = *ops_started;
13170 started = 0;
13171 bool work_in_progress = false;
13172 bool recovery_started = false;
13173 ceph_assert(is_primary());
13174 ceph_assert(is_peered());
13175 ceph_assert(!recovery_state.is_deleting());
13176
13177 ceph_assert(recovery_queued);
13178 recovery_queued = false;
13179
13180 if (!state_test(PG_STATE_RECOVERING) &&
13181 !state_test(PG_STATE_BACKFILLING)) {
13182 /* TODO: I think this case is broken and will make do_recovery()
13183 * unhappy since we're returning false */
13184 dout(10) << "recovery raced and we were queued twice, ignoring!" << dendl;
13185 return have_unfound();
13186 }
13187
13188 const auto &missing = recovery_state.get_pg_log().get_missing();
13189
13190 uint64_t num_unfound = get_num_unfound();
13191
13192 if (!recovery_state.have_missing()) {
13193 recovery_state.local_recovery_complete();
13194 }
13195
13196 if (!missing.have_missing() || // Primary does not have missing
13197 // or all of the missing objects are unfound.
13198 recovery_state.all_missing_unfound()) {
13199 // Recover the replicas.
13200 started = recover_replicas(max, handle, &recovery_started);
13201 }
13202 if (!started) {
13203 // We still have missing objects that we should grab from replicas.
13204 started += recover_primary(max, handle);
13205 }
13206 if (!started && num_unfound != get_num_unfound()) {
13207 // second chance to recovery replicas
13208 started = recover_replicas(max, handle, &recovery_started);
13209 }
13210
13211 if (started || recovery_started)
13212 work_in_progress = true;
13213
13214 bool deferred_backfill = false;
13215 if (recovering.empty() &&
13216 state_test(PG_STATE_BACKFILLING) &&
13217 !get_backfill_targets().empty() && started < max &&
13218 missing.num_missing() == 0 &&
13219 waiting_on_backfill.empty()) {
13220 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
13221 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
13222 deferred_backfill = true;
13223 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
13224 !is_degraded()) {
13225 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
13226 deferred_backfill = true;
13227 } else if (!recovery_state.is_backfill_reserved()) {
13228 /* DNMNOTE I think this branch is dead */
13229 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
13230 if (!backfill_reserving) {
13231 dout(10) << "queueing RequestBackfill" << dendl;
13232 backfill_reserving = true;
13233 queue_peering_event(
13234 PGPeeringEventRef(
13235 std::make_shared<PGPeeringEvent>(
13236 get_osdmap_epoch(),
13237 get_osdmap_epoch(),
13238 PeeringState::RequestBackfill())));
13239 }
13240 deferred_backfill = true;
13241 } else {
13242 started += recover_backfill(max - started, handle, &work_in_progress);
13243 }
13244 }
13245
13246 dout(10) << " started " << started << dendl;
13247 osd->logger->inc(l_osd_rop, started);
13248
13249 if (!recovering.empty() ||
13250 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
13251 return !work_in_progress && have_unfound();
13252
13253 ceph_assert(recovering.empty());
13254 ceph_assert(recovery_ops_active == 0);
13255
13256 dout(10) << __func__ << " needs_recovery: "
13257 << recovery_state.get_missing_loc().get_needs_recovery()
13258 << dendl;
13259 dout(10) << __func__ << " missing_loc: "
13260 << recovery_state.get_missing_loc().get_missing_locs()
13261 << dendl;
13262 int unfound = get_num_unfound();
13263 if (unfound) {
13264 dout(10) << " still have " << unfound << " unfound" << dendl;
13265 return true;
13266 }
13267
13268 if (missing.num_missing() > 0) {
13269 // this shouldn't happen!
13270 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
13271 << missing.num_missing() << ": " << missing.get_items();
13272 return false;
13273 }
13274
13275 if (needs_recovery()) {
13276 // this shouldn't happen!
13277 // We already checked num_missing() so we must have missing replicas
13278 osd->clog->error() << info.pgid
13279 << " Unexpected Error: recovery ending with missing replicas";
13280 return false;
13281 }
13282
13283 if (state_test(PG_STATE_RECOVERING)) {
13284 state_clear(PG_STATE_RECOVERING);
13285 state_clear(PG_STATE_FORCED_RECOVERY);
13286 if (needs_backfill()) {
13287 dout(10) << "recovery done, queuing backfill" << dendl;
13288 queue_peering_event(
13289 PGPeeringEventRef(
13290 std::make_shared<PGPeeringEvent>(
13291 get_osdmap_epoch(),
13292 get_osdmap_epoch(),
13293 PeeringState::RequestBackfill())));
13294 } else {
13295 dout(10) << "recovery done, no backfill" << dendl;
13296 state_clear(PG_STATE_FORCED_BACKFILL);
13297 queue_peering_event(
13298 PGPeeringEventRef(
13299 std::make_shared<PGPeeringEvent>(
13300 get_osdmap_epoch(),
13301 get_osdmap_epoch(),
13302 PeeringState::AllReplicasRecovered())));
13303 }
13304 } else { // backfilling
13305 state_clear(PG_STATE_BACKFILLING);
13306 state_clear(PG_STATE_FORCED_BACKFILL);
13307 state_clear(PG_STATE_FORCED_RECOVERY);
13308 dout(10) << "recovery done, backfill done" << dendl;
13309 queue_peering_event(
13310 PGPeeringEventRef(
13311 std::make_shared<PGPeeringEvent>(
13312 get_osdmap_epoch(),
13313 get_osdmap_epoch(),
13314 PeeringState::Backfilled())));
13315 }
13316
13317 return false;
13318 }
13319
13320 /**
13321 * start up to max recovery ops on the primary's own missing objects.
13322 * return the number of ops started.
13323 */
13324 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
13325 {
13326 ceph_assert(is_primary());
13327
13328 const auto &missing = recovery_state.get_pg_log().get_missing();
13329
13330 dout(10) << __func__ << " recovering " << recovering.size()
13331 << " in pg,"
13332 << " missing " << missing << dendl;
13333
13334 dout(25) << __func__ << " " << missing.get_items() << dendl;
13335
13336 // look at log!
13337 pg_log_entry_t *latest = 0;
13338 unsigned started = 0;
13339 int skipped = 0;
13340
13341 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13342 map<version_t, hobject_t>::const_iterator p =
13343 missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
13344 while (p != missing.get_rmissing().end()) {
13345 handle.reset_tp_timeout();
13346 hobject_t soid;
13347 version_t v = p->first;
13348
13349 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
13350 if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
13351 latest = it_objects->second;
13352 ceph_assert(latest->is_update() || latest->is_delete());
13353 soid = latest->soid;
13354 } else {
13355 latest = 0;
13356 soid = p->second;
13357 }
13358 const pg_missing_item& item = missing.get_items().find(p->second)->second;
13359 ++p;
13360
13361 hobject_t head = soid.get_head();
13362
13363 eversion_t need = item.need;
13364
13365 dout(10) << __func__ << " "
13366 << soid << " " << item.need
13367 << (missing.is_missing(soid) ? " (missing)":"")
13368 << (missing.is_missing(head) ? " (missing head)":"")
13369 << (recovering.count(soid) ? " (recovering)":"")
13370 << (recovering.count(head) ? " (recovering head)":"")
13371 << dendl;
13372
13373 if (latest) {
13374 switch (latest->op) {
13375 case pg_log_entry_t::CLONE:
13376 /*
13377 * Handling for this special case removed for now, until we
13378 * can correctly construct an accurate SnapSet from the old
13379 * one.
13380 */
13381 break;
13382
13383 case pg_log_entry_t::LOST_REVERT:
13384 {
13385 if (item.have == latest->reverting_to) {
13386 ObjectContextRef obc = get_object_context(soid, true);
13387
13388 if (obc->obs.oi.version == latest->version) {
13389 // I'm already reverting
13390 dout(10) << " already reverting " << soid << dendl;
13391 } else {
13392 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
13393 obc->obs.oi.version = latest->version;
13394
13395 ObjectStore::Transaction t;
13396 bufferlist b2;
13397 obc->obs.oi.encode(
13398 b2,
13399 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13400 ceph_assert(!pool.info.require_rollback());
13401 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
13402
13403 recovery_state.recover_got(
13404 soid,
13405 latest->version,
13406 false,
13407 t);
13408
13409 ++active_pushes;
13410
13411 t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
13412 t.register_on_commit(new C_OSD_CommittedPushedObject(
13413 this,
13414 get_osdmap_epoch(),
13415 info.last_complete));
13416 osd->store->queue_transaction(ch, std::move(t));
13417 continue;
13418 }
13419 } else {
13420 /*
13421 * Pull the old version of the object. Update missing_loc here to have the location
13422 * of the version we want.
13423 *
13424 * This doesn't use the usual missing_loc paths, but that's okay:
13425 * - if we have it locally, we hit the case above, and go from there.
13426 * - if we don't, we always pass through this case during recovery and set up the location
13427 * properly.
13428 * - this way we don't need to mangle the missing code to be general about needing an old
13429 * version...
13430 */
13431 eversion_t alternate_need = latest->reverting_to;
13432 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
13433
13434 set<pg_shard_t> good_peers;
13435 for (auto p = recovery_state.get_peer_missing().begin();
13436 p != recovery_state.get_peer_missing().end();
13437 ++p) {
13438 if (p->second.is_missing(soid, need) &&
13439 p->second.get_items().at(soid).have == alternate_need) {
13440 good_peers.insert(p->first);
13441 }
13442 }
13443 recovery_state.set_revert_with_targets(
13444 soid,
13445 good_peers);
13446 dout(10) << " will pull " << alternate_need << " or " << need
13447 << " from one of "
13448 << recovery_state.get_missing_loc().get_locations(soid)
13449 << dendl;
13450 }
13451 }
13452 break;
13453 }
13454 }
13455
13456 if (!recovering.count(soid)) {
13457 if (recovering.count(head)) {
13458 ++skipped;
13459 } else {
13460 int r = recover_missing(
13461 soid, need, get_recovery_op_priority(), h);
13462 switch (r) {
13463 case PULL_YES:
13464 ++started;
13465 break;
13466 case PULL_HEAD:
13467 ++started;
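    // Deliberate fall-through: the head is being pulled first, so the
    // clone itself is also counted as skipped and retried once the head exists.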
13468 case PULL_NONE:
13469 ++skipped;
13470 break;
13471 default:
13472 ceph_abort();
13473 }
13474 if (started >= max)
13475 break;
13476 }
13477 }
13478
13479 // only advance last_requested if we haven't skipped anything
13480 if (!skipped)
13481 recovery_state.set_last_requested(v);
13482 }
13483
13484 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13485 return started;
13486 }
13487
13488 bool PrimaryLogPG::primary_error(
13489 const hobject_t& soid, eversion_t v)
13490 {
13491 recovery_state.force_object_missing(pg_whoami, soid, v);
13492 bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
13493 if (uhoh)
13494 osd->clog->error() << info.pgid << " missing primary copy of "
13495 << soid << ", unfound";
13496 else
13497 osd->clog->error() << info.pgid << " missing primary copy of "
13498 << soid
13499 << ", will try copies on "
13500 << recovery_state.get_missing_loc().get_locations(soid);
13501 return uhoh;
13502 }
13503
13504 int PrimaryLogPG::prep_object_replica_deletes(
13505 const hobject_t& soid, eversion_t v,
13506 PGBackend::RecoveryHandle *h,
13507 bool *work_started)
13508 {
13509 ceph_assert(is_primary());
13510 dout(10) << __func__ << ": on " << soid << dendl;
13511
13512 ObjectContextRef obc = get_object_context(soid, false);
13513 if (obc) {
13514 if (!obc->get_recovery_read()) {
13515 dout(20) << "replica delete delayed on " << soid
13516 << "; could not get rw_manager lock" << dendl;
13517 *work_started = true;
13518 return 0;
13519 } else {
13520 dout(20) << "replica delete got recovery read lock on " << soid
13521 << dendl;
13522 }
13523 }
13524
13525 start_recovery_op(soid);
13526 ceph_assert(!recovering.count(soid));
13527 if (!obc)
13528 recovering.insert(make_pair(soid, ObjectContextRef()));
13529 else
13530 recovering.insert(make_pair(soid, obc));
13531
13532 pgbackend->recover_delete_object(soid, v, h);
13533 return 1;
13534 }
13535
13536 int PrimaryLogPG::prep_object_replica_pushes(
13537 const hobject_t& soid, eversion_t v,
13538 PGBackend::RecoveryHandle *h,
13539 bool *work_started)
13540 {
13541 ceph_assert(is_primary());
13542 dout(10) << __func__ << ": on " << soid << dendl;
13543
13544 if (soid.snap && soid.snap < CEPH_NOSNAP) {
13545 // do we have the head and/or snapdir?
13546 hobject_t head = soid.get_head();
13547 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
13548 if (recovering.count(head)) {
13549 dout(10) << " missing but already recovering head " << head << dendl;
13550 return 0;
13551 } else {
13552 int r = recover_missing(
13553 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
13554 get_recovery_op_priority(), h);
13555 if (r != PULL_NONE)
13556 return 1;
13557 return 0;
13558 }
13559 }
13560 }
13561
13562 // NOTE: we know we will get a valid oloc off of disk here.
13563 ObjectContextRef obc = get_object_context(soid, false);
13564 if (!obc) {
13565 primary_error(soid, v);
13566 return 0;
13567 }
13568
13569 if (!obc->get_recovery_read()) {
13570 dout(20) << "recovery delayed on " << soid
13571 << "; could not get rw_manager lock" << dendl;
13572 *work_started = true;
13573 return 0;
13574 } else {
13575 dout(20) << "recovery got recovery read lock on " << soid
13576 << dendl;
13577 }
13578
13579 start_recovery_op(soid);
13580 ceph_assert(!recovering.count(soid));
13581 recovering.insert(make_pair(soid, obc));
13582
13583 int r = pgbackend->recover_object(
13584 soid,
13585 v,
13586 ObjectContextRef(),
13587 obc, // has snapset context
13588 h);
13589 if (r < 0) {
13590 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
13591 on_failed_pull({ pg_whoami }, soid, v);
13592 return 0;
13593 }
13594 return 1;
13595 }
13596
13597 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
13598 bool *work_started)
13599 {
13600 dout(10) << __func__ << "(" << max << ")" << dendl;
13601 uint64_t started = 0;
13602
13603 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13604
13605 // this is FAR from an optimal recovery order. pretty lame, really.
13606 ceph_assert(!get_acting_recovery_backfill().empty());
13607 // choose replicas to recover, replica has the shortest missing list first
13608 // so we can bring it back to normal ASAP
13609 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
13610 async_by_num_missing;
13611 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
13612 for (auto &p: get_acting_recovery_backfill()) {
13613 if (p == get_primary()) {
13614 continue;
13615 }
13616 auto pm = recovery_state.get_peer_missing().find(p);
13617 ceph_assert(pm != recovery_state.get_peer_missing().end());
13618 auto nm = pm->second.num_missing();
13619 if (nm != 0) {
13620 if (is_async_recovery_target(p)) {
13621 async_by_num_missing.push_back(make_pair(nm, p));
13622 } else {
13623 replicas_by_num_missing.push_back(make_pair(nm, p));
13624 }
13625 }
13626 }
13627 // sort by number of missing objects, in ascending order.
13628 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
13629 const std::pair<unsigned int, pg_shard_t> &rhs) {
13630 return lhs.first < rhs.first;
13631 };
13632 // acting goes first
13633 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
13634 // then async_recovery_targets
13635 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
13636 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
13637 async_by_num_missing.begin(), async_by_num_missing.end());
13638 for (auto &replica: replicas_by_num_missing) {
13639 pg_shard_t &peer = replica.second;
13640 ceph_assert(peer != get_primary());
13641 auto pm = recovery_state.get_peer_missing().find(peer);
13642 ceph_assert(pm != recovery_state.get_peer_missing().end());
13643 size_t m_sz = pm->second.num_missing();
13644
13645 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
13646 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
13647
13648 // oldest first!
13649 const pg_missing_t &m(pm->second);
13650 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
13651 p != m.get_rmissing().end() && started < max;
13652 ++p) {
13653 handle.reset_tp_timeout();
13654 const hobject_t soid(p->second);
13655
13656 if (recovery_state.get_missing_loc().is_unfound(soid)) {
13657 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
13658 continue;
13659 }
13660
13661 const pg_info_t &pi = recovery_state.get_peer_info(peer);
13662 if (soid > pi.last_backfill) {
13663 if (!recovering.count(soid)) {
13664 derr << __func__ << ": object " << soid << " last_backfill "
13665 << pi.last_backfill << dendl;
13666 derr << __func__ << ": object added to missing set for backfill, but "
13667 << "is not in recovering, error!" << dendl;
13668 ceph_abort();
13669 }
13670 continue;
13671 }
13672
13673 if (recovering.count(soid)) {
13674 dout(10) << __func__ << ": already recovering " << soid << dendl;
13675 continue;
13676 }
13677
13678 if (recovery_state.get_missing_loc().is_deleted(soid)) {
13679 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
13680 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
13681 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
13682 continue;
13683 }
13684
13685 if (soid.is_snap() &&
13686 recovery_state.get_pg_log().get_missing().is_missing(
13687 soid.get_head())) {
13688 dout(10) << __func__ << ": " << soid.get_head()
13689 << " still missing on primary" << dendl;
13690 continue;
13691 }
13692
13693 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
13694 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
13695 continue;
13696 }
13697
13698 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
13699 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
13700 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
13701 }
13702 }
13703
13704 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13705 return started;
13706 }
13707
13708 hobject_t PrimaryLogPG::earliest_peer_backfill() const
13709 {
13710 hobject_t e = hobject_t::get_max();
13711 for (const pg_shard_t& peer : get_backfill_targets()) {
13712 const auto iter = peer_backfill_info.find(peer);
13713 ceph_assert(iter != peer_backfill_info.end());
13714 e = std::min(e, iter->second.begin);
13715 }
13716 return e;
13717 }
13718
13719 bool PrimaryLogPG::all_peer_done() const
13720 {
13721 // Primary hasn't got any more objects
13722 ceph_assert(backfill_info.empty());
13723
13724 for (const pg_shard_t& bt : get_backfill_targets()) {
13725 const auto piter = peer_backfill_info.find(bt);
13726 ceph_assert(piter != peer_backfill_info.end());
13727 const BackfillInterval& pbi = piter->second;
13728 // See if peer has more to process
13729 if (!pbi.extends_to_end() || !pbi.empty())
13730 return false;
13731 }
13732 return true;
13733 }
13734
13735 /**
13736 * recover_backfill
13737 *
13738 * Invariants:
13739 *
13740 * backfilled: fully pushed to replica or present in replica's missing set (both
13741 * our copy and theirs).
13742 *
13743 * All objects on a backfill_target in
13744 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13745 * objects have been actually deleted and all logically-valid objects are replicated.
13746 * There may be PG objects in this interval yet to be backfilled.
13747 *
13748 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13749 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13750 *
13751 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13752 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13753 * interval remain on the backfill target.
13754 *
13755 * For a backfill target, all objects <= peer_info[target].last_backfill
13756 * have been backfilled to target
13757 *
13758 * There *MAY* be missing/outdated objects between last_backfill_started and
13759 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13760 * io created objects since the last scan. For this reason, we call
13761 * update_range() again before continuing backfill.
13762 */
13763 uint64_t PrimaryLogPG::recover_backfill(
13764 uint64_t max,
13765 ThreadPool::TPHandle &handle, bool *work_started)
13766 {
13767 dout(10) << __func__ << " (" << max << ")"
13768 << " bft=" << get_backfill_targets()
13769 << " last_backfill_started " << last_backfill_started
13770 << (new_backfill ? " new_backfill":"")
13771 << dendl;
13772 ceph_assert(!get_backfill_targets().empty());
13773
13774 // Initialize from prior backfill state
13775 if (new_backfill) {
13776 // on_activate() was called prior to getting here
13777 ceph_assert(last_backfill_started == recovery_state.earliest_backfill());
13778 new_backfill = false;
13779
13780 // initialize BackfillIntervals
13781 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13782 i != get_backfill_targets().end();
13783 ++i) {
13784 peer_backfill_info[*i].reset(
13785 recovery_state.get_peer_info(*i).last_backfill);
13786 }
13787 backfill_info.reset(last_backfill_started);
13788
13789 backfills_in_flight.clear();
13790 pending_backfill_updates.clear();
13791 }
13792
13793 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13794 i != get_backfill_targets().end();
13795 ++i) {
13796 dout(10) << "peer osd." << *i
13797 << " info " << recovery_state.get_peer_info(*i)
13798 << " interval " << peer_backfill_info[*i].begin
13799 << "-" << peer_backfill_info[*i].end
13800 << " " << peer_backfill_info[*i].objects.size() << " objects"
13801 << dendl;
13802 }
13803
13804 // update our local interval to cope with recent changes
13805 backfill_info.begin = last_backfill_started;
13806 update_range(&backfill_info, handle);
13807
13808 unsigned ops = 0;
13809 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13810 set<hobject_t> add_to_stat;
13811
13812 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13813 i != get_backfill_targets().end();
13814 ++i) {
13815 peer_backfill_info[*i].trim_to(
13816 std::max(
13817 recovery_state.get_peer_info(*i).last_backfill,
13818 last_backfill_started));
13819 }
13820 backfill_info.trim_to(last_backfill_started);
13821
13822 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13823 while (ops < max) {
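    // If our local interval is exhausted (empty but not yet extending to
    // the end of the PG) while our position is not ahead of any peer's,
    // rescan the next window of the local collection.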
13824 if (backfill_info.begin <= earliest_peer_backfill() &&
13825 !backfill_info.extends_to_end() && backfill_info.empty()) {
13826 hobject_t next = backfill_info.end;
13827 backfill_info.reset(next);
13828 backfill_info.end = hobject_t::get_max();
13829 update_range(&backfill_info, handle);
13830 backfill_info.trim();
13831 }
13832
13833 dout(20) << " my backfill interval " << backfill_info << dendl;
13834
13835 bool sent_scan = false;
13836 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13837 i != get_backfill_targets().end();
13838 ++i) {
13839 pg_shard_t bt = *i;
13840 BackfillInterval& pbi = peer_backfill_info[bt];
13841
13842 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13843 if (pbi.begin <= backfill_info.begin &&
13844 !pbi.extends_to_end() && pbi.empty()) {
13845 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
13846 epoch_t e = get_osdmap_epoch();
13847 MOSDPGScan *m = new MOSDPGScan(
13848 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
13849 spg_t(info.pgid.pgid, bt.shard),
13850 pbi.end, hobject_t());
13851 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13852 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
13853 waiting_on_backfill.insert(bt);
13854 sent_scan = true;
13855 }
13856 }
13857
13858 // Count simultaneous scans as a single op and let those complete
13859 if (sent_scan) {
13860 ops++;
13861 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13862 break;
13863 }
13864
13865 if (backfill_info.empty() && all_peer_done()) {
13866 dout(10) << " reached end for both local and all peers" << dendl;
13867 break;
13868 }
13869
13870 // Get object within set of peers to operate on and
13871 // the set of targets for which that object applies.
13872 hobject_t check = earliest_peer_backfill();
13873
13874 if (check < backfill_info.begin) {
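      // The earliest peer object sorts before anything we still have
      // locally, so it has been deleted from the PG: queue a removal on
      // every target whose interval begins with it.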
13875
13876 set<pg_shard_t> check_targets;
13877 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13878 i != get_backfill_targets().end();
13879 ++i) {
13880 pg_shard_t bt = *i;
13881 BackfillInterval& pbi = peer_backfill_info[bt];
13882 if (pbi.begin == check)
13883 check_targets.insert(bt);
13884 }
13885 ceph_assert(!check_targets.empty());
13886
13887 dout(20) << " BACKFILL removing " << check
13888 << " from peers " << check_targets << dendl;
13889 for (set<pg_shard_t>::iterator i = check_targets.begin();
13890 i != check_targets.end();
13891 ++i) {
13892 pg_shard_t bt = *i;
13893 BackfillInterval& pbi = peer_backfill_info[bt];
13894 ceph_assert(pbi.begin == check);
13895
13896 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13897 pbi.pop_front();
13898 }
13899
13900 last_backfill_started = check;
13901
13902 // Don't increment ops here because deletions
13903 // are cheap and, unlike real recovery_ops, are not replied to,
13904 // and we can't increment ops without requeuing ourselves
13905 // for recovery.
13906 } else {
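      // Otherwise the local object at backfill_info.begin is next: sort
      // each target into keep (same version), replace (stale version),
      // push (missing entirely) or skip (its backfill line has not
      // reached this object yet).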
13907 eversion_t& obj_v = backfill_info.objects.begin()->second;
13908
13909 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13910 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13911 i != get_backfill_targets().end();
13912 ++i) {
13913 pg_shard_t bt = *i;
13914 BackfillInterval& pbi = peer_backfill_info[bt];
13915 // Find all check peers that have the wrong version
13916 if (check == backfill_info.begin && check == pbi.begin) {
13917 if (pbi.objects.begin()->second != obj_v) {
13918 need_ver_targs.push_back(bt);
13919 } else {
13920 keep_ver_targs.push_back(bt);
13921 }
13922 } else {
13923 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13924
13925 // Only include peers whose backfill line we have caught up to;
13926 // otherwise, they only appear to be missing this object
13927 // because their pbi.begin > backfill_info.begin.
13928 if (backfill_info.begin > pinfo.last_backfill)
13929 missing_targs.push_back(bt);
13930 else
13931 skip_targs.push_back(bt);
13932 }
13933 }
13934
13935 if (!keep_ver_targs.empty()) {
13936 // These peers have version obj_v
13937 dout(20) << " BACKFILL keeping " << check
13938 << " with ver " << obj_v
13939 << " on peers " << keep_ver_targs << dendl;
13940 //assert(!waiting_for_degraded_object.count(check));
13941 }
13942 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13943 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
13944 ceph_assert(obc);
13945 if (obc->get_recovery_read()) {
13946 if (!need_ver_targs.empty()) {
13947 dout(20) << " BACKFILL replacing " << check
13948 << " with ver " << obj_v
13949 << " to peers " << need_ver_targs << dendl;
13950 }
13951 if (!missing_targs.empty()) {
13952 dout(20) << " BACKFILL pushing " << backfill_info.begin
13953 << " with ver " << obj_v
13954 << " to peers " << missing_targs << dendl;
13955 }
13956 vector<pg_shard_t> all_push = need_ver_targs;
13957 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13958
13959 handle.reset_tp_timeout();
13960 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13961 if (r < 0) {
13962 *work_started = true;
13963 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13964 break;
13965 }
13966 ops++;
13967 } else {
13968 *work_started = true;
13969 dout(20) << "backfill blocking on " << backfill_info.begin
13970 << "; could not get rw_manager lock" << dendl;
13971 break;
13972 }
13973 }
13974 dout(20) << "need_ver_targs=" << need_ver_targs
13975 << " keep_ver_targs=" << keep_ver_targs << dendl;
13976 dout(20) << "backfill_targets=" << get_backfill_targets()
13977 << " missing_targs=" << missing_targs
13978 << " skip_targs=" << skip_targs << dendl;
13979
13980 last_backfill_started = backfill_info.begin;
13981 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13982 backfill_info.pop_front();
13983 vector<pg_shard_t> check_targets = need_ver_targs;
13984 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13985 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13986 i != check_targets.end();
13987 ++i) {
13988 pg_shard_t bt = *i;
13989 BackfillInterval& pbi = peer_backfill_info[bt];
13990 pbi.pop_front();
13991 }
13992 }
13993 }
13994
13995 for (set<hobject_t>::iterator i = add_to_stat.begin();
13996 i != add_to_stat.end();
13997 ++i) {
13998 ObjectContextRef obc = get_object_context(*i, false);
13999 ceph_assert(obc);
14000 pg_stat_t stat;
14001 add_object_context_to_pg_stat(obc, &stat);
14002 pending_backfill_updates[*i] = stat;
14003 }
14004 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
14005 for (unsigned i = 0; i < to_remove.size(); ++i) {
14006 handle.reset_tp_timeout();
14007 const hobject_t& oid = to_remove[i].get<0>();
14008 eversion_t v = to_remove[i].get<1>();
14009 pg_shard_t peer = to_remove[i].get<2>();
14010 MOSDPGBackfillRemove *m;
14011 auto it = reqs.find(peer);
14012 if (it != reqs.end()) {
14013 m = it->second;
14014 } else {
14015 m = reqs[peer] = new MOSDPGBackfillRemove(
14016 spg_t(info.pgid.pgid, peer.shard),
14017 get_osdmap_epoch());
14018 }
14019 m->ls.push_back(make_pair(oid, v));
14020
14021 if (oid <= last_backfill_started)
14022 pending_backfill_updates[oid]; // add empty stat!
14023 }
14024 for (auto p : reqs) {
14025 osd->send_message_osd_cluster(p.first.osd, p.second,
14026 get_osdmap_epoch());
14027 }
14028
14029 pgbackend->run_recovery_op(h, get_recovery_op_priority());
14030
14031 hobject_t backfill_pos =
14032 std::min(backfill_info.begin, earliest_peer_backfill());
14033 dout(5) << "backfill_pos is " << backfill_pos << dendl;
14034 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
14035 i != backfills_in_flight.end();
14036 ++i) {
14037 dout(20) << *i << " is still in flight" << dendl;
14038 }
14039
14040 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
14041 backfill_pos : *(backfills_in_flight.begin());
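  // last_backfill may only advance up to the oldest backfill still in
  // flight; completed stats at or beyond that point stay queued in
  // pending_backfill_updates until a later round.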
14042 hobject_t new_last_backfill = recovery_state.earliest_backfill();
14043 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
14044 for (map<hobject_t, pg_stat_t>::iterator i =
14045 pending_backfill_updates.begin();
14046 i != pending_backfill_updates.end() &&
14047 i->first < next_backfill_to_complete;
14048 pending_backfill_updates.erase(i++)) {
14049 dout(20) << " pending_backfill_update " << i->first << dendl;
14050 ceph_assert(i->first > new_last_backfill);
14051 // carried from a previous round; if we are here, then we had to
14052 // be requeued (by e.g. on_global_recover()) and those operations
14053 // are done.
14054 recovery_state.update_complete_backfill_object_stats(
14055 i->first,
14056 i->second);
14057 new_last_backfill = i->first;
14058 }
14059 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
14060
14061 ceph_assert(!pending_backfill_updates.empty() ||
14062 new_last_backfill == last_backfill_started);
14063 if (pending_backfill_updates.empty() &&
14064 backfill_pos.is_max()) {
14065 ceph_assert(backfills_in_flight.empty());
14066 new_last_backfill = backfill_pos;
14067 last_backfill_started = backfill_pos;
14068 }
14069 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
14070
14071 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
14072 // all the backfill targets. Otherwise, we will move last_backfill up on
14073 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
14074 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
14075 i != get_backfill_targets().end();
14076 ++i) {
14077 pg_shard_t bt = *i;
14078 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
14079
14080 if (new_last_backfill > pinfo.last_backfill) {
14081 recovery_state.update_peer_last_backfill(bt, new_last_backfill);
14082 epoch_t e = get_osdmap_epoch();
14083 MOSDPGBackfill *m = NULL;
14084 if (pinfo.last_backfill.is_max()) {
14085 m = new MOSDPGBackfill(
14086 MOSDPGBackfill::OP_BACKFILL_FINISH,
14087 e,
14088 get_last_peering_reset(),
14089 spg_t(info.pgid.pgid, bt.shard));
14090 // Use default priority here, must match sub_op priority
14091 start_recovery_op(hobject_t::get_max());
14092 } else {
14093 m = new MOSDPGBackfill(
14094 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
14095 e,
14096 get_last_peering_reset(),
14097 spg_t(info.pgid.pgid, bt.shard));
14098 // Use default priority here, must match sub_op priority
14099 }
14100 m->last_backfill = pinfo.last_backfill;
14101 m->stats = pinfo.stats;
14102 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
14103 dout(10) << " peer " << bt
14104 << " num_objects now " << pinfo.stats.stats.sum.num_objects
14105 << " / " << info.stats.stats.sum.num_objects << dendl;
14106 }
14107 }
14108
14109 if (ops)
14110 *work_started = true;
14111 return ops;
14112 }
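// Worked example of the flush above (object names invented): with
// pending_backfill_updates = {A, B, C} (A < B < C), backfills_in_flight =
// {B} and backfill_pos past C, next_backfill_to_complete is B, so only A's
// stats are committed and new_last_backfill becomes A; any target whose
// last_backfill is below A is advanced and sent OP_BACKFILL_PROGRESS, and
// OP_BACKFILL_FINISH goes out only once new_last_backfill reaches MAX.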
14113
14114 int PrimaryLogPG::prep_backfill_object_push(
14115 hobject_t oid, eversion_t v,
14116 ObjectContextRef obc,
14117 vector<pg_shard_t> peers,
14118 PGBackend::RecoveryHandle *h)
14119 {
14120 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
14121 ceph_assert(!peers.empty());
14122
14123 backfills_in_flight.insert(oid);
14124 recovery_state.prepare_backfill_for_missing(oid, v, peers);
14125
14126 ceph_assert(!recovering.count(oid));
14127
14128 start_recovery_op(oid);
14129 recovering.insert(make_pair(oid, obc));
14130
14131 int r = pgbackend->recover_object(
14132 oid,
14133 v,
14134 ObjectContextRef(),
14135 obc,
14136 h);
14137 if (r < 0) {
14138 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
14139 on_failed_pull({ pg_whoami }, oid, v);
14140 }
14141 return r;
14142 }
14143
14144 void PrimaryLogPG::update_range(
14145 BackfillInterval *bi,
14146 ThreadPool::TPHandle &handle)
14147 {
14148 int local_min = cct->_conf->osd_backfill_scan_min;
14149 int local_max = cct->_conf->osd_backfill_scan_max;
14150
14151 if (bi->version < info.log_tail) {
14152 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
14153 << dendl;
14154 bi->version = info.last_update;
14155 scan_range(local_min, local_max, bi, handle);
14156 }
14157
14158 if (bi->version >= projected_last_update) {
14159 dout(10) << __func__<< ": bi is current " << dendl;
14160 ceph_assert(bi->version == projected_last_update);
14161 } else if (bi->version >= info.log_tail) {
14162 if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
14163 /* Because we don't move log_tail on split, the log might be
14164 * empty even if log_tail != last_update. However, the only
14165 * way to get here with an empty log is if log_tail is actually
14166 * eversion_t(), because otherwise the entry which changed
14167 * last_update since the last scan would have to be present.
14168 */
14169 ceph_assert(bi->version == eversion_t());
14170 return;
14171 }
14172
14173 dout(10) << __func__<< ": bi is old, (" << bi->version
14174 << ") can be updated with log to projected_last_update "
14175 << projected_last_update << dendl;
14176
14177 auto func = [&](const pg_log_entry_t &e) {
14178 dout(10) << __func__ << ": updating from version " << e.version
14179 << dendl;
14180 const hobject_t &soid = e.soid;
14181 if (soid >= bi->begin &&
14182 soid < bi->end) {
14183 if (e.is_update()) {
14184 dout(10) << __func__ << ": " << e.soid << " updated to version "
14185 << e.version << dendl;
14186 bi->objects.erase(e.soid);
14187 bi->objects.insert(
14188 make_pair(
14189 e.soid,
14190 e.version));
14191 } else if (e.is_delete()) {
14192 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
14193 bi->objects.erase(e.soid);
14194 }
14195 }
14196 };
14197 dout(10) << "scanning pg log first" << dendl;
14198 recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
14199 dout(10) << "scanning projected log" << dendl;
14200 projected_log.scan_log_after(bi->version, func);
14201 bi->version = projected_last_update;
14202 } else {
14203 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
14204 }
14205 }
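// Replay sketch for update_range() (versions invented): a BackfillInterval
// with objects = {o1: 5'10, o2: 5'9} and bi->version = 5'10, replayed
// against log entries (5'11 modify o1) and (5'12 delete o2), ends up as
// objects = {o1: 5'11} with bi->version = projected_last_update, i.e. the
// interval is brought current from the log without rescanning the store.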
14206
14207 void PrimaryLogPG::scan_range(
14208 int min, int max, BackfillInterval *bi,
14209 ThreadPool::TPHandle &handle)
14210 {
14211 ceph_assert(is_locked());
14212 dout(10) << "scan_range from " << bi->begin << dendl;
14213 bi->clear_objects();
14214
14215 vector<hobject_t> ls;
14216 ls.reserve(max);
14217 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
14218 ceph_assert(r >= 0);
14219 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
14220 dout(20) << ls << dendl;
14221
14222 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
14223 handle.reset_tp_timeout();
14224 ObjectContextRef obc;
14225 if (is_primary())
14226 obc = object_contexts.lookup(*p);
14227 if (obc) {
14228 if (!obc->obs.exists) {
14229 /* If the object does not exist here, it must have been removed
14230 * between the collection_list_partial and here. This can happen
14231 * for the first item in the range, which is usually last_backfill.
14232 */
14233 continue;
14234 }
14235 bi->objects[*p] = obc->obs.oi.version;
14236 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
14237 } else {
14238 bufferlist bl;
14239 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
14240 /* If the object does not exist here, it must have been removed
14241 * between the collection_list_partial and here. This can happen
14242 * for the first item in the range, which is usually last_backfill.
14243 */
14244 if (r == -ENOENT)
14245 continue;
14246
14247 ceph_assert(r >= 0);
14248 object_info_t oi(bl);
14249 bi->objects[*p] = oi.version;
14250 dout(20) << " " << *p << " " << oi.version << dendl;
14251 }
14252 }
14253 }
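// Usage sketch for the listing call above; the batch bounds are invented
// and the backend may return fewer entries near the end of the PG:
//
//   vector<hobject_t> ls;
//   hobject_t next;
//   int r = pgbackend->objects_list_partial(cursor, 64, 512, &ls, &next);
//   // on success ls is sorted in hobject (hash) order and next is the
//   // first object *not* returned, i.e. the resume cursor / bi->end.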
14254
14255
14256 /** check_local
14257 *
14258 * verifies that stray objects have been deleted
14259 */
14260 void PrimaryLogPG::check_local()
14261 {
14262 dout(10) << __func__ << dendl;
14263
14264 ceph_assert(
14265 info.last_update >=
14266 recovery_state.get_pg_log().get_tail()); // otherwise we need some help!
14267
14268 if (!cct->_conf->osd_debug_verify_stray_on_activate)
14269 return;
14270
14271 // just scan the log.
14272 set<hobject_t> did;
14273 for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
14274 p != recovery_state.get_pg_log().get_log().log.rend();
14275 ++p) {
14276 if (did.count(p->soid))
14277 continue;
14278 did.insert(p->soid);
14279
14280 if (p->is_delete() && !is_missing_object(p->soid)) {
14281 dout(10) << " checking " << p->soid
14282 << " at " << p->version << dendl;
14283 struct stat st;
14284 int r = osd->store->stat(
14285 ch,
14286 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
14287 &st);
14288 if (r != -ENOENT) {
14289 derr << __func__ << " " << p->soid << " exists, but should have been "
14290 << "deleted" << dendl;
14291 ceph_abort_msg("erroneously present object");
14292 }
14293 } else {
14294 // ignore old(+missing) objects
14295 }
14296 }
14297 }
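// Example of the reverse scan above (log entries invented): given
// [5'7 delete A, 5'5 modify A, 5'4 delete B] newest-first, the walk stats
// A once (requiring -ENOENT since A should be gone), skips the older
// modify of A via the did set, and then checks B the same way.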
14298
14299
14300
14301 // ===========================
14302 // hit sets
14303
14304 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
14305 {
14306 ostringstream ss;
14307 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
14308 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
14309 info.pgid.ps(), info.pgid.pool(),
14310 cct->_conf->osd_hit_set_namespace);
14311 dout(20) << __func__ << " " << hoid << dendl;
14312 return hoid;
14313 }
14314
14315 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
14316 utime_t end,
14317 bool using_gmt)
14318 {
14319 ostringstream ss;
14320 ss << "hit_set_" << info.pgid.pgid << "_archive_";
14321 if (using_gmt) {
14322 start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
14323 end.gmtime(ss, true /* legacy pre-octopus form */);
14324 } else {
14325 start.localtime(ss, true /* legacy pre-octopus form */) << "_";
14326 end.localtime(ss, true /* legacy pre-octopus form */);
14327 }
14328 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
14329 info.pgid.ps(), info.pgid.pool(),
14330 cct->_conf->osd_hit_set_namespace);
14331 dout(20) << __func__ << " " << hoid << dendl;
14332 return hoid;
14333 }
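// Naming sketch (format approximate): for pg 1.2 and a GMT interval
// [s, e) the archive object is "hit_set_1.2_archive_<s>_<e>", with both
// stamps rendered by gmtime() in the legacy pre-octopus form; the object
// hashes to pgid.ps() and lives in osd_hit_set_namespace, so distinct
// intervals yield distinct, time-ordered names.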
14334
14335 void PrimaryLogPG::hit_set_clear()
14336 {
14337 dout(20) << __func__ << dendl;
14338 hit_set.reset();
14339 hit_set_start_stamp = utime_t();
14340 }
14341
14342 void PrimaryLogPG::hit_set_setup()
14343 {
14344 if (!is_active() ||
14345 !is_primary()) {
14346 hit_set_clear();
14347 return;
14348 }
14349
14350 if (is_active() && is_primary() &&
14351 (!pool.info.hit_set_count ||
14352 !pool.info.hit_set_period ||
14353 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
14354 hit_set_clear();
14355
14356 // only primary is allowed to remove all the hit set objects
14357 hit_set_remove_all();
14358 return;
14359 }
14360
14361 // FIXME: discard any previous data for now
14362 hit_set_create();
14363
14364 // include any writes we know about from the pg log. this doesn't
14365 // capture reads, but it is better than nothing!
14366 hit_set_apply_log();
14367 }
14368
14369 void PrimaryLogPG::hit_set_remove_all()
14370 {
14371 // If any archives are degraded we skip this
14372 for (auto p = info.hit_set.history.begin();
14373 p != info.hit_set.history.end();
14374 ++p) {
14375 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14376
14377 // Once we hit a degraded object just skip
14378 if (is_degraded_or_backfilling_object(aoid))
14379 return;
14380 if (m_scrubber->write_blocked_by_scrub(aoid))
14381 return;
14382 }
14383
14384 if (!info.hit_set.history.empty()) {
14385 auto p = info.hit_set.history.rbegin();
14386 ceph_assert(p != info.hit_set.history.rend());
14387 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14388 ceph_assert(!is_degraded_or_backfilling_object(oid));
14389 ObjectContextRef obc = get_object_context(oid, false);
14390 ceph_assert(obc);
14391
14392 OpContextUPtr ctx = simple_opc_create(obc);
14393 ctx->at_version = get_next_version();
14394 ctx->updated_hset_history = info.hit_set;
14395 utime_t now = ceph_clock_now();
14396 ctx->mtime = now;
14397 hit_set_trim(ctx, 0);
14398 simple_opc_submit(std::move(ctx));
14399 }
14400
14401 recovery_state.update_hset(pg_hit_set_history_t());
14402 if (agent_state) {
14403 agent_state->discard_hit_sets();
14404 }
14405 }
14406
14407 void PrimaryLogPG::hit_set_create()
14408 {
14409 utime_t now = ceph_clock_now();
14410 // make a copy of the params to modify
14411 HitSet::Params params(pool.info.hit_set_params);
14412
14413 dout(20) << __func__ << " " << params << dendl;
14414 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
14415 BloomHitSet::Params *p =
14416 static_cast<BloomHitSet::Params*>(params.impl.get());
14417
14418 // convert false positive rate so it holds up across the full period
14419 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
14420 if (p->get_fpp() <= 0.0)
14421 p->set_fpp(.01); // fpp cannot be zero!
14422
14423 // if we don't have a specified size, estimate the target size based
14424 // on the previous bin!
14425 if (p->target_size == 0 && hit_set) {
14426 utime_t dur = now - hit_set_start_stamp;
14427 unsigned unique = hit_set->approx_unique_insert_count();
14428 dout(20) << __func__ << " previous set had approx " << unique
14429 << " unique items over " << dur << " seconds" << dendl;
14430 p->target_size = (double)unique * (double)pool.info.hit_set_period
14431 / (double)dur;
14432 }
14433 if (p->target_size <
14434 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
14435 p->target_size = cct->_conf->osd_hit_set_min_size;
14436
14437 if (p->target_size
14438 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
14439 p->target_size = cct->_conf->osd_hit_set_max_size;
14440
14441 p->seed = now.sec();
14442
14443 dout(10) << __func__ << " target_size " << p->target_size
14444 << " fpp " << p->get_fpp() << dendl;
14445 }
14446 hit_set.reset(new HitSet(params));
14447 hit_set_start_stamp = now;
14448 }
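// Sizing arithmetic (numbers invented): with hit_set_count = 4 and a
// configured fpp of 0.04, each set is built at 0.04 / 4 = 0.01 so the
// false-positive rate holds across the whole period. If the previous set
// saw ~20000 unique inserts over 300s and hit_set_period is 600s,
// target_size = 20000 * 600 / 300 = 40000, then clamped to
// [osd_hit_set_min_size, osd_hit_set_max_size].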
14449
14450 /**
14451 * apply log entries to set
14452 *
14453 * this would only happen after peering, to at least capture writes
14454 * during an interval that was potentially lost.
14455 */
14456 bool PrimaryLogPG::hit_set_apply_log()
14457 {
14458 if (!hit_set)
14459 return false;
14460
14461 eversion_t to = info.last_update;
14462 eversion_t from = info.hit_set.current_last_update;
14463 if (to <= from) {
14464 dout(20) << __func__ << " no update" << dendl;
14465 return false;
14466 }
14467
14468 dout(20) << __func__ << " " << from << " .. " << to << dendl;
14469 list<pg_log_entry_t>::const_reverse_iterator p =
14470 recovery_state.get_pg_log().get_log().log.rbegin();
14471 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
14472 ++p;
14473 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
14474 hit_set->insert(p->soid);
14475 ++p;
14476 }
14477
14478 return true;
14479 }
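// Range sketch (versions invented): with from = 5'15 (the hit set's
// current_last_update) and to = 5'20 (last_update), the first loop skips
// nothing, since no entry is newer than last_update, and the second
// inserts the soids of entries 5'16..5'20: exactly the writes the
// in-memory set missed.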
14480
14481 void PrimaryLogPG::hit_set_persist()
14482 {
14483 dout(10) << __func__ << dendl;
14484 bufferlist bl;
14485 unsigned max = pool.info.hit_set_count;
14486
14487 utime_t now = ceph_clock_now();
14488 hobject_t oid;
14489
14490 // If any archives are degraded we skip this persist request
14491 // (we account for the additional entry being added below)
14492 for (auto p = info.hit_set.history.begin();
14493 p != info.hit_set.history.end();
14494 ++p) {
14495 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14496
14497 // Once we hit a degraded object just skip further trim
14498 if (is_degraded_or_backfilling_object(aoid))
14499 return;
14500 if (m_scrubber->write_blocked_by_scrub(aoid))
14501 return;
14502 }
14503
14504 // If backfill is in progress and we could possibly overlap with the
14505 // hit_set_* objects, back off. Since these all have
14506 // hobject_t::hash set to pgid.ps(), and those sort first, we can
14507 // look just at that. This is necessary because our transactions
14508 // may include a modify of the new hit_set *and* a delete of the
14509 // old one, and this may span the backfill boundary.
14510 for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
14511 p != get_backfill_targets().end();
14512 ++p) {
14513 const pg_info_t& pi = recovery_state.get_peer_info(*p);
14514 if (pi.last_backfill == hobject_t() ||
14515 pi.last_backfill.get_hash() == info.pgid.ps()) {
14516 dout(10) << __func__ << " backfill target osd." << *p
14517 << " last_backfill has not progressed past pgid ps"
14518 << dendl;
14519 return;
14520 }
14521 }
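// Illustration (hash value invented): if info.pgid.ps() == 0x16, every
// hit_set_* object hashes to 0x16 and sorts before all user objects, so a
// target whose last_backfill is empty or still at hash 0x16 may or may not
// hold the old archive; a single transaction deleting the old set and
// writing the new one could then span the backfill boundary, hence the
// back-off.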
14522
14523
14524 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
14525 new_hset.begin = hit_set_start_stamp;
14526 new_hset.end = now;
14527 oid = get_hit_set_archive_object(
14528 new_hset.begin,
14529 new_hset.end,
14530 new_hset.using_gmt);
14531
14532 // If the current object is degraded we skip this persist request
14533 if (m_scrubber->write_blocked_by_scrub(oid))
14534 return;
14535
14536 hit_set->seal();
14537 encode(*hit_set, bl);
14538 dout(20) << __func__ << " archive " << oid << dendl;
14539
14540 if (agent_state) {
14541 agent_state->add_hit_set(new_hset.begin, hit_set);
14542 uint32_t size = agent_state->hit_set_map.size();
14543 if (size >= pool.info.hit_set_count) {
14544 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
14545 }
14546 hit_set_in_memory_trim(size);
14547 }
14548
14549 ObjectContextRef obc = get_object_context(oid, true);
14550 OpContextUPtr ctx = simple_opc_create(obc);
14551
14552 ctx->at_version = get_next_version();
14553 ctx->updated_hset_history = info.hit_set;
14554 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
14555
14556 updated_hit_set_hist.current_last_update = info.last_update;
14557 new_hset.version = ctx->at_version;
14558
14559 updated_hit_set_hist.history.push_back(new_hset);
14560 hit_set_create();
14561
14562 // fabricate an object_info_t and SnapSet
14563 obc->obs.oi.version = ctx->at_version;
14564 obc->obs.oi.mtime = now;
14565 obc->obs.oi.size = bl.length();
14566 obc->obs.exists = true;
14567 obc->obs.oi.set_data_digest(bl.crc32c(-1));
14568
14569 ctx->new_obs = obc->obs;
14570
14571 ctx->new_snapset = obc->ssc->snapset;
14572
14573 ctx->delta_stats.num_objects++;
14574 ctx->delta_stats.num_objects_hit_set_archive++;
14575
14576 ctx->delta_stats.num_bytes += bl.length();
14577 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
14578
14579 bufferlist bss;
14580 encode(ctx->new_snapset, bss);
14581 bufferlist boi(sizeof(ctx->new_obs.oi));
14582 encode(ctx->new_obs.oi, boi,
14583 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
14584
14585 ctx->op_t->create(oid);
14586 if (bl.length()) {
14587 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
14588 write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
14589 0, bl.length());
14590 ctx->clean_regions.mark_data_region_dirty(0, bl.length());
14591 }
14592 map<string, bufferlist, std::less<>> attrs = {
14593 {OI_ATTR, std::move(boi)},
14594 {SS_ATTR, std::move(bss)}
14595 };
14596 setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
14597 ctx->log.push_back(
14598 pg_log_entry_t(
14599 pg_log_entry_t::MODIFY,
14600 oid,
14601 ctx->at_version,
14602 eversion_t(),
14603 0,
14604 osd_reqid_t(),
14605 ctx->mtime,
14606 0)
14607 );
14608 ctx->log.back().clean_regions = ctx->clean_regions;
14609
14610 hit_set_trim(ctx, max);
14611
14612 simple_opc_submit(std::move(ctx));
14613 }
14614
14615 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
14616 {
14617 ceph_assert(ctx->updated_hset_history);
14618 pg_hit_set_history_t &updated_hit_set_hist =
14619 *(ctx->updated_hset_history);
14620 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
14621 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
14622 ceph_assert(p != updated_hit_set_hist.history.end());
14623 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14624
14625 ceph_assert(!is_degraded_or_backfilling_object(oid));
14626
14627 dout(20) << __func__ << " removing " << oid << dendl;
14628 ++ctx->at_version.version;
14629 ctx->log.push_back(
14630 pg_log_entry_t(pg_log_entry_t::DELETE,
14631 oid,
14632 ctx->at_version,
14633 p->version,
14634 0,
14635 osd_reqid_t(),
14636 ctx->mtime,
14637 0));
14638
14639 ctx->op_t->remove(oid);
14640 updated_hit_set_hist.history.pop_front();
14641
14642 ObjectContextRef obc = get_object_context(oid, false);
14643 ceph_assert(obc);
14644 --ctx->delta_stats.num_objects;
14645 --ctx->delta_stats.num_objects_hit_set_archive;
14646 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
14647 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
14648 }
14649 }
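// Lifecycle arithmetic: with pool.info.hit_set_count = 4 and four archives
// already in history, hit_set_persist() pushes a fifth entry and calls
// hit_set_trim(ctx, 4); the loop above then runs once, logging a DELETE
// for the oldest archive and reversing its object/byte stats in the same
// repop.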
14650
14651 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
14652 {
14653 while (agent_state->hit_set_map.size() > max_in_memory) {
14654 agent_state->remove_oldest_hit_set();
14655 }
14656 }
14657
14658
14659 // =======================================
14660 // cache agent
14661
14662 void PrimaryLogPG::agent_setup()
14663 {
14664 ceph_assert(is_locked());
14665 if (!is_active() ||
14666 !is_primary() ||
14667 state_test(PG_STATE_PREMERGE) ||
14668 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
14669 pool.info.tier_of < 0 ||
14670 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
14671 agent_clear();
14672 return;
14673 }
14674 if (!agent_state) {
14675 agent_state.reset(new TierAgentState);
14676
14677 // choose random starting position
14678 agent_state->position = hobject_t();
14679 agent_state->position.pool = info.pgid.pool();
14680 agent_state->position.set_hash(pool.info.get_random_pg_position(
14681 info.pgid.pgid,
14682 rand()));
14683 agent_state->start = agent_state->position;
14684
14685 dout(10) << __func__ << " allocated new state, position "
14686 << agent_state->position << dendl;
14687 } else {
14688 dout(10) << __func__ << " keeping existing state" << dendl;
14689 }
14690
14691 if (info.stats.stats_invalid) {
14692 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
14693 }
14694
14695 agent_choose_mode();
14696 }
14697
14698 void PrimaryLogPG::agent_clear()
14699 {
14700 agent_stop();
14701 agent_state.reset(NULL);
14702 }
14703
14704 // Return false if no objects were operated on since the start of the object hash space
14705 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
14706 {
14707 std::scoped_lock locker{*this};
14708 if (!agent_state) {
14709 dout(10) << __func__ << " no agent state, stopping" << dendl;
14710 return true;
14711 }
14712
14713 ceph_assert(!recovery_state.is_deleting());
14714
14715 if (agent_state->is_idle()) {
14716 dout(10) << __func__ << " idle, stopping" << dendl;
14717 return true;
14718 }
14719
14720 osd->logger->inc(l_osd_agent_wake);
14721
14722 dout(10) << __func__
14723 << " max " << start_max
14724 << ", flush " << agent_state->get_flush_mode_name()
14725 << ", evict " << agent_state->get_evict_mode_name()
14726 << ", pos " << agent_state->position
14727 << dendl;
14728 ceph_assert(is_primary());
14729 ceph_assert(is_active());
14730
14731 agent_load_hit_sets();
14732
14733 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14734 ceph_assert(base_pool);
14735
14736 int ls_min = 1;
14737 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
14738
14739 // list some objects. this conveniently lists clones (oldest to
14740 // newest) before heads... the same order we want to flush in.
14741 //
14742 // NOTE: do not flush the Sequencer. we will assume that the
14743 // listing we get back is imprecise.
14744 vector<hobject_t> ls;
14745 hobject_t next;
14746 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
14747 &ls, &next);
14748 ceph_assert(r >= 0);
14749 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
14750 int started = 0;
14751 for (vector<hobject_t>::iterator p = ls.begin();
14752 p != ls.end();
14753 ++p) {
14754 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
14755 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
14756 osd->logger->inc(l_osd_agent_skip);
14757 continue;
14758 }
14759 if (is_degraded_or_backfilling_object(*p)) {
14760 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
14761 osd->logger->inc(l_osd_agent_skip);
14762 continue;
14763 }
14764 if (is_missing_object(p->get_head())) {
14765 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
14766 osd->logger->inc(l_osd_agent_skip);
14767 continue;
14768 }
14769 ObjectContextRef obc = get_object_context(*p, false, NULL);
14770 if (!obc) {
14771 // we didn't flush; we may miss something here.
14772 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
14773 osd->logger->inc(l_osd_agent_skip);
14774 continue;
14775 }
14776 if (!obc->obs.exists) {
14777 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
14778 osd->logger->inc(l_osd_agent_skip);
14779 continue;
14780 }
14781 if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
14782 obc->obs.oi.soid.get_head())) {
14783 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14784 osd->logger->inc(l_osd_agent_skip);
14785 continue;
14786 }
14787 if (obc->is_blocked()) {
14788 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14789 osd->logger->inc(l_osd_agent_skip);
14790 continue;
14791 }
14792 if (obc->is_request_pending()) {
14793 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
14794 osd->logger->inc(l_osd_agent_skip);
14795 continue;
14796 }
14797
14798 // be careful flushing omap to an EC pool.
14799 if (!base_pool->supports_omap() &&
14800 obc->obs.oi.is_omap()) {
14801 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14802 osd->logger->inc(l_osd_agent_skip);
14803 continue;
14804 }
14805
14806 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14807 agent_maybe_evict(obc, false))
14808 ++started;
14809 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14810 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14811 ++started;
14812 --agent_flush_quota;
14813 }
14814 if (started >= start_max) {
14815 // If finishing early, set "next" to the next object
14816 if (++p != ls.end())
14817 next = *p;
14818 break;
14819 }
14820 }
14821
14822 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14823 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14824 agent_state->hist_age = 0;
14825 agent_state->temp_hist.decay();
14826 }
14827
14828 // Total objects operated on so far
14829 int total_started = agent_state->started + started;
14830 bool need_delay = false;
14831
14832 dout(20) << __func__ << " start pos " << agent_state->position
14833 << " next start pos " << next
14834 << " started " << total_started << dendl;
14835
14836 // See if we've made a full pass over the object hash space
14837 // This might check at most ls_max objects a second time to notice that
14838 // we've checked every object at least once.
14839 if (agent_state->position < agent_state->start &&
14840 next >= agent_state->start) {
14841 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14842 if (total_started == 0)
14843 need_delay = true;
14844 else
14845 total_started = 0;
14846 agent_state->start = next;
14847 }
14848 agent_state->started = total_started;
14849
14850 // See if we are starting from the beginning
14851 if (next.is_max())
14852 agent_state->position = hobject_t();
14853 else
14854 agent_state->position = next;
14855
14856 // Discard old in memory HitSets
14857 hit_set_in_memory_trim(pool.info.hit_set_count);
14858
14859 if (need_delay) {
14860 ceph_assert(agent_state->delaying == false);
14861 agent_delay();
14862 return false;
14863 }
14864 agent_choose_mode();
14865 return true;
14866 }
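// Wrap-around sketch (hash positions invented): with start at 0x8000,
// successive scans advance position 0x8000 -> ... -> MAX -> 0 -> 0x7fff;
// once position < start while next >= start the whole hash space has been
// covered. If nothing was started over that full pass, the agent delays
// instead of spinning over the same objects.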
14867
14868 void PrimaryLogPG::agent_load_hit_sets()
14869 {
14870 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14871 return;
14872 }
14873
14874 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14875 dout(10) << __func__ << dendl;
14876 for (auto p = info.hit_set.history.begin();
14877 p != info.hit_set.history.end(); ++p) {
14878 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14879 dout(10) << __func__ << " loading " << p->begin << "-"
14880 << p->end << dendl;
14881 if (!pool.info.is_replicated()) {
14882 // FIXME: EC not supported here yet
14883 derr << __func__ << " on non-replicated pool" << dendl;
14884 break;
14885 }
14886
14887 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14888 if (is_unreadable_object(oid)) {
14889 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14890 break;
14891 }
14892
14893 ObjectContextRef obc = get_object_context(oid, false);
14894 if (!obc) {
14895 derr << __func__ << ": could not load hitset " << oid << dendl;
14896 break;
14897 }
14898
14899 bufferlist bl;
14900 {
14901 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
14902 ceph_assert(r >= 0);
14903 }
14904 HitSetRef hs(new HitSet);
14905 bufferlist::const_iterator pbl = bl.begin();
14906 decode(*hs, pbl);
14907 agent_state->add_hit_set(p->begin.sec(), hs);
14908 }
14909 }
14910 }
14911 }
14912
14913 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14914 {
14915 if (!obc->obs.oi.is_dirty()) {
14916 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14917 osd->logger->inc(l_osd_agent_skip);
14918 return false;
14919 }
14920 if (obc->obs.oi.is_cache_pinned()) {
14921 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14922 osd->logger->inc(l_osd_agent_skip);
14923 return false;
14924 }
14925
14926 utime_t now = ceph_clock_now();
14927 utime_t ob_local_mtime;
14928 if (obc->obs.oi.local_mtime != utime_t()) {
14929 ob_local_mtime = obc->obs.oi.local_mtime;
14930 } else {
14931 ob_local_mtime = obc->obs.oi.mtime;
14932 }
14933 bool evict_mode_full =
14934 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14935 if (!evict_mode_full &&
14936 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
14937 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14938 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14939 osd->logger->inc(l_osd_agent_skip);
14940 return false;
14941 }
14942
14943 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14944 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14945 osd->logger->inc(l_osd_agent_skip);
14946 return false;
14947 }
14948
14949 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14950
14951 // FIXME: flush anything dirty, regardless of what distribution of
14952 // ages we expect.
14953
14954 hobject_t oid = obc->obs.oi.soid;
14955 osd->agent_start_op(oid);
14956 // no need to capture a pg ref, can't outlive fop or ctx
14957 std::function<void()> on_flush = [this, oid]() {
14958 osd->agent_finish_op(oid);
14959 };
14960
14961 int result = start_flush(
14962 OpRequestRef(), obc, false, NULL,
14963 on_flush);
14964 if (result != -EINPROGRESS) {
14965 on_flush();
14966 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14967 << " with " << result << dendl;
14968 osd->logger->inc(l_osd_agent_skip);
14969 return false;
14970 }
14971
14972 osd->logger->inc(l_osd_agent_flush);
14973 return true;
14974 }
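// Age-check arithmetic (times invented): with cache_min_flush_age = 300s,
// local_mtime = 900 and now = 1000, 900 + 300 > 1000 marks a head object
// "too young" and it is skipped; snap objects (immutable) and anything in
// EVICT_MODE_FULL bypass the age test.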
14975
14976 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14977 {
14978 const hobject_t& soid = obc->obs.oi.soid;
14979 if (!after_flush && obc->obs.oi.is_dirty()) {
14980 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14981 return false;
14982 }
14983 // This is already checked by agent_work() which passes after_flush = false
14984 if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
14985 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14986 return false;
14987 }
14988 if (!obc->obs.oi.watchers.empty()) {
14989 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14990 return false;
14991 }
14992 if (obc->is_blocked()) {
14993 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14994 return false;
14995 }
14996 if (obc->obs.oi.is_cache_pinned()) {
14997 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14998 return false;
14999 }
15000
15001 if (soid.snap == CEPH_NOSNAP) {
15002 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
15003 if (result < 0) {
15004 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
15005 return false;
15006 }
15007 }
15008
15009 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
15010 // is this object older than cache_min_evict_age?
15011 utime_t now = ceph_clock_now();
15012 utime_t ob_local_mtime;
15013 if (obc->obs.oi.local_mtime != utime_t()) {
15014 ob_local_mtime = obc->obs.oi.local_mtime;
15015 } else {
15016 ob_local_mtime = obc->obs.oi.mtime;
15017 }
15018 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
15019 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
15020 osd->logger->inc(l_osd_agent_skip);
15021 return false;
15022 }
15023 // is this object old and/or cold enough?
15024 int temp = 0;
15025 uint64_t temp_upper = 0, temp_lower = 0;
15026 if (hit_set)
15027 agent_estimate_temp(soid, &temp);
15028 agent_state->temp_hist.add(temp);
15029 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
15030
15031 dout(20) << __func__
15032 << " temp " << temp
15033 << " pos " << temp_lower << "-" << temp_upper
15034 << ", evict_effort " << agent_state->evict_effort
15035 << dendl;
15036 dout(30) << "agent_state:\n";
15037 Formatter *f = Formatter::create("");
15038 f->open_object_section("agent_state");
15039 agent_state->dump(f);
15040 f->close_section();
15041 f->flush(*_dout);
15042 delete f;
15043 *_dout << dendl;
15044
15045 if (1000000 - temp_upper >= agent_state->evict_effort)
15046 return false;
15047 }
15048
15049 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
15050 OpContextUPtr ctx = simple_opc_create(obc);
15051
15052 auto null_op_req = OpRequestRef();
15053 if (!ctx->lock_manager.get_lock_type(
15054 RWState::RWWRITE,
15055 obc->obs.oi.soid,
15056 obc,
15057 null_op_req)) {
15058 close_op_ctx(ctx.release());
15059 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
15060 return false;
15061 }
15062
15063 osd->agent_start_evict_op();
15064 ctx->register_on_finish(
15065 [this]() {
15066 osd->agent_finish_evict_op();
15067 });
15068
15069 ctx->at_version = get_next_version();
15070 ceph_assert(ctx->new_obs.exists);
15071 int r = _delete_oid(ctx.get(), true, false);
15072 if (obc->obs.oi.is_omap())
15073 ctx->delta_stats.num_objects_omap--;
15074 ctx->delta_stats.num_evict++;
15075 ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
15076 if (obc->obs.oi.is_dirty())
15077 --ctx->delta_stats.num_objects_dirty;
15078 ceph_assert(r == 0);
15079 finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
15080 simple_opc_submit(std::move(ctx));
15081 osd->logger->inc(l_osd_tier_evict);
15082 osd->logger->inc(l_osd_agent_evict);
15083 return true;
15084 }
15085
15086 void PrimaryLogPG::agent_stop()
15087 {
15088 dout(20) << __func__ << dendl;
15089 if (agent_state && !agent_state->is_idle()) {
15090 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
15091 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
15092 osd->agent_disable_pg(this, agent_state->evict_effort);
15093 }
15094 }
15095
15096 void PrimaryLogPG::agent_delay()
15097 {
15098 dout(20) << __func__ << dendl;
15099 if (agent_state && !agent_state->is_idle()) {
15100 ceph_assert(agent_state->delaying == false);
15101 agent_state->delaying = true;
15102 osd->agent_disable_pg(this, agent_state->evict_effort);
15103 }
15104 }
15105
15106 void PrimaryLogPG::agent_choose_mode_restart()
15107 {
15108 dout(20) << __func__ << dendl;
15109 std::scoped_lock locker{*this};
15110 if (agent_state && agent_state->delaying) {
15111 agent_state->delaying = false;
15112 agent_choose_mode(true);
15113 }
15114 }
15115
15116 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
15117 {
15118 bool requeued = false;
15119 // Let delay play out
15120 if (agent_state->delaying) {
15121 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
15122 return requeued;
15123 }
15124
15125 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
15126 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
15127 unsigned evict_effort = 0;
15128
15129 if (info.stats.stats_invalid) {
15130 // idle; stats can't be trusted until we scrub.
15131 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
15132 goto skip_calc;
15133 }
15134
15135 {
15136 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
15137 ceph_assert(divisor > 0);
15138
15139 // adjust (effective) user objects down based on the number
15140 // of HitSet objects, which should not count toward our total since
15141 // they cannot be flushed.
15142 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
15143
15144 // also exclude omap objects if ec backing pool
15145 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
15146 ceph_assert(base_pool);
15147 if (!base_pool->supports_omap())
15148 unflushable += info.stats.stats.sum.num_objects_omap;
15149
15150 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
15151 if (num_user_objects > unflushable)
15152 num_user_objects -= unflushable;
15153 else
15154 num_user_objects = 0;
15155
15156 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
15157 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
15158 num_user_bytes -= unflushable_bytes;
15159 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
15160 num_user_bytes += num_overhead_bytes;
15161
15162 // also reduce the num_dirty by num_objects_omap
15163 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
15164 if (!base_pool->supports_omap()) {
15165 if (num_dirty > info.stats.stats.sum.num_objects_omap)
15166 num_dirty -= info.stats.stats.sum.num_objects_omap;
15167 else
15168 num_dirty = 0;
15169 }
15170
15171 dout(10) << __func__
15172 << " flush_mode: "
15173 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
15174 << " evict_mode: "
15175 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
15176 << " num_objects: " << info.stats.stats.sum.num_objects
15177 << " num_bytes: " << info.stats.stats.sum.num_bytes
15178 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
15179 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
15180 << " num_dirty: " << num_dirty
15181 << " num_user_objects: " << num_user_objects
15182 << " num_user_bytes: " << num_user_bytes
15183 << " num_overhead_bytes: " << num_overhead_bytes
15184 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
15185 << " pool.info.target_max_objects: " << pool.info.target_max_objects
15186 << dendl;
15187
15188 // get dirty, full ratios
15189 uint64_t dirty_micro = 0;
15190 uint64_t full_micro = 0;
15191 if (pool.info.target_max_bytes && num_user_objects > 0) {
15192 uint64_t avg_size = num_user_bytes / num_user_objects;
15193 dirty_micro =
15194 num_dirty * avg_size * 1000000 /
15195 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
15196 full_micro =
15197 num_user_objects * avg_size * 1000000 /
15198 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
15199 }
15200 if (pool.info.target_max_objects > 0) {
15201 uint64_t dirty_objects_micro =
15202 num_dirty * 1000000 /
15203 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
15204 if (dirty_objects_micro > dirty_micro)
15205 dirty_micro = dirty_objects_micro;
15206 uint64_t full_objects_micro =
15207 num_user_objects * 1000000 /
15208 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
15209 if (full_objects_micro > full_micro)
15210 full_micro = full_objects_micro;
15211 }
15212 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
15213 << " full " << ((float)full_micro / 1000000.0)
15214 << dendl;
15215
15216 // flush mode
15217 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
15218 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
15219 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
15220 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
15221 flush_target += flush_slop;
15222 flush_high_target += flush_slop;
15223 } else {
15224 flush_target -= std::min(flush_target, flush_slop);
15225 flush_high_target -= std::min(flush_high_target, flush_slop);
15226 }
15227
15228 if (dirty_micro > flush_high_target) {
15229 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
15230 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
15231 flush_mode = TierAgentState::FLUSH_MODE_LOW;
15232 }
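// Hysteresis sketch (ratios invented): with a 0.4 dirty target (400000
// micro) and osd_agent_slop = 0.05, flush_slop = 20000; an idle agent
// enters low-speed flush only above 420000, and an active one falls back
// to idle only below 380000, so noise around the target cannot make the
// mode flap.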
15233
15234 // evict mode
15235 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
15236 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
15237 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
15238 evict_target += evict_slop;
15239 else
15240 evict_target -= std::min(evict_target, evict_slop);
15241
15242 if (full_micro > 1000000) {
15243 // evict anything clean
15244 evict_mode = TierAgentState::EVICT_MODE_FULL;
15245 evict_effort = 1000000;
15246 } else if (full_micro > evict_target) {
15247 // set effort in [0..1] range based on where we are between the evict target and completely full
15248 evict_mode = TierAgentState::EVICT_MODE_SOME;
15249 uint64_t over = full_micro - evict_target;
15250 uint64_t span = 1000000 - evict_target;
15251 evict_effort = std::max(over * 1000000 / span,
15252 uint64_t(1000000.0 *
15253 cct->_conf->osd_agent_min_evict_effort));
15254
15255 // quantize effort to avoid too much reordering in the agent_queue.
15256 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
15257 ceph_assert(inc > 0);
15258 uint64_t was = evict_effort;
15259 evict_effort -= evict_effort % inc;
15260 if (evict_effort < inc)
15261 evict_effort = inc;
15262 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
15263 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
15264 }
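// Effort arithmetic (ratios invented): with evict_target = 800000 and
// full_micro = 900000, over = 100000 and span = 200000, giving
// evict_effort = 100000 * 1000000 / 200000 = 500000 (50%); the result is
// floored at osd_agent_min_evict_effort and rounded down to a multiple of
// osd_agent_quantize_effort * 1000000 so that small ratio changes do not
// keep reshuffling the agent_queue.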
15265 }
15266
15267 skip_calc:
15268 bool old_idle = agent_state->is_idle();
15269 if (flush_mode != agent_state->flush_mode) {
15270 dout(5) << __func__ << " flush_mode "
15271 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
15272 << " -> "
15273 << TierAgentState::get_flush_mode_name(flush_mode)
15274 << dendl;
15275 recovery_state.update_stats(
15276 [=](auto &history, auto &stats) {
15277 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
15278 osd->agent_inc_high_count();
15279 stats.stats.sum.num_flush_mode_high = 1;
15280 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
15281 stats.stats.sum.num_flush_mode_low = 1;
15282 }
15283 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
15284 osd->agent_dec_high_count();
15285 stats.stats.sum.num_flush_mode_high = 0;
15286 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
15287 stats.stats.sum.num_flush_mode_low = 0;
15288 }
15289 return false;
15290 });
15291 agent_state->flush_mode = flush_mode;
15292 }
15293 if (evict_mode != agent_state->evict_mode) {
15294 dout(5) << __func__ << " evict_mode "
15295 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
15296 << " -> "
15297 << TierAgentState::get_evict_mode_name(evict_mode)
15298 << dendl;
15299 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
15300 is_active()) {
15301 if (op)
15302 requeue_op(op);
15303 requeue_ops(waiting_for_flush);
15304 requeue_ops(waiting_for_active);
15305 requeue_ops(waiting_for_readable);
15306 requeue_ops(waiting_for_scrub);
15307 requeue_ops(waiting_for_cache_not_full);
15308 objects_blocked_on_cache_full.clear();
15309 requeued = true;
15310 }
15311 recovery_state.update_stats(
15312 [=](auto &history, auto &stats) {
15313 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
15314 stats.stats.sum.num_evict_mode_some = 1;
15315 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
15316 stats.stats.sum.num_evict_mode_full = 1;
15317 }
15318 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
15319 stats.stats.sum.num_evict_mode_some = 0;
15320 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
15321 stats.stats.sum.num_evict_mode_full = 0;
15322 }
15323 return false;
15324 });
15325 agent_state->evict_mode = evict_mode;
15326 }
15327 uint64_t old_effort = agent_state->evict_effort;
15328 if (evict_effort != agent_state->evict_effort) {
15329 dout(5) << __func__ << " evict_effort "
15330 << ((float)agent_state->evict_effort / 1000000.0)
15331 << " -> "
15332 << ((float)evict_effort / 1000000.0)
15333 << dendl;
15334 agent_state->evict_effort = evict_effort;
15335 }
15336
15337 // NOTE: we are using evict_effort as a proxy for *all* agent effort
15338 // (including flush). This is probably fine (they should be
15339 // correlated) but it is not precisely correct.
15340 if (agent_state->is_idle()) {
15341 if (!restart && !old_idle) {
15342 osd->agent_disable_pg(this, old_effort);
15343 }
15344 } else {
15345 if (restart || old_idle) {
15346 osd->agent_enable_pg(this, agent_state->evict_effort);
15347 } else if (old_effort != agent_state->evict_effort) {
15348 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
15349 }
15350 }
15351 return requeued;
15352 }
15353
15354 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
15355 {
15356 ceph_assert(hit_set);
15357 ceph_assert(temp);
15358 *temp = 0;
15359 if (hit_set->contains(oid))
15360 *temp = 1000000;
15361 unsigned i = 0;
15362 int last_n = pool.info.hit_set_search_last_n;
15363 for (map<time_t,HitSetRef>::reverse_iterator p =
15364 agent_state->hit_set_map.rbegin(); last_n > 0 &&
15365 p != agent_state->hit_set_map.rend(); ++p, ++i) {
15366 if (p->second->contains(oid)) {
15367 *temp += pool.info.get_grade(i);
15368 --last_n;
15369 }
15370 }
15371 }
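// Grading sketch (grades invented; real values come from
// pool.info.get_grade(), driven by hit_set_grade_decay_rate): an object in
// the current set starts at temp = 1000000; if it also appears in the two
// newest archived sets with grades 1000000 and 500000, temp ends at
// 2500000. Only archives that actually contain the object consume
// hit_set_search_last_n.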
15372
15373 // Dup op detection
15374
15375 bool PrimaryLogPG::already_complete(eversion_t v)
15376 {
15377 dout(20) << __func__ << ": " << v << dendl;
15378 for (xlist<RepGather*>::iterator i = repop_queue.begin();
15379 !i.end();
15380 ++i) {
15381 dout(20) << __func__ << ": " << **i << dendl;
15382 // skip copy from temp object ops
15383 if ((*i)->v == eversion_t()) {
15384 dout(20) << __func__ << ": " << **i
15385 << " version is empty" << dendl;
15386 continue;
15387 }
15388 if ((*i)->v > v) {
15389 dout(20) << __func__ << ": " << **i
15390 << " (*i)->v past v" << dendl;
15391 break;
15392 }
15393 if (!(*i)->all_committed) {
15394 dout(20) << __func__ << ": " << **i
15395 << " not committed, returning false"
15396 << dendl;
15397 return false;
15398 }
15399 }
15400 dout(20) << __func__ << ": returning true" << dendl;
15401 return true;
15402 }
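// Walk-through (versions invented): with repop_queue holding v = 5'5, 5'7,
// 5'9 and a query for v = 5'7, the loop checks 5'5 and 5'7 and stops at
// 5'9 (past v); the op is reported complete iff every repop at or below
// 5'7 has all_committed, with temp-object repops (empty v) skipped
// outright.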
15403
15404
15405 // ==========================================================================================
15406 // SCRUB
15407
15408 void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
15409 {
15410 dout(15) << __func__ << " is scrub active? " << is_scrub_active() << dendl;
15411 op->mark_started();
15412
15413 if (!is_scrub_active()) {
15414 dout(10) << __func__ << " scrub isn't active" << dendl;
15415 return;
15416 }
15417 m_scrubber->map_from_replica(op);
15418 }
15419
15420 bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
15421 const hobject_t& end)
15422 {
15423 pair<hobject_t, ObjectContextRef> next;
15424 next.second = object_contexts.lookup(begin);
15425 next.first = begin;
15426 bool more = true;
15427 while (more && next.first < end) {
15428 if (next.second && next.second->is_blocked()) {
15429 next.second->requeue_scrub_on_unblock = true;
15430 dout(10) << __func__ << ": scrub delayed, "
15431 << next.first << " is blocked"
15432 << dendl;
15433 return false;
15434 }
15435 more = object_contexts.get_next(next.first, &next);
15436 }
15437 return true;
15438 }
15439
15440
15441 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
15442 {
15443 OpRequestRef op = ctx->op;
15444 // Only supports replicated pools
15445 ceph_assert(!pool.info.is_erasure());
15446 ceph_assert(is_primary());
15447
15448 dout(10) << __func__ << " " << soid
15449 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
15450
15451 if (!is_clean()) {
15452 block_for_clean(soid, op);
15453 return -EAGAIN;
15454 }
15455
15456 ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
15457 auto& oi = ctx->new_obs.oi;
15458 eversion_t v = oi.version;
15459
15460 if (primary_error(soid, v)) {
15461 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15462 // XXX: If we knew that there is no down osd which could include this
15463 // object, it would be nice if we could return EIO here.
15464 // If a "never fail" flag was available, that could be used
15465 // for rbd to NOT return EIO until object marked lost.
15466
15467 // Drop through to save this op in case an osd comes up with the object.
15468 }
15469
15470 // Restart the op after object becomes readable again
15471 waiting_for_unreadable_object[soid].push_back(op);
15472 op->mark_delayed("waiting for missing object");
15473
15474 ceph_assert(is_clean());
15475 state_set(PG_STATE_REPAIR);
15476 state_clear(PG_STATE_CLEAN);
15477 queue_peering_event(
15478 PGPeeringEventRef(
15479 std::make_shared<PGPeeringEvent>(
15480 get_osdmap_epoch(),
15481 get_osdmap_epoch(),
15482 PeeringState::DoRecovery())));
15483
15484 return -EAGAIN;
15485 }
15486
15487 /*---SnapTrimmer Logging---*/
15488 #undef dout_prefix
15489 #define dout_prefix pg->gen_prefix(*_dout)
15490
15491 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15492 {
15493 ldout(pg->cct, 20) << "enter " << state_name << dendl;
15494 }
15495
15496 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15497 {
15498 ldout(pg->cct, 20) << "exit " << state_name << dendl;
15499 }
15500
15501 bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15502 return
15503 pg->is_clean() &&
15504 !pg->is_scrub_queued_or_active() &&
15505 !pg->snap_trimq.empty();
15506 }
15507
15508 /*---SnapTrimmer states---*/
15509 #undef dout_prefix
15510 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15511 << "SnapTrimmer state<" << get_state_name() << ">: ")
15512
15513 /* NotTrimming */
15514 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15515 : my_base(ctx),
15516 NamedState(nullptr, "NotTrimming")
15517 {
15518 context< SnapTrimmer >().log_enter(state_name);
15519 }
15520
15521 void PrimaryLogPG::NotTrimming::exit()
15522 {
15523 context< SnapTrimmer >().log_exit(state_name, enter_time);
15524 }
15525
15526 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15527 {
15528 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15529 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15530
15531 if (!(pg->is_primary() && pg->is_active())) {
15532 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15533 return discard_event();
15534 }
15535 if (!pg->is_clean() ||
15536 pg->snap_trimq.empty()) {
15537 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15538 return discard_event();
15539 }
15540 if (pg->is_scrub_queued_or_active()) {
15541 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
15542 return transit< WaitScrub >();
15543 } else {
15544 return transit< Trimming >();
15545 }
15546 }
15547
15548 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15549 {
15550 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15551 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15552
15553 pending = nullptr;
15554 if (!context< SnapTrimmer >().can_trim()) {
15555 post_event(KickTrim());
15556 return transit< NotTrimming >();
15557 }
15558
15559 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15560 ldout(pg->cct, 10) << "WaitReservation: trimming "
15561 << pg->snap_trimq.range_start()
15562 << dendl;
15563 return transit< AwaitAsyncWork >();
15564 }
15565
15566 /* AwaitAsyncWork */
15567 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15568 : my_base(ctx),
15569 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15570 {
15571 auto *pg = context< SnapTrimmer >().pg;
15572 context< SnapTrimmer >().log_enter(state_name);
15573 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15574 pg->state_set(PG_STATE_SNAPTRIM);
15575 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
15576 pg->publish_stats_to_osd();
15577 }
15578
15579 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15580 {
15581 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15582 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15583 auto &in_flight = context<Trimming>().in_flight;
15584 ceph_assert(in_flight.empty());
15585
15586 ceph_assert(pg->is_primary() && pg->is_active());
15587 if (!context< SnapTrimmer >().can_trim()) {
15588 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15589 post_event(KickTrim());
15590 return transit< NotTrimming >();
15591 }
15592
15593 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15594
15595 vector<hobject_t> to_trim;
15596 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15597 to_trim.reserve(max);
15598 int r = pg->snap_mapper.get_next_objects_to_trim(
15599 snap_to_trim,
15600 max,
15601 &to_trim);
15602 if (r != 0 && r != -ENOENT) {
15603 lderr(pg->cct) << "get_next_objects_to_trim returned "
15604 << cpp_strerror(r) << dendl;
15605 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15606 } else if (r == -ENOENT) {
15607 // Done!
15608 ldout(pg->cct, 10) << "got ENOENT" << dendl;
15609
15610 pg->snap_trimq.erase(snap_to_trim);
15611
15612 if (pg->snap_trimq_repeat.count(snap_to_trim)) {
15613 ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
15614 pg->snap_trimq_repeat.erase(snap_to_trim);
15615 } else {
15616 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15617 << " to purged_snaps"
15618 << dendl;
15619 ObjectStore::Transaction t;
15620 pg->recovery_state.adjust_purged_snaps(
15621 [snap_to_trim](auto &purged_snaps) {
15622 purged_snaps.insert(snap_to_trim);
15623 });
15624 pg->write_if_dirty(t);
15625
15626 ldout(pg->cct, 10) << "purged_snaps now "
15627 << pg->info.purged_snaps << ", snap_trimq now "
15628 << pg->snap_trimq << dendl;
15629
15630 int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15631 ceph_assert(tr == 0);
15632
15633 pg->recovery_state.share_pg_info();
15634 }
15635 post_event(KickTrim());
15636 return transit< NotTrimming >();
15637 }
15638 ceph_assert(!to_trim.empty());
15639
15640 for (auto &&object: to_trim) {
15641 // Get next
15642 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
15643 OpContextUPtr ctx;
15644 int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
15645 if (error) {
15646 if (error == -ENOLCK) {
15647 ldout(pg->cct, 10) << "could not get write lock on obj "
15648 << object << dendl;
15649 } else {
15650 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15651 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15652 }
15653 if (!in_flight.empty()) {
15654 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15655 return transit< WaitRepops >();
15656 }
15657 if (error == -ENOLCK) {
15658 ldout(pg->cct, 10) << "waiting for it to clear"
15659 << dendl;
15660 return transit< WaitRWLock >();
15661 } else {
15662 return transit< NotTrimming >();
15663 }
15664 }
15665
15666 in_flight.insert(object);
15667 ctx->register_on_success(
15668 [pg, object, &in_flight]() {
15669 ceph_assert(in_flight.find(object) != in_flight.end());
15670 in_flight.erase(object);
15671 if (in_flight.empty()) {
15672 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15673 pg->snap_trimmer_machine.process_event(Reset());
15674 } else {
15675 pg->snap_trimmer_machine.process_event(RepopsComplete());
15676 }
15677 }
15678 });
15679
15680 pg->simple_opc_submit(std::move(ctx));
15681 }
15682
15683 return transit< WaitRepops >();
15684 }
15685
15686 void PrimaryLogPG::setattr_maybe_cache(
15687 ObjectContextRef obc,
15688 PGTransaction *t,
15689 const string &key,
15690 bufferlist &val)
15691 {
15692 t->setattr(obc->obs.oi.soid, key, val);
15693 }
15694
15695 void PrimaryLogPG::setattrs_maybe_cache(
15696 ObjectContextRef obc,
15697 PGTransaction *t,
15698 map<string, bufferlist, less<>> &attrs)
15699 {
15700 t->setattrs(obc->obs.oi.soid, attrs);
15701 }
15702
15703 void PrimaryLogPG::rmattr_maybe_cache(
15704 ObjectContextRef obc,
15705 PGTransaction *t,
15706 const string &key)
15707 {
15708 t->rmattr(obc->obs.oi.soid, key);
15709 }
15710
15711 int PrimaryLogPG::getattr_maybe_cache(
15712 ObjectContextRef obc,
15713 const string &key,
15714 bufferlist *val)
15715 {
15716 if (pool.info.is_erasure()) {
15717 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15718 if (i != obc->attr_cache.end()) {
15719 if (val)
15720 *val = i->second;
15721 return 0;
15722 } else {
15723 return -ENODATA;
15724 }
15725 }
15726 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15727 }
15728
15729 int PrimaryLogPG::getattrs_maybe_cache(
15730 ObjectContextRef obc,
15731 map<string, bufferlist, less<>> *out)
15732 {
15733 int r = 0;
15734 ceph_assert(out);
15735 if (pool.info.is_erasure()) {
15736 *out = obc->attr_cache;
15737 } else {
15738 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15739 }
15740 map<string, bufferlist, less<>> tmp;
15741 for (auto& [key, val]: *out) {
15742 if (key.size() > 1 && key[0] == '_') {
15743 tmp[key.substr(1)] = std::move(val);
15744 }
15745 }
15746 tmp.swap(*out);
15747 return r;
15748 }
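// Filtering sketch: user xattrs are stored with a '_' prefix, so a raw map
// {"_greeting": ..., "snapset": ...} is returned to the caller as
// {"greeting": ...}; internal attrs without the prefix are dropped, and
// the size() > 1 test also excludes the bare "_" (object info) key.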
15749
15750 bool PrimaryLogPG::check_failsafe_full() {
15751 return osd->check_failsafe_full(get_dpp());
15752 }
15753
15754 bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
15755 {
15756 return m_scrubber->write_blocked_by_scrub(oid);
15757 }
15758
15759 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15760 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15761
15762 #ifdef PG_DEBUG_REFS
15763 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15764 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15765 #endif
15766
15767 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15768 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }