// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/ceph_crypto.h"
#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
using TOPNSPC::common::cmd_getval;

template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return pg->gen_prefix(*_dout);
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

using namespace ceph::osd::scheduler;

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Destructor is virtual so implementations can be destroyed via the base class.
  ~CopyCallback() override {}
};
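
/*
 * An illustrative sketch only (CopyFromCallback further down in this file
 * is the real in-tree implementer): a minimal CopyCallback could look
 * like the code below. Note that finish() takes ownership of the
 * CopyResults pointer.
 *
 * @code
 * struct MinimalCopyCallback : public PrimaryLogPG::CopyCallback {
 *   void finish(PrimaryLogPG::CopyCallbackResults results) override {
 *     int r = results.get<0>();
 *     std::unique_ptr<PrimaryLogPG::CopyResults> res(results.get<1>());
 *     if (r == -ECANCELED)
 *       return;  // cancelled by the local OSD; nothing else to do
 *     // ... otherwise associate res with the copy operation we started ...
 *   }
 * };
 * @endcode
 */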

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

template <typename T>
class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
  }
  bool sync_finish(int r) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(r);
    return true;
  }
};

Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap_epoch());
}
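
/*
 * Usage sketch (illustrative only): "blessing" wraps a completion so it
 * runs under the PG lock and is silently dropped if the PG has been
 * reset since the wrapper was created, e.g.
 *
 * @code
 * Context *fin = bless_context(new LambdaContext([this](int r) {
 *   // only runs if the PG has not been reset since the current epoch
 * }));
 * @endcode
 */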

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object(obc);
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object_replica();
    return true;
  }
  void finish(int r) override {
    std::scoped_lock locker{*pg};
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  ceph_assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    ceph_assert(pg->in_progress_async_reads.size());
    ceph_assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
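
/*
 * Illustrative sketch (not in-tree code): an op handler queues async
 * reads by appending (offset, length, flags) -> (buffer, completion)
 * entries to pending_async_reads; start_async_reads() then hands the
 * whole batch to the backend in a single objects_read_async() call:
 *
 * @code
 * ctx->pending_async_reads.push_back(
 *   make_pair(
 *     boost::make_tuple(offset, length, flags),
 *     make_pair(&osd_op.outdata,
 *               new SomeReadFinisher(...))));  // hypothetical completion
 * ctx->start_async_reads(pg);
 * @endcode
 */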

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;
  uint32_t truncate_seq;
  uint64_t truncate_size;
  bool have_truncate = false;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();
    // Only use truncate_{seq,size} from the original object if the client
    // did not send us these parameters
    if (!have_truncate) {
      truncate_seq = results->truncate_seq;
      truncate_size = results->truncate_size;
    }

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  void set_truncate(uint32_t seq, uint64_t size) {
    truncate_seq = seq;
    truncate_size = size;
    have_truncate = true;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss << dendl;
    auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
    if (p != recovery_info.ss.clone_snaps.end()) {
      snaps.insert(p->second.begin(), p->second.end());
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    } else {
      derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
    }
  }
  if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
      recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    ceph_assert(is_primary());
    const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      encode(recovery_info.oi, bl,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ceph_assert(!pool.info.is_erasure());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  recovery_state.recover_got(
    recovery_info.soid,
    recovery_info.version,
    is_delete,
    *t);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;

      bool got = obc->get_recovery_read();
      ceph_assert(got);

      ceph_assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));
  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap_epoch(),
      info.last_complete));
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  recovery_state.object_recovered(soid, stat_diff);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  ceph_assert(i != recovering.end());

  if (i->second && i->second->rwstate.recovery_read_marker) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    ceph_assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->queue_recovery_context(this, c);
}

void PrimaryLogPG::replica_clear_repop_obc(
  const vector<pg_log_entry_t> &logv,
  ObjectStore::Transaction &t)
{
  for (auto &&e: logv) {
    /* Have to blast all clones, they share a snapset */
    object_contexts.clear_range(
      e.soid.get_object_boundary(), e.soid.get_head());
    ceph_assert(
      snapset_contexts.find(e.soid.get_head()) ==
      snapset_contexts.end());
  }
}

bool PrimaryLogPG::should_send_op(
  pg_shard_t peer,
  const hobject_t &hoid) {
  if (peer == get_primary())
    return true;
  ceph_assert(recovery_state.has_peer_info(peer));
  bool should_send =
    hoid.pool != (int64_t)info.pgid.pool() ||
    hoid <= last_backfill_started ||
    hoid <= recovery_state.get_peer_info(peer).last_backfill;
  if (!should_send) {
    ceph_assert(is_backfill_target(peer));
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " beyond std::max(last_backfill_started "
             << ", peer_info[peer].last_backfill "
             << recovery_state.get_peer_info(peer).last_backfill
             << ")" << dendl;
    return should_send;
  }
  if (is_async_recovery_target(peer) &&
      recovery_state.get_peer_missing(peer).is_missing(hoid)) {
    should_send = false;
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " which is pending recovery in async_recovery_targets" << dendl;
  }
  return should_send;
}
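
/*
 * Worked example (illustrative only): with last_backfill_started = "m"
 * and a backfill peer whose last_backfill = "g", an op touching object
 * "c" is sent (the peer already has it), while an op touching "t" is
 * not; the peer instead receives an empty transaction so its log and
 * ordering state stay in step. Objects still missing on an async
 * recovery target are likewise skipped until they have been recovered.
 */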

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return recovery_state.get_pg_log().get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  bool work_started = false;
  if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
    } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h, &work_started);
    } else {
      prep_object_replica_pushes(soid, v, h, &work_started);
    }
    pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
    return true;
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
       i != get_acting_recovery_backfill().end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
    // If an object is missing on an async_recovery_target, return false.
    // This will not block the op and the object is async recovered later.
    if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      if (is_async_recovery_target(peer))
        continue;
      else
        return true;
    }
    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_target(peer) &&
        recovery_state.get_peer_info(peer).last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
{
  for (auto &i: get_async_recovery_targets()) {
    auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
    if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      dout(30) << __func__ << " " << soid << dendl;
      return true;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(oid.is_head());
  ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_head(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (recovery_state.get_pg_log().get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = recovery_state.get_pg_log().get_log().head.version;
  hobject_t soid;
  if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
    min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
    soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
  }
  ceph_assert(!get_acting_recovery_backfill().empty());
  for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
       it != get_acting_recovery_backfill().end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    auto it_missing = recovery_state.get_peer_missing().find(peer);
    if (it_missing != recovery_state.get_peer_missing().end() &&
        !it_missing->second.get_rmissing().empty()) {
      const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
      dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
               << " oid " << min_obj->second << dendl;
      if (min_version > min_obj->first) {
        min_version = min_obj->first;
        soid = min_obj->second;
      }
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

bool PrimaryLogPG::check_laggy(OpRequestRef& op)
{
  if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
                    SERVER_OCTOPUS)) {
    dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl;
    return true;
  }
  if (state_test(PG_STATE_WAIT)) {
    dout(10) << __func__ << " PG is WAIT state" << dendl;
  } else if (!state_test(PG_STATE_LAGGY)) {
    auto mnow = osd->get_mnow();
    auto ru = recovery_state.get_readable_until();
    if (mnow <= ru) {
      // not laggy
      return true;
    }
    dout(10) << __func__
             << " mnow " << mnow
             << " > readable_until " << ru << dendl;

    if (!is_primary()) {
      osd->reply_op_error(op, -EAGAIN);
      return false;
    }

    // go to laggy state
    state_set(PG_STATE_LAGGY);
    publish_stats_to_osd();
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_back(op);
  op->mark_delayed("waiting for readable");
  return false;
}

bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
{
  if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
                    SERVER_OCTOPUS)) {
    return true;
  }
  if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
    return true; // not laggy
  }
  dout(10) << __func__ << " not readable" << dendl;
  waiting_for_readable.push_front(op);
  op->mark_delayed("waiting for readable");
  return false;
}

void PrimaryLogPG::recheck_readable()
{
  if (!is_wait() && !is_laggy()) {
    dout(20) << __func__ << " wasn't wait or laggy" << dendl;
    return;
  }
  auto mnow = osd->get_mnow();
  bool pub = false;
  if (is_wait()) {
    auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
    if (mnow < prior_readable_until_ub) {
      dout(10) << __func__ << " still wait (mnow " << mnow
               << " < prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer wait (mnow " << mnow
               << " >= prior_readable_until_ub " << prior_readable_until_ub
               << ")" << dendl;
      state_clear(PG_STATE_WAIT);
      recovery_state.clear_prior_readable_until_ub();
      pub = true;
    }
  }
  if (is_laggy()) {
    auto ru = recovery_state.get_readable_until();
    if (ru == ceph::signedspan::zero()) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << ", readable_until zero)" << dendl;
    } else if (mnow >= ru) {
      dout(10) << __func__ << " still laggy (mnow " << mnow
               << " >= readable_until " << ru << ")" << dendl;
    } else {
      dout(10) << __func__ << " no longer laggy (mnow " << mnow
               << " < readable_until " << ru << ")" << dendl;
      state_clear(PG_STATE_LAGGY);
      pub = true;
    }
  }
  if (pub) {
    publish_stats_to_osd();
  }
  if (!is_laggy() && !is_wait()) {
    requeue_ops(waiting_for_readable);
  }
}

bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter.get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter.get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter.reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter.filter(sobj, bl);
}

std::pair<int, std::unique_ptr<const PGLSFilter>>
PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
{
  string type;
  // storing non-const PGLSFilter for the sake of ::init()
  std::unique_ptr<PGLSFilter> filter;

  try {
    decode(type, iter);
  }
  catch (buffer::error& e) {
    return { -EINVAL, nullptr };
  }

  if (type.compare("plain") == 0) {
    filter = std::make_unique<PGLSPlainFilter>();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return { -EINVAL, nullptr };
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = ClassHandler::get_instance().open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return { r, nullptr };
    } else {
      ceph_assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return { -EINVAL, nullptr };
    }
    filter.reset(class_filter->fn());
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return { -EINVAL, nullptr };
    }
  }

  ceph_assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    return { -EINVAL, nullptr };
  } else {
    // Successfully constructed and initialized, return it.
    return std::make_pair(0, std::move(filter));
  }
}
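
/*
 * Filter type strings, for reference: "plain" selects the built-in
 * PGLSPlainFilter; anything else must have the form "<class>.<filter>"
 * (e.g. a hypothetical "myclass.myfilter") and is resolved through
 * ClassHandler against a loaded object class. Whatever remains in the
 * input iterator is then passed to the filter's init() as
 * filter-specific arguments.
 */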


// ==========================================================

void PrimaryLogPG::do_command(
  const string_view& orig_prefix,
  const cmdmap_t& cmdmap,
  const bufferlist& idata,
  std::function<void(int,const std::string&,bufferlist&)> on_finish)
{
  string format;
  cmd_getval(cmdmap, "format", format);
  std::unique_ptr<Formatter> f(Formatter::create(
    format, "json-pretty", "json-pretty"));
  int ret = 0;
  stringstream ss;   // stderr error message stream
  bufferlist outbl;  // if empty at end, we'll dump formatter as output

  // get final prefix:
  // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
  // - ceph tell <pgid> foo -> prefix=foo
  string prefix(orig_prefix);
  string command;
  cmd_getval(cmdmap, "cmd", command);
  if (command.size()) {
    prefix = command;
  }

  if (prefix == "query") {
    f->open_object_section("pg");
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    recovery_state.dump_peering_state(f.get());
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
  }

  else if (prefix == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.is_erasure()) {
        ss << "mode must be 'delete' for ec pool";
        ret = -EINVAL;
        goto out;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      ret = -EINVAL;
      goto out;
    }
    ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
                mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      ret = -EROFS;
      goto out;
    }

    uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      goto out;  // make command idempotent
    }

    if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      ret = -EINVAL;
      goto out;
    }

    mark_all_unfound_lost(mode, on_finish);
    return;
  }

  else if (prefix == "list_unfound") {
    hobject_t offset;
    string offset_json;
    bool show_offset = false;
    if (cmd_getval(cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        ret = -EINVAL;
        goto out;
      }
      show_offset = true;
    }
    f->open_object_section("missing");
    if (show_offset) {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    auto &needs_recovery_map = recovery_state.get_missing_loc()
      .get_needs_recovery();
    f->dump_int("num_missing", needs_recovery_map.size());
    f->dump_int("num_unfound", get_num_unfound());
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() &&
             num < cct->_conf->osd_command_max_records;
           ++p) {
        if (recovery_state.get_missing_loc().is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (auto &&r : recovery_state.get_missing_loc().get_locations(
                   p->first)) {
              f->dump_stream("shard") << r;
            }
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
  }

  else if (prefix == "scrub" ||
           prefix == "deep_scrub") {
    bool deep = (prefix == "deep_scrub");
    int64_t time;
    cmd_getval(cmdmap, "time", time, (int64_t)0);

    if (is_primary()) {
      const pg_pool_t *p = &pool.info;
      double pool_scrub_max_interval = 0;
      double scrub_max_interval;
      if (deep) {
        p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
      } else {
        p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
        scrub_max_interval = pool_scrub_max_interval > 0 ?
          pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
      }
      // Instead of marking must_scrub force a schedule scrub
      utime_t stamp = ceph_clock_now();
      if (time == 0)
        stamp -= scrub_max_interval;
      else
        stamp -= (float)time;
      stamp -= 100.0;  // push back last scrub more for good measure
      if (deep) {
        set_last_deep_scrub_stamp(stamp);
      } else {
        set_last_scrub_stamp(stamp);
      }
      f->open_object_section("result");
      f->dump_bool("deep", deep);
      f->dump_stream("stamp") << stamp;
      f->close_section();
    } else {
      ss << "Not primary";
      ret = -EPERM;
    }
    outbl.append(ss.str());
  }

  else {
    ret = -ENOSYS;
    ss << "prefix '" << prefix << "' not implemented";
  }

 out:
  if (ret >= 0 && outbl.length() == 0) {
    f->flush(outbl);
  }
  on_finish(ret, ss.str(), outbl);
}
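
/*
 * For reference, these prefixes correspond to admin commands along the
 * lines of (exact CLI syntax may vary by release):
 *
 *   ceph tell <pgid> query
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 *   ceph pg <pgid> list_unfound
 *   ceph pg <pgid> scrub  /  ceph pg <pgid> deep_scrub
 */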


// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    std::unique_ptr<const PGLSFilter> filter;
    OSDOp& osd_op = *p;
    auto bp = p->indata.cbegin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
                 << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash()
                   << std::dec << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (recovery_state.get_missing_loc().is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(*filter, candidate))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        encode(response, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      std::tie(result, filter) = get_pgls_filter(bp);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (recovery_state.get_missing_loc().is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(*filter, candidate))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        encode(response, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
}

int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.cbegin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  encode(result, osd_op->outdata);
  return r;
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool,
                           const map<string,string>& ec_profile, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  recovery_state.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  auto m = op->get_req<MOSDBackoff>();
  auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  int msg_type = m->get_type();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.

    if (msg_type == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf()->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (msg_type == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      ceph_assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (recovery_state.needs_flush()) {
    dout(20) << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  ceph_assert(is_peered() && !recovery_state.needs_flush());
  if (pgbackend->handle_message(op))
    return;

  switch (msg_type) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (msg_type) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      auto m = op->get_req<MOSDScrubReserve>();
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    ceph_abort_msg("bad message type in do_request");
  }
}

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (const pg_shard_t& bt : get_backfill_targets()) {
    const pg_info_t &pi = recovery_state.get_peer_info(bt);
    e = std::min(pi.last_backfill, e);
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE(cct);
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) {
    op->reset_desc();   // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  const hobject_t head = m->get_hobj().get_head();

  if (!info.pgid.pgid.contains(
        info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
         << head << " pg_num " << pool.info.get_pg_num() << " hash "
         << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
                      << " op " << *m;
    ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  ceph::ref_t<Session> session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv().get());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }

    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  {
    int r = op->maybe_init_op_info(*get_osdmap());
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
                         CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) {
    // balanced reads; any replica will do
    if (!(is_primary() || is_nonprimary())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  if (!check_laggy(op)) {
    return;
  }

  if (!op_has_sufficient_caps(op)) {
    osd->reply_op_error(op, -EPERM);
    return;
  }

  if (op->includes_pg_op()) {
    return do_pg_op(op);
  }

  // object name too long?
  if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op name is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op locator is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
    dout(4) << "do_op namespace is longer than "
            << cct->_conf->osd_max_object_namespace_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().oid.name.empty()) {
    dout(4) << "do_op empty oid name is not allowed" << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  if (int r = osd->store->validate_hobject_key(head)) {
    dout(4) << "do_op object " << head << " invalid for backing store: "
            << r << dendl;
    osd->reply_op_error(op, r);
    return;
  }

  // blacklisted?
  if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
    dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
    osd->reply_op_error(op, -EBLACKLISTED);
    return;
  }

  // order this op as a write?
  bool write_ordered = op->rwordered();

  // discard due to cluster full transition? (we discard any op that
  // originates before the cluster or pool is marked full; the client
  // will resend after the full flag is removed or if they expect the
1950 // op to succeed despite being full). The exceptions are FULL_FORCE and
1951 // FULL_TRY ops, which there is no reason to discard because they
1952 // bypass all full checks anyway. If this op isn't write-ordered, we
1953 // skip the check entirely.
1954 // FIXME: we exclude mds writes for now.
1955 if (write_ordered && !(m->get_source().is_mds() ||
1956 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1957 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1958 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1959 dout(10) << __func__ << " discarding op sent before full " << m << " "
1960 << *m << dendl;
1961 return;
1962 }
1963 // mds should have stopped writing before this point.
1964 // We can't allow OSD to become non-startable even if mds
1965 // could be writing as part of file removals.
1966 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
1967 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
1968 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
1969 return;
1970 }
1971 int64_t poolid = get_pgid().pool();
1972 if (op->may_write()) {
1973
1974 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1975 if (!pi) {
1976 return;
1977 }
1978
1979 // invalid?
1980 if (m->get_snapid() != CEPH_NOSNAP) {
1981 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1982 osd->reply_op_error(op, -EINVAL);
1983 return;
1984 }
1985
1986 // too big?
1987 if (cct->_conf->osd_max_write_size &&
1988 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
1989 // journal can't hold commit!
1990 derr << "do_op msg data len " << m->get_data_len()
1991 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
1992 << " on " << *m << dendl;
1993 osd->reply_op_error(op, -OSD_WRITETOOBIG);
1994 return;
1995 }
1996 }
1997
1998 dout(10) << "do_op " << *m
1999 << (op->may_write() ? " may_write" : "")
2000 << (op->may_read() ? " may_read" : "")
2001 << (op->may_cache() ? " may_cache" : "")
2002 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2003 << " flags " << ceph_osd_flag_string(m->get_flags())
2004 << dendl;
2005
2006 // missing object?
2007 if (is_unreadable_object(head)) {
2008 if (!is_primary()) {
2009 osd->reply_op_error(op, -EAGAIN);
2010 return;
2011 }
2012 if (can_backoff &&
2013 (g_conf()->osd_backoff_on_degraded ||
2014 (g_conf()->osd_backoff_on_unfound &&
2015 recovery_state.get_missing_loc().is_unfound(head)))) {
2016 add_backoff(session, head, head);
2017 maybe_kick_recovery(head);
2018 } else {
2019 wait_for_unreadable_object(head, op);
2020 }
2021 return;
2022 }
2023
2024 if (write_ordered) {
2025 // degraded object?
2026 if (is_degraded_or_backfilling_object(head)) {
2027 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2028 add_backoff(session, head, head);
2029 maybe_kick_recovery(head);
2030 } else {
2031 wait_for_degraded_object(head, op);
2032 }
2033 return;
2034 }
2035
2036 if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
2037 dout(20) << __func__ << ": waiting for scrub" << dendl;
2038 waiting_for_scrub.push_back(op);
2039 op->mark_delayed("waiting for scrub");
2040 return;
2041 }
2042 if (!check_laggy_requeue(op)) {
2043 return;
2044 }
2045
2046 // blocked on snap?
2047 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2048 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2049 hobject_t to_wait_on(head);
2050 to_wait_on.snap = blocked_iter->second;
2051 wait_for_degraded_object(to_wait_on, op);
2052 return;
2053 }
2054 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2055 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2056 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2057 return;
2058 }
2059 if (objects_blocked_on_cache_full.count(head)) {
2060 block_write_on_full_cache(head, op);
2061 return;
2062 }
2063 }
2064
2065 // dup/resent?
2066 if (op->may_write() || op->may_cache()) {
2067 // warning: we will get back *a* request for this reqid, but not
2068 // necessarily the most recent. this happens with flush and
2069 // promote ops, but we can't possibly have both in our log where
2070 // the original request is still not stable on disk, so for our
2071 // purposes here it doesn't matter which one we get.
2072 eversion_t version;
2073 version_t user_version;
2074 int return_code = 0;
2075 vector<pg_log_op_return_item_t> op_returns;
2076 bool got = check_in_progress_op(
2077 m->get_reqid(), &version, &user_version, &return_code, &op_returns);
2078 if (got) {
2079 dout(3) << __func__ << " dup " << m->get_reqid()
2080 << " version " << version << dendl;
2081 if (already_complete(version)) {
2082 osd->reply_op_error(op, return_code, version, user_version, op_returns);
2083 } else {
2084 dout(10) << " waiting for " << version << " to commit" << dendl;
2085 // always queue ondisk waiters, so that we can requeue if needed
2086 waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
2087 op_returns);
2088 op->mark_delayed("waiting for ondisk");
2089 }
2090 return;
2091 }
2092 }
2093
2094 ObjectContextRef obc;
2095 bool can_create = op->may_write();
2096 hobject_t missing_oid;
2097
2098 // kludge around the fact that LIST_SNAPS uses CEPH_SNAPDIR as its snapid
2099 const hobject_t& oid =
2100 m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
2101
2102 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2103 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2104 OSDOp& osd_op = *p;
2105
2106 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2107 if (m->get_snapid() != CEPH_SNAPDIR) {
2108 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2109 osd->reply_op_error(op, -EINVAL);
2110 return;
2111 }
2112 } else {
2113 if (m->get_snapid() == CEPH_SNAPDIR) {
2114 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2115 osd->reply_op_error(op, -EINVAL);
2116 return;
2117 }
2118 }
2119 }
2120
2121 // io blocked on obc?
2122 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2123 maybe_await_blocked_head(oid, op)) {
2124 return;
2125 }
2126
2127 if (!is_primary()) {
2128 if (!recovery_state.can_serve_replica_read(oid)) {
2129 dout(20) << __func__ << ": oid " << oid
2130 << " unstable write on replica, bouncing to primary: "
2131 << *m << dendl;
2132 osd->reply_op_error(op, -EAGAIN);
2133 return;
2134 } else {
2135 dout(20) << __func__ << ": serving replica read on oid " << oid
2136 << dendl;
2137 }
2138 }
2139
2140 int r = find_object_context(
2141 oid, &obc, can_create,
2142 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2143 &missing_oid);
2144
2145 // LIST_SNAPS needs the ssc too
2146 if (obc &&
2147 m->get_snapid() == CEPH_SNAPDIR &&
2148 !obc->ssc) {
2149 obc->ssc = get_snapset_context(oid, true);
2150 }
2151
2152 if (r == -EAGAIN) {
2153 // If we're not the primary for this PG, we just return -EAGAIN. Otherwise,
2154 // we have to wait for the object.
2155 if (is_primary()) {
2156 // missing the specific snap we need; requeue and wait.
2157 ceph_assert(!op->may_write()); // only happens on a read/cache
2158 wait_for_unreadable_object(missing_oid, op);
2159 return;
2160 }
2161 } else if (r == 0) {
2162 if (is_unreadable_object(obc->obs.oi.soid)) {
2163 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2164 << " is unreadable, waiting" << dendl;
2165 wait_for_unreadable_object(obc->obs.oi.soid, op);
2166 return;
2167 }
2168
2169 // degraded object? (the check above was for head; this could be a clone)
2170 if (write_ordered &&
2171 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2172 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2173 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2174 << " is degraded, waiting" << dendl;
2175 wait_for_degraded_object(obc->obs.oi.soid, op);
2176 return;
2177 }
2178 }
2179
2180 bool in_hit_set = false;
2181 if (hit_set) {
2182 if (obc.get()) {
2183 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2184 in_hit_set = true;
2185 } else {
2186 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2187 in_hit_set = true;
2188 }
2189 if (!op->hitset_inserted) {
2190 hit_set->insert(oid);
2191 op->hitset_inserted = true;
2192 if (hit_set->is_full() ||
2193 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2194 hit_set_persist();
2195 }
2196 }
2197 }
2198
2199 if (agent_state) {
2200 if (agent_choose_mode(false, op))
2201 return;
2202 }
2203
2204 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2205 if (maybe_handle_manifest(op,
2206 write_ordered,
2207 obc))
2208 return;
2209 }
2210
2211 if (maybe_handle_cache(op,
2212 write_ordered,
2213 obc,
2214 r,
2215 missing_oid,
2216 false,
2217 in_hit_set))
2218 return;
2219
2220 if (r && (r != -ENOENT || !obc)) {
2221 // copy the reqids for copy get on ENOENT
2222 if (r == -ENOENT &&
2223 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2224 fill_in_copy_get_noent(op, oid, m->ops[0]);
2225 return;
2226 }
2227 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2228 if (op->may_write() &&
2229 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2230 record_write_error(op, oid, nullptr, r);
2231 } else {
2232 osd->reply_op_error(op, r);
2233 }
2234 return;
2235 }
2236
2237 // make sure locator is consistent
2238 object_locator_t oloc(obc->obs.oi.soid);
2239 if (m->get_object_locator() != oloc) {
2240 dout(10) << " provided locator " << m->get_object_locator()
2241 << " != object's " << obc->obs.oi.soid << dendl;
2242 osd->clog->warn() << "bad locator " << m->get_object_locator()
2243 << " on object " << oloc
2244 << " op " << *m;
2245 }
2246
2247 // io blocked on obc?
2248 if (obc->is_blocked() &&
2249 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2250 wait_for_blocked_object(obc->obs.oi.soid, op);
2251 return;
2252 }
2253
2254 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2255
2256 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2257
2258 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2259 dout(20) << __func__ << ": skipping rw locks" << dendl;
2260 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2261 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2262
2263 // verify there is in fact a flush in progress
2264 // FIXME: we could make this a stronger test.
2265 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2266 if (p == flush_ops.end()) {
2267 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2268 reply_ctx(ctx, -EINVAL);
2269 return;
2270 }
2271 } else if (!get_rw_locks(write_ordered, ctx)) {
2272 dout(20) << __func__ << " waiting for rw locks " << dendl;
2273 op->mark_delayed("waiting for rw locks");
2274 close_op_ctx(ctx);
2275 return;
2276 }
2277 dout(20) << __func__ << " obc " << *obc << dendl;
2278
2279 if (r) {
2280 dout(20) << __func__ << " returned an error: " << r << dendl;
2281 if (op->may_write() &&
2282 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2283 record_write_error(op, oid, nullptr, r,
2284 ctx->op->allows_returnvec() ? ctx : nullptr);
2285 } else {
2286 osd->reply_op_error(op, r);
2287 }
2288 close_op_ctx(ctx);
2289 return;
2290 }
2291
2292 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2293 ctx->ignore_cache = true;
2294 }
2295
2296 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2297 // This object is lost. Reading from it returns an error.
2298 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2299 << " is lost" << dendl;
2300 reply_ctx(ctx, -ENFILE);
2301 return;
2302 }
2303 if (!op->may_write() &&
2304 !op->may_cache() &&
2305 (!obc->obs.exists ||
2306 ((m->get_snapid() != CEPH_SNAPDIR) &&
2307 obc->obs.oi.is_whiteout()))) {
2308 // copy the reqids for copy get on ENOENT
2309 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2310 fill_in_copy_get_noent(op, oid, m->ops[0]);
2311 close_op_ctx(ctx);
2312 return;
2313 }
2314 reply_ctx(ctx, -ENOENT);
2315 return;
2316 }
2317
2318 op->mark_started();
2319
2320 execute_ctx(ctx);
2321 utime_t prepare_latency = ceph_clock_now();
2322 prepare_latency -= op->get_dequeued_time();
2323 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2324 if (op->may_read() && op->may_write()) {
2325 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2326 } else if (op->may_read()) {
2327 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2328 } else if (op->may_write() || op->may_cache()) {
2329 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2330 }
2331
2332 // force recovery of the oldest missing object if the log has grown too long
2333 maybe_force_recovery();
2334 }
2335
2336 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2337 OpRequestRef op,
2338 bool write_ordered,
2339 ObjectContextRef obc)
2340 {
2341 ceph_assert(obc);
2342 if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2343 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2344 return cache_result_t::NOOP;
2345 }
2346
2347 // if it is write-ordered and blocked, stop now
2348 if (obc->is_blocked() && write_ordered) {
2349 // we're already doing something with this object
2350 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2351 return cache_result_t::NOOP;
2352 }
2353
2354 vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
2355 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2356 OSDOp& osd_op = *p;
2357 ceph_osd_op& op = osd_op.op;
2358 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2359 op.op == CEPH_OSD_OP_SET_CHUNK ||
2360 op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
2361 op.op == CEPH_OSD_OP_TIER_FLUSH) {
2362 return cache_result_t::NOOP;
2363 } else if (op.op == CEPH_OSD_OP_TIER_PROMOTE) {
2364 bool is_dirty = false;
2365 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2366 if (p.second.is_dirty()) {
2367 is_dirty = true;
2368 }
2369 }
2370 if (is_dirty) {
2371 start_flush(OpRequestRef(), obc, true, NULL, std::nullopt);
2372 }
2373 return cache_result_t::NOOP;
2374 }
2375 }
2376
2377 switch (obc->obs.oi.manifest.type) {
2378 case object_manifest_t::TYPE_REDIRECT:
2379 if (op->may_write() || write_ordered) {
2380 do_proxy_write(op, obc);
2381 } else {
2382 // a non-zero size means the object was already promoted; serve it locally
2383 if (obc->obs.oi.size != 0) {
2384 return cache_result_t::NOOP;
2385 }
2386 do_proxy_read(op, obc);
2387 }
2388 return cache_result_t::HANDLED_PROXY;
2389 case object_manifest_t::TYPE_CHUNKED:
2390 {
2391 if (can_proxy_chunked_read(op, obc)) {
2392 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2393 if (p != flush_ops.end()) {
2394 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2395 return cache_result_t::HANDLED_PROXY;
2396 }
2397 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2398 return cache_result_t::HANDLED_PROXY;
2399 }
2400
2401 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2402 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2403 hobject_t head = m->get_hobj();
2404
2405 if (is_degraded_or_backfilling_object(head)) {
2406 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2407 wait_for_degraded_object(head, op);
2408 return cache_result_t::BLOCKED_RECOVERY;
2409 }
2410
2411 if (write_blocked_by_scrub(head)) {
2412 dout(20) << __func__ << ": waiting for scrub" << dendl;
2413 waiting_for_scrub.push_back(op);
2414 op->mark_delayed("waiting for scrub");
2415 return cache_result_t::BLOCKED_RECOVERY;
2416 }
2417 if (!check_laggy_requeue(op)) {
2418 return cache_result_t::BLOCKED_RECOVERY;
2419 }
2420
2421 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2422 if (p.second.is_missing()) {
2423 auto m = op->get_req<MOSDOp>();
2424 const object_locator_t oloc = m->get_object_locator();
2425 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2426 return cache_result_t::BLOCKED_PROMOTE;
2427 }
2428 }
2429
2430 bool all_dirty = true;
2431 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2432 if (!p.second.is_dirty()) {
2433 all_dirty = false;
2434 }
2435 }
2436 if (all_dirty) {
2437 start_flush(OpRequestRef(), obc, true, NULL, std::nullopt);
2438 }
2439 return cache_result_t::NOOP;
2440 }
2441 default:
2442 ceph_abort_msg("unrecognized manifest type");
2443 }
2444
2445 return cache_result_t::NOOP;
2446 }
2447
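/**
 * Completion for a single chunk write issued by do_manifest_flush().
 * It re-takes the PG lock, feeds the per-chunk result back into
 * handle_manifest_flush(), and records the tier flush latency.
 */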
2448 struct C_ManifestFlush : public Context {
2449 PrimaryLogPGRef pg;
2450 hobject_t oid;
2451 epoch_t lpr;
2452 ceph_tid_t tid;
2453 utime_t start;
2454 uint64_t offset;
2455 uint64_t last_offset;
2456 C_ManifestFlush(PrimaryLogPG *p, hobject_t o, epoch_t e)
2457 : pg(p), oid(o), lpr(e),
2458 tid(0), start(ceph_clock_now())
2459 {}
2460 void finish(int r) override {
2461 if (r == -ECANCELED)
2462 return;
2463 std::scoped_lock locker{*pg};
2464 pg->handle_manifest_flush(oid, tid, r, offset, last_offset, lpr);
2465 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
2466 }
2467 };
2468
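/**
 * Record the result of one chunk write of a manifest flush. On the
 * first error the FlushOp's rval is set and the flush is wound down;
 * once results for every issued chunk have arrived (and no peering
 * reset has intervened) the flush advances via finish_manifest_flush().
 */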
2469 void PrimaryLogPG::handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2470 uint64_t offset, uint64_t last_offset,
2471 epoch_t lpr)
2472 {
2473 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2474 if (p == flush_ops.end()) {
2475 dout(10) << __func__ << " no flush_op found" << dendl;
2476 return;
2477 }
2478 if (p->second->rval < 0) {
2479 return;
2480 }
2481 p->second->io_results[offset] = r;
2482 for (auto &ior: p->second->io_results) {
2483 if (ior.second < 0) {
2484 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2485 p->second->rval = r;
2486 return;
2487 }
2488 }
2489 if (p->second->chunks == p->second->io_results.size()) {
2490 if (lpr == get_last_peering_reset()) {
2491 ceph_assert(p->second->obc);
2492 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2493 }
2494 }
2495 }
2496
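/**
 * Kick off a flush of the dirty chunks of a manifest object. Returns
 * -EINPROGRESS once the first batch of chunk writes is in flight (the
 * FlushOp is tracked in flush_ops), or a negative error if the initial
 * batch could not be issued.
 */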
2497 int PrimaryLogPG::start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking,
2498 std::optional<std::function<void()>> &&on_flush)
2499 {
2500 auto p = obc->obs.oi.manifest.chunk_map.begin();
2501 FlushOpRef manifest_fop(std::make_shared<FlushOp>());
2502 manifest_fop->op = op;
2503 manifest_fop->obc = obc;
2504 manifest_fop->flushed_version = obc->obs.oi.user_version;
2505 manifest_fop->blocking = blocking;
2506 manifest_fop->on_flush = std::move(on_flush);
2507 int r = do_manifest_flush(op, obc, manifest_fop, p->first, blocking);
2508 if (r < 0) {
2509 return r;
2510 }
2511
2512 flush_ops[obc->obs.oi.soid] = manifest_fop;
2513 return -EINPROGRESS;
2514 }
2515
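/**
 * Issue writes for the dirty chunks of a manifest object, starting at
 * start_offset and batching until get_copy_chunk_size() is exceeded;
 * the remainder is picked up later by finish_manifest_flush(). When
 * the pool has a fingerprint algorithm configured and the chunk holds
 * a reference, the data is written via the "cas" object class under
 * its fingerprint oid and the old chunk's refcount is dropped.
 */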
2516 int PrimaryLogPG::do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop,
2517 uint64_t start_offset, bool block)
2518 {
2519 struct object_manifest_t &manifest = obc->obs.oi.manifest;
2520 hobject_t soid = obc->obs.oi.soid;
2521 ceph_tid_t tid;
2522 SnapContext snapc;
2523 uint64_t max_copy_size = 0, last_offset = 0;
2524
2525 map<uint64_t, chunk_info_t>::iterator iter = manifest.chunk_map.find(start_offset);
2526 ceph_assert(iter != manifest.chunk_map.end());
2527 for (;iter != manifest.chunk_map.end(); ++iter) {
2528 if (iter->second.is_dirty()) {
2529 last_offset = iter->first;
2530 max_copy_size += iter->second.length;
2531 }
2532 if (get_copy_chunk_size() < max_copy_size) {
2533 break;
2534 }
2535 }
2536
2537 iter = manifest.chunk_map.find(start_offset);
2538 for (;iter != manifest.chunk_map.end(); ++iter) {
2539 if (!iter->second.is_dirty()) {
2540 continue;
2541 }
2542 uint64_t tgt_length = iter->second.length;
2543 uint64_t tgt_offset = iter->second.offset;
2544 hobject_t tgt_soid = iter->second.oid;
2545 object_locator_t oloc(tgt_soid);
2546 ObjectOperation obj_op;
2547 bufferlist chunk_data;
2548 int r = pgbackend->objects_read_sync(
2549 soid, iter->first, tgt_length, 0, &chunk_data);
2550 if (r < 0) {
2551 dout(0) << __func__ << " read fail, offset: " << tgt_offset
2552 << " len: " << tgt_length << " r: " << r << dendl;
2553 return r;
2554 }
2555 if (!chunk_data.length()) {
2556 return -ENODATA;
2557 }
2558
2559 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
2560 CEPH_OSD_FLAG_RWORDERED;
2561 tgt_length = chunk_data.length();
2562 if (pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
2563 iter->second.has_reference() &&
2564 fp_algo != pg_pool_t::TYPE_FINGERPRINT_NONE) {
2565 object_t fp_oid = [fp_algo, &chunk_data]() -> string {
2566 switch (fp_algo) {
2567 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
2568 return crypto::digest<crypto::SHA1>(chunk_data).to_str();
2569 case pg_pool_t::TYPE_FINGERPRINT_SHA256:
2570 return crypto::digest<crypto::SHA256>(chunk_data).to_str();
2571 case pg_pool_t::TYPE_FINGERPRINT_SHA512:
2572 return crypto::digest<crypto::SHA512>(chunk_data).to_str();
2573 default:
2574 ceph_abort_msg("unrecognized fingerprint type");
2575 return {};
2576 }
2577 }();
2578 bufferlist in;
2579 if (fp_oid != tgt_soid.oid) {
2580 // decrement old chunk's reference count
2581 ObjectOperation dec_op;
2582 cls_chunk_refcount_put_op put_call;
2583 put_call.source = soid;
2584 ::encode(put_call, in);
2585 dec_op.call("cas", "chunk_put", in);
2586 // we don't care about dec_op's completion; the dedup scrub will fix this.
2587 tid = osd->objecter->mutate(
2588 tgt_soid.oid, oloc, dec_op, snapc,
2589 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2590 flags, NULL);
2591 in.clear();
2592 }
2593 tgt_soid.oid = fp_oid;
2594 iter->second.oid = tgt_soid;
2595 // add data op
2596 ceph_osd_op osd_op;
2597 osd_op.extent.offset = 0;
2598 osd_op.extent.length = chunk_data.length();
2599 encode(osd_op, in);
2600 encode(soid, in);
2601 in.append(chunk_data);
2602 obj_op.call("cas", "cas_write_or_get", in);
2603 } else {
2604 obj_op.add_data(CEPH_OSD_OP_WRITE, tgt_offset, tgt_length, chunk_data);
2605 }
2606
2607 C_ManifestFlush *fin = new C_ManifestFlush(this, soid, get_last_peering_reset());
2608 fin->offset = iter->first;
2609 fin->last_offset = last_offset;
2610 manifest_fop->chunks++;
2611
2612 tid = osd->objecter->mutate(
2613 tgt_soid.oid, oloc, obj_op, snapc,
2614 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2615 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())));
2616 fin->tid = tid;
2617 manifest_fop->io_tids[iter->first] = tid;
2618
2619 dout(20) << __func__ << " offset: " << tgt_offset << " len: " << tgt_length
2620 << " oid: " << tgt_soid.oid << " ori oid: " << soid.oid.name
2621 << " tid: " << tid << dendl;
2622 if (last_offset < iter->first) {
2623 break;
2624 }
2625 }
2626
2627 return 0;
2628 }
2629
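/**
 * Called when one batch of chunk writes completes: either issue the
 * next batch (starting from the first dirty chunk past last_offset)
 * or, if no dirty chunks remain, complete the flush via finish_flush().
 */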
2630 void PrimaryLogPG::finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2631 ObjectContextRef obc, uint64_t last_offset)
2632 {
2633 dout(10) << __func__ << " " << oid << " tid " << tid
2634 << " " << cpp_strerror(r) << " last_offset: " << last_offset << dendl;
2635 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2636 if (p == flush_ops.end()) {
2637 dout(10) << __func__ << " no flush_op found" << dendl;
2638 return;
2639 }
2640 map<uint64_t, chunk_info_t>::iterator iter =
2641 obc->obs.oi.manifest.chunk_map.find(last_offset);
2642 ceph_assert(iter != obc->obs.oi.manifest.chunk_map.end());
2643 for (;iter != obc->obs.oi.manifest.chunk_map.end(); ++iter) {
2644 if (iter->second.is_dirty() && last_offset < iter->first) {
2645 do_manifest_flush(p->second->op, obc, p->second, iter->first, p->second->blocking);
2646 return;
2647 }
2648 }
2649 finish_flush(oid, tid, r);
2650 }
2651
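/**
 * Record a failed write as an ERROR entry in the PG log so a resend of
 * the same reqid is answered consistently; the queued reply (if any)
 * is sent to the client once the log entry commits.
 */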
2652 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2653 MOSDOpReply *orig_reply, int r,
2654 OpContext *ctx_for_op_returns)
2655 {
2656 dout(20) << __func__ << " r=" << r << dendl;
2657 ceph_assert(op->may_write());
2658 const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
2659 mempool::osd_pglog::list<pg_log_entry_t> entries;
2660 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2661 get_next_version(), eversion_t(), 0,
2662 reqid, utime_t(), r));
2663 if (ctx_for_op_returns) {
2664 entries.back().set_op_returns(*ctx_for_op_returns->ops);
2665 dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
2666 }
2667
2668 struct OnComplete {
2669 PrimaryLogPG *pg;
2670 OpRequestRef op;
2671 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2672 int r;
2673 OnComplete(
2674 PrimaryLogPG *pg,
2675 OpRequestRef op,
2676 MOSDOpReply *orig_reply,
2677 int r)
2678 : pg(pg), op(op),
2679 orig_reply(orig_reply, false /* take over ref */), r(r)
2680 {}
2681 void operator()() {
2682 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2683 auto m = op->get_req<MOSDOp>();
2684 MOSDOpReply *reply = orig_reply.detach();
2685 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2686 pg->osd->send_message_osd_client(reply, m->get_connection());
2687 }
2688 };
2689
2690 ObcLockManager lock_manager;
2691 submit_log_entries(
2692 entries,
2693 std::move(lock_manager),
2694 std::optional<std::function<void(void)> >(
2695 OnComplete(this, op, orig_reply, r)),
2696 op,
2697 r);
2698 }
2699
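/**
 * Cache-tier entry point for an op: depending on the pool's cache
 * mode, serve it here, proxy it to the base pool, redirect the client,
 * block it behind a promotion or a full cache, or fall through to
 * normal processing (NOOP).
 */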
2700 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2701 OpRequestRef op,
2702 bool write_ordered,
2703 ObjectContextRef obc,
2704 int r, hobject_t missing_oid,
2705 bool must_promote,
2706 bool in_hit_set,
2707 ObjectContextRef *promote_obc)
2708 {
2709 // return quickly if caching is not enabled
2710 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2711 return cache_result_t::NOOP;
2712
2713 if (op &&
2714 op->get_req() &&
2715 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2716 (op->get_req<MOSDOp>()->get_flags() &
2717 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2718 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2719 return cache_result_t::NOOP;
2720 }
2721
2722 must_promote = must_promote || op->need_promote();
2723
2724 if (obc)
2725 dout(25) << __func__ << " " << obc->obs.oi << " "
2726 << (obc->obs.exists ? "exists" : "DNE")
2727 << " missing_oid " << missing_oid
2728 << " must_promote " << (int)must_promote
2729 << " in_hit_set " << (int)in_hit_set
2730 << dendl;
2731 else
2732 dout(25) << __func__ << " (no obc)"
2733 << " missing_oid " << missing_oid
2734 << " must_promote " << (int)must_promote
2735 << " in_hit_set " << (int)in_hit_set
2736 << dendl;
2737
2738 // if it is write-ordered and blocked, stop now
2739 if (obc.get() && obc->is_blocked() && write_ordered) {
2740 // we're already doing something with this object
2741 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2742 return cache_result_t::NOOP;
2743 }
2744
2745 if (r == -ENOENT && missing_oid == hobject_t()) {
2746 // we know this object is logically absent (e.g., an undefined clone)
2747 return cache_result_t::NOOP;
2748 }
2749
2750 if (obc.get() && obc->obs.exists) {
2751 osd->logger->inc(l_osd_op_cache_hit);
2752 return cache_result_t::NOOP;
2753 }
2754 if (!is_primary()) {
2755 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2756 osd->reply_op_error(op, -EAGAIN);
2757 return cache_result_t::REPLIED_WITH_EAGAIN;
2758 }
2759
2760 if (missing_oid == hobject_t() && obc.get()) {
2761 missing_oid = obc->obs.oi.soid;
2762 }
2763
2764 auto m = op->get_req<MOSDOp>();
2765 const object_locator_t oloc = m->get_object_locator();
2766
2767 if (op->need_skip_handle_cache()) {
2768 return cache_result_t::NOOP;
2769 }
2770
2771 OpRequestRef promote_op;
2772
2773 switch (pool.info.cache_mode) {
2774 case pg_pool_t::CACHEMODE_WRITEBACK:
2775 if (agent_state &&
2776 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2777 if (!op->may_write() && !op->may_cache() &&
2778 !write_ordered && !must_promote) {
2779 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2780 do_proxy_read(op);
2781 return cache_result_t::HANDLED_PROXY;
2782 }
2783 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2784 block_write_on_full_cache(missing_oid, op);
2785 return cache_result_t::BLOCKED_FULL;
2786 }
2787
2788 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2789 promote_object(obc, missing_oid, oloc, op, promote_obc);
2790 return cache_result_t::BLOCKED_PROMOTE;
2791 }
2792
2793 if (op->may_write() || op->may_cache()) {
2794 do_proxy_write(op);
2795
2796 // Promote too?
2797 if (!op->need_skip_promote() &&
2798 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2799 pool.info.min_write_recency_for_promote,
2800 OpRequestRef(),
2801 promote_obc)) {
2802 return cache_result_t::BLOCKED_PROMOTE;
2803 }
2804 return cache_result_t::HANDLED_PROXY;
2805 } else {
2806 do_proxy_read(op);
2807
2808 // Avoid duplicate promotion
2809 if (obc.get() && obc->is_blocked()) {
2810 if (promote_obc)
2811 *promote_obc = obc;
2812 return cache_result_t::BLOCKED_PROMOTE;
2813 }
2814
2815 // Promote too?
2816 if (!op->need_skip_promote()) {
2817 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2818 pool.info.min_read_recency_for_promote,
2819 promote_op, promote_obc);
2820 }
2821
2822 return cache_result_t::HANDLED_PROXY;
2823 }
2824 ceph_abort_msg("unreachable");
2825 return cache_result_t::NOOP;
2826
2827 case pg_pool_t::CACHEMODE_READONLY:
2828 // TODO: clean this case up
2829 if (!obc.get() && r == -ENOENT) {
2830 // we don't have the object and op's a read
2831 promote_object(obc, missing_oid, oloc, op, promote_obc);
2832 return cache_result_t::BLOCKED_PROMOTE;
2833 }
2834 if (!r) { // it must be a write
2835 do_cache_redirect(op);
2836 return cache_result_t::HANDLED_REDIRECT;
2837 }
2838 // crap, there was a failure of some kind
2839 return cache_result_t::NOOP;
2840
2841 case pg_pool_t::CACHEMODE_FORWARD:
2842 // this mode is deprecated; proxy instead
2843 case pg_pool_t::CACHEMODE_PROXY:
2844 if (!must_promote) {
2845 if (op->may_write() || op->may_cache() || write_ordered) {
2846 do_proxy_write(op);
2847 return cache_result_t::HANDLED_PROXY;
2848 } else {
2849 do_proxy_read(op);
2850 return cache_result_t::HANDLED_PROXY;
2851 }
2852 }
2853 // ugh, we're forced to promote.
2854 if (agent_state &&
2855 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2856 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2857 block_write_on_full_cache(missing_oid, op);
2858 return cache_result_t::BLOCKED_FULL;
2859 }
2860 promote_object(obc, missing_oid, oloc, op, promote_obc);
2861 return cache_result_t::BLOCKED_PROMOTE;
2862
2863 case pg_pool_t::CACHEMODE_READFORWARD:
2864 // this mode is deprecated; proxy instead
2865 case pg_pool_t::CACHEMODE_READPROXY:
2866 // Do writeback to the cache tier for writes
2867 if (op->may_write() || write_ordered || must_promote) {
2868 if (agent_state &&
2869 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2870 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2871 block_write_on_full_cache(missing_oid, op);
2872 return cache_result_t::BLOCKED_FULL;
2873 }
2874 promote_object(obc, missing_oid, oloc, op, promote_obc);
2875 return cache_result_t::BLOCKED_PROMOTE;
2876 }
2877
2878 // It is a read; serve it by proxying to the base tier
2879 do_proxy_read(op);
2880 return cache_result_t::HANDLED_PROXY;
2881
2882 default:
2883 ceph_abort_msg("unrecognized cache_mode");
2884 }
2885 return cache_result_t::NOOP;
2886 }
2887
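/**
 * Decide whether to start a promotion. recency == 0 promotes
 * unconditionally; recency == 1 requires a hit in the current hit set;
 * larger values require hits in at least that many of the most recent
 * hit sets. Promotion may still be suppressed by the promote throttle.
 * Returns true iff a promotion was started.
 */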
2888 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2889 const hobject_t& missing_oid,
2890 const object_locator_t& oloc,
2891 bool in_hit_set,
2892 uint32_t recency,
2893 OpRequestRef promote_op,
2894 ObjectContextRef *promote_obc)
2895 {
2896 dout(20) << __func__ << " missing_oid " << missing_oid
2897 << " in_hit_set " << in_hit_set << dendl;
2898
2899 switch (recency) {
2900 case 0:
2901 break;
2902 case 1:
2903 // Check if in the current hit set
2904 if (in_hit_set) {
2905 break;
2906 } else {
2907 // not promoting
2908 return false;
2909 }
2910 break;
2911 default:
2912 {
2913 unsigned count = (int)in_hit_set;
2914 if (count) {
2915 // Check if in other hit sets
2916 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2917 for (map<time_t,HitSetRef>::reverse_iterator itor =
2918 agent_state->hit_set_map.rbegin();
2919 itor != agent_state->hit_set_map.rend();
2920 ++itor) {
2921 if (!itor->second->contains(oid)) {
2922 break;
2923 }
2924 ++count;
2925 if (count >= recency) {
2926 break;
2927 }
2928 }
2929 }
2930 if (count >= recency) {
2931 break;
2932 }
2933 return false; // not promoting
2934 }
2935 break;
2936 }
2937
2938 if (osd->promote_throttle()) {
2939 dout(10) << __func__ << " promote throttled" << dendl;
2940 return false;
2941 }
2942 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2943 return true;
2944 }
2945
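/**
 * Reply with -ENOENT plus a redirect hint pointing the client at the
 * base pool (tier_of); used by the READONLY cache mode for writes.
 */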
2946 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2947 {
2948 auto m = op->get_req<MOSDOp>();
2949 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2950 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2951 flags, false);
2952 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2953 reply->set_redirect(redir);
2954 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2955 << op << dendl;
2956 m->get_connection()->send_message(reply);
2957 return;
2958 }
2959
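/**
 * Completion for a proxied read: under the PG lock, and only if no
 * peering reset has intervened, hand the result to finish_proxy_read()
 * and record the tier read latency.
 */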
2960 struct C_ProxyRead : public Context {
2961 PrimaryLogPGRef pg;
2962 hobject_t oid;
2963 epoch_t last_peering_reset;
2964 ceph_tid_t tid;
2965 PrimaryLogPG::ProxyReadOpRef prdop;
2966 utime_t start;
2967 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2968 const PrimaryLogPG::ProxyReadOpRef& prd)
2969 : pg(p), oid(o), last_peering_reset(lpr),
2970 tid(0), prdop(prd), start(ceph_clock_now())
2971 {}
2972 void finish(int r) override {
2973 if (prdop->canceled)
2974 return;
2975 std::scoped_lock locker{*pg};
2976 if (prdop->canceled) {
2977 return;
2978 }
2979 if (last_peering_reset == pg->get_last_peering_reset()) {
2980 pg->finish_proxy_read(oid, tid, r);
2981 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2982 }
2983 }
2984 };
2985
2986 struct C_ProxyChunkRead : public Context {
2987 PrimaryLogPGRef pg;
2988 hobject_t oid;
2989 epoch_t last_peering_reset;
2990 ceph_tid_t tid;
2991 PrimaryLogPG::ProxyReadOpRef prdop;
2992 utime_t start;
2993 ObjectOperation *obj_op;
2994 int op_index = 0;
2995 uint64_t req_offset = 0;
2996 ObjectContextRef obc;
2997 uint64_t req_total_len = 0;
2998 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2999 const PrimaryLogPG::ProxyReadOpRef& prd)
3000 : pg(p), oid(o), last_peering_reset(lpr),
3001 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
3002 {}
3003 void finish(int r) override {
3004 if (prdop->canceled)
3005 return;
3006 std::scoped_lock locker{*pg};
3007 if (prdop->canceled) {
3008 return;
3009 }
3010 if (last_peering_reset == pg->get_last_peering_reset()) {
3011 if (r >= 0) {
3012 if (!prdop->ops[op_index].outdata.length()) {
3013 ceph_assert(req_total_len);
3014 bufferlist list;
3015 bufferptr bptr(req_total_len);
3016 list.push_back(std::move(bptr));
3017 prdop->ops[op_index].outdata.append(list);
3018 }
3019 ceph_assert(obj_op);
3020 uint64_t copy_offset;
3021 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
3022 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
3023 } else {
3024 copy_offset = 0;
3025 }
3026 prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
3027 obj_op->ops[0].outdata.length(),
3028 obj_op->ops[0].outdata.c_str());
3029 }
3030
3031 pg->finish_proxy_read(oid, tid, r);
3032 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3033 if (obj_op) {
3034 delete obj_op;
3035 }
3036 }
3037 }
3038 };
3039
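/**
 * Proxy a read to the base tier (or, for a manifest redirect, to the
 * redirect target) without promoting the object. The in-flight op is
 * tracked in proxyread_ops and in_progress_proxy_ops so it can be
 * completed or canceled later.
 */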
3040 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
3041 {
3042 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3043 // stash the result in the request's OSDOp vector
3044 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3045 object_locator_t oloc;
3046 hobject_t soid;
3047 /* extensible tier */
3048 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3049 switch (obc->obs.oi.manifest.type) {
3050 case object_manifest_t::TYPE_REDIRECT:
3051 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3052 soid = obc->obs.oi.manifest.redirect_target;
3053 break;
3054 default:
3055 ceph_abort_msg("unrecognized manifest type");
3056 }
3057 } else {
3058 /* proxy */
3059 soid = m->get_hobj();
3060 oloc = object_locator_t(m->get_object_locator());
3061 oloc.pool = pool.info.tier_of;
3062 }
3063 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3064
3065 // pass through some original flags that make sense.
3066 // - leave out redirection and balancing flags since we are
3067 // already proxying through the primary
3068 // - leave off read/write/exec flags that are derived from the op
3069 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3070 CEPH_OSD_FLAG_ORDERSNAP |
3071 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3072 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3073
3074 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3075
3076 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3077
3078 ObjectOperation obj_op;
3079 obj_op.dup(prdop->ops);
3080
3081 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3082 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3083 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3084 ceph_osd_op op = obj_op.ops[i].op;
3085 switch (op.op) {
3086 case CEPH_OSD_OP_READ:
3087 case CEPH_OSD_OP_SYNC_READ:
3088 case CEPH_OSD_OP_SPARSE_READ:
3089 case CEPH_OSD_OP_CHECKSUM:
3090 case CEPH_OSD_OP_CMPEXT:
3091 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3092 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3093 }
3094 }
3095 }
3096
3097 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3098 prdop);
3099 ceph_tid_t tid = osd->objecter->read(
3100 soid.oid, oloc, obj_op,
3101 m->get_snapid(), NULL,
3102 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3103 &prdop->user_version,
3104 &prdop->data_offset,
3105 m->get_features());
3106 fin->tid = tid;
3107 prdop->objecter_tid = tid;
3108 proxyread_ops[tid] = prdop;
3109 in_progress_proxy_ops[soid].push_back(op);
3110 }
3111
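/**
 * Completion for a proxied read: drop the tracking state (stale tids
 * or oids are ignored) and reply to the client via a lightweight
 * OpContext with ignore_log_op_stats set.
 */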
3112 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
3113 {
3114 dout(10) << __func__ << " " << oid << " tid " << tid
3115 << " " << cpp_strerror(r) << dendl;
3116
3117 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
3118 if (p == proxyread_ops.end()) {
3119 dout(10) << __func__ << " no proxyread_op found" << dendl;
3120 return;
3121 }
3122 ProxyReadOpRef prdop = p->second;
3123 if (tid != prdop->objecter_tid) {
3124 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
3125 << " tid " << prdop->objecter_tid << dendl;
3126 return;
3127 }
3128 if (oid != prdop->soid) {
3129 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
3130 << " soid " << prdop->soid << dendl;
3131 return;
3132 }
3133 proxyread_ops.erase(tid);
3134
3135 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3136 if (q == in_progress_proxy_ops.end()) {
3137 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3138 return;
3139 }
3140 ceph_assert(q->second.size());
3141 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3142 q->second.end(),
3143 prdop->op);
3144 ceph_assert(it != q->second.end());
3145 OpRequestRef op = *it;
3146 q->second.erase(it);
3147 if (q->second.size() == 0) {
3148 in_progress_proxy_ops.erase(oid);
3149 } else if (std::find(q->second.begin(),
3150 q->second.end(),
3151 prdop->op) != q->second.end()) {
3152 /* multiple read case */
3153 dout(20) << __func__ << " " << oid << " is not completed " << dendl;
3154 return;
3155 }
3156
3157 osd->logger->inc(l_osd_tier_proxy_read);
3158
3159 auto m = op->get_req<MOSDOp>();
3160 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
3161 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3162 ctx->user_at_version = prdop->user_version;
3163 ctx->data_off = prdop->data_offset;
3164 ctx->ignore_log_op_stats = true;
3165 complete_read_ctx(r, ctx);
3166 }
3167
3168 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3169 {
3170 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3171 if (p == in_progress_proxy_ops.end())
3172 return;
3173
3174 list<OpRequestRef>& ls = p->second;
3175 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3176 requeue_ops(ls);
3177 in_progress_proxy_ops.erase(p);
3178 }
3179
3180 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3181 vector<ceph_tid_t> *tids)
3182 {
3183 dout(10) << __func__ << " " << prdop->soid << dendl;
3184 prdop->canceled = true;
3185
3186 // cancel objecter op, if we can
3187 if (prdop->objecter_tid) {
3188 tids->push_back(prdop->objecter_tid);
3189 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3190 prdop->ops[i].outdata.clear();
3191 }
3192 proxyread_ops.erase(prdop->objecter_tid);
3193 prdop->objecter_tid = 0;
3194 }
3195 }
3196
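/**
 * Cancel every in-flight proxy read and write, collecting their
 * objecter tids for the caller to cancel; optionally requeue the
 * client ops that were waiting on them.
 */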
3197 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
3198 {
3199 dout(10) << __func__ << dendl;
3200
3201 // cancel proxy reads
3202 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3203 while (p != proxyread_ops.end()) {
3204 cancel_proxy_read((p++)->second, tids);
3205 }
3206
3207 // cancel proxy writes
3208 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3209 while (q != proxywrite_ops.end()) {
3210 cancel_proxy_write((q++)->second, tids);
3211 }
3212
3213 if (requeue) {
3214 map<hobject_t, list<OpRequestRef>>::iterator p =
3215 in_progress_proxy_ops.begin();
3216 while (p != in_progress_proxy_ops.end()) {
3217 list<OpRequestRef>& ls = p->second;
3218 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3219 << " requests" << dendl;
3220 requeue_ops(ls);
3221 in_progress_proxy_ops.erase(p++);
3222 }
3223 } else {
3224 in_progress_proxy_ops.clear();
3225 }
3226 }
3227
3228 struct C_ProxyWrite_Commit : public Context {
3229 PrimaryLogPGRef pg;
3230 hobject_t oid;
3231 epoch_t last_peering_reset;
3232 ceph_tid_t tid;
3233 PrimaryLogPG::ProxyWriteOpRef pwop;
3234 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3235 const PrimaryLogPG::ProxyWriteOpRef& pw)
3236 : pg(p), oid(o), last_peering_reset(lpr),
3237 tid(0), pwop(pw)
3238 {}
3239 void finish(int r) override {
3240 if (pwop->canceled)
3241 return;
3242 std::scoped_lock locker{*pg};
3243 if (pwop->canceled) {
3244 return;
3245 }
3246 if (last_peering_reset == pg->get_last_peering_reset()) {
3247 pg->finish_proxy_write(oid, tid, r);
3248 }
3249 }
3250 };
3251
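/**
 * Proxy a write to the base tier (or manifest redirect target). The
 * client reply is sent from finish_proxy_write() once the proxied
 * mutation commits.
 */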
3252 void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
3253 {
3254 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3255 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3256 object_locator_t oloc;
3257 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
3258 hobject_t soid;
3259 /* extensible tier */
3260 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3261 switch (obc->obs.oi.manifest.type) {
3262 case object_manifest_t::TYPE_REDIRECT:
3263 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3264 soid = obc->obs.oi.manifest.redirect_target;
3265 break;
3266 default:
3267 ceph_abort_msg("unrecognized manifest type");
3268 }
3269 } else {
3270 /* proxy */
3271 soid = m->get_hobj();
3272 oloc = object_locator_t(m->get_object_locator());
3273 oloc.pool = pool.info.tier_of;
3274 }
3275
3276 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3277 if (!(op->may_write() || op->may_cache())) {
3278 flags |= CEPH_OSD_FLAG_RWORDERED;
3279 }
3280 if (op->allows_returnvec()) {
3281 flags |= CEPH_OSD_FLAG_RETURNVEC;
3282 }
3283
3284 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3285
3286 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3287 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3288 pwop->mtime = m->get_mtime();
3289
3290 ObjectOperation obj_op;
3291 obj_op.dup(pwop->ops);
3292
3293 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3294 this, soid, get_last_peering_reset(), pwop);
3295 ceph_tid_t tid = osd->objecter->mutate(
3296 soid.oid, oloc, obj_op, snapc,
3297 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3298 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3299 &pwop->user_version, pwop->reqid);
3300 fin->tid = tid;
3301 pwop->objecter_tid = tid;
3302 proxywrite_ops[tid] = pwop;
3303 in_progress_proxy_ops[soid].push_back(op);
3304 }
3305
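/**
 * Split a read over a chunked-manifest object into per-chunk proxied
 * reads: walk each op's extent, map [cursor, cursor + length) onto
 * entries of the chunk_map, and issue one do_proxy_chunked_read() per
 * covered chunk.
 */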
3306 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3307 ObjectContextRef obc, bool write_ordered)
3308 {
3309 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3310 OSDOp *osd_op = NULL;
3311 for (unsigned int i = 0; i < m->ops.size(); i++) {
3312 osd_op = &m->ops[i];
3313 uint64_t cursor = osd_op->op.extent.offset;
3314 uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3315 uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3316 object_manifest_t *manifest = &obc->obs.oi.manifest;
3317 map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3318
3319 while (cursor < op_length) {
3320 chunk_index = 0;
3321 chunk_length = 0;
3322 /* find the right chunk position for cursor */
3323 for (auto &p : manifest->chunk_map) {
3324 if (p.first <= cursor && p.first + p.second.length > cursor) {
3325 chunk_length = p.second.length;
3326 chunk_index = p.first;
3327 break;
3328 }
3329 }
3330 /* cursor does not fall within any chunk */
3331 if (!chunk_index && !chunk_length) {
3332 if (cursor == osd_op->op.extent.offset) {
3333 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3334 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3335 ctx->data_off = osd_op->op.extent.offset;
3336 ctx->ignore_log_op_stats = true;
3337 complete_read_ctx(0, ctx);
3338 }
3339 break;
3340 }
3341 uint64_t next_length = chunk_length;
3342 /* the read must not run past the end of the requested extent:
3343 clamp next_length to the op length */
3344 if (cursor + next_length > op_length) {
3345 next_length = op_length - cursor;
3346 }
3347 /* the read must not run past the end of the current chunk:
3348 clamp next_length to the chunk boundary */
3349 if (cursor + next_length > chunk_index + chunk_length) {
3350 next_length = chunk_index + chunk_length - cursor;
3351 }
3352
3353 chunk_read[cursor] = {{chunk_index, next_length}};
3354 cursor += next_length;
3355 }
3356
3357 req_len = cursor - osd_op->op.extent.offset;
3358 for (auto &p : chunk_read) {
3359 auto chunks = p.second.begin();
3360 dout(20) << __func__ << " chunk_index: " << chunks->first
3361 << " next_length: " << chunks->second << " cursor: "
3362 << p.first << dendl;
3363 do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3364 }
3365 }
3366 }
3367
3368 struct RefCountCallback : public Context {
3369 public:
3370 PrimaryLogPG::OpContext *ctx;
3371 OSDOp& osd_op;
3372 bool requeue = false;
3373
3374 RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
3375 : ctx(ctx), osd_op(osd_op) {}
3376 void finish(int r) override {
3377 // NB: caller must already have pg->lock held
3378 ctx->obc->stop_block();
3379 ctx->pg->kick_object_context_blocked(ctx->obc);
3380 if (r >= 0) {
3381 osd_op.rval = 0;
3382 ctx->pg->execute_ctx(ctx);
3383 } else {
3384 // on cancel simply toss op out,
3385 // or requeue as requested
3386 if (r != -ECANCELED) {
3387 if (ctx->op)
3388 ctx->pg->osd->reply_op_error(ctx->op, r);
3389 } else if (requeue) {
3390 if (ctx->op)
3391 ctx->pg->requeue_op(ctx->op);
3392 }
3393 ctx->pg->close_op_ctx(ctx);
3394 }
3395 }
3396 void set_requeue(bool rq) {
3397 requeue = rq;
3398 }
3399 };
3400
3401 struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3402 OSDOp& osd_op;
3403
3404 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3405 }
3406
3407 int execute() override {
3408 return osd_op.rval;
3409 }
3410 };
3411
3412 struct C_SetManifestRefCountDone : public Context {
3413 RefCountCallback* cb;
3414 hobject_t soid;
3415 C_SetManifestRefCountDone(
3416 RefCountCallback* cb, hobject_t soid) : cb(cb), soid(soid) {}
3417 void finish(int r) override {
3418 if (r == -ECANCELED)
3419 return;
3420 auto pg = cb->ctx->pg;
3421 std::scoped_lock locker{*pg};
3422 auto it = pg->manifest_ops.find(soid);
3423 if (it == pg->manifest_ops.end()) {
3424 // raced with cancel_manifest_ops
3425 return;
3426 }
3427 pg->manifest_ops.erase(it);
3428 cb->complete(r);
3429 }
3430 };
3431
3432 void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
3433 {
3434 dout(10) << __func__ << dendl;
3435 auto p = manifest_ops.begin();
3436 while (p != manifest_ops.end()) {
3437 auto mop = p->second;
3438 // cancel objecter op, if we can
3439 if (mop->objecter_tid) {
3440 tids->push_back(mop->objecter_tid);
3441 mop->objecter_tid = 0;
3442 }
3443 mop->cb->set_requeue(requeue);
3444 mop->cb->complete(-ECANCELED);
3445 manifest_ops.erase(p++);
3446 }
3447 }
3448
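/**
 * Adjust the refcount of a manifest chunk in the base pool via the
 * "cas" object class: chunk_get when get is true, chunk_put otherwise.
 * When a callback is supplied, the object is blocked until the
 * mutation completes and the ManifestOp is tracked in manifest_ops.
 */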
3449 void PrimaryLogPG::refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid,
3450 SnapContext snapc, bool get, RefCountCallback *cb, uint64_t offset)
3451 {
3452 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3453 CEPH_OSD_FLAG_RWORDERED;
3454
3455 dout(10) << __func__ << " Start refcount for " << soid << dendl;
3456
3457 ObjectOperation obj_op;
3458 bufferlist in;
3459 if (get) {
3460 cls_chunk_refcount_get_op call;
3461 call.source = obc->obs.oi.soid;
3462 ::encode(call, in);
3463 obj_op.call("cas", "chunk_get", in);
3464 } else {
3465 cls_chunk_refcount_put_op call;
3466 call.source = obc->obs.oi.soid;
3467 ::encode(call, in);
3468 obj_op.call("cas", "chunk_put", in);
3469 }
3470
3471 Context *c = nullptr;
3472 if (cb) {
3473 C_SetManifestRefCountDone *fin =
3474 new C_SetManifestRefCountDone(cb, obc->obs.oi.soid);
3475 c = new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard()));
3476 }
3477
3478 auto tid = osd->objecter->mutate(
3479 soid.oid, oloc, obj_op, snapc,
3480 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
3481 flags, c);
3482 if (cb) {
3483 manifest_ops[obc->obs.oi.soid] = std::make_shared<ManifestOp>(cb, tid);
3484 obc->start_block();
3485 }
3486 }
3487
3488 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3489 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3490 uint64_t req_total_len, bool write_ordered)
3491 {
3492 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3493 object_manifest_t *manifest = &obc->obs.oi.manifest;
3494 if (!manifest->chunk_map.count(chunk_index)) {
3495 return;
3496 }
3497 uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3498 hobject_t soid = manifest->chunk_map[chunk_index].oid;
3499 hobject_t ori_soid = m->get_hobj();
3500 object_locator_t oloc(soid);
3501 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3502 if (write_ordered) {
3503 flags |= CEPH_OSD_FLAG_RWORDERED;
3504 }
3505
3506 if (!chunk_length || soid == hobject_t()) {
3507 return;
3508 }
3509
3510 /* same as do_proxy_read() */
3511 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3512 CEPH_OSD_FLAG_ORDERSNAP |
3513 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3514 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3515
3516 dout(10) << __func__ << " Start chunked proxy read for " << *m
3517 << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3518 << " req_length: " << req_length << dendl;
3519
3520 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3521
3522 ObjectOperation *pobj_op = new ObjectOperation;
3523 OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3524
3525 if (chunk_index <= req_offset) {
3526 osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3527 } else {
3528 ceph_abort_msg("chunk_index > req_offset");
3529 }
3530 osd_op.op.extent.length = req_length;
3531
3532 ObjectOperation obj_op;
3533 obj_op.dup(pobj_op->ops);
3534
3535 C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3536 prdop);
3537 fin->obj_op = pobj_op;
3538 fin->op_index = op_index;
3539 fin->req_offset = req_offset;
3540 fin->obc = obc;
3541 fin->req_total_len = req_total_len;
3542
3543 ceph_tid_t tid = osd->objecter->read(
3544 soid.oid, oloc, obj_op,
3545 m->get_snapid(), NULL,
3546 flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3547 &prdop->user_version,
3548 &prdop->data_offset,
3549 m->get_features());
3550 fin->tid = tid;
3551 prdop->objecter_tid = tid;
3552 proxyread_ops[tid] = prdop;
3553 in_progress_proxy_ops[ori_soid].push_back(op);
3554 }
3555
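/**
 * Return true when every READ/SYNC_READ extent in the op is fully
 * covered by chunk_map entries that are missing locally (i.e. the
 * whole request can be satisfied by proxying to the chunk objects);
 * any other op type disqualifies the request.
 */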
3556 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3557 {
3558 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3559 OSDOp *osd_op = NULL;
3560 bool ret = true;
3561 for (unsigned int i = 0; i < m->ops.size(); i++) {
3562 osd_op = &m->ops[i];
3563 ceph_osd_op op = osd_op->op;
3564 switch (op.op) {
3565 case CEPH_OSD_OP_READ:
3566 case CEPH_OSD_OP_SYNC_READ: {
3567 uint64_t cursor = osd_op->op.extent.offset;
3568 uint64_t remain = osd_op->op.extent.length;
3569
3570 /* do the requested chunks exist in the chunk_map? */
3571 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3572 if (p.first <= cursor && p.first + p.second.length > cursor) {
3573 if (!p.second.is_missing()) {
3574 return false;
3575 }
3576 if (p.second.length >= remain) {
3577 remain = 0;
3578 break;
3579 } else {
3580 remain = remain - p.second.length;
3581 }
3582 cursor += p.second.length;
3583 }
3584 }
3585
3586 if (remain) {
3587 dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3588 return false;
3589 }
3590 continue;
3591 }
3592 default:
3593 return false;
3594 }
3595 }
3596 return ret;
3597 }
3598
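/**
 * Completion path for a proxied write: drop the tracking state, send
 * the commit reply to the client (claiming the proxied out-data), and
 * bump the tier proxy-write counter.
 */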
3599 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3600 {
3601 dout(10) << __func__ << " " << oid << " tid " << tid
3602 << " " << cpp_strerror(r) << dendl;
3603
3604 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3605 if (p == proxywrite_ops.end()) {
3606 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3607 return;
3608 }
3609 ProxyWriteOpRef pwop = p->second;
3610 ceph_assert(tid == pwop->objecter_tid);
3611 ceph_assert(oid == pwop->soid);
3612
3613 proxywrite_ops.erase(tid);
3614
3615 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3616 if (q == in_progress_proxy_ops.end()) {
3617 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3618 delete pwop->ctx;
3619 pwop->ctx = NULL;
3620 return;
3621 }
3622 list<OpRequestRef>& in_progress_op = q->second;
3623 ceph_assert(in_progress_op.size());
3624 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3625 in_progress_op.end(),
3626 pwop->op);
3627 ceph_assert(it != in_progress_op.end());
3628 in_progress_op.erase(it);
3629 if (in_progress_op.size() == 0) {
3630 in_progress_proxy_ops.erase(oid);
3631 } else if (std::find(in_progress_op.begin(),
3632 in_progress_op.end(),
3633 pwop->op) != in_progress_op.end()) {
3634 if (pwop->ctx)
3635 delete pwop->ctx;
3636 pwop->ctx = NULL;
3637 dout(20) << __func__ << " " << oid << " tid " << tid
3638 << " in_progress_op size: "
3639 << in_progress_op.size() << dendl;
3640 return;
3641 }
3642
3643 osd->logger->inc(l_osd_tier_proxy_write);
3644
3645 auto m = pwop->op->get_req<MOSDOp>();
3646 ceph_assert(m != NULL);
3647
3648 if (!pwop->sent_reply) {
3649 // send commit.
3650 ceph_assert(pwop->ctx->reply == nullptr);
3651 MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
3652 true /* we claim it below */);
3653 reply->set_reply_versions(eversion_t(), pwop->user_version);
3654 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3655 reply->claim_op_out_data(pwop->ops);
3656 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3657 osd->send_message_osd_client(reply, m->get_connection());
3658 pwop->sent_reply = true;
3659 pwop->ctx->op->mark_commit_sent();
3660 }
3661
3662 delete pwop->ctx;
3663 pwop->ctx = NULL;
3664 }
3665
3666 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3667 vector<ceph_tid_t> *tids)
3668 {
3669 dout(10) << __func__ << " " << pwop->soid << dendl;
3670 pwop->canceled = true;
3671
3672 // cancel objecter op, if we can
3673 if (pwop->objecter_tid) {
3674 tids->push_back(pwop->objecter_tid);
3675 delete pwop->ctx;
3676 pwop->ctx = NULL;
3677 proxywrite_ops.erase(pwop->objecter_tid);
3678 pwop->objecter_tid = 0;
3679 }
3680 }
3681
3682 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3683 ObjectContextRef obc;
3684 PrimaryLogPG *pg;
3685 utime_t start;
3686 public:
3687 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3688 : obc(obc_),
3689 pg(pg_),
3690 start(ceph_clock_now()) {}
3691
3692 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3693 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3694 int r = results.get<0>();
3695 pg->finish_promote(r, results_data, obc);
3696 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3697 }
3698 };
3699
3700 class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3701 ObjectContextRef obc;
3702 PrimaryLogPG *pg;
3703 utime_t start;
3704 PrimaryLogPG::OpContext *ctx;
3705 PrimaryLogPG::CopyCallbackResults promote_results;
3706 public:
3707 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
3708 : obc(obc_),
3709 pg(pg_),
3710 start(ceph_clock_now()), ctx(ctx) {}
3711
3712 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3713 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3714 int r = results.get<0>();
3715 if (ctx) {
3716 promote_results = results;
3717 pg->execute_ctx(ctx);
3718 } else {
3719 pg->finish_promote_manifest(r, results_data, obc);
3720 }
3721 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3722 }
3723 friend struct PromoteFinisher;
3724 };
3725
3726 struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
3727 PromoteManifestCallback *promote_callback;
3728
3729 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
3730 : promote_callback(promote_callback) {
3731 }
3732
3733 int execute() override {
3734 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
3735 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
3736 promote_callback->promote_results.get<1>(),
3737 promote_callback->obc);
3738 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
3739 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
3740 promote_callback->promote_results.get<1>(),
3741 promote_callback->obc);
3742 } else {
3743 ceph_abort_msg("unrecognized manifest type");
3744 }
3745 return 0;
3746 }
3747 };
3748
3749 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3750 const hobject_t& missing_oid,
3751 const object_locator_t& oloc,
3752 OpRequestRef op,
3753 ObjectContextRef *promote_obc)
3754 {
3755 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3756 ceph_assert(hoid != hobject_t());
3757 if (write_blocked_by_scrub(hoid)) {
3758 dout(10) << __func__ << " " << hoid
3759 << " blocked by scrub" << dendl;
3760 if (op) {
3761 waiting_for_scrub.push_back(op);
3762 op->mark_delayed("waiting for scrub");
3763 dout(10) << __func__ << " " << hoid
3764 << " placing op in waiting_for_scrub" << dendl;
3765 } else {
3766 dout(10) << __func__ << " " << hoid
3767 << " no op, dropping on the floor" << dendl;
3768 }
3769 return;
3770 }
3771 if (op && !check_laggy_requeue(op)) {
3772 return;
3773 }
3774 if (!obc) { // we need to create an ObjectContext
3775 ceph_assert(missing_oid != hobject_t());
3776 obc = get_object_context(missing_oid, true);
3777 }
3778 if (promote_obc)
3779 *promote_obc = obc;
3780
3781 /*
3782 * If there are proxy-reads in flight for this object while the promote
3783 * is still in progress, don't use DONTNEED: that data is about to be read.
3784 */
3785 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3786 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3787 if (q == in_progress_proxy_ops.end()) {
3788 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3789 }
3790
3791 CopyCallback *cb;
3792 object_locator_t my_oloc;
3793 hobject_t src_hoid;
3794 if (!obc->obs.oi.has_manifest()) {
3795 my_oloc = oloc;
3796 my_oloc.pool = pool.info.tier_of;
3797 src_hoid = obc->obs.oi.soid;
3798 cb = new PromoteCallback(obc, this);
3799 } else {
3800 if (obc->obs.oi.manifest.is_chunked()) {
3801 src_hoid = obc->obs.oi.soid;
3802 cb = new PromoteManifestCallback(obc, this);
3803 } else if (obc->obs.oi.manifest.is_redirect()) {
3804 object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
3805 my_oloc = src_oloc;
3806 src_hoid = obc->obs.oi.manifest.redirect_target;
3807 cb = new PromoteCallback(obc, this);
3808 } else {
3809 ceph_abort_msg("unrecognized manifest type");
3810 }
3811 }
3812
3813 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3814 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3815 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3816 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3817 start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
3818 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3819 src_fadvise_flags, 0);
3820
3821 ceph_assert(obc->is_blocked());
3822
3823 if (op)
3824 wait_for_blocked_object(obc->obs.oi.soid, op);
3825
3826 recovery_state.update_stats(
3827 [](auto &history, auto &stats) {
3828 stats.stats.sum.num_promote++;
3829 return false;
3830 });
3831 }
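/*
 * Rough sketch of the promote flow above: the copy source is the base-tier
 * object itself for a plain cache promotion (my_oloc.pool =
 * pool.info.tier_of), the redirect target for a redirect manifest, or the
 * object itself for a chunked manifest.  start_copy() leaves the obc
 * blocked; any client op that raced in is parked via
 * wait_for_blocked_object() and requeued once the PromoteCallback (or
 * PromoteManifestCallback) fires and finish_promote*() unblocks the object.
 */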
3832
3833 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3834 {
3835 FUNCTRACE(cct);
3836 dout(10) << __func__ << " " << ctx << dendl;
3837 ctx->reset_obs(ctx->obc);
3838 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3839 OpRequestRef op = ctx->op;
3840 auto m = op->get_req<MOSDOp>();
3841 ObjectContextRef obc = ctx->obc;
3842 const hobject_t& soid = obc->obs.oi.soid;
3843
3844 // this method must be idempotent since we may call it several times
3845 // before we finally apply the resulting transaction.
3846 ctx->op_t.reset(new PGTransaction);
3847
3848 if (op->may_write() || op->may_cache()) {
3849 // snap
3850 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3851 pool.info.is_pool_snaps_mode()) {
3852 // use pool's snapc
3853 ctx->snapc = pool.snapc;
3854 } else {
3855 // client specified snapc
3856 ctx->snapc.seq = m->get_snap_seq();
3857 ctx->snapc.snaps = m->get_snaps();
3858 filter_snapc(ctx->snapc.snaps);
3859 }
3860 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3861 ctx->snapc.seq < obc->ssc->snapset.seq) {
3862 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3863 << " < snapset seq " << obc->ssc->snapset.seq
3864 << " on " << obc->obs.oi.soid << dendl;
3865 reply_ctx(ctx, -EOLDSNAPC);
3866 return;
3867 }
3868
3869 // version
3870 ctx->at_version = get_next_version();
3871 ctx->mtime = m->get_mtime();
3872
3873 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3874 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3875 << " snapc " << ctx->snapc
3876 << " snapset " << obc->ssc->snapset
3877 << dendl;
3878 } else {
3879 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3880 << " ov " << obc->obs.oi.version
3881 << dendl;
3882 }
3883
3884 if (!ctx->user_at_version)
3885 ctx->user_at_version = obc->obs.oi.user_version;
3886 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3887
3888 {
3889 #ifdef WITH_LTTNG
3890 osd_reqid_t reqid = ctx->op->get_reqid();
3891 #endif
3892 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3893 reqid.name._num, reqid.tid, reqid.inc);
3894 }
3895
3896 int result = prepare_transaction(ctx);
3897
3898 {
3899 #ifdef WITH_LTTNG
3900 osd_reqid_t reqid = ctx->op->get_reqid();
3901 #endif
3902 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3903 reqid.name._num, reqid.tid, reqid.inc);
3904 }
3905
3906 bool pending_async_reads = !ctx->pending_async_reads.empty();
3907 if (result == -EINPROGRESS || pending_async_reads) {
3908 // come back later.
3909 if (pending_async_reads) {
3910 ceph_assert(pool.info.is_erasure());
3911 in_progress_async_reads.push_back(make_pair(op, ctx));
3912 ctx->start_async_reads(this);
3913 }
3914 return;
3915 }
3916
3917 if (result == -EAGAIN) {
3918 // clean up after the ctx
3919 close_op_ctx(ctx);
3920 return;
3921 }
3922
3923 bool ignore_out_data = false;
3924 if (!ctx->op_t->empty() &&
3925 op->may_write() &&
3926 result >= 0) {
3927 // successful update
3928 if (ctx->op->allows_returnvec()) {
3929 // enforce reasonable bound on the return buffer sizes
3930 for (auto& i : *ctx->ops) {
3931 if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
3932 dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
3933 result = -EOVERFLOW; // overall result is overflow
3934 i.rval = -EOVERFLOW;
3935 i.outdata.clear();
3936 }
3937 }
3938 } else {
3939 // legacy behavior -- zero result and return data etc.
3940 ignore_out_data = true;
3941 result = 0;
3942 }
3943 }
3944
3945 // prepare the reply
3946 ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
3947 ignore_out_data);
3948 dout(20) << __func__ << " alloc reply " << ctx->reply
3949 << " result " << result << dendl;
3950
3951 // read or error?
3952 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3953 // finish side-effects
3954 if (result >= 0)
3955 do_osd_op_effects(ctx, m->get_connection());
3956
3957 complete_read_ctx(result, ctx);
3958 return;
3959 }
3960
3961 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3962
3963 ceph_assert(op->may_write() || op->may_cache());
3964
3965 // trim log?
3966 recovery_state.update_trim_to();
3967
3968 // verify that we are doing this in order?
3969 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3970 !pool.info.is_tier() && !pool.info.has_tiers()) {
3971 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3972 ceph_tid_t t = m->get_tid();
3973 client_t n = m->get_source().num();
3974 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3975 if (p == cm.end()) {
3976 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3977 cm[n] = t;
3978 } else {
3979 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3980 if (p->second > t) {
3981 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3982 ceph_abort_msg("out of order op");
3983 }
3984 p->second = t;
3985 }
3986 }
3987
3988 if (ctx->update_log_only) {
3989 if (result >= 0)
3990 do_osd_op_effects(ctx, m->get_connection());
3991
3992 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3993 // save just what we need from ctx
3994 MOSDOpReply *reply = ctx->reply;
3995 ctx->reply = nullptr;
3996 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3997
3998 if (result == -ENOENT) {
3999 reply->set_enoent_reply_versions(info.last_update,
4000 info.last_user_version);
4001 }
4002 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4003 // append to pg log for dup detection - don't save buffers for now
4004 record_write_error(op, soid, reply, result,
4005 ctx->op->allows_returnvec() ? ctx : nullptr);
4006 close_op_ctx(ctx);
4007 return;
4008 }
4009
4010 // no need to capture PG ref, repop cancel will handle that
4011 // Can capture the ctx by pointer, it's owned by the repop
4012 ctx->register_on_commit(
4013 [m, ctx, this](){
4014 if (ctx->op)
4015 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4016
4017 if (m && !ctx->sent_reply) {
4018 MOSDOpReply *reply = ctx->reply;
4019 ctx->reply = nullptr;
4020 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4021 dout(10) << " sending reply on " << *m << " " << reply << dendl;
4022 osd->send_message_osd_client(reply, m->get_connection());
4023 ctx->sent_reply = true;
4024 ctx->op->mark_commit_sent();
4025 }
4026 });
4027 ctx->register_on_success(
4028 [ctx, this]() {
4029 do_osd_op_effects(
4030 ctx,
4031 ctx->op ? ctx->op->get_req()->get_connection() :
4032 ConnectionRef());
4033 });
4034 ctx->register_on_finish(
4035 [ctx]() {
4036 delete ctx;
4037 });
4038
4039 // issue replica writes
4040 ceph_tid_t rep_tid = osd->get_tid();
4041
4042 RepGather *repop = new_repop(ctx, obc, rep_tid);
4043
4044 issue_repop(repop, ctx);
4045 eval_repop(repop);
4046 repop->put();
4047 }
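/*
 * Callback lifecycle sketch for the write path above: on_commit sends the
 * MOSDOpReply (ACK|ONDISK) once the repop commits, on_success applies the
 * side effects (watch/notify etc.) via do_osd_op_effects(), and on_finish
 * deletes the ctx.  In between, the RepGather owns the ctx, which is why
 * capturing it by raw pointer in the lambdas is safe.
 */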
4048
4049 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4050 release_object_locks(ctx->lock_manager);
4051
4052 ctx->op_t.reset();
4053
4054 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4055 ctx->on_finish.erase(p++)) {
4056 (*p)();
4057 }
4058 delete ctx;
4059 }
4060
4061 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4062 {
4063 if (ctx->op)
4064 osd->reply_op_error(ctx->op, r);
4065 close_op_ctx(ctx);
4066 }
4067
4068 void PrimaryLogPG::log_op_stats(const OpRequest& op,
4069 const uint64_t inb,
4070 const uint64_t outb)
4071 {
4072 auto m = op.get_req<MOSDOp>();
4073 const utime_t now = ceph_clock_now();
4074
4075 const utime_t latency = now - m->get_recv_stamp();
4076 const utime_t process_latency = now - op.get_dequeued_time();
4077
4078 osd->logger->inc(l_osd_op);
4079
4080 osd->logger->inc(l_osd_op_outb, outb);
4081 osd->logger->inc(l_osd_op_inb, inb);
4082 osd->logger->tinc(l_osd_op_lat, latency);
4083 osd->logger->tinc(l_osd_op_process_lat, process_latency);
4084
4085 if (op.may_read() && op.may_write()) {
4086 osd->logger->inc(l_osd_op_rw);
4087 osd->logger->inc(l_osd_op_rw_inb, inb);
4088 osd->logger->inc(l_osd_op_rw_outb, outb);
4089 osd->logger->tinc(l_osd_op_rw_lat, latency);
4090 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4091 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4092 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4093 } else if (op.may_read()) {
4094 osd->logger->inc(l_osd_op_r);
4095 osd->logger->inc(l_osd_op_r_outb, outb);
4096 osd->logger->tinc(l_osd_op_r_lat, latency);
4097 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4098 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4099 } else if (op.may_write() || op.may_cache()) {
4100 osd->logger->inc(l_osd_op_w);
4101 osd->logger->inc(l_osd_op_w_inb, inb);
4102 osd->logger->tinc(l_osd_op_w_lat, latency);
4103 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4104 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4105 } else {
4106 ceph_abort();
4107 }
4108
4109 dout(15) << "log_op_stats " << *m
4110 << " inb " << inb
4111 << " outb " << outb
4112 << " lat " << latency << dendl;
4113
4114 if (m_dynamic_perf_stats.is_enabled()) {
4115 m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4116 }
4117 }
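/*
 * The two latencies above measure different spans: 'latency' runs from the
 * message receive stamp, 'process_latency' from when the op was dequeued
 * for processing.  E.g. an op received at t=0, dequeued at t=2ms and
 * completed at t=5ms logs lat=5ms and process_lat=3ms.
 */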
4118
4119 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4120 const std::list<OSDPerfMetricQuery> &queries)
4121 {
4122 m_dynamic_perf_stats.set_queries(queries);
4123 }
4124
4125 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4126 {
4127 std::swap(m_dynamic_perf_stats, *stats);
4128 }
4129
4130 void PrimaryLogPG::do_scan(
4131 OpRequestRef op,
4132 ThreadPool::TPHandle &handle)
4133 {
4134 auto m = op->get_req<MOSDPGScan>();
4135 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4136 dout(10) << "do_scan " << *m << dendl;
4137
4138 op->mark_started();
4139
4140 switch (m->op) {
4141 case MOSDPGScan::OP_SCAN_GET_DIGEST:
4142 {
4143 auto dpp = get_dpp();
4144 if (osd->check_backfill_full(dpp)) {
4145 dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4146 queue_peering_event(
4147 PGPeeringEventRef(
4148 std::make_shared<PGPeeringEvent>(
4149 get_osdmap_epoch(),
4150 get_osdmap_epoch(),
4151 PeeringState::BackfillTooFull())));
4152 return;
4153 }
4154
4155 BackfillInterval bi;
4156 bi.begin = m->begin;
4157 // No need to flush, there won't be any in-progress writes occurring
4158 // past m->begin
4159 scan_range(
4160 cct->_conf->osd_backfill_scan_min,
4161 cct->_conf->osd_backfill_scan_max,
4162 &bi,
4163 handle);
4164 MOSDPGScan *reply = new MOSDPGScan(
4165 MOSDPGScan::OP_SCAN_DIGEST,
4166 pg_whoami,
4167 get_osdmap_epoch(), m->query_epoch,
4168 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4169 encode(bi.objects, reply->get_data());
4170 osd->send_message_osd_cluster(reply, m->get_connection());
4171 }
4172 break;
4173
4174 case MOSDPGScan::OP_SCAN_DIGEST:
4175 {
4176 pg_shard_t from = m->from;
4177
4178 // Check that from is in backfill_targets vector
4179 ceph_assert(is_backfill_target(from));
4180
4181 BackfillInterval& bi = peer_backfill_info[from];
4182 bi.begin = m->begin;
4183 bi.end = m->end;
4184 auto p = m->get_data().cbegin();
4185
4186 // take care to preserve ordering!
4187 bi.clear_objects();
4188 ::decode_noclear(bi.objects, p);
4189
4190 if (waiting_on_backfill.erase(from)) {
4191 if (waiting_on_backfill.empty()) {
4192 ceph_assert(
4193 peer_backfill_info.size() ==
4194 get_backfill_targets().size());
4195 finish_recovery_op(hobject_t::get_max());
4196 }
4197 } else {
4198 // we canceled backfill for a while because the target was too full, and
4199 // this is an extra response from a non-too-full peer
4200 dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4201 }
4202 }
4203 break;
4204 }
4205 }
4206
4207 void PrimaryLogPG::do_backfill(OpRequestRef op)
4208 {
4209 auto m = op->get_req<MOSDPGBackfill>();
4210 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4211 dout(10) << "do_backfill " << *m << dendl;
4212
4213 op->mark_started();
4214
4215 switch (m->op) {
4216 case MOSDPGBackfill::OP_BACKFILL_FINISH:
4217 {
4218 ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4219
4220 MOSDPGBackfill *reply = new MOSDPGBackfill(
4221 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4222 get_osdmap_epoch(),
4223 m->query_epoch,
4224 spg_t(info.pgid.pgid, get_primary().shard));
4225 reply->set_priority(get_recovery_op_priority());
4226 osd->send_message_osd_cluster(reply, m->get_connection());
4227 queue_peering_event(
4228 PGPeeringEventRef(
4229 std::make_shared<PGPeeringEvent>(
4230 get_osdmap_epoch(),
4231 get_osdmap_epoch(),
4232 RecoveryDone())));
4233 }
4234 // fall-thru
4235
4236 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4237 {
4238 ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4239
4240 ObjectStore::Transaction t;
4241 recovery_state.update_backfill_progress(
4242 m->last_backfill,
4243 m->stats,
4244 m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
4245 t);
4246
4247 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4248 ceph_assert(tr == 0);
4249 }
4250 break;
4251
4252 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4253 {
4254 ceph_assert(is_primary());
4255 ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4256 finish_recovery_op(hobject_t::get_max());
4257 }
4258 break;
4259 }
4260 }
4261
4262 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4263 {
4264 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4265 op->get_req());
4266 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4267 dout(7) << __func__ << " " << m->ls << dendl;
4268
4269 op->mark_started();
4270
4271 ObjectStore::Transaction t;
4272 for (auto& p : m->ls) {
4273 if (is_remote_backfilling()) {
4274 struct stat st;
4275 int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4276 pg_whoami.shard) , &st);
4277 if (r == 0) {
4278 sub_local_num_bytes(st.st_size);
4279 int64_t usersize;
4280 if (pool.info.is_erasure()) {
4281 bufferlist bv;
4282 int r = osd->store->getattr(
4283 ch,
4284 ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4285 OI_ATTR,
4286 bv);
4287 if (r >= 0) {
4288 object_info_t oi(bv);
4289 usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4290 } else {
4291 dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4292 << " can't get object info" << dendl;
4293 usersize = 0;
4294 }
4295 } else {
4296 usersize = st.st_size;
4297 }
4298 sub_num_bytes(usersize);
4299 dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4300 << " sub actual data by " << st.st_size
4301 << " sub num_bytes by " << usersize
4302 << dendl;
4303 }
4304 }
4305 remove_snap_mapped_object(t, p.first);
4306 }
4307 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4308 ceph_assert(r == 0);
4309 }
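/*
 * Numeric example for the stat adjustment above (hypothetical pool): on a
 * k=4, m=2 erasure-coded pool, a shard whose object_info records
 * oi.size = 256 KiB accounts for usersize = 256 KiB * 4 = 1 MiB of
 * user-visible bytes, whereas sub_local_num_bytes() uses the raw on-disk
 * shard size reported by stat().
 */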
4310
4311 int PrimaryLogPG::trim_object(
4312 bool first, const hobject_t &coid, snapid_t snap_to_trim,
4313 PrimaryLogPG::OpContextUPtr *ctxp)
4314 {
4315 *ctxp = NULL;
4316
4317 // load clone info
4318 bufferlist bl;
4319 ObjectContextRef obc = get_object_context(coid, false, NULL);
4320 if (!obc || !obc->ssc || !obc->ssc->exists) {
4321 osd->clog->error() << __func__ << ": Cannot trim " << coid
4322 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4323 return -ENOENT;
4324 }
4325
4326 hobject_t head_oid = coid.get_head();
4327 ObjectContextRef head_obc = get_object_context(head_oid, false);
4328 if (!head_obc) {
4329 osd->clog->error() << __func__ << ": Cannot trim " << coid
4330 << " repair needed, no snapset obc for " << head_oid;
4331 return -ENOENT;
4332 }
4333
4334 SnapSet& snapset = obc->ssc->snapset;
4335
4336 object_info_t &coi = obc->obs.oi;
4337 auto citer = snapset.clone_snaps.find(coid.snap);
4338 if (citer == snapset.clone_snaps.end()) {
4339 osd->clog->error() << "No clone_snaps in snapset " << snapset
4340 << " for object " << coid << "\n";
4341 return -ENOENT;
4342 }
4343 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4344 if (old_snaps.empty()) {
4345 osd->clog->error() << "No object info snaps for object " << coid;
4346 return -ENOENT;
4347 }
4348
4349 dout(10) << coid << " old_snaps " << old_snaps
4350 << " old snapset " << snapset << dendl;
4351 if (snapset.seq == 0) {
4352 osd->clog->error() << "No snapset.seq for object " << coid;
4353 return -ENOENT;
4354 }
4355
4356 set<snapid_t> new_snaps;
4357 const OSDMapRef& osdmap = get_osdmap();
4358 for (set<snapid_t>::iterator i = old_snaps.begin();
4359 i != old_snaps.end();
4360 ++i) {
4361 if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
4362 *i != snap_to_trim) {
4363 new_snaps.insert(*i);
4364 }
4365 }
4366
4367 vector<snapid_t>::iterator p = snapset.clones.end();
4368
4369 if (new_snaps.empty()) {
4370 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4371 if (p == snapset.clones.end()) {
4372 osd->clog->error() << "Snap " << coid.snap << " not in clones";
4373 return -ENOENT;
4374 }
4375 }
4376
4377 OpContextUPtr ctx = simple_opc_create(obc);
4378 ctx->head_obc = head_obc;
4379
4380 if (!ctx->lock_manager.get_snaptrimmer_write(
4381 coid,
4382 obc,
4383 first)) {
4384 close_op_ctx(ctx.release());
4385 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
4386 return -ENOLCK;
4387 }
4388
4389 if (!ctx->lock_manager.get_snaptrimmer_write(
4390 head_oid,
4391 head_obc,
4392 first)) {
4393 close_op_ctx(ctx.release());
4394 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4395 return -ENOLCK;
4396 }
4397
4398 ctx->at_version = get_next_version();
4399
4400 PGTransaction *t = ctx->op_t.get();
4401
4402 if (new_snaps.empty()) {
4403 // remove clone
4404 dout(10) << coid << " snaps " << old_snaps << " -> "
4405 << new_snaps << " ... deleting" << dendl;
4406
4407 // ...from snapset
4408 ceph_assert(p != snapset.clones.end());
4409
4410 snapid_t last = coid.snap;
4411 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4412
4413 if (p != snapset.clones.begin()) {
4414 // not the oldest... merge overlap into next older clone
4415 vector<snapid_t>::iterator n = p - 1;
4416 hobject_t prev_coid = coid;
4417 prev_coid.snap = *n;
4418 bool adjust_prev_bytes = is_present_clone(prev_coid);
4419
4420 if (adjust_prev_bytes)
4421 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4422
4423 snapset.clone_overlap[*n].intersection_of(
4424 snapset.clone_overlap[*p]);
4425
4426 if (adjust_prev_bytes)
4427 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4428 }
4429 ctx->delta_stats.num_objects--;
4430 if (coi.is_dirty())
4431 ctx->delta_stats.num_objects_dirty--;
4432 if (coi.is_omap())
4433 ctx->delta_stats.num_objects_omap--;
4434 if (coi.is_whiteout()) {
4435 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4436 ctx->delta_stats.num_whiteouts--;
4437 }
4438 ctx->delta_stats.num_object_clones--;
4439 if (coi.is_cache_pinned())
4440 ctx->delta_stats.num_objects_pinned--;
4441 if (coi.has_manifest())
4442 ctx->delta_stats.num_objects_manifest--;
4443 obc->obs.exists = false;
4444
4445 snapset.clones.erase(p);
4446 snapset.clone_overlap.erase(last);
4447 snapset.clone_size.erase(last);
4448 snapset.clone_snaps.erase(last);
4449
4450 ctx->log.push_back(
4451 pg_log_entry_t(
4452 pg_log_entry_t::DELETE,
4453 coid,
4454 ctx->at_version,
4455 ctx->obs->oi.version,
4456 0,
4457 osd_reqid_t(),
4458 ctx->mtime,
4459 0)
4460 );
4461 t->remove(coid);
4462 t->update_snaps(
4463 coid,
4464 old_snaps,
4465 new_snaps);
4466
4467 coi = object_info_t(coid);
4468
4469 ctx->at_version.version++;
4470 } else {
4471 // save adjusted snaps for this object
4472 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4473 snapset.clone_snaps[coid.snap] =
4474 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4475 // we still do a 'modify' event on this object just to trigger a
4476 // snapmapper.update ... :(
4477
4478 coi.prior_version = coi.version;
4479 coi.version = ctx->at_version;
4480 bl.clear();
4481 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4482 t->setattr(coid, OI_ATTR, bl);
4483
4484 ctx->log.push_back(
4485 pg_log_entry_t(
4486 pg_log_entry_t::MODIFY,
4487 coid,
4488 coi.version,
4489 coi.prior_version,
4490 0,
4491 osd_reqid_t(),
4492 ctx->mtime,
4493 0)
4494 );
4495 ctx->at_version.version++;
4496
4497 t->update_snaps(
4498 coid,
4499 old_snaps,
4500 new_snaps);
4501 }
4502
4503 // save head snapset
4504 dout(10) << coid << " new snapset " << snapset << " on "
4505 << head_obc->obs.oi << dendl;
4506 if (snapset.clones.empty() &&
4507 (head_obc->obs.oi.is_whiteout() &&
4508 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4509 !head_obc->obs.oi.is_cache_pinned())) {
4510 // NOTE: this arguably constitutes minor interference with the
4511 // tiering agent if this is a cache tier since a snap trim event
4512 // is effectively evicting a whiteout we might otherwise want to
4513 // keep around.
4514 dout(10) << coid << " removing " << head_oid << dendl;
4515 ctx->log.push_back(
4516 pg_log_entry_t(
4517 pg_log_entry_t::DELETE,
4518 head_oid,
4519 ctx->at_version,
4520 head_obc->obs.oi.version,
4521 0,
4522 osd_reqid_t(),
4523 ctx->mtime,
4524 0)
4525 );
4526 derr << "removing snap head" << dendl;
4527 object_info_t& oi = head_obc->obs.oi;
4528 ctx->delta_stats.num_objects--;
4529 if (oi.is_dirty()) {
4530 ctx->delta_stats.num_objects_dirty--;
4531 }
4532 if (oi.is_omap())
4533 ctx->delta_stats.num_objects_omap--;
4534 if (oi.is_whiteout()) {
4535 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4536 ctx->delta_stats.num_whiteouts--;
4537 }
4538 if (oi.is_cache_pinned()) {
4539 ctx->delta_stats.num_objects_pinned--;
4540 }
4541 if (oi.has_manifest())
4542 ctx->delta_stats.num_objects_manifest--;
4543 head_obc->obs.exists = false;
4544 head_obc->obs.oi = object_info_t(head_oid);
4545 t->remove(head_oid);
4546 } else {
4547 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
4548 // filter SnapSet::snaps for the benefit of pre-octopus
4549 // peers. This is perhaps overly conservative in that I'm not
4550 // certain they need this, but let's err on the safe side.
4551 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4552 snapset.filter(pool.info);
4553 } else {
4554 snapset.snaps.clear();
4555 }
4556 dout(10) << coid << " writing updated snapset on " << head_oid
4557 << ", snapset is " << snapset << dendl;
4558 ctx->log.push_back(
4559 pg_log_entry_t(
4560 pg_log_entry_t::MODIFY,
4561 head_oid,
4562 ctx->at_version,
4563 head_obc->obs.oi.version,
4564 0,
4565 osd_reqid_t(),
4566 ctx->mtime,
4567 0)
4568 );
4569
4570 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4571 head_obc->obs.oi.version = ctx->at_version;
4572
4573 map <string, bufferlist> attrs;
4574 bl.clear();
4575 encode(snapset, bl);
4576 attrs[SS_ATTR].claim(bl);
4577
4578 bl.clear();
4579 encode(head_obc->obs.oi, bl,
4580 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4581 attrs[OI_ATTR].claim(bl);
4582 t->setattrs(head_oid, attrs);
4583 }
4584
4585 *ctxp = std::move(ctx);
4586 return 0;
4587 }
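/*
 * Worked example of the two trim outcomes above (hypothetical snap ids):
 * a clone pinned by snaps {4,7} trimmed for snap 4 keeps new_snaps = {7}
 * and takes the MODIFY path (clone_snaps rewritten, snaps remapped);
 * trimming it again for snap 7 leaves new_snaps empty, so the clone is
 * deleted, its overlap merged into the next older clone, and, if no
 * clones remain and the head is a plain whiteout, the head object is
 * removed as well.
 */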
4588
4589 void PrimaryLogPG::kick_snap_trim()
4590 {
4591 ceph_assert(is_active());
4592 ceph_assert(is_primary());
4593 if (is_clean() &&
4594 !state_test(PG_STATE_PREMERGE) &&
4595 !snap_trimq.empty()) {
4596 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4597 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4598 } else {
4599 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4600 snap_trimmer_machine.process_event(KickTrim());
4601 }
4602 }
4603 }
4604
4605 void PrimaryLogPG::snap_trimmer_scrub_complete()
4606 {
4607 if (is_primary() && is_active() && is_clean()) {
4608 ceph_assert(!snap_trimq.empty());
4609 snap_trimmer_machine.process_event(ScrubComplete());
4610 }
4611 }
4612
4613 void PrimaryLogPG::snap_trimmer(epoch_t queued)
4614 {
4615 if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
4616 return;
4617 }
4618
4619 ceph_assert(is_primary());
4620
4621 dout(10) << "snap_trimmer posting" << dendl;
4622 snap_trimmer_machine.process_event(DoSnapWork());
4623 dout(10) << "snap_trimmer complete" << dendl;
4624 return;
4625 }
4626
4627 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4628 {
4629 __u64 v2;
4630
4631 string v2s(xattr.c_str(), xattr.length());
4632 if (v2s.length())
4633 v2 = strtoull(v2s.c_str(), NULL, 10);
4634 else
4635 v2 = 0;
4636
4637 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4638
4639 switch (op) {
4640 case CEPH_OSD_CMPXATTR_OP_EQ:
4641 return (v1 == v2);
4642 case CEPH_OSD_CMPXATTR_OP_NE:
4643 return (v1 != v2);
4644 case CEPH_OSD_CMPXATTR_OP_GT:
4645 return (v1 > v2);
4646 case CEPH_OSD_CMPXATTR_OP_GTE:
4647 return (v1 >= v2);
4648 case CEPH_OSD_CMPXATTR_OP_LT:
4649 return (v1 < v2);
4650 case CEPH_OSD_CMPXATTR_OP_LTE:
4651 return (v1 <= v2);
4652 default:
4653 return -EINVAL;
4654 }
4655 }
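/*
 * Example: with xattr bytes "123" and v1 = 123, CEPH_OSD_CMPXATTR_OP_EQ
 * returns 1 (match).  An absent or empty xattr compares as 0, so v1 = 0
 * with OP_EQ also matches; anything non-numeric decodes per strtoull()
 * semantics.
 */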
4656
4657 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4658 {
4659 string v2s(xattr.c_str(), xattr.length());
4660
4661 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4662
4663 switch (op) {
4664 case CEPH_OSD_CMPXATTR_OP_EQ:
4665 return (v1s.compare(v2s) == 0);
4666 case CEPH_OSD_CMPXATTR_OP_NE:
4667 return (v1s.compare(v2s) != 0);
4668 case CEPH_OSD_CMPXATTR_OP_GT:
4669 return (v1s.compare(v2s) > 0);
4670 case CEPH_OSD_CMPXATTR_OP_GTE:
4671 return (v1s.compare(v2s) >= 0);
4672 case CEPH_OSD_CMPXATTR_OP_LT:
4673 return (v1s.compare(v2s) < 0);
4674 case CEPH_OSD_CMPXATTR_OP_LTE:
4675 return (v1s.compare(v2s) <= 0);
4676 default:
4677 return -EINVAL;
4678 }
4679 }
4680
4681 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4682 {
4683 ceph_osd_op& op = osd_op.op;
4684 vector<OSDOp> write_ops(1);
4685 OSDOp& write_op = write_ops[0];
4686 uint64_t write_length = op.writesame.length;
4687 int result = 0;
4688
4689 if (!write_length)
4690 return 0;
4691
4692 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4693 return -EINVAL;
4694
4695 if (op.writesame.data_length != osd_op.indata.length()) {
4696 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4697 return -EINVAL;
4698 }
4699
4700 while (write_length) {
4701 write_op.indata.append(osd_op.indata);
4702 write_length -= op.writesame.data_length;
4703 }
4704
4705 write_op.op.op = CEPH_OSD_OP_WRITE;
4706 write_op.op.extent.offset = op.writesame.offset;
4707 write_op.op.extent.length = op.writesame.length;
4708 result = do_osd_ops(ctx, write_ops);
4709 if (result < 0)
4710 derr << "do_writesame do_osd_ops failed " << result << dendl;
4711
4712 return result;
4713 }
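/*
 * Example: writesame{offset=0, length=4096, data_length=512} with a
 * 512-byte pattern in indata expands into a single CEPH_OSD_OP_WRITE of
 * 4096 bytes holding eight back-to-back copies of the pattern.  A length
 * that is not a multiple of data_length is rejected with -EINVAL above.
 */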
4714
4715 // ========================================================================
4716 // low level osd ops
4717
4718 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4719 {
4720 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4721 bufferlist header, vals;
4722 int r = _get_tmap(ctx, &header, &vals);
4723 if (r < 0) {
4724 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4725 r = 0;
4726 return r;
4727 }
4728
4729 vector<OSDOp> ops(3);
4730
4731 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4732 ops[0].op.extent.offset = 0;
4733 ops[0].op.extent.length = 0;
4734
4735 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4736 ops[1].indata.claim(header);
4737
4738 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4739 ops[2].indata.claim(vals);
4740
4741 return do_osd_ops(ctx, ops);
4742 }
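/*
 * In effect, the conversion above rewrites the object in place: the tmap
 * payload is read back, the object data is truncated to zero, the decoded
 * header becomes the omap header, and the decoded key/value map becomes
 * omap entries.  With CEPH_OSD_TMAP2OMAP_NULLOK set, an object that never
 * had a tmap (-ENODATA) is treated as a successful no-op.
 */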
4743
4744 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
4745 OSDOp& osd_op, bufferlist& bl)
4746 {
4747 // decode
4748 bufferlist header;
4749 map<string, bufferlist> m;
4750 if (bl.length()) {
4751 auto p = bl.cbegin();
4752 decode(header, p);
4753 decode(m, p);
4754 ceph_assert(p.end());
4755 }
4756
4757 // do the update(s)
4758 while (!bp.end()) {
4759 __u8 op;
4760 string key;
4761 decode(op, bp);
4762
4763 switch (op) {
4764 case CEPH_OSD_TMAP_SET: // insert key
4765 {
4766 decode(key, bp);
4767 bufferlist data;
4768 decode(data, bp);
4769 m[key] = data;
4770 }
4771 break;
4772 case CEPH_OSD_TMAP_RM: // remove key
4773 decode(key, bp);
4774 if (!m.count(key)) {
4775 return -ENOENT;
4776 }
4777 m.erase(key);
4778 break;
4779 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4780 decode(key, bp);
4781 m.erase(key);
4782 break;
4783 case CEPH_OSD_TMAP_HDR: // update header
4784 {
4785 decode(header, bp);
4786 }
4787 break;
4788 default:
4789 return -EINVAL;
4790 }
4791 }
4792
4793 // reencode
4794 bufferlist obl;
4795 encode(header, obl);
4796 encode(m, obl);
4797
4798 // write it out
4799 vector<OSDOp> nops(1);
4800 OSDOp& newop = nops[0];
4801 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4802 newop.op.extent.offset = 0;
4803 newop.op.extent.length = obl.length();
4804 newop.indata = obl;
4805 do_osd_ops(ctx, nops);
4806 return 0;
4807 }
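/*
 * Tmap wire-format note: the payload decoded above is encode(header)
 * followed by encode(map<string,bufferlist>), i.e. a 32-bit key count and
 * then sorted key/value pairs.  That is why this slow path can round-trip
 * the blob with plain decode()/encode() plus a WRITEFULL, while
 * do_tmapup() below decodes the count separately and splices the pair
 * stream in place.
 */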
4808
4809 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
4810 {
4811 bufferlist::const_iterator orig_bp = bp;
4812 int result = 0;
4813 if (bp.end()) {
4814 dout(10) << "tmapup is a no-op" << dendl;
4815 } else {
4816 // read the whole object
4817 vector<OSDOp> nops(1);
4818 OSDOp& newop = nops[0];
4819 newop.op.op = CEPH_OSD_OP_READ;
4820 newop.op.extent.offset = 0;
4821 newop.op.extent.length = 0;
4822 result = do_osd_ops(ctx, nops);
4823
4824 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4825
4826 dout(30) << " starting is \n";
4827 newop.outdata.hexdump(*_dout);
4828 *_dout << dendl;
4829
4830 auto ip = newop.outdata.cbegin();
4831 bufferlist obl;
4832
4833 dout(30) << "the update command is: \n";
4834 osd_op.indata.hexdump(*_dout);
4835 *_dout << dendl;
4836
4837 // header
4838 bufferlist header;
4839 __u32 nkeys = 0;
4840 if (newop.outdata.length()) {
4841 decode(header, ip);
4842 decode(nkeys, ip);
4843 }
4844 dout(10) << "tmapup header " << header.length() << dendl;
4845
4846 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4847 ++bp;
4848 decode(header, bp);
4849 dout(10) << "tmapup new header " << header.length() << dendl;
4850 }
4851
4852 encode(header, obl);
4853
4854 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4855
4856 // update keys
4857 bufferlist newkeydata;
4858 string nextkey, last_in_key;
4859 bufferlist nextval;
4860 bool have_next = false;
4861 if (!ip.end()) {
4862 have_next = true;
4863 decode(nextkey, ip);
4864 decode(nextval, ip);
4865 }
4866 while (!bp.end() && !result) {
4867 __u8 op;
4868 string key;
4869 try {
4870 decode(op, bp);
4871 decode(key, bp);
4872 }
4873 catch (buffer::error& e) {
4874 return -EINVAL;
4875 }
4876 if (key < last_in_key) {
4877 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4878 << "', falling back to an inefficient (unsorted) update" << dendl;
4879 bp = orig_bp;
4880 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4881 }
4882 last_in_key = key;
4883
4884 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4885
4886 // skip existing intervening keys
4887 bool key_exists = false;
4888 while (have_next && !key_exists) {
4889 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4890 if (nextkey > key)
4891 break;
4892 if (nextkey < key) {
4893 // copy untouched.
4894 encode(nextkey, newkeydata);
4895 encode(nextval, newkeydata);
4896 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4897 } else {
4898 // don't copy; discard old value. and stop.
4899 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4900 key_exists = true;
4901 nkeys--;
4902 }
4903 if (!ip.end()) {
4904 decode(nextkey, ip);
4905 decode(nextval, ip);
4906 } else {
4907 have_next = false;
4908 }
4909 }
4910
4911 if (op == CEPH_OSD_TMAP_SET) {
4912 bufferlist val;
4913 try {
4914 decode(val, bp);
4915 }
4916 catch (buffer::error& e) {
4917 return -EINVAL;
4918 }
4919 encode(key, newkeydata);
4920 encode(val, newkeydata);
4921 dout(20) << " set " << key << " " << val.length() << dendl;
4922 nkeys++;
4923 } else if (op == CEPH_OSD_TMAP_CREATE) {
4924 if (key_exists) {
4925 return -EEXIST;
4926 }
4927 bufferlist val;
4928 try {
4929 decode(val, bp);
4930 }
4931 catch (buffer::error& e) {
4932 return -EINVAL;
4933 }
4934 encode(key, newkeydata);
4935 encode(val, newkeydata);
4936 dout(20) << " create " << key << " " << val.length() << dendl;
4937 nkeys++;
4938 } else if (op == CEPH_OSD_TMAP_RM) {
4939 // do nothing.
4940 if (!key_exists) {
4941 return -ENOENT;
4942 }
4943 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4944 // do nothing
4945 } else {
4946 dout(10) << " invalid tmap op " << (int)op << dendl;
4947 return -EINVAL;
4948 }
4949 }
4950
4951 // copy remaining
4952 if (have_next) {
4953 encode(nextkey, newkeydata);
4954 encode(nextval, newkeydata);
4955 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4956 }
4957 if (!ip.end()) {
4958 bufferlist rest;
4959 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4960 dout(20) << " keep trailing " << rest.length()
4961 << " at " << newkeydata.length() << dendl;
4962 newkeydata.claim_append(rest);
4963 }
4964
4965 // encode final key count + key data
4966 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4967 encode(nkeys, obl);
4968 obl.claim_append(newkeydata);
4969
4970 if (0) {
4971 dout(30) << " final is \n";
4972 obl.hexdump(*_dout);
4973 *_dout << dendl;
4974
4975 // sanity check
4976 auto tp = obl.cbegin();
4977 bufferlist h;
4978 decode(h, tp);
4979 map<string,bufferlist> d;
4980 decode(d, tp);
4981 ceph_assert(tp.end());
4982 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4983 }
4984
4985 // write it out
4986 if (!result) {
4987 dout(20) << "tmapput write " << obl.length() << dendl;
4988 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4989 newop.op.extent.offset = 0;
4990 newop.op.extent.length = obl.length();
4991 newop.indata = obl;
4992 do_osd_ops(ctx, nops);
4993 }
4994 }
4995 return result;
4996 }
4997
4998 static int check_offset_and_length(uint64_t offset, uint64_t length,
4999 uint64_t max, DoutPrefixProvider *dpp)
5000 {
5001 if (offset >= max ||
5002 length > max ||
5003 offset + length > max) {
5004 ldpp_dout(dpp, 10) << __func__ << " "
5005 << "osd_max_object_size: " << max
5006 << "; Hard limit of object size is 4GB." << dendl;
5007 return -EFBIG;
5008 }
5009
5010 return 0;
5011 }
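/*
 * Example: with osd_max_object_size = 128 MiB, a write at offset 100 MiB
 * of length 50 MiB fails this check (150 MiB > 128 MiB) and the client
 * gets -EFBIG; offset and length are also rejected individually, which,
 * for sane values of the configured maximum, keeps the sum from wrapping.
 */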
5012
5013 struct FillInVerifyExtent : public Context {
5014 ceph_le64 *r;
5015 int32_t *rval;
5016 bufferlist *outdatap;
5017 std::optional<uint32_t> maybe_crc;
5018 uint64_t size;
5019 OSDService *osd;
5020 hobject_t soid;
5021 uint32_t flags;
5022 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5023 std::optional<uint32_t> mc, uint64_t size,
5024 OSDService *osd, hobject_t soid, uint32_t flags) :
5025 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5026 size(size), osd(osd), soid(soid), flags(flags) {}
5027 void finish(int len) override {
5028 *r = len;
5029 if (len < 0) {
5030 *rval = len;
5031 return;
5032 }
5033 *rval = 0;
5034
5035 // whole object? can we verify the checksum?
5036 if (maybe_crc && *r == size) {
5037 uint32_t crc = outdatap->crc32c(-1);
5038 if (maybe_crc != crc) {
5039 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5040 << " != expected 0x" << *maybe_crc
5041 << std::dec << " on " << soid;
5042 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5043 *rval = -EIO;
5044 *r = 0;
5045 }
5046 }
5047 }
5048 }
5049 };
5050
5051 struct ToSparseReadResult : public Context {
5052 int* result;
5053 bufferlist* data_bl;
5054 uint64_t data_offset;
5055 ceph_le64* len;
5056 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5057 ceph_le64* len)
5058 : result(result), data_bl(bl), data_offset(offset), len(len) {}
5059 void finish(int r) override {
5060 if (r < 0) {
5061 *result = r;
5062 return;
5063 }
5064 *result = 0;
5065 *len = r;
5066 bufferlist outdata;
5067 map<uint64_t, uint64_t> extents = {{data_offset, r}};
5068 encode(extents, outdata);
5069 ::encode_destructively(*data_bl, outdata);
5070 data_bl->swap(outdata);
5071 }
5072 };
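/*
 * The translated result mimics a genuine sparse-read reply: outdata
 * becomes encode(map<uint64_t,uint64_t>{{data_offset, r}}) followed by the
 * encoded data bufferlist, i.e. a single-extent map covering exactly the
 * bytes that were read.
 */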
5073
5074 template<typename V>
5075 static string list_keys(const map<string, V>& m) {
5076 string s;
5077 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5078 if (!s.empty()) {
5079 s.push_back(',');
5080 }
5081 s.append(itr->first);
5082 }
5083 return s;
5084 }
5085
5086 template<typename T>
5087 static string list_entries(const T& m) {
5088 string s;
5089 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5090 if (!s.empty()) {
5091 s.push_back(',');
5092 }
5093 s.append(*itr);
5094 }
5095 return s;
5096 }
5097
5098 void PrimaryLogPG::maybe_create_new_object(
5099 OpContext *ctx,
5100 bool ignore_transaction)
5101 {
5102 ObjectState& obs = ctx->new_obs;
5103 if (!obs.exists) {
5104 ctx->delta_stats.num_objects++;
5105 obs.exists = true;
5106 ceph_assert(!obs.oi.is_whiteout());
5107 obs.oi.new_object();
5108 if (!ignore_transaction)
5109 ctx->op_t->create(obs.oi.soid);
5110 } else if (obs.oi.is_whiteout()) {
5111 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5112 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5113 --ctx->delta_stats.num_whiteouts;
5114 }
5115 }
5116
5117 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5118 OSDOp& osd_op;
5119
5120 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5121 }
5122
5123 int execute() override {
5124 return osd_op.rval;
5125 }
5126 };
5127
5128 struct C_ChecksumRead : public Context {
5129 PrimaryLogPG *primary_log_pg;
5130 OSDOp &osd_op;
5131 Checksummer::CSumType csum_type;
5132 bufferlist init_value_bl;
5133 ceph_le64 read_length;
5134 bufferlist read_bl;
5135 Context *fill_extent_ctx;
5136
5137 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5138 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5139 std::optional<uint32_t> maybe_crc, uint64_t size,
5140 OSDService *osd, hobject_t soid, uint32_t flags)
5141 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5142 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5143 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5144 &read_bl, maybe_crc, size,
5145 osd, soid, flags)) {
5146 }
5147 ~C_ChecksumRead() override {
5148 delete fill_extent_ctx;
5149 }
5150
5151 void finish(int r) override {
5152 fill_extent_ctx->complete(r);
5153 fill_extent_ctx = nullptr;
5154
5155 if (osd_op.rval >= 0) {
5156 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5157 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5158 &init_value_bl_it, read_bl);
5159 }
5160 }
5161 };
5162
5163 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
5164 bufferlist::const_iterator *bl_it)
5165 {
5166 dout(20) << __func__ << dendl;
5167
5168 auto& op = osd_op.op;
5169 if (op.checksum.chunk_size > 0) {
5170 if (op.checksum.length == 0) {
5171 dout(10) << __func__ << ": length required when chunk size provided"
5172 << dendl;
5173 return -EINVAL;
5174 }
5175 if (op.checksum.length % op.checksum.chunk_size != 0) {
5176 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5177 return -EINVAL;
5178 }
5179 }
5180
5181 auto& oi = ctx->new_obs.oi;
5182 if (op.checksum.offset == 0 && op.checksum.length == 0) {
5183 // zeroed offset+length implies checksum whole object
5184 op.checksum.length = oi.size;
5185 } else if (op.checksum.offset >= oi.size) {
5186 // read size was trimmed to zero, do nothing
5187 // see PrimaryLogPG::do_read
5188 return 0;
5189 } else if (op.extent.offset + op.extent.length > oi.size) {
5190 op.extent.length = oi.size - op.extent.offset;
5191 if (op.checksum.chunk_size > 0 &&
5192 op.checksum.length % op.checksum.chunk_size != 0) {
5193 dout(10) << __func__ << ": length (trimmed to 0x"
5194 << std::hex << op.checksum.length
5195 << ") not aligned to chunk size 0x"
5196 << op.checksum.chunk_size << std::dec
5197 << dendl;
5198 return -EINVAL;
5199 }
5200 }
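// Note: ceph_osd_op stores its per-op fields in a union, so
// op.extent.offset/length alias op.checksum.offset/length; the
// extent-based clamp above therefore trims the checksum range in place.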
5201
5202 Checksummer::CSumType csum_type;
5203 switch (op.checksum.type) {
5204 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5205 csum_type = Checksummer::CSUM_XXHASH32;
5206 break;
5207 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5208 csum_type = Checksummer::CSUM_XXHASH64;
5209 break;
5210 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5211 csum_type = Checksummer::CSUM_CRC32C;
5212 break;
5213 default:
5214 dout(10) << __func__ << ": unknown crc type ("
5215 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5216 return -EINVAL;
5217 }
5218
5219 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5220 if (bl_it->get_remaining() < csum_init_value_size) {
5221 dout(10) << __func__ << ": init value not provided" << dendl;
5222 return -EINVAL;
5223 }
5224
5225 bufferlist init_value_bl;
5226 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5227 csum_init_value_size);
5228 *bl_it += csum_init_value_size;
5229
5230 if (pool.info.is_erasure() && op.checksum.length > 0) {
5231 // If there is a data digest and it is possible we are reading
5232 // entire object, pass the digest.
5233 std::optional<uint32_t> maybe_crc;
5234 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5235 op.checksum.length >= oi.size) {
5236 maybe_crc = oi.data_digest;
5237 }
5238
5239 // async read
5240 auto& soid = oi.soid;
5241 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5242 std::move(init_value_bl), maybe_crc,
5243 oi.size, osd, soid, op.flags);
5244
5245 ctx->pending_async_reads.push_back({
5246 {op.checksum.offset, op.checksum.length, op.flags},
5247 {&checksum_ctx->read_bl, checksum_ctx}});
5248
5249 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5250 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5251 new ReadFinisher(osd_op));
5252 return -EINPROGRESS;
5253 }
5254
5255 // sync read
5256 std::vector<OSDOp> read_ops(1);
5257 auto& read_op = read_ops[0];
5258 if (op.checksum.length > 0) {
5259 read_op.op.op = CEPH_OSD_OP_READ;
5260 read_op.op.flags = op.flags;
5261 read_op.op.extent.offset = op.checksum.offset;
5262 read_op.op.extent.length = op.checksum.length;
5263 read_op.op.extent.truncate_size = 0;
5264 read_op.op.extent.truncate_seq = 0;
5265
5266 int r = do_osd_ops(ctx, read_ops);
5267 if (r < 0) {
5268 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5269 return r;
5270 }
5271 }
5272
5273 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5274 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5275 read_op.outdata);
5276 }
5277
5278 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5279 Checksummer::CSumType csum_type,
5280 bufferlist::const_iterator *init_value_bl_it,
5281 const bufferlist &read_bl) {
5282 dout(20) << __func__ << dendl;
5283
5284 auto& op = osd_op.op;
5285
5286 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5287 derr << __func__ << ": bytes read " << read_bl.length() << " != "
5288 << op.checksum.length << dendl;
5289 return -EINVAL;
5290 }
5291
5292 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5293 op.checksum.chunk_size : read_bl.length());
5294 uint32_t csum_count = (csum_chunk_size > 0 ?
5295 read_bl.length() / csum_chunk_size : 0);
5296
5297 bufferlist csum;
5298 bufferptr csum_data;
5299 if (csum_count > 0) {
5300 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
5301 csum_data = buffer::create(csum_value_size * csum_count);
5302 csum_data.zero();
5303 csum.append(csum_data);
5304
5305 switch (csum_type) {
5306 case Checksummer::CSUM_XXHASH32:
5307 {
5308 Checksummer::xxhash32::init_value_t init_value;
5309 decode(init_value, *init_value_bl_it);
5310 Checksummer::calculate<Checksummer::xxhash32>(
5311 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5312 &csum_data);
5313 }
5314 break;
5315 case Checksummer::CSUM_XXHASH64:
5316 {
5317 Checksummer::xxhash64::init_value_t init_value;
5318 decode(init_value, *init_value_bl_it);
5319 Checksummer::calculate<Checksummer::xxhash64>(
5320 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5321 &csum_data);
5322 }
5323 break;
5324 case Checksummer::CSUM_CRC32C:
5325 {
5326 Checksummer::crc32c::init_value_t init_value;
5327 decode(init_value, *init_value_bl_it);
5328 Checksummer::calculate<Checksummer::crc32c>(
5329 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5330 &csum_data);
5331 }
5332 break;
5333 default:
5334 break;
5335 }
5336 }
5337
5338 encode(csum_count, osd_op.outdata);
5339 osd_op.outdata.claim_append(csum);
5340 return 0;
5341 }
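/*
 * Example: a checksum op over 16 KiB with chunk_size = 4 KiB yields
 * csum_count = 4, one value per chunk; with chunk_size = 0 the whole
 * 16 KiB is a single chunk and csum_count = 1.  The reply encodes the
 * count followed by the packed checksum values.
 */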
5342
5343 struct C_ExtentCmpRead : public Context {
5344 PrimaryLogPG *primary_log_pg;
5345 OSDOp &osd_op;
5346 ceph_le64 read_length{};
5347 bufferlist read_bl;
5348 Context *fill_extent_ctx;
5349
5350 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5351 std::optional<uint32_t> maybe_crc, uint64_t size,
5352 OSDService *osd, hobject_t soid, uint32_t flags)
5353 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5354 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5355 &read_bl, maybe_crc, size,
5356 osd, soid, flags)) {
5357 }
5358 ~C_ExtentCmpRead() override {
5359 delete fill_extent_ctx;
5360 }
5361
5362 void finish(int r) override {
5363 if (r == -ENOENT) {
5364 osd_op.rval = 0;
5365 read_bl.clear();
5366 delete fill_extent_ctx;
5367 } else {
5368 fill_extent_ctx->complete(r);
5369 }
5370 fill_extent_ctx = nullptr;
5371
5372 if (osd_op.rval >= 0) {
5373 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5374 }
5375 }
5376 };
5377
5378 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5379 {
5380 dout(20) << __func__ << dendl;
5381 ceph_osd_op& op = osd_op.op;
5382
5383 auto& oi = ctx->new_obs.oi;
5384 uint64_t size = oi.size;
5385 if ((oi.truncate_seq < op.extent.truncate_seq) &&
5386 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5387 size = op.extent.truncate_size;
5388 }
5389
5390 if (op.extent.offset >= size) {
5391 op.extent.length = 0;
5392 } else if (op.extent.offset + op.extent.length > size) {
5393 op.extent.length = size - op.extent.offset;
5394 }
5395
5396 if (op.extent.length == 0) {
5397 dout(20) << __func__ << " zero length extent" << dendl;
5398 return finish_extent_cmp(osd_op, bufferlist{});
5399 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
5400 dout(20) << __func__ << " object DNE" << dendl;
5401 return finish_extent_cmp(osd_op, {});
5402 } else if (pool.info.is_erasure()) {
5403 // If there is a data digest and it is possible we are reading
5404 // entire object, pass the digest.
5405 std::optional<uint32_t> maybe_crc;
5406 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5407 op.checksum.length >= oi.size) {
5408 maybe_crc = oi.data_digest;
5409 }
5410
5411 // async read
5412 auto& soid = oi.soid;
5413 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5414 osd, soid, op.flags);
5415 ctx->pending_async_reads.push_back({
5416 {op.extent.offset, op.extent.length, op.flags},
5417 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5418
5419 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5420
5421 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5422 new ReadFinisher(osd_op));
5423 return -EINPROGRESS;
5424 }
5425
5426 // sync read
5427 vector<OSDOp> read_ops(1);
5428 OSDOp& read_op = read_ops[0];
5429
5430 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5431 read_op.op.extent.offset = op.extent.offset;
5432 read_op.op.extent.length = op.extent.length;
5433 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5434 read_op.op.extent.truncate_size = op.extent.truncate_size;
5435
5436 int result = do_osd_ops(ctx, read_ops);
5437 if (result < 0) {
5438 derr << __func__ << " failed " << result << dendl;
5439 return result;
5440 }
5441 return finish_extent_cmp(osd_op, read_op.outdata);
5442 }
5443
5444 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5445 {
5446 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5447 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5448 if (osd_op.indata[idx] != read_byte) {
5449 return (-MAX_ERRNO - idx);
5450 }
5451 }
5452
5453 return 0;
5454 }
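/*
 * Example of the return encoding above: if the byte at idx 4 of the
 * client-supplied buffer differs from what was read, the op fails with
 * -MAX_ERRNO - 4, so the client can recover the first mismatch offset by
 * negating the result and subtracting MAX_ERRNO.  Bytes beyond the end of
 * the on-disk data compare against zero.
 */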
5455
5456 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5457 dout(20) << __func__ << dendl;
5458 auto& op = osd_op.op;
5459 auto& oi = ctx->new_obs.oi;
5460 auto& soid = oi.soid;
5461 __u32 seq = oi.truncate_seq;
5462 uint64_t size = oi.size;
5463 bool trimmed_read = false;
5464
5465 dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5466 dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5467 dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5468 dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5469
5470 // are we beyond truncate_size?
5471 if ( (seq < op.extent.truncate_seq) &&
5472 (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5473 (size > op.extent.truncate_size) )
5474 size = op.extent.truncate_size;
5475
5476 if (op.extent.length == 0) // a length of zero means read the whole object
5477 op.extent.length = size;
5478
5479 if (op.extent.offset >= size) {
5480 op.extent.length = 0;
5481 trimmed_read = true;
5482 } else if (op.extent.offset + op.extent.length > size) {
5483 op.extent.length = size - op.extent.offset;
5484 trimmed_read = true;
5485 }
5486
5487 dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
5488
5489 // read into a buffer
5490 int result = 0;
5491 if (trimmed_read && op.extent.length == 0) {
5492 // the read was explicitly trimmed to zero bytes, so do nothing.
5493 // Note that a zero-length read is otherwise *not* a no-op (it means
5494 // "read the whole object" above), which is why trimmed_read is needed.
5495 } else if (pool.info.is_erasure()) {
5496 // The initialisation below is required to silence a false positive
5497 // -Wmaybe-uninitialized warning
5498 std::optional<uint32_t> maybe_crc;
5499 // If there is a data digest and it is possible we are reading the
5500 // entire object, pass the digest. FillInVerifyExtent will
5501 // check oi.size again.
5502 if (oi.is_data_digest() && op.extent.offset == 0 &&
5503 op.extent.length >= oi.size)
5504 maybe_crc = oi.data_digest;
5505 ctx->pending_async_reads.push_back(
5506 make_pair(
5507 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5508 make_pair(&osd_op.outdata,
5509 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5510 &osd_op.outdata, maybe_crc, oi.size,
5511 osd, soid, op.flags))));
5512 dout(10) << " async_read noted for " << soid << dendl;
5513
5514 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5515 new ReadFinisher(osd_op));
5516 } else {
5517 int r = pgbackend->objects_read_sync(
5518 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5519 // whole object? can we verify the checksum?
5520 if (r >= 0 && op.extent.offset == 0 &&
5521 (uint64_t)r == oi.size && oi.is_data_digest()) {
5522 uint32_t crc = osd_op.outdata.crc32c(-1);
5523 if (oi.data_digest != crc) {
5524 osd->clog->error() << info.pgid << std::hex
5525 << " full-object read crc 0x" << crc
5526 << " != expected 0x" << oi.data_digest
5527 << std::dec << " on " << soid;
5528 r = -EIO; // try repair later
5529 }
5530 }
5531 if (r == -EIO) {
5532 r = rep_repair_primary_object(soid, ctx);
5533 }
5534 if (r >= 0)
5535 op.extent.length = r;
5536 else if (r == -EAGAIN) {
5537 result = -EAGAIN;
5538 } else {
5539 result = r;
5540 op.extent.length = 0;
5541 }
5542 dout(10) << " read got " << r << " / " << op.extent.length
5543 << " bytes from obj " << soid << dendl;
5544 }
5545 if (result >= 0) {
5546 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5547 ctx->delta_stats.num_rd++;
5548 }
5549 return result;
5550 }
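
// To summarize the paths above: a read trimmed to zero bytes is a
// deliberate no-op; on erasure-coded pools the read is queued on
// ctx->pending_async_reads and -EINPROGRESS is returned, with a
// ReadFinisher re-running the op once the async read completes; on
// replicated pools the read happens synchronously via pgbackend, and a
// full-object read of a digest-bearing object verifies crc32c, turning a
// mismatch into -EIO and an attempted repair.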
5551
5552 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
5553 dout(20) << __func__ << dendl;
5554 auto& op = osd_op.op;
5555 auto& oi = ctx->new_obs.oi;
5556 auto& soid = oi.soid;
5557
5558 if (op.extent.truncate_seq) {
5559 dout(0) << "sparse_read does not support truncation sequence " << dendl;
5560 return -EINVAL;
5561 }
5562
5563 ++ctx->num_read;
5564 if (pool.info.is_erasure()) {
5565 // translate sparse read to a normal one if not supported
5566 uint64_t offset = op.extent.offset;
5567 uint64_t length = op.extent.length;
5568 if (offset > oi.size) {
5569 length = 0;
5570 } else if (offset + length > oi.size) {
5571 length = oi.size - offset;
5572 }
5573
5574 if (length > 0) {
5575 ctx->pending_async_reads.push_back(
5576 make_pair(
5577 boost::make_tuple(offset, length, op.flags),
5578 make_pair(
5579 &osd_op.outdata,
5580 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
5581 &op.extent.length))));
5582 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
5583
5584 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5585 new ReadFinisher(osd_op));
5586 } else {
5587 dout(10) << " sparse read ended up empty for " << soid << dendl;
5588 map<uint64_t, uint64_t> extents;
5589 encode(extents, osd_op.outdata);
5590 }
5591 } else {
5592 // read into a buffer
5593 map<uint64_t, uint64_t> m;
5594 uint32_t total_read = 0;
5595 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5596 info.pgid.shard),
5597 op.extent.offset, op.extent.length, m);
5598 if (r < 0) {
5599 return r;
5600 }
5601
5602 bufferlist data_bl;
5603 r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
5604 if (r == -EIO) {
5605 r = rep_repair_primary_object(soid, ctx);
5606 }
5607 if (r < 0) {
5608 return r;
5609 }
5610
5611 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
5612 // and while few objects may be written in full at first, more and more
5613 // whole objects accumulate as the cluster is used, so verifying the
5614 // full-object checksum on sparse-read makes sense.
5615 if ((uint64_t)r == oi.size && oi.is_data_digest()) {
5616 uint32_t crc = data_bl.crc32c(-1);
5617 if (oi.data_digest != crc) {
5618 osd->clog->error() << info.pgid << std::hex
5619 << " full-object read crc 0x" << crc
5620 << " != expected 0x" << oi.data_digest
5621 << std::dec << " on " << soid;
5622 r = rep_repair_primary_object(soid, ctx);
5623 if (r < 0) {
5624 return r;
5625 }
5626 }
5627 }
5628
5629 op.extent.length = total_read;
5630
5631 encode(m, osd_op.outdata); // re-encode since it might be modified
5632 ::encode_destructively(data_bl, osd_op.outdata);
5633
5634 dout(10) << " sparse_read got " << r << " bytes from object "
5635 << soid << dendl;
5636 }
5637
5638 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5639 ctx->delta_stats.num_rd++;
5640 return 0;
5641 }
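
// The sparse-read reply is the encoded extent map followed by the packed
// extent data. A client could unpack it roughly like this (a sketch,
// assuming the usual bufferlist wire format; not code from this tree):
//
//   std::map<uint64_t, uint64_t> extents;  // offset -> length
//   ceph::buffer::list data;
//   auto p = reply_bl.cbegin();
//   decode(extents, p);
//   decode(data, p);  // bytes for each extent, concatenated in order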
5642
5643 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5644 {
5645 int result = 0;
5646 SnapSetContext *ssc = ctx->obc->ssc;
5647 ObjectState& obs = ctx->new_obs;
5648 object_info_t& oi = obs.oi;
5649 const hobject_t& soid = oi.soid;
5650 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5651 osd->osd_skip_data_digest;
5652
5653 PGTransaction* t = ctx->op_t.get();
5654
5655 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5656
5657 ctx->current_osd_subop_num = 0;
5658 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5659 OSDOp& osd_op = *p;
5660 ceph_osd_op& op = osd_op.op;
5661
5662 OpFinisher* op_finisher = nullptr;
5663 {
5664 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5665 if (op_finisher_it != ctx->op_finishers.end()) {
5666 op_finisher = op_finisher_it->second.get();
5667 }
5668 }
5669
5670 // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5671 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5672 // but the code in this function seems to treat them as native-endian. What should the
5673 // tracepoints do?
5674 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5675
5676 dout(10) << "do_osd_op " << osd_op << dendl;
5677
5678 auto bp = osd_op.indata.cbegin();
5679
5680 // user-visible modification?
5681 switch (op.op) {
5682 // non user-visible modifications
5683 case CEPH_OSD_OP_WATCH:
5684 case CEPH_OSD_OP_CACHE_EVICT:
5685 case CEPH_OSD_OP_CACHE_FLUSH:
5686 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5687 case CEPH_OSD_OP_UNDIRTY:
5688 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5689 case CEPH_OSD_OP_COPY_FROM2:
5690 case CEPH_OSD_OP_CACHE_PIN:
5691 case CEPH_OSD_OP_CACHE_UNPIN:
5692 case CEPH_OSD_OP_SET_REDIRECT:
5693 case CEPH_OSD_OP_TIER_PROMOTE:
5694 case CEPH_OSD_OP_TIER_FLUSH:
5695 break;
5696 default:
5697 if (op.op & CEPH_OSD_OP_MODE_WR)
5698 ctx->user_modify = true;
5699 }
5700
5701 // munge -1 truncate to 0 truncate
5702 if (ceph_osd_op_uses_extent(op.op) &&
5703 op.extent.truncate_seq == 1 &&
5704 op.extent.truncate_size == (-1ULL)) {
5705 op.extent.truncate_size = 0;
5706 op.extent.truncate_seq = 0;
5707 }
5708
5709 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5710 if (op.op == CEPH_OSD_OP_ZERO &&
5711 obs.exists &&
5712 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
5713 op.extent.length >= 1 &&
5714 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
5715 op.extent.offset + op.extent.length >= oi.size) {
5716 if (op.extent.offset >= oi.size) {
5717 // no-op
5718 goto fail;
5719 }
5720 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5721 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5722 op.op = CEPH_OSD_OP_TRUNCATE;
5723 }
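
// For example, a ZERO at offset 4096 whose length reaches the end of a
// 1 MiB object is rewritten as TRUNCATE to 4096: the tail data is
// released just the same, but the object and its attributes survive,
// which a munge to DELETE could not guarantee.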
5724
5725 switch (op.op) {
5726
5727 // --- READS ---
5728
5729 case CEPH_OSD_OP_CMPEXT:
5730 ++ctx->num_read;
5731 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5732 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5733 op.extent.length, op.extent.truncate_size,
5734 op.extent.truncate_seq);
5735
5736 if (op_finisher == nullptr) {
5737 result = do_extent_cmp(ctx, osd_op);
5738 } else {
5739 result = op_finisher->execute();
5740 }
5741 break;
5742
5743 case CEPH_OSD_OP_SYNC_READ:
5744 if (pool.info.is_erasure()) {
5745 result = -EOPNOTSUPP;
5746 break;
5747 }
5748 // fall through
5749 case CEPH_OSD_OP_READ:
5750 ++ctx->num_read;
5751 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5752 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5753 op.extent.length, op.extent.truncate_size,
5754 op.extent.truncate_seq);
5755 if (op_finisher == nullptr) {
5756 if (!ctx->data_off) {
5757 ctx->data_off = op.extent.offset;
5758 }
5759 result = do_read(ctx, osd_op);
5760 } else {
5761 result = op_finisher->execute();
5762 }
5763 break;
5764
5765 case CEPH_OSD_OP_CHECKSUM:
5766 ++ctx->num_read;
5767 {
5768 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5769 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5770 op.checksum.offset, op.checksum.length,
5771 op.checksum.chunk_size);
5772
5773 if (op_finisher == nullptr) {
5774 result = do_checksum(ctx, osd_op, &bp);
5775 } else {
5776 result = op_finisher->execute();
5777 }
5778 }
5779 break;
5780
5781 /* map extents */
5782 case CEPH_OSD_OP_MAPEXT:
5783 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5784 if (pool.info.is_erasure()) {
5785 result = -EOPNOTSUPP;
5786 break;
5787 }
5788 ++ctx->num_read;
5789 {
5790 // read into a buffer
5791 bufferlist bl;
5792 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5793 info.pgid.shard),
5794 op.extent.offset, op.extent.length, bl);
5795 osd_op.outdata.claim(bl);
5796 if (r < 0)
5797 result = r;
5798 else
5799 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
5800 ctx->delta_stats.num_rd++;
5801 dout(10) << " map_extents done on object " << soid << dendl;
5802 }
5803 break;
5804
5805 /* map extents */
5806 case CEPH_OSD_OP_SPARSE_READ:
5807 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5808 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5809 op.extent.length, op.extent.truncate_size,
5810 op.extent.truncate_seq);
5811 if (op_finisher == nullptr) {
5812 result = do_sparse_read(ctx, osd_op);
5813 } else {
5814 result = op_finisher->execute();
5815 }
5816 break;
5817
5818 case CEPH_OSD_OP_CALL:
5819 {
5820 string cname, mname;
5821 bufferlist indata;
5822 try {
5823 bp.copy(op.cls.class_len, cname);
5824 bp.copy(op.cls.method_len, mname);
5825 bp.copy(op.cls.indata_len, indata);
5826 } catch (buffer::error& e) {
5827 dout(10) << "call unable to decode class + method + indata" << dendl;
5828 dout(30) << "in dump: ";
5829 osd_op.indata.hexdump(*_dout);
5830 *_dout << dendl;
5831 result = -EINVAL;
5832 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5833 break;
5834 }
5835 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5836
5837 ClassHandler::ClassData *cls;
5838 result = ClassHandler::get_instance().open_class(cname, &cls);
5839 ceph_assert(result == 0); // init_op_flags() already verified this works.
5840
5841 ClassHandler::ClassMethod *method = cls->get_method(mname);
5842 if (!method) {
5843 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5844 result = -EOPNOTSUPP;
5845 break;
5846 }
5847
5848 int flags = method->get_flags();
5849 if (flags & CLS_METHOD_WR)
5850 ctx->user_modify = true;
5851
5852 bufferlist outdata;
5853 dout(10) << "call method " << cname << "." << mname << dendl;
5854 int prev_rd = ctx->num_read;
5855 int prev_wr = ctx->num_write;
5856 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5857
5858 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5859 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5860 result = -EIO;
5861 break;
5862 }
5863 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5864 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5865 result = -EIO;
5866 break;
5867 }
5868
5869 dout(10) << "method called response length=" << outdata.length() << dendl;
5870 op.extent.length = outdata.length();
5871 osd_op.outdata.claim_append(outdata);
5872 dout(30) << "out dump: ";
5873 osd_op.outdata.hexdump(*_dout);
5874 *_dout << dendl;
5875 }
5876 break;
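
// From the client side this is what librados IoCtx::exec() generates:
// class name, method name and input payload arrive in indata, and the
// method's output is appended to outdata. A usage sketch (object, class
// and method names are illustrative, assuming the standard C++ API):
//
//   ceph::buffer::list in, out;
//   int r = ioctx.exec("myobj", "hello", "say_hello", in, out);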
5877
5878 case CEPH_OSD_OP_STAT:
5879 // note: stat does not require RD
5880 {
5881 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5882
5883 if (obs.exists && !oi.is_whiteout()) {
5884 encode(oi.size, osd_op.outdata);
5885 encode(oi.mtime, osd_op.outdata);
5886 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5887 } else {
5888 result = -ENOENT;
5889 dout(10) << "stat oi object does not exist" << dendl;
5890 }
5891
5892 ctx->delta_stats.num_rd++;
5893 }
5894 break;
5895
5896 case CEPH_OSD_OP_ISDIRTY:
5897 ++ctx->num_read;
5898 {
5899 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5900 bool is_dirty = obs.oi.is_dirty();
5901 encode(is_dirty, osd_op.outdata);
5902 ctx->delta_stats.num_rd++;
5903 result = 0;
5904 }
5905 break;
5906
5907 case CEPH_OSD_OP_UNDIRTY:
5908 ++ctx->num_write;
5909 result = 0;
5910 {
5911 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5912 if (oi.is_dirty()) {
5913 ctx->undirty = true; // see make_writeable()
5914 ctx->modify = true;
5915 ctx->delta_stats.num_wr++;
5916 }
5917 }
5918 break;
5919
5920 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5921 ++ctx->num_write;
5922 result = 0;
5923 {
5924 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5925 if (ctx->lock_type != RWState::RWNONE) {
5926 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5927 result = -EINVAL;
5928 break;
5929 }
5930 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5931 result = -EINVAL;
5932 break;
5933 }
5934 if (!obs.exists) {
5935 result = 0;
5936 break;
5937 }
5938 if (oi.is_cache_pinned()) {
5939 dout(10) << "cache-try-flush on a pinned object, consider unpinning it first" << dendl;
5940 result = -EPERM;
5941 break;
5942 }
5943 if (oi.is_dirty()) {
5944 result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
5945 if (result == -EINPROGRESS)
5946 result = -EAGAIN;
5947 } else {
5948 result = 0;
5949 }
5950 }
5951 break;
5952
5953 case CEPH_OSD_OP_CACHE_FLUSH:
5954 ++ctx->num_write;
5955 result = 0;
5956 {
5957 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5958 if (ctx->lock_type == RWState::RWNONE) {
5959 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5960 result = -EINVAL;
5961 break;
5962 }
5963 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5964 result = -EINVAL;
5965 break;
5966 }
5967 if (!obs.exists) {
5968 result = 0;
5969 break;
5970 }
5971 if (oi.is_cache_pinned()) {
5972 dout(10) << "cache-flush on a pinned object, consider unpinning it first" << dendl;
5973 result = -EPERM;
5974 break;
5975 }
5976 hobject_t missing;
5977 if (oi.is_dirty()) {
5978 result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
5979 if (result == -EINPROGRESS)
5980 result = -EAGAIN;
5981 } else {
5982 result = 0;
5983 }
5984 // Check special return value which has set missing_return
5985 if (result == -ENOENT) {
5986 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5987 ceph_assert(!missing.is_min());
5988 wait_for_unreadable_object(missing, ctx->op);
5989 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5990 result = -EAGAIN;
5991 }
5992 }
5993 break;
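
// The two flush variants above differ mainly in locking: cache-try-flush
// requires the client to have set SKIPRWLOCKS (lock_type must be RWNONE)
// so it can fail fast rather than block, while cache-flush requires the
// rwlock to be held. Both translate start_flush()'s -EINPROGRESS into
// -EAGAIN so the client retries once the flush to the base tier completes.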
5994
5995 case CEPH_OSD_OP_CACHE_EVICT:
5996 ++ctx->num_write;
5997 result = 0;
5998 {
5999 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
6000 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6001 result = -EINVAL;
6002 break;
6003 }
6004 if (!obs.exists) {
6005 result = 0;
6006 break;
6007 }
6008 if (oi.is_cache_pinned()) {
6009 dout(10) << "cache-evict on a pinned object, consider unpinning it first" << dendl;
6010 result = -EPERM;
6011 break;
6012 }
6013 if (oi.is_dirty()) {
6014 result = -EBUSY;
6015 break;
6016 }
6017 if (!oi.watchers.empty()) {
6018 result = -EBUSY;
6019 break;
6020 }
6021 if (soid.snap == CEPH_NOSNAP) {
6022 result = _verify_no_head_clones(soid, ssc->snapset);
6023 if (result < 0)
6024 break;
6025 }
6026 result = _delete_oid(ctx, true, false);
6027 if (result >= 0) {
6028 // mark that this is a cache eviction to avoid triggering normal
6029 // make_writeable() clone creation in finish_ctx()
6030 ctx->cache_evict = true;
6031 }
6032 osd->logger->inc(l_osd_tier_evict);
6033 }
6034 break;
6035
6036 case CEPH_OSD_OP_GETXATTR:
6037 ++ctx->num_read;
6038 {
6039 string aname;
6040 bp.copy(op.xattr.name_len, aname);
6041 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6042 string name = "_" + aname;
6043 int r = getattr_maybe_cache(
6044 ctx->obc,
6045 name,
6046 &(osd_op.outdata));
6047 if (r >= 0) {
6048 op.xattr.value_len = osd_op.outdata.length();
6049 result = 0;
6050 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
6051 } else
6052 result = r;
6053
6054 ctx->delta_stats.num_rd++;
6055 }
6056 break;
6057
6058 case CEPH_OSD_OP_GETXATTRS:
6059 ++ctx->num_read;
6060 {
6061 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6062 map<string, bufferlist> out;
6063 result = getattrs_maybe_cache(
6064 ctx->obc,
6065 &out);
6066
6067 bufferlist bl;
6068 encode(out, bl);
6069 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
6070 ctx->delta_stats.num_rd++;
6071 osd_op.outdata.claim_append(bl);
6072 }
6073 break;
6074
6075 case CEPH_OSD_OP_CMPXATTR:
6076 ++ctx->num_read;
6077 {
6078 string aname;
6079 bp.copy(op.xattr.name_len, aname);
6080 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6081 string name = "_" + aname;
6082 name[op.xattr.name_len + 1] = 0;
6083
6084 bufferlist xattr;
6085 result = getattr_maybe_cache(
6086 ctx->obc,
6087 name,
6088 &xattr);
6089 if (result < 0 && result != -EEXIST && result != -ENODATA)
6090 break;
6091
6092 ctx->delta_stats.num_rd++;
6093 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
6094
6095 switch (op.xattr.cmp_mode) {
6096 case CEPH_OSD_CMPXATTR_MODE_STRING:
6097 {
6098 string val;
6099 bp.copy(op.xattr.value_len, val);
6100 val[op.xattr.value_len] = 0;
6101 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6102 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6103 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6104 }
6105 break;
6106
6107 case CEPH_OSD_CMPXATTR_MODE_U64:
6108 {
6109 uint64_t u64val;
6110 try {
6111 decode(u64val, bp);
6112 }
6113 catch (buffer::error& e) {
6114 result = -EINVAL;
6115 goto fail;
6116 }
6117 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6118 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6119 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6120 }
6121 break;
6122
6123 default:
6124 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6125 result = -EINVAL;
6126 }
6127
6128 if (!result) {
6129 dout(10) << "comparison returned false" << dendl;
6130 result = -ECANCELED;
6131 break;
6132 }
6133 if (result < 0) {
6134 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6135 break;
6136 }
6137
6138 dout(10) << "comparison returned true" << dendl;
6139 }
6140 break;
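
// CMPXATTR compares an xattr as a string or a u64 and reports a failed
// comparison as -ECANCELED, which aborts the rest of the compound op.
// A client-side sketch of the usual compare-then-write pattern (object
// and attribute names are illustrative, assuming the librados C++ API):
//
//   librados::ObjectWriteOperation wop;
//   ceph::buffer::list expected, updated;
//   expected.append("v1");
//   updated.append("v2");
//   wop.cmpxattr("myattr", LIBRADOS_CMPXATTR_OP_EQ, expected);
//   wop.setxattr("myattr", updated);  // runs only if the compare passed
//   int r = ioctx.operate("myobj", &wop);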
6141
6142 case CEPH_OSD_OP_ASSERT_VER:
6143 ++ctx->num_read;
6144 {
6145 uint64_t ver = op.assert_ver.ver;
6146 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6147 if (!ver)
6148 result = -EINVAL;
6149 else if (ver < oi.user_version)
6150 result = -ERANGE;
6151 else if (ver > oi.user_version)
6152 result = -EOVERFLOW;
6153 }
6154 break;
6155
6156 case CEPH_OSD_OP_LIST_WATCHERS:
6157 ++ctx->num_read;
6158 {
6159 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6160 obj_list_watch_response_t resp;
6161
6162 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6163 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6164 ++oi_iter) {
6165 dout(20) << "key cookie=" << oi_iter->first.first
6166 << " entity=" << oi_iter->first.second << " "
6167 << oi_iter->second << dendl;
6168 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6169 ceph_assert(oi_iter->first.second.is_client());
6170
6171 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6172 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6173 resp.entries.push_back(wi);
6174 }
6175
6176 resp.encode(osd_op.outdata, ctx->get_features());
6177 result = 0;
6178
6179 ctx->delta_stats.num_rd++;
6180 break;
6181 }
6182
6183 case CEPH_OSD_OP_LIST_SNAPS:
6184 ++ctx->num_read;
6185 {
6186 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6187 obj_list_snap_response_t resp;
6188
6189 if (!ssc) {
6190 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6191 }
6192 ceph_assert(ssc);
6193 dout(20) << " snapset " << ssc->snapset << dendl;
6194
6195 int clonecount = ssc->snapset.clones.size();
6196 clonecount++; // for head
6197 resp.clones.reserve(clonecount);
6198 for (auto clone_iter = ssc->snapset.clones.begin();
6199 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6200 clone_info ci;
6201 ci.cloneid = *clone_iter;
6202
6203 hobject_t clone_oid = soid;
6204 clone_oid.snap = *clone_iter;
6205
6206 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6207 if (p == ssc->snapset.clone_snaps.end()) {
6208 osd->clog->error() << "osd." << osd->whoami
6209 << ": inconsistent clone_snaps found for oid "
6210 << soid << " clone " << *clone_iter
6211 << " snapset " << ssc->snapset;
6212 result = -EINVAL;
6213 break;
6214 }
6215 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6216 ci.snaps.push_back(*q);
6217 }
6218
6219 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6220
6221 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6222 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6223 if (coi == ssc->snapset.clone_overlap.end()) {
6224 osd->clog->error() << "osd." << osd->whoami
6225 << ": inconsistent clone_overlap found for oid "
6226 << soid << " clone " << *clone_iter;
6227 result = -EINVAL;
6228 break;
6229 }
6230 const interval_set<uint64_t> &o = coi->second;
6231 ci.overlap.reserve(o.num_intervals());
6232 for (interval_set<uint64_t>::const_iterator r = o.begin();
6233 r != o.end(); ++r) {
6234 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6235 r.get_len()));
6236 }
6237
6238 map<snapid_t, uint64_t>::const_iterator si;
6239 si = ssc->snapset.clone_size.find(ci.cloneid);
6240 if (si == ssc->snapset.clone_size.end()) {
6241 osd->clog->error() << "osd." << osd->whoami
6242 << ": inconsistent clone_size found for oid "
6243 << soid << " clone " << *clone_iter;
6244 result = -EINVAL;
6245 break;
6246 }
6247 ci.size = si->second;
6248
6249 resp.clones.push_back(ci);
6250 }
6251 if (result < 0) {
6252 break;
6253 }
6254 if (!ctx->obc->obs.oi.is_whiteout()) {
6255 ceph_assert(obs.exists);
6256 clone_info ci;
6257 ci.cloneid = CEPH_NOSNAP;
6258
6259 // size for HEAD is oi.size
6260 ci.size = oi.size;
6261
6262 resp.clones.push_back(ci);
6263 }
6264 resp.seq = ssc->snapset.seq;
6265
6266 resp.encode(osd_op.outdata);
6267 result = 0;
6268
6269 ctx->delta_stats.num_rd++;
6270 break;
6271 }
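
// The LIST_SNAPS reply describes each clone (snap ids, overlap intervals
// and size) and then, unless the head is a whiteout, appends a pseudo
// entry with cloneid CEPH_NOSNAP for the head itself. Any disagreement
// between the snapset's clone_snaps/clone_overlap/clone_size maps and the
// clone list is reported to the cluster log and surfaces as -EINVAL.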
6272
6273 case CEPH_OSD_OP_NOTIFY:
6274 ++ctx->num_read;
6275 {
6276 uint32_t timeout;
6277 bufferlist bl;
6278
6279 try {
6280 uint32_t ver; // obsolete
6281 decode(ver, bp);
6282 decode(timeout, bp);
6283 decode(bl, bp);
6284 } catch (const buffer::error &e) {
6285 timeout = 0;
6286 }
6287 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6288 if (!timeout)
6289 timeout = cct->_conf->osd_default_notify_timeout;
6290
6291 notify_info_t n;
6292 n.timeout = timeout;
6293 n.notify_id = osd->get_next_id(get_osdmap_epoch());
6294 n.cookie = op.notify.cookie;
6295 n.bl = bl;
6296 ctx->notifies.push_back(n);
6297
6298 // return our unique notify id to the client
6299 encode(n.notify_id, osd_op.outdata);
6300 }
6301 break;
6302
6303 case CEPH_OSD_OP_NOTIFY_ACK:
6304 ++ctx->num_read;
6305 {
6306 try {
6307 uint64_t notify_id = 0;
6308 uint64_t watch_cookie = 0;
6309 decode(notify_id, bp);
6310 decode(watch_cookie, bp);
6311 bufferlist reply_bl;
6312 if (!bp.end()) {
6313 decode(reply_bl, bp);
6314 }
6315 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6316 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6317 ctx->notify_acks.push_back(ack);
6318 } catch (const buffer::error &e) {
6319 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6320 OpContext::NotifyAck ack(
6321 // op.watch.cookie is actually the notify_id for historical reasons
6322 op.watch.cookie
6323 );
6324 ctx->notify_acks.push_back(ack);
6325 }
6326 }
6327 break;
6328
6329 case CEPH_OSD_OP_SETALLOCHINT:
6330 ++ctx->num_write;
6331 result = 0;
6332 {
6333 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6334 maybe_create_new_object(ctx);
6335 oi.expected_object_size = op.alloc_hint.expected_object_size;
6336 oi.expected_write_size = op.alloc_hint.expected_write_size;
6337 oi.alloc_hint_flags = op.alloc_hint.flags;
6338 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6339 op.alloc_hint.expected_write_size,
6340 op.alloc_hint.flags);
6341 }
6342 break;
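
// SETALLOCHINT only records the client's expectations so that the object
// store can pick a matching allocation strategy; it also creates the
// object if needed, which is why it counts as a write. Illustrative
// client-side use (object name and sizes are made up):
//
//   librados::ObjectWriteOperation wop;
//   wop.set_alloc_hint(4 << 20 /* expected object size */,
//                      1 << 20 /* expected write size */);
//   ioctx.operate("myobj", &wop);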
6343
6344
6345 // --- WRITES ---
6346
6347 // -- object data --
6348
6349 case CEPH_OSD_OP_WRITE:
6350 ++ctx->num_write;
6351 result = 0;
6352 { // write
6353 __u32 seq = oi.truncate_seq;
6354 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6355 if (op.extent.length != osd_op.indata.length()) {
6356 result = -EINVAL;
6357 break;
6358 }
6359
6360 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6361 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6362
6363 if (pool.info.requires_aligned_append() &&
6364 (op.extent.offset % pool.info.required_alignment() != 0)) {
6365 result = -EOPNOTSUPP;
6366 break;
6367 }
6368
6369 if (!obs.exists) {
6370 if (pool.info.requires_aligned_append() && op.extent.offset) {
6371 result = -EOPNOTSUPP;
6372 break;
6373 }
6374 } else if (op.extent.offset != oi.size &&
6375 pool.info.requires_aligned_append()) {
6376 result = -EOPNOTSUPP;
6377 break;
6378 }
6379
6380 if (seq && (seq > op.extent.truncate_seq) &&
6381 (op.extent.offset + op.extent.length > oi.size)) {
6382 // old write, arrived after trimtrunc
6383 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6384 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6385 << ", adjusting write length to " << op.extent.length << dendl;
6386 bufferlist t;
6387 t.substr_of(osd_op.indata, 0, op.extent.length);
6388 osd_op.indata.swap(t);
6389 }
6390 if (op.extent.truncate_seq > seq) {
6391 // write arrives before trimtrunc
6392 if (obs.exists && !oi.is_whiteout()) {
6393 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6394 << ", truncating to " << op.extent.truncate_size << dendl;
6395 t->truncate(soid, op.extent.truncate_size);
6396 oi.truncate_seq = op.extent.truncate_seq;
6397 oi.truncate_size = op.extent.truncate_size;
6398 if (oi.size > op.extent.truncate_size) {
6399 interval_set<uint64_t> trim;
6400 trim.insert(op.extent.truncate_size,
6401 oi.size - op.extent.truncate_size);
6402 ctx->modified_ranges.union_of(trim);
6403 ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
6404 }
6405 if (op.extent.truncate_size != oi.size) {
6406 truncate_update_size_and_usage(ctx->delta_stats,
6407 oi,
6408 op.extent.truncate_size);
6409 }
6410 } else {
6411 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6412 << ", but object is new" << dendl;
6413 oi.truncate_seq = op.extent.truncate_seq;
6414 oi.truncate_size = op.extent.truncate_size;
6415 }
6416 }
6417 result = check_offset_and_length(
6418 op.extent.offset, op.extent.length,
6419 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6420 if (result < 0)
6421 break;
6422
6423 maybe_create_new_object(ctx);
6424
6425 if (op.extent.length == 0) {
6426 if (op.extent.offset > oi.size) {
6427 t->truncate(
6428 soid, op.extent.offset);
6429 truncate_update_size_and_usage(ctx->delta_stats, oi,
6430 op.extent.offset);
6431 } else {
6432 t->nop(soid);
6433 }
6434 } else {
6435 t->write(
6436 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6437 }
6438
6439 if (op.extent.offset == 0 && op.extent.length >= oi.size
6440 && !skip_data_digest) {
6441 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6442 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6443 if (skip_data_digest) {
6444 obs.oi.clear_data_digest();
6445 } else {
6446 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6447 }
6448 } else {
6449 obs.oi.clear_data_digest();
6450 }
6451 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6452 op.extent.offset, op.extent.length);
6453 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6454 dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
6455 }
6456 break;
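
// Digest bookkeeping for WRITE, in short: a write covering the whole
// object from offset 0 re-seeds the digest with crc32c(-1); a pure append
// to a digest-bearing object chains the new bytes onto the existing crc
// via crc32c(old_digest); any other partial overwrite clears the digest,
// since the bytes that were not rewritten can no longer be accounted for.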
6457
6458 case CEPH_OSD_OP_WRITEFULL:
6459 ++ctx->num_write;
6460 result = 0;
6461 { // write full object
6462 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6463
6464 if (op.extent.length != osd_op.indata.length()) {
6465 result = -EINVAL;
6466 break;
6467 }
6468 result = check_offset_and_length(
6469 0, op.extent.length,
6470 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6471 if (result < 0)
6472 break;
6473
6474 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6475 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6476
6477 maybe_create_new_object(ctx);
6478 if (pool.info.is_erasure()) {
6479 t->truncate(soid, 0);
6480 } else if (obs.exists && op.extent.length < oi.size) {
6481 t->truncate(soid, op.extent.length);
6482 }
6483 if (op.extent.length) {
6484 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6485 }
6486 if (!skip_data_digest) {
6487 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6488 } else {
6489 obs.oi.clear_data_digest();
6490 }
6491 ctx->clean_regions.mark_data_region_dirty(0,
6492 std::max((uint64_t)op.extent.length, oi.size));
6493 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6494 0, op.extent.length, true);
6495 }
6496 break;
6497
6498 case CEPH_OSD_OP_WRITESAME:
6499 ++ctx->num_write;
6500 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6501 result = do_writesame(ctx, osd_op);
6502 break;
6503
6504 case CEPH_OSD_OP_ROLLBACK :
6505 ++ctx->num_write;
6506 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6507 result = _rollback_to(ctx, op);
6508 break;
6509
6510 case CEPH_OSD_OP_ZERO:
6511 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6512 if (pool.info.requires_aligned_append()) {
6513 result = -EOPNOTSUPP;
6514 break;
6515 }
6516 ++ctx->num_write;
6517 { // zero
6518 result = check_offset_and_length(
6519 op.extent.offset, op.extent.length,
6520 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6521 if (result < 0)
6522 break;
6523
6524 ceph_assert(op.extent.length);
6525 if (obs.exists && !oi.is_whiteout()) {
6526 t->zero(soid, op.extent.offset, op.extent.length);
6527 interval_set<uint64_t> ch;
6528 ch.insert(op.extent.offset, op.extent.length);
6529 ctx->modified_ranges.union_of(ch);
6530 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6531 ctx->delta_stats.num_wr++;
6532 oi.clear_data_digest();
6533 } else {
6534 // no-op
6535 }
6536 }
6537 break;
6538 case CEPH_OSD_OP_CREATE:
6539 ++ctx->num_write;
6540 result = 0;
6541 {
6542 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
6543 if (obs.exists && !oi.is_whiteout() &&
6544 (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
6545 result = -EEXIST; /* this is an exclusive create */
6546 } else {
6547 if (osd_op.indata.length()) {
6548 auto p = osd_op.indata.cbegin();
6549 string category;
6550 try {
6551 decode(category, p);
6552 }
6553 catch (buffer::error& e) {
6554 result = -EINVAL;
6555 goto fail;
6556 }
6557 // category is no longer implemented.
6558 }
6559 maybe_create_new_object(ctx);
6560 t->nop(soid);
6561 }
6562 }
6563 break;
6564
6565 case CEPH_OSD_OP_TRIMTRUNC:
6566 op.extent.offset = op.extent.truncate_size;
6567 // fall through
6568
6569 case CEPH_OSD_OP_TRUNCATE:
6570 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6571 if (pool.info.requires_aligned_append()) {
6572 result = -EOPNOTSUPP;
6573 break;
6574 }
6575 ++ctx->num_write;
6576 result = 0;
6577 {
6578 // truncate
6579 if (!obs.exists || oi.is_whiteout()) {
6580 dout(10) << " object dne, truncate is a no-op" << dendl;
6581 break;
6582 }
6583
6584 result = check_offset_and_length(
6585 op.extent.offset, op.extent.length,
6586 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6587 if (result < 0)
6588 break;
6589
6590 if (op.extent.truncate_seq) {
6591 ceph_assert(op.extent.offset == op.extent.truncate_size);
6592 if (op.extent.truncate_seq <= oi.truncate_seq) {
6593 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6594 << ", no-op" << dendl;
6595 break; // old
6596 }
6597 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6598 << ", truncating" << dendl;
6599 oi.truncate_seq = op.extent.truncate_seq;
6600 oi.truncate_size = op.extent.truncate_size;
6601 }
6602
6603 maybe_create_new_object(ctx);
6604 t->truncate(soid, op.extent.offset);
6605 if (oi.size > op.extent.offset) {
6606 interval_set<uint64_t> trim;
6607 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6608 ctx->modified_ranges.union_of(trim);
6609 ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
6610 } else if (oi.size < op.extent.offset) {
6611 ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
6612 }
6613 if (op.extent.offset != oi.size) {
6614 truncate_update_size_and_usage(ctx->delta_stats,
6615 oi,
6616 op.extent.offset);
6617 }
6618 ctx->delta_stats.num_wr++;
6619 // do not set exists, or we will break the ZERO -> TRUNCATE munging above.
6620
6621 oi.clear_data_digest();
6622 }
6623 break;
6624
6625 case CEPH_OSD_OP_DELETE:
6626 ++ctx->num_write;
6627 result = 0;
6628 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6629 {
6630 if (oi.has_manifest()) {
6631 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && oi.manifest.is_redirect()) {
6632 ctx->register_on_commit(
6633 [oi, ctx, this](){
6634 object_locator_t target_oloc(oi.manifest.redirect_target);
6635 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
6636 SnapContext(), false, NULL, 0);
6637 });
6638 } else if (oi.manifest.is_chunked()) {
6639 ctx->register_on_commit(
6640 [oi, ctx, this](){
6641 for (auto p : oi.manifest.chunk_map) {
6642 if (p.second.has_reference()) {
6643 object_locator_t target_oloc(p.second.oid);
6644 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
6645 SnapContext(), false, NULL, p.first);
6646 }
6647 }
6648 });
6649 }
6650 }
6651 result = _delete_oid(ctx, false, ctx->ignore_cache);
6652 }
6653 break;
6654
6655 case CEPH_OSD_OP_WATCH:
6656 ++ctx->num_write;
6657 result = 0;
6658 {
6659 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6660 op.watch.cookie, op.watch.op);
6661 if (!obs.exists) {
6662 result = -ENOENT;
6663 break;
6664 }
6665 result = 0;
6666 uint64_t cookie = op.watch.cookie;
6667 entity_name_t entity = ctx->reqid.name;
6668 ObjectContextRef obc = ctx->obc;
6669
6670 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6671 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6672 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6673 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6674 dout(10) << "watch: peer_addr="
6675 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6676
6677 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6678 if (op.watch.timeout != 0) {
6679 timeout = op.watch.timeout;
6680 }
6681
6682 watch_info_t w(cookie, timeout,
6683 ctx->op->get_req()->get_connection()->get_peer_addr());
6684 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6685 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6686 if (oi.watchers.count(make_pair(cookie, entity))) {
6687 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6688 } else {
6689 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6690 oi.watchers[make_pair(cookie, entity)] = w;
6691 t->nop(soid); // make sure we update the object_info on disk!
6692 }
6693 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6694 ctx->watch_connects.push_back(make_pair(w, will_ping));
6695 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6696 if (!oi.watchers.count(make_pair(cookie, entity))) {
6697 result = -ENOTCONN;
6698 break;
6699 }
6700 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6701 ctx->watch_connects.push_back(make_pair(w, true));
6702 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6703 /* Note: WATCH with PING doesn't cause may_write() to return true,
6704 * so if there is nothing else in the transaction, this is going
6705 * to run do_osd_op_effects, but not write out a log entry */
6706 if (!oi.watchers.count(make_pair(cookie, entity))) {
6707 result = -ENOTCONN;
6708 break;
6709 }
6710 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6711 obc->watchers.find(make_pair(cookie, entity));
6712 if (p == obc->watchers.end() ||
6713 !p->second->is_connected()) {
6714 // client needs to reconnect
6715 result = -ETIMEDOUT;
6716 break;
6717 }
6718 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6719 p->second->got_ping(ceph_clock_now());
6720 result = 0;
6721 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6722 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6723 oi.watchers.find(make_pair(cookie, entity));
6724 if (oi_iter != oi.watchers.end()) {
6725 dout(10) << " removed watch " << oi_iter->second << " by "
6726 << entity << dendl;
6727 oi.watchers.erase(oi_iter);
6728 t->nop(soid); // update oi on disk
6729 ctx->watch_disconnects.push_back(
6730 watch_disconnect_t(cookie, entity, false));
6731 } else {
6732 dout(10) << " can't remove: no watch by " << entity << dendl;
6733 }
6734 }
6735 }
6736 break;
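
// WATCH multiplexes five sub-ops on op.watch.op: WATCH and LEGACY_WATCH
// register a watcher (legacy watches do not ping), RECONNECT revalidates
// an existing registration, PING refreshes liveness and answers
// -ETIMEDOUT if the server side has already torn the session down, and
// UNWATCH removes the registration. In each mutating branch t->nop()
// forces the transaction through so the updated oi.watchers map is
// persisted with the object_info.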
6737
6738 case CEPH_OSD_OP_CACHE_PIN:
6739 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6740 if ((!pool.info.is_tier() ||
6741 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6742 result = -EINVAL;
6743 dout(10) << " object pinning is only allowed on the cache tier" << dendl;
6744 break;
6745 }
6746 ++ctx->num_write;
6747 result = 0;
6748 {
6749 if (!obs.exists || oi.is_whiteout()) {
6750 result = -ENOENT;
6751 break;
6752 }
6753
6754 if (!oi.is_cache_pinned()) {
6755 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6756 ctx->modify = true;
6757 ctx->delta_stats.num_objects_pinned++;
6758 ctx->delta_stats.num_wr++;
6759 }
6760 }
6761 break;
6762
6763 case CEPH_OSD_OP_CACHE_UNPIN:
6764 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6765 if ((!pool.info.is_tier() ||
6766 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6767 result = -EINVAL;
6768 dout(10) << " object unpinning is only allowed on the cache tier" << dendl;
6769 break;
6770 }
6771 ++ctx->num_write;
6772 result = 0;
6773 {
6774 if (!obs.exists || oi.is_whiteout()) {
6775 result = -ENOENT;
6776 break;
6777 }
6778
6779 if (oi.is_cache_pinned()) {
6780 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6781 ctx->modify = true;
6782 ctx->delta_stats.num_objects_pinned--;
6783 ctx->delta_stats.num_wr++;
6784 }
6785 }
6786 break;
6787
6788 case CEPH_OSD_OP_SET_REDIRECT:
6789 ++ctx->num_write;
6790 result = 0;
6791 {
6792 if (pool.info.is_tier()) {
6793 result = -EINVAL;
6794 break;
6795 }
6796 if (!obs.exists) {
6797 result = -ENOENT;
6798 break;
6799 }
6800 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
6801 result = -EOPNOTSUPP;
6802 break;
6803 }
6804
6805 object_t target_name;
6806 object_locator_t target_oloc;
6807 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6808 version_t target_version = op.copy_from.src_version;
6809 try {
6810 decode(target_name, bp);
6811 decode(target_oloc, bp);
6812 }
6813 catch (buffer::error& e) {
6814 result = -EINVAL;
6815 goto fail;
6816 }
6817 pg_t raw_pg;
6818 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6819 hobject_t target(target_name, target_oloc.key, target_snapid,
6820 raw_pg.ps(), raw_pg.pool(),
6821 target_oloc.nspace);
6822 if (target == soid) {
6823 dout(20) << " set-redirect self is invalid" << dendl;
6824 result = -EINVAL;
6825 break;
6826 }
6827
6828 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6829 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6830 if (has_reference) {
6831 result = -EINVAL;
6832 dout(5) << " the object is already a manifest " << dendl;
6833 break;
6834 }
6835 if (op_finisher == nullptr && need_reference) {
6836 // start
6837 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6838 new SetManifestFinisher(osd_op));
6839 RefCountCallback *fin = new RefCountCallback(ctx, osd_op);
6840 refcount_manifest(ctx->obc, target_oloc, target, SnapContext(),
6841 true, fin, 0);
6842 result = -EINPROGRESS;
6843 } else {
6844 // finish
6845 if (op_finisher) {
6846 result = op_finisher->execute();
6847 ceph_assert(result == 0);
6848 }
6849
6850 if (!oi.has_manifest() && !oi.manifest.is_redirect())
6851 ctx->delta_stats.num_objects_manifest++;
6852
6853 oi.set_flag(object_info_t::FLAG_MANIFEST);
6854 oi.manifest.redirect_target = target;
6855 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6856 t->truncate(soid, 0);
6857 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
6858 if (oi.is_omap() && pool.info.supports_omap()) {
6859 t->omap_clear(soid);
6860 obs.oi.clear_omap_digest();
6861 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6862 ctx->clean_regions.mark_omap_dirty();
6863 }
6864 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6865 0, oi.size, false);
6866 ctx->delta_stats.num_bytes -= oi.size;
6867 oi.size = 0;
6868 oi.new_object();
6869 oi.user_version = target_version;
6870 ctx->user_at_version = target_version;
6871 /* rm_attrs */
6872 map<string,bufferlist> rmattrs;
6873 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
6874 if (result < 0) {
6875 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
6876 return result;
6877 }
6878 map<string, bufferlist>::iterator iter;
6879 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6880 const string& name = iter->first;
6881 t->rmattr(soid, name);
6882 }
6883 if (!has_reference && need_reference) {
6884 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6885 }
6886 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6887 if (op_finisher) {
6888 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6889 }
6890 }
6891 }
6892
6893 break;
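
// SET_REDIRECT with CEPH_OSD_OP_FLAG_WITH_REFERENCE is a two-pass op:
// the first pass installs a SetManifestFinisher, asks refcount_manifest()
// to take a reference on the target, and parks the op with -EINPROGRESS;
// when the refcount ack arrives the op re-executes, finds the finisher,
// and only then hollows the local object into a redirect stub (truncate
// to 0, drop omap and xattrs, adopt the target's user_version).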
6894
6895 case CEPH_OSD_OP_SET_CHUNK:
6896 ++ctx->num_write;
6897 result = 0;
6898 {
6899 if (pool.info.is_tier()) {
6900 result = -EINVAL;
6901 break;
6902 }
6903 if (!obs.exists) {
6904 result = -ENOENT;
6905 break;
6906 }
6907 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
6908 result = -EOPNOTSUPP;
6909 break;
6910 }
6911
6912 object_locator_t tgt_oloc;
6913 uint64_t src_offset, src_length, tgt_offset;
6914 object_t tgt_name;
6915 try {
6916 decode(src_offset, bp);
6917 decode(src_length, bp);
6918 decode(tgt_oloc, bp);
6919 decode(tgt_name, bp);
6920 decode(tgt_offset, bp);
6921 }
6922 catch (buffer::error& e) {
6923 result = -EINVAL;
6924 goto fail;
6925 }
6926
6927 if (!src_length) {
6928 result = -EINVAL;
6929 goto fail;
6930 }
6931
6932 for (auto &p : oi.manifest.chunk_map) {
6933 if ((p.first <= src_offset && p.first + p.second.length > src_offset) ||
6934 (p.first > src_offset && p.first <= src_offset + src_length)) {
6935 dout(20) << __func__ << " source range overlaps an existing chunk; offset: " << src_offset << " length: " << src_length
6936 << " chunk_info: " << p << dendl;
6937 result = -EOPNOTSUPP;
6938 goto fail;
6939 }
6940 }
6941
6942 if (!oi.manifest.is_chunked()) {
6943 oi.manifest.clear();
6944 }
6945
6946 pg_t raw_pg;
6947 chunk_info_t chunk_info;
6948 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
6949 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
6950 raw_pg.ps(), raw_pg.pool(),
6951 tgt_oloc.nspace);
6952 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6953 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
6954 (oi.manifest.chunk_map[src_offset].flags & chunk_info_t::FLAG_HAS_REFERENCE);
6955 if (has_reference) {
6956 result = -EINVAL;
6957 dout(5) << " the object is already a manifest " << dendl;
6958 break;
6959 }
6960 if (op_finisher == nullptr && need_reference) {
6961 // start
6962 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6963 new SetManifestFinisher(osd_op));
6964 RefCountCallback *fin = new RefCountCallback(ctx, osd_op);
6965 refcount_manifest(ctx->obc, tgt_oloc, target, SnapContext(),
6966 true, fin, src_offset);
6967 result = -EINPROGRESS;
6968 } else {
6969 if (op_finisher) {
6970 result = op_finisher->execute();
6971 ceph_assert(result == 0);
6972 }
6973
6974 chunk_info_t chunk_info;
6975 chunk_info.set_flag(chunk_info_t::FLAG_MISSING);
6976 chunk_info.oid = target;
6977 chunk_info.offset = tgt_offset;
6978 chunk_info.length= src_length;
6979 oi.manifest.chunk_map[src_offset] = chunk_info;
6980 if (!oi.has_manifest() && !oi.manifest.is_chunked())
6981 ctx->delta_stats.num_objects_manifest++;
6982 oi.set_flag(object_info_t::FLAG_MANIFEST);
6983 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
6984 if (!has_reference && need_reference) {
6985 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
6986 }
6987 if (need_reference && pool.info.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE) {
6988 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT);
6989 }
6990 ctx->modify = true;
6991
6992 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
6993 << " chunk_info: " << chunk_info << dendl;
6994 if (op_finisher) {
6995 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6996 }
6997 }
6998 }
6999
7000 break;
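
// SET_CHUNK follows the same two-pass reference handshake as SET_REDIRECT,
// but instead of replacing the whole object it records one entry in
// oi.manifest.chunk_map, keyed by source offset and pointing at
// (target oid, tgt_offset, length), initially flagged MISSING. A source
// range that overlaps an existing chunk is rejected with -EOPNOTSUPP.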
7001
7002 case CEPH_OSD_OP_TIER_PROMOTE:
7003 ++ctx->num_write;
7004 result = 0;
7005 {
7006 if (pool.info.is_tier()) {
7007 result = -EINVAL;
7008 break;
7009 }
7010 if (!obs.exists) {
7011 result = -ENOENT;
7012 break;
7013 }
7014 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7015 result = -EOPNOTSUPP;
7016 break;
7017 }
7018 if (!obs.oi.has_manifest()) {
7019 result = 0;
7020 break;
7021 }
7022
7023 if (op_finisher == nullptr) {
7024 PromoteManifestCallback *cb;
7025 object_locator_t my_oloc;
7026 hobject_t src_hoid;
7027
7028 if (obs.oi.manifest.is_chunked()) {
7029 src_hoid = obs.oi.soid;
7030 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7031 } else if (obs.oi.manifest.is_redirect()) {
7032 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7033 my_oloc = src_oloc;
7034 src_hoid = obs.oi.manifest.redirect_target;
7035 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7036 } else {
7037 ceph_abort_msg("unrecognized manifest type");
7038 }
7039 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7040 new PromoteFinisher(cb));
7041 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7042 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7043 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7044 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7045 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7046 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7047 obs.oi.soid.snap == CEPH_NOSNAP,
7048 src_fadvise_flags, 0);
7049
7050 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7051 result = -EINPROGRESS;
7052 } else {
7053 result = op_finisher->execute();
7054 ceph_assert(result == 0);
7055 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7056 }
7057 }
7058
7059 break;
7060
7061 case CEPH_OSD_OP_TIER_FLUSH:
7062 ++ctx->num_write;
7063 result = 0;
7064 {
7065 if (pool.info.is_tier()) {
7066 result = -EINVAL;
7067 break;
7068 }
7069 if (!obs.exists) {
7070 result = -ENOENT;
7071 break;
7072 }
7073 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7074 result = -EOPNOTSUPP;
7075 break;
7076 }
7077 if (!obs.oi.has_manifest()) {
7078 result = 0;
7079 break;
7080 }
7081
7082 hobject_t missing;
7083 bool is_dirty = false;
7084 for (auto& p : ctx->obc->obs.oi.manifest.chunk_map) {
7085 if (p.second.is_dirty()) {
7086 is_dirty = true;
7087 break;
7088 }
7089 }
7090
7091 if (is_dirty) {
7092 result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
7093 if (result == -EINPROGRESS)
7094 result = -EAGAIN;
7095 } else {
7096 result = 0;
7097 }
7098 }
7099
7100 break;
7101
7102 case CEPH_OSD_OP_UNSET_MANIFEST:
7103 ++ctx->num_write;
7104 result = 0;
7105 {
7106 if (pool.info.is_tier()) {
7107 result = -EINVAL;
7108 break;
7109 }
7110 if (!obs.exists) {
7111 result = -ENOENT;
7112 break;
7113 }
7114 if (!oi.has_manifest()) {
7115 result = -EOPNOTSUPP;
7116 break;
7117 }
7118 if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7119 result = -EOPNOTSUPP;
7120 break;
7121 }
7122
7123 if (oi.manifest.is_redirect()) {
7124 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
7125 ctx->register_on_commit(
7126 [oi, ctx, this](){
7127 object_locator_t target_oloc(oi.manifest.redirect_target);
7128 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
7129 SnapContext(), false, NULL, 0);
7130 });
7131 }
7132 } else if (oi.manifest.is_chunked()) {
7133 ctx->register_on_commit(
7134 [oi, ctx, this](){
7135 for (auto p : oi.manifest.chunk_map) {
7136 if (p.second.flags & chunk_info_t::FLAG_HAS_REFERENCE) {
7137 object_locator_t target_oloc(p.second.oid);
7138 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
7139 SnapContext(), false, NULL, p.first);
7140 }
7141 }
7142 });
7143 } else {
7144 ceph_abort_msg("unrecognized manifest type");
7145 }
7146
7147 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7148 oi.manifest = object_manifest_t();
7149 ctx->delta_stats.num_objects_manifest--;
7150 ctx->delta_stats.num_wr++;
7151 ctx->modify = true;
7152 }
7153
7154 break;
7155
7156 // -- object attrs --
7157
7158 case CEPH_OSD_OP_SETXATTR:
7159 ++ctx->num_write;
7160 result = 0;
7161 {
7162 if (cct->_conf->osd_max_attr_size > 0 &&
7163 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7164 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7165 result = -EFBIG;
7166 break;
7167 }
7168 unsigned max_name_len =
7169 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7170 cct->_conf->osd_max_attr_name_len);
7171 if (op.xattr.name_len > max_name_len) {
7172 result = -ENAMETOOLONG;
7173 break;
7174 }
7175 maybe_create_new_object(ctx);
7176 string aname;
7177 bp.copy(op.xattr.name_len, aname);
7178 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7179 string name = "_" + aname;
7180 bufferlist bl;
7181 bp.copy(op.xattr.value_len, bl);
7182 t->setattr(soid, name, bl);
7183 ctx->delta_stats.num_wr++;
7184 }
7185 break;
7186
7187 case CEPH_OSD_OP_RMXATTR:
7188 ++ctx->num_write;
7189 result = 0;
7190 {
7191 string aname;
7192 bp.copy(op.xattr.name_len, aname);
7193 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7194 if (!obs.exists || oi.is_whiteout()) {
7195 result = -ENOENT;
7196 break;
7197 }
7198 string name = "_" + aname;
7199 t->rmattr(soid, name);
7200 ctx->delta_stats.num_wr++;
7201 }
7202 break;
7203
7204
7205 // -- fancy writers --
7206 case CEPH_OSD_OP_APPEND:
7207 {
7208 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7209 // just do it inline; this works because we are happy to execute
7210 // fancy ops on replicas as well.
7211 vector<OSDOp> nops(1);
7212 OSDOp& newop = nops[0];
7213 newop.op.op = CEPH_OSD_OP_WRITE;
7214 newop.op.extent.offset = oi.size;
7215 newop.op.extent.length = op.extent.length;
7216 newop.op.extent.truncate_seq = oi.truncate_seq;
7217 newop.indata = osd_op.indata;
7218 result = do_osd_ops(ctx, nops);
7219 osd_op.outdata.claim(newop.outdata);
7220 }
7221 break;
7222
7223 case CEPH_OSD_OP_STARTSYNC:
7224 result = 0;
7225 t->nop(soid);
7226 break;
7227
7228 // -- trivial map --
7229 case CEPH_OSD_OP_TMAPGET:
7230 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
7231 if (pool.info.is_erasure()) {
7232 result = -EOPNOTSUPP;
7233 break;
7234 }
7235 {
7236 vector<OSDOp> nops(1);
7237 OSDOp& newop = nops[0];
7238 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7239 newop.op.extent.offset = 0;
7240 newop.op.extent.length = 0;
7241 result = do_osd_ops(ctx, nops);
7242 osd_op.outdata.claim(newop.outdata);
7243 }
7244 break;
7245
7246 case CEPH_OSD_OP_TMAPPUT:
7247 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
7248 if (pool.info.is_erasure()) {
7249 result = -EOPNOTSUPP;
7250 break;
7251 }
7252 {
7256
7257 // verify sort order
7258 bool unsorted = false;
7259 if (true) {
7260 bufferlist header;
7261 decode(header, bp);
7262 uint32_t n;
7263 decode(n, bp);
7264 string last_key;
7265 while (n--) {
7266 string key;
7267 decode(key, bp);
7268 dout(10) << "tmapput key " << key << dendl;
7269 bufferlist val;
7270 decode(val, bp);
7271 if (key < last_key) {
7272 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7273 unsorted = true;
7274 break;
7275 }
7276 last_key = key;
7277 }
7278 }
7279
7280 // write it
7281 vector<OSDOp> nops(1);
7282 OSDOp& newop = nops[0];
7283 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7284 newop.op.extent.offset = 0;
7285 newop.op.extent.length = osd_op.indata.length();
7286 newop.indata = osd_op.indata;
7287
7288 if (unsorted) {
7289 bp = osd_op.indata.begin();
7290 bufferlist header;
7291 map<string, bufferlist> m;
7292 decode(header, bp);
7293 decode(m, bp);
7294 ceph_assert(bp.end());
7295 bufferlist newbl;
7296 encode(header, newbl);
7297 encode(m, newbl);
7298 newop.indata = newbl;
7299 }
7300 result = do_osd_ops(ctx, nops);
7301 ceph_assert(result == 0);
7302 }
7303 break;
7304
7305 case CEPH_OSD_OP_TMAPUP:
7306 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
7307 if (pool.info.is_erasure()) {
7308 result = -EOPNOTSUPP;
7309 break;
7310 }
7311 ++ctx->num_write;
7312 result = do_tmapup(ctx, bp, osd_op);
7313 break;
7314
7315 case CEPH_OSD_OP_TMAP2OMAP:
7316 ++ctx->num_write;
7317 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7318 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7319 break;
7320
7321 // OMAP Read ops
7322 case CEPH_OSD_OP_OMAPGETKEYS:
7323 ++ctx->num_read;
7324 {
7325 string start_after;
7326 uint64_t max_return;
7327 try {
7328 decode(start_after, bp);
7329 decode(max_return, bp);
7330 }
7331 catch (buffer::error& e) {
7332 result = -EINVAL;
7333 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7334 goto fail;
7335 }
7336 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7337 max_return = cct->_conf->osd_max_omap_entries_per_request;
7338 }
7339 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7340
7341 bufferlist bl;
7342 uint32_t num = 0;
7343 bool truncated = false;
7344 if (oi.is_omap()) {
7345 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7346 ch, ghobject_t(soid)
7347 );
7348 ceph_assert(iter);
7349 iter->upper_bound(start_after);
7350 for (num = 0; iter->valid(); ++num, iter->next()) {
7351 if (num >= max_return ||
7352 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7353 truncated = true;
7354 break;
7355 }
7356 encode(iter->key(), bl);
7357 }
7358 } // else return empty out_set
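// reply format: key count, the encoded keys, then a truncated flag
// telling the client whether another request is needed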
7359 encode(num, osd_op.outdata);
7360 osd_op.outdata.claim_append(bl);
7361 encode(truncated, osd_op.outdata);
7362 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7363 ctx->delta_stats.num_rd++;
7364 }
7365 break;
7366
7367 case CEPH_OSD_OP_OMAPGETVALS:
7368 ++ctx->num_read;
7369 {
7370 string start_after;
7371 uint64_t max_return;
7372 string filter_prefix;
7373 try {
7374 decode(start_after, bp);
7375 decode(max_return, bp);
7376 decode(filter_prefix, bp);
7377 }
7378 catch (buffer::error& e) {
7379 result = -EINVAL;
7380 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7381 goto fail;
7382 }
7383 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7384 max_return = cct->_conf->osd_max_omap_entries_per_request;
7385 }
7386 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7387
7388 uint32_t num = 0;
7389 bool truncated = false;
7390 bufferlist bl;
7391 if (oi.is_omap()) {
7392 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7393 ch, ghobject_t(soid)
7394 );
7395 if (!iter) {
7396 result = -ENOENT;
7397 goto fail;
7398 }
7399 iter->upper_bound(start_after);
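// if the prefix sorts after start_after, jump straight to the first
// key with that prefix instead of scanning from start_after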
7400 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7401 for (num = 0;
7402 iter->valid() &&
7403 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
7404 ++num, iter->next()) {
7405 dout(20) << "Found key " << iter->key() << dendl;
7406 if (num >= max_return ||
7407 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7408 truncated = true;
7409 break;
7410 }
7411 encode(iter->key(), bl);
7412 encode(iter->value(), bl);
7413 }
7414 } // else return empty out_set
7415 encode(num, osd_op.outdata);
7416 osd_op.outdata.claim_append(bl);
7417 encode(truncated, osd_op.outdata);
7418 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7419 ctx->delta_stats.num_rd++;
7420 }
7421 break;
7422
7423 case CEPH_OSD_OP_OMAPGETHEADER:
7424 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7425 if (!oi.is_omap()) {
7426 // return empty header
7427 break;
7428 }
7429 ++ctx->num_read;
7430 {
7431 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
7432 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7433 ctx->delta_stats.num_rd++;
7434 }
7435 break;
7436
7437 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7438 ++ctx->num_read;
7439 {
7440 set<string> keys_to_get;
7441 try {
7442 decode(keys_to_get, bp);
7443 }
7444 catch (buffer::error& e) {
7445 result = -EINVAL;
7446 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7447 goto fail;
7448 }
7449 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7450 map<string, bufferlist> out;
7451 if (oi.is_omap()) {
7452 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7453 } // else return empty omap entries
7454 encode(out, osd_op.outdata);
7455 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7456 ctx->delta_stats.num_rd++;
7457 }
7458 break;
7459
7460 case CEPH_OSD_OP_OMAP_CMP:
7461 ++ctx->num_read;
7462 {
7463 if (!obs.exists || oi.is_whiteout()) {
7464 result = -ENOENT;
7465 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7466 break;
7467 }
7468 map<string, pair<bufferlist, int> > assertions;
7469 try {
7470 decode(assertions, bp);
7471 }
7472 catch (buffer::error& e) {
7473 result = -EINVAL;
7474 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7475 goto fail;
7476 }
7477 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7478
7479 map<string, bufferlist> out;
7480
7481 if (oi.is_omap()) {
7482 set<string> to_get;
7483 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7484 i != assertions.end();
7485 ++i)
7486 to_get.insert(i->first);
7487 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7488 to_get, &out);
7489 if (r < 0) {
7490 result = r;
7491 break;
7492 }
7493 } // else leave out empty
7494
7495 // should set num_rd_kb based on the encoded length of the map
7496 ctx->delta_stats.num_rd++;
7497
7498 int r = 0;
7499 bufferlist empty;
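// compare each asserted value against the stored one; a key absent
// from the object compares as an empty bufferlist, and the first
// failed assertion aborts the op with -ECANCELED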
7500 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7501 i != assertions.end();
7502 ++i) {
7503 auto out_entry = out.find(i->first);
7504 bufferlist &bl = (out_entry != out.end()) ?
7505 out_entry->second : empty;
7506 switch (i->second.second) {
7507 case CEPH_OSD_CMPXATTR_OP_EQ:
7508 if (!(bl == i->second.first)) {
7509 r = -ECANCELED;
7510 }
7511 break;
7512 case CEPH_OSD_CMPXATTR_OP_LT:
7513 if (!(bl < i->second.first)) {
7514 r = -ECANCELED;
7515 }
7516 break;
7517 case CEPH_OSD_CMPXATTR_OP_GT:
7518 if (!(bl > i->second.first)) {
7519 r = -ECANCELED;
7520 }
7521 break;
7522 default:
7523 r = -EINVAL;
7524 break;
7525 }
7526 if (r < 0)
7527 break;
7528 }
7529 if (r < 0) {
7530 result = r;
7531 }
7532 }
7533 break;
7534
7535 // OMAP Write ops
7536 case CEPH_OSD_OP_OMAPSETVALS:
7537 if (!pool.info.supports_omap()) {
7538 result = -EOPNOTSUPP;
7539 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7540 break;
7541 }
7542 ++ctx->num_write;
7543 result = 0;
7544 {
7545 maybe_create_new_object(ctx);
7546 bufferlist to_set_bl;
7547 try {
7548 decode_str_str_map_to_bl(bp, &to_set_bl);
7549 }
7550 catch (buffer::error& e) {
7551 result = -EINVAL;
7552 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7553 goto fail;
7554 }
7555 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
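// decode the map a second time purely for logging, and only when
// debug level 20 is actually being gathered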
7556 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7557 dout(20) << "setting vals: " << dendl;
7558 map<string,bufferlist> to_set;
7559 bufferlist::const_iterator pt = to_set_bl.begin();
7560 decode(to_set, pt);
7561 for (map<string, bufferlist>::iterator i = to_set.begin();
7562 i != to_set.end();
7563 ++i) {
7564 dout(20) << "\t" << i->first << dendl;
7565 }
7566 }
7567 t->omap_setkeys(soid, to_set_bl);
7568 ctx->clean_regions.mark_omap_dirty();
7569 ctx->delta_stats.num_wr++;
7570 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7571 }
7572 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7573 obs.oi.clear_omap_digest();
7574 break;
7575
7576 case CEPH_OSD_OP_OMAPSETHEADER:
7577 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7578 if (!pool.info.supports_omap()) {
7579 result = -EOPNOTSUPP;
7580 break;
7581 }
7582 ++ctx->num_write;
7583 result = 0;
7584 {
7585 maybe_create_new_object(ctx);
7586 t->omap_setheader(soid, osd_op.indata);
7587 ctx->clean_regions.mark_omap_dirty();
7588 ctx->delta_stats.num_wr++;
7589 }
7590 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7591 obs.oi.clear_omap_digest();
7592 break;
7593
7594 case CEPH_OSD_OP_OMAPCLEAR:
7595 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7596 if (!pool.info.supports_omap()) {
7597 result = -EOPNOTSUPP;
7598 break;
7599 }
7600 ++ctx->num_write;
7601 result = 0;
7602 {
7603 if (!obs.exists || oi.is_whiteout()) {
7604 result = -ENOENT;
7605 break;
7606 }
7607 if (oi.is_omap()) {
7608 t->omap_clear(soid);
7609 ctx->clean_regions.mark_omap_dirty();
7610 ctx->delta_stats.num_wr++;
7611 obs.oi.clear_omap_digest();
7612 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7613 }
7614 }
7615 break;
7616
7617 case CEPH_OSD_OP_OMAPRMKEYS:
7618 if (!pool.info.supports_omap()) {
7619 result = -EOPNOTSUPP;
7620 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7621 break;
7622 }
7623 ++ctx->num_write;
7624 result = 0;
7625 {
7626 if (!obs.exists || oi.is_whiteout()) {
7627 result = -ENOENT;
7628 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7629 break;
7630 }
7631 bufferlist to_rm_bl;
7632 try {
7633 decode_str_set_to_bl(bp, &to_rm_bl);
7634 }
7635 catch (buffer::error& e) {
7636 result = -EINVAL;
7637 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7638 goto fail;
7639 }
7640 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7641 t->omap_rmkeys(soid, to_rm_bl);
7642 ctx->clean_regions.mark_omap_dirty();
7643 ctx->delta_stats.num_wr++;
7644 }
7645 obs.oi.clear_omap_digest();
7646 break;
7647
7648 case CEPH_OSD_OP_OMAPRMKEYRANGE:
7649 tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7650 if (!pool.info.supports_omap()) {
7651 result = -EOPNOTSUPP;
7652 break;
7653 }
7654 ++ctx->num_write;
7655 result = 0;
7656 {
7657 if (!obs.exists || oi.is_whiteout()) {
7658 result = -ENOENT;
7659 break;
7660 }
7661 std::string key_begin, key_end;
7662 try {
7663 decode(key_begin, bp);
7664 decode(key_end, bp);
7665 } catch (buffer::error& e) {
7666 result = -EINVAL;
7667 goto fail;
7668 }
7669 t->omap_rmkeyrange(soid, key_begin, key_end);
7670 ctx->delta_stats.num_wr++;
7671 }
7672 obs.oi.clear_omap_digest();
7673 break;
7674
7675 case CEPH_OSD_OP_COPY_GET:
7676 ++ctx->num_read;
7677 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7678 soid.snap.val);
7679 if (op_finisher == nullptr) {
7680 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7681 } else {
7682 result = op_finisher->execute();
7683 }
7684 break;
7685
7686 case CEPH_OSD_OP_COPY_FROM:
7687 case CEPH_OSD_OP_COPY_FROM2:
7688 ++ctx->num_write;
7689 result = 0;
7690 {
7691 object_t src_name;
7692 object_locator_t src_oloc;
7693 uint32_t truncate_seq = 0;
7694 uint64_t truncate_size = 0;
7695 bool have_truncate = false;
7696 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7697 version_t src_version = op.copy_from.src_version;
7698
7699 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7700 (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
7701 dout(20) << "invalid copy-from2 flags 0x"
7702 << std::hex << (int)op.copy_from.flags << std::dec << dendl;
7703 result = -EINVAL;
7704 break;
7705 }
7706 try {
7707 decode(src_name, bp);
7708 decode(src_oloc, bp);
7709 // check if client sent us truncate_seq and truncate_size
7710 if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7711 (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
7712 decode(truncate_seq, bp);
7713 decode(truncate_size, bp);
7714 have_truncate = true;
7715 }
7716 }
7717 catch (buffer::error& e) {
7718 result = -EINVAL;
7719 tracepoint(osd,
7720 do_osd_op_pre_copy_from,
7721 soid.oid.name.c_str(),
7722 soid.snap.val,
7723 "???",
7724 0,
7725 "???",
7726 "???",
7727 0,
7728 src_snapid,
7729 src_version);
7730 goto fail;
7731 }
7732 tracepoint(osd,
7733 do_osd_op_pre_copy_from,
7734 soid.oid.name.c_str(),
7735 soid.snap.val,
7736 src_name.name.c_str(),
7737 src_oloc.pool,
7738 src_oloc.key.c_str(),
7739 src_oloc.nspace.c_str(),
7740 src_oloc.hash,
7741 src_snapid,
7742 src_version);
7743 if (op_finisher == nullptr) {
7744 // start
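// first pass: kick off the async copy and return -EINPROGRESS; the
// op is requeued when the copy completes and re-enters this case
// with an op_finisher set, taking the "finish" branch below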
7745 pg_t raw_pg;
7746 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
7747 hobject_t src(src_name, src_oloc.key, src_snapid,
7748 raw_pg.ps(), raw_pg.pool(),
7749 src_oloc.nspace);
7750 if (src == soid) {
7751 dout(20) << " copy from self is invalid" << dendl;
7752 result = -EINVAL;
7753 break;
7754 }
7755 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
7756 if (have_truncate)
7757 cb->set_truncate(truncate_seq, truncate_size);
7758 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7759 new CopyFromFinisher(cb));
7760 start_copy(cb, ctx->obc, src, src_oloc, src_version,
7761 op.copy_from.flags,
7762 false,
7763 op.copy_from.src_fadvise_flags,
7764 op.flags);
7765 result = -EINPROGRESS;
7766 } else {
7767 // finish
7768 result = op_finisher->execute();
7769 ceph_assert(result == 0);
7770
7771 // COPY_FROM cannot be executed multiple times -- it must restart
7772 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7773 }
7774 }
7775 break;
7776
7777 default:
7778 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
7779 dout(1) << "unrecognized osd op " << op.op
7780 << " " << ceph_osd_op_name(op.op)
7781 << dendl;
7782 result = -EOPNOTSUPP;
7783 }
7784
7785 fail:
7786 osd_op.rval = result;
7787 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
7788 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
7789 result != -EAGAIN && result != -EINPROGRESS)
7790 result = 0;
7791
7792 if (result < 0)
7793 break;
7794 }
7795 if (result < 0) {
7796 dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7797 }
7798 return result;
7799 }
7800
7801 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
7802 {
7803 if (ctx->new_obs.oi.size == 0) {
7804 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
7805 return -ENODATA;
7806 }
7807 vector<OSDOp> nops(1);
7808 OSDOp &newop = nops[0];
7809 newop.op.op = CEPH_OSD_OP_TMAPGET;
7810 do_osd_ops(ctx, nops);
7811 try {
7812 bufferlist::const_iterator i = newop.outdata.begin();
7813 decode(*header, i);
7814 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
7815 } catch (...) {
7816 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
7817 << dendl;
7818 return -EINVAL;
7819 }
7820 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
7821 << dendl;
7822 return 0;
7823 }
7824
7825 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
7826 const SnapSet& ss)
7827 {
7828 // verify that all clones have been evicted
7829 dout(20) << __func__ << " verifying clones are absent "
7830 << ss << dendl;
7831 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
7832 p != ss.clones.end();
7833 ++p) {
7834 hobject_t clone_oid = soid;
7835 clone_oid.snap = *p;
7836 if (is_missing_object(clone_oid))
7837 return -EBUSY;
7838 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
7839 if (clone_obc && clone_obc->obs.exists) {
7840 dout(10) << __func__ << " cannot evict head before clone "
7841 << clone_oid << dendl;
7842 return -EBUSY;
7843 }
7844 if (copy_ops.count(clone_oid)) {
7845 dout(10) << __func__ << " cannot evict head, pending promote on clone "
7846 << clone_oid << dendl;
7847 return -EBUSY;
7848 }
7849 }
7850 return 0;
7851 }
7852
7853 inline int PrimaryLogPG::_delete_oid(
7854 OpContext *ctx,
7855 bool no_whiteout, // no whiteouts, no matter what.
7856 bool try_no_whiteout) // try not to whiteout
7857 {
7858 SnapSet& snapset = ctx->new_snapset;
7859 ObjectState& obs = ctx->new_obs;
7860 object_info_t& oi = obs.oi;
7861 const hobject_t& soid = oi.soid;
7862 PGTransaction* t = ctx->op_t.get();
7863
7864 // cache: set whiteout on delete?
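// a whiteout leaves a stub object with FLAG_WHITEOUT set, so a cache
// tier can record a deletion without immediately propagating it to
// the base tier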
7865 bool whiteout = false;
7866 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
7867 && !no_whiteout
7868 && !try_no_whiteout) {
7869 whiteout = true;
7870 }
7871
7872 // in luminous or later, we can't delete the head if there are
7873 // clones. we trust the caller passing no_whiteout has already
7874 // verified they don't exist.
7875 if (!snapset.clones.empty() ||
7876 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
7877 if (no_whiteout) {
7878 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
7879 << dendl;
7880 } else {
7881 dout(20) << __func__ << " has or will have clones; will whiteout"
7882 << dendl;
7883 whiteout = true;
7884 }
7885 }
7886 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
7887 << " no_whiteout=" << (int)no_whiteout
7888 << " try_no_whiteout=" << (int)try_no_whiteout
7889 << dendl;
7890 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
7891 return -ENOENT;
7892
7893 t->remove(soid);
7894
7895 if (oi.size > 0) {
7896 interval_set<uint64_t> ch;
7897 ch.insert(0, oi.size);
7898 ctx->modified_ranges.union_of(ch);
7899 ctx->clean_regions.mark_data_region_dirty(0, oi.size);
7900 }
7901
7902 ctx->clean_regions.mark_omap_dirty();
7903 ctx->delta_stats.num_wr++;
7904 if (soid.is_snap()) {
7905 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
7906 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
7907 } else {
7908 ctx->delta_stats.num_bytes -= oi.size;
7909 }
7910 oi.size = 0;
7911 oi.new_object();
7912
7913 // disconnect all watchers
7914 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
7915 oi.watchers.begin();
7916 p != oi.watchers.end();
7917 ++p) {
7918 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
7919 ctx->watch_disconnects.push_back(
7920 watch_disconnect_t(p->first.first, p->first.second, true));
7921 }
7922 oi.watchers.clear();
7923
7924 if (whiteout) {
7925 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
7926 oi.set_flag(object_info_t::FLAG_WHITEOUT);
7927 ctx->delta_stats.num_whiteouts++;
7928 t->create(soid);
7929 osd->logger->inc(l_osd_tier_whiteout);
7930 return 0;
7931 }
7932
7933 // delete the head
7934 ctx->delta_stats.num_objects--;
7935 if (soid.is_snap())
7936 ctx->delta_stats.num_object_clones--;
7937 if (oi.is_whiteout()) {
7938 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
7939 ctx->delta_stats.num_whiteouts--;
7940 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
7941 }
7942 if (oi.is_cache_pinned()) {
7943 ctx->delta_stats.num_objects_pinned--;
7944 }
7945 if (oi.has_manifest()) {
7946 ctx->delta_stats.num_objects_manifest--;
7947 }
7948 obs.exists = false;
7949 return 0;
7950 }
7951
7952 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
7953 {
7954 SnapSet& snapset = ctx->new_snapset;
7955 ObjectState& obs = ctx->new_obs;
7956 object_info_t& oi = obs.oi;
7957 const hobject_t& soid = oi.soid;
7958 PGTransaction* t = ctx->op_t.get();
7959 snapid_t snapid = (uint64_t)op.snap.snapid;
7960 hobject_t missing_oid;
7961
7962 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7963
7964 ObjectContextRef rollback_to;
7965
7966 int ret = find_object_context(
7967 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7968 soid.get_namespace()),
7969 &rollback_to, false, false, &missing_oid);
7970 if (ret == -EAGAIN) {
7971 /* clone must be missing */
7972 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
7973 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7974 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7975 block_write_on_degraded_snap(missing_oid, ctx->op);
7976 return ret;
7977 }
7978 {
7979 ObjectContextRef promote_obc;
7980 cache_result_t tier_mode_result;
7981 if (obs.exists && obs.oi.has_manifest()) {
7982 tier_mode_result =
7983 maybe_handle_manifest_detail(
7984 ctx->op,
7985 true,
7986 rollback_to);
7987 } else {
7988 tier_mode_result =
7989 maybe_handle_cache_detail(
7990 ctx->op,
7991 true,
7992 rollback_to,
7993 ret,
7994 missing_oid,
7995 true,
7996 false,
7997 &promote_obc);
7998 }
7999 switch (tier_mode_result) {
8000 case cache_result_t::NOOP:
8001 break;
8002 case cache_result_t::BLOCKED_PROMOTE:
8003 ceph_assert(promote_obc);
8004 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8005 return -EAGAIN;
8006 case cache_result_t::BLOCKED_FULL:
8007 block_write_on_full_cache(soid, ctx->op);
8008 return -EAGAIN;
8009 case cache_result_t::REPLIED_WITH_EAGAIN:
8010 ceph_abort_msg("this can't happen, no rollback on replica");
8011 default:
8012 ceph_abort_msg("must promote was set, other values are not valid");
8013 return -EAGAIN;
8014 }
8015 }
8016
8017 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8018 // there's no snapshot here, or there's no object.
8019 // if there's no snapshot, we delete the object; otherwise, do nothing.
8020 dout(20) << "_rollback_to deleting head on " << soid.oid
8021 << " because got ENOENT|whiteout on find_object_context" << dendl;
8022 if (ctx->obc->obs.oi.watchers.size()) {
8023 // Cannot delete an object with watchers
8024 ret = -EBUSY;
8025 } else {
8026 _delete_oid(ctx, false, false);
8027 ret = 0;
8028 }
8029 } else if (ret) {
8030 // huh? find_object_context *can't* return anything else at time of writing.
8031 ceph_abort_msg("unexpected error code in _rollback_to");
8032 } else { //we got our context, let's use it to do the rollback!
8033 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8034 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8035 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8036 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8037 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
8038 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8039 ret = -EAGAIN;
8040 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8041 // rolling back to the head; we just need to clone it.
8042 ctx->modify = true;
8043 } else {
8044 /* 1) Delete current head
8045 * 2) Clone correct snapshot into head
8046 * 3) Calculate clone_overlaps by following overlaps
8047 * forward from rollback snapshot */
8048 dout(10) << "_rollback_to deleting " << soid.oid
8049 << " and rolling back to old snap" << dendl;
8050
8051 if (obs.exists) {
8052 t->remove(soid);
8053 }
8054 t->clone(soid, rollback_to_sobject);
8055 t->add_obc(rollback_to);
8056
8057 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8058 snapset.clone_overlap.lower_bound(snapid);
8059 ceph_assert(iter != snapset.clone_overlap.end());
8060 interval_set<uint64_t> overlaps = iter->second;
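// intersect the overlaps of every clone from the rollback target
// forward; what survives is the set of extents that are identical
// between the rollback snap and the current head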
8061 for ( ;
8062 iter != snapset.clone_overlap.end();
8063 ++iter)
8064 overlaps.intersection_of(iter->second);
8065
8066 if (obs.oi.size > 0) {
8067 interval_set<uint64_t> modified;
8068 modified.insert(0, obs.oi.size);
8069 overlaps.intersection_of(modified);
8070 modified.subtract(overlaps);
8071 ctx->modified_ranges.union_of(modified);
8072 }
8073
8074 // Adjust the cached objectcontext
8075 maybe_create_new_object(ctx, true);
8076 ctx->delta_stats.num_bytes -= obs.oi.size;
8077 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8078 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
8079 ctx->clean_regions.mark_omap_dirty();
8080 obs.oi.size = rollback_to->obs.oi.size;
8081 if (rollback_to->obs.oi.is_data_digest())
8082 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8083 else
8084 obs.oi.clear_data_digest();
8085 if (rollback_to->obs.oi.is_omap_digest())
8086 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8087 else
8088 obs.oi.clear_omap_digest();
8089
8090 if (rollback_to->obs.oi.is_omap()) {
8091 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8092 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8093 } else {
8094 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8095 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8096 }
8097 }
8098 }
8099 return ret;
8100 }
8101
8102 void PrimaryLogPG::_make_clone(
8103 OpContext *ctx,
8104 PGTransaction* t,
8105 ObjectContextRef obc,
8106 const hobject_t& head, const hobject_t& coid,
8107 object_info_t *poi)
8108 {
8109 bufferlist bv;
8110 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8111
8112 t->clone(coid, head);
8113 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8114 rmattr_maybe_cache(obc, t, SS_ATTR);
8115 }
8116
8117 void PrimaryLogPG::make_writeable(OpContext *ctx)
8118 {
8119 const hobject_t& soid = ctx->obs->oi.soid;
8120 SnapContext& snapc = ctx->snapc;
8121
8122 // clone?
8123 ceph_assert(soid.snap == CEPH_NOSNAP);
8124 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8125 << " snapc=" << snapc << dendl;
8126
8127 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8128 if (ctx->new_obs.exists) {
8129 // we will mark the object dirty
8130 if (ctx->undirty && was_dirty) {
8131 dout(20) << " clearing DIRTY flag" << dendl;
8132 ceph_assert(ctx->new_obs.oi.is_dirty());
8133 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8134 --ctx->delta_stats.num_objects_dirty;
8135 osd->logger->inc(l_osd_tier_clean);
8136 } else if (!was_dirty && !ctx->undirty) {
8137 dout(20) << " setting DIRTY flag" << dendl;
8138 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8139 ++ctx->delta_stats.num_objects_dirty;
8140 osd->logger->inc(l_osd_tier_dirty);
8141 }
8142 } else {
8143 if (was_dirty) {
8144 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8145 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8146 --ctx->delta_stats.num_objects_dirty;
8147 }
8148 }
8149
8150 if ((ctx->new_obs.exists &&
8151 ctx->new_obs.oi.is_omap()) &&
8152 (!ctx->obc->obs.exists ||
8153 !ctx->obc->obs.oi.is_omap())) {
8154 ++ctx->delta_stats.num_objects_omap;
8155 }
8156 if ((!ctx->new_obs.exists ||
8157 !ctx->new_obs.oi.is_omap()) &&
8158 (ctx->obc->obs.exists &&
8159 ctx->obc->obs.oi.is_omap())) {
8160 --ctx->delta_stats.num_objects_omap;
8161 }
8162
8163 if (ctx->new_snapset.seq > snapc.seq) {
8164 dout(10) << " op snapset is old" << dendl;
8165 }
8166
8167 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8168 snapc.snaps.size() && // there are snaps
8169 !ctx->cache_evict &&
8170 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8171 // clone
8172 hobject_t coid = soid;
8173 coid.snap = snapc.seq;
8174
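// count how many snaps in the snap context are newer than the
// snapset's seq; exactly those snap ids belong to the new clone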
8175 unsigned l;
8176 for (l = 1;
8177 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8178 l++) ;
8179
8180 vector<snapid_t> snaps(l);
8181 for (unsigned i=0; i<l; i++)
8182 snaps[i] = snapc.snaps[i];
8183
8184 // prepare clone
8185 object_info_t static_snap_oi(coid);
8186 object_info_t *snap_oi;
8187 if (is_primary()) {
8188 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8189 ctx->clone_obc->destructor_callback =
8190 new C_PG_ObjectContext(this, ctx->clone_obc.get());
8191 ctx->clone_obc->obs.oi = static_snap_oi;
8192 ctx->clone_obc->obs.exists = true;
8193 ctx->clone_obc->ssc = ctx->obc->ssc;
8194 ctx->clone_obc->ssc->ref++;
8195 if (pool.info.is_erasure())
8196 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8197 snap_oi = &ctx->clone_obc->obs.oi;
8198 bool got = ctx->lock_manager.get_write_greedy(
8199 coid,
8200 ctx->clone_obc,
8201 ctx->op);
8202 ceph_assert(got);
8203 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8204 } else {
8205 snap_oi = &static_snap_oi;
8206 }
8207 snap_oi->version = ctx->at_version;
8208 snap_oi->prior_version = ctx->obs->oi.version;
8209 snap_oi->copy_user_bits(ctx->obs->oi);
8210
8211 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8212
8213 ctx->delta_stats.num_objects++;
8214 if (snap_oi->is_dirty()) {
8215 ctx->delta_stats.num_objects_dirty++;
8216 osd->logger->inc(l_osd_tier_dirty);
8217 }
8218 if (snap_oi->is_omap())
8219 ctx->delta_stats.num_objects_omap++;
8220 if (snap_oi->is_cache_pinned())
8221 ctx->delta_stats.num_objects_pinned++;
8222 if (snap_oi->has_manifest())
8223 ctx->delta_stats.num_objects_manifest++;
8224 ctx->delta_stats.num_object_clones++;
8225 ctx->new_snapset.clones.push_back(coid.snap);
8226 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
8227 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8228
8229 // clone_overlap should contain an entry for each clone
8230 // (an empty interval_set if there is no overlap)
8231 ctx->new_snapset.clone_overlap[coid.snap];
8232 if (ctx->obs->oi.size)
8233 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8234
8235 // log clone
8236 dout(10) << " cloning v " << ctx->obs->oi.version
8237 << " to " << coid << " v " << ctx->at_version
8238 << " snaps=" << snaps
8239 << " snapset=" << ctx->new_snapset << dendl;
8240 ctx->log.push_back(pg_log_entry_t(
8241 pg_log_entry_t::CLONE, coid, ctx->at_version,
8242 ctx->obs->oi.version,
8243 ctx->obs->oi.user_version,
8244 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8245 encode(snaps, ctx->log.back().snaps);
8246
8247 ctx->at_version.version++;
8248 }
8249
8250 // update most recent clone_overlap and usage stats
8251 if (ctx->new_snapset.clones.size() > 0) {
8252 // the clone_overlap is the difference in range between the head and
8253 // the clones. we need to check whether the most recent clone exists:
8254 // if it has been evicted it is not counted in the stats, but its
8255 // clone_overlap still exists in the snapset, so we must update the
8256 // clone_overlap to keep it consistent.
8257 hobject_t last_clone_oid = soid;
8258 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8259 interval_set<uint64_t> &newest_overlap =
8260 ctx->new_snapset.clone_overlap.rbegin()->second;
8261 ctx->modified_ranges.intersection_of(newest_overlap);
8262 if (is_present_clone(last_clone_oid)) {
8263 // modified_ranges is still in use by the clone
8264 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
8265 }
8266 newest_overlap.subtract(ctx->modified_ranges);
8267 }
8268
8269 if (snapc.seq > ctx->new_snapset.seq) {
8270 // update snapset with latest snap context
8271 ctx->new_snapset.seq = snapc.seq;
8272 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
8273 ctx->new_snapset.snaps = snapc.snaps;
8274 } else {
8275 ctx->new_snapset.snaps.clear();
8276 }
8277 }
8278 dout(20) << "make_writeable " << soid
8279 << " done, snapset=" << ctx->new_snapset << dendl;
8280 }
8281
8282
8283 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8284 interval_set<uint64_t>& modified, uint64_t offset,
8285 uint64_t length, bool write_full)
8286 {
8287 interval_set<uint64_t> ch;
8288 if (write_full) {
8289 if (oi.size)
8290 ch.insert(0, oi.size);
8291 } else if (length)
8292 ch.insert(offset, length);
8293 modified.union_of(ch);
8294 if (write_full ||
8295 (offset + length > oi.size && length)) {
8296 uint64_t new_size = offset + length;
8297 delta_stats.num_bytes -= oi.size;
8298 delta_stats.num_bytes += new_size;
8299 oi.size = new_size;
8300 }
8301
8302 if (oi.has_manifest() && oi.manifest.is_chunked()) {
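// any chunk overlapping the written range is no longer missing and
// is now dirty, so a later flush knows it must be written back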
8303 for (auto &p : oi.manifest.chunk_map) {
8304 if ((p.first <= offset && p.first + p.second.length > offset) ||
8305 (p.first > offset && p.first < offset + length)) {
8306 p.second.clear_flag(chunk_info_t::FLAG_MISSING);
8307 p.second.set_flag(chunk_info_t::FLAG_DIRTY);
8308 }
8309 }
8310 }
8311 delta_stats.num_wr++;
8312 delta_stats.num_wr_kb += shift_round_up(length, 10);
8313 }
8314
8315 void PrimaryLogPG::truncate_update_size_and_usage(
8316 object_stat_sum_t& delta_stats,
8317 object_info_t& oi,
8318 uint64_t truncate_size)
8319 {
8320 if (oi.size != truncate_size) {
8321 delta_stats.num_bytes -= oi.size;
8322 delta_stats.num_bytes += truncate_size;
8323 oi.size = truncate_size;
8324 }
8325 }
8326
8327 void PrimaryLogPG::complete_disconnect_watches(
8328 ObjectContextRef obc,
8329 const list<watch_disconnect_t> &to_disconnect)
8330 {
8331 for (list<watch_disconnect_t>::const_iterator i =
8332 to_disconnect.begin();
8333 i != to_disconnect.end();
8334 ++i) {
8335 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8336 auto watchers_entry = obc->watchers.find(watcher);
8337 if (watchers_entry != obc->watchers.end()) {
8338 WatchRef watch = watchers_entry->second;
8339 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8340 obc->watchers.erase(watcher);
8341 watch->remove(i->send_disconnect);
8342 } else {
8343 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8344 << watcher << dendl;
8345 }
8346 }
8347 }
8348
8349 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8350 {
8351 entity_name_t entity = ctx->reqid.name;
8352 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8353
8354 // disconnects first
8355 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8356
8357 ceph_assert(conn);
8358
8359 auto session = conn->get_priv();
8360 if (!session)
8361 return;
8362
8363 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8364 i != ctx->watch_connects.end();
8365 ++i) {
8366 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8367 dout(15) << "do_osd_op_effects applying watch connect on session "
8368 << session.get() << " watcher " << watcher << dendl;
8369 WatchRef watch;
8370 if (ctx->obc->watchers.count(watcher)) {
8371 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8372 << dendl;
8373 watch = ctx->obc->watchers[watcher];
8374 } else {
8375 dout(15) << "do_osd_op_effects new watcher " << watcher
8376 << dendl;
8377 watch = Watch::makeWatchRef(
8378 this, osd, ctx->obc, i->first.timeout_seconds,
8379 i->first.cookie, entity, conn->get_peer_addr());
8380 ctx->obc->watchers.insert(
8381 make_pair(
8382 watcher,
8383 watch));
8384 }
8385 watch->connect(conn, i->second);
8386 }
8387
8388 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8389 p != ctx->notifies.end();
8390 ++p) {
8391 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8392 ConnectionRef conn(ctx->op->get_req()->get_connection());
8393 NotifyRef notif(
8394 Notify::makeNotifyRef(
8395 conn,
8396 ctx->reqid.name.num(),
8397 p->bl,
8398 p->timeout,
8399 p->cookie,
8400 p->notify_id,
8401 ctx->obc->obs.oi.user_version,
8402 osd));
8403 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8404 ctx->obc->watchers.begin();
8405 i != ctx->obc->watchers.end();
8406 ++i) {
8407 dout(10) << "starting notify on watch " << i->first << dendl;
8408 i->second->start_notify(notif);
8409 }
8410 notif->init();
8411 }
8412
8413 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8414 p != ctx->notify_acks.end();
8415 ++p) {
8416 if (p->watch_cookie)
8417 dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
8418 else
8419 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8420 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8421 ctx->obc->watchers.begin();
8422 i != ctx->obc->watchers.end();
8423 ++i) {
8424 if (i->first.second != entity) continue;
8425 if (p->watch_cookie &&
8426 *(p->watch_cookie) != i->first.first) continue;
8427 dout(10) << "acking notify on watch " << i->first << dendl;
8428 i->second->notify_ack(p->notify_id, p->reply_bl);
8429 }
8430 }
8431 }
8432
8433 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8434 {
8435 ostringstream ss;
8436 ss << "temp_" << info.pgid << "_" << get_role()
8437 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8438 hobject_t hoid = target.make_temp_hobject(ss.str());
8439 dout(20) << __func__ << " " << hoid << dendl;
8440 return hoid;
8441 }
8442
8443 hobject_t PrimaryLogPG::get_temp_recovery_object(
8444 const hobject_t& target,
8445 eversion_t version)
8446 {
8447 ostringstream ss;
8448 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8449 << "_" << version
8450 << "_" << info.history.same_interval_since
8451 << "_" << target.snap;
8452 // pgid + version + interval + snapid is unique, and short
8453 hobject_t hoid = target.make_temp_hobject(ss.str());
8454 dout(20) << __func__ << " " << hoid << dendl;
8455 return hoid;
8456 }
8457
8458 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8459 {
8460 ceph_assert(!ctx->ops->empty());
8461
8462 // valid snap context?
8463 if (!ctx->snapc.is_valid()) {
8464 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8465 return -EINVAL;
8466 }
8467
8468 // prepare the actual mutation
8469 int result = do_osd_ops(ctx, *ctx->ops);
8470 if (result < 0) {
8471 if (ctx->op->may_write() &&
8472 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8473 // need to save the error code in the pg log, to detect dup ops,
8474 // but do nothing else
8475 ctx->update_log_only = true;
8476 }
8477 return result;
8478 }
8479
8480 // read-op? write-op noop? done?
8481 if (ctx->op_t->empty() && !ctx->modify) {
8482 if (ctx->pending_async_reads.empty())
8483 unstable_stats.add(ctx->delta_stats);
8484 if (ctx->op->may_write() &&
8485 get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8486 ctx->update_log_only = true;
8487 }
8488 return result;
8489 }
8490
8491 // check for full
8492 if ((ctx->delta_stats.num_bytes > 0 ||
8493 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
8494 pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
8495 auto m = ctx->op->get_req<MOSDOp>();
8496 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8497 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8498 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8499 << dendl;
8500 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8501 // they tried, they failed.
8502 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
8503 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8504 } else {
8505 // drop request
8506 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8507 return -EAGAIN;
8508 }
8509 }
8510
8511 const hobject_t& soid = ctx->obs->oi.soid;
8512 // clone, if necessary
8513 if (soid.snap == CEPH_NOSNAP)
8514 make_writeable(ctx);
8515
8516 finish_ctx(ctx,
8517 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8518 pg_log_entry_t::DELETE,
8519 result);
8520
8521 return result;
8522 }
8523
8524 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
8525 {
8526 const hobject_t& soid = ctx->obs->oi.soid;
8527 dout(20) << __func__ << " " << soid << " " << ctx
8528 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8529 << dendl;
8530 utime_t now = ceph_clock_now();
8531
8532 // finish and log the op.
8533 if (ctx->user_modify) {
8534 // update the user_version for any modify ops, except for the watch op
8535 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8536 /* In order for new clients and old clients to interoperate properly
8537 * when exchanging versions, we need to lower bound the user_version
8538 * (which our new clients pay proper attention to)
8539 * by the at_version (which is all the old clients can ever see). */
8540 if (ctx->at_version.version > ctx->user_at_version)
8541 ctx->user_at_version = ctx->at_version.version;
8542 ctx->new_obs.oi.user_version = ctx->user_at_version;
8543 }
8544 ctx->bytes_written = ctx->op_t->get_bytes_written();
8545
8546 if (ctx->new_obs.exists) {
8547 ctx->new_obs.oi.version = ctx->at_version;
8548 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8549 ctx->new_obs.oi.last_reqid = ctx->reqid;
8550 if (ctx->mtime != utime_t()) {
8551 ctx->new_obs.oi.mtime = ctx->mtime;
8552 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8553 ctx->new_obs.oi.local_mtime = now;
8554 } else {
8555 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8556 }
8557
8558 // object_info_t
8559 map <string, bufferlist> attrs;
8560 bufferlist bv(sizeof(ctx->new_obs.oi));
8561 encode(ctx->new_obs.oi, bv,
8562 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8563 attrs[OI_ATTR].claim(bv);
8564
8565 // snapset
8566 if (soid.snap == CEPH_NOSNAP) {
8567 dout(10) << " final snapset " << ctx->new_snapset
8568 << " in " << soid << dendl;
8569 bufferlist bss;
8570 encode(ctx->new_snapset, bss);
8571 attrs[SS_ATTR].claim(bss);
8572 } else {
8573 dout(10) << " no snapset (this is a clone)" << dendl;
8574 }
8575 ctx->op_t->setattrs(soid, attrs);
8576 } else {
8577 // reset cached oi
8578 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8579 }
8580
8581 // append to log
8582 ctx->log.push_back(
8583 pg_log_entry_t(log_op_type, soid, ctx->at_version,
8584 ctx->obs->oi.version,
8585 ctx->user_at_version, ctx->reqid,
8586 ctx->mtime,
8587 (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
8588 if (ctx->op && ctx->op->allows_returnvec()) {
8589 // also the per-op values
8590 ctx->log.back().set_op_returns(*ctx->ops);
8591 dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
8592 << dendl;
8593 }
8594
8595 ctx->log.back().clean_regions = ctx->clean_regions;
8596 dout(20) << __func__ << " object " << soid << " marks clean_regions " << ctx->log.back().clean_regions << dendl;
8597
8598 if (soid.snap < CEPH_NOSNAP) {
8599 switch (log_op_type) {
8600 case pg_log_entry_t::MODIFY:
8601 case pg_log_entry_t::PROMOTE:
8602 case pg_log_entry_t::CLEAN:
8603 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8604 << dendl;
8605 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
8606 break;
8607 default:
8608 break;
8609 }
8610 }
8611
8612 if (!ctx->extra_reqids.empty()) {
8613 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
8614 << ctx->extra_reqid_return_codes << dendl;
8615 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
8616 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
8617 }
8618
8619 // apply new object state.
8620 ctx->obc->obs = ctx->new_obs;
8621
8622 if (soid.is_head() && !ctx->obc->obs.exists) {
8623 ctx->obc->ssc->exists = false;
8624 ctx->obc->ssc->snapset = SnapSet();
8625 } else {
8626 ctx->obc->ssc->exists = true;
8627 ctx->obc->ssc->snapset = ctx->new_snapset;
8628 }
8629 }
8630
8631 void PrimaryLogPG::apply_stats(
8632 const hobject_t &soid,
8633 const object_stat_sum_t &delta_stats) {
8634
8635 recovery_state.apply_op_stats(soid, delta_stats);
8636 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
8637 i != get_backfill_targets().end();
8638 ++i) {
8639 pg_shard_t bt = *i;
8640 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
8641 if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
8642 pending_backfill_updates[soid].stats.add(delta_stats);
8643 }
8644 }
8645
8646 if (is_primary() && scrubber.active) {
8647 if (soid < scrubber.start) {
8648 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
8649 << "," << scrubber.end << ")" << dendl;
8650 scrub_cstat.add(delta_stats);
8651 } else {
8652 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
8653 << "," << scrubber.end << ")" << dendl;
8654 }
8655 }
8656 }
8657
8658 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
8659 {
8660 auto m = ctx->op->get_req<MOSDOp>();
8661 ceph_assert(ctx->async_reads_complete());
8662
8663 for (vector<OSDOp>::iterator p = ctx->ops->begin();
8664 p != ctx->ops->end() && result >= 0; ++p) {
8665 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
8666 result = p->rval;
8667 break;
8668 }
8669 ctx->bytes_read += p->outdata.length();
8670 }
8671 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
8672
8673 MOSDOpReply *reply = ctx->reply;
8674 ctx->reply = nullptr;
8675
8676 if (result >= 0) {
8677 if (!ctx->ignore_log_op_stats) {
8678 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
8679
8680 publish_stats_to_osd();
8681 }
8682
8683 // on read, return the current object version
8684 if (ctx->obs) {
8685 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
8686 } else {
8687 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
8688 }
8689 } else if (result == -ENOENT) {
8690 // on ENOENT, set a floor for what the next user version will be.
8691 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
8692 }
8693
8694 reply->set_result(result);
8695 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8696 osd->send_message_osd_client(reply, m->get_connection());
8697 close_op_ctx(ctx);
8698 }
8699
8700 // ========================================================================
8701 // copyfrom
8702
8703 struct C_Copyfrom : public Context {
8704 PrimaryLogPGRef pg;
8705 hobject_t oid;
8706 epoch_t last_peering_reset;
8707 ceph_tid_t tid;
8708 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8709 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8710 const PrimaryLogPG::CopyOpRef& c)
8711 : pg(p), oid(o), last_peering_reset(lpr),
8712 tid(0), cop(c)
8713 {}
8714 void finish(int r) override {
8715 if (r == -ECANCELED)
8716 return;
8717 std::scoped_lock l{*pg};
8718 if (last_peering_reset == pg->get_last_peering_reset()) {
8719 pg->process_copy_chunk(oid, tid, r);
8720 cop.reset();
8721 }
8722 }
8723 };
8724
8725 struct C_CopyFrom_AsyncReadCb : public Context {
8726 OSDOp *osd_op;
8727 object_copy_data_t reply_obj;
8728 uint64_t features;
8729 size_t len;
8730 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
8731 osd_op(osd_op), features(features), len(0) {}
8732 void finish(int r) override {
8733 osd_op->rval = r;
8734 if (r < 0) {
8735 return;
8736 }
8737
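// trim the async read result down to the length actually requested
// before encoding the reply object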
8738 ceph_assert(len > 0);
8739 ceph_assert(len <= reply_obj.data.length());
8740 bufferlist bl;
8741 bl.substr_of(reply_obj.data, 0, len);
8742 reply_obj.data.swap(bl);
8743 encode(reply_obj, osd_op->outdata, features);
8744 }
8745 };
8746
8747 struct C_CopyChunk : public Context {
8748 PrimaryLogPGRef pg;
8749 hobject_t oid;
8750 epoch_t last_peering_reset;
8751 ceph_tid_t tid;
8752 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8753 uint64_t offset = 0;
8754 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8755 const PrimaryLogPG::CopyOpRef& c)
8756 : pg(p), oid(o), last_peering_reset(lpr),
8757 tid(0), cop(c)
8758 {}
8759 void finish(int r) override {
8760 if (r == -ECANCELED)
8761 return;
8762 std::scoped_lock l{*pg};
8763 if (last_peering_reset == pg->get_last_peering_reset()) {
8764 pg->process_copy_chunk_manifest(oid, tid, r, offset);
8765 cop.reset();
8766 }
8767 }
8768 };
8769
8770 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
8771 OSDOp& osd_op, ObjectContextRef &obc)
8772 {
8773 object_info_t& oi = obc->obs.oi;
8774 hobject_t& soid = oi.soid;
8775 int result = 0;
8776 object_copy_cursor_t cursor;
8777 uint64_t out_max;
8778 try {
8779 decode(cursor, bp);
8780 decode(out_max, bp);
8781 }
8782 catch (buffer::error& e) {
8783 result = -EINVAL;
8784 return result;
8785 }
8786
8787 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
8788 uint64_t features = op->get_features();
8789
8790 bool async_read_started = false;
8791 object_copy_data_t _reply_obj;
8792 C_CopyFrom_AsyncReadCb *cb = nullptr;
8793 if (pool.info.is_erasure()) {
8794 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
8795 }
8796 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
8797 // size, mtime
8798 reply_obj.size = oi.size;
8799 reply_obj.mtime = oi.mtime;
8800 ceph_assert(obc->ssc);
8801 if (soid.snap < CEPH_NOSNAP) {
8802 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
8803 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
8804 reply_obj.snaps = p->second;
8805 } else {
8806 reply_obj.snap_seq = obc->ssc->snapset.seq;
8807 }
8808 if (oi.is_data_digest()) {
8809 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
8810 reply_obj.data_digest = oi.data_digest;
8811 }
8812 if (oi.is_omap_digest()) {
8813 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
8814 reply_obj.omap_digest = oi.omap_digest;
8815 }
8816 reply_obj.truncate_seq = oi.truncate_seq;
8817 reply_obj.truncate_size = oi.truncate_size;
8818
8819 // attrs
8820 map<string,bufferlist>& out_attrs = reply_obj.attrs;
8821 if (!cursor.attr_complete) {
8822 result = getattrs_maybe_cache(
8823 ctx->obc,
8824 &out_attrs);
8825 if (result < 0) {
8826 if (cb) {
8827 delete cb;
8828 }
8829 return result;
8830 }
8831 cursor.attr_complete = true;
8832 dout(20) << " got attrs" << dendl;
8833 }
8834
8835 int64_t left = out_max - osd_op.outdata.length();
8836
8837 // data
8838 bufferlist& bl = reply_obj.data;
8839 if (left > 0 && !cursor.data_complete) {
8840 if (cursor.data_offset < oi.size) {
8841 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
8842 if (cb) {
8843 async_read_started = true;
8844 ctx->pending_async_reads.push_back(
8845 make_pair(
8846 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
8847 make_pair(&bl, cb)));
8848 cb->len = max_read;
8849
8850 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8851 new ReadFinisher(osd_op));
8852 result = -EINPROGRESS;
8853
8854 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
8855 } else {
8856 result = pgbackend->objects_read_sync(
8857 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
8858 if (result < 0)
8859 return result;
8860 }
8861 left -= max_read;
8862 cursor.data_offset += max_read;
8863 }
8864 if (cursor.data_offset == oi.size) {
8865 cursor.data_complete = true;
8866 dout(20) << " got data" << dendl;
8867 }
8868 ceph_assert(cursor.data_offset <= oi.size);
8869 }
8870
8871 // omap
8872 uint32_t omap_keys = 0;
8873 if (!pool.info.supports_omap() || !oi.is_omap()) {
8874 cursor.omap_complete = true;
8875 } else {
8876 if (left > 0 && !cursor.omap_complete) {
8877 ceph_assert(cursor.data_complete);
8878 if (cursor.omap_offset.empty()) {
8879 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
8880 &reply_obj.omap_header);
8881 }
8882 bufferlist omap_data;
8883 ObjectMap::ObjectMapIterator iter =
8884 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
8885 ceph_assert(iter);
8886 iter->upper_bound(cursor.omap_offset);
8887 for (; iter->valid(); iter->next()) {
8888 ++omap_keys;
8889 encode(iter->key(), omap_data);
8890 encode(iter->value(), omap_data);
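// account for the 4-byte length headers that encode() prepends to
// each key and value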
8891 left -= iter->key().length() + 4 + iter->value().length() + 4;
8892 if (left <= 0)
8893 break;
8894 }
8895 if (omap_keys) {
8896 encode(omap_keys, reply_obj.omap_data);
8897 reply_obj.omap_data.claim_append(omap_data);
8898 }
8899 if (iter->valid()) {
8900 cursor.omap_offset = iter->key();
8901 } else {
8902 cursor.omap_complete = true;
8903 dout(20) << " got omap" << dendl;
8904 }
8905 }
8906 }
8907
8908 if (cursor.is_complete()) {
8909 // include reqids only in the final step. this is a bit fragile
8910 // but it works...
8911 recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
8912 &reply_obj.reqids,
8913 &reply_obj.reqid_return_codes);
8914 dout(20) << " got reqids" << dendl;
8915 }
8916
8917 dout(20) << " cursor.is_complete=" << cursor.is_complete()
8918 << " " << out_attrs.size() << " attrs"
8919 << " " << bl.length() << " bytes"
8920 << " " << reply_obj.omap_header.length() << " omap header bytes"
8921 << " " << reply_obj.omap_data.length() << " omap data bytes in "
8922 << omap_keys << " keys"
8923 << " " << reply_obj.reqids.size() << " reqids"
8924 << dendl;
8925 reply_obj.cursor = cursor;
8926 if (!async_read_started) {
8927 encode(reply_obj, osd_op.outdata, features);
8928 }
8929 if (cb && !async_read_started) {
8930 delete cb;
8931 }
8932
8933 if (result > 0) {
8934 result = 0;
8935 }
8936 return result;
8937 }
8938
8939 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8940 OSDOp& osd_op)
8941 {
8942 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
8943 uint64_t features = m->get_features();
8944 object_copy_data_t reply_obj;
8945
8946 recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
8947 &reply_obj.reqid_return_codes);
8948 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8949 encode(reply_obj, osd_op.outdata, features);
8950 osd_op.rval = -ENOENT;
8951 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
8952 reply->set_result(-ENOENT);
8953 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8954 osd->send_message_osd_client(reply, m->get_connection());
8955 }
8956
8957 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8958 hobject_t src, object_locator_t oloc,
8959 version_t version, unsigned flags,
8960 bool mirror_snapset,
8961 unsigned src_obj_fadvise_flags,
8962 unsigned dest_obj_fadvise_flags)
8963 {
8964 const hobject_t& dest = obc->obs.oi.soid;
8965 dout(10) << __func__ << " " << dest
8966 << " from " << src << " " << oloc << " v" << version
8967 << " flags " << flags
8968 << (mirror_snapset ? " mirror_snapset" : "")
8969 << dendl;
8970
8971 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
8972
8973 // cancel a previous in-progress copy?
8974 if (copy_ops.count(dest)) {
8975 // FIXME: if the src etc match, we could avoid restarting from the
8976 // beginning.
8977 CopyOpRef cop = copy_ops[dest];
8978 vector<ceph_tid_t> tids;
8979 cancel_copy(cop, false, &tids);
8980 osd->objecter->op_cancel(tids, -ECANCELED);
8981 }
8982
8983 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8984 mirror_snapset, src_obj_fadvise_flags,
8985 dest_obj_fadvise_flags));
8986 copy_ops[dest] = cop;
8987 obc->start_block();
8988
8989 if (!obc->obs.oi.has_manifest()) {
8990 _copy_some(obc, cop);
8991 } else {
8992 if (obc->obs.oi.manifest.is_redirect()) {
8993 _copy_some(obc, cop);
8994 } else if (obc->obs.oi.manifest.is_chunked()) {
8995 auto p = obc->obs.oi.manifest.chunk_map.begin();
8996 _copy_some_manifest(obc, cop, p->first);
8997 } else {
8998 ceph_abort_msg("unrecognized manifest type");
8999 }
9000 }
9001 }
9002
9003 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
9004 {
9005 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9006
9007 unsigned flags = 0;
9008 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9009 flags |= CEPH_OSD_FLAG_FLUSH;
9010 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9011 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9012 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9013 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9014 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9015 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9016 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9017 flags |= CEPH_OSD_FLAG_RWORDERED;
9018
9019 C_GatherBuilder gather(cct);
9020
9021 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9022 // list snaps too.
9023 ceph_assert(cop->src.snap == CEPH_NOSNAP);
9024 ObjectOperation op;
9025 op.list_snaps(&cop->results.snapset, NULL);
9026 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9027 CEPH_SNAPDIR, NULL,
9028 flags, gather.new_sub(), NULL);
9029 cop->objecter_tid2 = tid;
9030 }
9031
9032 ObjectOperation op;
9033 if (cop->results.user_version) {
9034 op.assert_version(cop->results.user_version);
9035 } else {
9036 // we should learn the version after the first chunk, if we didn't know
9037 // it already!
9038 ceph_assert(cop->cursor.is_initial());
9039 }
9040 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9041 &cop->results.object_size, &cop->results.mtime,
9042 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9043 &cop->results.snaps, &cop->results.snap_seq,
9044 &cop->results.flags,
9045 &cop->results.source_data_digest,
9046 &cop->results.source_omap_digest,
9047 &cop->results.reqids,
9048 &cop->results.reqid_return_codes,
9049 &cop->results.truncate_seq,
9050 &cop->results.truncate_size,
9051 &cop->rval);
9052 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9053
9054 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9055 get_last_peering_reset(), cop);
9056 gather.set_finisher(new C_OnFinisher(fin,
9057 osd->get_objecter_finisher(get_pg_shard())));
9058
9059 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9060 cop->src.snap, NULL,
9061 flags,
9062 gather.new_sub(),
9063 // discover the object version if we don't know it yet
9064 cop->results.user_version ? NULL : &cop->results.user_version);
9065 fin->tid = tid;
9066 cop->objecter_tid = tid;
9067 gather.activate();
9068 }
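
// Worked example (sizes assumed for illustration): with a copy chunk
// size of 8 MiB and a 20 MiB source object, the copy-get above fetches
// the first 8 MiB; process_copy_chunk() stages it and calls back into
// _copy_some(), so the whole object arrives after three round trips
// (8 + 8 + 4 MiB), with cop->cursor carrying the resume position.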
9069
9070 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9071 {
9072 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9073
9074 unsigned flags = 0;
9075 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9076 flags |= CEPH_OSD_FLAG_FLUSH;
9077 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9078 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9079 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9080 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9081 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9082 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9083 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9084 flags |= CEPH_OSD_FLAG_RWORDERED;
9085
9086 int num_chunks = 0;
9087 uint64_t last_offset = 0, chunks_size = 0;
9088 object_manifest_t *manifest = &obc->obs.oi.manifest;
9089 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9090 for (;iter != manifest->chunk_map.end(); ++iter) {
9091 num_chunks++;
9092 chunks_size += iter->second.length;
9093 last_offset = iter->first;
9094 if (get_copy_chunk_size() < chunks_size) {
9095 break;
9096 }
9097 }
9098
9099 cop->num_chunk = num_chunks;
9100 cop->start_offset = start_offset;
9101 cop->last_offset = last_offset;
9102 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9103 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9104 << " last_offset: " << last_offset << dendl;
9105
9106 iter = manifest->chunk_map.find(start_offset);
9107 for (;iter != manifest->chunk_map.end(); ++iter) {
9108 uint64_t obj_offset = iter->first;
9109 uint64_t length = manifest->chunk_map[iter->first].length;
9110 hobject_t soid = manifest->chunk_map[iter->first].oid;
9111 object_locator_t oloc(soid);
9112 CopyCallback * cb = NULL;
9113 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9114 cop->results.user_version, cop->flags, cop->mirror_snapset,
9115 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9116 sub_cop->cursor.data_offset = obj_offset;
9117 cop->chunk_cops[obj_offset] = sub_cop;
9118
9119 int s = sub_cop->chunk_ops.size();
9120 sub_cop->chunk_ops.resize(s+1);
9121 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9122 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9123 sub_cop->chunk_ops[s].op.extent.length = length;
9124
9125 ObjectOperation op;
9126 op.dup(sub_cop->chunk_ops);
9127
9128 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9129 << manifest->chunk_map[iter->first].offset
9130 << " length: " << length << " pool id: " << oloc.pool << dendl;
9131
9132 if (cop->results.user_version) {
9133 op.assert_version(cop->results.user_version);
9134 } else {
9135 // we should learn the version after the first chunk, if we didn't know
9136 // it already!
9137 ceph_assert(cop->cursor.is_initial());
9138 }
9139 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9140
9141 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9142 get_last_peering_reset(), cop);
9143 fin->offset = obj_offset;
9144
9145 ceph_tid_t tid = osd->objecter->read(
9146 soid.oid, oloc, op,
9147 sub_cop->src.snap, NULL,
9148 flags,
9149 new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9150 // discover the object version if we don't know it yet
9151 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
9152 fin->tid = tid;
9153 sub_cop->objecter_tid = tid;
9154 if (last_offset < iter->first) {
9155 break;
9156 }
9157 }
9158 }
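
// Worked example (values assumed for illustration): for a chunk_map of
// three 1 MiB chunks at offsets 0, 1M and 2M with a copy chunk size of
// 2 MiB, the sizing loop above counts all three before chunks_size
// (3 MiB) exceeds the limit, giving num_chunks = 3 and last_offset = 2M,
// and one objecter read is issued per chunk. On a larger map,
// process_copy_chunk_manifest() starts another pass at the first offset
// past last_offset once all reads of this pass have completed.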
9159
9160 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9161 {
9162 dout(10) << __func__ << " " << oid << " tid " << tid
9163 << " " << cpp_strerror(r) << dendl;
9164 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9165 if (p == copy_ops.end()) {
9166 dout(10) << __func__ << " no copy_op found" << dendl;
9167 return;
9168 }
9169 CopyOpRef cop = p->second;
9170 if (tid != cop->objecter_tid) {
9171 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9172 << " tid " << cop->objecter_tid << dendl;
9173 return;
9174 }
9175
9176 if (cop->omap_data.length() || cop->omap_header.length())
9177 cop->results.has_omap = true;
9178
9179 if (r >= 0 && !pool.info.supports_omap() &&
9180 (cop->omap_data.length() || cop->omap_header.length())) {
9181 r = -EOPNOTSUPP;
9182 }
9183 cop->objecter_tid = 0;
9184 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9185 ObjectContextRef& cobc = cop->obc;
9186
9187 if (r < 0)
9188 goto out;
9189
9190 ceph_assert(cop->rval >= 0);
9191
9192 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9193 // verify snap hasn't been deleted
9194 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9195 while (p != cop->results.snaps.end()) {
9196 // make best effort to sanitize snaps/clones.
9197 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
9198 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9199 << dendl;
9200 for (vector<snapid_t>::iterator q = p + 1;
9201 q != cop->results.snaps.end();
9202 ++q)
9203 *(q - 1) = *q;
9204 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9205 } else {
9206 ++p;
9207 }
9208 }
9209 if (cop->results.snaps.empty()) {
9210 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9211 r = -ENOENT;
9212 goto out;
9213 }
9214 }
9215
9216 ceph_assert(cop->rval >= 0);
9217
9218 if (!cop->temp_cursor.data_complete) {
9219 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9220 }
9221 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9222 if (cop->omap_header.length()) {
9223 cop->results.omap_digest =
9224 cop->omap_header.crc32c(cop->results.omap_digest);
9225 }
9226 if (cop->omap_data.length()) {
9227 bufferlist keys;
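// skip the leading 4 bytes of omap_data -- assumed here to be the u32
// entry count -- so the crc covers only the encoded keys and values,
// matching how the source computed its digest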
9228 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9229 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9230 }
9231 }
9232
9233 if (!cop->temp_cursor.attr_complete) {
9234 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9235 p != cop->attrs.end();
9236 ++p) {
9237 cop->results.attrs[string("_") + p->first] = p->second;
9238 }
9239 cop->attrs.clear();
9240 }
9241
9242 if (!cop->cursor.is_complete()) {
9243 // write out what we have so far
9244 if (cop->temp_cursor.is_initial()) {
9245 ceph_assert(!cop->results.started_temp_obj);
9246 cop->results.started_temp_obj = true;
9247 cop->results.temp_oid = generate_temp_object(oid);
9248 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9249 }
9250 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9251 OpContextUPtr ctx = simple_opc_create(tempobc);
9252 if (cop->temp_cursor.is_initial()) {
9253 ctx->new_temp_oid = cop->results.temp_oid;
9254 }
9255 _write_copy_chunk(cop, ctx->op_t.get());
9256 simple_opc_submit(std::move(ctx));
9257 dout(10) << __func__ << " fetching more" << dendl;
9258 _copy_some(cobc, cop);
9259 return;
9260 }
9261
9262 // verify digests?
9263 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9264 dout(20) << __func__ << std::hex
9265 << " got digest: rx data 0x" << cop->results.data_digest
9266 << " omap 0x" << cop->results.omap_digest
9267 << ", source: data 0x" << cop->results.source_data_digest
9268 << " omap 0x" << cop->results.source_omap_digest
9269 << std::dec
9270 << " flags " << cop->results.flags
9271 << dendl;
9272 }
9273 if (cop->results.is_data_digest() &&
9274 cop->results.data_digest != cop->results.source_data_digest) {
9275 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9276 << " != source 0x" << cop->results.source_data_digest << std::dec
9277 << dendl;
9278 osd->clog->error() << info.pgid << " copy from " << cop->src
9279 << " to " << cop->obc->obs.oi.soid << std::hex
9280 << " data digest 0x" << cop->results.data_digest
9281 << " != source 0x" << cop->results.source_data_digest
9282 << std::dec;
9283 r = -EIO;
9284 goto out;
9285 }
9286 if (cop->results.is_omap_digest() &&
9287 cop->results.omap_digest != cop->results.source_omap_digest) {
9288 derr << __func__ << std::hex
9289 << " omap digest 0x" << cop->results.omap_digest
9290 << " != source 0x" << cop->results.source_omap_digest
9291 << std::dec << dendl;
9292 osd->clog->error() << info.pgid << " copy from " << cop->src
9293 << " to " << cop->obc->obs.oi.soid << std::hex
9294 << " omap digest 0x" << cop->results.omap_digest
9295 << " != source 0x" << cop->results.source_omap_digest
9296 << std::dec;
9297 r = -EIO;
9298 goto out;
9299 }
9300 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9301 derr << __func__ << " injecting copyfrom failure" << dendl;
9302 r = -EIO;
9303 goto out;
9304 }
9305
9306 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9307 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9308 ObjectState& obs = cop->obc->obs;
9309 if (cop->temp_cursor.is_initial()) {
9310 dout(20) << "fill_in_final_tx: writing "
9311 << "directly to final object" << dendl;
9312 // write directly to final object
9313 cop->results.temp_oid = obs.oi.soid;
9314 _write_copy_chunk(cop, t);
9315 } else {
9316 // finish writing to temp object, then move into place
9317 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9318 _write_copy_chunk(cop, t);
9319 t->rename(obs.oi.soid, cop->results.temp_oid);
9320 }
9321 t->setattrs(obs.oi.soid, cop->results.attrs);
9322 });
9323
9324 dout(20) << __func__ << " success; committing" << dendl;
9325
9326 out:
9327 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9328 CopyCallbackResults results(r, &cop->results);
9329 cop->cb->complete(results);
9330
9331 copy_ops.erase(cobc->obs.oi.soid);
9332 cobc->stop_block();
9333
9334 if (r < 0 && cop->results.started_temp_obj) {
9335 dout(10) << __func__ << " deleting partial temp object "
9336 << cop->results.temp_oid << dendl;
9337 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9338 OpContextUPtr ctx = simple_opc_create(tempobc);
9339 ctx->op_t->remove(cop->results.temp_oid);
9340 ctx->discard_temp_oid = cop->results.temp_oid;
9341 simple_opc_submit(std::move(ctx));
9342 }
9343
9344 // cancel and requeue proxy ops on this object
9345 if (!r) {
9346 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9347 }
9348
9349 kick_object_context_blocked(cobc);
9350 }
9351
9352 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9353 {
9354 dout(10) << __func__ << " " << oid << " tid " << tid
9355 << " " << cpp_strerror(r) << dendl;
9356 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9357 if (p == copy_ops.end()) {
9358 dout(10) << __func__ << " no copy_op found" << dendl;
9359 return;
9360 }
9361 CopyOpRef obj_cop = p->second;
9362 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9363
9364 if (tid != chunk_cop->objecter_tid) {
9365 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9366 << " tid " << chunk_cop->objecter_tid << dendl;
9367 return;
9368 }
9369
9370 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9371 r = -EOPNOTSUPP;
9372 }
9373
9374 chunk_cop->objecter_tid = 0;
9375 chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9376 ObjectContextRef& cobc = obj_cop->obc;
9377 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9378
9379 if (r < 0) {
9380 obj_cop->failed = true;
9381 goto out;
9382 }
9383
9384 if (obj_cop->failed) {
9385 return;
9386 }
9387 if (!chunk_data.outdata.length()) {
9388 r = -EIO;
9389 obj_cop->failed = true;
9390 goto out;
9391 }
9392
9393 obj_cop->num_chunk--;
9394
9395 /* check that all of the copy ops have completed */
9396 if (obj_cop->num_chunk) {
9397 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9398 return;
9399 }
9400
9401 {
9402 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9403 if (!ctx->lock_manager.take_write_lock(
9404 obj_cop->obc->obs.oi.soid,
9405 obj_cop->obc)) {
9406 // a recovery op can take the read lock,
9407 // so we need to wait for recovery to complete
9408 r = -EAGAIN;
9409 obj_cop->failed = true;
9410 close_op_ctx(ctx.release());
9411 goto out;
9412 }
9413 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9414
9415 PGTransaction *t = ctx->op_t.get();
9416 ObjectState& obs = ctx->new_obs;
9417 for (auto p : obj_cop->chunk_cops) {
9418 OSDOp &sub_chunk = p.second->chunk_ops[0];
9419 t->write(cobc->obs.oi.soid,
9420 p.second->cursor.data_offset,
9421 sub_chunk.outdata.length(),
9422 sub_chunk.outdata,
9423 p.second->dest_obj_fadvise_flags);
9424 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9425 << " length: " << sub_chunk.outdata.length() << dendl;
9426 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9427 p.second->cursor.data_offset, sub_chunk.outdata.length());
9428 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_DIRTY);
9429 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9430 ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
9431 sub_chunk.outdata.clear();
9432 }
9433 obs.oi.clear_data_digest();
9434 ctx->at_version = get_next_version();
9435 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9436 simple_opc_submit(std::move(ctx));
9437
9438 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9439 /* check remaining work */
9440 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9441 if (obj_cop->last_offset >= p->first + p->second.length) {
9442 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9443 if (obj_cop->last_offset < en.first) {
9444 _copy_some_manifest(cobc, obj_cop, en.first);
9445 return;
9446 }
9447 }
9448 }
9449 }
9450 }
9451
9452 out:
9453 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9454 CopyCallbackResults results(r, &obj_cop->results);
9455 obj_cop->cb->complete(results);
9456
9457 copy_ops.erase(cobc->obs.oi.soid);
9458 cobc->stop_block();
9459
9460 // cancel and requeue proxy ops on this object
9461 if (!r) {
9462 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9463 }
9464
9465 kick_object_context_blocked(cobc);
9466 }
9467
9468 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9469 vector<ceph_tid_t> tids;
9470 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9471 it != proxyread_ops.end();) {
9472 if (it->second->soid == oid) {
9473 cancel_proxy_read((it++)->second, &tids);
9474 } else {
9475 ++it;
9476 }
9477 }
9478 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9479 it != proxywrite_ops.end();) {
9480 if (it->second->soid == oid) {
9481 cancel_proxy_write((it++)->second, &tids);
9482 } else {
9483 ++it;
9484 }
9485 }
9486 osd->objecter->op_cancel(tids, -ECANCELED);
9487 kick_proxy_ops_blocked(oid);
9488 }
9489
9490 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9491 {
9492 dout(20) << __func__ << " " << cop
9493 << " " << cop->attrs.size() << " attrs"
9494 << " " << cop->data.length() << " bytes"
9495 << " " << cop->omap_header.length() << " omap header bytes"
9496 << " " << cop->omap_data.length() << " omap data bytes"
9497 << dendl;
9498 if (!cop->temp_cursor.attr_complete) {
9499 t->create(cop->results.temp_oid);
9500 }
9501 if (!cop->temp_cursor.data_complete) {
9502 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9503 cop->cursor.data_offset);
9504 if (pool.info.required_alignment() &&
9505 !cop->cursor.data_complete) {
9506 /**
9507 * Trim off the unaligned bit at the end; we'll adjust cursor.data_offset
9508 * to pick it up on the next pass.
9509 */
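/*
* Worked example (numbers assumed for illustration): with a required
* alignment of 64 KiB and cop->data.length() == 100 KiB, to_trim is
* 100 KiB % 64 KiB = 36 KiB; we keep the first 64 KiB and rewind
* cursor.data_offset by 36 KiB so the next copy-get re-reads the
* trimmed tail.
*/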
9510 ceph_assert(cop->temp_cursor.data_offset %
9511 pool.info.required_alignment() == 0);
9512 if (cop->data.length() % pool.info.required_alignment() != 0) {
9513 uint64_t to_trim =
9514 cop->data.length() % pool.info.required_alignment();
9515 bufferlist bl;
9516 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9517 cop->data.swap(bl);
9518 cop->cursor.data_offset -= to_trim;
9519 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9520 cop->cursor.data_offset);
9521 }
9522 }
9523 if (cop->data.length()) {
9524 t->write(
9525 cop->results.temp_oid,
9526 cop->temp_cursor.data_offset,
9527 cop->data.length(),
9528 cop->data,
9529 cop->dest_obj_fadvise_flags);
9530 }
9531 cop->data.clear();
9532 }
9533 if (pool.info.supports_omap()) {
9534 if (!cop->temp_cursor.omap_complete) {
9535 if (cop->omap_header.length()) {
9536 t->omap_setheader(
9537 cop->results.temp_oid,
9538 cop->omap_header);
9539 cop->omap_header.clear();
9540 }
9541 if (cop->omap_data.length()) {
9542 map<string,bufferlist> omap;
9543 bufferlist::const_iterator p = cop->omap_data.begin();
9544 decode(omap, p);
9545 t->omap_setkeys(cop->results.temp_oid, omap);
9546 cop->omap_data.clear();
9547 }
9548 }
9549 } else {
9550 ceph_assert(cop->omap_header.length() == 0);
9551 ceph_assert(cop->omap_data.length() == 0);
9552 }
9553 cop->temp_cursor = cop->cursor;
9554 }
9555
9556 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
9557 {
9558 OpContext *ctx = cb->ctx;
9559 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
9560
9561 ObjectState& obs = ctx->new_obs;
9562 if (obs.exists) {
9563 dout(20) << __func__ << ": exists, removing" << dendl;
9564 ctx->op_t->remove(obs.oi.soid);
9565 } else {
9566 ctx->delta_stats.num_objects++;
9567 obs.exists = true;
9568 }
9569 if (cb->is_temp_obj_used()) {
9570 ctx->discard_temp_oid = cb->results->temp_oid;
9571 }
9572 cb->results->fill_in_final_tx(ctx->op_t.get());
9573
9574 // CopyFromCallback fills this in for us
9575 obs.oi.user_version = ctx->user_at_version;
9576
9577 if (cb->results->is_data_digest()) {
9578 obs.oi.set_data_digest(cb->results->data_digest);
9579 } else {
9580 obs.oi.clear_data_digest();
9581 }
9582 if (cb->results->is_omap_digest()) {
9583 obs.oi.set_omap_digest(cb->results->omap_digest);
9584 } else {
9585 obs.oi.clear_omap_digest();
9586 }
9587
9588 obs.oi.truncate_seq = cb->truncate_seq;
9589 obs.oi.truncate_size = cb->truncate_size;
9590
9591 obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
9592 ctx->mtime = utime_t();
9593
9594 ctx->extra_reqids = cb->results->reqids;
9595 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
9596
9597 // cache: clear whiteout?
9598 if (obs.oi.is_whiteout()) {
9599 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9600 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9601 --ctx->delta_stats.num_whiteouts;
9602 }
9603
9604 if (cb->results->has_omap) {
9605 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9606 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9607 ctx->clean_regions.mark_omap_dirty();
9608 } else {
9609 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
9610 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9611 }
9612
9613 interval_set<uint64_t> ch;
9614 if (obs.oi.size > 0)
9615 ch.insert(0, obs.oi.size);
9616 ctx->modified_ranges.union_of(ch);
9617 ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
9618
9619 if (cb->get_data_size() != obs.oi.size) {
9620 ctx->delta_stats.num_bytes -= obs.oi.size;
9621 obs.oi.size = cb->get_data_size();
9622 ctx->delta_stats.num_bytes += obs.oi.size;
9623 }
9624 ctx->delta_stats.num_wr++;
9625 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
9626
9627 osd->logger->inc(l_osd_copyfrom);
9628 }
9629
9630 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
9631 ObjectContextRef obc)
9632 {
9633 const hobject_t& soid = obc->obs.oi.soid;
9634 dout(10) << __func__ << " " << soid << " r=" << r
9635 << " uv" << results->user_version << dendl;
9636
9637 if (r == -ECANCELED) {
9638 return;
9639 }
9640
9641 if (r != -ENOENT && soid.is_snap()) {
9642 if (results->snaps.empty()) {
9643 // we must have read "snap" content from the head object in the
9644 // base pool. use snap_seq to construct what snaps should be
9645 // for this clone (what it was before we evicted the clean clone
9646 // from this pool, and what it will be when we flush and the
9647 // clone eventually happens in the base pool). we want to use
9648 // snaps in (results->snap_seq,soid.snap]
9649 SnapSet& snapset = obc->ssc->snapset;
9650 for (auto p = snapset.clone_snaps.rbegin();
9651 p != snapset.clone_snaps.rend();
9652 ++p) {
9653 for (auto snap : p->second) {
9654 if (snap > soid.snap) {
9655 continue;
9656 }
9657 if (snap <= results->snap_seq) {
9658 break;
9659 }
9660 results->snaps.push_back(snap);
9661 }
9662 }
9663 }
9664
9665 dout(20) << __func__ << " snaps " << results->snaps << dendl;
9666 filter_snapc(results->snaps);
9667
9668 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
9669 if (results->snaps.empty()) {
9670 dout(20) << __func__
9671 << " snaps are empty, clone is invalid,"
9672 << " setting r to ENOENT" << dendl;
9673 r = -ENOENT;
9674 }
9675 }
9676
9677 if (r < 0 && results->started_temp_obj) {
9678 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
9679 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
9680 ceph_assert(tempobc);
9681 OpContextUPtr ctx = simple_opc_create(tempobc);
9682 ctx->op_t->remove(results->temp_oid);
9683 simple_opc_submit(std::move(ctx));
9684 results->started_temp_obj = false;
9685 }
9686
9687 if (r == -ENOENT && soid.is_snap()) {
9688 dout(10) << __func__
9689 << ": enoent while trying to promote clone, " << soid
9690 << " must have been trimmed, removing from snapset"
9691 << dendl;
9692 hobject_t head(soid.get_head());
9693 ObjectContextRef obc = get_object_context(head, false);
9694 ceph_assert(obc);
9695
9696 OpContextUPtr tctx = simple_opc_create(obc);
9697 tctx->at_version = get_next_version();
9698 if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
9699 filter_snapc(tctx->new_snapset.snaps);
9700 } else {
9701 tctx->new_snapset.snaps.clear();
9702 }
9703 vector<snapid_t> new_clones;
9704 map<snapid_t, vector<snapid_t>> new_clone_snaps;
9705 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
9706 i != tctx->new_snapset.clones.end();
9707 ++i) {
9708 if (*i != soid.snap) {
9709 new_clones.push_back(*i);
9710 auto p = tctx->new_snapset.clone_snaps.find(*i);
9711 if (p != tctx->new_snapset.clone_snaps.end()) {
9712 new_clone_snaps[*i] = p->second;
9713 }
9714 }
9715 }
9716 tctx->new_snapset.clones.swap(new_clones);
9717 tctx->new_snapset.clone_overlap.erase(soid.snap);
9718 tctx->new_snapset.clone_size.erase(soid.snap);
9719 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
9720
9721 // take RWWRITE lock for duration of our local write. ignore starvation.
9722 if (!tctx->lock_manager.take_write_lock(
9723 head,
9724 obc)) {
9725 ceph_abort_msg("problem!");
9726 }
9727 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9728
9729 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9730
9731 simple_opc_submit(std::move(tctx));
9732 return;
9733 }
9734
9735 bool whiteout = false;
9736 if (r == -ENOENT) {
9737 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
9738 dout(10) << __func__ << " whiteout " << soid << dendl;
9739 whiteout = true;
9740 }
9741
9742 if (r < 0 && !whiteout) {
9743 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9744 // pass error to everyone blocked on this object
9745 // FIXME: this is pretty sloppy, but at this point we got
9746 // something unexpected and don't have many other options.
9747 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9748 waiting_for_blocked_object.find(soid);
9749 if (blocked_iter != waiting_for_blocked_object.end()) {
9750 while (!blocked_iter->second.empty()) {
9751 osd->reply_op_error(blocked_iter->second.front(), r);
9752 blocked_iter->second.pop_front();
9753 }
9754 waiting_for_blocked_object.erase(blocked_iter);
9755 }
9756 return;
9757 }
9758
9759 osd->promote_finish(results->object_size);
9760
9761 OpContextUPtr tctx = simple_opc_create(obc);
9762 tctx->at_version = get_next_version();
9763
9764 if (!obc->obs.oi.has_manifest()) {
9765 ++tctx->delta_stats.num_objects;
9766 }
9767 if (soid.snap < CEPH_NOSNAP)
9768 ++tctx->delta_stats.num_object_clones;
9769 tctx->new_obs.exists = true;
9770
9771 tctx->extra_reqids = results->reqids;
9772 tctx->extra_reqid_return_codes = results->reqid_return_codes;
9773
9774 if (whiteout) {
9775 // create a whiteout
9776 tctx->op_t->create(soid);
9777 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
9778 ++tctx->delta_stats.num_whiteouts;
9779 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
9780 osd->logger->inc(l_osd_tier_whiteout);
9781 } else {
9782 if (results->has_omap) {
9783 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
9784 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
9785 ++tctx->delta_stats.num_objects_omap;
9786 }
9787
9788 results->fill_in_final_tx(tctx->op_t.get());
9789 if (results->started_temp_obj) {
9790 tctx->discard_temp_oid = results->temp_oid;
9791 }
9792 tctx->new_obs.oi.size = results->object_size;
9793 tctx->new_obs.oi.user_version = results->user_version;
9794 tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
9795 tctx->mtime = utime_t();
9796 if (results->is_data_digest()) {
9797 tctx->new_obs.oi.set_data_digest(results->data_digest);
9798 } else {
9799 tctx->new_obs.oi.clear_data_digest();
9800 }
9801 if (results->object_size)
9802 tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
9803 if (results->is_omap_digest()) {
9804 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
9805 } else {
9806 tctx->new_obs.oi.clear_omap_digest();
9807 }
9808 if (results->has_omap)
9809 tctx->clean_regions.mark_omap_dirty();
9810 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
9811 tctx->new_obs.oi.truncate_size = results->truncate_size;
9812
9813 if (soid.snap != CEPH_NOSNAP) {
9814 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
9815 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
9816 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
9817 results->object_size);
9818 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
9819
9820 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
9821 } else {
9822 tctx->delta_stats.num_bytes += results->object_size;
9823 }
9824 }
9825
9826 if (results->mirror_snapset) {
9827 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
9828 tctx->new_snapset.from_snap_set(
9829 results->snapset,
9830 get_osdmap()->require_osd_release < ceph_release_t::luminous);
9831 }
9832 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
9833
9834 // take RWWRITE lock for duration of our local write. ignore starvation.
9835 if (!tctx->lock_manager.take_write_lock(
9836 obc->obs.oi.soid,
9837 obc)) {
9838 ceph_abort_msg("problem!");
9839 }
9840 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9841
9842 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9843
9844 simple_opc_submit(std::move(tctx));
9845
9846 osd->logger->inc(l_osd_tier_promote);
9847
9848 if (agent_state &&
9849 agent_state->is_idle())
9850 agent_choose_mode();
9851 }
9852
9853 void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
9854 ObjectContextRef obc)
9855 {
9856 const hobject_t& soid = obc->obs.oi.soid;
9857 dout(10) << __func__ << " " << soid << " r=" << r
9858 << " uv" << results->user_version << dendl;
9859
9860 if (r == -ECANCELED || r == -EAGAIN) {
9861 return;
9862 }
9863
9864 if (r < 0) {
9865 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9866 // pass error to everyone blocked on this object
9867 // FIXME: this is pretty sloppy, but at this point we got
9868 // something unexpected and don't have many other options.
9869 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9870 waiting_for_blocked_object.find(soid);
9871 if (blocked_iter != waiting_for_blocked_object.end()) {
9872 while (!blocked_iter->second.empty()) {
9873 osd->reply_op_error(blocked_iter->second.front(), r);
9874 blocked_iter->second.pop_front();
9875 }
9876 waiting_for_blocked_object.erase(blocked_iter);
9877 }
9878 return;
9879 }
9880
9881 osd->promote_finish(results->object_size);
9882 osd->logger->inc(l_osd_tier_promote);
9883
9884 if (agent_state &&
9885 agent_state->is_idle())
9886 agent_choose_mode();
9887 }
9888
9889 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
9890 vector<ceph_tid_t> *tids)
9891 {
9892 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
9893 << " from " << cop->src << " " << cop->oloc
9894 << " v" << cop->results.user_version << dendl;
9895
9896 // cancel objecter op, if we can
9897 if (cop->objecter_tid) {
9898 tids->push_back(cop->objecter_tid);
9899 cop->objecter_tid = 0;
9900 if (cop->objecter_tid2) {
9901 tids->push_back(cop->objecter_tid2);
9902 cop->objecter_tid2 = 0;
9903 }
9904 }
9905
9906 copy_ops.erase(cop->obc->obs.oi.soid);
9907 cop->obc->stop_block();
9908
9909 kick_object_context_blocked(cop->obc);
9910 cop->results.should_requeue = requeue;
9911 CopyCallbackResults result(-ECANCELED, &cop->results);
9912 cop->cb->complete(result);
9913
9914 // There may still be an objecter callback referencing this copy op.
9915 // That callback will not need the obc since it's been canceled, and
9916 // we need the obc reference to go away prior to flush.
9917 cop->obc = ObjectContextRef();
9918 }
9919
9920 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
9921 {
9922 dout(10) << __func__ << dendl;
9923 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
9924 while (p != copy_ops.end()) {
9925 // requeue this op? can I queue up all of them?
9926 cancel_copy((p++)->second, requeue, tids);
9927 }
9928 }
9929
9930
9931 // ========================================================================
9932 // flush
9933 //
9934 // Flush a dirty object in the cache tier by writing it back to the
9935 // base tier. The sequence looks like:
9936 //
9937 // * send a copy-from operation to the base tier to copy the current
9938 // version of the object
9939 // * base tier will pull the object via (perhaps multiple) copy-get(s)
9940 // * on completion, we check if the object has been modified. if so,
9941 // just reply with -EAGAIN.
9942 // * try to take a write lock so we can clear the dirty flag. if this
9943 // fails, wait and retry
9944 // * start a repop that clears the bit.
9945 //
9946 // If we have to wait, we will retry by coming back through the
9947 // start_flush method. We check if a flush is already in progress
9948 // and, if so, try to finish it by rechecking the version and trying
9949 // to clear the dirty bit.
9950 //
9951 // In order for the cache-flush (a write op) to not block the copy-get
9952 // from reading the object, the client *must* set the SKIPRWLOCKS
9953 // flag.
9954 //
9955 // NOTE: normally writes are strictly ordered for the client, but
9956 // flushes are special in that they can be reordered with respect to
9957 // other writes. In particular, we can't have a flush request block
9958 // an update to the cache pool object!
9959
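// A sketch of the contract start_flush() implements for its callers
// (summarizing the code below; not an exhaustive list of outcomes):
//
//   -EINPROGRESS  flush submitted; on_flush runs when it settles
//   -ENOENT       an older clone is missing (*pmissing names it)
//   -EBUSY        an older clone is still dirty and must flush first
//   -EAGAIN       the op joined an already in-progress flush and will
//                 be retried when that flush completes
//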
9960 struct C_Flush : public Context {
9961 PrimaryLogPGRef pg;
9962 hobject_t oid;
9963 epoch_t last_peering_reset;
9964 ceph_tid_t tid;
9965 utime_t start;
9966 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
9967 : pg(p), oid(o), last_peering_reset(lpr),
9968 tid(0), start(ceph_clock_now())
9969 {}
9970 void finish(int r) override {
9971 if (r == -ECANCELED)
9972 return;
9973 std::scoped_lock locker{*pg};
9974 if (last_peering_reset == pg->get_last_peering_reset()) {
9975 pg->finish_flush(oid, tid, r);
9976 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
9977 }
9978 }
9979 };
9980
9981 int PrimaryLogPG::start_flush(
9982 OpRequestRef op, ObjectContextRef obc,
9983 bool blocking, hobject_t *pmissing,
9984 std::optional<std::function<void()>> &&on_flush)
9985 {
9986 const object_info_t& oi = obc->obs.oi;
9987 const hobject_t& soid = oi.soid;
9988 dout(10) << __func__ << " " << soid
9989 << " v" << oi.version
9990 << " uv" << oi.user_version
9991 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
9992 << dendl;
9993
9994 bool preoctopus_compat =
9995 get_osdmap()->require_osd_release < ceph_release_t::octopus;
9996 SnapSet snapset;
9997 if (preoctopus_compat) {
9998 // for pre-octopus compatibility, filter SnapSet::snaps. not
9999 // certain we need this, but let's be conservative.
10000 snapset = obc->ssc->snapset.get_filtered(pool.info);
10001 } else {
10002 // NOTE: change this to a const ref when we remove this compat code
10003 snapset = obc->ssc->snapset;
10004 }
10005
10006 // verify there are no older dirty clones
10007 {
10008 dout(20) << " snapset " << snapset << dendl;
10009 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
10010 while (p != snapset.clones.rend() && *p >= soid.snap)
10011 ++p;
10012 if (p != snapset.clones.rend()) {
10013 hobject_t next = soid;
10014 next.snap = *p;
10015 ceph_assert(next.snap < soid.snap);
10016 if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
10017 dout(10) << __func__ << " missing clone is " << next << dendl;
10018 if (pmissing)
10019 *pmissing = next;
10020 return -ENOENT;
10021 }
10022 ObjectContextRef older_obc = get_object_context(next, false);
10023 if (older_obc) {
10024 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
10025 << dendl;
10026 if (older_obc->obs.oi.is_dirty()) {
10027 dout(10) << __func__ << " next oldest clone is dirty: "
10028 << older_obc->obs.oi << dendl;
10029 return -EBUSY;
10030 }
10031 } else {
10032 dout(20) << __func__ << " next oldest clone " << next
10033 << " is not present; implicitly clean" << dendl;
10034 }
10035 } else {
10036 dout(20) << __func__ << " no older clones" << dendl;
10037 }
10038 }
10039
10040 if (blocking)
10041 obc->start_block();
10042
10043 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10044 if (p != flush_ops.end()) {
10045 FlushOpRef fop = p->second;
10046 if (fop->op == op) {
10047 // we couldn't take the write lock on a cache-try-flush before;
10048 // now we are trying again for the lock.
10049 return try_flush_mark_clean(fop);
10050 }
10051 if (fop->flushed_version == obc->obs.oi.user_version &&
10052 (fop->blocking || !blocking)) {
10053 // nonblocking can join anything
10054 // blocking can only join a blocking flush
10055 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10056 if (op)
10057 fop->dup_ops.push_back(op);
10058 return -EAGAIN; // clean up this ctx; op will retry later
10059 }
10060
10061 // cancel current flush since it will fail anyway, or because we
10062 // are blocking and the existing flush is nonblocking.
10063 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10064 if (fop->op)
10065 osd->reply_op_error(fop->op, -EBUSY);
10066 while (!fop->dup_ops.empty()) {
10067 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10068 fop->dup_ops.pop_front();
10069 }
10070 vector<ceph_tid_t> tids;
10071 cancel_flush(fop, false, &tids);
10072 osd->objecter->op_cancel(tids, -ECANCELED);
10073 }
10074
10075 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10076 int r = start_manifest_flush(op, obc, blocking, std::move(on_flush));
10077 if (r != -EINPROGRESS) {
10078 if (blocking)
10079 obc->stop_block();
10080 }
10081 return r;
10082 }
10083
10084 /**
10085 * In general, we need to send a delete and a copyfrom.
10086 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10087 * where 4 is marked as clean. To flush 10, we have to:
10088 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10089 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10090 *
10091 * There is a complicating case. Suppose there had been a clone 7
10092 * for snaps [7, 6] which has been trimmed because those snaps no longer exist.
10093 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10094 * the delete, the snap will be promoted to 5, and the head will become
10095 * a whiteout. When the copy-from goes through, we'll end up with
10096 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10097 *
10098 * Another complication is the case where there is an interval change
10099 * after doing the delete and the flush but before marking the object
10100 * clean. We'll happily delete head and then recreate it at the same
10101 * sequence number, which works out ok.
10102 */
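/*
* Continuing the example above: to flush clone 10 (clone_snaps
* 10 -> [10, 9]) the min included snap is 9, so
* snapc = get_ssc_as_of(8) = 8:[8,4,3,2]; the next older clone is 4,
* so dsnapc = get_ssc_as_of(4) = 4:[4,3,2]. dsnapc.seq (4) < snapc.seq
* (8), so we first submit the delete with dsnapc and then the copy-from
* with snapc -- exactly the two steps listed above.
*/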
10103
10104 SnapContext snapc, dsnapc;
10105 if (snapset.seq != 0) {
10106 if (soid.snap == CEPH_NOSNAP) {
10107 snapc = snapset.get_ssc_as_of(snapset.seq);
10108 } else {
10109 snapid_t min_included_snap;
10110 auto p = snapset.clone_snaps.find(soid.snap);
10111 ceph_assert(p != snapset.clone_snaps.end());
10112 min_included_snap = p->second.back();
10113 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
10114 }
10115
10116 snapid_t prev_snapc = 0;
10117 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
10118 citer != snapset.clones.rend();
10119 ++citer) {
10120 if (*citer < soid.snap) {
10121 prev_snapc = *citer;
10122 break;
10123 }
10124 }
10125
10126 dsnapc = snapset.get_ssc_as_of(prev_snapc);
10127 }
10128
10129 object_locator_t base_oloc(soid);
10130 base_oloc.pool = pool.info.tier_of;
10131
10132 if (dsnapc.seq < snapc.seq) {
10133 ObjectOperation o;
10134 o.remove();
10135 osd->objecter->mutate(
10136 soid.oid,
10137 base_oloc,
10138 o,
10139 dsnapc,
10140 ceph::real_clock::from_ceph_timespec(oi.mtime),
10141 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
10142 CEPH_OSD_FLAG_ENFORCE_SNAPC),
10143 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
10144 }
10145
10146 FlushOpRef fop(std::make_shared<FlushOp>());
10147 fop->obc = obc;
10148 fop->flushed_version = oi.user_version;
10149 fop->blocking = blocking;
10150 fop->on_flush = std::move(on_flush);
10151 fop->op = op;
10152
10153 ObjectOperation o;
10154 if (oi.is_whiteout()) {
10155 fop->removal = true;
10156 o.remove();
10157 } else {
10158 object_locator_t oloc(soid);
10159 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
10160 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
10161 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
10162 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
10163 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
10164 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
10165
10166 // hint that the base tier need not cache the data after this
10167 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
10168 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
10169 }
10170 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
10171
10172 ceph_tid_t tid = osd->objecter->mutate(
10173 soid.oid, base_oloc, o, snapc,
10174 ceph::real_clock::from_ceph_timespec(oi.mtime),
10175 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
10176 new C_OnFinisher(fin,
10177 osd->get_objecter_finisher(get_pg_shard())));
10178 /* we're under the pg lock and fin->finish() is grabbing that */
10179 fin->tid = tid;
10180 fop->objecter_tid = tid;
10181
10182 flush_ops[soid] = fop;
10183
10184 recovery_state.update_stats(
10185 [&oi](auto &history, auto &stats) {
10186 stats.stats.sum.num_flush++;
10187 stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
10188 return false;
10189 });
10190 return -EINPROGRESS;
10191 }
10192
10193 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
10194 {
10195 dout(10) << __func__ << " " << oid << " tid " << tid
10196 << " " << cpp_strerror(r) << dendl;
10197 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
10198 if (p == flush_ops.end()) {
10199 dout(10) << __func__ << " no flush_op found" << dendl;
10200 return;
10201 }
10202 FlushOpRef fop = p->second;
10203 if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
10204 dout(10) << __func__ << " tid " << tid << " != fop " << fop
10205 << " tid " << fop->objecter_tid << dendl;
10206 return;
10207 }
10208 ObjectContextRef obc = fop->obc;
10209 fop->objecter_tid = 0;
10210
10211 if (r < 0 && !(r == -ENOENT && fop->removal)) {
10212 if (fop->op)
10213 osd->reply_op_error(fop->op, -EBUSY);
10214 if (fop->blocking) {
10215 obc->stop_block();
10216 kick_object_context_blocked(obc);
10217 }
10218
10219 if (!fop->dup_ops.empty()) {
10220 dout(20) << __func__ << " requeueing dups" << dendl;
10221 requeue_ops(fop->dup_ops);
10222 }
10223 if (fop->on_flush) {
10224 (*(fop->on_flush))();
10225 fop->on_flush = std::nullopt;
10226 }
10227 flush_ops.erase(oid);
10228 return;
10229 }
10230
10231 r = try_flush_mark_clean(fop);
10232 if (r == -EBUSY && fop->op) {
10233 osd->reply_op_error(fop->op, r);
10234 }
10235 }
10236
10237 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
10238 {
10239 ObjectContextRef obc = fop->obc;
10240 const hobject_t& oid = obc->obs.oi.soid;
10241
10242 if (fop->blocking) {
10243 obc->stop_block();
10244 kick_object_context_blocked(obc);
10245 }
10246
10247 if (fop->flushed_version != obc->obs.oi.user_version ||
10248 !obc->obs.exists) {
10249 if (obc->obs.exists)
10250 dout(10) << __func__ << " flushed_version " << fop->flushed_version
10251 << " != current " << obc->obs.oi.user_version
10252 << dendl;
10253 else
10254 dout(10) << __func__ << " object no longer exists" << dendl;
10255
10256 if (!fop->dup_ops.empty()) {
10257 dout(20) << __func__ << " requeueing dups" << dendl;
10258 requeue_ops(fop->dup_ops);
10259 }
10260 if (fop->on_flush) {
10261 (*(fop->on_flush))();
10262 fop->on_flush = std::nullopt;
10263 }
10264 flush_ops.erase(oid);
10265 if (fop->blocking)
10266 osd->logger->inc(l_osd_tier_flush_fail);
10267 else
10268 osd->logger->inc(l_osd_tier_try_flush_fail);
10269 return -EBUSY;
10270 }
10271
10272 if (!fop->blocking &&
10273 write_blocked_by_scrub(oid)) {
10274 if (fop->op) {
10275 dout(10) << __func__ << " blocked by scrub" << dendl;
10276 requeue_op(fop->op);
10277 requeue_ops(fop->dup_ops);
10278 return -EAGAIN; // will retry
10279 } else {
10280 osd->logger->inc(l_osd_tier_try_flush_fail);
10281 vector<ceph_tid_t> tids;
10282 cancel_flush(fop, false, &tids);
10283 osd->objecter->op_cancel(tids, -ECANCELED);
10284 return -ECANCELED;
10285 }
10286 }
10287
10288 // successfully flushed, can we evict this object?
10289 if (!obc->obs.oi.has_manifest() && !fop->op &&
10290 agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
10291 agent_maybe_evict(obc, true)) {
10292 osd->logger->inc(l_osd_tier_clean);
10293 if (fop->on_flush) {
10294 (*(fop->on_flush))();
10295 fop->on_flush = std::nullopt;
10296 }
10297 flush_ops.erase(oid);
10298 return 0;
10299 }
10300
10301 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
10302 OpContextUPtr ctx = simple_opc_create(fop->obc);
10303
10304 // successfully flushed; can we clear the dirty bit?
10305 // try to take the write lock manually, since this internally
10306 // created ctx does not hold any locks yet.
10307 if (ctx->lock_manager.get_lock_type(
10308 RWState::RWWRITE,
10309 oid,
10310 obc,
10311 fop->op)) {
10312 dout(20) << __func__ << " took write lock" << dendl;
10313 } else if (fop->op) {
10314 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
10315 << fop->dup_ops << dendl;
10316 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10317 for (auto op : fop->dup_ops) {
10318 bool locked = ctx->lock_manager.get_lock_type(
10319 RWState::RWWRITE,
10320 oid,
10321 obc,
10322 op);
10323 ceph_assert(!locked);
10324 }
10325 close_op_ctx(ctx.release());
10326 return -EAGAIN; // will retry
10327 } else {
10328 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
10329 close_op_ctx(ctx.release());
10330 osd->logger->inc(l_osd_tier_try_flush_fail);
10331 vector<ceph_tid_t> tids;
10332 cancel_flush(fop, false, &tids);
10333 osd->objecter->op_cancel(tids, -ECANCELED);
10334 return -ECANCELED;
10335 }
10336
10337 if (fop->on_flush) {
10338 ctx->register_on_finish(*(fop->on_flush));
10339 fop->on_flush = std::nullopt;
10340 }
10341
10342 ctx->at_version = get_next_version();
10343
10344 ctx->new_obs = obc->obs;
10345 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10346 --ctx->delta_stats.num_objects_dirty;
10347 if (fop->obc->obs.oi.has_manifest()) {
10348 ceph_assert(obc->obs.oi.manifest.is_chunked());
10349 PGTransaction* t = ctx->op_t.get();
10350 uint64_t chunks_size = 0;
10351 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10352 chunks_size += p.second.length;
10353 }
10354 if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
10355 t->omap_clear(oid);
10356 ctx->new_obs.oi.clear_omap_digest();
10357 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10358 ctx->clean_regions.mark_omap_dirty();
10359 }
10360 if (obc->obs.oi.size == chunks_size) {
10361 t->truncate(oid, 0);
10362 interval_set<uint64_t> trim;
10363 trim.insert(0, ctx->new_obs.oi.size);
10364 ctx->modified_ranges.union_of(trim);
10365 truncate_update_size_and_usage(ctx->delta_stats,
10366 ctx->new_obs.oi,
10367 0);
10368 ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
10369 ctx->new_obs.oi.new_object();
10370 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10371 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10372 p.second.set_flag(chunk_info_t::FLAG_MISSING);
10373 }
10374 } else {
10375 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10376 if (p.second.is_dirty()) {
10377 dout(20) << __func__ << " offset: " << p.second.offset
10378 << " length: " << p.second.length << dendl;
10379 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10380 p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
10381 }
10382 }
10383 }
10384 }
10385
10386 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10387
10388 osd->logger->inc(l_osd_tier_clean);
10389
10390 if (!fop->dup_ops.empty() || fop->op) {
10391 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
10392 list<OpRequestRef> ls;
10393 if (fop->op)
10394 ls.push_back(fop->op);
10395 ls.splice(ls.end(), fop->dup_ops);
10396 requeue_ops(ls);
10397 }
10398
10399 simple_opc_submit(std::move(ctx));
10400
10401 flush_ops.erase(oid);
10402
10403 if (fop->blocking)
10404 osd->logger->inc(l_osd_tier_flush);
10405 else
10406 osd->logger->inc(l_osd_tier_try_flush);
10407
10408 return -EINPROGRESS;
10409 }
10410
10411 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
10412 vector<ceph_tid_t> *tids)
10413 {
10414 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
10415 << fop->objecter_tid << dendl;
10416 if (fop->objecter_tid) {
10417 tids->push_back(fop->objecter_tid);
10418 fop->objecter_tid = 0;
10419 }
10420 if (fop->io_tids.size()) {
10421 for (auto &p : fop->io_tids) {
10422 tids->push_back(p.second);
10423 p.second = 0;
10424 }
10425 }
10426 if (fop->blocking && fop->obc->is_blocked()) {
10427 fop->obc->stop_block();
10428 kick_object_context_blocked(fop->obc);
10429 }
10430 if (requeue) {
10431 if (fop->op)
10432 requeue_op(fop->op);
10433 requeue_ops(fop->dup_ops);
10434 }
10435 if (fop->on_flush) {
10436 (*(fop->on_flush))();
10437 fop->on_flush = std::nullopt;
10438 }
10439 flush_ops.erase(fop->obc->obs.oi.soid);
10440 }
10441
10442 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
10443 {
10444 dout(10) << __func__ << dendl;
10445 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
10446 while (p != flush_ops.end()) {
10447 cancel_flush((p++)->second, requeue, tids);
10448 }
10449 }
10450
10451 bool PrimaryLogPG::is_present_clone(hobject_t coid)
10452 {
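// Note the inverted-looking logic below: if the pool does not allow
// incomplete clones, every clone is presumed present, and a clone that
// is merely missing (not yet recovered) also counts as present.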
10453 if (!pool.info.allow_incomplete_clones())
10454 return true;
10455 if (is_missing_object(coid))
10456 return true;
10457 ObjectContextRef obc = get_object_context(coid, false);
10458 return obc && obc->obs.exists;
10459 }
10460
10461 // ========================================================================
10462 // rep op gather
10463
10464 class C_OSD_RepopCommit : public Context {
10465 PrimaryLogPGRef pg;
10466 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
10467 public:
10468 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
10469 : pg(pg), repop(repop) {}
10470 void finish(int) override {
10471 pg->repop_all_committed(repop.get());
10472 }
10473 };
10474
10475 void PrimaryLogPG::repop_all_committed(RepGather *repop)
10476 {
10477 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
10478 << dendl;
10479 repop->all_committed = true;
10480 if (!repop->rep_aborted) {
10481 if (repop->v != eversion_t()) {
10482 recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
10483 }
10484 eval_repop(repop);
10485 }
10486 }
10487
10488 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
10489 {
10490 dout(10) << "op_applied version " << applied_version << dendl;
10491 ceph_assert(applied_version != eversion_t());
10492 ceph_assert(applied_version <= info.last_update);
10493 recovery_state.local_write_applied(applied_version);
10494 if (is_primary()) {
10495 if (scrubber.active) {
10496 if (recovery_state.get_last_update_applied() >=
10497 scrubber.subset_last_update) {
10498 requeue_scrub(ops_blocked_by_scrub());
10499 }
10500 } else {
10501 ceph_assert(scrubber.start == scrubber.end);
10502 }
10503 }
10504 }
10505
10506 void PrimaryLogPG::eval_repop(RepGather *repop)
10507 {
10508 dout(10) << "eval_repop " << *repop
10509 << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
10510
10511 // ondisk?
10512 if (repop->all_committed) {
10513 dout(10) << " commit: " << *repop << dendl;
10514 for (auto p = repop->on_committed.begin();
10515 p != repop->on_committed.end();
10516 repop->on_committed.erase(p++)) {
10517 (*p)();
10518 }
10519 // send dup commits, in order
10520 auto it = waiting_for_ondisk.find(repop->v);
10521 if (it != waiting_for_ondisk.end()) {
10522 ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
10523 for (auto& i : it->second) {
10524 int return_code = repop->r;
10525 if (return_code >= 0) {
10526 return_code = std::get<2>(i);
10527 }
10528 osd->reply_op_error(std::get<0>(i), return_code, repop->v,
10529 std::get<1>(i), std::get<3>(i));
10530 }
10531 waiting_for_ondisk.erase(it);
10532 }
10533
10534 publish_stats_to_osd();
10535
10536 dout(10) << " removing " << *repop << dendl;
10537 ceph_assert(!repop_queue.empty());
10538 dout(20) << " q front is " << *repop_queue.front() << dendl;
10539 if (repop_queue.front() == repop) {
10540 RepGather *to_remove = nullptr;
10541 while (!repop_queue.empty() &&
10542 (to_remove = repop_queue.front())->all_committed) {
10543 repop_queue.pop_front();
10544 for (auto p = to_remove->on_success.begin();
10545 p != to_remove->on_success.end();
10546 to_remove->on_success.erase(p++)) {
10547 (*p)();
10548 }
10549 remove_repop(to_remove);
10550 }
10551 }
10552 }
10553 }
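
// Note: completed repops are only reaped from the front of repop_queue,
// so a repop that commits out of order merely sits there with
// all_committed set until everything ahead of it has committed too;
// this keeps completions in submission order.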
10554
10555 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
10556 {
10557 FUNCTRACE(cct);
10558 const hobject_t& soid = ctx->obs->oi.soid;
10559 dout(7) << "issue_repop rep_tid " << repop->rep_tid
10560 << " o " << soid
10561 << dendl;
10562
10563 repop->v = ctx->at_version;
10564
10565 ctx->op_t->add_obc(ctx->obc);
10566 if (ctx->clone_obc) {
10567 ctx->op_t->add_obc(ctx->clone_obc);
10568 }
10569 if (ctx->head_obc) {
10570 ctx->op_t->add_obc(ctx->head_obc);
10571 }
10572
10573 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
10574 if (!(ctx->log.empty())) {
10575 ceph_assert(ctx->at_version >= projected_last_update);
10576 projected_last_update = ctx->at_version;
10577 }
10578 for (auto &&entry: ctx->log) {
10579 projected_log.add(entry);
10580 }
10581
10582 recovery_state.pre_submit_op(
10583 soid,
10584 ctx->log,
10585 ctx->at_version);
10586 pgbackend->submit_transaction(
10587 soid,
10588 ctx->delta_stats,
10589 ctx->at_version,
10590 std::move(ctx->op_t),
10591 recovery_state.get_pg_trim_to(),
10592 recovery_state.get_min_last_complete_ondisk(),
10593 ctx->log,
10594 ctx->updated_hset_history,
10595 on_all_commit,
10596 repop->rep_tid,
10597 ctx->reqid,
10598 ctx->op);
10599 }
10600
10601 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
10602 OpContext *ctx, ObjectContextRef obc,
10603 ceph_tid_t rep_tid)
10604 {
10605 if (ctx->op)
10606 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
10607 else
10608 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
10609
10610 RepGather *repop = new RepGather(
10611 ctx, rep_tid, info.last_complete);
10612
10613 repop->start = ceph_clock_now();
10614
10615 repop_queue.push_back(&repop->queue_item);
10616 repop->get();
10617
10618 osd->logger->inc(l_osd_op_wip);
10619
10620 dout(10) << __func__ << ": " << *repop << dendl;
10621 return repop;
10622 }
10623
10624 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
10625 eversion_t version,
10626 int r,
10627 ObcLockManager &&manager,
10628 OpRequestRef &&op,
10629 std::optional<std::function<void(void)> > &&on_complete)
10630 {
10631 RepGather *repop = new RepGather(
10632 std::move(manager),
10633 std::move(op),
10634 std::move(on_complete),
10635 osd->get_tid(),
10636 info.last_complete,
10637 r);
10638 repop->v = version;
10639
10640 repop->start = ceph_clock_now();
10641
10642 repop_queue.push_back(&repop->queue_item);
10643
10644 osd->logger->inc(l_osd_op_wip);
10645
10646 dout(10) << __func__ << ": " << *repop << dendl;
10647 return boost::intrusive_ptr<RepGather>(repop);
10648 }
10649
10650 void PrimaryLogPG::remove_repop(RepGather *repop)
10651 {
10652 dout(20) << __func__ << " " << *repop << dendl;
10653
10654 for (auto p = repop->on_finish.begin();
10655 p != repop->on_finish.end();
10656 repop->on_finish.erase(p++)) {
10657 (*p)();
10658 }
10659
10660 release_object_locks(
10661 repop->lock_manager);
10662 repop->put();
10663
10664 osd->logger->dec(l_osd_op_wip);
10665 }
10666
10667 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
10668 {
10669 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
10670 ceph_tid_t rep_tid = osd->get_tid();
10671 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
10672 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
10673 ctx->op_t.reset(new PGTransaction());
10674 ctx->mtime = ceph_clock_now();
10675 return ctx;
10676 }
10677
10678 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
10679 {
10680 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
10681 dout(20) << __func__ << " " << repop << dendl;
10682 issue_repop(repop, ctx.get());
10683 eval_repop(repop);
10684 recovery_state.update_trim_to();
10685 repop->put();
10686 }
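
// Typical usage of the pair above for internal (non-client) writes, as
// seen throughout this file -- e.g. removing a partial temp object:
//
//   OpContextUPtr ctx = simple_opc_create(tempobc);
//   ctx->op_t->remove(temp_oid);            // stage transaction ops
//   ctx->at_version = get_next_version();   // when a log entry is wanted
//   simple_opc_submit(std::move(ctx));      // issue_repop + eval_repop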
10687
10688
10689 void PrimaryLogPG::submit_log_entries(
10690 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
10691 ObcLockManager &&manager,
10692 std::optional<std::function<void(void)> > &&_on_complete,
10693 OpRequestRef op,
10694 int r)
10695 {
10696 dout(10) << __func__ << " " << entries << dendl;
10697 ceph_assert(is_primary());
10698
10699 eversion_t version;
10700 if (!entries.empty()) {
10701 ceph_assert(entries.rbegin()->version >= projected_last_update);
10702 version = projected_last_update = entries.rbegin()->version;
10703 }
10704
10705 boost::intrusive_ptr<RepGather> repop;
10706 std::optional<std::function<void(void)> > on_complete;
10707 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
10708 repop = new_repop(
10709 version,
10710 r,
10711 std::move(manager),
10712 std::move(op),
10713 std::move(_on_complete));
10714 } else {
10715 on_complete = std::move(_on_complete);
10716 }
10717
10718 pgbackend->call_write_ordered(
10719 [this, entries, repop, on_complete]() {
10720 ObjectStore::Transaction t;
10721 eversion_t old_last_update = info.last_update;
10722 recovery_state.merge_new_log_entries(
10723 entries, t, recovery_state.get_pg_trim_to(),
10724 recovery_state.get_min_last_complete_ondisk());
10725
10726 set<pg_shard_t> waiting_on;
10727 for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
10728 i != get_acting_recovery_backfill().end();
10729 ++i) {
10730 pg_shard_t peer(*i);
10731 if (peer == pg_whoami) continue;
10732 ceph_assert(recovery_state.get_peer_missing().count(peer));
10733 ceph_assert(recovery_state.has_peer_info(peer));
10734 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
10735 ceph_assert(repop);
10736 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
10737 entries,
10738 spg_t(info.pgid.pgid, i->shard),
10739 pg_whoami.shard,
10740 get_osdmap_epoch(),
10741 get_last_peering_reset(),
10742 repop->rep_tid,
10743 recovery_state.get_pg_trim_to(),
10744 recovery_state.get_min_last_complete_ondisk());
10745 osd->send_message_osd_cluster(
10746 peer.osd, m, get_osdmap_epoch());
10747 waiting_on.insert(peer);
10748 } else {
10749 MOSDPGLog *m = new MOSDPGLog(
10750 peer.shard, pg_whoami.shard,
10751 info.last_update.epoch,
10752 info, get_last_peering_reset());
10753 m->log.log = entries;
10754 m->log.tail = old_last_update;
10755 m->log.head = info.last_update;
10756 osd->send_message_osd_cluster(
10757 peer.osd, m, get_osdmap_epoch());
10758 }
10759 }
10760 ceph_tid_t rep_tid = repop->rep_tid;
10761 waiting_on.insert(pg_whoami);
10762 log_entry_update_waiting_on.insert(
10763 make_pair(
10764 rep_tid,
10765 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
10766 ));
10767 struct OnComplete : public Context {
10768 PrimaryLogPGRef pg;
10769 ceph_tid_t rep_tid;
10770 epoch_t epoch;
10771 OnComplete(
10772 PrimaryLogPGRef pg,
10773 ceph_tid_t rep_tid,
10774 epoch_t epoch)
10775 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
10776 void finish(int) override {
10777 std::scoped_lock l{*pg};
10778 if (!pg->pg_has_reset_since(epoch)) {
10779 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
10780 ceph_assert(it != pg->log_entry_update_waiting_on.end());
10781 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
10782 ceph_assert(it2 != it->second.waiting_on.end());
10783 it->second.waiting_on.erase(it2);
10784 if (it->second.waiting_on.empty()) {
10785 pg->repop_all_committed(it->second.repop.get());
10786 pg->log_entry_update_waiting_on.erase(it);
10787 }
10788 }
10789 }
10790 };
10791 t.register_on_commit(
10792 new OnComplete{this, rep_tid, get_osdmap_epoch()});
10793 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
10794 ceph_assert(r == 0);
10795 op_applied(info.last_update);
10796 });
10797
10798 recovery_state.update_trim_to();
10799 }
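
// Commit bookkeeping for submit_log_entries(), in brief: the primary
// inserts itself plus every peer it messaged into waiting_on, keyed by
// the repop's rep_tid. The local commit (OnComplete above) and each
// MOSDPGUpdateLogMissingReply then erase one shard; whichever erasure
// empties the set calls repop_all_committed(). Condensed:
//
//   waiting_on = { pg_whoami, peer_1, ..., peer_n };
//   on commit/reply from shard s:
//     waiting_on.erase(s);
//     if (waiting_on.empty()) repop_all_committed(repop);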
10800
10801 void PrimaryLogPG::cancel_log_updates()
10802 {
10803 // get rid of all the LogUpdateCtx so their references to repops are
10804 // dropped
10805 log_entry_update_waiting_on.clear();
10806 }
10807
10808 // -------------------------------------------------------
10809
10810 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
10811 {
10812 std::scoped_lock l{*this};
10813 pair<hobject_t, ObjectContextRef> i;
10814 while (object_contexts.get_next(i.first, &i)) {
10815 ObjectContextRef obc(i.second);
10816 get_obc_watchers(obc, *ls);
10817 }
10818 }
10819
10820 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
10821 {
10822 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10823 obc->watchers.begin();
10824 j != obc->watchers.end();
10825 ++j) {
10826 obj_watch_item_t owi;
10827
10828 owi.obj = obc->obs.oi.soid;
10829 owi.wi.addr = j->second->get_peer_addr();
10830 owi.wi.name = j->second->get_entity();
10831 owi.wi.cookie = j->second->get_cookie();
10832 owi.wi.timeout_seconds = j->second->get_timeout();
10833
10834 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
10835 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
10836
10837 pg_watchers.push_back(owi);
10838 }
10839 }
10840
10841 void PrimaryLogPG::check_blacklisted_watchers()
10842 {
10843 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
10844 pair<hobject_t, ObjectContextRef> i;
10845 while (object_contexts.get_next(i.first, &i))
10846 check_blacklisted_obc_watchers(i.second);
10847 }
10848
10849 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
10850 {
10851 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
10852 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
10853 obc->watchers.begin();
10854 k != obc->watchers.end();
10855 ) {
10856 // Advance the iterator now so handle_watch_timeout() can erase the element
10857 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
10858 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
10859 entity_addr_t ea = j->second->get_peer_addr();
10860 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
10861 if (get_osdmap()->is_blacklisted(ea)) {
10862 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
10863 ceph_assert(j->second->get_pg() == this);
10864 j->second->unregister_cb();
10865 handle_watch_timeout(j->second);
10866 }
10867 }
10868 }
10869
10870 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
10871 {
10872 ceph_assert(is_active());
10873 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
10874 ceph_assert((recovering.count(obc->obs.oi.soid) ||
10875 !is_missing_object(obc->obs.oi.soid)) ||
10876 (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
10877 it_objects->second->op ==
10878 pg_log_entry_t::LOST_REVERT &&
10879 it_objects->second->reverting_to ==
10880 obc->obs.oi.version));
10881
10882 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
10883 ceph_assert(obc->watchers.empty());
10884 // populate watchers from on-disk watch_info; they start disconnected
10885 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
10886 obc->obs.oi.watchers.begin();
10887 p != obc->obs.oi.watchers.end();
10888 ++p) {
10889 utime_t expire = info.stats.last_became_active;
10890 expire += p->second.timeout_seconds;
10891 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
10892 WatchRef watch(
10893 Watch::makeWatchRef(
10894 this, osd, obc, p->second.timeout_seconds, p->first.first,
10895 p->first.second, p->second.addr));
10896 watch->disconnect();
10897 obc->watchers.insert(
10898 make_pair(
10899 make_pair(p->first.first, p->first.second),
10900 watch));
10901 }
10902 // Look for watchers from blacklisted clients and drop
10903 check_blacklisted_obc_watchers(obc);
10904 }
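
// Expiry arithmetic for the unconnected watchers above, with assumed
// numbers: if the PG last became active at t = 1000s and a watch was
// registered with timeout_seconds = 30, the logged expiry is
// expire = 1000 + 30 = 1030s; watch->disconnect() arms the timer that
// actually enforces it.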
10905
10906 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
10907 {
10908 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
10909 dout(10) << "handle_watch_timeout obc " << obc << dendl;
10910
10911 if (!is_active()) {
10912 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
10913 return;
10914 }
10915 if (!obc->obs.exists) {
10916 dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
10917 return;
10918 }
10919 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
10920 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
10921 watch->get_delayed_cb()
10922 );
10923 dout(10) << "handle_watch_timeout waiting for degraded on obj "
10924 << obc->obs.oi.soid
10925 << dendl;
10926 return;
10927 }
10928
10929 if (write_blocked_by_scrub(obc->obs.oi.soid)) {
10930 dout(10) << "handle_watch_timeout waiting for scrub on obj "
10931 << obc->obs.oi.soid
10932 << dendl;
10933 scrubber.add_callback(
10934 watch->get_delayed_cb() // requeue this timeout check once scrub unblocks writes
10935 );
10936 return;
10937 }
10938
10939 OpContextUPtr ctx = simple_opc_create(obc);
10940 ctx->at_version = get_next_version();
10941
10942 object_info_t& oi = ctx->new_obs.oi;
10943 oi.watchers.erase(make_pair(watch->get_cookie(),
10944 watch->get_entity()));
10945
10946 list<watch_disconnect_t> watch_disconnects = {
10947 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
10948 };
10949 ctx->register_on_success(
10950 [this, obc, watch_disconnects]() {
10951 complete_disconnect_watches(obc, watch_disconnects);
10952 });
10953
10954
10955 PGTransaction *t = ctx->op_t.get();
10956 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
10957 ctx->at_version,
10958 oi.version,
10959 0,
10960 osd_reqid_t(), ctx->mtime, 0));
10961
10962 oi.prior_version = obc->obs.oi.version;
10963 oi.version = ctx->at_version;
10964 bufferlist bl;
10965 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
10966 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
10967
10968 // apply new object state.
10969 ctx->obc->obs = ctx->new_obs;
10970
10971 // no ctx->delta_stats
10972 simple_opc_submit(std::move(ctx));
10973 }
10974
10975 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
10976 SnapSetContext *ssc)
10977 {
10978 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
10979 ceph_assert(obc->destructor_callback == NULL);
10980 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
10981 obc->obs.oi = oi;
10982 obc->obs.exists = false;
10983 obc->ssc = ssc;
10984 if (ssc)
10985 register_snapset_context(ssc);
10986 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
10987 if (is_active())
10988 populate_obc_watchers(obc);
10989 return obc;
10990 }
10991
10992 ObjectContextRef PrimaryLogPG::get_object_context(
10993 const hobject_t& soid,
10994 bool can_create,
10995 const map<string, bufferlist> *attrs)
10996 {
10997 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
10998 ceph_assert(
10999 attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
11000 // or this is a revert... see recover_primary()
11001 (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
11002 it_objects->second->op ==
11003 pg_log_entry_t::LOST_REVERT));
11004 ObjectContextRef obc = object_contexts.lookup(soid);
11005 osd->logger->inc(l_osd_object_ctx_cache_total);
11006 if (obc) {
11007 osd->logger->inc(l_osd_object_ctx_cache_hit);
11008 dout(10) << __func__ << ": found obc in cache: " << obc
11009 << dendl;
11010 } else {
11011 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11012 // check disk
11013 bufferlist bv;
11014 if (attrs) {
11015 auto it_oi = attrs->find(OI_ATTR);
11016 ceph_assert(it_oi != attrs->end());
11017 bv = it_oi->second;
11018 } else {
11019 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11020 if (r < 0) {
11021 if (!can_create) {
11022 dout(10) << __func__ << ": no obc for soid "
11023 << soid << " and !can_create"
11024 << dendl;
11025 return ObjectContextRef(); // -ENOENT!
11026 }
11027
11028 dout(10) << __func__ << ": no obc for soid "
11029 << soid << " but can_create"
11030 << dendl;
11031 // new object.
11032 object_info_t oi(soid);
11033 SnapSetContext *ssc = get_snapset_context(
11034 soid, true, 0, false);
11035 ceph_assert(ssc);
11036 obc = create_object_context(oi, ssc);
11037 dout(10) << __func__ << ": " << obc << " " << soid
11038 << " " << obc->rwstate
11039 << " oi: " << obc->obs.oi
11040 << " ssc: " << obc->ssc
11041 << " snapset: " << obc->ssc->snapset << dendl;
11042 return obc;
11043 }
11044 }
11045
11046 object_info_t oi;
11047 try {
11048 bufferlist::const_iterator bliter = bv.begin();
11049 decode(oi, bliter);
11050 } catch (...) {
11051 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11052 return ObjectContextRef(); // -ENOENT!
11053 }
11054
11055 ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
11056
11057 obc = object_contexts.lookup_or_create(oi.soid);
11058 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11059 obc->obs.oi = oi;
11060 obc->obs.exists = true;
11061
11062 obc->ssc = get_snapset_context(
11063 soid, true,
11064 soid.has_snapset() ? attrs : 0);
11065
11066 if (is_active())
11067 populate_obc_watchers(obc);
11068
11069 if (pool.info.is_erasure()) {
11070 if (attrs) {
11071 obc->attr_cache = *attrs;
11072 } else {
11073 int r = pgbackend->objects_get_attrs(
11074 soid,
11075 &obc->attr_cache);
11076 ceph_assert(r == 0);
11077 }
11078 }
11079
11080 dout(10) << __func__ << ": creating obc from disk: " << obc
11081 << dendl;
11082 }
11083
11084 // XXX: Caller doesn't expect this
11085 if (obc->ssc == NULL) {
11086 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11087 return ObjectContextRef(); // -ENOENT!
11088 }
11089
11090 dout(10) << __func__ << ": " << obc << " " << soid
11091 << " " << obc->rwstate
11092 << " oi: " << obc->obs.oi
11093 << " exists: " << (int)obc->obs.exists
11094 << " ssc: " << obc->ssc
11095 << " snapset: " << obc->ssc->snapset << dendl;
11096 return obc;
11097 }
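
// Resolution order above, summarized: (1) the in-memory obc cache,
// (2) the caller-supplied attrs map (OI_ATTR must be present there),
// (3) an OI_ATTR read from the backend, and only then, with can_create,
// a brand-new context. A guarded caller looks like:
//
//   ObjectContextRef obc = get_object_context(soid, false);
//   if (!obc)
//     return -ENOENT;  // missing, corrupt, or no snapset context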
11098
11099 void PrimaryLogPG::context_registry_on_change()
11100 {
11101 pair<hobject_t, ObjectContextRef> i;
11102 while (object_contexts.get_next(i.first, &i)) {
11103 ObjectContextRef obc(i.second);
11104 if (obc) {
11105 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11106 obc->watchers.begin();
11107 j != obc->watchers.end();
11108 obc->watchers.erase(j++)) {
11109 j->second->discard();
11110 }
11111 }
11112 }
11113 }
11114
11115
11116 /*
11117 * If we return an error, and set *pmissing, then promoting that
11118 * object may help.
11119 *
11120 * If we return -EAGAIN, we will always set *pmissing to the missing
11121 * object to wait for.
11122 *
11123 * If we return an error but do not set *pmissing, then we know the
11124 * object does not exist.
11125 */
11126 int PrimaryLogPG::find_object_context(const hobject_t& oid,
11127 ObjectContextRef *pobc,
11128 bool can_create,
11129 bool map_snapid_to_clone,
11130 hobject_t *pmissing)
11131 {
11132 FUNCTRACE(cct);
11133 ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
11134 // want the head?
11135 if (oid.snap == CEPH_NOSNAP) {
11136 ObjectContextRef obc = get_object_context(oid, can_create);
11137 if (!obc) {
11138 if (pmissing)
11139 *pmissing = oid;
11140 return -ENOENT;
11141 }
11142 dout(10) << __func__ << " " << oid
11143 << " @" << oid.snap
11144 << " oi=" << obc->obs.oi
11145 << dendl;
11146 *pobc = obc;
11147
11148 return 0;
11149 }
11150
11151 // we want a snap
11152
11153 hobject_t head = oid.get_head();
11154 SnapSetContext *ssc = get_snapset_context(oid, can_create);
11155 if (!ssc || !(ssc->exists || can_create)) {
11156 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11157 if (pmissing)
11158 *pmissing = head; // start by getting the head
11159 if (ssc)
11160 put_snapset_context(ssc);
11161 return -ENOENT;
11162 }
11163
11164 if (map_snapid_to_clone) {
11165 dout(10) << __func__ << " " << oid << " @" << oid.snap
11166 << " snapset " << ssc->snapset
11167 << " map_snapid_to_clone=true" << dendl;
11168 if (oid.snap > ssc->snapset.seq) {
11169 // must already be readable
11170 ObjectContextRef obc = get_object_context(head, false);
11171 dout(10) << __func__ << " " << oid << " @" << oid.snap
11172 << " snapset " << ssc->snapset
11173 << " maps to head" << dendl;
11174 *pobc = obc;
11175 put_snapset_context(ssc);
11176 return (obc && obc->obs.exists) ? 0 : -ENOENT;
11177 } else {
11178 vector<snapid_t>::const_iterator citer = std::find(
11179 ssc->snapset.clones.begin(),
11180 ssc->snapset.clones.end(),
11181 oid.snap);
11182 if (citer == ssc->snapset.clones.end()) {
11183 dout(10) << __func__ << " " << oid << " @" << oid.snap
11184 << " snapset " << ssc->snapset
11185 << " maps to nothing" << dendl;
11186 put_snapset_context(ssc);
11187 return -ENOENT;
11188 }
11189
11190 dout(10) << __func__ << " " << oid << " @" << oid.snap
11191 << " snapset " << ssc->snapset
11192 << " maps to " << oid << dendl;
11193
11194 if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
11195 dout(10) << __func__ << " " << oid << " @" << oid.snap
11196 << " snapset " << ssc->snapset
11197 << " " << oid << " is missing" << dendl;
11198 if (pmissing)
11199 *pmissing = oid;
11200 put_snapset_context(ssc);
11201 return -EAGAIN;
11202 }
11203
11204 ObjectContextRef obc = get_object_context(oid, false);
11205 if (!obc || !obc->obs.exists) {
11206 dout(10) << __func__ << " " << oid << " @" << oid.snap
11207 << " snapset " << ssc->snapset
11208 << " " << oid << " is not present" << dendl;
11209 if (pmissing)
11210 *pmissing = oid;
11211 put_snapset_context(ssc);
11212 return -ENOENT;
11213 }
11214 dout(10) << __func__ << " " << oid << " @" << oid.snap
11215 << " snapset " << ssc->snapset
11216 << " " << oid << " HIT" << dendl;
11217 *pobc = obc;
11218 put_snapset_context(ssc);
11219 return 0;
11220 }
11221 ceph_abort(); //unreachable
11222 }
11223
11224 dout(10) << __func__ << " " << oid << " @" << oid.snap
11225 << " snapset " << ssc->snapset << dendl;
11226
11227 // head?
11228 if (oid.snap > ssc->snapset.seq) {
11229 ObjectContextRef obc = get_object_context(head, false);
11230 dout(10) << __func__ << " " << head
11231 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
11232 << " -- HIT " << obc->obs
11233 << dendl;
11234 if (!obc->ssc)
11235 obc->ssc = ssc;
11236 else {
11237 ceph_assert(ssc == obc->ssc);
11238 put_snapset_context(ssc);
11239 }
11240 *pobc = obc;
11241 return 0;
11242 }
11243
11244 // which clone would it be?
11245 unsigned k = 0;
11246 while (k < ssc->snapset.clones.size() &&
11247 ssc->snapset.clones[k] < oid.snap)
11248 k++;
11249 if (k == ssc->snapset.clones.size()) {
11250 dout(10) << __func__ << " no clones with last >= oid.snap "
11251 << oid.snap << " -- DNE" << dendl;
11252 put_snapset_context(ssc);
11253 return -ENOENT;
11254 }
11255 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
11256 info.pgid.pool(), oid.get_namespace());
11257
11258 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
11259 dout(20) << __func__ << " " << soid << " missing, try again later"
11260 << dendl;
11261 if (pmissing)
11262 *pmissing = soid;
11263 put_snapset_context(ssc);
11264 return -EAGAIN;
11265 }
11266
11267 ObjectContextRef obc = get_object_context(soid, false);
11268 if (!obc || !obc->obs.exists) {
11269 if (pmissing)
11270 *pmissing = soid;
11271 put_snapset_context(ssc);
11272 if (is_primary()) {
11273 if (is_degraded_or_backfilling_object(soid)) {
11274 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
11275 return -EAGAIN;
11276 } else if (is_degraded_on_async_recovery_target(soid)) {
11277 dout(20) << __func__ << " clone is recovering " << soid << dendl;
11278 return -EAGAIN;
11279 } else {
11280 dout(20) << __func__ << " missing clone " << soid << dendl;
11281 return -ENOENT;
11282 }
11283 } else {
11284 dout(20) << __func__ << " replica missing clone " << soid << dendl;
11285 return -ENOENT;
11286 }
11287 }
11288
11289 if (!obc->ssc) {
11290 obc->ssc = ssc;
11291 } else {
11292 ceph_assert(obc->ssc == ssc);
11293 put_snapset_context(ssc);
11294 }
11295 ssc = 0;
11296
11297 // clone
11298 dout(20) << __func__ << " " << soid
11299 << " snapset " << obc->ssc->snapset
11300 << dendl;
11301 snapid_t first, last;
11302 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
11303 ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
11304 if (p->second.empty()) {
11305 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
11306 ceph_assert(!cct->_conf->osd_debug_verify_snaps);
11307 return -ENOENT;
11308 }
11309 if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
11310 p->second.end()) {
11311 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
11312 << " does not contain " << oid.snap << " -- DNE" << dendl;
11313 return -ENOENT;
11314 }
11315 if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
11316 dout(20) << __func__ << " " << soid << " snap " << oid.snap
11317 << " in removed_snaps_queue" << " -- DNE" << dendl;
11318 return -ENOENT;
11319 }
11320 dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
11321 << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
11322 *pobc = obc;
11323 return 0;
11324 }
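
// Driving the contract above from a caller -- a condensed sketch
// (the waiter helper shown is illustrative):
//
//   hobject_t missing;
//   int r = find_object_context(oid, &obc, can_create, false, &missing);
//   if (r == -EAGAIN) {
//     wait_for_unreadable_object(missing, op);  // retry once recovered
//   } else if (r < 0) {
//     // object/clone does not exist; reply with the error
//   }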
11325
11326 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
11327 {
11328 if (obc->ssc)
11329 put_snapset_context(obc->ssc);
11330 }
11331
11332 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
11333 {
11334 object_info_t& oi = obc->obs.oi;
11335
11336 dout(10) << __func__ << " " << oi.soid << dendl;
11337 ceph_assert(!oi.soid.is_snapdir());
11338
11339 object_stat_sum_t stat;
11340 stat.num_objects++;
11341 if (oi.is_dirty())
11342 stat.num_objects_dirty++;
11343 if (oi.is_whiteout())
11344 stat.num_whiteouts++;
11345 if (oi.is_omap())
11346 stat.num_objects_omap++;
11347 if (oi.is_cache_pinned())
11348 stat.num_objects_pinned++;
11349 if (oi.has_manifest())
11350 stat.num_objects_manifest++;
11351
11352 if (oi.soid.is_snap()) {
11353 stat.num_object_clones++;
11354
11355 if (!obc->ssc)
11356 obc->ssc = get_snapset_context(oi.soid, false);
11357 ceph_assert(obc->ssc);
11358 stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
11359 } else {
11360 stat.num_bytes += oi.size;
11361 }
11362
11363 // add it in
11364 pgstat->stats.sum.add(stat);
11365 }
11366
11367 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
11368 {
11369 const hobject_t& soid = obc->obs.oi.soid;
11370 if (obc->is_blocked()) {
11371 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
11372 return;
11373 }
11374
11375 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
11376 if (p != waiting_for_blocked_object.end()) {
11377 list<OpRequestRef>& ls = p->second;
11378 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
11379 requeue_ops(ls);
11380 waiting_for_blocked_object.erase(p);
11381 }
11382
11383 map<hobject_t, ObjectContextRef>::iterator i =
11384 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
11385 if (i != objects_blocked_on_snap_promotion.end()) {
11386 ceph_assert(i->second == obc);
11387 objects_blocked_on_snap_promotion.erase(i);
11388 }
11389
11390 if (obc->requeue_scrub_on_unblock) {
11391 obc->requeue_scrub_on_unblock = false;
11392 // only requeue if we are still active: we may be unblocking
11393 // because we are resetting for a new peering interval
11394 if (is_active()) {
11395 requeue_scrub();
11396 }
11397 }
11398 }
11399
11400 SnapSetContext *PrimaryLogPG::get_snapset_context(
11401 const hobject_t& oid,
11402 bool can_create,
11403 const map<string, bufferlist> *attrs,
11404 bool oid_existed)
11405 {
11406 std::lock_guard l(snapset_contexts_lock);
11407 SnapSetContext *ssc;
11408 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
11409 oid.get_snapdir());
11410 if (p != snapset_contexts.end()) {
11411 if (can_create || p->second->exists) {
11412 ssc = p->second;
11413 } else {
11414 return NULL;
11415 }
11416 } else {
11417 bufferlist bv;
11418 if (!attrs) {
11419 int r = -ENOENT;
11420 if (!(oid.is_head() && !oid_existed)) {
11421 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
11422 }
11423 if (r < 0 && !can_create)
11424 return NULL;
11425 } else {
11426 auto it_ss = attrs->find(SS_ATTR);
11427 ceph_assert(it_ss != attrs->end());
11428 bv = it_ss->second;
11429 }
11430 ssc = new SnapSetContext(oid.get_snapdir());
11431 _register_snapset_context(ssc);
11432 if (bv.length()) {
11433 bufferlist::const_iterator bvp = bv.begin();
11434 try {
11435 ssc->snapset.decode(bvp);
11436 } catch (buffer::error& e) {
11437 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
11438 return NULL;
11439 }
11440 ssc->exists = true;
11441 } else {
11442 ssc->exists = false;
11443 }
11444 }
11445 ceph_assert(ssc);
11446 ssc->ref++;
11447 return ssc;
11448 }
11449
11450 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
11451 {
11452 std::lock_guard l(snapset_contexts_lock);
11453 --ssc->ref;
11454 if (ssc->ref == 0) {
11455 if (ssc->registered)
11456 snapset_contexts.erase(ssc->oid);
11457 delete ssc;
11458 }
11459 }
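
// SnapSetContext is manually refcounted: every successful
// get_snapset_context() must be balanced by put_snapset_context(),
// unless the ref is handed to an obc (obc->ssc), in which case
// object_context_destructor_callback() releases it. Sketch:
//
//   SnapSetContext *ssc = get_snapset_context(oid, false);
//   if (!ssc)
//     return -ENOENT;
//   // ... inspect ssc->snapset ...
//   put_snapset_context(ssc);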
11460
11461 /*
11462 * Return values:
11463 * NONE - didn't pull anything
11464 * YES - pulled what the caller wanted
11465 * HEAD - needed to pull head first
11466 */
11467 enum { PULL_NONE, PULL_HEAD, PULL_YES };
11468
11469 int PrimaryLogPG::recover_missing(
11470 const hobject_t &soid, eversion_t v,
11471 int priority,
11472 PGBackend::RecoveryHandle *h)
11473 {
11474 if (recovery_state.get_missing_loc().is_unfound(soid)) {
11475 dout(7) << __func__ << " " << soid
11476 << " v " << v
11477 << " but it is unfound" << dendl;
11478 return PULL_NONE;
11479 }
11480
11481 if (recovery_state.get_missing_loc().is_deleted(soid)) {
11482 start_recovery_op(soid);
11483 ceph_assert(!recovering.count(soid));
11484 recovering.insert(make_pair(soid, ObjectContextRef()));
11485 epoch_t cur_epoch = get_osdmap_epoch();
11486 remove_missing_object(soid, v, new LambdaContext(
11487 [=](int) {
11488 std::scoped_lock locker{*this};
11489 if (!pg_has_reset_since(cur_epoch)) {
11490 bool object_missing = false;
11491 for (const auto& shard : get_acting_recovery_backfill()) {
11492 if (shard == pg_whoami)
11493 continue;
11494 if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
11495 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
11496 object_missing = true;
11497 break;
11498 }
11499 }
11500 if (!object_missing) {
11501 object_stat_sum_t stat_diff;
11502 stat_diff.num_objects_recovered = 1;
11503 if (scrub_after_recovery)
11504 stat_diff.num_objects_repaired = 1;
11505 on_global_recover(soid, stat_diff, true);
11506 } else {
11507 auto recovery_handle = pgbackend->open_recovery_op();
11508 pgbackend->recover_delete_object(soid, v, recovery_handle);
11509 pgbackend->run_recovery_op(recovery_handle, priority);
11510 }
11511 }
11512 }));
11513 return PULL_YES;
11514 }
11515
11516 // Is this a snapped object? If so, consult the snapset; we may not need the entire object.
11517 ObjectContextRef obc;
11518 ObjectContextRef head_obc;
11519 if (soid.snap && soid.snap < CEPH_NOSNAP) {
11520 // do we have the head?
11521 hobject_t head = soid.get_head();
11522 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
11523 if (recovering.count(head)) {
11524 dout(10) << " missing but already recovering head " << head << dendl;
11525 return PULL_NONE;
11526 } else {
11527 int r = recover_missing(
11528 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
11529 h);
11530 if (r != PULL_NONE)
11531 return PULL_HEAD;
11532 return PULL_NONE;
11533 }
11534 }
11535 head_obc = get_object_context(
11536 head,
11537 false,
11538 0);
11539 ceph_assert(head_obc);
11540 }
11541 start_recovery_op(soid);
11542 ceph_assert(!recovering.count(soid));
11543 recovering.insert(make_pair(soid, obc));
11544 int r = pgbackend->recover_object(
11545 soid,
11546 v,
11547 head_obc,
11548 obc,
11549 h);
11550 // This is only a pull, which shouldn't return an error
11551 ceph_assert(r >= 0);
11552 return PULL_YES;
11553 }
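
// Caller-side meaning of the PULL_* results above: PULL_YES means
// recovery (or delete replay) of the requested object is now in flight;
// PULL_HEAD means the head had to be pulled first, so the clone must be
// retried once the head arrives; PULL_NONE means nothing was started
// (object unfound, or its head is already recovering).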
11554
11555 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
11556 eversion_t v, Context *on_complete)
11557 {
11558 dout(20) << __func__ << " " << soid << " " << v << dendl;
11559 ceph_assert(on_complete != nullptr);
11560 // delete locally
11561 ObjectStore::Transaction t;
11562 remove_snap_mapped_object(t, soid);
11563
11564 ObjectRecoveryInfo recovery_info;
11565 recovery_info.soid = soid;
11566 recovery_info.version = v;
11567
11568 epoch_t cur_epoch = get_osdmap_epoch();
11569 t.register_on_complete(new LambdaContext(
11570 [=](int) {
11571 std::unique_lock locker{*this};
11572 if (!pg_has_reset_since(cur_epoch)) {
11573 ObjectStore::Transaction t2;
11574 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
11575 t2.register_on_complete(on_complete);
11576 int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
11577 ceph_assert(r == 0);
11578 locker.unlock();
11579 } else {
11580 locker.unlock();
11581 on_complete->complete(-EAGAIN);
11582 }
11583 }));
11584 int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
11585 ceph_assert(r == 0);
11586 }
11587
11588 void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
11589 {
11590 dout(10) << __func__ << " " << oid << dendl;
11591 if (callbacks_for_degraded_object.count(oid)) {
11592 list<Context*> contexts;
11593 contexts.swap(callbacks_for_degraded_object[oid]);
11594 callbacks_for_degraded_object.erase(oid);
11595 for (list<Context*>::iterator i = contexts.begin();
11596 i != contexts.end();
11597 ++i) {
11598 (*i)->complete(0);
11599 }
11600 }
11601 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
11602 oid.get_head());
11603 if (i != objects_blocked_on_degraded_snap.end() &&
11604 i->second == oid.snap)
11605 objects_blocked_on_degraded_snap.erase(i);
11606 }
11607
11608 void PrimaryLogPG::_committed_pushed_object(
11609 epoch_t epoch, eversion_t last_complete)
11610 {
11611 std::scoped_lock locker{*this};
11612 if (!pg_has_reset_since(epoch)) {
11613 recovery_state.recovery_committed_to(last_complete);
11614 } else {
11615 dout(10) << __func__
11616 << " pg has changed, not touching last_complete_ondisk" << dendl;
11617 }
11618 }
11619
11620 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
11621 {
11622 dout(20) << __func__ << dendl;
11623 if (obc) {
11624 dout(20) << "obc = " << *obc << dendl;
11625 }
11626 ceph_assert(active_pushes >= 1);
11627 --active_pushes;
11628
11629 // requeue an active chunky scrub waiting on recovery ops
11630 if (!recovery_state.is_deleting() && active_pushes == 0
11631 && scrubber.is_chunky_scrub_active()) {
11632 requeue_scrub(ops_blocked_by_scrub());
11633 }
11634 }
11635
11636 void PrimaryLogPG::_applied_recovered_object_replica()
11637 {
11638 dout(20) << __func__ << dendl;
11639 ceph_assert(active_pushes >= 1);
11640 --active_pushes;
11641
11642 // requeue an active chunky scrub waiting on recovery ops
11643 if (!recovery_state.is_deleting() && active_pushes == 0 &&
11644 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
11645 scrubber.active_rep_scrub->get_req())->chunky) {
11646 auto& op = scrubber.active_rep_scrub;
11647 osd->enqueue_back(
11648 OpSchedulerItem(
11649 unique_ptr<OpSchedulerItem::OpQueueable>(new PGOpItem(info.pgid, op)),
11650 op->get_req()->get_cost(),
11651 op->get_req()->get_priority(),
11652 op->get_req()->get_recv_stamp(),
11653 op->get_req()->get_source().num(),
11654 get_osdmap_epoch()));
11655 scrubber.active_rep_scrub.reset();
11656 }
11657 }
11658
11659 void PrimaryLogPG::on_failed_pull(
11660 const set<pg_shard_t> &from,
11661 const hobject_t &soid,
11662 const eversion_t &v)
11663 {
11664 dout(20) << __func__ << ": " << soid << dendl;
11665 ceph_assert(recovering.count(soid));
11666 auto obc = recovering[soid];
11667 if (obc) {
11668 list<OpRequestRef> blocked_ops;
11669 obc->drop_recovery_read(&blocked_ops);
11670 requeue_ops(blocked_ops);
11671 }
11672 recovering.erase(soid);
11673 for (auto&& i : from) {
11674 if (i != pg_whoami) { // we'll get it below in primary_error
11675 recovery_state.force_object_missing(i, soid, v);
11676 }
11677 }
11678
11679 dout(0) << __func__ << " " << soid << " from shard " << from
11680 << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
11681 << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
11682 << dendl;
11683 finish_recovery_op(soid); // close out this attempt
11684 finish_degraded_object(soid);
11685
11686 if (from.count(pg_whoami)) {
11687 dout(0) << " primary missing oid " << soid << " version " << v << dendl;
11688 primary_error(soid, v);
11689 backfills_in_flight.erase(soid);
11690 }
11691 }
11692
11693 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
11694 {
11695 eversion_t v;
11696 pg_missing_item pmi;
11697 bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
11698 ceph_assert(is_missing);
11699 v = pmi.have;
11700 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
11701
11702 ceph_assert(!get_acting_recovery_backfill().empty());
11703 for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
11704 i != get_acting_recovery_backfill().end();
11705 ++i) {
11706 if (*i == get_primary()) continue;
11707 pg_shard_t peer = *i;
11708 if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
11709 continue;
11710 }
11711 eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
11712 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
11713 if (h > v)
11714 v = h;
11715 }
11716
11717 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
11718 return v;
11719 }
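
// Worked example with assumed versions: if the primary's missing entry
// records have = 40'98 while two peers' missing entries record
// have = 40'97 and have = 40'102 for the same oid, the function returns
// 40'102 -- the newest version still held by any acting/backfill shard,
// which is what LOST_REVERT reverts to.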
11720
11721 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
11722 {
11723 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
11724 op->get_req());
11725 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
11726 ObjectStore::Transaction t;
11727 std::optional<eversion_t> op_trim_to, op_roll_forward_to;
11728 if (m->pg_trim_to != eversion_t())
11729 op_trim_to = m->pg_trim_to;
11730 if (m->pg_roll_forward_to != eversion_t())
11731 op_roll_forward_to = m->pg_roll_forward_to;
11732
11733 dout(20) << __func__
11734 << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
11735
11736 recovery_state.append_log_entries_update_missing(
11737 m->entries, t, op_trim_to, op_roll_forward_to);
11738 eversion_t new_lcod = info.last_complete;
11739
11740 Context *complete = new LambdaContext(
11741 [=](int) {
11742 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
11743 op->get_req());
11744 std::scoped_lock locker{*this};
11745 if (!pg_has_reset_since(msg->get_epoch())) {
11746 update_last_complete_ondisk(new_lcod);
11747 MOSDPGUpdateLogMissingReply *reply =
11748 new MOSDPGUpdateLogMissingReply(
11749 spg_t(info.pgid.pgid, primary_shard().shard),
11750 pg_whoami.shard,
11751 msg->get_epoch(),
11752 msg->min_epoch,
11753 msg->get_tid(),
11754 new_lcod);
11755 reply->set_priority(CEPH_MSG_PRIO_HIGH);
11756 msg->get_connection()->send_message(reply);
11757 }
11758 });
11759
11760 if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
11761 t.register_on_commit(complete);
11762 } else {
11763 /* Hack to work around the fact that ReplicatedBackend sends
11764 * ack+commit if commit happens first
11765 *
11766 * This behavior is no longer necessary, but we preserve it so old
11767 * primaries can keep their repops in order */
11768 if (pool.info.is_erasure()) {
11769 t.register_on_complete(complete);
11770 } else {
11771 t.register_on_commit(complete);
11772 }
11773 }
11774 int tr = osd->store->queue_transaction(
11775 ch,
11776 std::move(t),
11777 nullptr);
11778 ceph_assert(tr == 0);
11779 op_applied(info.last_update);
11780 }
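
// This is the replica half of submit_log_entries(): apply the entries
// locally, then ack on commit (on complete for pre-kraken EC pools, per
// the hack above) with our new last_complete_ondisk so the primary can
// advance min_last_complete_ondisk. The reply is sent at
// CEPH_MSG_PRIO_HIGH to keep the primary's repop queue draining.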
11781
11782 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
11783 {
11784 const MOSDPGUpdateLogMissingReply *m =
11785 static_cast<const MOSDPGUpdateLogMissingReply*>(
11786 op->get_req());
11787 dout(20) << __func__ << " got reply from "
11788 << m->get_from() << dendl;
11789
11790 auto it = log_entry_update_waiting_on.find(m->get_tid());
11791 if (it != log_entry_update_waiting_on.end()) {
11792 if (it->second.waiting_on.count(m->get_from())) {
11793 it->second.waiting_on.erase(m->get_from());
11794 if (m->last_complete_ondisk != eversion_t()) {
11795 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
11796 }
11797 } else {
11798 osd->clog->error()
11799 << info.pgid << " got reply "
11800 << *m << " from shard we are not waiting for "
11801 << m->get_from();
11802 }
11803
11804 if (it->second.waiting_on.empty()) {
11805 repop_all_committed(it->second.repop.get());
11806 log_entry_update_waiting_on.erase(it);
11807 }
11808 } else {
11809 osd->clog->error()
11810 << info.pgid << " got reply "
11811 << *m << " on unknown tid " << m->get_tid();
11812 }
11813 }
11814
11815 /* Mark all unfound objects as lost.
11816 */
11817 void PrimaryLogPG::mark_all_unfound_lost(
11818 int what,
11819 std::function<void(int,const std::string&,bufferlist&)> on_finish)
11820 {
11821 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
11822 list<hobject_t> oids;
11823
11824 dout(30) << __func__ << ": log before:\n";
11825 recovery_state.get_pg_log().get_log().print(*_dout);
11826 *_dout << dendl;
11827
11828 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
11829
11830 utime_t mtime = ceph_clock_now();
11831 map<hobject_t, pg_missing_item>::const_iterator m =
11832 recovery_state.get_missing_loc().get_needs_recovery().begin();
11833 map<hobject_t, pg_missing_item>::const_iterator mend =
11834 recovery_state.get_missing_loc().get_needs_recovery().end();
11835
11836 ObcLockManager manager;
11837 eversion_t v = get_next_version();
11838 v.epoch = get_osdmap_epoch();
11839 uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
11840 while (m != mend) {
11841 const hobject_t &oid(m->first);
11842 if (!recovery_state.get_missing_loc().is_unfound(oid)) {
11843 // We only care about unfound objects
11844 ++m;
11845 continue;
11846 }
11847
11848 ObjectContextRef obc;
11849 eversion_t prev;
11850
11851 switch (what) {
11852 case pg_log_entry_t::LOST_MARK:
11853 ceph_abort_msg("actually, not implemented yet!");
11854 break;
11855
11856 case pg_log_entry_t::LOST_REVERT:
11857 prev = pick_newest_available(oid);
11858 if (prev > eversion_t()) {
11859 // log it
11860 pg_log_entry_t e(
11861 pg_log_entry_t::LOST_REVERT, oid, v,
11862 m->second.need, 0, osd_reqid_t(), mtime, 0);
11863 e.reverting_to = prev;
11864 e.mark_unrollbackable();
11865 log_entries.push_back(e);
11866 dout(10) << e << dendl;
11867
11868 // we are now missing the new version; recovery code will sort it out.
11869 ++v.version;
11870 ++m;
11871 break;
11872 }
11873
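// fall through to LOST_DELETE: nothing available to revert to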
11874 case pg_log_entry_t::LOST_DELETE:
11875 {
11876 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
11877 0, osd_reqid_t(), mtime, 0);
11878 if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11879 if (pool.info.require_rollback()) {
11880 e.mod_desc.try_rmobject(v.version);
11881 } else {
11882 e.mark_unrollbackable();
11883 }
11884 } // otherwise, just do what we used to do
11885 dout(10) << e << dendl;
11886 log_entries.push_back(e);
11887 oids.push_back(oid);
11888
11889 // If a context is found, mark the object as deleted so we don't
11890 // race with a new creation. This can happen if the object was
11891 // lost and the primary hit EIO.
11892 obc = object_contexts.lookup(oid);
11893 if (obc)
11894 obc->obs.exists = false;
11895
11896 ++v.version;
11897 ++m;
11898 }
11899 break;
11900
11901 default:
11902 ceph_abort();
11903 }
11904 }
11905
11906 recovery_state.update_stats(
11907 [](auto &history, auto &stats) {
11908 stats.stats_invalid = true;
11909 return false;
11910 });
11911
11912 submit_log_entries(
11913 log_entries,
11914 std::move(manager),
11915 std::optional<std::function<void(void)> >(
11916 [this, oids, num_unfound, on_finish]() {
11917 if (recovery_state.perform_deletes_during_peering()) {
11918 for (auto oid : oids) {
11919 // clear old locations - merge_new_log_entries will have
11920 // handled rebuilding missing_loc for each of these
11921 // objects if we have the RECOVERY_DELETES flag
11922 recovery_state.object_recovered(oid, object_stat_sum_t());
11923 }
11924 }
11925
11926 if (is_recovery_unfound()) {
11927 queue_peering_event(
11928 PGPeeringEventRef(
11929 std::make_shared<PGPeeringEvent>(
11930 get_osdmap_epoch(),
11931 get_osdmap_epoch(),
11932 PeeringState::DoRecovery())));
11933 } else if (is_backfill_unfound()) {
11934 queue_peering_event(
11935 PGPeeringEventRef(
11936 std::make_shared<PGPeeringEvent>(
11937 get_osdmap_epoch(),
11938 get_osdmap_epoch(),
11939 PeeringState::RequestBackfill())));
11940 } else {
11941 queue_recovery();
11942 }
11943
11944 stringstream ss;
11945 ss << "pg has " << num_unfound
11946 << " objects unfound and apparently lost marking";
11947 string rs = ss.str();
11948 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
11949 osd->clog->info() << rs;
11950 bufferlist empty;
11951 on_finish(0, rs, empty);
11952 }),
11953 OpRequestRef());
11954 }
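
// Per-object decision table for mark_all_unfound_lost():
//   LOST_REVERT: if pick_newest_available() found a version, log a
//                revert to it; otherwise fall through and delete.
//   LOST_DELETE: log a LOST_DELETE (rollback-aware when the pool
//                requires rollback).
//   LOST_MARK:   not implemented; aborts.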
11955
11956 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
11957 {
11958 ceph_assert(repop_queue.empty());
11959 }
11960
11961 /*
11962 * pg status change notification
11963 */
11964
11965 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
11966 {
11967 list<OpRequestRef> rq;
11968
11969 // apply all repops
11970 while (!repop_queue.empty()) {
11971 RepGather *repop = repop_queue.front();
11972 repop_queue.pop_front();
11973 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11974 repop->rep_aborted = true;
11975 repop->on_committed.clear();
11976 repop->on_success.clear();
11977
11978 if (requeue) {
11979 if (repop->op) {
11980 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
11981 rq.push_back(repop->op);
11982 repop->op = OpRequestRef();
11983 }
11984
11985 // also requeue any dups, interleaved into position
11986 auto p = waiting_for_ondisk.find(repop->v);
11987 if (p != waiting_for_ondisk.end()) {
11988 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11989 for (auto& i : p->second) {
11990 rq.push_back(std::get<0>(i));
11991 }
11992 waiting_for_ondisk.erase(p);
11993 }
11994 }
11995
11996 remove_repop(repop);
11997 }
11998
11999 ceph_assert(repop_queue.empty());
12000
12001 if (requeue) {
12002 requeue_ops(rq);
12003 if (!waiting_for_ondisk.empty()) {
12004 for (auto& i : waiting_for_ondisk) {
12005 for (auto& j : i.second) {
12006 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12007 << " waiting on " << i.first << dendl;
12008 }
12009 }
12010 ceph_assert(waiting_for_ondisk.empty());
12011 }
12012 }
12013
12014 waiting_for_ondisk.clear();
12015 }
12016
12017 void PrimaryLogPG::on_flushed()
12018 {
12019 requeue_ops(waiting_for_flush);
12020 if (!is_peered() || !is_primary()) {
12021 pair<hobject_t, ObjectContextRef> i;
12022 while (object_contexts.get_next(i.first, &i)) {
12023 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
12024 }
12025 ceph_assert(object_contexts.empty());
12026 }
12027 }
12028
12029 void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
12030 {
12031 dout(10) << __func__ << dendl;
12032
12033 on_shutdown();
12034
12035 t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12036 }
12037
12038 void PrimaryLogPG::clear_async_reads()
12039 {
12040 dout(10) << __func__ << dendl;
12041 for(auto& i : in_progress_async_reads) {
12042 dout(10) << "clear ctx: "
12043 << "OpRequestRef " << i.first
12044 << " OpContext " << i.second
12045 << dendl;
12046 close_op_ctx(i.second);
12047 }
12048 }
12049
12050 void PrimaryLogPG::clear_cache()
12051 {
12052 object_contexts.clear();
12053 }
12054
12055 void PrimaryLogPG::on_shutdown()
12056 {
12057 dout(10) << __func__ << dendl;
12058
12059 if (recovery_queued) {
12060 recovery_queued = false;
12061 osd->clear_queued_recovery(this);
12062 }
12063
12064 clear_scrub_reserved();
12065 scrub_clear_state();
12066
12067 unreg_next_scrub();
12068
12069 vector<ceph_tid_t> tids;
12070 cancel_copy_ops(false, &tids);
12071 cancel_flush_ops(false, &tids);
12072 cancel_proxy_ops(false, &tids);
12073 cancel_manifest_ops(false, &tids);
12074 osd->objecter->op_cancel(tids, -ECANCELED);
12075
12076 apply_and_flush_repops(false);
12077 cancel_log_updates();
12078 // we must remove PGRefs, so do this prior to the release_backoffs() callers
12079 clear_backoffs();
12080 // clean up snap trim references
12081 snap_trimmer_machine.process_event(Reset());
12082
12083 pgbackend->on_change();
12084
12085 context_registry_on_change();
12086 object_contexts.clear();
12087
12088 clear_async_reads();
12089
12090 osd->remote_reserver.cancel_reservation(info.pgid);
12091 osd->local_reserver.cancel_reservation(info.pgid);
12092
12093 clear_primary_state();
12094 cancel_recovery();
12095
12096 if (is_primary()) {
12097 osd->clear_ready_to_merge(this);
12098 }
12099 }
12100
12101 void PrimaryLogPG::on_activate_complete()
12102 {
12103 check_local();
12104 // waiters
12105 if (!recovery_state.needs_flush()) {
12106 requeue_ops(waiting_for_peered);
12107 } else if (!waiting_for_peered.empty()) {
12108 dout(10) << __func__ << " flushes in progress, moving "
12109 << waiting_for_peered.size()
12110 << " items to waiting_for_flush"
12111 << dendl;
12112 ceph_assert(waiting_for_flush.empty());
12113 waiting_for_flush.swap(waiting_for_peered);
12114 }
12115
12116
12117 // all clean?
12118 if (needs_recovery()) {
12119 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12120 queue_peering_event(
12121 PGPeeringEventRef(
12122 std::make_shared<PGPeeringEvent>(
12123 get_osdmap_epoch(),
12124 get_osdmap_epoch(),
12125 PeeringState::DoRecovery())));
12126 } else if (needs_backfill()) {
12127 dout(10) << "activate queueing backfill" << dendl;
12128 queue_peering_event(
12129 PGPeeringEventRef(
12130 std::make_shared<PGPeeringEvent>(
12131 get_osdmap_epoch(),
12132 get_osdmap_epoch(),
12133 PeeringState::RequestBackfill())));
12134 } else {
12135 dout(10) << "activate all replicas clean, no recovery" << dendl;
12136 eio_errors_to_process = false;
12137 queue_peering_event(
12138 PGPeeringEventRef(
12139 std::make_shared<PGPeeringEvent>(
12140 get_osdmap_epoch(),
12141 get_osdmap_epoch(),
12142 PeeringState::AllReplicasRecovered())));
12143 }
12144
12145 publish_stats_to_osd();
12146
12147 if (get_backfill_targets().size()) {
12148 last_backfill_started = earliest_backfill();
12149 new_backfill = true;
12150 ceph_assert(!last_backfill_started.is_max());
12151 dout(5) << __func__ << ": bft=" << get_backfill_targets()
12152 << " from " << last_backfill_started << dendl;
12153 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12154 i != get_backfill_targets().end();
12155 ++i) {
12156 dout(5) << "target shard " << *i
12157 << " from " << recovery_state.get_peer_info(*i).last_backfill
12158 << dendl;
12159 }
12160 }
12161
12162 hit_set_setup();
12163 agent_setup();
12164 }
12165
12166 void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
12167 {
12168 dout(10) << __func__ << dendl;
12169
12170 if (hit_set && hit_set->insert_count() == 0) {
12171 dout(20) << " discarding empty hit_set" << dendl;
12172 hit_set_clear();
12173 }
12174
12175 if (recovery_queued) {
12176 recovery_queued = false;
12177 osd->clear_queued_recovery(this);
12178 }
12179
12180 // requeue everything in the reverse order they should be
12181 // reexamined.
12182 requeue_ops(waiting_for_peered);
12183 requeue_ops(waiting_for_flush);
12184 requeue_ops(waiting_for_active);
12185 requeue_ops(waiting_for_readable);
12186
12187 clear_scrub_reserved();
12188
12189 vector<ceph_tid_t> tids;
12190 cancel_copy_ops(is_primary(), &tids);
12191 cancel_flush_ops(is_primary(), &tids);
12192 cancel_proxy_ops(is_primary(), &tids);
12193 cancel_manifest_ops(is_primary(), &tids);
12194 osd->objecter->op_cancel(tids, -ECANCELED);
12195
12196 // requeue object waiters
12197 for (auto& p : waiting_for_unreadable_object) {
12198 release_backoffs(p.first);
12199 }
12200 if (is_primary()) {
12201 requeue_object_waiters(waiting_for_unreadable_object);
12202 } else {
12203 waiting_for_unreadable_object.clear();
12204 }
12205 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
12206 p != waiting_for_degraded_object.end();
12207 waiting_for_degraded_object.erase(p++)) {
12208 release_backoffs(p->first);
12209 if (is_primary())
12210 requeue_ops(p->second);
12211 else
12212 p->second.clear();
12213 finish_degraded_object(p->first);
12214 }
12215
12216 // requeues waiting_for_scrub
12217 scrub_clear_state();
12218
12219 for (auto p = waiting_for_blocked_object.begin();
12220 p != waiting_for_blocked_object.end();
12221 waiting_for_blocked_object.erase(p++)) {
12222 if (is_primary())
12223 requeue_ops(p->second);
12224 else
12225 p->second.clear();
12226 }
12227 for (auto i = callbacks_for_degraded_object.begin();
12228 i != callbacks_for_degraded_object.end();
12229 ) {
12230 finish_degraded_object((i++)->first);
12231 }
12232 ceph_assert(callbacks_for_degraded_object.empty());
12233
12234 if (is_primary()) {
12235 requeue_ops(waiting_for_cache_not_full);
12236 } else {
12237 waiting_for_cache_not_full.clear();
12238 }
12239 objects_blocked_on_cache_full.clear();
12240
12241 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
12242 in_progress_async_reads.begin();
12243 i != in_progress_async_reads.end();
12244 in_progress_async_reads.erase(i++)) {
12245 close_op_ctx(i->second);
12246 if (is_primary())
12247 requeue_op(i->first);
12248 }
12249
12250 // this will requeue ops we were working on but didn't finish, and
12251 // any dups
12252 apply_and_flush_repops(is_primary());
12253 cancel_log_updates();
12254
12255 // do this *after* apply_and_flush_repops so that we catch any newly
12256 // registered watches.
12257 context_registry_on_change();
12258
12259 pgbackend->on_change_cleanup(&t);
12260 scrubber.cleanup_store(&t);
12261 pgbackend->on_change();
12262
12263 // clear snap_trimmer state
12264 snap_trimmer_machine.process_event(Reset());
12265
12266 debug_op_order.clear();
12267 unstable_stats.clear();
12268
12269 // we don't want to cache object_contexts through the interval change
12270 // NOTE: we actually assert that all currently live references are dead
12271 // by the time the flush for the next interval completes.
12272 object_contexts.clear();
12273
12274 // should have been cleared above by finishing all of the degraded objects
12275 ceph_assert(objects_blocked_on_degraded_snap.empty());
12276 }
12277
12278 void PrimaryLogPG::plpg_on_role_change()
12279 {
12280 dout(10) << __func__ << dendl;
12281 if (get_role() != 0 && hit_set) {
12282 dout(10) << " clearing hit set" << dendl;
12283 hit_set_clear();
12284 }
12285 }
12286
12287 void PrimaryLogPG::plpg_on_pool_change()
12288 {
12289 dout(10) << __func__ << dendl;
12290 // requeue cache full waiters just in case the cache_mode is
12291 // changing away from writeback mode. note that if we are not
12292 // active the normal requeuing machinery is sufficient (and properly
12293 // ordered).
12294 if (is_active() &&
12295 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12296 !waiting_for_cache_not_full.empty()) {
12297 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
12298 << dendl;
12299 requeue_ops(waiting_for_cache_not_full);
12300 objects_blocked_on_cache_full.clear();
12301 }
12302 hit_set_setup();
12303 agent_setup();
12304 }
12305
12306 // clear state. called on recovery completion AND cancellation.
12307 void PrimaryLogPG::_clear_recovery_state()
12308 {
12309 #ifdef DEBUG_RECOVERY_OIDS
12310 recovering_oids.clear();
12311 #endif
12312 last_backfill_started = hobject_t();
12313 set<hobject_t>::iterator i = backfills_in_flight.begin();
12314 while (i != backfills_in_flight.end()) {
12315 ceph_assert(recovering.count(*i));
12316 backfills_in_flight.erase(i++);
12317 }
12318
12319 list<OpRequestRef> blocked_ops;
12320 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
12321 i != recovering.end();
12322 recovering.erase(i++)) {
12323 if (i->second) {
12324 i->second->drop_recovery_read(&blocked_ops);
12325 requeue_ops(blocked_ops);
12326 }
12327 }
12328 ceph_assert(backfills_in_flight.empty());
12329 pending_backfill_updates.clear();
12330 ceph_assert(recovering.empty());
12331 pgbackend->clear_recovery_state();
12332 }
12333
12334 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
12335 {
12336 dout(20) << __func__ << ": " << soid << dendl;
12337 ceph_assert(recovering.count(soid));
12338 ObjectContextRef obc = recovering[soid];
12339 if (obc) {
12340 list<OpRequestRef> blocked_ops;
12341 obc->drop_recovery_read(&blocked_ops);
12342 requeue_ops(blocked_ops);
12343 }
12344 recovering.erase(soid);
12345 finish_recovery_op(soid);
12346 release_backoffs(soid);
12347 if (waiting_for_degraded_object.count(soid)) {
12348 dout(20) << " kicking degraded waiters on " << soid << dendl;
12349 requeue_ops(waiting_for_degraded_object[soid]);
12350 waiting_for_degraded_object.erase(soid);
12351 }
12352 if (waiting_for_unreadable_object.count(soid)) {
12353 dout(20) << " kicking unreadable waiters on " << soid << dendl;
12354 requeue_ops(waiting_for_unreadable_object[soid]);
12355 waiting_for_unreadable_object.erase(soid);
12356 }
12357 if (is_missing_object(soid))
12358 recovery_state.set_last_requested(0);
12359 finish_degraded_object(soid);
12360 }
12361
12362 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
12363 {
12364 pgbackend->check_recovery_sources(osdmap);
12365 }
12366
12367 bool PrimaryLogPG::start_recovery_ops(
12368 uint64_t max,
12369 ThreadPool::TPHandle &handle,
12370 uint64_t *ops_started)
12371 {
12372 uint64_t& started = *ops_started;
12373 started = 0;
12374 bool work_in_progress = false;
12375 bool recovery_started = false;
12376 ceph_assert(is_primary());
12377 ceph_assert(is_peered());
12378 ceph_assert(!recovery_state.is_deleting());
12379
12380 ceph_assert(recovery_queued);
12381 recovery_queued = false;
12382
12383 if (!state_test(PG_STATE_RECOVERING) &&
12384 !state_test(PG_STATE_BACKFILLING)) {
12385 /* TODO: I think this case is broken and will make do_recovery()
12386 * unhappy since we're returning false */
12387 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
12388 return have_unfound();
12389 }
12390
12391 const auto &missing = recovery_state.get_pg_log().get_missing();
12392
12393 uint64_t num_unfound = get_num_unfound();
12394
12395 if (!recovery_state.have_missing()) {
12396 recovery_state.local_recovery_complete();
12397 }
12398
12399 if (!missing.have_missing() || // Primary does not have missing
12400 // or all of the missing objects are unfound.
12401 recovery_state.all_missing_unfound()) {
12402 // Recover the replicas.
12403 started = recover_replicas(max, handle, &recovery_started);
12404 }
12405 if (!started) {
12406 // We still have missing objects that we should grab from replicas.
12407 started += recover_primary(max, handle);
12408 }
12409 if (!started && num_unfound != get_num_unfound()) {
12410 // second chance to recover replicas
12411 started = recover_replicas(max, handle, &recovery_started);
12412 }
12413
12414 if (started || recovery_started)
12415 work_in_progress = true;
12416
12417 bool deferred_backfill = false;
12418 if (recovering.empty() &&
12419 state_test(PG_STATE_BACKFILLING) &&
12420 !get_backfill_targets().empty() && started < max &&
12421 missing.num_missing() == 0 &&
12422 waiting_on_backfill.empty()) {
12423 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
12424 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
12425 deferred_backfill = true;
12426 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
12427 !is_degraded()) {
12428 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
12429 deferred_backfill = true;
12430 } else if (!recovery_state.is_backfill_reserved()) {
12431 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
12432 if (!backfill_reserving) {
12433 dout(10) << "queueing RequestBackfill" << dendl;
12434 backfill_reserving = true;
12435 queue_peering_event(
12436 PGPeeringEventRef(
12437 std::make_shared<PGPeeringEvent>(
12438 get_osdmap_epoch(),
12439 get_osdmap_epoch(),
12440 PeeringState::RequestBackfill())));
12441 }
12442 deferred_backfill = true;
12443 } else {
12444 started += recover_backfill(max - started, handle, &work_in_progress);
12445 }
12446 }
12447
12448 dout(10) << " started " << started << dendl;
12449 osd->logger->inc(l_osd_rop, started);
12450
12451 if (!recovering.empty() ||
12452 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
12453 return !work_in_progress && have_unfound();
12454
12455 ceph_assert(recovering.empty());
12456 ceph_assert(recovery_ops_active == 0);
12457
12458 dout(10) << __func__ << " needs_recovery: "
12459 << recovery_state.get_missing_loc().get_needs_recovery()
12460 << dendl;
12461 dout(10) << __func__ << " missing_loc: "
12462 << recovery_state.get_missing_loc().get_missing_locs()
12463 << dendl;
12464 int unfound = get_num_unfound();
12465 if (unfound) {
12466 dout(10) << " still have " << unfound << " unfound" << dendl;
12467 return true;
12468 }
12469
12470 if (missing.num_missing() > 0) {
12471 // this shouldn't happen!
12472 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
12473 << missing.num_missing() << " missing objects: " << missing.get_items();
12474 return false;
12475 }
12476
12477 if (needs_recovery()) {
12478 // this shouldn't happen!
12479 // We already checked num_missing() so we must have missing replicas
12480 osd->clog->error() << info.pgid
12481 << " Unexpected Error: recovery ending with missing replicas";
12482 return false;
12483 }
12484
12485 if (state_test(PG_STATE_RECOVERING)) {
12486 state_clear(PG_STATE_RECOVERING);
12487 state_clear(PG_STATE_FORCED_RECOVERY);
12488 if (needs_backfill()) {
12489 dout(10) << "recovery done, queuing backfill" << dendl;
12490 queue_peering_event(
12491 PGPeeringEventRef(
12492 std::make_shared<PGPeeringEvent>(
12493 get_osdmap_epoch(),
12494 get_osdmap_epoch(),
12495 PeeringState::RequestBackfill())));
12496 } else {
12497 dout(10) << "recovery done, no backfill" << dendl;
12498 eio_errors_to_process = false;
12499 state_clear(PG_STATE_FORCED_BACKFILL);
12500 queue_peering_event(
12501 PGPeeringEventRef(
12502 std::make_shared<PGPeeringEvent>(
12503 get_osdmap_epoch(),
12504 get_osdmap_epoch(),
12505 PeeringState::AllReplicasRecovered())));
12506 }
12507 } else { // backfilling
12508 state_clear(PG_STATE_BACKFILLING);
12509 state_clear(PG_STATE_FORCED_BACKFILL);
12510 state_clear(PG_STATE_FORCED_RECOVERY);
12511 dout(10) << "recovery done, backfill done" << dendl;
12512 eio_errors_to_process = false;
12513 queue_peering_event(
12514 PGPeeringEventRef(
12515 std::make_shared<PGPeeringEvent>(
12516 get_osdmap_epoch(),
12517 get_osdmap_epoch(),
12518 PeeringState::Backfilled())));
12519 }
12520
12521 return false;
12522 }
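/*
 * Caller-side contract, in brief (sketch; the invocation shape is
 * illustrative, not the actual call site):
 *
 *   uint64_t started = 0;
 *   bool unfound_blocking = pg->start_recovery_ops(max, handle, &started);
 *
 * 'started' counts the ops handed to the PGBackend by this call; a true
 * return does not mean "more work queued" but rather "no further progress
 * is possible right now because unfound objects remain".
 */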
12523
12524 /**
12525 * Start up to max recovery operations for objects missing on the primary.
12526 * Returns the number of operations started.
12527 */
12528 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
12529 {
12530 ceph_assert(is_primary());
12531
12532 const auto &missing = recovery_state.get_pg_log().get_missing();
12533
12534 dout(10) << __func__ << " recovering " << recovering.size()
12535 << " in pg,"
12536 << " missing " << missing << dendl;
12537
12538 dout(25) << __func__ << " " << missing.get_items() << dendl;
12539
12540 // look at log!
12541 pg_log_entry_t *latest = 0;
12542 unsigned started = 0;
12543 int skipped = 0;
12544
12545 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12546 map<version_t, hobject_t>::const_iterator p =
12547 missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
12548 while (p != missing.get_rmissing().end()) {
12549 handle.reset_tp_timeout();
12550 hobject_t soid;
12551 version_t v = p->first;
12552
12553 auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
12554 if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
12555 latest = it_objects->second;
12556 ceph_assert(latest->is_update() || latest->is_delete());
12557 soid = latest->soid;
12558 } else {
12559 latest = 0;
12560 soid = p->second;
12561 }
12562 const pg_missing_item& item = missing.get_items().find(p->second)->second;
12563 ++p;
12564
12565 hobject_t head = soid.get_head();
12566
12567 eversion_t need = item.need;
12568
12569 dout(10) << __func__ << " "
12570 << soid << " " << item.need
12571 << (missing.is_missing(soid) ? " (missing)":"")
12572 << (missing.is_missing(head) ? " (missing head)":"")
12573 << (recovering.count(soid) ? " (recovering)":"")
12574 << (recovering.count(head) ? " (recovering head)":"")
12575 << dendl;
12576
12577 if (latest) {
12578 switch (latest->op) {
12579 case pg_log_entry_t::CLONE:
12580 /*
12581 * Handling for this special case removed for now, until we
12582 * can correctly construct an accurate SnapSet from the old
12583 * one.
12584 */
12585 break;
12586
12587 case pg_log_entry_t::LOST_REVERT:
12588 {
12589 if (item.have == latest->reverting_to) {
12590 ObjectContextRef obc = get_object_context(soid, true);
12591
12592 if (obc->obs.oi.version == latest->version) {
12593 // I'm already reverting
12594 dout(10) << " already reverting " << soid << dendl;
12595 } else {
12596 dout(10) << " reverting " << soid << " to " << latest->reverting_to << dendl;
12597 obc->obs.oi.version = latest->version;
12598
12599 ObjectStore::Transaction t;
12600 bufferlist b2;
12601 obc->obs.oi.encode(
12602 b2,
12603 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12604 ceph_assert(!pool.info.require_rollback());
12605 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
12606
12607 recovery_state.recover_got(
12608 soid,
12609 latest->version,
12610 false,
12611 t);
12612
12613 ++active_pushes;
12614
12615 t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
12616 t.register_on_commit(new C_OSD_CommittedPushedObject(
12617 this,
12618 get_osdmap_epoch(),
12619 info.last_complete));
12620 osd->store->queue_transaction(ch, std::move(t));
12621 continue;
12622 }
12623 } else {
12624 /*
12625 * Pull the old version of the object. Update missing_loc here to have the location
12626 * of the version we want.
12627 *
12628 * This doesn't use the usual missing_loc paths, but that's okay:
12629 * - if we have it locally, we hit the case above, and go from there.
12630 * - if we don't, we always pass through this case during recovery and set up the location
12631 * properly.
12632 * - this way we don't need to mangle the missing code to be general about needing an old
12633 * version...
12634 */
12635 eversion_t alternate_need = latest->reverting_to;
12636 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
12637
12638 set<pg_shard_t> good_peers;
12639 for (auto p = recovery_state.get_peer_missing().begin();
12640 p != recovery_state.get_peer_missing().end();
12641 ++p) {
12642 if (p->second.is_missing(soid, need) &&
12643 p->second.get_items().at(soid).have == alternate_need) {
12644 good_peers.insert(p->first);
12645 }
12646 }
12647 recovery_state.set_revert_with_targets(
12648 soid,
12649 good_peers);
12650 dout(10) << " will pull " << alternate_need << " or " << need
12651 << " from one of "
12652 << recovery_state.get_missing_loc().get_locations(soid)
12653 << dendl;
12654 }
12655 }
12656 break;
12657 }
12658 }
12659
12660 if (!recovering.count(soid)) {
12661 if (recovering.count(head)) {
12662 ++skipped;
12663 } else {
12664 int r = recover_missing(
12665 soid, need, get_recovery_op_priority(), h);
12666 switch (r) {
12667 case PULL_YES:
12668 ++started;
12669 break;
12670 case PULL_HEAD:
12671 ++started; // fall through: head pull started; the clone itself counts as skipped
12672 case PULL_NONE:
12673 ++skipped;
12674 break;
12675 default:
12676 ceph_abort();
12677 }
12678 if (started >= max)
12679 break;
12680 }
12681 }
12682
12683 // only advance last_requested if we haven't skipped anything
12684 if (!skipped)
12685 recovery_state.set_last_requested(v);
12686 }
12687
12688 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12689 return started;
12690 }
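/*
 * A note on the cursor above: get_rmissing() is keyed by version, so
 * lower_bound(last_requested) resumes the scan exactly where the
 * previous call left off.  last_requested only advances while nothing
 * has been skipped; once anything is skipped (e.g. a clone whose head
 * is still being recovered), the cursor stays put so skipped entries
 * are revisited on the next pass.
 */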
12691
12692 bool PrimaryLogPG::primary_error(
12693 const hobject_t& soid, eversion_t v)
12694 {
12695 recovery_state.force_object_missing(pg_whoami, soid, v);
12696 bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
12697 if (uhoh)
12698 osd->clog->error() << info.pgid << " missing primary copy of "
12699 << soid << ", unfound";
12700 else
12701 osd->clog->error() << info.pgid << " missing primary copy of "
12702 << soid
12703 << ", will try copies on "
12704 << recovery_state.get_missing_loc().get_locations(soid);
12705 return uhoh;
12706 }
12707
12708 int PrimaryLogPG::prep_object_replica_deletes(
12709 const hobject_t& soid, eversion_t v,
12710 PGBackend::RecoveryHandle *h,
12711 bool *work_started)
12712 {
12713 ceph_assert(is_primary());
12714 dout(10) << __func__ << ": on " << soid << dendl;
12715
12716 ObjectContextRef obc = get_object_context(soid, false);
12717 if (obc) {
12718 if (!obc->get_recovery_read()) {
12719 dout(20) << "replica delete delayed on " << soid
12720 << "; could not get rw_manager lock" << dendl;
12721 *work_started = true;
12722 return 0;
12723 } else {
12724 dout(20) << "replica delete got recovery read lock on " << soid
12725 << dendl;
12726 }
12727 }
12728
12729 start_recovery_op(soid);
12730 ceph_assert(!recovering.count(soid));
12731 if (!obc)
12732 recovering.insert(make_pair(soid, ObjectContextRef()));
12733 else
12734 recovering.insert(make_pair(soid, obc));
12735
12736 pgbackend->recover_delete_object(soid, v, h);
12737 return 1;
12738 }
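/*
 * The null-obc path above is expected: the object has already been
 * deleted on the primary, so there may be no local context to pin;
 * the replica still holds a stale copy, and recover_delete_object()
 * only needs the oid and version to remove it remotely.
 */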
12739
12740 int PrimaryLogPG::prep_object_replica_pushes(
12741 const hobject_t& soid, eversion_t v,
12742 PGBackend::RecoveryHandle *h,
12743 bool *work_started)
12744 {
12745 ceph_assert(is_primary());
12746 dout(10) << __func__ << ": on " << soid << dendl;
12747
12748 if (soid.snap && soid.snap < CEPH_NOSNAP) {
12749 // do we have the head and/or snapdir?
12750 hobject_t head = soid.get_head();
12751 if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
12752 if (recovering.count(head)) {
12753 dout(10) << " missing but already recovering head " << head << dendl;
12754 return 0;
12755 } else {
12756 int r = recover_missing(
12757 head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
12758 get_recovery_op_priority(), h);
12759 if (r != PULL_NONE)
12760 return 1;
12761 return 0;
12762 }
12763 }
12764 }
12765
12766 // NOTE: we know we will get a valid oloc off of disk here.
12767 ObjectContextRef obc = get_object_context(soid, false);
12768 if (!obc) {
12769 primary_error(soid, v);
12770 return 0;
12771 }
12772
12773 if (!obc->get_recovery_read()) {
12774 dout(20) << "recovery delayed on " << soid
12775 << "; could not get rw_manager lock" << dendl;
12776 *work_started = true;
12777 return 0;
12778 } else {
12779 dout(20) << "recovery got recovery read lock on " << soid
12780 << dendl;
12781 }
12782
12783 start_recovery_op(soid);
12784 ceph_assert(!recovering.count(soid));
12785 recovering.insert(make_pair(soid, obc));
12786
12787 int r = pgbackend->recover_object(
12788 soid,
12789 v,
12790 ObjectContextRef(),
12791 obc, // has snapset context
12792 h);
12793 if (r < 0) {
12794 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
12795 on_failed_pull({ pg_whoami }, soid, v);
12796 return 0;
12797 }
12798 return 1;
12799 }
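/*
 * Ordering note: for a clone (soid.snap < CEPH_NOSNAP) the head is
 * pulled first, since the authoritative SnapSet lives on the head
 * object; in that case the head pull is what gets reported (return 1)
 * and the clone push is retried on a later pass, once the head is
 * complete.
 */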
12800
12801 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
12802 bool *work_started)
12803 {
12804 dout(10) << __func__ << "(" << max << ")" << dendl;
12805 uint64_t started = 0;
12806
12807 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12808
12809 // this is FAR from an optimal recovery order. pretty lame, really.
12810 ceph_assert(!get_acting_recovery_backfill().empty());
12811 // choose replicas to recover, replica has the shortest missing list first
12812 // so we can bring it back to normal ASAP
12813 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
12814 async_by_num_missing;
12815 replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
12816 for (auto &p: get_acting_recovery_backfill()) {
12817 if (p == get_primary()) {
12818 continue;
12819 }
12820 auto pm = recovery_state.get_peer_missing().find(p);
12821 ceph_assert(pm != recovery_state.get_peer_missing().end());
12822 auto nm = pm->second.num_missing();
12823 if (nm != 0) {
12824 if (is_async_recovery_target(p)) {
12825 async_by_num_missing.push_back(make_pair(nm, p));
12826 } else {
12827 replicas_by_num_missing.push_back(make_pair(nm, p));
12828 }
12829 }
12830 }
12831 // sort by number of missing objects, in ascending order.
12832 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
12833 const std::pair<unsigned int, pg_shard_t> &rhs) {
12834 return lhs.first < rhs.first;
12835 };
12836 // acting goes first
12837 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
12838 // then async_recovery_targets
12839 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
12840 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
12841 async_by_num_missing.begin(), async_by_num_missing.end());
12842 for (auto &replica: replicas_by_num_missing) {
12843 pg_shard_t &peer = replica.second;
12844 ceph_assert(peer != get_primary());
12845 auto pm = recovery_state.get_peer_missing().find(peer);
12846 ceph_assert(pm != recovery_state.get_peer_missing().end());
12847 size_t m_sz = pm->second.num_missing();
12848
12849 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
12850 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
12851
12852 // oldest first!
12853 const pg_missing_t &m(pm->second);
12854 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
12855 p != m.get_rmissing().end() && started < max;
12856 ++p) {
12857 handle.reset_tp_timeout();
12858 const hobject_t soid(p->second);
12859
12860 if (recovery_state.get_missing_loc().is_unfound(soid)) {
12861 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
12862 continue;
12863 }
12864
12865 const pg_info_t &pi = recovery_state.get_peer_info(peer);
12866 if (soid > pi.last_backfill) {
12867 if (!recovering.count(soid)) {
12868 derr << __func__ << ": object " << soid << " last_backfill "
12869 << pi.last_backfill << dendl;
12870 derr << __func__ << ": object added to missing set for backfill, but "
12871 << "is not in recovering, error!" << dendl;
12872 ceph_abort();
12873 }
12874 continue;
12875 }
12876
12877 if (recovering.count(soid)) {
12878 dout(10) << __func__ << ": already recovering " << soid << dendl;
12879 continue;
12880 }
12881
12882 if (recovery_state.get_missing_loc().is_deleted(soid)) {
12883 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
12884 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12885 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
12886 continue;
12887 }
12888
12889 if (soid.is_snap() &&
12890 recovery_state.get_pg_log().get_missing().is_missing(
12891 soid.get_head())) {
12892 dout(10) << __func__ << ": " << soid.get_head()
12893 << " still missing on primary" << dendl;
12894 continue;
12895 }
12896
12897 if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
12898 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
12899 continue;
12900 }
12901
12902 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
12903 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12904 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
12905 }
12906 }
12907
12908 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12909 return started;
12910 }
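/*
 * Ordering example (counts hypothetical): with acting replicas missing
 * {osd.3: 7, osd.5: 2} and async recovery target {osd.9: 4}, shards are
 * visited as osd.5, osd.3, osd.9 -- acting shards are always drained
 * before async_recovery_targets, and within each group the shard with
 * the fewest missing objects goes first so it returns to normal ASAP.
 */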
12911
12912 hobject_t PrimaryLogPG::earliest_peer_backfill() const
12913 {
12914 hobject_t e = hobject_t::get_max();
12915 for (const pg_shard_t& peer : get_backfill_targets()) {
12916 const auto iter = peer_backfill_info.find(peer);
12917 ceph_assert(iter != peer_backfill_info.end());
12918 e = std::min(e, iter->second.begin);
12919 }
12920 return e;
12921 }
12922
12923 bool PrimaryLogPG::all_peer_done() const
12924 {
12925 // Primary hasn't got any more objects
12926 ceph_assert(backfill_info.empty());
12927
12928 for (const pg_shard_t& bt : get_backfill_targets()) {
12929 const auto piter = peer_backfill_info.find(bt);
12930 ceph_assert(piter != peer_backfill_info.end());
12931 const BackfillInterval& pbi = piter->second;
12932 // See if peer has more to process
12933 if (!pbi.extends_to_end() || !pbi.empty())
12934 return false;
12935 }
12936 return true;
12937 }
12938
12939 /**
12940 * recover_backfill
12941 *
12942 * Invariants:
12943 *
12944 * backfilled: fully pushed to replica or present in replica's missing set (both
12945 * our copy and theirs).
12946 *
12947 * All objects on a backfill_target in
12948 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12949 * objects have been actually deleted and all logically-valid objects are replicated.
12950 * There may be PG objects in this interval yet to be backfilled.
12951 *
12952 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12953 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12954 *
12955 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
12956 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12957 * interval remain on the backfill target.
12958 *
12959 * For a backfill target, all objects <= peer_info[target].last_backfill
12960 * have been backfilled to target
12961 *
12962 * There *MAY* be missing/outdated objects between last_backfill_started and
12963 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12964 * io created objects since the last scan. For this reason, we call
12965 * update_range() again before continuing backfill.
12966 */
12967 uint64_t PrimaryLogPG::recover_backfill(
12968 uint64_t max,
12969 ThreadPool::TPHandle &handle, bool *work_started)
12970 {
12971 dout(10) << __func__ << " (" << max << ")"
12972 << " bft=" << get_backfill_targets()
12973 << " last_backfill_started " << last_backfill_started
12974 << (new_backfill ? " new_backfill":"")
12975 << dendl;
12976 ceph_assert(!get_backfill_targets().empty());
12977
12978 // Initialize from prior backfill state
12979 if (new_backfill) {
12980 // on_activate() was called prior to getting here
12981 ceph_assert(last_backfill_started == earliest_backfill());
12982 new_backfill = false;
12983
12984 // initialize BackfillIntervals
12985 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12986 i != get_backfill_targets().end();
12987 ++i) {
12988 peer_backfill_info[*i].reset(
12989 recovery_state.get_peer_info(*i).last_backfill);
12990 }
12991 backfill_info.reset(last_backfill_started);
12992
12993 backfills_in_flight.clear();
12994 pending_backfill_updates.clear();
12995 }
12996
12997 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12998 i != get_backfill_targets().end();
12999 ++i) {
13000 dout(10) << "peer osd." << *i
13001 << " info " << recovery_state.get_peer_info(*i)
13002 << " interval " << peer_backfill_info[*i].begin
13003 << "-" << peer_backfill_info[*i].end
13004 << " " << peer_backfill_info[*i].objects.size() << " objects"
13005 << dendl;
13006 }
13007
13008 // update our local interval to cope with recent changes
13009 backfill_info.begin = last_backfill_started;
13010 update_range(&backfill_info, handle);
13011
13012 unsigned ops = 0;
13013 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13014 set<hobject_t> add_to_stat;
13015
13016 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13017 i != get_backfill_targets().end();
13018 ++i) {
13019 peer_backfill_info[*i].trim_to(
13020 std::max(
13021 recovery_state.get_peer_info(*i).last_backfill,
13022 last_backfill_started));
13023 }
13024 backfill_info.trim_to(last_backfill_started);
13025
13026 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13027 while (ops < max) {
13028 if (backfill_info.begin <= earliest_peer_backfill() &&
13029 !backfill_info.extends_to_end() && backfill_info.empty()) {
13030 hobject_t next = backfill_info.end;
13031 backfill_info.reset(next);
13032 backfill_info.end = hobject_t::get_max();
13033 update_range(&backfill_info, handle);
13034 backfill_info.trim();
13035 }
13036
13037 dout(20) << " my backfill interval " << backfill_info << dendl;
13038
13039 bool sent_scan = false;
13040 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13041 i != get_backfill_targets().end();
13042 ++i) {
13043 pg_shard_t bt = *i;
13044 BackfillInterval& pbi = peer_backfill_info[bt];
13045
13046 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13047 if (pbi.begin <= backfill_info.begin &&
13048 !pbi.extends_to_end() && pbi.empty()) {
13049 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
13050 epoch_t e = get_osdmap_epoch();
13051 MOSDPGScan *m = new MOSDPGScan(
13052 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
13053 spg_t(info.pgid.pgid, bt.shard),
13054 pbi.end, hobject_t());
13055 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13056 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
13057 waiting_on_backfill.insert(bt);
13058 sent_scan = true;
13059 }
13060 }
13061
13062 // Count simultaneous scans as a single op and let those complete
13063 if (sent_scan) {
13064 ops++;
13065 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13066 break;
13067 }
13068
13069 if (backfill_info.empty() && all_peer_done()) {
13070 dout(10) << " reached end for both local and all peers" << dendl;
13071 break;
13072 }
13073
13074 // Get object within set of peers to operate on and
13075 // the set of targets for which that object applies.
13076 hobject_t check = earliest_peer_backfill();
13077
13078 if (check < backfill_info.begin) {
13079
13080 set<pg_shard_t> check_targets;
13081 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13082 i != get_backfill_targets().end();
13083 ++i) {
13084 pg_shard_t bt = *i;
13085 BackfillInterval& pbi = peer_backfill_info[bt];
13086 if (pbi.begin == check)
13087 check_targets.insert(bt);
13088 }
13089 ceph_assert(!check_targets.empty());
13090
13091 dout(20) << " BACKFILL removing " << check
13092 << " from peers " << check_targets << dendl;
13093 for (set<pg_shard_t>::iterator i = check_targets.begin();
13094 i != check_targets.end();
13095 ++i) {
13096 pg_shard_t bt = *i;
13097 BackfillInterval& pbi = peer_backfill_info[bt];
13098 ceph_assert(pbi.begin == check);
13099
13100 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13101 pbi.pop_front();
13102 }
13103
13104 last_backfill_started = check;
13105
13106 // Don't increment ops here because deletions
13107 // are cheap and, unlike real recovery_ops, not replied to,
13108 // and we can't increment ops without requeueing ourselves
13109 // for recovery.
13110 } else {
13111 eversion_t& obj_v = backfill_info.objects.begin()->second;
13112
13113 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13114 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13115 i != get_backfill_targets().end();
13116 ++i) {
13117 pg_shard_t bt = *i;
13118 BackfillInterval& pbi = peer_backfill_info[bt];
13119 // Find all check peers that have the wrong version
13120 if (check == backfill_info.begin && check == pbi.begin) {
13121 if (pbi.objects.begin()->second != obj_v) {
13122 need_ver_targs.push_back(bt);
13123 } else {
13124 keep_ver_targs.push_back(bt);
13125 }
13126 } else {
13127 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13128
13129 // Only include peers whose backfill line we've caught up to;
13130 // otherwise they only appear to be missing this object
13131 // because their pbi.begin > backfill_info.begin.
13132 if (backfill_info.begin > pinfo.last_backfill)
13133 missing_targs.push_back(bt);
13134 else
13135 skip_targs.push_back(bt);
13136 }
13137 }
13138
13139 if (!keep_ver_targs.empty()) {
13140 // These peers have version obj_v
13141 dout(20) << " BACKFILL keeping " << check
13142 << " with ver " << obj_v
13143 << " on peers " << keep_ver_targs << dendl;
13144 //assert(!waiting_for_degraded_object.count(check));
13145 }
13146 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13147 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
13148 ceph_assert(obc);
13149 if (obc->get_recovery_read()) {
13150 if (!need_ver_targs.empty()) {
13151 dout(20) << " BACKFILL replacing " << check
13152 << " with ver " << obj_v
13153 << " to peers " << need_ver_targs << dendl;
13154 }
13155 if (!missing_targs.empty()) {
13156 dout(20) << " BACKFILL pushing " << backfill_info.begin
13157 << " with ver " << obj_v
13158 << " to peers " << missing_targs << dendl;
13159 }
13160 vector<pg_shard_t> all_push = need_ver_targs;
13161 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13162
13163 handle.reset_tp_timeout();
13164 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13165 if (r < 0) {
13166 *work_started = true;
13167 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13168 break;
13169 }
13170 ops++;
13171 } else {
13172 *work_started = true;
13173 dout(20) << "backfill blocking on " << backfill_info.begin
13174 << "; could not get rw_manager lock" << dendl;
13175 break;
13176 }
13177 }
13178 dout(20) << "need_ver_targs=" << need_ver_targs
13179 << " keep_ver_targs=" << keep_ver_targs << dendl;
13180 dout(20) << "backfill_targets=" << get_backfill_targets()
13181 << " missing_targs=" << missing_targs
13182 << " skip_targs=" << skip_targs << dendl;
13183
13184 last_backfill_started = backfill_info.begin;
13185 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13186 backfill_info.pop_front();
13187 vector<pg_shard_t> check_targets = need_ver_targs;
13188 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13189 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13190 i != check_targets.end();
13191 ++i) {
13192 pg_shard_t bt = *i;
13193 BackfillInterval& pbi = peer_backfill_info[bt];
13194 pbi.pop_front();
13195 }
13196 }
13197 }
13198
13199 hobject_t backfill_pos =
13200 std::min(backfill_info.begin, earliest_peer_backfill());
13201
13202 for (set<hobject_t>::iterator i = add_to_stat.begin();
13203 i != add_to_stat.end();
13204 ++i) {
13205 ObjectContextRef obc = get_object_context(*i, false);
13206 ceph_assert(obc);
13207 pg_stat_t stat;
13208 add_object_context_to_pg_stat(obc, &stat);
13209 pending_backfill_updates[*i] = stat;
13210 }
13211 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
13212 for (unsigned i = 0; i < to_remove.size(); ++i) {
13213 handle.reset_tp_timeout();
13214 const hobject_t& oid = to_remove[i].get<0>();
13215 eversion_t v = to_remove[i].get<1>();
13216 pg_shard_t peer = to_remove[i].get<2>();
13217 MOSDPGBackfillRemove *m;
13218 auto it = reqs.find(peer);
13219 if (it != reqs.end()) {
13220 m = it->second;
13221 } else {
13222 m = reqs[peer] = new MOSDPGBackfillRemove(
13223 spg_t(info.pgid.pgid, peer.shard),
13224 get_osdmap_epoch());
13225 }
13226 m->ls.push_back(make_pair(oid, v));
13227
13228 if (oid <= last_backfill_started)
13229 pending_backfill_updates[oid]; // add empty stat!
13230 }
13231 for (auto p : reqs) {
13232 osd->send_message_osd_cluster(p.first.osd, p.second,
13233 get_osdmap_epoch());
13234 }
13235
13236 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13237
13238 dout(5) << "backfill_pos is " << backfill_pos << dendl;
13239 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
13240 i != backfills_in_flight.end();
13241 ++i) {
13242 dout(20) << *i << " is still in flight" << dendl;
13243 }
13244
13245 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
13246 backfill_pos : *(backfills_in_flight.begin());
13247 hobject_t new_last_backfill = earliest_backfill();
13248 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
13249 for (map<hobject_t, pg_stat_t>::iterator i =
13250 pending_backfill_updates.begin();
13251 i != pending_backfill_updates.end() &&
13252 i->first < next_backfill_to_complete;
13253 pending_backfill_updates.erase(i++)) {
13254 dout(20) << " pending_backfill_update " << i->first << dendl;
13255 ceph_assert(i->first > new_last_backfill);
13256 recovery_state.update_complete_backfill_object_stats(
13257 i->first,
13258 i->second);
13259 new_last_backfill = i->first;
13260 }
13261 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
13262
13263 ceph_assert(!pending_backfill_updates.empty() ||
13264 new_last_backfill == last_backfill_started);
13265 if (pending_backfill_updates.empty() &&
13266 backfill_pos.is_max()) {
13267 ceph_assert(backfills_in_flight.empty());
13268 new_last_backfill = backfill_pos;
13269 last_backfill_started = backfill_pos;
13270 }
13271 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
13272
13273 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13274 // all the backfill targets. Otherwise, we will move last_backfill up on
13275 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
13276 for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13277 i != get_backfill_targets().end();
13278 ++i) {
13279 pg_shard_t bt = *i;
13280 const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13281
13282 if (new_last_backfill > pinfo.last_backfill) {
13283 recovery_state.update_peer_last_backfill(bt, new_last_backfill);
13284 epoch_t e = get_osdmap_epoch();
13285 MOSDPGBackfill *m = NULL;
13286 if (pinfo.last_backfill.is_max()) {
13287 m = new MOSDPGBackfill(
13288 MOSDPGBackfill::OP_BACKFILL_FINISH,
13289 e,
13290 get_last_peering_reset(),
13291 spg_t(info.pgid.pgid, bt.shard));
13292 // Use default priority here, must match sub_op priority
13293 start_recovery_op(hobject_t::get_max());
13294 } else {
13295 m = new MOSDPGBackfill(
13296 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
13297 e,
13298 get_last_peering_reset(),
13299 spg_t(info.pgid.pgid, bt.shard));
13300 // Use default priority here, must match sub_op priority
13301 }
13302 m->last_backfill = pinfo.last_backfill;
13303 m->stats = pinfo.stats;
13304 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13305 dout(10) << " peer " << bt
13306 << " num_objects now " << pinfo.stats.stats.sum.num_objects
13307 << " / " << info.stats.stats.sum.num_objects << dendl;
13308 }
13309 }
13310
13311 if (ops)
13312 *work_started = true;
13313 return ops;
13314 }
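/*
 * The loop above is effectively a merge over sorted object streams: the
 * primary's backfill_info plus one peer_backfill_info per target, all
 * ordered by hobject_t.  A compressed sketch of the dispatch rule:
 *
 *   hobject_t check = earliest_peer_backfill();
 *   if (check < backfill_info.begin)
 *     // object exists only on some targets: queue a remove on every
 *     // target whose interval begins at 'check'
 *   else
 *     // object exists locally: push it to targets that lack it or hold
 *     // a stale version; targets already at obj_v just pop and move on
 */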
13315
13316 int PrimaryLogPG::prep_backfill_object_push(
13317 hobject_t oid, eversion_t v,
13318 ObjectContextRef obc,
13319 vector<pg_shard_t> peers,
13320 PGBackend::RecoveryHandle *h)
13321 {
13322 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
13323 ceph_assert(!peers.empty());
13324
13325 backfills_in_flight.insert(oid);
13326 recovery_state.prepare_backfill_for_missing(oid, v, peers);
13327
13328 ceph_assert(!recovering.count(oid));
13329
13330 start_recovery_op(oid);
13331 recovering.insert(make_pair(oid, obc));
13332
13333 int r = pgbackend->recover_object(
13334 oid,
13335 v,
13336 ObjectContextRef(),
13337 obc,
13338 h);
13339 if (r < 0) {
13340 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
13341 on_failed_pull({ pg_whoami }, oid, v);
13342 }
13343 return r;
13344 }
13345
13346 void PrimaryLogPG::update_range(
13347 BackfillInterval *bi,
13348 ThreadPool::TPHandle &handle)
13349 {
13350 int local_min = cct->_conf->osd_backfill_scan_min;
13351 int local_max = cct->_conf->osd_backfill_scan_max;
13352
13353 if (bi->version < info.log_tail) {
13354 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
13355 << dendl;
13356 bi->version = info.last_update;
13357 scan_range(local_min, local_max, bi, handle);
13358 }
13359
13360 if (bi->version >= projected_last_update) {
13361 dout(10) << __func__<< ": bi is current " << dendl;
13362 ceph_assert(bi->version == projected_last_update);
13363 } else if (bi->version >= info.log_tail) {
13364 if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
13365 /* Because we don't move log_tail on split, the log might be
13366 * empty even if log_tail != last_update. However, the only
13367 * way to get here with an empty log is if log_tail is actually
13368 * eversion_t(), because otherwise the entry which changed
13369 * last_update since the last scan would have to be present.
13370 */
13371 ceph_assert(bi->version == eversion_t());
13372 return;
13373 }
13374
13375 dout(10) << __func__<< ": bi is old, (" << bi->version
13376 << ") can be updated with log to projected_last_update "
13377 << projected_last_update << dendl;
13378
13379 auto func = [&](const pg_log_entry_t &e) {
13380 dout(10) << __func__ << ": updating from version " << e.version
13381 << dendl;
13382 const hobject_t &soid = e.soid;
13383 if (soid >= bi->begin &&
13384 soid < bi->end) {
13385 if (e.is_update()) {
13386 dout(10) << __func__ << ": " << e.soid << " updated to version "
13387 << e.version << dendl;
13388 bi->objects.erase(e.soid);
13389 bi->objects.insert(
13390 make_pair(
13391 e.soid,
13392 e.version));
13393 } else if (e.is_delete()) {
13394 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
13395 bi->objects.erase(e.soid);
13396 }
13397 }
13398 };
13399 dout(10) << "scanning pg log first" << dendl;
13400 recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
13401 dout(10) << "scanning projected log" << dendl;
13402 projected_log.scan_log_after(bi->version, func);
13403 bi->version = projected_last_update;
13404 } else {
13405 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
13406 }
13407 }
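// Replay order matters here: the persisted pg log is scanned before
// projected_log, whose entries are newer or still in flight; applying
// them last lets an uncommitted update or delete override whatever the
// older log recorded for the same object.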
13408
13409 void PrimaryLogPG::scan_range(
13410 int min, int max, BackfillInterval *bi,
13411 ThreadPool::TPHandle &handle)
13412 {
13413 ceph_assert(is_locked());
13414 dout(10) << "scan_range from " << bi->begin << dendl;
13415 bi->clear_objects();
13416
13417 vector<hobject_t> ls;
13418 ls.reserve(max);
13419 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
13420 ceph_assert(r >= 0);
13421 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
13422 dout(20) << ls << dendl;
13423
13424 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
13425 handle.reset_tp_timeout();
13426 ObjectContextRef obc;
13427 if (is_primary())
13428 obc = object_contexts.lookup(*p);
13429 if (obc) {
13430 if (!obc->obs.exists) {
13431 /* If the object does not exist here, it must have been removed
13432 * between the collection_list_partial and here. This can happen
13433 * for the first item in the range, which is usually last_backfill.
13434 */
13435 continue;
13436 }
13437 bi->objects[*p] = obc->obs.oi.version;
13438 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
13439 } else {
13440 bufferlist bl;
13441 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
13442 /* If the object does not exist here, it must have been removed
13443 * between the collection_list_partial and here. This can happen
13444 * for the first item in the range, which is usually last_backfill.
13445 */
13446 if (r == -ENOENT)
13447 continue;
13448
13449 ceph_assert(r >= 0);
13450 object_info_t oi(bl);
13451 bi->objects[*p] = oi.version;
13452 dout(20) << " " << *p << " " << oi.version << dendl;
13453 }
13454 }
13455 }
13456
13457
13458 /** check_local
13459 *
13460 * verifies that stray objects have been deleted
13461 */
13462 void PrimaryLogPG::check_local()
13463 {
13464 dout(10) << __func__ << dendl;
13465
13466 ceph_assert(
13467 info.last_update >=
13468 recovery_state.get_pg_log().get_tail()); // otherwise we need some help!
13469
13470 if (!cct->_conf->osd_debug_verify_stray_on_activate)
13471 return;
13472
13473 // just scan the log.
13474 set<hobject_t> did;
13475 for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
13476 p != recovery_state.get_pg_log().get_log().log.rend();
13477 ++p) {
13478 if (did.count(p->soid))
13479 continue;
13480 did.insert(p->soid);
13481
13482 if (p->is_delete() && !is_missing_object(p->soid)) {
13483 dout(10) << " checking " << p->soid
13484 << " at " << p->version << dendl;
13485 struct stat st;
13486 int r = osd->store->stat(
13487 ch,
13488 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
13489 &st);
13490 if (r != -ENOENT) {
13491 derr << __func__ << " " << p->soid << " exists, but should have been "
13492 << "deleted" << dendl;
13493 ceph_abort_msg("erroneously present object");
13494 }
13495 } else {
13496 // ignore old(+missing) objects
13497 }
13498 }
13499 }
13500
13501
13502
13503 // ===========================
13504 // hit sets
13505
13506 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
13507 {
13508 ostringstream ss;
13509 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
13510 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13511 info.pgid.ps(), info.pgid.pool(),
13512 cct->_conf->osd_hit_set_namespace);
13513 dout(20) << __func__ << " " << hoid << dendl;
13514 return hoid;
13515 }
13516
13517 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
13518 utime_t end,
13519 bool using_gmt)
13520 {
13521 ostringstream ss;
13522 ss << "hit_set_" << info.pgid.pgid << "_archive_";
13523 if (using_gmt) {
13524 start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
13525 end.gmtime(ss, true /* legacy pre-octopus form */);
13526 } else {
13527 start.localtime(ss, true /* legacy pre-octopus form */) << "_";
13528 end.localtime(ss, true /* legacy pre-octopus form */);
13529 }
13530 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13531 info.pgid.ps(), info.pgid.pool(),
13532 cct->_conf->osd_hit_set_namespace);
13533 dout(20) << __func__ << " " << hoid << dendl;
13534 return hoid;
13535 }
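// The generated name has the shape
//   hit_set_<pgid>_archive_<start>_<end>
// with the stamps rendered in the legacy pre-octopus form; that
// rendering must stay byte-stable across releases, since the same
// history entry has to map to the same archive object name on every OSD.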
13536
13537 void PrimaryLogPG::hit_set_clear()
13538 {
13539 dout(20) << __func__ << dendl;
13540 hit_set.reset();
13541 hit_set_start_stamp = utime_t();
13542 }
13543
13544 void PrimaryLogPG::hit_set_setup()
13545 {
13546 if (!is_active() ||
13547 !is_primary()) {
13548 hit_set_clear();
13549 return;
13550 }
13551
13552 if (is_active() && is_primary() &&
13553 (!pool.info.hit_set_count ||
13554 !pool.info.hit_set_period ||
13555 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
13556 hit_set_clear();
13557
13558 // only primary is allowed to remove all the hit set objects
13559 hit_set_remove_all();
13560 return;
13561 }
13562
13563 // FIXME: discard any previous data for now
13564 hit_set_create();
13565
13566 // include any writes we know about from the pg log. this doesn't
13567 // capture reads, but it is better than nothing!
13568 hit_set_apply_log();
13569 }
13570
13571 void PrimaryLogPG::hit_set_remove_all()
13572 {
13573 // If any archives are degraded we skip this
13574 for (auto p = info.hit_set.history.begin();
13575 p != info.hit_set.history.end();
13576 ++p) {
13577 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13578
13579 // Once we hit a degraded object just skip
13580 if (is_degraded_or_backfilling_object(aoid))
13581 return;
13582 if (write_blocked_by_scrub(aoid))
13583 return;
13584 }
13585
13586 if (!info.hit_set.history.empty()) {
13587 auto p = info.hit_set.history.rbegin();
13588 ceph_assert(p != info.hit_set.history.rend());
13589 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13590 ceph_assert(!is_degraded_or_backfilling_object(oid));
13591 ObjectContextRef obc = get_object_context(oid, false);
13592 ceph_assert(obc);
13593
13594 OpContextUPtr ctx = simple_opc_create(obc);
13595 ctx->at_version = get_next_version();
13596 ctx->updated_hset_history = info.hit_set;
13597 utime_t now = ceph_clock_now();
13598 ctx->mtime = now;
13599 hit_set_trim(ctx, 0);
13600 simple_opc_submit(std::move(ctx));
13601 }
13602
13603 recovery_state.update_hset(pg_hit_set_history_t());
13604 if (agent_state) {
13605 agent_state->discard_hit_sets();
13606 }
13607 }
13608
13609 void PrimaryLogPG::hit_set_create()
13610 {
13611 utime_t now = ceph_clock_now();
13612 // make a copy of the params to modify
13613 HitSet::Params params(pool.info.hit_set_params);
13614
13615 dout(20) << __func__ << " " << params << dendl;
13616 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
13617 BloomHitSet::Params *p =
13618 static_cast<BloomHitSet::Params*>(params.impl.get());
13619
13620 // convert false positive rate so it holds up across the full period
13621 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
13622 if (p->get_fpp() <= 0.0)
13623 p->set_fpp(.01); // fpp cannot be zero!
13624
13625 // if we don't have specified size, estimate target size based on the
13626 // previous bin!
13627 if (p->target_size == 0 && hit_set) {
13628 utime_t dur = now - hit_set_start_stamp;
13629 unsigned unique = hit_set->approx_unique_insert_count();
13630 dout(20) << __func__ << " previous set had approx " << unique
13631 << " unique items over " << dur << " seconds" << dendl;
13632 p->target_size = (double)unique * (double)pool.info.hit_set_period
13633 / (double)dur;
13634 }
13635 if (p->target_size <
13636 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
13637 p->target_size = cct->_conf->osd_hit_set_min_size;
13638
13639 if (p->target_size
13640 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
13641 p->target_size = cct->_conf->osd_hit_set_max_size;
13642
13643 p->seed = now.sec();
13644
13645 dout(10) << __func__ << " target_size " << p->target_size
13646 << " fpp " << p->get_fpp() << dendl;
13647 }
13648 hit_set.reset(new HitSet(params));
13649 hit_set_start_stamp = now;
13650 }
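// fpp scaling example (numbers hypothetical): with hit_set_count = 4
// and a configured false-positive rate of 0.04, each per-period bloom
// filter is created with fpp = 0.01, so by the union bound the chance
// of a false positive across the whole retained history stays near the
// configured 4%.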
13651
13652 /**
13653 * apply log entries to set
13654 *
13655 * this would only happen after peering, to at least capture writes
13656 * during an interval that was potentially lost.
13657 */
13658 bool PrimaryLogPG::hit_set_apply_log()
13659 {
13660 if (!hit_set)
13661 return false;
13662
13663 eversion_t to = info.last_update;
13664 eversion_t from = info.hit_set.current_last_update;
13665 if (to <= from) {
13666 dout(20) << __func__ << " no update" << dendl;
13667 return false;
13668 }
13669
13670 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
13671 list<pg_log_entry_t>::const_reverse_iterator p =
13672 recovery_state.get_pg_log().get_log().log.rbegin();
13673 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
13674 ++p;
13675 while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
13676 hit_set->insert(p->soid);
13677 ++p;
13678 }
13679
13680 return true;
13681 }
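// The log is walked newest-to-oldest: entries newer than 'to' are
// skipped (defensive; 'to' is info.last_update, so there should be
// none), then every object touched in (from, to] is inserted.  Reads
// never reach the pg log, which is why this is only a partial
// reconstruction of the hit set.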
13682
13683 void PrimaryLogPG::hit_set_persist()
13684 {
13685 dout(10) << __func__ << dendl;
13686 bufferlist bl;
13687 unsigned max = pool.info.hit_set_count;
13688
13689 utime_t now = ceph_clock_now();
13690 hobject_t oid;
13691
13692 // If any archives are degraded we skip this persist request
13693 // account for the additional entry being added below
13694 for (auto p = info.hit_set.history.begin();
13695 p != info.hit_set.history.end();
13696 ++p) {
13697 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13698
13699 // Once we hit a degraded object just skip further trim
13700 if (is_degraded_or_backfilling_object(aoid))
13701 return;
13702 if (write_blocked_by_scrub(aoid))
13703 return;
13704 }
13705
13706 // If backfill is in progress and we could possibly overlap with the
13707 // hit_set_* objects, back off. Since these all have
13708 // hobject_t::hash set to pgid.ps(), and those sort first, we can
13709 // look just at that. This is necessary because our transactions
13710 // may include a modify of the new hit_set *and* a delete of the
13711 // old one, and this may span the backfill boundary.
13712 for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
13713 p != get_backfill_targets().end();
13714 ++p) {
13715 const pg_info_t& pi = recovery_state.get_peer_info(*p);
13716 if (pi.last_backfill == hobject_t() ||
13717 pi.last_backfill.get_hash() == info.pgid.ps()) {
13718 dout(10) << __func__ << " backfill target osd." << *p
13719 << " last_backfill has not progressed past pgid ps"
13720 << dendl;
13721 return;
13722 }
13723 }
13724
13725
13726 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
13727 new_hset.begin = hit_set_start_stamp;
13728 new_hset.end = now;
13729 oid = get_hit_set_archive_object(
13730 new_hset.begin,
13731 new_hset.end,
13732 new_hset.using_gmt);
13733
13734 // If writing the new archive object would collide with scrub we skip this persist request
13735 if (write_blocked_by_scrub(oid))
13736 return;
13737
13738 hit_set->seal();
13739 encode(*hit_set, bl);
13740 dout(20) << __func__ << " archive " << oid << dendl;
13741
13742 if (agent_state) {
13743 agent_state->add_hit_set(new_hset.begin, hit_set);
13744 uint32_t size = agent_state->hit_set_map.size();
13745 if (size >= pool.info.hit_set_count) {
13746 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
13747 }
13748 hit_set_in_memory_trim(size);
13749 }
13750
13751 ObjectContextRef obc = get_object_context(oid, true);
13752 OpContextUPtr ctx = simple_opc_create(obc);
13753
13754 ctx->at_version = get_next_version();
13755 ctx->updated_hset_history = info.hit_set;
13756 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
13757
13758 updated_hit_set_hist.current_last_update = info.last_update;
13759 new_hset.version = ctx->at_version;
13760
13761 updated_hit_set_hist.history.push_back(new_hset);
13762 hit_set_create();
13763
13764 // fabricate an object_info_t and SnapSet
13765 obc->obs.oi.version = ctx->at_version;
13766 obc->obs.oi.mtime = now;
13767 obc->obs.oi.size = bl.length();
13768 obc->obs.exists = true;
13769 obc->obs.oi.set_data_digest(bl.crc32c(-1));
13770
13771 ctx->new_obs = obc->obs;
13772
13773 ctx->new_snapset = obc->ssc->snapset;
13774
13775 ctx->delta_stats.num_objects++;
13776 ctx->delta_stats.num_objects_hit_set_archive++;
13777
13778 ctx->delta_stats.num_bytes += bl.length();
13779 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
13780
13781 bufferlist bss;
13782 encode(ctx->new_snapset, bss);
13783 bufferlist boi(sizeof(ctx->new_obs.oi));
13784 encode(ctx->new_obs.oi, boi,
13785 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13786
13787 ctx->op_t->create(oid);
13788 if (bl.length()) {
13789 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
13790 write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
13791 0, bl.length());
13792 ctx->clean_regions.mark_data_region_dirty(0, bl.length());
13793 }
13794 map <string, bufferlist> attrs;
13795 attrs[OI_ATTR].claim(boi);
13796 attrs[SS_ATTR].claim(bss);
13797 setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
13798 ctx->log.push_back(
13799 pg_log_entry_t(
13800 pg_log_entry_t::MODIFY,
13801 oid,
13802 ctx->at_version,
13803 eversion_t(),
13804 0,
13805 osd_reqid_t(),
13806 ctx->mtime,
13807 0)
13808 );
13809 ctx->log.back().clean_regions = ctx->clean_regions;
13810
13811 hit_set_trim(ctx, max);
13812
13813 simple_opc_submit(std::move(ctx));
13814 }
13815
13816 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
13817 {
13818 ceph_assert(ctx->updated_hset_history);
13819 pg_hit_set_history_t &updated_hit_set_hist =
13820 *(ctx->updated_hset_history);
13821 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
13822 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
13823 ceph_assert(p != updated_hit_set_hist.history.end());
13824 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13825
13826 ceph_assert(!is_degraded_or_backfilling_object(oid));
13827
13828 dout(20) << __func__ << " removing " << oid << dendl;
13829 ++ctx->at_version.version;
13830 ctx->log.push_back(
13831 pg_log_entry_t(pg_log_entry_t::DELETE,
13832 oid,
13833 ctx->at_version,
13834 p->version,
13835 0,
13836 osd_reqid_t(),
13837 ctx->mtime,
13838 0));
13839
13840 ctx->op_t->remove(oid);
13841 updated_hit_set_hist.history.pop_front();
13842
13843 ObjectContextRef obc = get_object_context(oid, false);
13844 ceph_assert(obc);
13845 --ctx->delta_stats.num_objects;
13846 --ctx->delta_stats.num_objects_hit_set_archive;
13847 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
13848 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
13849 }
13850 }
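// Each trimmed archive gets its own DELETE log entry, and
// ctx->at_version.version is bumped per removal so every entry carries
// a distinct version while staying within the same epoch as the op
// that triggered the trim.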
13851
13852 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
13853 {
13854 while (agent_state->hit_set_map.size() > max_in_memory) {
13855 agent_state->remove_oldest_hit_set();
13856 }
13857 }
13858
13859
13860 // =======================================
13861 // cache agent
13862
13863 void PrimaryLogPG::agent_setup()
13864 {
13865 ceph_assert(is_locked());
13866 if (!is_active() ||
13867 !is_primary() ||
13868 state_test(PG_STATE_PREMERGE) ||
13869 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13870 pool.info.tier_of < 0 ||
13871 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13872 agent_clear();
13873 return;
13874 }
13875 if (!agent_state) {
13876 agent_state.reset(new TierAgentState);
13877
13878 // choose random starting position
13879 agent_state->position = hobject_t();
13880 agent_state->position.pool = info.pgid.pool();
13881 agent_state->position.set_hash(pool.info.get_random_pg_position(
13882 info.pgid.pgid,
13883 rand()));
13884 agent_state->start = agent_state->position;
13885
13886 dout(10) << __func__ << " allocated new state, position "
13887 << agent_state->position << dendl;
13888 } else {
13889 dout(10) << __func__ << " keeping existing state" << dendl;
13890 }
13891
13892 if (info.stats.stats_invalid) {
13893 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13894 }
13895
13896 agent_choose_mode();
13897 }
13898
13899 void PrimaryLogPG::agent_clear()
13900 {
13901 agent_stop();
13902 agent_state.reset(NULL);
13903 }
13904
13905 // Return false if no objects were operated on since the start of a full pass over the object hash space
13906 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13907 {
13908 std::scoped_lock locker{*this};
13909 if (!agent_state) {
13910 dout(10) << __func__ << " no agent state, stopping" << dendl;
13911 return true;
13912 }
13913
13914 ceph_assert(!recovery_state.is_deleting());
13915
13916 if (agent_state->is_idle()) {
13917 dout(10) << __func__ << " idle, stopping" << dendl;
13918 return true;
13919 }
13920
13921 osd->logger->inc(l_osd_agent_wake);
13922
13923 dout(10) << __func__
13924 << " max " << start_max
13925 << ", flush " << agent_state->get_flush_mode_name()
13926 << ", evict " << agent_state->get_evict_mode_name()
13927 << ", pos " << agent_state->position
13928 << dendl;
13929 ceph_assert(is_primary());
13930 ceph_assert(is_active());
13931
13932 agent_load_hit_sets();
13933
13934 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13935 ceph_assert(base_pool);
13936
13937 int ls_min = 1;
13938 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13939
13940 // list some objects. this conveniently lists clones (oldest to
13941 // newest) before heads... the same order we want to flush in.
13942 //
13943 // NOTE: do not flush the Sequencer. we will assume that the
13944 // listing we get back is imprecise.
13945 vector<hobject_t> ls;
13946 hobject_t next;
13947 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13948 &ls, &next);
13949 ceph_assert(r >= 0);
13950 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
13951 int started = 0;
13952 for (vector<hobject_t>::iterator p = ls.begin();
13953 p != ls.end();
13954 ++p) {
13955 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13956 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13957 osd->logger->inc(l_osd_agent_skip);
13958 continue;
13959 }
13960 if (is_degraded_or_backfilling_object(*p)) {
13961 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13962 osd->logger->inc(l_osd_agent_skip);
13963 continue;
13964 }
13965 if (is_missing_object(p->get_head())) {
13966 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13967 osd->logger->inc(l_osd_agent_skip);
13968 continue;
13969 }
13970 ObjectContextRef obc = get_object_context(*p, false, NULL);
13971 if (!obc) {
13972 // we didn't flush; we may miss something here.
13973 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13974 osd->logger->inc(l_osd_agent_skip);
13975 continue;
13976 }
13977 if (!obc->obs.exists) {
13978 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13979 osd->logger->inc(l_osd_agent_skip);
13980 continue;
13981 }
13982 if (range_intersects_scrub(obc->obs.oi.soid,
13983 obc->obs.oi.soid.get_head())) {
13984 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13985 osd->logger->inc(l_osd_agent_skip);
13986 continue;
13987 }
13988 if (obc->is_blocked()) {
13989 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13990 osd->logger->inc(l_osd_agent_skip);
13991 continue;
13992 }
13993 if (obc->is_request_pending()) {
13994 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13995 osd->logger->inc(l_osd_agent_skip);
13996 continue;
13997 }
13998
13999 // be careful flushing omap to an EC pool.
14000 if (!base_pool->supports_omap() &&
14001 obc->obs.oi.is_omap()) {
14002 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14003 osd->logger->inc(l_osd_agent_skip);
14004 continue;
14005 }
14006
14007 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14008 agent_maybe_evict(obc, false))
14009 ++started;
14010 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14011 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14012 ++started;
14013 --agent_flush_quota;
14014 }
14015 if (started >= start_max) {
14016 // If finishing early, set "next" to the next object
14017 if (++p != ls.end())
14018 next = *p;
14019 break;
14020 }
14021 }
14022
14023 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14024 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14025 agent_state->hist_age = 0;
14026 agent_state->temp_hist.decay();
14027 }
14028
14029 // Total objects operated on so far
14030 int total_started = agent_state->started + started;
14031 bool need_delay = false;
14032
14033 dout(20) << __func__ << " start pos " << agent_state->position
14034 << " next start pos " << next
14035 << " started " << total_started << dendl;
14036
14037 // See if we've made a full pass over the object hash space
14038 // This might check at most ls_max objects a second time to notice that
14039 // we've checked every object at least once.
14040 if (agent_state->position < agent_state->start &&
14041 next >= agent_state->start) {
14042 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14043 if (total_started == 0)
14044 need_delay = true;
14045 else
14046 total_started = 0;
14047 agent_state->start = next;
14048 }
14049 agent_state->started = total_started;
14050
14051 // See if we are starting again from the beginning
14052 if (next.is_max())
14053 agent_state->position = hobject_t();
14054 else
14055 agent_state->position = next;
14056
14057 // Discard old in memory HitSets
14058 hit_set_in_memory_trim(pool.info.hit_set_count);
14059
14060 if (need_delay) {
14061 ceph_assert(agent_state->delaying == false);
14062 agent_delay();
14063 return false;
14064 }
14065 agent_choose_mode();
14066 return true;
14067 }
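/*
 * Wrap-around bookkeeping, in brief: 'start' marks where the current
 * pass over the hash space began.  When 'position' steps past 'start'
 * without a single flush or evict having been started during the whole
 * pass, the agent delays instead of spinning; otherwise the counter
 * resets and a new pass begins at 'next'.
 */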
14068
14069 void PrimaryLogPG::agent_load_hit_sets()
14070 {
14071 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14072 return;
14073 }
14074
14075 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14076 dout(10) << __func__ << dendl;
14077 for (auto p = info.hit_set.history.begin();
14078 p != info.hit_set.history.end(); ++p) {
14079 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14080 dout(10) << __func__ << " loading " << p->begin << "-"
14081 << p->end << dendl;
14082 if (!pool.info.is_replicated()) {
14083 // FIXME: EC not supported here yet
14084 derr << __func__ << " on non-replicated pool" << dendl;
14085 break;
14086 }
14087
14088 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14089 if (is_unreadable_object(oid)) {
14090 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14091 break;
14092 }
14093
14094 ObjectContextRef obc = get_object_context(oid, false);
14095 if (!obc) {
14096 derr << __func__ << ": could not load hitset " << oid << dendl;
14097 break;
14098 }
14099
14100 bufferlist bl;
14101 {
14102 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
14103 ceph_assert(r >= 0);
14104 }
14105 HitSetRef hs(new HitSet);
14106 bufferlist::const_iterator pbl = bl.begin();
14107 decode(*hs, pbl);
14108 agent_state->add_hit_set(p->begin.sec(), hs);
14109 }
14110 }
14111 }
14112 }
14113
14114 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14115 {
14116 if (!obc->obs.oi.is_dirty()) {
14117 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14118 osd->logger->inc(l_osd_agent_skip);
14119 return false;
14120 }
14121 if (obc->obs.oi.is_cache_pinned()) {
14122 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14123 osd->logger->inc(l_osd_agent_skip);
14124 return false;
14125 }
14126
14127 utime_t now = ceph_clock_now();
14128 utime_t ob_local_mtime;
14129 if (obc->obs.oi.local_mtime != utime_t()) {
14130 ob_local_mtime = obc->obs.oi.local_mtime;
14131 } else {
14132 ob_local_mtime = obc->obs.oi.mtime;
14133 }
14134 bool evict_mode_full =
14135 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14136 if (!evict_mode_full &&
14137 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
14138 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14139 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14140 osd->logger->inc(l_osd_agent_skip);
14141 return false;
14142 }
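  // Illustrative example (hypothetical values): with cache_min_flush_age =
  // 600s and an object whose local_mtime is only 200s in the past,
  // ob_local_mtime + 600s > now, so the object is "too young" and skipped
  // -- unless evict mode is FULL, in which case age is ignored.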
14143
14144 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14145 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14146 osd->logger->inc(l_osd_agent_skip);
14147 return false;
14148 }
14149
14150 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14151
14152 // FIXME: flush anything dirty, regardless of what distribution of
14153 // ages we expect.
14154
14155 hobject_t oid = obc->obs.oi.soid;
14156 osd->agent_start_op(oid);
14157 // no need to capture a pg ref, can't outlive fop or ctx
14158 std::function<void()> on_flush = [this, oid]() {
14159 osd->agent_finish_op(oid);
14160 };
14161
14162 int result = start_flush(
14163 OpRequestRef(), obc, false, NULL,
14164 on_flush);
14165 if (result != -EINPROGRESS) {
14166 on_flush();
14167 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14168 << " with " << result << dendl;
14169 osd->logger->inc(l_osd_agent_skip);
14170 return false;
14171 }
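  // Note: reaching this point means start_flush() returned -EINPROGRESS,
  // which is the success path here -- the flush was queued and on_flush
  // will run when it completes.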
14172
14173 osd->logger->inc(l_osd_agent_flush);
14174 return true;
14175 }
14176
14177 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14178 {
14179 const hobject_t& soid = obc->obs.oi.soid;
14180 if (!after_flush && obc->obs.oi.is_dirty()) {
14181 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14182 return false;
14183 }
14184 // This is already checked by agent_work() which passes after_flush = false
14185 if (after_flush && range_intersects_scrub(soid, soid.get_head())) {
14186 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14187 return false;
14188 }
14189 if (!obc->obs.oi.watchers.empty()) {
14190 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14191 return false;
14192 }
14193 if (obc->is_blocked()) {
14194 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14195 return false;
14196 }
14197 if (obc->obs.oi.is_cache_pinned()) {
14198 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14199 return false;
14200 }
14201
14202 if (soid.snap == CEPH_NOSNAP) {
14203 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
14204 if (result < 0) {
14205 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
14206 return false;
14207 }
14208 }
14209
14210 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
14211 // is this object older than cache_min_evict_age?
14212 utime_t now = ceph_clock_now();
14213 utime_t ob_local_mtime;
14214 if (obc->obs.oi.local_mtime != utime_t()) {
14215 ob_local_mtime = obc->obs.oi.local_mtime;
14216 } else {
14217 ob_local_mtime = obc->obs.oi.mtime;
14218 }
14219 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
14220 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14221 osd->logger->inc(l_osd_agent_skip);
14222 return false;
14223 }
14224 // is this object old and/or cold enough?
14225 int temp = 0;
14226 uint64_t temp_upper = 0, temp_lower = 0;
14227 if (hit_set)
14228 agent_estimate_temp(soid, &temp);
14229 agent_state->temp_hist.add(temp);
14230 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
14231
14232 dout(20) << __func__
14233 << " temp " << temp
14234 << " pos " << temp_lower << "-" << temp_upper
14235 << ", evict_effort " << agent_state->evict_effort
14236 << dendl;
14237 dout(30) << "agent_state:\n";
14238 Formatter *f = Formatter::create("");
14239 f->open_object_section("agent_state");
14240 agent_state->dump(f);
14241 f->close_section();
14242 f->flush(*_dout);
14243 delete f;
14244 *_dout << dendl;
14245
14246 if (1000000 - temp_upper >= agent_state->evict_effort)
14247 return false;
14248 }
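  // Illustrative arithmetic (hypothetical values): temp_upper is the
  // cumulative fraction, in millionths, of recorded temperatures at or
  // below this object's. With temp_upper = 650000 the object survives any
  // evict_effort <= 350000 and is only evicted once the effort rises above
  // 1000000 - 650000.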
14249
14250 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
14251 OpContextUPtr ctx = simple_opc_create(obc);
14252
14253 auto null_op_req = OpRequestRef();
14254 if (!ctx->lock_manager.get_lock_type(
14255 RWState::RWWRITE,
14256 obc->obs.oi.soid,
14257 obc,
14258 null_op_req)) {
14259 close_op_ctx(ctx.release());
14260 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
14261 return false;
14262 }
14263
14264 osd->agent_start_evict_op();
14265 ctx->register_on_finish(
14266 [this]() {
14267 osd->agent_finish_evict_op();
14268 });
14269
14270 ctx->at_version = get_next_version();
14271 ceph_assert(ctx->new_obs.exists);
14272 int r = _delete_oid(ctx.get(), true, false);
14273 if (obc->obs.oi.is_omap())
14274 ctx->delta_stats.num_objects_omap--;
14275 ctx->delta_stats.num_evict++;
14276 ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
14277 if (obc->obs.oi.is_dirty())
14278 --ctx->delta_stats.num_objects_dirty;
14279 ceph_assert(r == 0);
14280 finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
14281 simple_opc_submit(std::move(ctx));
14282 osd->logger->inc(l_osd_tier_evict);
14283 osd->logger->inc(l_osd_agent_evict);
14284 return true;
14285 }
14286
14287 void PrimaryLogPG::agent_stop()
14288 {
14289 dout(20) << __func__ << dendl;
14290 if (agent_state && !agent_state->is_idle()) {
14291 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
14292 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14293 osd->agent_disable_pg(this, agent_state->evict_effort);
14294 }
14295 }
14296
14297 void PrimaryLogPG::agent_delay()
14298 {
14299 dout(20) << __func__ << dendl;
14300 if (agent_state && !agent_state->is_idle()) {
14301 ceph_assert(agent_state->delaying == false);
14302 agent_state->delaying = true;
14303 osd->agent_disable_pg(this, agent_state->evict_effort);
14304 }
14305 }
14306
14307 void PrimaryLogPG::agent_choose_mode_restart()
14308 {
14309 dout(20) << __func__ << dendl;
14310 std::scoped_lock locker{*this};
14311 if (agent_state && agent_state->delaying) {
14312 agent_state->delaying = false;
14313 agent_choose_mode(true);
14314 }
14315 }
14316
14317 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
14318 {
14319 bool requeued = false;
14320 // Let delay play out
14321 if (agent_state->delaying) {
14322 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
14323 return requeued;
14324 }
14325
14326 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14327 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
14328 unsigned evict_effort = 0;
14329
14330 if (info.stats.stats_invalid) {
14331 // idle; stats can't be trusted until we scrub.
14332 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
14333 goto skip_calc;
14334 }
14335
14336 {
14337 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
14338 ceph_assert(divisor > 0);
14339
14340 // adjust (effective) user objects down based on the number
14341 // of HitSet objects, which should not count toward our total since
14342 // they cannot be flushed.
14343 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
14344
14345 // also exclude omap objects if the backing pool does not support omap (e.g. EC)
14346 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14347 ceph_assert(base_pool);
14348 if (!base_pool->supports_omap())
14349 unflushable += info.stats.stats.sum.num_objects_omap;
14350
14351 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
14352 if (num_user_objects > unflushable)
14353 num_user_objects -= unflushable;
14354 else
14355 num_user_objects = 0;
14356
14357 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
14358 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
14359 num_user_bytes -= unflushable_bytes;
14360 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
14361 num_user_bytes += num_overhead_bytes;
14362
14363 // also reduce the num_dirty by num_objects_omap
14364 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
14365 if (!base_pool->supports_omap()) {
14366 if (num_dirty > info.stats.stats.sum.num_objects_omap)
14367 num_dirty -= info.stats.stats.sum.num_objects_omap;
14368 else
14369 num_dirty = 0;
14370 }
14371
14372 dout(10) << __func__
14373 << " flush_mode: "
14374 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14375 << " evict_mode: "
14376 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14377 << " num_objects: " << info.stats.stats.sum.num_objects
14378 << " num_bytes: " << info.stats.stats.sum.num_bytes
14379 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
14380 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
14381 << " num_dirty: " << num_dirty
14382 << " num_user_objects: " << num_user_objects
14383 << " num_user_bytes: " << num_user_bytes
14384 << " num_overhead_bytes: " << num_overhead_bytes
14385 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
14386 << " pool.info.target_max_objects: " << pool.info.target_max_objects
14387 << dendl;
14388
14389 // get dirty, full ratios
14390 uint64_t dirty_micro = 0;
14391 uint64_t full_micro = 0;
14392 if (pool.info.target_max_bytes && num_user_objects > 0) {
14393 uint64_t avg_size = num_user_bytes / num_user_objects;
14394 dirty_micro =
14395 num_dirty * avg_size * 1000000 /
14396 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14397 full_micro =
14398 num_user_objects * avg_size * 1000000 /
14399 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14400 }
14401 if (pool.info.target_max_objects > 0) {
14402 uint64_t dirty_objects_micro =
14403 num_dirty * 1000000 /
14404 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14405 if (dirty_objects_micro > dirty_micro)
14406 dirty_micro = dirty_objects_micro;
14407 uint64_t full_objects_micro =
14408 num_user_objects * 1000000 /
14409 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14410 if (full_objects_micro > full_micro)
14411 full_micro = full_objects_micro;
14412 }
14413 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
14414 << " full " << ((float)full_micro / 1000000.0)
14415 << dendl;
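    // Worked example (hypothetical values): target_max_bytes = 1 TiB over
    // divisor = 256 PGs gives this PG a 4 GiB budget. With
    // num_user_objects = 1000, num_user_bytes = 2 GiB (avg_size = 2 MiB)
    // and num_dirty = 500:
    //   dirty_micro = 500 * 2 MiB * 1e6 / 4 GiB  = 250000   (25% dirty)
    //   full_micro  = 1000 * 2 MiB * 1e6 / 4 GiB = 500000   (50% full)
    // The object-count ratios below are computed the same way; the larger
    // estimate wins.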
14416
14417 // flush mode
14418 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
14419 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
14420 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
14421 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
14422 flush_target += flush_slop;
14423 flush_high_target += flush_slop;
14424 } else {
14425 flush_target -= std::min(flush_target, flush_slop);
14426 flush_high_target -= std::min(flush_high_target, flush_slop);
14427 }
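    // Worked example (hypothetical values): flush_target = 400000 (40%)
    // and osd_agent_slop = 0.02 give flush_slop = 8000. While idle the
    // effective target is 408000, so flushing starts only above 40.8%
    // dirty; once active it drops to 392000, so flushing continues until
    // dirty falls below 39.2%. The +/- slop is hysteresis against mode
    // flapping; the evict_target below gets the same treatment.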
14428
14429 if (dirty_micro > flush_high_target) {
14430 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
14431 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
14432 flush_mode = TierAgentState::FLUSH_MODE_LOW;
14433 }
14434
14435 // evict mode
14436 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
14437 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
14438 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
14439 evict_target += evict_slop;
14440 else
14441 evict_target -= std::min(evict_target, evict_slop);
14442
14443 if (full_micro > 1000000) {
14444 // evict anything clean
14445 evict_mode = TierAgentState::EVICT_MODE_FULL;
14446 evict_effort = 1000000;
14447 } else if (full_micro > evict_target) {
14448 // set effort in [0..1] range based on where we are between evict_target and 100% full
14449 evict_mode = TierAgentState::EVICT_MODE_SOME;
14450 uint64_t over = full_micro - evict_target;
14451 uint64_t span = 1000000 - evict_target;
14452 evict_effort = std::max(over * 1000000 / span,
14453 uint64_t(1000000.0 *
14454 cct->_conf->osd_agent_min_evict_effort));
14455
14456 // quantize effort to avoid too much reordering in the agent_queue.
14457 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
14458 ceph_assert(inc > 0);
14459 uint64_t was = evict_effort;
14460 evict_effort -= evict_effort % inc;
14461 if (evict_effort < inc)
14462 evict_effort = inc;
14463 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
14464 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
14465 }
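    // Worked example (hypothetical values): evict_target = 800000 and
    // full_micro = 870000 give over = 70000 and span = 200000, so the raw
    // effort is 70000 * 1000000 / 200000 = 350000. With
    // osd_agent_quantize_effort = 0.1, inc = 100000 and the effort is
    // rounded down to 300000; anything below one increment is clamped up
    // to inc so a nonzero effort always remains.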
14466 }
14467
14468 skip_calc:
14469 bool old_idle = agent_state->is_idle();
14470 if (flush_mode != agent_state->flush_mode) {
14471 dout(5) << __func__ << " flush_mode "
14472 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14473 << " -> "
14474 << TierAgentState::get_flush_mode_name(flush_mode)
14475 << dendl;
14476 recovery_state.update_stats(
14477 [=](auto &history, auto &stats) {
14478 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14479 osd->agent_inc_high_count();
14480 stats.stats.sum.num_flush_mode_high = 1;
14481 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14482 stats.stats.sum.num_flush_mode_low = 1;
14483 }
14484 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14485 osd->agent_dec_high_count();
14486 stats.stats.sum.num_flush_mode_high = 0;
14487 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14488 stats.stats.sum.num_flush_mode_low = 0;
14489 }
14490 return false;
14491 });
14492 agent_state->flush_mode = flush_mode;
14493 }
14494 if (evict_mode != agent_state->evict_mode) {
14495 dout(5) << __func__ << " evict_mode "
14496 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14497 << " -> "
14498 << TierAgentState::get_evict_mode_name(evict_mode)
14499 << dendl;
14500 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
14501 is_active()) {
14502 if (op)
14503 requeue_op(op);
14504 requeue_ops(waiting_for_flush);
14505 requeue_ops(waiting_for_active);
14506 requeue_ops(waiting_for_readable);
14507 requeue_ops(waiting_for_scrub);
14508 requeue_ops(waiting_for_cache_not_full);
14509 objects_blocked_on_cache_full.clear();
14510 requeued = true;
14511 }
14512 recovery_state.update_stats(
14513 [=](auto &history, auto &stats) {
14514 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
14515 stats.stats.sum.num_evict_mode_some = 1;
14516 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
14517 stats.stats.sum.num_evict_mode_full = 1;
14518 }
14519 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
14520 stats.stats.sum.num_evict_mode_some = 0;
14521 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
14522 stats.stats.sum.num_evict_mode_full = 0;
14523 }
14524 return false;
14525 });
14526 agent_state->evict_mode = evict_mode;
14527 }
14528 uint64_t old_effort = agent_state->evict_effort;
14529 if (evict_effort != agent_state->evict_effort) {
14530 dout(5) << __func__ << " evict_effort "
14531 << ((float)agent_state->evict_effort / 1000000.0)
14532 << " -> "
14533 << ((float)evict_effort / 1000000.0)
14534 << dendl;
14535 agent_state->evict_effort = evict_effort;
14536 }
14537
14538 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14539 // (including flush). This is probably fine (they should be
14540 // correlated) but it is not precisely correct.
14541 if (agent_state->is_idle()) {
14542 if (!restart && !old_idle) {
14543 osd->agent_disable_pg(this, old_effort);
14544 }
14545 } else {
14546 if (restart || old_idle) {
14547 osd->agent_enable_pg(this, agent_state->evict_effort);
14548 } else if (old_effort != agent_state->evict_effort) {
14549 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
14550 }
14551 }
14552 return requeued;
14553 }
14554
14555 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
14556 {
14557 ceph_assert(hit_set);
14558 ceph_assert(temp);
14559 *temp = 0;
14560 if (hit_set->contains(oid))
14561 *temp = 1000000;
14562 unsigned i = 0;
14563 int last_n = pool.info.hit_set_search_last_n;
14564 for (map<time_t,HitSetRef>::reverse_iterator p =
14565 agent_state->hit_set_map.rbegin(); last_n > 0 &&
14566 p != agent_state->hit_set_map.rend(); ++p, ++i) {
14567 if (p->second->contains(oid)) {
14568 *temp += pool.info.get_grade(i);
14569 --last_n;
14570 }
14571 }
14572 }
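// Illustrative example (hypothetical grades): an object found in the
// current hit set starts at 1000000. If it also appears in the two most
// recent archived hit sets and pool.info.get_grade(i) yields 500000 and
// 250000 for i = 0 and 1, the estimate becomes 1750000 in total.
// hit_set_search_last_n bounds how many archived-set hits may contribute.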
14573
14574 // Dup op detection
14575
14576 bool PrimaryLogPG::already_complete(eversion_t v)
14577 {
14578 dout(20) << __func__ << ": " << v << dendl;
14579 for (xlist<RepGather*>::iterator i = repop_queue.begin();
14580 !i.end();
14581 ++i) {
14582 dout(20) << __func__ << ": " << **i << dendl;
14583 // skip copy from temp object ops
14584 if ((*i)->v == eversion_t()) {
14585 dout(20) << __func__ << ": " << **i
14586 << " version is empty" << dendl;
14587 continue;
14588 }
14589 if ((*i)->v > v) {
14590 dout(20) << __func__ << ": " << **i
14591 << " (*i)->v past v" << dendl;
14592 break;
14593 }
14594 if (!(*i)->all_committed) {
14595 dout(20) << __func__ << ": " << **i
14596 << " not committed, returning false"
14597 << dendl;
14598 return false;
14599 }
14600 }
14601 dout(20) << __func__ << ": returning true" << dendl;
14602 return true;
14603 }
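// Illustrative trace: repop_queue is ordered by version. For v = 100'10,
// an entry at 100'8 that is all_committed is fine, an entry at 100'12
// ends the scan (it is past v), and an uncommitted entry at 100'9 means
// the op is not yet complete, so the dup-op reply must wait.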
14604
14605
14606 // ==========================================================================================
14607 // SCRUB
14608
14609
14610 bool PrimaryLogPG::_range_available_for_scrub(
14611 const hobject_t &begin, const hobject_t &end)
14612 {
14613 pair<hobject_t, ObjectContextRef> next;
14614 next.second = object_contexts.lookup(begin);
14615 next.first = begin;
14616 bool more = true;
14617 while (more && next.first < end) {
14618 if (next.second && next.second->is_blocked()) {
14619 next.second->requeue_scrub_on_unblock = true;
14620 dout(10) << __func__ << ": scrub delayed, "
14621 << next.first << " is blocked"
14622 << dendl;
14623 return false;
14624 }
14625 more = object_contexts.get_next(next.first, &next);
14626 }
14627 return true;
14628 }
14629
14630 static bool doing_clones(const std::optional<SnapSet> &snapset,
14631 const vector<snapid_t>::reverse_iterator &curclone) {
14632 return snapset && curclone != snapset->clones.rend();
14633 }
14634
14635 void PrimaryLogPG::log_missing(unsigned missing,
14636 const std::optional<hobject_t> &head,
14637 LogChannelRef clog,
14638 const spg_t &pgid,
14639 const char *func,
14640 const char *mode,
14641 bool allow_incomplete_clones)
14642 {
14643 ceph_assert(head);
14644 if (allow_incomplete_clones) {
14645 dout(20) << func << " " << mode << " " << pgid << " " << *head
14646 << " skipped " << missing << " clone(s) in cache tier" << dendl;
14647 } else {
14648 clog->info() << mode << " " << pgid << " " << *head
14649 << " : " << missing << " missing clone(s)";
14650 }
14651 }
14652
14653 unsigned PrimaryLogPG::process_clones_to(const std::optional<hobject_t> &head,
14654 const std::optional<SnapSet> &snapset,
14655 LogChannelRef clog,
14656 const spg_t &pgid,
14657 const char *mode,
14658 bool allow_incomplete_clones,
14659 std::optional<snapid_t> target,
14660 vector<snapid_t>::reverse_iterator *curclone,
14661 inconsistent_snapset_wrapper &e)
14662 {
14663 ceph_assert(head);
14664 ceph_assert(snapset);
14665 unsigned missing = 0;
14666
14667 // NOTE: clones are in descending order, hence the **curclone > *target test here
14668 hobject_t next_clone(*head);
14669 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
14670 ++missing;
14671 // it is okay to be missing one or more clones in a cache tier.
14672 // skip higher-numbered clones in the list.
14673 if (!allow_incomplete_clones) {
14674 next_clone.snap = **curclone;
14675 clog->error() << mode << " " << pgid << " " << *head
14676 << " : expected clone " << next_clone << " " << missing
14677 << " missing";
14678 ++scrubber.shallow_errors;
14679 e.set_clone_missing(next_clone.snap);
14680 }
14681 // Clones are descending
14682 ++(*curclone);
14683 }
14684 return missing;
14685 }
14686
14687 /*
14688 * Validate consistency of the object info and snap sets.
14689 *
14690 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
14691 * the comparison of the objects is against multiple snapset.clones. There are
14692 * multiple clone lists and in between lists we expect head.
14693 *
14694 * Example
14695 *
14696 * objects expected
14697 * ======= =======
14698 * obj1 snap 1 head, unexpected obj1 snap 1
14699 * obj2 head head, match
14700 * [SnapSet clones 6 4 2 1]
14701 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
14702 * obj2 snap 6 obj2 snap 6, match
14703 * obj2 snap 4 obj2 snap 4, match
14704 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
14705 * [SnapSet clones 3 1]
14706 * obj3 snap 3 obj3 snap 3 match
14707 * obj3 snap 1 obj3 snap 1 match
14708 * obj4 head head, match
14709 * [SnapSet clones 4]
14710 * EOL obj4 snap 4, (expected)
14711 */
14712 void PrimaryLogPG::scrub_snapshot_metadata(
14713 ScrubMap &scrubmap,
14714 const map<hobject_t,
14715 pair<std::optional<uint32_t>,
14716 std::optional<uint32_t>>> &missing_digest)
14717 {
14718 dout(10) << __func__ << dendl;
14719
14720 bool repair = state_test(PG_STATE_REPAIR);
14721 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14722 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14723 std::optional<snapid_t> all_clones; // Unspecified snapid_t or std::nullopt
14724
14725 // traverse in reverse order.
14726 std::optional<hobject_t> head;
14727 std::optional<SnapSet> snapset; // If initialized, head (above) will be too
14728 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
14729 unsigned missing = 0;
14730 inconsistent_snapset_wrapper soid_error, head_error;
14731 unsigned soid_error_count = 0;
14732
14733 for (map<hobject_t,ScrubMap::object>::reverse_iterator
14734 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
14735 const hobject_t& soid = p->first;
14736 ceph_assert(!soid.is_snapdir());
14737 soid_error = inconsistent_snapset_wrapper{soid};
14738 object_stat_sum_t stat;
14739 std::optional<object_info_t> oi;
14740
14741 stat.num_objects++;
14742
14743 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14744 stat.num_objects_hit_set_archive++;
14745
14746 if (soid.is_snap()) {
14747 // it's a clone
14748 stat.num_object_clones++;
14749 }
14750
14751 // basic checks.
14752 if (p->second.attrs.count(OI_ATTR) == 0) {
14753 oi = std::nullopt;
14754 osd->clog->error() << mode << " " << info.pgid << " " << soid
14755 << " : no '" << OI_ATTR << "' attr";
14756 ++scrubber.shallow_errors;
14757 soid_error.set_info_missing();
14758 } else {
14759 bufferlist bv;
14760 bv.push_back(p->second.attrs[OI_ATTR]);
14761 try {
14762 oi = object_info_t(); // Initialize optional<> before decoding into it
14763 oi->decode(bv);
14764 } catch (buffer::error& e) {
14765 oi = std::nullopt;
14766 osd->clog->error() << mode << " " << info.pgid << " " << soid
14767 << " : can't decode '" << OI_ATTR << "' attr " << e.what();
14768 ++scrubber.shallow_errors;
14769 soid_error.set_info_corrupted();
14770 soid_error.set_info_missing(); // Not available too
14771 }
14772 }
14773
14774 if (oi) {
14775 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
14776 osd->clog->error() << mode << " " << info.pgid << " " << soid
14777 << " : on disk size (" << p->second.size
14778 << ") does not match object info size ("
14779 << oi->size << ") adjusted for ondisk to ("
14780 << pgbackend->be_get_ondisk_size(oi->size)
14781 << ")";
14782 soid_error.set_size_mismatch();
14783 ++scrubber.shallow_errors;
14784 }
14785
14786 dout(20) << mode << " " << soid << " " << *oi << dendl;
14787
14788 // A clone num_bytes will be added later when we have snapset
14789 if (!soid.is_snap()) {
14790 stat.num_bytes += oi->size;
14791 }
14792 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14793 stat.num_bytes_hit_set_archive += oi->size;
14794
14795 if (oi->is_dirty())
14796 ++stat.num_objects_dirty;
14797 if (oi->is_whiteout())
14798 ++stat.num_whiteouts;
14799 if (oi->is_omap())
14800 ++stat.num_objects_omap;
14801 if (oi->is_cache_pinned())
14802 ++stat.num_objects_pinned;
14803 if (oi->has_manifest())
14804 ++stat.num_objects_manifest;
14805 }
14806
14807 // Check for any problems while processing clones
14808 if (doing_clones(snapset, curclone)) {
14809 std::optional<snapid_t> target;
14810 // Expecting an object with snap for current head
14811 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
14812
14813 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
14814 << soid << " while processing " << *head << dendl;
14815
14816 target = all_clones;
14817 } else {
14818 ceph_assert(soid.is_snap());
14819 target = soid.snap;
14820 }
14821
14822 // Log any clones we were expecting to be there up to target
14823 // This will set missing, but will be a no-op if soid.snap == *curclone.
14824 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14825 pool.info.allow_incomplete_clones(), target, &curclone,
14826 head_error);
14827 }
14828 bool expected;
14829 // Check doing_clones() again in case we ran process_clones_to()
14830 if (doing_clones(snapset, curclone)) {
14831 // A head would have processed all clones above
14832 // or all greater than *curclone.
14833 ceph_assert(soid.is_snap() && *curclone <= soid.snap);
14834
14835 // After processing above clone snap should match the expected curclone
14836 expected = (*curclone == soid.snap);
14837 } else {
14838 // If we aren't doing clones any longer, then expecting head
14839 expected = soid.has_snapset();
14840 }
14841 if (!expected) {
14842 // If we couldn't read the head's snapset, just ignore clones
14843 if (head && !snapset) {
14844 osd->clog->error() << mode << " " << info.pgid << " " << soid
14845 << " : clone ignored due to missing snapset";
14846 } else {
14847 osd->clog->error() << mode << " " << info.pgid << " " << soid
14848 << " : is an unexpected clone";
14849 }
14850 ++scrubber.shallow_errors;
14851 soid_error.set_headless();
14852 scrubber.store->add_snap_error(pool.id, soid_error);
14853 ++soid_error_count;
14854 if (head && soid.get_head() == head->get_head())
14855 head_error.set_clone(soid.snap);
14856 continue;
14857 }
14858
14859 // new snapset?
14860 if (soid.has_snapset()) {
14861
14862 if (missing) {
14863 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14864 pool.info.allow_incomplete_clones());
14865 }
14866
14867 // Save previous head error information
14868 if (head && (head_error.errors || soid_error_count))
14869 scrubber.store->add_snap_error(pool.id, head_error);
14870 // Set this as a new head object
14871 head = soid;
14872 missing = 0;
14873 head_error = soid_error;
14874 soid_error_count = 0;
14875
14876 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14877
14878 if (p->second.attrs.count(SS_ATTR) == 0) {
14879 osd->clog->error() << mode << " " << info.pgid << " " << soid
14880 << " : no '" << SS_ATTR << "' attr";
14881 ++scrubber.shallow_errors;
14882 snapset = std::nullopt;
14883 head_error.set_snapset_missing();
14884 } else {
14885 bufferlist bl;
14886 bl.push_back(p->second.attrs[SS_ATTR]);
14887 auto blp = bl.cbegin();
14888 try {
14889 snapset = SnapSet(); // Initialize optional<> before decoding into it
14890 decode(*snapset, blp);
14891 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14892 } catch (buffer::error& e) {
14893 snapset = std::nullopt;
14894 osd->clog->error() << mode << " " << info.pgid << " " << soid
14895 << " : can't decode '" << SS_ATTR << "' attr " << e.what();
14896 ++scrubber.shallow_errors;
14897 head_error.set_snapset_corrupted();
14898 }
14899 }
14900
14901 if (snapset) {
14902 // what will be next?
14903 curclone = snapset->clones.rbegin();
14904
14905 if (!snapset->clones.empty()) {
14906 dout(20) << " snapset " << *snapset << dendl;
14907 if (snapset->seq == 0) {
14908 osd->clog->error() << mode << " " << info.pgid << " " << soid
14909 << " : snaps.seq not set";
14910 ++scrubber.shallow_errors;
14911 head_error.set_snapset_error();
14912 }
14913 }
14914 }
14915 } else {
14916 ceph_assert(soid.is_snap());
14917 ceph_assert(head);
14918 ceph_assert(snapset);
14919 ceph_assert(soid.snap == *curclone);
14920
14921 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14922
14923 if (snapset->clone_size.count(soid.snap) == 0) {
14924 osd->clog->error() << mode << " " << info.pgid << " " << soid
14925 << " : is missing in clone_size";
14926 ++scrubber.shallow_errors;
14927 soid_error.set_size_mismatch();
14928 } else {
14929 if (oi && oi->size != snapset->clone_size[soid.snap]) {
14930 osd->clog->error() << mode << " " << info.pgid << " " << soid
14931 << " : size " << oi->size << " != clone_size "
14932 << snapset->clone_size[soid.snap];
14933 ++scrubber.shallow_errors;
14934 soid_error.set_size_mismatch();
14935 }
14936
14937 if (snapset->clone_overlap.count(soid.snap) == 0) {
14938 osd->clog->error() << mode << " " << info.pgid << " " << soid
14939 << " : is missing in clone_overlap";
14940 ++scrubber.shallow_errors;
14941 soid_error.set_size_mismatch();
14942 } else {
14943 // This checking is based on get_clone_bytes(). The first 2 asserts
14944 // can't happen because we know we have a clone_size and
14945 // a clone_overlap. Now we check that the interval_set won't
14946 // cause the last assert.
14947 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14948 const interval_set<uint64_t> &overlap =
14949 snapset->clone_overlap.find(soid.snap)->second;
14950 bool bad_interval_set = false;
14951 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14952 i != overlap.end(); ++i) {
14953 if (size < i.get_len()) {
14954 bad_interval_set = true;
14955 break;
14956 }
14957 size -= i.get_len();
14958 }
14959
14960 if (bad_interval_set) {
14961 osd->clog->error() << mode << " " << info.pgid << " " << soid
14962 << " : bad interval_set in clone_overlap";
14963 ++scrubber.shallow_errors;
14964 soid_error.set_size_mismatch();
14965 } else {
14966 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14967 }
14968 }
14969 }
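	  // Worked example (hypothetical values): clone_size = 100 with
	  // clone_overlap intervals of lengths 60 and 30 subtracts 90,
	  // leaving 10 unique bytes for get_clone_bytes(). If the interval
	  // lengths summed past 100, get_clone_bytes() would underflow and
	  // assert, which is exactly what the scan above catches first.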
14970
14971 // what's next?
14972 ++curclone;
14973 if (soid_error.errors) {
14974 scrubber.store->add_snap_error(pool.id, soid_error);
14975 ++soid_error_count;
14976 }
14977 }
14978
14979 scrub_cstat.add(stat);
14980 }
14981
14982 if (doing_clones(snapset, curclone)) {
14983 dout(10) << __func__ << " " << mode << " " << info.pgid
14984 << " No more objects while processing " << *head << dendl;
14985
14986 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14987 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14988 head_error);
14989 }
14990 // There could be missing clones found by the check above, or even
14991 // ones detected before dropping out of the loop for the last head.
14992 if (missing) {
14993 log_missing(missing, head, osd->clog, info.pgid, __func__,
14994 mode, pool.info.allow_incomplete_clones());
14995 }
14996 if (head && (head_error.errors || soid_error_count))
14997 scrubber.store->add_snap_error(pool.id, head_error);
14998
14999 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
15000 ceph_assert(!p->first.is_snapdir());
15001 dout(10) << __func__ << " recording digests for " << p->first << dendl;
15002 ObjectContextRef obc = get_object_context(p->first, false);
15003 if (!obc) {
15004 osd->clog->error() << info.pgid << " " << mode
15005 << " cannot get object context for object "
15006 << p->first;
15007 continue;
15008 } else if (obc->obs.oi.soid != p->first) {
15009 osd->clog->error() << info.pgid << " " << mode
15010 << " " << p->first
15011 << " : object has a valid oi attr with a mismatched name, "
15012 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
15013 continue;
15014 }
15015 OpContextUPtr ctx = simple_opc_create(obc);
15016 ctx->at_version = get_next_version();
15017 ctx->mtime = utime_t(); // do not update mtime
15018 if (p->second.first) {
15019 ctx->new_obs.oi.set_data_digest(*p->second.first);
15020 } else {
15021 ctx->new_obs.oi.clear_data_digest();
15022 }
15023 if (p->second.second) {
15024 ctx->new_obs.oi.set_omap_digest(*p->second.second);
15025 } else {
15026 ctx->new_obs.oi.clear_omap_digest();
15027 }
15028 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
15029
15030 ctx->register_on_success(
15031 [this]() {
15032 dout(20) << "updating scrub digest" << dendl;
15033 if (--scrubber.num_digest_updates_pending == 0) {
15034 requeue_scrub();
15035 }
15036 });
15037
15038 simple_opc_submit(std::move(ctx));
15039 ++scrubber.num_digest_updates_pending;
15040 }
15041
15042 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
15043 }
15044
15045 void PrimaryLogPG::_scrub_clear_state()
15046 {
15047 scrub_cstat = object_stat_collection_t();
15048 }
15049
15050 void PrimaryLogPG::_scrub_finish()
15051 {
15052 bool repair = state_test(PG_STATE_REPAIR);
15053 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
15054 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
15055
15056 if (info.stats.stats_invalid) {
15057 recovery_state.update_stats(
15058 [=](auto &history, auto &stats) {
15059 stats.stats = scrub_cstat;
15060 stats.stats_invalid = false;
15061 return false;
15062 });
15063
15064 if (agent_state)
15065 agent_choose_mode();
15066 }
15067
15068 dout(10) << mode << " got "
15069 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15070 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15071 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15072 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15073 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15074 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15075 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15076 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15077 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
15078 << dendl;
15079
15080 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
15081 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
15082 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
15083 !info.stats.dirty_stats_invalid) ||
15084 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
15085 !info.stats.omap_stats_invalid) ||
15086 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
15087 !info.stats.pin_stats_invalid) ||
15088 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
15089 !info.stats.hitset_stats_invalid) ||
15090 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
15091 !info.stats.hitset_bytes_stats_invalid) ||
15092 (scrub_cstat.sum.num_objects_manifest != info.stats.stats.sum.num_objects_manifest &&
15093 !info.stats.manifest_stats_invalid) ||
15094 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
15095 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
15096 osd->clog->error() << info.pgid << " " << mode
15097 << " : stat mismatch, got "
15098 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15099 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15100 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15101 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15102 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15103 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15104 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
15105 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15106 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15107 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
15108 ++scrubber.shallow_errors;
15109
15110 if (repair) {
15111 ++scrubber.fixed;
15112 recovery_state.update_stats(
15113 [this](auto &history, auto &stats) {
15114 stats.stats = scrub_cstat;
15115 stats.dirty_stats_invalid = false;
15116 stats.omap_stats_invalid = false;
15117 stats.hitset_stats_invalid = false;
15118 stats.hitset_bytes_stats_invalid = false;
15119 stats.pin_stats_invalid = false;
15120 stats.manifest_stats_invalid = false;
15121 return false;
15122 });
15123 publish_stats_to_osd();
15124 recovery_state.share_pg_info();
15125 }
15126 }
15127 // Clear object context cache to get repair information
15128 if (repair)
15129 object_contexts.clear();
15130 }
15131
15132 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
15133 {
15134 OpRequestRef op = ctx->op;
15135 // Only supports replicated pools
15136 ceph_assert(!pool.info.is_erasure());
15137 ceph_assert(is_primary());
15138
15139 dout(10) << __func__ << " " << soid
15140 << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
15141
15142 if (!is_clean()) {
15143 block_for_clean(soid, op);
15144 return -EAGAIN;
15145 }
15146
15147 ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
15148 auto& oi = ctx->new_obs.oi;
15149 eversion_t v = oi.version;
15150
15151 if (primary_error(soid, v)) {
15152 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15153 // XXX: If we knew that there is no down osd which could include this
15154 // object, it would be nice if we could return EIO here.
15155 // If a "never fail" flag was available, that could be used
15156 // for rbd to NOT return EIO until object marked lost.
15157
15158 // Drop through to save this op in case an osd comes up with the object.
15159 }
15160
15161 // Restart the op after object becomes readable again
15162 waiting_for_unreadable_object[soid].push_back(op);
15163 op->mark_delayed("waiting for missing object");
15164
15165 if (!eio_errors_to_process) {
15166 eio_errors_to_process = true;
15167 ceph_assert(is_clean());
15168 state_set(PG_STATE_REPAIR);
15169 state_clear(PG_STATE_CLEAN);
15170 queue_peering_event(
15171 PGPeeringEventRef(
15172 std::make_shared<PGPeeringEvent>(
15173 get_osdmap_epoch(),
15174 get_osdmap_epoch(),
15175 PeeringState::DoRecovery())));
15176 } else {
15177 // A prior error must have already cleared clean state and queued recovery
15178 // or a map change has triggered re-peering.
15179 // Not inlining the recovery by calling maybe_kick_recovery(soid);
15180 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
15181 }
15182
15183 return -EAGAIN;
15184 }
15185
15186 /*---SnapTrimmer Logging---*/
15187 #undef dout_prefix
15188 #define dout_prefix pg->gen_prefix(*_dout)
15189
15190 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15191 {
15192 ldout(pg->cct, 20) << "enter " << state_name << dendl;
15193 }
15194
15195 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15196 {
15197 ldout(pg->cct, 20) << "exit " << state_name << dendl;
15198 }
15199
15200 /*---SnapTrimmer states---*/
15201 #undef dout_prefix
15202 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15203 << "SnapTrimmer state<" << get_state_name() << ">: ")
15204
15205 /* NotTrimming */
15206 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15207 : my_base(ctx),
15208 NamedState(nullptr, "NotTrimming")
15209 {
15210 context< SnapTrimmer >().log_enter(state_name);
15211 }
15212
15213 void PrimaryLogPG::NotTrimming::exit()
15214 {
15215 context< SnapTrimmer >().log_exit(state_name, enter_time);
15216 }
15217
15218 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15219 {
15220 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15221 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15222
15223 if (!(pg->is_primary() && pg->is_active())) {
15224 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15225 return discard_event();
15226 }
15227 if (!pg->is_clean() ||
15228 pg->snap_trimq.empty()) {
15229 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15230 return discard_event();
15231 }
15232 if (pg->scrubber.active) {
15233 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
15234 return transit< WaitScrub >();
15235 } else {
15236 return transit< Trimming >();
15237 }
15238 }
15239
15240 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15241 {
15242 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15243 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15244
15245 pending = nullptr;
15246 if (!context< SnapTrimmer >().can_trim()) {
15247 post_event(KickTrim());
15248 return transit< NotTrimming >();
15249 }
15250
15251 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15252 ldout(pg->cct, 10) << "NotTrimming: trimming "
15253 << pg->snap_trimq.range_start()
15254 << dendl;
15255 return transit< AwaitAsyncWork >();
15256 }
15257
15258 /* AwaitAsyncWork */
15259 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15260 : my_base(ctx),
15261 NamedState(nullptr, "Trimming/AwaitAsyncWork")
15262 {
15263 auto *pg = context< SnapTrimmer >().pg;
15264 context< SnapTrimmer >().log_enter(state_name);
15265 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15266 pg->state_set(PG_STATE_SNAPTRIM);
15267 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
15268 pg->publish_stats_to_osd();
15269 }
15270
15271 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15272 {
15273 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15274 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15275 auto &in_flight = context<Trimming>().in_flight;
15276 ceph_assert(in_flight.empty());
15277
15278 ceph_assert(pg->is_primary() && pg->is_active());
15279 if (!context< SnapTrimmer >().can_trim()) {
15280 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15281 post_event(KickTrim());
15282 return transit< NotTrimming >();
15283 }
15284
15285 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15286
15287 vector<hobject_t> to_trim;
15288 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15289 to_trim.reserve(max);
15290 int r = pg->snap_mapper.get_next_objects_to_trim(
15291 snap_to_trim,
15292 max,
15293 &to_trim);
15294 if (r != 0 && r != -ENOENT) {
15295 lderr(pg->cct) << "get_next_objects_to_trim returned "
15296 << cpp_strerror(r) << dendl;
15297 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15298 } else if (r == -ENOENT) {
15299 // Done!
15300 ldout(pg->cct, 10) << "got ENOENT" << dendl;
15301
15302 pg->snap_trimq.erase(snap_to_trim);
15303
15304 if (pg->snap_trimq_repeat.count(snap_to_trim)) {
15305 ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
15306 pg->snap_trimq_repeat.erase(snap_to_trim);
15307 } else {
15308 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15309 << " to purged_snaps"
15310 << dendl;
15311 ObjectStore::Transaction t;
15312 pg->recovery_state.adjust_purged_snaps(
15313 [snap_to_trim](auto &purged_snaps) {
15314 purged_snaps.insert(snap_to_trim);
15315 });
15316 pg->write_if_dirty(t);
15317
15318 ldout(pg->cct, 10) << "purged_snaps now "
15319 << pg->info.purged_snaps << ", snap_trimq now "
15320 << pg->snap_trimq << dendl;
15321
15322 int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15323 ceph_assert(tr == 0);
15324
15325 pg->recovery_state.share_pg_info();
15326 }
15327 post_event(KickTrim());
15328 return transit< NotTrimming >();
15329 }
15330 ceph_assert(!to_trim.empty());
15331
15332 for (auto &&object: to_trim) {
15333 // Get next
15334 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
15335 OpContextUPtr ctx;
15336 int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
15337 if (error) {
15338 if (error == -ENOLCK) {
15339 ldout(pg->cct, 10) << "could not get write lock on obj "
15340 << object << dendl;
15341 } else {
15342 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15343 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15344 }
15345 if (!in_flight.empty()) {
15346 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15347 return transit< WaitRepops >();
15348 }
15349 if (error == -ENOLCK) {
15350 ldout(pg->cct, 10) << "waiting for it to clear"
15351 << dendl;
15352 return transit< WaitRWLock >();
15353 } else {
15354 return transit< NotTrimming >();
15355 }
15356 }
15357
15358 in_flight.insert(object);
15359 ctx->register_on_success(
15360 [pg, object, &in_flight]() {
15361 ceph_assert(in_flight.find(object) != in_flight.end());
15362 in_flight.erase(object);
15363 if (in_flight.empty()) {
15364 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15365 pg->snap_trimmer_machine.process_event(Reset());
15366 } else {
15367 pg->snap_trimmer_machine.process_event(RepopsComplete());
15368 }
15369 }
15370 });
15371
15372 pg->simple_opc_submit(std::move(ctx));
15373 }
15374
15375 return transit< WaitRepops >();
15376 }
15377
15378 void PrimaryLogPG::setattr_maybe_cache(
15379 ObjectContextRef obc,
15380 PGTransaction *t,
15381 const string &key,
15382 bufferlist &val)
15383 {
15384 t->setattr(obc->obs.oi.soid, key, val);
15385 }
15386
15387 void PrimaryLogPG::setattrs_maybe_cache(
15388 ObjectContextRef obc,
15389 PGTransaction *t,
15390 map<string, bufferlist> &attrs)
15391 {
15392 t->setattrs(obc->obs.oi.soid, attrs);
15393 }
15394
15395 void PrimaryLogPG::rmattr_maybe_cache(
15396 ObjectContextRef obc,
15397 PGTransaction *t,
15398 const string &key)
15399 {
15400 t->rmattr(obc->obs.oi.soid, key);
15401 }
15402
15403 int PrimaryLogPG::getattr_maybe_cache(
15404 ObjectContextRef obc,
15405 const string &key,
15406 bufferlist *val)
15407 {
15408 if (pool.info.is_erasure()) {
15409 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15410 if (i != obc->attr_cache.end()) {
15411 if (val)
15412 *val = i->second;
15413 return 0;
15414 } else {
15415 return -ENODATA;
15416 }
15417 }
15418 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15419 }
15420
15421 int PrimaryLogPG::getattrs_maybe_cache(
15422 ObjectContextRef obc,
15423 map<string, bufferlist> *out)
15424 {
15425 int r = 0;
15426 ceph_assert(out);
15427 if (pool.info.is_erasure()) {
15428 *out = obc->attr_cache;
15429 } else {
15430 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15431 }
15432 map<string, bufferlist> tmp;
15433 for (map<string, bufferlist>::iterator i = out->begin();
15434 i != out->end();
15435 ++i) {
15436 if (i->first.size() > 1 && i->first[0] == '_')
15437 tmp[i->first.substr(1, i->first.size())].claim(i->second);
15438 }
15439 tmp.swap(*out);
15440 return r;
15441 }
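// Note (convention, not new behavior): user xattrs are stored with a
// leading '_' to separate them from internal attrs; the loop above keeps
// only keys longer than one character that start with '_' and strips the
// prefix (a stored "_foo" is returned as "foo"), so internal attrs such
// as the snapset attr are filtered out of the result.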
15442
15443 bool PrimaryLogPG::check_failsafe_full() {
15444 return osd->check_failsafe_full(get_dpp());
15445 }
15446
15447 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15448 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15449
15450 #ifdef PG_DEBUG_REFS
15451 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15452 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15453 #endif
15454
15455 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15456 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }