ceph/src/osd/PrimaryLogPG.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
   8  *
   9  * Author: Loic Dachary <loic@dachary.org>
  10  *
  11  * This is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License version 2.1, as published by the Free Software
  14  * Foundation.  See file COPYING.
  15  *
  16  */
  17
  18 #include "boost/tuple/tuple.hpp"
  19 #include "boost/intrusive_ptr.hpp"
  20 #include "PG.h"
  21 #include "pg_scrubber.h"
  22 #include "PrimaryLogPG.h"
  23 #include "OSD.h"
  24 #include "PrimaryLogScrub.h"
  25 #include "OpRequest.h"
  26 #include "ScrubStore.h"
  27 #include "Session.h"
  28 #include "objclass/objclass.h"
  29 #include "osd/ClassHandler.h"
  30
  31 #include "cls/cas/cls_cas_ops.h"
  32 #include "common/ceph_crypto.h"
  33 #include "common/errno.h"
  34 #include "common/scrub_types.h"
  35 #include "common/perf_counters.h"
  36
  37 #include "messages/MOSDOp.h"
  38 #include "messages/MOSDBackoff.h"
  39 #include "messages/MOSDPGTrim.h"
  40 #include "messages/MOSDPGScan.h"
  41 #include "messages/MOSDRepScrub.h"
  42 #include "messages/MOSDPGBackfill.h"
  43 #include "messages/MOSDPGBackfillRemove.h"
  44 #include "messages/MOSDPGUpdateLogMissing.h"
  45 #include "messages/MOSDPGUpdateLogMissingReply.h"
  46 #include "messages/MCommandReply.h"
  47 #include "messages/MOSDScrubReserve.h"
  48 #include "common/EventTrace.h"
  49
  50 #include "common/config.h"
  51 #include "include/compat.h"
  52 #include "mon/MonClient.h"
  53 #include "osdc/Objecter.h"
  54 #include "json_spirit/json_spirit_value.h"
  55 #include "json_spirit/json_spirit_reader.h"
  56 #include "include/ceph_assert.h"  // json_spirit clobbers it
  57 #include "include/rados/rados_types.hpp"
  58
  59 #ifdef WITH_LTTNG
  60 #include "tracing/osd.h"
  61 #else
  62 #define tracepoint(...)
  63 #endif
  64
  65 #define dout_context cct
  66 #define dout_subsys ceph_subsys_osd
  67 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
  68 #undef dout_prefix
  69 #define dout_prefix _prefix(_dout, this)
  70
  71 #include <sstream>
  72 #include <utility>
  73
  74 #include <errno.h>
  75 #ifdef HAVE_JAEGER
  76 #include "common/tracer.h"
  77 #endif
  78
  79 #include <common/CDC.h>
  80
  81 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
  82
  83 using std::list;
  84 using std::ostream;
  85 using std::pair;
  86 using std::make_pair;
  87 using std::map;
  88 using std::ostringstream;
  89 using std::set;
  90 using std::string;
  91 using std::string_view;
  92 using std::stringstream;
  93 using std::unique_ptr;
  94 using std::vector;
  95
  96 using ceph::bufferlist;
  97 using ceph::bufferptr;
  98 using ceph::Formatter;
  99 using ceph::decode;
 100 using ceph::decode_noclear;
 101 using ceph::encode;
 102 using ceph::encode_destructively;
 103
 104 using namespace ceph::osd::scheduler;
 105 using TOPNSPC::common::cmd_getval;
 106
 107 template <typename T>
 108 static ostream& _prefix(std::ostream *_dout, T *pg) {
 109   return pg->gen_prefix(*_dout);
 110 }
 111
 112 /**
 113  * The CopyCallback class defines an interface for completions to the
 114  * copy_start code. Users of the copy infrastructure must implement
 115  * one and give an instance of the class to start_copy.
 116  *
 117  * The implementer is responsible for making sure that the CopyCallback
 118  * can associate itself with the correct copy operation.
 119  */
 120 class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
 121 protected:
 122   CopyCallback() {}
 123   /**
 124    * results.get<0>() is the return code: 0 for success; -ECANCELED if
 125    * the operation was cancelled by the local OSD; -errno for other issues.
 126    * results.get<1>() is a pointer to a CopyResults object, which you are
 127    * responsible for deleting.
 128    */
 129   void finish(CopyCallbackResults results_) override = 0;
 130
 131 public:
 132   /// Provide the final size of the copied object to the CopyCallback
 133   ~CopyCallback() override {}
 134 };
 135
 136 template <typename T>
 137 class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
 138   PrimaryLogPGRef pg;
 139   unique_ptr<GenContext<T>> c;
 140   epoch_t e;
 141 public:
 142   BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
 143     : pg(pg), c(c), e(e) {}
 144   void finish(T t) override {
 145     std::scoped_lock locker{*pg};
 146     if (pg->pg_has_reset_since(e))
 147       c.reset();
 148     else
 149       c.release()->complete(t);
 150   }
 151   bool sync_finish(T t) {
 152     // we assume here all blessed/wrapped Contexts can complete synchronously.
 153     c.release()->complete(t);
 154     return true;
 155   }
 156 };
 157
 158 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
 159   GenContext<ThreadPool::TPHandle&> *c) {
 160   return new BlessedGenContext<ThreadPool::TPHandle&>(
 161     this, c, get_osdmap_epoch());
 162 }
 163
 164 template <typename T>
 165 class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
 166   PrimaryLogPGRef pg;
 167   unique_ptr<GenContext<T>> c;
 168   epoch_t e;
 169 public:
 170   UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
 171     : pg(pg), c(c), e(e) {}
 172   void finish(T t) override {
 173     if (pg->pg_has_reset_since(e))
 174       c.reset();
 175     else
 176       c.release()->complete(t);
 177   }
 178   bool sync_finish(T t) {
 179     // we assume here all blessed/wrapped Contexts can complete synchronously.
 180     c.release()->complete(t);
 181     return true;
 182   }
 183 };
 184
 185 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
 186   GenContext<ThreadPool::TPHandle&> *c) {
 187   return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
 188     this, c, get_osdmap_epoch());
 189 }
 190
 191 class PrimaryLogPG::BlessedContext : public Context {
 192   PrimaryLogPGRef pg;
 193   unique_ptr<Context> c;
 194   epoch_t e;
 195 public:
 196   BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
 197     : pg(pg), c(c), e(e) {}
 198   void finish(int r) override {
 199     std::scoped_lock locker{*pg};
 200     if (pg->pg_has_reset_since(e))
 201       c.reset();
 202     else
 203       c.release()->complete(r);
 204   }
 205   bool sync_finish(int r) override {
 206     // we assume here all blessed/wrapped Contexts can complete synchronously.
 207     c.release()->complete(r);
 208     return true;
 209   }
 210 };
 211
 212 Context *PrimaryLogPG::bless_context(Context *c) {
 213   return new BlessedContext(this, c, get_osdmap_epoch());
 214 }
 215
 216 class PrimaryLogPG::C_PG_ObjectContext : public Context {
 217   PrimaryLogPGRef pg;
 218   ObjectContext *obc;
 219   public:
 220   C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
 221     pg(p), obc(o) {}
 222   void finish(int r) override {
 223     pg->object_context_destructor_callback(obc);
 224   }
 225 };
 226
 227 struct OnReadComplete : public Context {
 228   PrimaryLogPG *pg;
 229   PrimaryLogPG::OpContext *opcontext;
 230   OnReadComplete(
 231     PrimaryLogPG *pg,
 232     PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
 233   void finish(int r) override {
 234     opcontext->finish_read(pg);
 235   }
 236   ~OnReadComplete() override {}
 237 };
 238
 239 class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
 240   PrimaryLogPGRef pg;
 241   ObjectContextRef obc;
 242   public:
 243   C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
 244     pg(p), obc(o) {}
 245   bool sync_finish(int r) override {
 246     pg->_applied_recovered_object(obc);
 247     return true;
 248   }
 249   void finish(int r) override {
 250     std::scoped_lock locker{*pg};
 251     pg->_applied_recovered_object(obc);
 252   }
 253 };
 254
 255 class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
 256   PrimaryLogPGRef pg;
 257   epoch_t epoch;
 258   eversion_t last_complete;
 259   public:
 260   C_OSD_CommittedPushedObject(
 261     PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
 262     pg(p), epoch(epoch), last_complete(lc) {
 263   }
 264   void finish(int r) override {
 265     pg->_committed_pushed_object(epoch, last_complete);
 266   }
 267 };
 268
 269 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
 270   PrimaryLogPGRef pg;
 271   public:
 272   explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
 273     pg(p) {}
 274   bool sync_finish(int r) override {
 275     pg->_applied_recovered_object_replica();
 276     return true;
 277   }
 278   void finish(int r) override {
 279     std::scoped_lock locker{*pg};
 280     pg->_applied_recovered_object_replica();
 281   }
 282 };
 283
 284 // OpContext
 285 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
 286 {
 287   inflightreads = 1;
 288   list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
 289             pair<bufferlist*, Context*> > > in;
 290   in.swap(pending_async_reads);
 291   pg->pgbackend->objects_read_async(
 292     obc->obs.oi.soid,
 293     in,
 294     new OnReadComplete(pg, this), pg->get_pool().fast_read);
 295 }
 296 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
 297 {
 298   ceph_assert(inflightreads > 0);
 299   --inflightreads;
 300   if (async_reads_complete()) {
 301     ceph_assert(pg->in_progress_async_reads.size());
 302     ceph_assert(pg->in_progress_async_reads.front().second == this);
 303     pg->in_progress_async_reads.pop_front();
 304
 305     // Restart the op context now that all reads have been
 306     // completed. Read failures will be handled by the op finisher
 307     pg->execute_ctx(this);
 308   }
 309 }
 310
 311 class CopyFromCallback : public PrimaryLogPG::CopyCallback {
 312 public:
 313   PrimaryLogPG::CopyResults *results = nullptr;
 314   PrimaryLogPG::OpContext *ctx;
 315   OSDOp &osd_op;
 316   uint32_t truncate_seq;
 317   uint64_t truncate_size;
 318   bool have_truncate = false;
 319
 320   CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
 321     : ctx(ctx), osd_op(osd_op) {
 322   }
 323   ~CopyFromCallback() override {}
 324
 325   void finish(PrimaryLogPG::CopyCallbackResults results_) override {
 326     results = results_.get<1>();
 327     int r = results_.get<0>();
 328
 329     // Only use truncate_{seq,size} from the original object if the client
 330     // did not sent us these parameters
 331     if (!have_truncate) {
 332       truncate_seq = results->truncate_seq;
 333       truncate_size = results->truncate_size;
 334     }
 335
 336     // for finish_copyfrom
 337     ctx->user_at_version = results->user_version;
 338
 339     if (r >= 0) {
 340       ctx->pg->execute_ctx(ctx);
 341     } else {
 342       if (r != -ECANCELED) { // on cancel just toss it out; client resends
 343         if (ctx->op)
 344           ctx->pg->osd->reply_op_error(ctx->op, r);
 345       } else if (results->should_requeue) {
 346         if (ctx->op)
 347           ctx->pg->requeue_op(ctx->op);
 348       }
 349       ctx->pg->close_op_ctx(ctx);
 350     }
 351   }
 352
 353   bool is_temp_obj_used() {
 354     return results->started_temp_obj;
 355   }
 356   uint64_t get_data_size() {
 357     return results->object_size;
 358   }
 359   void set_truncate(uint32_t seq, uint64_t size) {
 360     truncate_seq = seq;
 361     truncate_size = size;
 362     have_truncate = true;
 363   }
 364 };
 365
 366 struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
 367   CopyFromCallback *copy_from_callback;
 368
 369   explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
 370     : copy_from_callback(copy_from_callback) {
 371   }
 372
 373   int execute() override {
 374     // instance will be destructed after this method completes
 375     copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
 376     return 0;
 377   }
 378 };
 379
 380 // ======================
 381 // PGBackend::Listener
 382
 383 void PrimaryLogPG::on_local_recover(
 384   const hobject_t &hoid,
 385   const ObjectRecoveryInfo &_recovery_info,
 386   ObjectContextRef obc,
 387   bool is_delete,
 388   ObjectStore::Transaction *t
 389   )
 390 {
 391   dout(10) << __func__ << ": " << hoid << dendl;
 392
 393   ObjectRecoveryInfo recovery_info(_recovery_info);
 394   clear_object_snap_mapping(t, hoid);
 395   if (!is_delete && recovery_info.soid.is_snap()) {
 396     OSDriver::OSTransaction _t(osdriver.get_transaction(t));
 397     set<snapid_t> snaps;
 398     dout(20) << " snapset " << recovery_info.ss << dendl;
 399     auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
 400     if (p != recovery_info.ss.clone_snaps.end()) {
 401       snaps.insert(p->second.begin(), p->second.end());
 402       dout(20) << " snaps " << snaps << dendl;
 403       snap_mapper.add_oid(
 404         recovery_info.soid,
 405         snaps,
 406         &_t);
 407     } else {
 408       derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
 409     }
 410   }
 411   if (!is_delete && recovery_state.get_pg_log().get_missing().is_missing(recovery_info.soid) &&
 412       recovery_state.get_pg_log().get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
 413     ceph_assert(is_primary());
 414     const pg_log_entry_t *latest = recovery_state.get_pg_log().get_log().objects.find(recovery_info.soid)->second;
 415     if (latest->op == pg_log_entry_t::LOST_REVERT &&
 416         latest->reverting_to == recovery_info.version) {
 417       dout(10) << " got old revert version " << recovery_info.version
 418                << " for " << *latest << dendl;
 419       recovery_info.version = latest->version;
 420       // update the attr to the revert event version
 421       recovery_info.oi.prior_version = recovery_info.oi.version;
 422       recovery_info.oi.version = latest->version;
 423       bufferlist bl;
 424       encode(recovery_info.oi, bl,
 425                get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
 426       ceph_assert(!pool.info.is_erasure());
 427       t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
 428       if (obc)
 429         obc->attr_cache[OI_ATTR] = bl;
 430     }
 431   }
 432
 433   // keep track of active pushes for scrub
 434   ++active_pushes;
 435
 436   recovery_state.recover_got(
 437     recovery_info.soid,
 438     recovery_info.version,
 439     is_delete,
 440     *t);
 441
 442   if (is_primary()) {
 443     if (!is_delete) {
 444       obc->obs.exists = true;
 445
 446       bool got = obc->get_recovery_read();
 447       ceph_assert(got);
 448
 449       ceph_assert(recovering.count(obc->obs.oi.soid));
 450       recovering[obc->obs.oi.soid] = obc;
 451       obc->obs.oi = recovery_info.oi;  // may have been updated above
 452     }
 453
 454     t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
 455
 456     publish_stats_to_osd();
 457     release_backoffs(hoid);
 458     if (!is_unreadable_object(hoid)) {
 459       auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
 460       if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
 461         dout(20) << " kicking unreadable waiters on " << hoid << dendl;
 462         requeue_ops(unreadable_object_entry->second);
 463         waiting_for_unreadable_object.erase(unreadable_object_entry);
 464       }
 465     }
 466   } else {
 467     t->register_on_applied(
 468       new C_OSD_AppliedRecoveredObjectReplica(this));
 469
 470   }
 471
 472   t->register_on_commit(
 473     new C_OSD_CommittedPushedObject(
 474       this,
 475       get_osdmap_epoch(),
 476       info.last_complete));
 477 }
 478
 479 void PrimaryLogPG::on_global_recover(
 480   const hobject_t &soid,
 481   const object_stat_sum_t &stat_diff,
 482   bool is_delete)
 483 {
 484   recovery_state.object_recovered(soid, stat_diff);
 485   publish_stats_to_osd();
 486   dout(10) << "pushed " << soid << " to all replicas" << dendl;
 487   auto i = recovering.find(soid);
 488   ceph_assert(i != recovering.end());
 489
 490   if (i->second && i->second->rwstate.recovery_read_marker) {
 491     // recover missing won't have had an obc, but it gets filled in
 492     // during on_local_recover
 493     ceph_assert(i->second);
 494     list<OpRequestRef> requeue_list;
 495     i->second->drop_recovery_read(&requeue_list);
 496     requeue_ops(requeue_list);
 497   }
 498
 499   backfills_in_flight.erase(soid);
 500
 501   recovering.erase(i);
 502   finish_recovery_op(soid);
 503   release_backoffs(soid);
 504   auto degraded_object_entry = waiting_for_degraded_object.find(soid);
 505   if (degraded_object_entry != waiting_for_degraded_object.end()) {
 506     dout(20) << " kicking degraded waiters on " << soid << dendl;
 507     requeue_ops(degraded_object_entry->second);
 508     waiting_for_degraded_object.erase(degraded_object_entry);
 509   }
 510   auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
 511   if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
 512     dout(20) << " kicking unreadable waiters on " << soid << dendl;
 513     requeue_ops(unreadable_object_entry->second);
 514     waiting_for_unreadable_object.erase(unreadable_object_entry);
 515   }
 516   finish_degraded_object(soid);
 517 }
 518
 519 void PrimaryLogPG::schedule_recovery_work(
 520   GenContext<ThreadPool::TPHandle&> *c)
 521 {
 522   osd->queue_recovery_context(this, c);
 523 }
 524
 525 void PrimaryLogPG::replica_clear_repop_obc(
 526   const vector<pg_log_entry_t> &logv,
 527   ObjectStore::Transaction &t)
 528 {
 529   for (auto &&e: logv) {
 530     /* Have to blast all clones, they share a snapset */
 531     object_contexts.clear_range(
 532       e.soid.get_object_boundary(), e.soid.get_head());
 533     ceph_assert(
 534       snapset_contexts.find(e.soid.get_head()) ==
 535       snapset_contexts.end());
 536   }
 537 }
 538
 539 bool PrimaryLogPG::should_send_op(
 540   pg_shard_t peer,
 541   const hobject_t &hoid) {
 542   if (peer == get_primary())
 543     return true;
 544   ceph_assert(recovery_state.has_peer_info(peer));
 545   bool should_send =
 546       hoid.pool != (int64_t)info.pgid.pool() ||
 547       hoid <= last_backfill_started ||
 548       hoid <= recovery_state.get_peer_info(peer).last_backfill;
 549   if (!should_send) {
 550     ceph_assert(is_backfill_target(peer));
 551     dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
 552              << ", object " << hoid
 553              << " beyond std::max(last_backfill_started "
 554              << ", peer_info[peer].last_backfill "
 555              << recovery_state.get_peer_info(peer).last_backfill
 556              << ")" << dendl;
 557     return should_send;
 558   }
 559   if (is_async_recovery_target(peer) &&
 560       recovery_state.get_peer_missing(peer).is_missing(hoid)) {
 561     should_send = false;
 562     dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
 563              << ", object " << hoid
 564              << " which is pending recovery in async_recovery_targets" << dendl;
 565   }
 566   return should_send;
 567 }
 568
 569
 570 ConnectionRef PrimaryLogPG::get_con_osd_cluster(
 571   int peer, epoch_t from_epoch)
 572 {
 573   return osd->get_con_osd_cluster(peer, from_epoch);
 574 }
 575
 576 PerfCounters *PrimaryLogPG::get_logger()
 577 {
 578   return osd->logger;
 579 }
 580
 581
 582 // ====================
 583 // missing objects
 584
 585 bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
 586 {
 587   return recovery_state.get_pg_log().get_missing().get_items().count(soid);
 588 }
 589
 590 void PrimaryLogPG::maybe_kick_recovery(
 591   const hobject_t &soid)
 592 {
 593   eversion_t v;
 594   bool work_started = false;
 595   if (!recovery_state.get_missing_loc().needs_recovery(soid, &v))
 596     return;
 597
 598   map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
 599   if (p != recovering.end()) {
 600     dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
 601   } else if (recovery_state.get_missing_loc().is_unfound(soid)) {
 602     dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
 603   } else {
 604     dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
 605     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
 606     if (is_missing_object(soid)) {
 607       recover_missing(soid, v, CEPH_MSG_PRIO_HIGH, h);
 608     } else if (recovery_state.get_missing_loc().is_deleted(soid)) {
 609       prep_object_replica_deletes(soid, v, h, &work_started);
 610     } else {
 611       prep_object_replica_pushes(soid, v, h, &work_started);
 612     }
 613     pgbackend->run_recovery_op(h, CEPH_MSG_PRIO_HIGH);
 614   }
 615 }
 616
 617 void PrimaryLogPG::wait_for_unreadable_object(
 618   const hobject_t& soid, OpRequestRef op)
 619 {
 620   ceph_assert(is_unreadable_object(soid));
 621   maybe_kick_recovery(soid);
 622   waiting_for_unreadable_object[soid].push_back(op);
 623   op->mark_delayed("waiting for missing object");
 624 }
 625
 626 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
 627 {
 628   /* The conditions below may clear (on_local_recover, before we queue
 629    * the transaction) before we actually requeue the degraded waiters
 630    * in on_global_recover after the transaction completes.
 631    */
 632   if (waiting_for_degraded_object.count(soid))
 633     return true;
 634   if (recovery_state.get_pg_log().get_missing().get_items().count(soid))
 635     return true;
 636   ceph_assert(!get_acting_recovery_backfill().empty());
 637   for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
 638        i != get_acting_recovery_backfill().end();
 639        ++i) {
 640     if (*i == get_primary()) continue;
 641     pg_shard_t peer = *i;
 642     auto peer_missing_entry = recovery_state.get_peer_missing().find(peer);
 643     // If an object is missing on an async_recovery_target, return false.
 644     // This will not block the op and the object is async recovered later.
 645     if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
 646         peer_missing_entry->second.get_items().count(soid)) {
 647       if (is_async_recovery_target(peer))
 648         continue;
 649       else
 650         return true;
 651     }
 652     // Object is degraded if after last_backfill AND
 653     // we are backfilling it
 654     if (is_backfill_target(peer) &&
 655         recovery_state.get_peer_info(peer).last_backfill <= soid &&
 656         last_backfill_started >= soid &&
 657         backfills_in_flight.count(soid))
 658       return true;
 659   }
 660   return false;
 661 }
 662
 663 bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
 664 {
 665   for (auto &i: get_async_recovery_targets()) {
 666     auto peer_missing_entry = recovery_state.get_peer_missing().find(i);
 667     if (peer_missing_entry != recovery_state.get_peer_missing().end() &&
 668         peer_missing_entry->second.get_items().count(soid)) {
 669       dout(30) << __func__ << " " << soid << dendl;
 670       return true;
 671     }
 672   }
 673   return false;
 674 }
 675
 676 void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
 677 {
 678   ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));
 679
 680   maybe_kick_recovery(soid);
 681   waiting_for_degraded_object[soid].push_back(op);
 682   op->mark_delayed("waiting for degraded object");
 683 }
 684
 685 void PrimaryLogPG::block_write_on_full_cache(
 686   const hobject_t& _oid, OpRequestRef op)
 687 {
 688   const hobject_t oid = _oid.get_head();
 689   dout(20) << __func__ << ": blocking object " << oid
 690            << " on full cache" << dendl;
 691   objects_blocked_on_cache_full.insert(oid);
 692   waiting_for_cache_not_full.push_back(op);
 693   op->mark_delayed("waiting for cache not full");
 694 }
 695
 696 void PrimaryLogPG::block_for_clean(
 697   const hobject_t& oid, OpRequestRef op)
 698 {
 699   dout(20) << __func__ << ": blocking object " << oid
 700            << " on primary repair" << dendl;
 701   waiting_for_clean_to_primary_repair.push_back(op);
 702   op->mark_delayed("waiting for clean to repair");
 703 }
 704
 705 void PrimaryLogPG::block_write_on_snap_rollback(
 706   const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
 707 {
 708   dout(20) << __func__ << ": blocking object " << oid.get_head()
 709            << " on snap promotion " << obc->obs.oi.soid << dendl;
 710   // otherwise, we'd have blocked in do_op
 711   ceph_assert(oid.is_head());
 712   ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
 713   objects_blocked_on_snap_promotion[oid] = obc;
 714   wait_for_blocked_object(obc->obs.oi.soid, op);
 715 }
 716
 717 void PrimaryLogPG::block_write_on_degraded_snap(
 718   const hobject_t& snap, OpRequestRef op)
 719 {
 720   dout(20) << __func__ << ": blocking object " << snap.get_head()
 721            << " on degraded snap " << snap << dendl;
 722   // otherwise, we'd have blocked in do_op
 723   ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
 724   objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
 725   wait_for_degraded_object(snap, op);
 726 }
 727
 728 bool PrimaryLogPG::maybe_await_blocked_head(
 729   const hobject_t &hoid,
 730   OpRequestRef op)
 731 {
 732   ObjectContextRef obc;
 733   obc = object_contexts.lookup(hoid.get_head());
 734   if (obc) {
 735     if (obc->is_blocked()) {
 736       wait_for_blocked_object(obc->obs.oi.soid, op);
 737       return true;
 738     } else {
 739       return false;
 740     }
 741   }
 742   return false;
 743 }
 744
 745 void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
 746 {
 747   dout(10) << __func__ << " " << soid << " " << op << dendl;
 748   waiting_for_blocked_object[soid].push_back(op);
 749   op->mark_delayed("waiting for blocked object");
 750 }
 751
 752 void PrimaryLogPG::maybe_force_recovery()
 753 {
 754   // no force if not in degraded/recovery/backfill states
 755   if (!is_degraded() &&
 756       !state_test(PG_STATE_RECOVERING |
 757                   PG_STATE_RECOVERY_WAIT |
 758                   PG_STATE_BACKFILLING |
 759                   PG_STATE_BACKFILL_WAIT |
 760                   PG_STATE_BACKFILL_TOOFULL))
 761     return;
 762
 763   if (recovery_state.get_pg_log().get_log().approx_size() <
 764       cct->_conf->osd_max_pg_log_entries *
 765         cct->_conf->osd_force_recovery_pg_log_entries_factor)
 766     return;
 767
 768   // find the oldest missing object
 769   version_t min_version = recovery_state.get_pg_log().get_log().head.version;
 770   hobject_t soid;
 771   if (!recovery_state.get_pg_log().get_missing().get_rmissing().empty()) {
 772     min_version = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->first;
 773     soid = recovery_state.get_pg_log().get_missing().get_rmissing().begin()->second;
 774   }
 775   ceph_assert(!get_acting_recovery_backfill().empty());
 776   for (set<pg_shard_t>::iterator it = get_acting_recovery_backfill().begin();
 777        it != get_acting_recovery_backfill().end();
 778        ++it) {
 779     if (*it == get_primary()) continue;
 780     pg_shard_t peer = *it;
 781     auto it_missing = recovery_state.get_peer_missing().find(peer);
 782     if (it_missing != recovery_state.get_peer_missing().end() &&
 783         !it_missing->second.get_rmissing().empty()) {
 784       const auto& min_obj = recovery_state.get_peer_missing(peer).get_rmissing().begin();
 785       dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
 786                << " oid " << min_obj->second << dendl;
 787       if (min_version > min_obj->first) {
 788         min_version = min_obj->first;
 789         soid = min_obj->second;
 790       }
 791     }
 792   }
 793
 794   // recover it
 795   if (soid != hobject_t())
 796     maybe_kick_recovery(soid);
 797 }
 798
 799 bool PrimaryLogPG::check_laggy(OpRequestRef& op)
 800 {
 801   if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
 802                     SERVER_OCTOPUS)) {
 803     dout(20) << __func__ << " not all upacting has SERVER_OCTOPUS" << dendl;
 804     return true;
 805   }
 806   if (state_test(PG_STATE_WAIT)) {
 807     dout(10) << __func__ << " PG is WAIT state" << dendl;
 808   } else if (!state_test(PG_STATE_LAGGY)) {
 809     auto mnow = osd->get_mnow();
 810     auto ru = recovery_state.get_readable_until();
 811     if (mnow <= ru) {
 812       // not laggy
 813       return true;
 814     }
 815     dout(10) << __func__
 816              << " mnow " << mnow
 817              << " > readable_until " << ru << dendl;
 818
 819     if (!is_primary()) {
 820       osd->reply_op_error(op, -EAGAIN);
 821       return false;
 822     }
 823
 824     // go to laggy state
 825     state_set(PG_STATE_LAGGY);
 826     publish_stats_to_osd();
 827   }
 828   dout(10) << __func__ << " not readable" << dendl;
 829   waiting_for_readable.push_back(op);
 830   op->mark_delayed("waiting for readable");
 831   return false;
 832 }
 833
 834 bool PrimaryLogPG::check_laggy_requeue(OpRequestRef& op)
 835 {
 836   if (!HAVE_FEATURE(recovery_state.get_min_upacting_features(),
 837                     SERVER_OCTOPUS)) {
 838     return true;
 839   }
 840   if (!state_test(PG_STATE_WAIT) && !state_test(PG_STATE_LAGGY)) {
 841     return true; // not laggy
 842   }
 843   dout(10) << __func__ << " not readable" << dendl;
 844   waiting_for_readable.push_front(op);
 845   op->mark_delayed("waiting for readable");
 846   return false;
 847 }
 848
 849 void PrimaryLogPG::recheck_readable()
 850 {
 851   if (!is_wait() && !is_laggy()) {
 852     dout(20) << __func__ << " wasn't wait or laggy" << dendl;
 853     return;
 854   }
 855   auto mnow = osd->get_mnow();
 856   bool pub = false;
 857   if (is_wait()) {
 858     auto prior_readable_until_ub = recovery_state.get_prior_readable_until_ub();
 859     if (mnow < prior_readable_until_ub) {
 860       dout(10) << __func__ << " still wait (mnow " << mnow
 861                << " < prior_readable_until_ub " << prior_readable_until_ub
 862                << ")" << dendl;
 863     } else {
 864       dout(10) << __func__ << " no longer wait (mnow " << mnow
 865                << " >= prior_readable_until_ub " << prior_readable_until_ub
 866                << ")" << dendl;
 867       state_clear(PG_STATE_WAIT);
 868       recovery_state.clear_prior_readable_until_ub();
 869       pub = true;
 870     }
 871   }
 872   if (is_laggy()) {
 873     auto ru = recovery_state.get_readable_until();
 874     if (ru == ceph::signedspan::zero()) {
 875       dout(10) << __func__ << " still laggy (mnow " << mnow
 876                << ", readable_until zero)" << dendl;
 877     } else if (mnow >= ru) {
 878       dout(10) << __func__ << " still laggy (mnow " << mnow
 879                << " >= readable_until " << ru << ")" << dendl;
 880     } else {
 881       dout(10) << __func__ << " no longer laggy (mnow " << mnow
 882                << " < readable_until " << ru << ")" << dendl;
 883       state_clear(PG_STATE_LAGGY);
 884       pub = true;
 885     }
 886   }
 887   if (pub) {
 888     publish_stats_to_osd();
 889   }
 890   if (!is_laggy() && !is_wait()) {
 891     requeue_ops(waiting_for_readable);
 892   }
 893 }
 894
 895 bool PrimaryLogPG::pgls_filter(const PGLSFilter& filter, const hobject_t& sobj)
 896 {
 897   bufferlist bl;
 898
 899   // If filter has expressed an interest in an xattr, load it.
 900   if (!filter.get_xattr().empty()) {
 901     int ret = pgbackend->objects_get_attr(
 902       sobj,
 903       filter.get_xattr(),
 904       &bl);
 905     dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter.get_xattr() << ") returned " << ret << dendl;
 906     if (ret < 0) {
 907       if (ret != -ENODATA || filter.reject_empty_xattr()) {
 908         return false;
 909       }
 910     }
 911   }
 912
 913   return filter.filter(sobj, bl);
 914 }
 915
 916 std::pair<int, std::unique_ptr<const PGLSFilter>>
 917 PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter)
 918 {
 919   string type;
 920   // storing non-const PGLSFilter for the sake of ::init()
 921   std::unique_ptr<PGLSFilter> filter;
 922
 923   try {
 924     decode(type, iter);
 925   }
 926   catch (ceph::buffer::error& e) {
 927     return { -EINVAL, nullptr };
 928   }
 929
 930   if (type.compare("plain") == 0) {
 931     filter = std::make_unique<PGLSPlainFilter>();
 932   } else {
 933     std::size_t dot = type.find(".");
 934     if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
 935       return { -EINVAL, nullptr };
 936     }
 937
 938     const std::string class_name = type.substr(0, dot);
 939     const std::string filter_name = type.substr(dot + 1);
 940     ClassHandler::ClassData *cls = NULL;
 941     int r = ClassHandler::get_instance().open_class(class_name, &cls);
 942     if (r != 0) {
 943       derr << "Error opening class '" << class_name << "': "
 944            << cpp_strerror(r) << dendl;
 945       if (r != -EPERM) // propagate permission error
 946         r = -EINVAL;
 947       return { r, nullptr };
 948     } else {
 949       ceph_assert(cls);
 950     }
 951
 952     ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
 953     if (class_filter == NULL) {
 954       derr << "Error finding filter '" << filter_name << "' in class "
 955            << class_name << dendl;
 956       return { -EINVAL, nullptr };
 957     }
 958     filter.reset(class_filter->fn());
 959     if (!filter) {
 960       // Object classes are obliged to return us something, but let's
 961       // give an error rather than asserting out.
 962       derr << "Buggy class " << class_name << " failed to construct "
 963               "filter " << filter_name << dendl;
 964       return { -EINVAL, nullptr };
 965     }
 966   }
 967
 968   ceph_assert(filter);
 969   int r = filter->init(iter);
 970   if (r < 0) {
 971     derr << "Error initializing filter " << type << ": "
 972          << cpp_strerror(r) << dendl;
 973     return { -EINVAL, nullptr };
 974   } else {
 975     // Successfully constructed and initialized, return it.
 976     return std::make_pair(0, std::move(filter));
 977   }
 978 }
 979
 980
 981 // ==========================================================
 982
 983 void PrimaryLogPG::do_command(
 984   const string_view& orig_prefix,
 985   const cmdmap_t& cmdmap,
 986   const bufferlist& idata,
 987   std::function<void(int,const std::string&,bufferlist&)> on_finish)
 988 {
 989   string format;
 990   cmd_getval(cmdmap, "format", format);
 991   std::unique_ptr<Formatter> f(Formatter::create(
 992                                  format, "json-pretty", "json-pretty"));
 993   int ret = 0;
 994   stringstream ss;   // stderr error message stream
 995   bufferlist outbl;  // if empty at end, we'll dump formatter as output
 996
 997   // get final prefix:
 998   // - ceph pg <pgid> foo -> prefix=pg, cmd=foo
 999   // - ceph tell <pgid> foo -> prefix=foo
1000   string prefix(orig_prefix);
1001   string command;
1002   cmd_getval(cmdmap, "cmd", command);
1003   if (command.size()) {
1004     prefix = command;
1005   }
1006
1007   if (prefix == "query") {
1008     f->open_object_section("pg");
1009     f->dump_stream("snap_trimq") << snap_trimq;
1010     f->dump_unsigned("snap_trimq_len", snap_trimq.size());
1011     recovery_state.dump_peering_state(f.get());
1012
1013     f->open_array_section("recovery_state");
1014     handle_query_state(f.get());
1015     f->close_section();
1016
1017     if (is_primary() && is_active() && m_scrubber) {
1018       m_scrubber->dump(f.get());
1019     }
1020
1021     f->open_object_section("agent_state");
1022     if (agent_state)
1023       agent_state->dump(f.get());
1024     f->close_section();
1025
1026     f->close_section();
1027   }
1028
1029   else if (prefix == "mark_unfound_lost") {
1030     string mulcmd;
1031     cmd_getval(cmdmap, "mulcmd", mulcmd);
1032     int mode = -1;
1033     if (mulcmd == "revert") {
1034       if (pool.info.is_erasure()) {
1035         ss << "mode must be 'delete' for ec pool";
1036         ret = -EINVAL;
1037         goto out;
1038       }
1039       mode = pg_log_entry_t::LOST_REVERT;
1040     } else if (mulcmd == "delete") {
1041       mode = pg_log_entry_t::LOST_DELETE;
1042     } else {
1043       ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
1044       ret = -EINVAL;
1045       goto out;
1046     }
1047     ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
1048                 mode == pg_log_entry_t::LOST_DELETE);
1049
1050     if (!is_primary()) {
1051       ss << "not primary";
1052       ret = -EROFS;
1053       goto out;
1054     }
1055
1056     uint64_t unfound = recovery_state.get_missing_loc().num_unfound();
1057     if (!unfound) {
1058       ss << "pg has no unfound objects";
1059       goto out;  // make command idempotent
1060     }
1061
1062     if (!recovery_state.all_unfound_are_queried_or_lost(get_osdmap())) {
1063       ss << "pg has " << unfound
1064          << " unfound objects but we haven't probed all sources, not marking lost";
1065       ret = -EINVAL;
1066       goto out;
1067     }
1068
1069     mark_all_unfound_lost(mode, on_finish);
1070     return;
1071   }
1072
1073   else if (prefix == "list_unfound") {
1074     hobject_t offset;
1075     string offset_json;
1076     bool show_offset = false;
1077     if (cmd_getval(cmdmap, "offset", offset_json)) {
1078       json_spirit::Value v;
1079       try {
1080         if (!json_spirit::read(offset_json, v))
1081           throw std::runtime_error("bad json");
1082         offset.decode(v);
1083       } catch (std::runtime_error& e) {
1084         ss << "error parsing offset: " << e.what();
1085         ret = -EINVAL;
1086         goto out;
1087       }
1088       show_offset = true;
1089     }
1090     f->open_object_section("missing");
1091     if (show_offset) {
1092       f->open_object_section("offset");
1093       offset.dump(f.get());
1094       f->close_section();
1095     }
1096     auto &needs_recovery_map = recovery_state.get_missing_loc()
1097       .get_needs_recovery();
1098     f->dump_int("num_missing", needs_recovery_map.size());
1099     f->dump_int("num_unfound", get_num_unfound());
1100     map<hobject_t, pg_missing_item>::const_iterator p =
1101       needs_recovery_map.upper_bound(offset);
1102     {
1103       f->open_array_section("objects");
1104       int32_t num = 0;
1105       for (; p != needs_recovery_map.end() &&
1106              num < cct->_conf->osd_command_max_records;
1107            ++p) {
1108         if (recovery_state.get_missing_loc().is_unfound(p->first)) {
1109           f->open_object_section("object");
1110           {
1111             f->open_object_section("oid");
1112             p->first.dump(f.get());
1113             f->close_section();
1114           }
1115           p->second.dump(f.get()); // have, need keys
1116           {
1117             f->open_array_section("locations");
1118             for (auto &&r : recovery_state.get_missing_loc().get_locations(
1119                    p->first)) {
1120               f->dump_stream("shard") << r;
1121             }
1122             f->close_section();
1123           }
1124           f->close_section();
1125           num++;
1126         }
1127       }
1128       f->close_section();
1129     }
1130     // Get possible locations of missing objects from pg information
1131     PeeringState::QueryUnfound q(f.get());
1132     recovery_state.handle_event(q, 0);
1133     f->dump_bool("more", p != needs_recovery_map.end());
1134     f->close_section();
1135   }
1136
1137   else if (prefix == "scrub" ||
1138            prefix == "deep_scrub") {
1139     bool deep = (prefix == "deep_scrub");
1140     int64_t time;
1141     cmd_getval(cmdmap, "time", time, (int64_t)0);
1142
1143     if (is_primary()) {
1144       const pg_pool_t *p = &pool.info;
1145       double pool_scrub_max_interval = 0;
1146       double scrub_max_interval;
1147       if (deep) {
1148         p->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &pool_scrub_max_interval);
1149         scrub_max_interval = pool_scrub_max_interval > 0 ?
1150           pool_scrub_max_interval : g_conf()->osd_deep_scrub_interval;
1151       } else {
1152         p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
1153         scrub_max_interval = pool_scrub_max_interval > 0 ?
1154           pool_scrub_max_interval : g_conf()->osd_scrub_max_interval;
1155       }
1156       // Instead of marking must_scrub force a schedule scrub
1157       utime_t stamp = ceph_clock_now();
1158       if (time == 0)
1159         stamp -= scrub_max_interval;
1160       else
1161         stamp -=  (float)time;
1162       stamp -= 100.0;  // push back last scrub more for good measure
1163       if (deep) {
1164         set_last_deep_scrub_stamp(stamp);
1165       } else {
1166         set_last_scrub_stamp(stamp);
1167       }
1168       f->open_object_section("result");
1169       f->dump_bool("deep", deep);
1170       f->dump_stream("stamp") << stamp;
1171       f->close_section();
1172     } else {
1173       ss << "Not primary";
1174       ret = -EPERM;
1175     }
1176     outbl.append(ss.str());
1177   }
1178
1179   else {
1180     ret = -ENOSYS;
1181     ss << "prefix '" << prefix << "' not implemented";
1182   }
1183
1184  out:
1185   if (ret >= 0 && outbl.length() == 0) {
1186     f->flush(outbl);
1187   }
1188   on_finish(ret, ss.str(), outbl);
1189 }
1190
1191
1192 // ==========================================================
1193
1194 void PrimaryLogPG::do_pg_op(OpRequestRef op)
1195 {
1196   const MOSDOp *m = static_cast<const MOSDOp *>(op->get_req());
1197   ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1198   dout(10) << "do_pg_op " << *m << dendl;
1199
1200   op->mark_started();
1201
1202   int result = 0;
1203   string cname, mname;
1204
1205   snapid_t snapid = m->get_snapid();
1206
1207   vector<OSDOp> ops = m->ops;
1208
1209   for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
1210     std::unique_ptr<const PGLSFilter> filter;
1211     OSDOp& osd_op = *p;
1212     auto bp = p->indata.cbegin();
1213     switch (p->op.op) {
1214     case CEPH_OSD_OP_PGNLS_FILTER:
1215       try {
1216         decode(cname, bp);
1217         decode(mname, bp);
1218       }
1219       catch (const ceph::buffer::error& e) {
1220         dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1221         result = -EINVAL;
1222         break;
1223       }
1224       std::tie(result, filter) = get_pgls_filter(bp);
1225       if (result < 0)
1226         break;
1227
1228       ceph_assert(filter);
1229
1230       // fall through
1231
1232     case CEPH_OSD_OP_PGNLS:
1233       if (snapid != CEPH_NOSNAP) {
1234         result = -EINVAL;
1235         break;
1236       }
1237       if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1238         dout(10) << " pgnls pg=" << m->get_pg()
1239                  << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1240                  << " != " << info.pgid << dendl;
1241         result = 0; // hmm?
1242       } else {
1243         unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1244                                                 p->op.pgls.count);
1245
1246         dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
1247                  << dendl;
1248         // read into a buffer
1249         vector<hobject_t> sentries;
1250         pg_nls_response_t response;
1251         try {
1252           decode(response.handle, bp);
1253         }
1254         catch (const ceph::buffer::error& e) {
1255           dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
1256           result = -EINVAL;
1257           break;
1258         }
1259
1260         hobject_t next;
1261         hobject_t lower_bound = response.handle;
1262         hobject_t pg_start = info.pgid.pgid.get_hobj_start();
1263         hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1264         dout(10) << " pgnls lower_bound " << lower_bound
1265                  << " pg_end " << pg_end << dendl;
1266         if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
1267              (lower_bound != hobject_t() && lower_bound < pg_start))) {
1268           // this should only happen with a buggy client.
1269           dout(10) << "outside of PG bounds " << pg_start << " .. "
1270                    << pg_end << dendl;
1271           result = -EINVAL;
1272           break;
1273         }
1274
1275         hobject_t current = lower_bound;
1276         int r = pgbackend->objects_list_partial(
1277           current,
1278           list_size,
1279           list_size,
1280           &sentries,
1281           &next);
1282         if (r != 0) {
1283           result = -EINVAL;
1284           break;
1285         }
1286
1287         map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1288           recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
1289         vector<hobject_t>::iterator ls_iter = sentries.begin();
1290         hobject_t _max = hobject_t::get_max();
1291         while (1) {
1292           const hobject_t &mcand =
1293             missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
1294             _max :
1295             missing_iter->first;
1296           const hobject_t &lcand =
1297             ls_iter == sentries.end() ?
1298             _max :
1299             *ls_iter;
1300
1301           hobject_t candidate;
1302           if (mcand == lcand) {
1303             candidate = mcand;
1304             if (!mcand.is_max()) {
1305               ++ls_iter;
1306               ++missing_iter;
1307             }
1308           } else if (mcand < lcand) {
1309             candidate = mcand;
1310             ceph_assert(!mcand.is_max());
1311             ++missing_iter;
1312           } else {
1313             candidate = lcand;
1314             ceph_assert(!lcand.is_max());
1315             ++ls_iter;
1316           }
1317
1318           dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
1319                    << " vs lower bound 0x" << lower_bound.get_hash()
1320                    << std::dec << dendl;
1321
1322           if (candidate >= next) {
1323             break;
1324           }
1325
1326           if (response.entries.size() == list_size) {
1327             next = candidate;
1328             break;
1329           }
1330
1331           if (candidate.snap != CEPH_NOSNAP)
1332             continue;
1333
1334           // skip internal namespace
1335           if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
1336             continue;
1337
1338           if (recovery_state.get_missing_loc().is_deleted(candidate))
1339             continue;
1340
1341           // skip wrong namespace
1342           if (m->get_hobj().nspace != librados::all_nspaces &&
1343                candidate.get_namespace() != m->get_hobj().nspace)
1344             continue;
1345
1346           if (filter && !pgls_filter(*filter, candidate))
1347             continue;
1348
1349           dout(20) << "pgnls item 0x" << std::hex
1350             << candidate.get_hash()
1351             << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
1352             << std::dec << " "
1353             << candidate.oid.name << dendl;
1354
1355           librados::ListObjectImpl item;
1356           item.nspace = candidate.get_namespace();
1357           item.oid = candidate.oid.name;
1358           item.locator = candidate.get_key();
1359           response.entries.push_back(item);
1360         }
1361
1362         if (next.is_max() &&
1363             missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
1364             ls_iter == sentries.end()) {
1365           result = 1;
1366
1367           // Set response.handle to the start of the next PG according
1368           // to the object sort order.
1369           response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1370         } else {
1371           response.handle = next;
1372         }
1373         dout(10) << "pgnls handle=" << response.handle << dendl;
1374         encode(response, osd_op.outdata);
1375         dout(10) << " pgnls result=" << result << " outdata.length()="
1376                  << osd_op.outdata.length() << dendl;
1377       }
1378       break;
1379
1380     case CEPH_OSD_OP_PGLS_FILTER:
1381       try {
1382         decode(cname, bp);
1383         decode(mname, bp);
1384       }
1385       catch (const ceph::buffer::error& e) {
1386         dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1387         result = -EINVAL;
1388         break;
1389       }
1390       std::tie(result, filter) = get_pgls_filter(bp);
1391       if (result < 0)
1392         break;
1393
1394       ceph_assert(filter);
1395
1396       // fall through
1397
1398     case CEPH_OSD_OP_PGLS:
1399       if (snapid != CEPH_NOSNAP) {
1400         result = -EINVAL;
1401         break;
1402       }
1403       if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1404         dout(10) << " pgls pg=" << m->get_pg()
1405                  << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1406                  << " != " << info.pgid << dendl;
1407         result = 0; // hmm?
1408       } else {
1409         unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
1410                                                 p->op.pgls.count);
1411
1412         dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
1413         // read into a buffer
1414         vector<hobject_t> sentries;
1415         pg_ls_response_t response;
1416         try {
1417           decode(response.handle, bp);
1418         }
1419         catch (const ceph::buffer::error& e) {
1420           dout(0) << "unable to decode PGLS handle in " << *m << dendl;
1421           result = -EINVAL;
1422           break;
1423         }
1424
1425         hobject_t next;
1426         hobject_t current = response.handle;
1427         int r = pgbackend->objects_list_partial(
1428           current,
1429           list_size,
1430           list_size,
1431           &sentries,
1432           &next);
1433         if (r != 0) {
1434           result = -EINVAL;
1435           break;
1436         }
1437
1438         ceph_assert(snapid == CEPH_NOSNAP || recovery_state.get_pg_log().get_missing().get_items().empty());
1439
1440         map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1441           recovery_state.get_pg_log().get_missing().get_items().lower_bound(current);
1442         vector<hobject_t>::iterator ls_iter = sentries.begin();
1443         hobject_t _max = hobject_t::get_max();
1444         while (1) {
1445           const hobject_t &mcand =
1446             missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() ?
1447             _max :
1448             missing_iter->first;
1449           const hobject_t &lcand =
1450             ls_iter == sentries.end() ?
1451             _max :
1452             *ls_iter;
1453
1454           hobject_t candidate;
1455           if (mcand == lcand) {
1456             candidate = mcand;
1457             if (!mcand.is_max()) {
1458               ++ls_iter;
1459               ++missing_iter;
1460             }
1461           } else if (mcand < lcand) {
1462             candidate = mcand;
1463             ceph_assert(!mcand.is_max());
1464             ++missing_iter;
1465           } else {
1466             candidate = lcand;
1467             ceph_assert(!lcand.is_max());
1468             ++ls_iter;
1469           }
1470
1471           if (candidate >= next) {
1472             break;
1473           }
1474
1475           if (response.entries.size() == list_size) {
1476             next = candidate;
1477             break;
1478           }
1479
1480           if (candidate.snap != CEPH_NOSNAP)
1481             continue;
1482
1483           // skip wrong namespace
1484           if (candidate.get_namespace() != m->get_hobj().nspace)
1485             continue;
1486
1487           if (recovery_state.get_missing_loc().is_deleted(candidate))
1488             continue;
1489
1490           if (filter && !pgls_filter(*filter, candidate))
1491             continue;
1492
1493           response.entries.push_back(make_pair(candidate.oid,
1494                                                candidate.get_key()));
1495         }
1496         if (next.is_max() &&
1497             missing_iter == recovery_state.get_pg_log().get_missing().get_items().end() &&
1498             ls_iter == sentries.end()) {
1499           result = 1;
1500         }
1501         response.handle = next;
1502         encode(response, osd_op.outdata);
1503         dout(10) << " pgls result=" << result << " outdata.length()="
1504                  << osd_op.outdata.length() << dendl;
1505       }
1506       break;
1507
1508     case CEPH_OSD_OP_PG_HITSET_LS:
1509       {
1510         list< pair<utime_t,utime_t> > ls;
1511         for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1512              p != info.hit_set.history.end();
1513              ++p)
1514           ls.push_back(make_pair(p->begin, p->end));
1515         if (hit_set)
1516           ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
1517         encode(ls, osd_op.outdata);
1518       }
1519       break;
1520
1521     case CEPH_OSD_OP_PG_HITSET_GET:
1522       {
1523         utime_t stamp(osd_op.op.hit_set_get.stamp);
1524         if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
1525           // read the current in-memory HitSet, not the version we've
1526           // checkpointed.
1527           if (!hit_set) {
1528             result= -ENOENT;
1529             break;
1530           }
1531           encode(*hit_set, osd_op.outdata);
1532           result = osd_op.outdata.length();
1533         } else {
1534           // read an archived HitSet.
1535           hobject_t oid;
1536           for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1537                p != info.hit_set.history.end();
1538                ++p) {
1539             if (stamp >= p->begin && stamp <= p->end) {
1540               oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
1541               break;
1542             }
1543           }
1544           if (oid == hobject_t()) {
1545             result = -ENOENT;
1546             break;
1547           }
1548           if (!pool.info.is_replicated()) {
1549             // FIXME: EC not supported yet
1550             result = -EOPNOTSUPP;
1551             break;
1552           }
1553           if (is_unreadable_object(oid)) {
1554             wait_for_unreadable_object(oid, op);
1555             return;
1556           }
1557           result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
1558         }
1559       }
1560       break;
1561
1562    case CEPH_OSD_OP_SCRUBLS:
1563       result = do_scrub_ls(m, &osd_op);
1564       break;
1565
1566     default:
1567       result = -EINVAL;
1568       break;
1569     }
1570
1571     if (result < 0)
1572       break;
1573   }
1574
1575   // reply
1576   MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
1577                                        CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
1578                                        false);
1579   reply->claim_op_out_data(ops);
1580   reply->set_result(result);
1581   reply->set_reply_versions(info.last_update, info.last_user_version);
1582   osd->send_message_osd_client(reply, m->get_connection());
1583 }
1584
1585 int PrimaryLogPG::do_scrub_ls(const MOSDOp *m, OSDOp *osd_op)
1586 {
1587   if (m->get_pg() != info.pgid.pgid) {
1588     dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1589     return -EINVAL; // hmm?
1590   }
1591   auto bp = osd_op->indata.cbegin();
1592   scrub_ls_arg_t arg;
1593   try {
1594     arg.decode(bp);
1595   } catch (ceph::buffer::error&) {
1596     dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1597     return -EINVAL;
1598   }
1599
1600   int r = 0;
1601   scrub_ls_result_t result = {.interval = info.history.same_interval_since};
1602
1603   if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1604     r = -EAGAIN;
1605   } else {
1606     bool store_queried = m_scrubber && m_scrubber->get_store_errors(arg, result);
1607     if (store_queried) {
1608       encode(result, osd_op->outdata);
1609     } else {
1610       // the scrubber's store is not initialized
1611       r = -ENOENT;
1612     }
1613   }
1614
1615   return r;
1616 }
1617
1618 /**
1619  * Releases locks
1620  *
1621  * @param manager [in] manager with locks to release
1622  */
1623 void PrimaryLogPG::release_object_locks(
1624   ObcLockManager &lock_manager) {
1625   std::list<std::pair<ObjectContextRef, std::list<OpRequestRef> > > to_req;
1626   bool requeue_recovery = false;
1627   bool requeue_snaptrim = false;
1628   lock_manager.put_locks(
1629     &to_req,
1630     &requeue_recovery,
1631     &requeue_snaptrim);
1632   if (requeue_recovery)
1633     queue_recovery();
1634   if (requeue_snaptrim)
1635     snap_trimmer_machine.process_event(TrimWriteUnblocked());
1636
1637   if (!to_req.empty()) {
1638     // requeue at front of scrub blocking queue if we are blocked by scrub
1639     for (auto &&p: to_req) {
1640       if (m_scrubber->write_blocked_by_scrub(p.first->obs.oi.soid.get_head())) {
1641         for (auto& op : p.second) {
1642           op->mark_delayed("waiting for scrub");
1643         }
1644
1645         waiting_for_scrub.splice(
1646           waiting_for_scrub.begin(),
1647           p.second,
1648           p.second.begin(),
1649           p.second.end());
1650       } else if (is_laggy()) {
1651         for (auto& op : p.second) {
1652           op->mark_delayed("waiting for readable");
1653         }
1654         waiting_for_readable.splice(
1655           waiting_for_readable.begin(),
1656           p.second,
1657           p.second.begin(),
1658           p.second.end());
1659       } else {
1660         requeue_ops(p.second);
1661       }
1662     }
1663   }
1664 }
1665
1666 PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
1667                            const PGPool &_pool,
1668                            const map<string,string>& ec_profile, spg_t p) :
1669   PG(o, curmap, _pool, p),
1670   pgbackend(
1671     PGBackend::build_pg_backend(
1672       _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
1673   object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
1674   new_backfill(false),
1675   temp_seq(0),
1676   snap_trimmer_machine(this)
1677 {
1678   recovery_state.set_backend_predicates(
1679     pgbackend->get_is_readable_predicate(),
1680     pgbackend->get_is_recoverable_predicate());
1681   snap_trimmer_machine.initiate();
1682
1683   m_scrubber = make_unique<PrimaryLogScrub>(this);
1684 }
1685
1686 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1687 {
1688   src_oloc = oloc;
1689   if (oloc.key.empty())
1690     src_oloc.key = oid.name;
1691 }
1692
1693 void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1694 {
1695   auto m = op->get_req<MOSDBackoff>();
1696   auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
1697   if (!session)
1698     return;  // drop it.
1699   hobject_t begin = info.pgid.pgid.get_hobj_start();
1700   hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1701   if (begin < m->begin) {
1702     begin = m->begin;
1703   }
1704   if (end > m->end) {
1705     end = m->end;
1706   }
1707   dout(10) << __func__ << " backoff ack id " << m->id
1708            << " [" << begin << "," << end << ")" << dendl;
1709   session->ack_backoff(cct, m->pgid, m->id, begin, end);
1710 }
1711
1712 void PrimaryLogPG::do_request(
1713   OpRequestRef& op,
1714   ThreadPool::TPHandle &handle)
1715 {
1716   if (op->osd_trace) {
1717     op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1718     op->pg_trace.event("do request");
1719   }
1720 #ifdef HAVE_JAEGER
1721   if (op->osd_parent_span) {
1722     auto do_req_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
1723   }
1724 #endif
1725 // make sure we have a new enough map
1726   auto p = waiting_for_map.find(op->get_source());
1727   if (p != waiting_for_map.end()) {
1728     // preserve ordering
1729     dout(20) << __func__ << " waiting_for_map "
1730              << p->first << " not empty, queueing" << dendl;
1731     p->second.push_back(op);
1732     op->mark_delayed("waiting_for_map not empty");
1733     return;
1734   }
1735   if (!have_same_or_newer_map(op->min_epoch)) {
1736     dout(20) << __func__ << " min " << op->min_epoch
1737              << ", queue on waiting_for_map " << op->get_source() << dendl;
1738     waiting_for_map[op->get_source()].push_back(op);
1739     op->mark_delayed("op must wait for map");
1740     osd->request_osdmap_update(op->min_epoch);
1741     return;
1742   }
1743
1744   if (can_discard_request(op)) {
1745     return;
1746   }
1747
1748   // pg-wide backoffs
1749   const Message *m = op->get_req();
1750   int msg_type = m->get_type();
1751   if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1752     auto session = ceph::ref_cast<Session>(m->get_connection()->get_priv());
1753     if (!session)
1754       return;  // drop it.
1755     if (msg_type == CEPH_MSG_OSD_OP) {
1756       if (session->check_backoff(cct, info.pgid,
1757                                  info.pgid.pgid.get_hobj_start(), m)) {
1758         return;
1759       }
1760
1761       bool backoff =
1762         is_down() ||
1763         is_incomplete() ||
1764         (!is_active() && is_peered());
1765       if (g_conf()->osd_backoff_on_peering && !backoff) {
1766         if (is_peering()) {
1767           backoff = true;
1768         }
1769       }
1770       if (backoff) {
1771         add_pg_backoff(session);
1772         return;
1773       }
1774     }
1775     // pg backoff acks at pg-level
1776     if (msg_type == CEPH_MSG_OSD_BACKOFF) {
1777       const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1778       if (ba->begin != ba->end) {
1779         handle_backoff(op);
1780         return;
1781       }
1782     }
1783   }
1784
1785   if (!is_peered()) {
1786     // Delay unless PGBackend says it's ok
1787     if (pgbackend->can_handle_while_inactive(op)) {
1788       bool handled = pgbackend->handle_message(op);
1789       ceph_assert(handled);
1790       return;
1791     } else {
1792       waiting_for_peered.push_back(op);
1793       op->mark_delayed("waiting for peered");
1794       return;
1795     }
1796   }
1797
1798   if (recovery_state.needs_flush()) {
1799     dout(20) << "waiting for flush on " << op << dendl;
1800     waiting_for_flush.push_back(op);
1801     op->mark_delayed("waiting for flush");
1802     return;
1803   }
1804
1805   ceph_assert(is_peered() && !recovery_state.needs_flush());
1806   if (pgbackend->handle_message(op))
1807     return;
1808
1809   switch (msg_type) {
1810   case CEPH_MSG_OSD_OP:
1811   case CEPH_MSG_OSD_BACKOFF:
1812     if (!is_active()) {
1813       dout(20) << " peered, not active, waiting for active on " << op << dendl;
1814       waiting_for_active.push_back(op);
1815       op->mark_delayed("waiting for active");
1816       return;
1817     }
1818     switch (msg_type) {
1819     case CEPH_MSG_OSD_OP:
1820       // verify client features
1821       if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1822           !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1823         osd->reply_op_error(op, -EOPNOTSUPP);
1824         return;
1825       }
1826       do_op(op);
1827       break;
1828     case CEPH_MSG_OSD_BACKOFF:
1829       // object-level backoff acks handled in osdop context
1830       handle_backoff(op);
1831       break;
1832     }
1833     break;
1834
1835   case MSG_OSD_PG_SCAN:
1836     do_scan(op, handle);
1837     break;
1838
1839   case MSG_OSD_PG_BACKFILL:
1840     do_backfill(op);
1841     break;
1842
1843   case MSG_OSD_PG_BACKFILL_REMOVE:
1844     do_backfill_remove(op);
1845     break;
1846
1847   case MSG_OSD_SCRUB_RESERVE:
1848     {
1849       if (!m_scrubber) {
1850         osd->reply_op_error(op, -EAGAIN);
1851         return;
1852       }
1853       auto m = op->get_req<MOSDScrubReserve>();
1854       switch (m->type) {
1855       case MOSDScrubReserve::REQUEST:
1856         m_scrubber->handle_scrub_reserve_request(op);
1857         break;
1858       case MOSDScrubReserve::GRANT:
1859         m_scrubber->handle_scrub_reserve_grant(op, m->from);
1860         break;
1861       case MOSDScrubReserve::REJECT:
1862         m_scrubber->handle_scrub_reserve_reject(op, m->from);
1863         break;
1864       case MOSDScrubReserve::RELEASE:
1865         m_scrubber->handle_scrub_reserve_release(op);
1866         break;
1867       }
1868     }
1869     break;
1870
1871   case MSG_OSD_REP_SCRUB:
1872     replica_scrub(op, handle);
1873     break;
1874
1875   case MSG_OSD_REP_SCRUBMAP:
1876     do_replica_scrub_map(op);
1877     break;
1878
1879   case MSG_OSD_PG_UPDATE_LOG_MISSING:
1880     do_update_log_missing(op);
1881     break;
1882
1883   case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1884     do_update_log_missing_reply(op);
1885     break;
1886
1887   default:
1888     ceph_abort_msg("bad message type in do_request");
1889   }
1890 }
1891
1892 /** do_op - do an op
1893  * pg lock will be held (if multithreaded)
1894  * osd_lock NOT held.
1895  */
1896 void PrimaryLogPG::do_op(OpRequestRef& op)
1897 {
1898   FUNCTRACE(cct);
1899   // NOTE: take a non-const pointer here; we must be careful not to
1900   // change anything that will break other reads on m (operator<<).
1901   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1902   ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1903   if (m->finish_decode()) {
1904     op->reset_desc();   // for TrackedOp
1905     m->clear_payload();
1906   }
1907
1908   dout(20) << __func__ << ": op " << *m << dendl;
1909
1910   const hobject_t head = m->get_hobj().get_head();
1911
1912   if (!info.pgid.pgid.contains(
1913         info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1914     derr << __func__ << " " << info.pgid.pgid << " does not contain "
1915          << head << " pg_num " << pool.info.get_pg_num() << " hash "
1916          << std::hex << head.get_hash() << std::dec << dendl;
1917     osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1918                       << " op " << *m;
1919     ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
1920     return;
1921   }
1922
1923   bool can_backoff =
1924     m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1925   ceph::ref_t<Session> session;
1926   if (can_backoff) {
1927     session = static_cast<Session*>(m->get_connection()->get_priv().get());
1928     if (!session.get()) {
1929       dout(10) << __func__ << " no session" << dendl;
1930       return;
1931     }
1932
1933     if (session->check_backoff(cct, info.pgid, head, m)) {
1934       return;
1935     }
1936   }
1937
1938   if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1939     // not implemented.
1940     dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1941     osd->reply_op_error(op, -EINVAL);
1942     return;
1943   }
1944
1945   {
1946     int r = op->maybe_init_op_info(*get_osdmap());
1947     if (r) {
1948       osd->reply_op_error(op, r);
1949       return;
1950     }
1951   }
1952
1953   if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1954                          CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1955       op->may_read() &&
1956       !(op->may_write() || op->may_cache())) {
1957     // balanced reads; any replica will do
1958     if (!(is_primary() || is_nonprimary())) {
1959       osd->handle_misdirected_op(this, op);
1960       return;
1961     }
1962   } else {
1963     // normal case; must be primary
1964     if (!is_primary()) {
1965       osd->handle_misdirected_op(this, op);
1966       return;
1967     }
1968   }
1969
1970   if (!check_laggy(op)) {
1971     return;
1972   }
1973
1974   if (!op_has_sufficient_caps(op)) {
1975     osd->reply_op_error(op, -EPERM);
1976     return;
1977   }
1978
1979   if (op->includes_pg_op()) {
1980     return do_pg_op(op);
1981   }
1982
1983   // object name too long?
1984   if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1985     dout(4) << "do_op name is longer than "
1986             << cct->_conf->osd_max_object_name_len
1987             << " bytes" << dendl;
1988     osd->reply_op_error(op, -ENAMETOOLONG);
1989     return;
1990   }
1991   if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1992     dout(4) << "do_op locator is longer than "
1993             << cct->_conf->osd_max_object_name_len
1994             << " bytes" << dendl;
1995     osd->reply_op_error(op, -ENAMETOOLONG);
1996     return;
1997   }
1998   if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1999     dout(4) << "do_op namespace is longer than "
2000             << cct->_conf->osd_max_object_namespace_len
2001             << " bytes" << dendl;
2002     osd->reply_op_error(op, -ENAMETOOLONG);
2003     return;
2004   }
2005   if (m->get_hobj().oid.name.empty()) {
2006     dout(4) << "do_op empty oid name is not allowed" << dendl;
2007     osd->reply_op_error(op, -EINVAL);
2008     return;
2009   }
2010
2011   if (int r = osd->store->validate_hobject_key(head)) {
2012     dout(4) << "do_op object " << head << " invalid for backing store: "
2013             << r << dendl;
2014     osd->reply_op_error(op, r);
2015     return;
2016   }
2017
2018   // blocklisted?
2019   if (get_osdmap()->is_blocklisted(m->get_source_addr())) {
2020     dout(10) << "do_op " << m->get_source_addr() << " is blocklisted" << dendl;
2021     osd->reply_op_error(op, -EBLOCKLISTED);
2022     return;
2023   }
2024
2025   // order this op as a write?
2026   bool write_ordered = op->rwordered();
2027
2028   // discard due to cluster full transition?  (we discard any op that
2029   // originates before the cluster or pool is marked full; the client
2030   // will resend after the full flag is removed or if they expect the
2031   // op to succeed despite being full).  The except is FULL_FORCE and
2032   // FULL_TRY ops, which there is no reason to discard because they
2033   // bypass all full checks anyway.  If this op isn't write or
2034   // read-ordered, we skip.
2035   // FIXME: we exclude mds writes for now.
2036   if (write_ordered && !(m->get_source().is_mds() ||
2037                          m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
2038                          m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
2039       info.history.last_epoch_marked_full > m->get_map_epoch()) {
2040     dout(10) << __func__ << " discarding op sent before full " << m << " "
2041              << *m << dendl;
2042     return;
2043   }
2044   // mds should have stopped writing before this point.
2045   // We can't allow OSD to become non-startable even if mds
2046   // could be writing as part of file removals.
2047   if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
2048       !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
2049     dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
2050     return;
2051   }
2052   int64_t poolid = get_pgid().pool();
2053   if (op->may_write()) {
2054
2055     const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2056     if (!pi) {
2057       return;
2058     }
2059
2060     // invalid?
2061     if (m->get_snapid() != CEPH_NOSNAP) {
2062       dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2063       osd->reply_op_error(op, -EINVAL);
2064       return;
2065     }
2066
2067     // too big?
2068     if (cct->_conf->osd_max_write_size &&
2069         m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2070       // journal can't hold commit!
2071       derr << "do_op msg data len " << m->get_data_len()
2072            << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2073            << " on " << *m << dendl;
2074       osd->reply_op_error(op, -OSD_WRITETOOBIG);
2075       return;
2076     }
2077   }
2078
2079   dout(10) << "do_op " << *m
2080            << (op->may_write() ? " may_write" : "")
2081            << (op->may_read() ? " may_read" : "")
2082            << (op->may_cache() ? " may_cache" : "")
2083            << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2084            << " flags " << ceph_osd_flag_string(m->get_flags())
2085            << dendl;
2086
2087 #ifdef HAVE_JAEGER
2088   if (op->osd_parent_span) {
2089     auto do_op_span = jaeger_tracing::child_span(__func__, op->osd_parent_span);
2090   }
2091 #endif
2092   // missing object?
2093   if (is_unreadable_object(head)) {
2094     if (!is_primary()) {
2095       osd->reply_op_error(op, -EAGAIN);
2096       return;
2097     }
2098     if (can_backoff &&
2099         (g_conf()->osd_backoff_on_degraded ||
2100          (g_conf()->osd_backoff_on_unfound &&
2101           recovery_state.get_missing_loc().is_unfound(head)))) {
2102       add_backoff(session, head, head);
2103       maybe_kick_recovery(head);
2104     } else {
2105       wait_for_unreadable_object(head, op);
2106     }
2107     return;
2108   }
2109
2110   if (write_ordered) {
2111     // degraded object?
2112     if (is_degraded_or_backfilling_object(head)) {
2113       if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2114         add_backoff(session, head, head);
2115         maybe_kick_recovery(head);
2116       } else {
2117         wait_for_degraded_object(head, op);
2118       }
2119       return;
2120     }
2121
2122     if (m_scrubber->is_scrub_active() && m_scrubber->write_blocked_by_scrub(head)) {
2123       dout(20) << __func__ << ": waiting for scrub" << dendl;
2124       waiting_for_scrub.push_back(op);
2125       op->mark_delayed("waiting for scrub");
2126       return;
2127     }
2128     if (!check_laggy_requeue(op)) {
2129       return;
2130     }
2131
2132     // blocked on snap?
2133     if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2134         blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2135       hobject_t to_wait_on(head);
2136       to_wait_on.snap = blocked_iter->second;
2137       wait_for_degraded_object(to_wait_on, op);
2138       return;
2139     }
2140     if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2141         blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2142       wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2143       return;
2144     }
2145     if (objects_blocked_on_cache_full.count(head)) {
2146       block_write_on_full_cache(head, op);
2147       return;
2148     }
2149   }
2150
2151   // dup/resent?
2152   if (op->may_write() || op->may_cache()) {
2153     // warning: we will get back *a* request for this reqid, but not
2154     // necessarily the most recent.  this happens with flush and
2155     // promote ops, but we can't possible have both in our log where
2156     // the original request is still not stable on disk, so for our
2157     // purposes here it doesn't matter which one we get.
2158     eversion_t version;
2159     version_t user_version;
2160     int return_code = 0;
2161     vector<pg_log_op_return_item_t> op_returns;
2162     bool got = check_in_progress_op(
2163       m->get_reqid(), &version, &user_version, &return_code, &op_returns);
2164     if (got) {
2165       dout(3) << __func__ << " dup " << m->get_reqid()
2166               << " version " << version << dendl;
2167       if (already_complete(version)) {
2168         osd->reply_op_error(op, return_code, version, user_version, op_returns);
2169       } else {
2170         dout(10) << " waiting for " << version << " to commit" << dendl;
2171         // always queue ondisk waiters, so that we can requeue if needed
2172         waiting_for_ondisk[version].emplace_back(op, user_version, return_code,
2173                                                  op_returns);
2174         op->mark_delayed("waiting for ondisk");
2175       }
2176       return;
2177     }
2178   }
2179
2180   ObjectContextRef obc;
2181   bool can_create = op->may_write();
2182   hobject_t missing_oid;
2183
2184   // kludge around the fact that LIST_SNAPS sets CEPH_SNAPDIR for LIST_SNAPS
2185   const hobject_t& oid =
2186     m->get_snapid() == CEPH_SNAPDIR ? head : m->get_hobj();
2187
2188   // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2189   for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2190     OSDOp& osd_op = *p;
2191
2192     if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2193       if (m->get_snapid() != CEPH_SNAPDIR) {
2194         dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2195         osd->reply_op_error(op, -EINVAL);
2196         return;
2197       }
2198     } else {
2199       if (m->get_snapid() == CEPH_SNAPDIR) {
2200         dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2201         osd->reply_op_error(op, -EINVAL);
2202         return;
2203       }
2204     }
2205   }
2206
2207   // io blocked on obc?
2208   if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2209       maybe_await_blocked_head(oid, op)) {
2210     return;
2211   }
2212
2213   if (!is_primary()) {
2214     if (!recovery_state.can_serve_replica_read(oid)) {
2215       dout(20) << __func__
2216                << ": unstable write on replica, bouncing to primary "
2217                << *m << dendl;
2218       osd->reply_op_error(op, -EAGAIN);
2219       return;
2220     }
2221     dout(20) << __func__ << ": serving replica read on oid " << oid
2222              << dendl;
2223   }
2224
2225   int r = find_object_context(
2226     oid, &obc, can_create,
2227     m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2228     &missing_oid);
2229
2230   // LIST_SNAPS needs the ssc too
2231   if (obc &&
2232       m->get_snapid() == CEPH_SNAPDIR &&
2233       !obc->ssc) {
2234     obc->ssc = get_snapset_context(oid, true);
2235   }
2236
2237   if (r == -EAGAIN) {
2238     // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2239     // we have to wait for the object.
2240     if (is_primary()) {
2241       // missing the specific snap we need; requeue and wait.
2242       ceph_assert(!op->may_write()); // only happens on a read/cache
2243       wait_for_unreadable_object(missing_oid, op);
2244       return;
2245     }
2246   } else if (r == 0) {
2247     if (is_unreadable_object(obc->obs.oi.soid)) {
2248       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2249                << " is unreadable, waiting" << dendl;
2250       wait_for_unreadable_object(obc->obs.oi.soid, op);
2251       return;
2252     }
2253
2254     // degraded object?  (the check above was for head; this could be a clone)
2255     if (write_ordered &&
2256         obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2257         is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2258       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2259                << " is degraded, waiting" << dendl;
2260       wait_for_degraded_object(obc->obs.oi.soid, op);
2261       return;
2262     }
2263   }
2264
2265   bool in_hit_set = false;
2266   if (hit_set) {
2267     if (obc.get()) {
2268       if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2269         in_hit_set = true;
2270     } else {
2271       if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2272         in_hit_set = true;
2273     }
2274     if (!op->hitset_inserted) {
2275       hit_set->insert(oid);
2276       op->hitset_inserted = true;
2277       if (hit_set->is_full() ||
2278           hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2279         hit_set_persist();
2280       }
2281     }
2282   }
2283
2284   if (agent_state) {
2285     if (agent_choose_mode(false, op))
2286       return;
2287   }
2288
2289   if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2290     if (recover_adjacent_clones(obc, op)) {
2291       return;
2292     }
2293     if (maybe_handle_manifest(op,
2294                                write_ordered,
2295                                obc))
2296     return;
2297   }
2298
2299   if (maybe_handle_cache(op,
2300                          write_ordered,
2301                          obc,
2302                          r,
2303                          missing_oid,
2304                          false,
2305                          in_hit_set))
2306     return;
2307
2308   if (r && (r != -ENOENT || !obc)) {
2309     // copy the reqids for copy get on ENOENT
2310     if (r == -ENOENT &&
2311         (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2312       fill_in_copy_get_noent(op, oid, m->ops[0]);
2313       return;
2314     }
2315     dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2316     if (op->may_write() &&
2317         get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2318       record_write_error(op, oid, nullptr, r);
2319     } else {
2320       osd->reply_op_error(op, r);
2321     }
2322     return;
2323   }
2324
2325   // make sure locator is consistent
2326   object_locator_t oloc(obc->obs.oi.soid);
2327   if (m->get_object_locator() != oloc) {
2328     dout(10) << " provided locator " << m->get_object_locator()
2329              << " != object's " << obc->obs.oi.soid << dendl;
2330     osd->clog->warn() << "bad locator " << m->get_object_locator()
2331                      << " on object " << oloc
2332                       << " op " << *m;
2333   }
2334
2335   // io blocked on obc?
2336   if (obc->is_blocked() &&
2337       !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2338     wait_for_blocked_object(obc->obs.oi.soid, op);
2339     return;
2340   }
2341
2342   dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2343
2344   OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2345
2346   if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2347     dout(20) << __func__ << ": skipping rw locks" << dendl;
2348   } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2349     dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2350
2351     // verify there is in fact a flush in progress
2352     // FIXME: we could make this a stronger test.
2353     map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2354     if (p == flush_ops.end()) {
2355       dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2356       reply_ctx(ctx, -EINVAL);
2357       return;
2358     }
2359   } else if (!get_rw_locks(write_ordered, ctx)) {
2360     dout(20) << __func__ << " waiting for rw locks " << dendl;
2361     op->mark_delayed("waiting for rw locks");
2362     close_op_ctx(ctx);
2363     return;
2364   }
2365   dout(20) << __func__ << " obc " << *obc << dendl;
2366
2367   if (r) {
2368     dout(20) << __func__ << " returned an error: " << r << dendl;
2369     if (op->may_write() &&
2370         get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
2371       record_write_error(op, oid, nullptr, r,
2372                          ctx->op->allows_returnvec() ? ctx : nullptr);
2373     } else {
2374       osd->reply_op_error(op, r);
2375     }
2376     close_op_ctx(ctx);
2377     return;
2378   }
2379
2380   if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2381     ctx->ignore_cache = true;
2382   }
2383
2384   if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2385     // This object is lost. Reading from it returns an error.
2386     dout(20) << __func__ << ": object " << obc->obs.oi.soid
2387              << " is lost" << dendl;
2388     reply_ctx(ctx, -ENFILE);
2389     return;
2390   }
2391   if (!op->may_write() &&
2392       !op->may_cache() &&
2393       (!obc->obs.exists ||
2394        ((m->get_snapid() != CEPH_SNAPDIR) &&
2395         obc->obs.oi.is_whiteout()))) {
2396     // copy the reqids for copy get on ENOENT
2397     if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2398       fill_in_copy_get_noent(op, oid, m->ops[0]);
2399       close_op_ctx(ctx);
2400       return;
2401     }
2402     reply_ctx(ctx, -ENOENT);
2403     return;
2404   }
2405
2406   op->mark_started();
2407
2408   execute_ctx(ctx);
2409   utime_t prepare_latency = ceph_clock_now();
2410   prepare_latency -= op->get_dequeued_time();
2411   osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2412   if (op->may_read() && op->may_write()) {
2413     osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2414   } else if (op->may_read()) {
2415     osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2416   } else if (op->may_write() || op->may_cache()) {
2417     osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2418   }
2419
2420   // force recovery of the oldest missing object if too many logs
2421   maybe_force_recovery();
2422 }
2423
2424 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2425   OpRequestRef op,
2426   bool write_ordered,
2427   ObjectContextRef obc)
2428 {
2429   ceph_assert(obc);
2430   if (op->get_req<MOSDOp>()->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2431     dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2432     return cache_result_t::NOOP;
2433   }
2434
2435   // if it is write-ordered and blocked, stop now
2436   if (obc->is_blocked() && write_ordered) {
2437     // we're already doing something with this object
2438     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2439     return cache_result_t::NOOP;
2440   }
2441
2442   vector<OSDOp> ops = op->get_req<MOSDOp>()->ops;
2443   for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2444     OSDOp& osd_op = *p;
2445     ceph_osd_op& op = osd_op.op;
2446     if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2447         op.op == CEPH_OSD_OP_SET_CHUNK ||
2448         op.op == CEPH_OSD_OP_UNSET_MANIFEST ||
2449         op.op == CEPH_OSD_OP_TIER_PROMOTE ||
2450         op.op == CEPH_OSD_OP_TIER_FLUSH ||
2451         op.op == CEPH_OSD_OP_TIER_EVICT) {
2452       return cache_result_t::NOOP;
2453     }
2454   }
2455
2456   switch (obc->obs.oi.manifest.type) {
2457   case object_manifest_t::TYPE_REDIRECT:
2458     if (op->may_write() || write_ordered) {
2459       do_proxy_write(op, obc);
2460     } else {
2461       // promoted object
2462       if (obc->obs.oi.size != 0) {
2463         return cache_result_t::NOOP;
2464       }
2465       do_proxy_read(op, obc);
2466     }
2467     return cache_result_t::HANDLED_PROXY;
2468   case object_manifest_t::TYPE_CHUNKED:
2469     {
2470       if (can_proxy_chunked_read(op, obc)) {
2471         map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2472         if (p != flush_ops.end()) {
2473           do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2474           return cache_result_t::HANDLED_PROXY;
2475         }
2476         do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2477         return cache_result_t::HANDLED_PROXY;
2478       }
2479
2480       MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2481       ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2482       hobject_t head = m->get_hobj();
2483
2484       if (is_degraded_or_backfilling_object(head)) {
2485         dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2486         wait_for_degraded_object(head, op);
2487         return cache_result_t::BLOCKED_RECOVERY;
2488       }
2489
2490       if (m_scrubber->write_blocked_by_scrub(head)) {
2491         dout(20) << __func__ << ": waiting for scrub" << dendl;
2492         waiting_for_scrub.push_back(op);
2493         op->mark_delayed("waiting for scrub");
2494         return cache_result_t::BLOCKED_RECOVERY;
2495       }
2496       if (!check_laggy_requeue(op)) {
2497         return cache_result_t::BLOCKED_RECOVERY;
2498       }
2499
2500       for (auto& p : obc->obs.oi.manifest.chunk_map) {
2501         if (p.second.is_missing()) {
2502           auto m = op->get_req<MOSDOp>();
2503           const object_locator_t oloc = m->get_object_locator();
2504           promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2505           return cache_result_t::BLOCKED_PROMOTE;
2506         }
2507       }
2508       return cache_result_t::NOOP;
2509     }
2510   default:
2511     ceph_abort_msg("unrecognized manifest type");
2512   }
2513
2514   return cache_result_t::NOOP;
2515 }
2516
2517 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2518                                       MOSDOpReply *orig_reply, int r,
2519                                       OpContext *ctx_for_op_returns)
2520 {
2521   dout(20) << __func__ << " r=" << r << dendl;
2522   ceph_assert(op->may_write());
2523   const osd_reqid_t &reqid = op->get_req<MOSDOp>()->get_reqid();
2524   mempool::osd_pglog::list<pg_log_entry_t> entries;
2525   entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2526                                    get_next_version(), eversion_t(), 0,
2527                                    reqid, utime_t(), r));
2528   if (ctx_for_op_returns) {
2529     entries.back().set_op_returns(*ctx_for_op_returns->ops);
2530     dout(20) << __func__ << " op_returns=" << entries.back().op_returns << dendl;
2531   }
2532
2533   struct OnComplete {
2534     PrimaryLogPG *pg;
2535     OpRequestRef op;
2536     boost::intrusive_ptr<MOSDOpReply> orig_reply;
2537     int r;
2538     OnComplete(
2539       PrimaryLogPG *pg,
2540       OpRequestRef op,
2541       MOSDOpReply *orig_reply,
2542       int r)
2543       : pg(pg), op(op),
2544         orig_reply(orig_reply, false /* take over ref */), r(r)
2545       {}
2546     void operator()() {
2547       ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2548       auto m = op->get_req<MOSDOp>();
2549       MOSDOpReply *reply = orig_reply.detach();
2550       ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2551       pg->osd->send_message_osd_client(reply, m->get_connection());
2552     }
2553   };
2554
2555   ObcLockManager lock_manager;
2556   submit_log_entries(
2557     entries,
2558     std::move(lock_manager),
2559     std::optional<std::function<void(void)> >(
2560       OnComplete(this, op, orig_reply, r)),
2561     op,
2562     r);
2563 }
2564
2565 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2566   OpRequestRef op,
2567   bool write_ordered,
2568   ObjectContextRef obc,
2569   int r, hobject_t missing_oid,
2570   bool must_promote,
2571   bool in_hit_set,
2572   ObjectContextRef *promote_obc)
2573 {
2574   // return quickly if caching is not enabled
2575   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2576     return cache_result_t::NOOP;
2577
2578   if (op &&
2579       op->get_req() &&
2580       op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2581       (op->get_req<MOSDOp>()->get_flags() &
2582        CEPH_OSD_FLAG_IGNORE_CACHE)) {
2583     dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2584     return cache_result_t::NOOP;
2585   }
2586
2587   must_promote = must_promote || op->need_promote();
2588
2589   if (obc)
2590     dout(25) << __func__ << " " << obc->obs.oi << " "
2591              << (obc->obs.exists ? "exists" : "DNE")
2592              << " missing_oid " << missing_oid
2593              << " must_promote " << (int)must_promote
2594              << " in_hit_set " << (int)in_hit_set
2595              << dendl;
2596   else
2597     dout(25) << __func__ << " (no obc)"
2598              << " missing_oid " << missing_oid
2599              << " must_promote " << (int)must_promote
2600              << " in_hit_set " << (int)in_hit_set
2601              << dendl;
2602
2603   // if it is write-ordered and blocked, stop now
2604   if (obc.get() && obc->is_blocked() && write_ordered) {
2605     // we're already doing something with this object
2606     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2607     return cache_result_t::NOOP;
2608   }
2609
2610   if (r == -ENOENT && missing_oid == hobject_t()) {
2611     // we know this object is logically absent (e.g., an undefined clone)
2612     return cache_result_t::NOOP;
2613   }
2614
2615   if (obc.get() && obc->obs.exists) {
2616     osd->logger->inc(l_osd_op_cache_hit);
2617     return cache_result_t::NOOP;
2618   }
2619   if (!is_primary()) {
2620     dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2621     osd->reply_op_error(op, -EAGAIN);
2622     return cache_result_t::REPLIED_WITH_EAGAIN;
2623   }
2624
2625   if (missing_oid == hobject_t() && obc.get()) {
2626     missing_oid = obc->obs.oi.soid;
2627   }
2628
2629   auto m = op->get_req<MOSDOp>();
2630   const object_locator_t oloc = m->get_object_locator();
2631
2632   if (op->need_skip_handle_cache()) {
2633     return cache_result_t::NOOP;
2634   }
2635
2636   OpRequestRef promote_op;
2637
2638   switch (pool.info.cache_mode) {
2639   case pg_pool_t::CACHEMODE_WRITEBACK:
2640     if (agent_state &&
2641         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2642       if (!op->may_write() && !op->may_cache() &&
2643           !write_ordered && !must_promote) {
2644         dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2645         do_proxy_read(op);
2646         return cache_result_t::HANDLED_PROXY;
2647       }
2648       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2649       block_write_on_full_cache(missing_oid, op);
2650       return cache_result_t::BLOCKED_FULL;
2651     }
2652
2653     if (must_promote || (!hit_set && !op->need_skip_promote())) {
2654       promote_object(obc, missing_oid, oloc, op, promote_obc);
2655       return cache_result_t::BLOCKED_PROMOTE;
2656     }
2657
2658     if (op->may_write() || op->may_cache()) {
2659       do_proxy_write(op);
2660
2661       // Promote too?
2662       if (!op->need_skip_promote() &&
2663           maybe_promote(obc, missing_oid, oloc, in_hit_set,
2664                       pool.info.min_write_recency_for_promote,
2665                       OpRequestRef(),
2666                       promote_obc)) {
2667         return cache_result_t::BLOCKED_PROMOTE;
2668       }
2669       return cache_result_t::HANDLED_PROXY;
2670     } else {
2671       do_proxy_read(op);
2672
2673       // Avoid duplicate promotion
2674       if (obc.get() && obc->is_blocked()) {
2675         if (promote_obc)
2676           *promote_obc = obc;
2677         return cache_result_t::BLOCKED_PROMOTE;
2678       }
2679
2680       // Promote too?
2681       if (!op->need_skip_promote()) {
2682         (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2683                             pool.info.min_read_recency_for_promote,
2684                             promote_op, promote_obc);
2685       }
2686
2687       return cache_result_t::HANDLED_PROXY;
2688     }
2689     ceph_abort_msg("unreachable");
2690     return cache_result_t::NOOP;
2691
2692   case pg_pool_t::CACHEMODE_READONLY:
2693     // TODO: clean this case up
2694     if (!obc.get() && r == -ENOENT) {
2695       // we don't have the object and op's a read
2696       promote_object(obc, missing_oid, oloc, op, promote_obc);
2697       return cache_result_t::BLOCKED_PROMOTE;
2698     }
2699     if (!r) { // it must be a write
2700       do_cache_redirect(op);
2701       return cache_result_t::HANDLED_REDIRECT;
2702     }
2703     // crap, there was a failure of some kind
2704     return cache_result_t::NOOP;
2705
2706   case pg_pool_t::CACHEMODE_FORWARD:
2707     // this mode is deprecated; proxy instead
2708   case pg_pool_t::CACHEMODE_PROXY:
2709     if (!must_promote) {
2710       if (op->may_write() || op->may_cache() || write_ordered) {
2711         do_proxy_write(op);
2712         return cache_result_t::HANDLED_PROXY;
2713       } else {
2714         do_proxy_read(op);
2715         return cache_result_t::HANDLED_PROXY;
2716       }
2717     }
2718     // ugh, we're forced to promote.
2719     if (agent_state &&
2720         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2721       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2722       block_write_on_full_cache(missing_oid, op);
2723       return cache_result_t::BLOCKED_FULL;
2724     }
2725     promote_object(obc, missing_oid, oloc, op, promote_obc);
2726     return cache_result_t::BLOCKED_PROMOTE;
2727
2728   case pg_pool_t::CACHEMODE_READFORWARD:
2729     // this mode is deprecated; proxy instead
2730   case pg_pool_t::CACHEMODE_READPROXY:
2731     // Do writeback to the cache tier for writes
2732     if (op->may_write() || write_ordered || must_promote) {
2733       if (agent_state &&
2734           agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2735         dout(20) << __func__ << " cache pool full, waiting" << dendl;
2736         block_write_on_full_cache(missing_oid, op);
2737         return cache_result_t::BLOCKED_FULL;
2738       }
2739       promote_object(obc, missing_oid, oloc, op, promote_obc);
2740       return cache_result_t::BLOCKED_PROMOTE;
2741     }
2742
2743     // If it is a read, we can read, we need to proxy it
2744     do_proxy_read(op);
2745     return cache_result_t::HANDLED_PROXY;
2746
2747   default:
2748     ceph_abort_msg("unrecognized cache_mode");
2749   }
2750   return cache_result_t::NOOP;
2751 }
2752
2753 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2754                                  const hobject_t& missing_oid,
2755                                  const object_locator_t& oloc,
2756                                  bool in_hit_set,
2757                                  uint32_t recency,
2758                                  OpRequestRef promote_op,
2759                                  ObjectContextRef *promote_obc)
2760 {
2761   dout(20) << __func__ << " missing_oid " << missing_oid
2762            << "  in_hit_set " << in_hit_set << dendl;
2763
2764   switch (recency) {
2765   case 0:
2766     break;
2767   case 1:
2768     // Check if in the current hit set
2769     if (in_hit_set) {
2770       break;
2771     } else {
2772       // not promoting
2773       return false;
2774     }
2775     break;
2776   default:
2777     {
2778       unsigned count = (int)in_hit_set;
2779       if (count) {
2780         // Check if in other hit sets
2781         const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2782         for (map<time_t,HitSetRef>::reverse_iterator itor =
2783                agent_state->hit_set_map.rbegin();
2784              itor != agent_state->hit_set_map.rend();
2785              ++itor) {
2786           if (!itor->second->contains(oid)) {
2787             break;
2788           }
2789           ++count;
2790           if (count >= recency) {
2791             break;
2792           }
2793         }
2794       }
2795       if (count >= recency) {
2796         break;
2797       }
2798       return false;     // not promoting
2799     }
2800     break;
2801   }
2802
2803   if (osd->promote_throttle()) {
2804     dout(10) << __func__ << " promote throttled" << dendl;
2805     return false;
2806   }
2807   promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2808   return true;
2809 }
2810
2811 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2812 {
2813   auto m = op->get_req<MOSDOp>();
2814   int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2815   MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
2816                                        flags, false);
2817   request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2818   reply->set_redirect(redir);
2819   dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2820            << op << dendl;
2821   m->get_connection()->send_message(reply);
2822   return;
2823 }
2824
2825 struct C_ProxyRead : public Context {
2826   PrimaryLogPGRef pg;
2827   hobject_t oid;
2828   epoch_t last_peering_reset;
2829   ceph_tid_t tid;
2830   PrimaryLogPG::ProxyReadOpRef prdop;
2831   utime_t start;
2832   C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2833              const PrimaryLogPG::ProxyReadOpRef& prd)
2834     : pg(p), oid(o), last_peering_reset(lpr),
2835       tid(0), prdop(prd), start(ceph_clock_now())
2836   {}
2837   void finish(int r) override {
2838     if (prdop->canceled)
2839       return;
2840     std::scoped_lock locker{*pg};
2841     if (prdop->canceled) {
2842       return;
2843     }
2844     if (last_peering_reset == pg->get_last_peering_reset()) {
2845       pg->finish_proxy_read(oid, tid, r);
2846       pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2847     }
2848   }
2849 };
2850
2851 struct C_ProxyChunkRead : public Context {
2852   PrimaryLogPGRef pg;
2853   hobject_t oid;
2854   epoch_t last_peering_reset;
2855   ceph_tid_t tid;
2856   PrimaryLogPG::ProxyReadOpRef prdop;
2857   utime_t start;
2858   ObjectOperation *obj_op;
2859   int op_index = 0;
2860   uint64_t req_offset = 0;
2861   ObjectContextRef obc;
2862   uint64_t req_total_len = 0;
2863   C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2864                    const PrimaryLogPG::ProxyReadOpRef& prd)
2865     : pg(p), oid(o), last_peering_reset(lpr),
2866       tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
2867   {}
2868   void finish(int r) override {
2869     if (prdop->canceled)
2870       return;
2871     std::scoped_lock locker{*pg};
2872     if (prdop->canceled) {
2873       return;
2874     }
2875     if (last_peering_reset == pg->get_last_peering_reset()) {
2876       if (r >= 0) {
2877         if (!prdop->ops[op_index].outdata.length()) {
2878           ceph_assert(req_total_len);
2879           bufferlist list;
2880           bufferptr bptr(req_total_len);
2881           list.push_back(std::move(bptr));
2882           prdop->ops[op_index].outdata.append(list);
2883         }
2884         ceph_assert(obj_op);
2885         uint64_t copy_offset;
2886         if (req_offset >= prdop->ops[op_index].op.extent.offset) {
2887           copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
2888         } else {
2889           copy_offset = 0;
2890         }
2891         prdop->ops[op_index].outdata.begin(copy_offset).copy_in(
2892           obj_op->ops[0].outdata.length(),
2893           obj_op->ops[0].outdata.c_str());
2894       }
2895
2896       pg->finish_proxy_read(oid, tid, r);
2897       pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2898       if (obj_op) {
2899         delete obj_op;
2900       }
2901     }
2902   }
2903 };
2904
2905 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2906 {
2907   // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2908   // stash the result in the request's OSDOp vector
2909   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2910   object_locator_t oloc;
2911   hobject_t soid;
2912   /* extensible tier */
2913   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2914     switch (obc->obs.oi.manifest.type) {
2915       case object_manifest_t::TYPE_REDIRECT:
2916           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2917           soid = obc->obs.oi.manifest.redirect_target;
2918           break;
2919       default:
2920         ceph_abort_msg("unrecognized manifest type");
2921     }
2922   } else {
2923   /* proxy */
2924     soid = m->get_hobj();
2925     oloc = object_locator_t(m->get_object_locator());
2926     oloc.pool = pool.info.tier_of;
2927   }
2928   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2929
2930   // pass through some original flags that make sense.
2931   //  - leave out redirection and balancing flags since we are
2932   //    already proxying through the primary
2933   //  - leave off read/write/exec flags that are derived from the op
2934   flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2935                              CEPH_OSD_FLAG_ORDERSNAP |
2936                              CEPH_OSD_FLAG_ENFORCE_SNAPC |
2937                              CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2938
2939   dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2940
2941   ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2942
2943   ObjectOperation obj_op;
2944   obj_op.dup(prdop->ops);
2945
2946   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2947       (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2948     for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2949       ceph_osd_op op = obj_op.ops[i].op;
2950       switch (op.op) {
2951         case CEPH_OSD_OP_READ:
2952         case CEPH_OSD_OP_SYNC_READ:
2953         case CEPH_OSD_OP_SPARSE_READ:
2954         case CEPH_OSD_OP_CHECKSUM:
2955         case CEPH_OSD_OP_CMPEXT:
2956           op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2957                        ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2958       }
2959     }
2960   }
2961
2962   C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2963                                      prdop);
2964   ceph_tid_t tid = osd->objecter->read(
2965     soid.oid, oloc, obj_op,
2966     m->get_snapid(), NULL,
2967     flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
2968     &prdop->user_version,
2969     &prdop->data_offset,
2970     m->get_features());
2971   fin->tid = tid;
2972   prdop->objecter_tid = tid;
2973   proxyread_ops[tid] = prdop;
2974   in_progress_proxy_ops[soid].push_back(op);
2975 }
2976
2977 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2978 {
2979   dout(10) << __func__ << " " << oid << " tid " << tid
2980            << " " << cpp_strerror(r) << dendl;
2981
2982   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2983   if (p == proxyread_ops.end()) {
2984     dout(10) << __func__ << " no proxyread_op found" << dendl;
2985     return;
2986   }
2987   ProxyReadOpRef prdop = p->second;
2988   if (tid != prdop->objecter_tid) {
2989     dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2990              << " tid " << prdop->objecter_tid << dendl;
2991     return;
2992   }
2993   if (oid != prdop->soid) {
2994     dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2995              << " soid " << prdop->soid << dendl;
2996     return;
2997   }
2998   proxyread_ops.erase(tid);
2999
3000   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3001   if (q == in_progress_proxy_ops.end()) {
3002     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3003     return;
3004   }
3005   ceph_assert(q->second.size());
3006   list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3007                                               q->second.end(),
3008                                               prdop->op);
3009   ceph_assert(it != q->second.end());
3010   OpRequestRef op = *it;
3011   q->second.erase(it);
3012   if (q->second.size() == 0) {
3013     in_progress_proxy_ops.erase(oid);
3014   } else if (std::find(q->second.begin(),
3015                        q->second.end(),
3016                        prdop->op) != q->second.end()) {
3017     /* multiple read case */
3018     dout(20) << __func__ << " " << oid << " is not completed  " << dendl;
3019     return;
3020   }
3021
3022   osd->logger->inc(l_osd_tier_proxy_read);
3023
3024   auto m = op->get_req<MOSDOp>();
3025   OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
3026   ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3027   ctx->user_at_version = prdop->user_version;
3028   ctx->data_off = prdop->data_offset;
3029   ctx->ignore_log_op_stats = true;
3030   complete_read_ctx(r, ctx);
3031 }
3032
3033 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3034 {
3035   map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3036   if (p == in_progress_proxy_ops.end())
3037     return;
3038
3039   list<OpRequestRef>& ls = p->second;
3040   dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3041   requeue_ops(ls);
3042   in_progress_proxy_ops.erase(p);
3043 }
3044
3045 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3046                                      vector<ceph_tid_t> *tids)
3047 {
3048   dout(10) << __func__ << " " << prdop->soid << dendl;
3049   prdop->canceled = true;
3050
3051   // cancel objecter op, if we can
3052   if (prdop->objecter_tid) {
3053     tids->push_back(prdop->objecter_tid);
3054     for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3055       prdop->ops[i].outdata.clear();
3056     }
3057     proxyread_ops.erase(prdop->objecter_tid);
3058     prdop->objecter_tid = 0;
3059   }
3060 }
3061
3062 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
3063 {
3064   dout(10) << __func__ << dendl;
3065
3066   // cancel proxy reads
3067   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3068   while (p != proxyread_ops.end()) {
3069     cancel_proxy_read((p++)->second, tids);
3070   }
3071
3072   // cancel proxy writes
3073   map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3074   while (q != proxywrite_ops.end()) {
3075     cancel_proxy_write((q++)->second, tids);
3076   }
3077
3078   if (requeue) {
3079     map<hobject_t, list<OpRequestRef>>::iterator p =
3080       in_progress_proxy_ops.begin();
3081     while (p != in_progress_proxy_ops.end()) {
3082       list<OpRequestRef>& ls = p->second;
3083       dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3084                << " requests" << dendl;
3085       requeue_ops(ls);
3086       in_progress_proxy_ops.erase(p++);
3087     }
3088   } else {
3089     in_progress_proxy_ops.clear();
3090   }
3091 }
3092
3093 struct C_ProxyWrite_Commit : public Context {
3094   PrimaryLogPGRef pg;
3095   hobject_t oid;
3096   epoch_t last_peering_reset;
3097   ceph_tid_t tid;
3098   PrimaryLogPG::ProxyWriteOpRef pwop;
3099   C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3100                       const PrimaryLogPG::ProxyWriteOpRef& pw)
3101     : pg(p), oid(o), last_peering_reset(lpr),
3102       tid(0), pwop(pw)
3103   {}
3104   void finish(int r) override {
3105     if (pwop->canceled)
3106       return;
3107     std::scoped_lock locker{*pg};
3108     if (pwop->canceled) {
3109       return;
3110     }
3111     if (last_peering_reset == pg->get_last_peering_reset()) {
3112       pg->finish_proxy_write(oid, tid, r);
3113     }
3114   }
3115 };
3116
3117 void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
3118 {
3119   // NOTE: non-const because ProxyWriteOp takes a mutable ref
3120   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3121   object_locator_t oloc;
3122   SnapContext snapc(m->get_snap_seq(), m->get_snaps());
3123   hobject_t soid;
3124   /* extensible tier */
3125   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3126     switch (obc->obs.oi.manifest.type) {
3127       case object_manifest_t::TYPE_REDIRECT:
3128           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3129           soid = obc->obs.oi.manifest.redirect_target;
3130           break;
3131       default:
3132         ceph_abort_msg("unrecognized manifest type");
3133     }
3134   } else {
3135   /* proxy */
3136     soid = m->get_hobj();
3137     oloc = object_locator_t(m->get_object_locator());
3138     oloc.pool = pool.info.tier_of;
3139   }
3140
3141   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3142   if (!(op->may_write() || op->may_cache())) {
3143     flags |= CEPH_OSD_FLAG_RWORDERED;
3144   }
3145   if (op->allows_returnvec()) {
3146     flags |= CEPH_OSD_FLAG_RETURNVEC;
3147   }
3148
3149   dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3150
3151   ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3152   pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3153   pwop->mtime = m->get_mtime();
3154
3155   ObjectOperation obj_op;
3156   obj_op.dup(pwop->ops);
3157
3158   C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3159       this, soid, get_last_peering_reset(), pwop);
3160   ceph_tid_t tid = osd->objecter->mutate(
3161     soid.oid, oloc, obj_op, snapc,
3162     ceph::real_clock::from_ceph_timespec(pwop->mtime),
3163     flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3164     &pwop->user_version, pwop->reqid);
3165   fin->tid = tid;
3166   pwop->objecter_tid = tid;
3167   proxywrite_ops[tid] = pwop;
3168   in_progress_proxy_ops[soid].push_back(op);
3169 }
3170
3171 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3172                                        ObjectContextRef obc, bool write_ordered)
3173 {
3174   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3175   OSDOp *osd_op = NULL;
3176   for (unsigned int i = 0; i < m->ops.size(); i++) {
3177     osd_op = &m->ops[i];
3178     uint64_t cursor = osd_op->op.extent.offset;
3179     uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3180     uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3181     object_manifest_t *manifest = &obc->obs.oi.manifest;
3182     map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3183
3184     while (cursor < op_length) {
3185       chunk_index = 0;
3186       chunk_length = 0;
3187       /* find the right chunk position for cursor */
3188       for (auto &p : manifest->chunk_map) {
3189         if (p.first <= cursor && p.first + p.second.length > cursor) {
3190           chunk_length = p.second.length;
3191           chunk_index = p.first;
3192           break;
3193         }
3194       }
3195       /* no index */
3196       if (!chunk_index && !chunk_length) {
3197         if (cursor == osd_op->op.extent.offset) {
3198           OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3199           ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3200           ctx->data_off = osd_op->op.extent.offset;
3201           ctx->ignore_log_op_stats = true;
3202           complete_read_ctx(0, ctx);
3203         }
3204         break;
3205       }
3206       uint64_t next_length = chunk_length;
3207       /* the size to read -> | op length | */
3208       /*                     |   a chunk   | */
3209       if (cursor + next_length > op_length) {
3210         next_length = op_length - cursor;
3211       }
3212       /* the size to read -> |   op length   | */
3213       /*                     |   a chunk | */
3214       if (cursor + next_length > chunk_index + chunk_length) {
3215         next_length = chunk_index + chunk_length - cursor;
3216       }
3217
3218       chunk_read[cursor] = {{chunk_index, next_length}};
3219       cursor += next_length;
3220     }
3221
3222     req_len = cursor - osd_op->op.extent.offset;
3223     for (auto &p : chunk_read) {
3224       auto chunks = p.second.begin();
3225       dout(20) << __func__ << " chunk_index: " << chunks->first
3226               << " next_length: " << chunks->second << " cursor: "
3227               << p.first << dendl;
3228       do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3229     }
3230   }
3231 }
3232
3233 struct RefCountCallback : public Context {
3234 public:
3235   PrimaryLogPG::OpContext *ctx;
3236   OSDOp& osd_op;
3237   bool requeue = false;
3238
3239   RefCountCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
3240     : ctx(ctx), osd_op(osd_op) {}
3241   void finish(int r) override {
3242     // NB: caller must already have pg->lock held
3243     ctx->obc->stop_block();
3244     ctx->pg->kick_object_context_blocked(ctx->obc);
3245     if (r >= 0) {
3246       osd_op.rval = 0;
3247       ctx->pg->execute_ctx(ctx);
3248     } else {
3249        // on cancel simply toss op out,
3250        // or requeue as requested
3251       if (r != -ECANCELED) {
3252         if (ctx->op)
3253           ctx->pg->osd->reply_op_error(ctx->op, r);
3254       } else if (requeue) {
3255         if (ctx->op)
3256           ctx->pg->requeue_op(ctx->op);
3257       }
3258       ctx->pg->close_op_ctx(ctx);
3259     }
3260   }
3261   void set_requeue(bool rq) {
3262     requeue = rq;
3263   }
3264 };
3265
3266 struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3267   OSDOp& osd_op;
3268
3269   explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3270   }
3271
3272   int execute() override {
3273     return osd_op.rval;
3274   }
3275 };
3276
3277 struct C_SetManifestRefCountDone : public Context {
3278   PrimaryLogPGRef pg;
3279   PrimaryLogPG::ManifestOpRef mop;
3280   hobject_t soid;
3281   C_SetManifestRefCountDone(PrimaryLogPG *p,
3282     PrimaryLogPG::ManifestOpRef mop, hobject_t soid) :
3283     pg(p), mop(mop), soid(soid) {}
3284   void finish(int r) override {
3285     if (r == -ECANCELED)
3286       return;
3287     std::scoped_lock locker{*pg};
3288     auto it = pg->manifest_ops.find(soid);
3289     if (it == pg->manifest_ops.end()) {
3290       // raced with cancel_manifest_ops
3291       return;
3292     }
3293     if (it->second->cb) {
3294       it->second->cb->complete(r);
3295     }
3296     pg->manifest_ops.erase(it);
3297     mop.reset();
3298   }
3299 };
3300
3301 struct C_SetDedupChunks : public Context {
3302   PrimaryLogPGRef pg;
3303   hobject_t oid;
3304   epoch_t last_peering_reset;
3305   ceph_tid_t tid;
3306   uint64_t offset;
3307
3308   C_SetDedupChunks(PrimaryLogPG *p, hobject_t o, epoch_t lpr, uint64_t offset)
3309     : pg(p), oid(o), last_peering_reset(lpr),
3310       tid(0), offset(offset)
3311   {}
3312   void finish(int r) override {
3313     if (r == -ECANCELED)
3314       return;
3315     std::scoped_lock locker{*pg};
3316     if (last_peering_reset != pg->get_last_peering_reset()) {
3317       return;
3318     }
3319     pg->finish_set_dedup(oid, r, tid, offset);
3320   }
3321 };
3322
3323 void PrimaryLogPG::cancel_manifest_ops(bool requeue, vector<ceph_tid_t> *tids)
3324 {
3325   dout(10) << __func__ << dendl;
3326   auto p = manifest_ops.begin();
3327   while (p != manifest_ops.end()) {
3328     auto mop = p->second;
3329     // cancel objecter op, if we can
3330     if (mop->objecter_tid) {
3331       tids->push_back(mop->objecter_tid);
3332       mop->objecter_tid = 0;
3333     }
3334     if (mop->cb) {
3335       mop->cb->set_requeue(requeue);
3336       mop->cb->complete(-ECANCELED);
3337     }
3338     manifest_ops.erase(p++);
3339   }
3340 }
3341
3342 int PrimaryLogPG::get_manifest_ref_count(ObjectContextRef obc, std::string& fp_oid)
3343 {
3344   int cnt = 0;
3345   // head
3346   for (auto &p : obc->obs.oi.manifest.chunk_map) {
3347     if (p.second.oid.oid.name == fp_oid) {
3348       cnt++;
3349     }
3350   }
3351   // snap
3352   SnapSet& ss = obc->ssc->snapset;
3353   const OSDMapRef& osdmap = get_osdmap();
3354   for (vector<snapid_t>::const_reverse_iterator p = ss.clones.rbegin();
3355       p != ss.clones.rend();
3356       ++p) {
3357     object_ref_delta_t refs;
3358     ObjectContextRef obc_l = nullptr;
3359     ObjectContextRef obc_g = nullptr;
3360     hobject_t clone_oid = obc->obs.oi.soid;
3361     clone_oid.snap = *p;
3362     if (osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
3363       return -EBUSY;
3364     }
3365     ObjectContextRef clone_obc = get_object_context(clone_oid, false);
3366     if (!clone_obc) {
3367       break;
3368     }
3369     get_adjacent_clones(clone_obc, obc_l, obc_g);
3370     clone_obc->obs.oi.manifest.calc_refs_to_inc_on_set(
3371       obc_g ? &(obc_g->obs.oi.manifest) : nullptr ,
3372       nullptr,
3373       refs);
3374     for (auto p = refs.begin(); p != refs.end(); ++p) {
3375       if (p->first.oid.name == fp_oid && p->second > 0) {
3376         cnt += p->second;
3377       }
3378     }
3379   }
3380
3381   return cnt;
3382 }
3383
3384 bool PrimaryLogPG::recover_adjacent_clones(ObjectContextRef obc, OpRequestRef op)
3385 {
3386   if (!obc->obs.oi.manifest.is_chunked() || !obc->ssc || !obc->ssc->snapset.clones.size()) {
3387     return false;
3388   }
3389
3390   const SnapSet& snapset = obc->ssc->snapset;
3391   auto s = std::find(snapset.clones.begin(), snapset.clones.end(), obc->obs.oi.soid.snap);
3392   auto is_unreadable_snap = [this, obc, &snapset, op](auto iter) -> bool {
3393     hobject_t cid = obc->obs.oi.soid;
3394     cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
3395     if (is_unreadable_object(cid)) {
3396       dout(10) << __func__ << ": clone " << cid
3397                << " is unreadable, waiting" << dendl;
3398       wait_for_unreadable_object(cid, op);
3399       return true;
3400     }
3401     return false;
3402   };
3403   if (s != snapset.clones.begin()) {
3404     if (is_unreadable_snap(s - 1)) {
3405       return true;
3406     }
3407   }
3408   if (s != snapset.clones.end()) {
3409     if (is_unreadable_snap(s + 1)) {
3410       return true;
3411     }
3412   }
3413   return false;
3414 }
3415
3416 ObjectContextRef PrimaryLogPG::get_prev_clone_obc(ObjectContextRef obc)
3417 {
3418   auto s = std::find(obc->ssc->snapset.clones.begin(), obc->ssc->snapset.clones.end(),
3419                     obc->obs.oi.soid.snap);
3420   if (s != obc->ssc->snapset.clones.begin()) {
3421     auto s_iter = s - 1;
3422     hobject_t cid = obc->obs.oi.soid;
3423     object_ref_delta_t refs;
3424     cid.snap = *s_iter;
3425     ObjectContextRef cobc = get_object_context(cid, false, NULL);
3426     ceph_assert(cobc);
3427     return cobc;
3428   }
3429   return nullptr;
3430 }
3431
3432 void PrimaryLogPG::dec_refcount(const hobject_t& soid, const object_ref_delta_t& refs)
3433 {
3434   for (auto p = refs.begin(); p != refs.end(); ++p) {
3435     int dec_ref_count = p->second;
3436     ceph_assert(dec_ref_count < 0);
3437     while (dec_ref_count < 0) {
3438       dout(10) << __func__ << ": decrement reference on offset oid: " << p->first << dendl;
3439       refcount_manifest(soid, p->first,
3440                         refcount_t::DECREMENT_REF, NULL, std::nullopt);
3441       dec_ref_count++;
3442     }
3443   }
3444 }
3445
3446
3447 void PrimaryLogPG::get_adjacent_clones(ObjectContextRef src_obc,
3448                                        ObjectContextRef& _l, ObjectContextRef& _g)
3449 {
3450   const SnapSet& snapset = src_obc->ssc->snapset;
3451   const object_info_t& oi = src_obc->obs.oi;
3452
3453   auto get_context = [this, &oi, &snapset](auto iter)
3454     -> ObjectContextRef {
3455     hobject_t cid = oi.soid;
3456     cid.snap = (iter == snapset.clones.end()) ? snapid_t(CEPH_NOSNAP) : *iter;
3457     ObjectContextRef obc = get_object_context(cid, false, NULL);
3458     ceph_assert(obc);
3459     return obc;
3460   };
3461
3462   // check adjacent clones
3463   auto s = std::find(snapset.clones.begin(), snapset.clones.end(), oi.soid.snap);
3464
3465   // We *must* find the clone iff it's not head,
3466   // let s == snapset.clones.end() mean head
3467   ceph_assert((s == snapset.clones.end()) == oi.soid.is_head());
3468
3469   if (s != snapset.clones.begin()) {
3470     _l = get_context(s - 1);
3471   }
3472
3473   if (s != snapset.clones.end()) {
3474     _g = get_context(s + 1);
3475   }
3476 }
3477
3478 bool PrimaryLogPG::inc_refcount_by_set(OpContext* ctx, object_manifest_t& set_chunk,
3479                                        OSDOp& osd_op)
3480 {
3481   object_ref_delta_t refs;
3482   ObjectContextRef obc_l, obc_g;
3483   get_adjacent_clones(ctx->obc, obc_l, obc_g);
3484   set_chunk.calc_refs_to_inc_on_set(
3485     obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
3486     obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3487     refs);
3488   if (!refs.is_empty()) {
3489     /* This is called by set-chunk, so we only consider a single chunk for the time being */
3490     ceph_assert(refs.size() == 1);
3491     auto p = refs.begin();
3492     int inc_ref_count = p->second;
3493     if (inc_ref_count > 0) {
3494       /*
3495        * In set-chunk case, the first thing we should do is to increment
3496        * the reference the targe object has prior to update object_manifest in object_info_t.
3497        * So, call directly refcount_manifest.
3498        */
3499       ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
3500       C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, ctx->obs->oi.soid);
3501       ceph_tid_t tid = refcount_manifest(ctx->obs->oi.soid, p->first,
3502                                           refcount_t::INCREMENT_REF, fin, std::nullopt);
3503       mop->objecter_tid = tid;
3504       manifest_ops[ctx->obs->oi.soid] = mop;
3505       ctx->obc->start_block();
3506       return true;
3507     } else if (inc_ref_count < 0) {
3508       hobject_t src = ctx->obs->oi.soid;
3509       hobject_t tgt = p->first;
3510       ctx->register_on_commit(
3511           [src, tgt, this](){
3512             refcount_manifest(src, tgt, refcount_t::DECREMENT_REF, NULL, std::nullopt);
3513           });
3514       return false;
3515     }
3516   }
3517
3518   return false;
3519 }
3520
3521 void PrimaryLogPG::dec_refcount_by_dirty(OpContext* ctx)
3522 {
3523   object_ref_delta_t refs;
3524   ObjectContextRef cobc = nullptr;
3525   ObjectContextRef obc = ctx->obc;
3526   for (auto &p : ctx->obs->oi.manifest.chunk_map) {
3527     if (!ctx->clean_regions.is_clean_region(p.first, p.second.length)) {
3528       ctx->new_obs.oi.manifest.chunk_map.erase(p.first);
3529       if (ctx->new_obs.oi.manifest.chunk_map.empty()) {
3530         ctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
3531         ctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
3532         ctx->delta_stats.num_objects_manifest--;
3533       }
3534     }
3535   }
3536   // Look over previous snapshot, then figure out whether updated chunk needs to be deleted
3537   cobc = get_prev_clone_obc(obc);
3538   obc->obs.oi.manifest.calc_refs_to_drop_on_modify(
3539     cobc ? &cobc->obs.oi.manifest : nullptr,
3540     ctx->clean_regions,
3541     refs);
3542   if (!refs.is_empty()) {
3543     hobject_t soid = obc->obs.oi.soid;
3544     ctx->register_on_commit(
3545       [soid, this, refs](){
3546         dec_refcount(soid, refs);
3547       });
3548   }
3549 }
3550
3551 void PrimaryLogPG::dec_all_refcount_manifest(const object_info_t& oi, OpContext* ctx)
3552 {
3553   ceph_assert(oi.has_manifest());
3554   ceph_assert(ctx->obc->ssc);
3555
3556   if (oi.manifest.is_chunked()) {
3557     object_ref_delta_t refs;
3558     ObjectContextRef obc_l, obc_g;
3559     get_adjacent_clones(ctx->obc, obc_l, obc_g);
3560     oi.manifest.calc_refs_to_drop_on_removal(
3561       obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
3562       obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
3563       refs);
3564
3565     if (!refs.is_empty()) {
3566       hobject_t soid = ctx->obc->obs.oi.soid;
3567       ctx->register_on_commit(
3568         [soid, this, refs](){
3569           dec_refcount(soid, refs);
3570         });
3571     }
3572   } else if (oi.manifest.is_redirect() &&
3573              oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
3574     ctx->register_on_commit(
3575       [oi, this](){
3576         refcount_manifest(oi.soid, oi.manifest.redirect_target,
3577                           refcount_t::DECREMENT_REF, NULL, std::nullopt);
3578       });
3579   }
3580 }
3581
3582 ceph_tid_t PrimaryLogPG::refcount_manifest(hobject_t src_soid, hobject_t tgt_soid, refcount_t type,
3583                                      Context *cb, std::optional<bufferlist> chunk)
3584 {
3585   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3586                    CEPH_OSD_FLAG_RWORDERED;
3587
3588   dout(10) << __func__ << " Start refcount from " << src_soid
3589            << " to " << tgt_soid << dendl;
3590
3591   ObjectOperation obj_op;
3592   bufferlist in;
3593   if (type == refcount_t::INCREMENT_REF) {
3594     cls_cas_chunk_get_ref_op call;
3595     call.source = src_soid.get_head();
3596     ::encode(call, in);
3597     obj_op.call("cas", "chunk_get_ref", in);
3598   } else if (type == refcount_t::DECREMENT_REF) {
3599     cls_cas_chunk_put_ref_op call;
3600     call.source = src_soid.get_head();
3601     ::encode(call, in);
3602     obj_op.call("cas", "chunk_put_ref", in);
3603   } else if (type == refcount_t::CREATE_OR_GET_REF) {
3604     cls_cas_chunk_create_or_get_ref_op get_call;
3605     get_call.source = src_soid.get_head();
3606     ceph_assert(chunk);
3607     get_call.data = move(*chunk);
3608     ::encode(get_call, in);
3609     obj_op.call("cas", "chunk_create_or_get_ref", in);
3610   } else {
3611     ceph_assert(0 == "unrecognized type");
3612   }
3613
3614   Context *c = nullptr;
3615   if (cb) {
3616     c = new C_OnFinisher(cb, osd->get_objecter_finisher(get_pg_shard()));
3617   }
3618
3619   object_locator_t oloc(tgt_soid);
3620   ObjectContextRef src_obc = get_object_context(src_soid, false, NULL);
3621   ceph_assert(src_obc);
3622   auto tid = osd->objecter->mutate(
3623     tgt_soid.oid, oloc, obj_op, SnapContext(),
3624     ceph::real_clock::from_ceph_timespec(src_obc->obs.oi.mtime),
3625     flags, c);
3626   return tid;
3627 }
3628
3629 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3630                                          uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3631                                          uint64_t req_total_len, bool write_ordered)
3632 {
3633   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3634   object_manifest_t *manifest = &obc->obs.oi.manifest;
3635   if (!manifest->chunk_map.count(chunk_index)) {
3636     return;
3637   }
3638   uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3639   hobject_t soid = manifest->chunk_map[chunk_index].oid;
3640   hobject_t ori_soid = m->get_hobj();
3641   object_locator_t oloc(soid);
3642   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3643   if (write_ordered) {
3644     flags |= CEPH_OSD_FLAG_RWORDERED;
3645   }
3646
3647   if (!chunk_length || soid == hobject_t()) {
3648     return;
3649   }
3650
3651   /* same as do_proxy_read() */
3652   flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3653                              CEPH_OSD_FLAG_ORDERSNAP |
3654                              CEPH_OSD_FLAG_ENFORCE_SNAPC |
3655                              CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3656
3657   dout(10) << __func__ << " Start do chunk proxy read for " << *m
3658            << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3659            << " req_length: " << req_length << dendl;
3660
3661   ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3662
3663   ObjectOperation *pobj_op = new ObjectOperation;
3664   OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3665
3666   if (chunk_index <= req_offset) {
3667     osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3668   } else {
3669     ceph_abort_msg("chunk_index > req_offset");
3670   }
3671   osd_op.op.extent.length = req_length;
3672
3673   ObjectOperation obj_op;
3674   obj_op.dup(pobj_op->ops);
3675
3676   C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3677                                                prdop);
3678   fin->obj_op = pobj_op;
3679   fin->op_index = op_index;
3680   fin->req_offset = req_offset;
3681   fin->obc = obc;
3682   fin->req_total_len = req_total_len;
3683
3684   ceph_tid_t tid = osd->objecter->read(
3685     soid.oid, oloc, obj_op,
3686     m->get_snapid(), NULL,
3687     flags, new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
3688     &prdop->user_version,
3689     &prdop->data_offset,
3690     m->get_features());
3691   fin->tid = tid;
3692   prdop->objecter_tid = tid;
3693   proxyread_ops[tid] = prdop;
3694   in_progress_proxy_ops[ori_soid].push_back(op);
3695 }
3696
3697 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3698 {
3699   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3700   OSDOp *osd_op = NULL;
3701   bool ret = true;
3702   for (unsigned int i = 0; i < m->ops.size(); i++) {
3703     osd_op = &m->ops[i];
3704     ceph_osd_op op = osd_op->op;
3705     switch (op.op) {
3706       case CEPH_OSD_OP_READ:
3707       case CEPH_OSD_OP_SYNC_READ: {
3708         uint64_t cursor = osd_op->op.extent.offset;
3709         uint64_t remain = osd_op->op.extent.length;
3710
3711         /* requested chunks exist in chunk_map ? */
3712         for (auto &p : obc->obs.oi.manifest.chunk_map) {
3713           if (p.first <= cursor && p.first + p.second.length > cursor) {
3714             if (!p.second.is_missing()) {
3715               return false;
3716             }
3717             if (p.second.length >= remain) {
3718               remain = 0;
3719               break;
3720             } else {
3721               remain = remain - p.second.length;
3722             }
3723             cursor += p.second.length;
3724           }
3725         }
3726
3727         if (remain) {
3728           dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3729           return false;
3730         }
3731         continue;
3732       }
3733       default:
3734         return false;
3735     }
3736   }
3737   return ret;
3738 }
3739
3740 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3741 {
3742   dout(10) << __func__ << " " << oid << " tid " << tid
3743            << " " << cpp_strerror(r) << dendl;
3744
3745   map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3746   if (p == proxywrite_ops.end()) {
3747     dout(10) << __func__ << " no proxywrite_op found" << dendl;
3748     return;
3749   }
3750   ProxyWriteOpRef pwop = p->second;
3751   ceph_assert(tid == pwop->objecter_tid);
3752   ceph_assert(oid == pwop->soid);
3753
3754   proxywrite_ops.erase(tid);
3755
3756   map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3757   if (q == in_progress_proxy_ops.end()) {
3758     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3759     delete pwop->ctx;
3760     pwop->ctx = NULL;
3761     return;
3762   }
3763   list<OpRequestRef>& in_progress_op = q->second;
3764   ceph_assert(in_progress_op.size());
3765   list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3766                                               in_progress_op.end(),
3767                                               pwop->op);
3768   ceph_assert(it != in_progress_op.end());
3769   in_progress_op.erase(it);
3770   if (in_progress_op.size() == 0) {
3771     in_progress_proxy_ops.erase(oid);
3772   } else if (std::find(in_progress_op.begin(),
3773                         in_progress_op.end(),
3774                         pwop->op) != in_progress_op.end()) {
3775     if (pwop->ctx)
3776       delete pwop->ctx;
3777     pwop->ctx = NULL;
3778     dout(20) << __func__ << " " << oid << " tid " << tid
3779             << " in_progress_op size: "
3780             << in_progress_op.size() << dendl;
3781     return;
3782   }
3783
3784   osd->logger->inc(l_osd_tier_proxy_write);
3785
3786   auto m = pwop->op->get_req<MOSDOp>();
3787   ceph_assert(m != NULL);
3788
3789   if (!pwop->sent_reply) {
3790     // send commit.
3791     assert(pwop->ctx->reply == nullptr);
3792     MOSDOpReply *reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0,
3793                                          true /* we claim it below */);
3794     reply->set_reply_versions(eversion_t(), pwop->user_version);
3795     reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3796     reply->claim_op_out_data(pwop->ops);
3797     dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3798     osd->send_message_osd_client(reply, m->get_connection());
3799     pwop->sent_reply = true;
3800     pwop->ctx->op->mark_commit_sent();
3801   }
3802
3803   delete pwop->ctx;
3804   pwop->ctx = NULL;
3805 }
3806
3807 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3808                                       vector<ceph_tid_t> *tids)
3809 {
3810   dout(10) << __func__ << " " << pwop->soid << dendl;
3811   pwop->canceled = true;
3812
3813   // cancel objecter op, if we can
3814   if (pwop->objecter_tid) {
3815     tids->push_back(pwop->objecter_tid);
3816     delete pwop->ctx;
3817     pwop->ctx = NULL;
3818     proxywrite_ops.erase(pwop->objecter_tid);
3819     pwop->objecter_tid = 0;
3820   }
3821 }
3822
3823 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3824   ObjectContextRef obc;
3825   PrimaryLogPG *pg;
3826   utime_t start;
3827 public:
3828   PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3829     : obc(obc_),
3830       pg(pg_),
3831       start(ceph_clock_now()) {}
3832
3833   void finish(PrimaryLogPG::CopyCallbackResults results) override {
3834     PrimaryLogPG::CopyResults *results_data = results.get<1>();
3835     int r = results.get<0>();
3836     pg->finish_promote(r, results_data, obc);
3837     pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3838   }
3839 };
3840
3841 class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3842   ObjectContextRef obc;
3843   PrimaryLogPG *pg;
3844   utime_t start;
3845   PrimaryLogPG::OpContext *ctx;
3846   PrimaryLogPG::CopyCallbackResults promote_results;
3847 public:
3848   PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
3849     : obc(obc_),
3850       pg(pg_),
3851       start(ceph_clock_now()), ctx(ctx) {}
3852
3853   void finish(PrimaryLogPG::CopyCallbackResults results) override {
3854     PrimaryLogPG::CopyResults *results_data = results.get<1>();
3855     int r = results.get<0>();
3856     if (ctx) {
3857       promote_results = results;
3858       pg->execute_ctx(ctx);
3859     } else {
3860       pg->finish_promote_manifest(r, results_data, obc);
3861     }
3862     pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3863   }
3864   friend struct PromoteFinisher;
3865 };
3866
3867 struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
3868   PromoteManifestCallback *promote_callback;
3869
3870   explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
3871     : promote_callback(promote_callback) {
3872   }
3873
3874   int execute() override {
3875     if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
3876       promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
3877                                                 promote_callback->promote_results.get<1>(),
3878                                                 promote_callback->obc);
3879     } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
3880       promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
3881                                                 promote_callback->promote_results.get<1>(),
3882                                                 promote_callback->obc);
3883     } else {
3884       ceph_abort_msg("unrecognized manifest type");
3885     }
3886     return 0;
3887   }
3888 };
3889
3890 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3891                                   const hobject_t& missing_oid,
3892                                   const object_locator_t& oloc,
3893                                   OpRequestRef op,
3894                                   ObjectContextRef *promote_obc)
3895 {
3896   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3897   ceph_assert(hoid != hobject_t());
3898   if (m_scrubber->write_blocked_by_scrub(hoid)) {
3899     dout(10) << __func__ << " " << hoid
3900              << " blocked by scrub" << dendl;
3901     if (op) {
3902       waiting_for_scrub.push_back(op);
3903       op->mark_delayed("waiting for scrub");
3904       dout(10) << __func__ << " " << hoid
3905                << " placing op in waiting_for_scrub" << dendl;
3906     } else {
3907       dout(10) << __func__ << " " << hoid
3908                << " no op, dropping on the floor" << dendl;
3909     }
3910     return;
3911   }
3912   if (op && !check_laggy_requeue(op)) {
3913     return;
3914   }
3915   if (!obc) { // we need to create an ObjectContext
3916     ceph_assert(missing_oid != hobject_t());
3917     obc = get_object_context(missing_oid, true);
3918   }
3919   if (promote_obc)
3920     *promote_obc = obc;
3921
3922   /*
3923    * Before promote complete, if there are  proxy-reads for the object,
3924    * for this case we don't use DONTNEED.
3925    */
3926   unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3927   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3928   if (q == in_progress_proxy_ops.end()) {
3929     src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3930   }
3931
3932   CopyCallback *cb;
3933   object_locator_t my_oloc;
3934   hobject_t src_hoid;
3935   if (!obc->obs.oi.has_manifest()) {
3936     my_oloc = oloc;
3937     my_oloc.pool = pool.info.tier_of;
3938     src_hoid = obc->obs.oi.soid;
3939     cb = new PromoteCallback(obc, this);
3940   } else {
3941     if (obc->obs.oi.manifest.is_chunked()) {
3942       src_hoid = obc->obs.oi.soid;
3943       cb = new PromoteManifestCallback(obc, this);
3944     } else if (obc->obs.oi.manifest.is_redirect()) {
3945       object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
3946       my_oloc = src_oloc;
3947       src_hoid = obc->obs.oi.manifest.redirect_target;
3948       cb = new PromoteCallback(obc, this);
3949     } else {
3950       ceph_abort_msg("unrecognized manifest type");
3951     }
3952   }
3953
3954   unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3955                    CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3956                    CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3957                    CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3958   start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
3959              obc->obs.oi.soid.snap == CEPH_NOSNAP,
3960              src_fadvise_flags, 0);
3961
3962   ceph_assert(obc->is_blocked());
3963
3964   if (op)
3965     wait_for_blocked_object(obc->obs.oi.soid, op);
3966
3967   recovery_state.update_stats(
3968     [](auto &history, auto &stats) {
3969       stats.stats.sum.num_promote++;
3970       return false;
3971     });
3972 }
3973
3974 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3975 {
3976   FUNCTRACE(cct);
3977   dout(10) << __func__ << " " << ctx << dendl;
3978   ctx->reset_obs(ctx->obc);
3979   ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3980   OpRequestRef op = ctx->op;
3981   auto m = op->get_req<MOSDOp>();
3982   ObjectContextRef obc = ctx->obc;
3983   const hobject_t& soid = obc->obs.oi.soid;
3984
3985   // this method must be idempotent since we may call it several times
3986   // before we finally apply the resulting transaction.
3987   ctx->op_t.reset(new PGTransaction);
3988
3989   if (op->may_write() || op->may_cache()) {
3990     // snap
3991     if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3992         pool.info.is_pool_snaps_mode()) {
3993       // use pool's snapc
3994       ctx->snapc = pool.snapc;
3995     } else {
3996       // client specified snapc
3997       ctx->snapc.seq = m->get_snap_seq();
3998       ctx->snapc.snaps = m->get_snaps();
3999       filter_snapc(ctx->snapc.snaps);
4000     }
4001     if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
4002         ctx->snapc.seq < obc->ssc->snapset.seq) {
4003       dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
4004                << " < snapset seq " << obc->ssc->snapset.seq
4005                << " on " << obc->obs.oi.soid << dendl;
4006       reply_ctx(ctx, -EOLDSNAPC);
4007       return;
4008     }
4009
4010     // version
4011     ctx->at_version = get_next_version();
4012     ctx->mtime = m->get_mtime();
4013
4014     dout(10) << __func__ << " " << soid << " " << *ctx->ops
4015              << " ov " << obc->obs.oi.version << " av " << ctx->at_version
4016              << " snapc " << ctx->snapc
4017              << " snapset " << obc->ssc->snapset
4018              << dendl;
4019   } else {
4020     dout(10) << __func__ << " " << soid << " " << *ctx->ops
4021              << " ov " << obc->obs.oi.version
4022              << dendl;
4023   }
4024
4025   if (!ctx->user_at_version)
4026     ctx->user_at_version = obc->obs.oi.user_version;
4027   dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
4028
4029   {
4030 #ifdef WITH_LTTNG
4031     osd_reqid_t reqid = ctx->op->get_reqid();
4032 #endif
4033     tracepoint(osd, prepare_tx_enter, reqid.name._type,
4034         reqid.name._num, reqid.tid, reqid.inc);
4035   }
4036 #ifdef HAVE_JAEGER
4037   if (ctx->op->osd_parent_span) {
4038     auto execute_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
4039   }
4040 #endif
4041
4042   int result = prepare_transaction(ctx);
4043
4044   {
4045 #ifdef WITH_LTTNG
4046     osd_reqid_t reqid = ctx->op->get_reqid();
4047 #endif
4048     tracepoint(osd, prepare_tx_exit, reqid.name._type,
4049         reqid.name._num, reqid.tid, reqid.inc);
4050   }
4051
4052   bool pending_async_reads = !ctx->pending_async_reads.empty();
4053   if (result == -EINPROGRESS || pending_async_reads) {
4054     // come back later.
4055     if (pending_async_reads) {
4056       ceph_assert(pool.info.is_erasure());
4057       in_progress_async_reads.push_back(make_pair(op, ctx));
4058       ctx->start_async_reads(this);
4059     }
4060     return;
4061   }
4062
4063   if (result == -EAGAIN) {
4064     // clean up after the ctx
4065     close_op_ctx(ctx);
4066     return;
4067   }
4068
4069   bool ignore_out_data = false;
4070   if (!ctx->op_t->empty() &&
4071       op->may_write() &&
4072       result >= 0) {
4073     // successful update
4074     if (ctx->op->allows_returnvec()) {
4075       // enforce reasonable bound on the return buffer sizes
4076       for (auto& i : *ctx->ops) {
4077         if (i.outdata.length() > cct->_conf->osd_max_write_op_reply_len) {
4078           dout(10) << __func__ << " op " << i << " outdata overflow" << dendl;
4079           result = -EOVERFLOW;  // overall result is overflow
4080           i.rval = -EOVERFLOW;
4081           i.outdata.clear();
4082         }
4083       }
4084     } else {
4085       // legacy behavior -- zero result and return data etc.
4086       ignore_out_data = true;
4087       result = 0;
4088     }
4089   }
4090
4091   // prepare the reply
4092   ctx->reply = new MOSDOpReply(m, result, get_osdmap_epoch(), 0,
4093                                ignore_out_data);
4094   dout(20) << __func__ << " alloc reply " << ctx->reply
4095            << " result " << result << dendl;
4096
4097   // read or error?
4098   if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
4099     // finish side-effects
4100     if (result >= 0)
4101       do_osd_op_effects(ctx, m->get_connection());
4102
4103     complete_read_ctx(result, ctx);
4104     return;
4105   }
4106
4107   ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
4108
4109   ceph_assert(op->may_write() || op->may_cache());
4110
4111   // trim log?
4112   recovery_state.update_trim_to();
4113
4114   // verify that we are doing this in order?
4115   if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
4116       !pool.info.is_tier() && !pool.info.has_tiers()) {
4117     map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
4118     ceph_tid_t t = m->get_tid();
4119     client_t n = m->get_source().num();
4120     map<client_t,ceph_tid_t>::iterator p = cm.find(n);
4121     if (p == cm.end()) {
4122       dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
4123       cm[n] = t;
4124     } else {
4125       dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
4126       if (p->second > t) {
4127         derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
4128         ceph_abort_msg("out of order op");
4129       }
4130       p->second = t;
4131     }
4132   }
4133
4134   if (ctx->update_log_only) {
4135     if (result >= 0)
4136       do_osd_op_effects(ctx, m->get_connection());
4137
4138     dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
4139     // save just what we need from ctx
4140     MOSDOpReply *reply = ctx->reply;
4141     ctx->reply = nullptr;
4142     reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
4143
4144     if (result == -ENOENT) {
4145       reply->set_enoent_reply_versions(info.last_update,
4146                                        info.last_user_version);
4147     }
4148     reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4149     // append to pg log for dup detection - don't save buffers for now
4150     record_write_error(op, soid, reply, result,
4151                        ctx->op->allows_returnvec() ? ctx : nullptr);
4152     close_op_ctx(ctx);
4153     return;
4154   }
4155
4156   // no need to capture PG ref, repop cancel will handle that
4157   // Can capture the ctx by pointer, it's owned by the repop
4158   ctx->register_on_commit(
4159     [m, ctx, this](){
4160       if (ctx->op)
4161         log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4162
4163       if (m && !ctx->sent_reply) {
4164         MOSDOpReply *reply = ctx->reply;
4165         ctx->reply = nullptr;
4166         reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4167         dout(10) << " sending reply on " << *m << " " << reply << dendl;
4168         osd->send_message_osd_client(reply, m->get_connection());
4169         ctx->sent_reply = true;
4170         ctx->op->mark_commit_sent();
4171       }
4172     });
4173   ctx->register_on_success(
4174     [ctx, this]() {
4175       do_osd_op_effects(
4176         ctx,
4177         ctx->op ? ctx->op->get_req()->get_connection() :
4178         ConnectionRef());
4179     });
4180   ctx->register_on_finish(
4181     [ctx]() {
4182       delete ctx;
4183     });
4184
4185   // issue replica writes
4186   ceph_tid_t rep_tid = osd->get_tid();
4187
4188   RepGather *repop = new_repop(ctx, obc, rep_tid);
4189
4190   issue_repop(repop, ctx);
4191   eval_repop(repop);
4192   repop->put();
4193 }
4194
4195 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4196   release_object_locks(ctx->lock_manager);
4197
4198   ctx->op_t.reset();
4199
4200   for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4201        ctx->on_finish.erase(p++)) {
4202     (*p)();
4203   }
4204   delete ctx;
4205 }
4206
4207 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4208 {
4209   if (ctx->op)
4210     osd->reply_op_error(ctx->op, r);
4211   close_op_ctx(ctx);
4212 }
4213
4214 void PrimaryLogPG::log_op_stats(const OpRequest& op,
4215                                 const uint64_t inb,
4216                                 const uint64_t outb)
4217 {
4218   auto m = op.get_req<MOSDOp>();
4219   const utime_t now = ceph_clock_now();
4220
4221   const utime_t latency = now - m->get_recv_stamp();
4222   const utime_t process_latency = now - op.get_dequeued_time();
4223
4224   osd->logger->inc(l_osd_op);
4225
4226   osd->logger->inc(l_osd_op_outb, outb);
4227   osd->logger->inc(l_osd_op_inb, inb);
4228   osd->logger->tinc(l_osd_op_lat, latency);
4229   osd->logger->tinc(l_osd_op_process_lat, process_latency);
4230
4231   if (op.may_read() && op.may_write()) {
4232     osd->logger->inc(l_osd_op_rw);
4233     osd->logger->inc(l_osd_op_rw_inb, inb);
4234     osd->logger->inc(l_osd_op_rw_outb, outb);
4235     osd->logger->tinc(l_osd_op_rw_lat, latency);
4236     osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4237     osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4238     osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4239   } else if (op.may_read()) {
4240     osd->logger->inc(l_osd_op_r);
4241     osd->logger->inc(l_osd_op_r_outb, outb);
4242     osd->logger->tinc(l_osd_op_r_lat, latency);
4243     osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4244     osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4245   } else if (op.may_write() || op.may_cache()) {
4246     osd->logger->inc(l_osd_op_w);
4247     osd->logger->inc(l_osd_op_w_inb, inb);
4248     osd->logger->tinc(l_osd_op_w_lat, latency);
4249     osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4250     osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4251   } else {
4252     ceph_abort();
4253   }
4254
4255   dout(15) << "log_op_stats " << *m
4256            << " inb " << inb
4257            << " outb " << outb
4258            << " lat " << latency << dendl;
4259
4260   if (m_dynamic_perf_stats.is_enabled()) {
4261     m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4262   }
4263 }
4264
4265 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4266     const std::list<OSDPerfMetricQuery> &queries)
4267 {
4268   m_dynamic_perf_stats.set_queries(queries);
4269 }
4270
4271 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4272 {
4273   std::swap(m_dynamic_perf_stats, *stats);
4274 }
4275
4276 void PrimaryLogPG::do_scan(
4277   OpRequestRef op,
4278   ThreadPool::TPHandle &handle)
4279 {
4280   auto m = op->get_req<MOSDPGScan>();
4281   ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4282   dout(10) << "do_scan " << *m << dendl;
4283
4284   op->mark_started();
4285
4286   switch (m->op) {
4287   case MOSDPGScan::OP_SCAN_GET_DIGEST:
4288     {
4289       auto dpp = get_dpp();
4290       if (osd->check_backfill_full(dpp)) {
4291         dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4292         queue_peering_event(
4293           PGPeeringEventRef(
4294             std::make_shared<PGPeeringEvent>(
4295               get_osdmap_epoch(),
4296               get_osdmap_epoch(),
4297               PeeringState::BackfillTooFull())));
4298         return;
4299       }
4300
4301       BackfillInterval bi;
4302       bi.begin = m->begin;
4303       // No need to flush, there won't be any in progress writes occuring
4304       // past m->begin
4305       scan_range(
4306         cct->_conf->osd_backfill_scan_min,
4307         cct->_conf->osd_backfill_scan_max,
4308         &bi,
4309         handle);
4310       MOSDPGScan *reply = new MOSDPGScan(
4311         MOSDPGScan::OP_SCAN_DIGEST,
4312         pg_whoami,
4313         get_osdmap_epoch(), m->query_epoch,
4314         spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4315       encode(bi.objects, reply->get_data());
4316       osd->send_message_osd_cluster(reply, m->get_connection());
4317     }
4318     break;
4319
4320   case MOSDPGScan::OP_SCAN_DIGEST:
4321     {
4322       pg_shard_t from = m->from;
4323
4324       // Check that from is in backfill_targets vector
4325       ceph_assert(is_backfill_target(from));
4326
4327       BackfillInterval& bi = peer_backfill_info[from];
4328       bi.begin = m->begin;
4329       bi.end = m->end;
4330       auto p = m->get_data().cbegin();
4331
4332       // take care to preserve ordering!
4333       bi.clear_objects();
4334       decode_noclear(bi.objects, p);
4335       dout(10) << __func__ << " bi.begin=" << bi.begin << " bi.end=" << bi.end
4336                << " bi.objects.size()=" << bi.objects.size() << dendl;
4337
4338       if (waiting_on_backfill.erase(from)) {
4339         if (waiting_on_backfill.empty()) {
4340           ceph_assert(
4341             peer_backfill_info.size() ==
4342             get_backfill_targets().size());
4343           finish_recovery_op(hobject_t::get_max());
4344         }
4345       } else {
4346         // we canceled backfill for a while due to a too full, and this
4347         // is an extra response from a non-too-full peer
4348         dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4349       }
4350     }
4351     break;
4352   }
4353 }
4354
4355 void PrimaryLogPG::do_backfill(OpRequestRef op)
4356 {
4357   auto m = op->get_req<MOSDPGBackfill>();
4358   ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4359   dout(10) << "do_backfill " << *m << dendl;
4360
4361   op->mark_started();
4362
4363   switch (m->op) {
4364   case MOSDPGBackfill::OP_BACKFILL_FINISH:
4365     {
4366       ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4367
4368       MOSDPGBackfill *reply = new MOSDPGBackfill(
4369         MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4370         get_osdmap_epoch(),
4371         m->query_epoch,
4372         spg_t(info.pgid.pgid, get_primary().shard));
4373       reply->set_priority(get_recovery_op_priority());
4374       osd->send_message_osd_cluster(reply, m->get_connection());
4375       queue_peering_event(
4376         PGPeeringEventRef(
4377           std::make_shared<PGPeeringEvent>(
4378             get_osdmap_epoch(),
4379             get_osdmap_epoch(),
4380             RecoveryDone())));
4381     }
4382     // fall-thru
4383
4384   case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4385     {
4386       ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4387
4388       ObjectStore::Transaction t;
4389       recovery_state.update_backfill_progress(
4390         m->last_backfill,
4391         m->stats,
4392         m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS,
4393         t);
4394
4395       int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4396       ceph_assert(tr == 0);
4397     }
4398     break;
4399
4400   case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4401     {
4402       ceph_assert(is_primary());
4403       ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4404       finish_recovery_op(hobject_t::get_max());
4405     }
4406     break;
4407   }
4408 }
4409
4410 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4411 {
4412   const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4413     op->get_req());
4414   ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4415   dout(7) << __func__ << " " << m->ls << dendl;
4416
4417   op->mark_started();
4418
4419   ObjectStore::Transaction t;
4420   for (auto& p : m->ls) {
4421     if (is_remote_backfilling()) {
4422       struct stat st;
4423       int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4424                                pg_whoami.shard) , &st);
4425       if (r == 0) {
4426         sub_local_num_bytes(st.st_size);
4427         int64_t usersize;
4428         if (pool.info.is_erasure()) {
4429           bufferlist bv;
4430           int r = osd->store->getattr(
4431               ch,
4432               ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4433               OI_ATTR,
4434               bv);
4435           if (r >= 0) {
4436             object_info_t oi(bv);
4437             usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4438           } else {
4439             dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4440                     << " can't get object info" << dendl;
4441             usersize = 0;
4442           }
4443         } else {
4444           usersize = st.st_size;
4445         }
4446         sub_num_bytes(usersize);
4447         dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4448                  << " sub actual data by " << st.st_size
4449                  << " sub num_bytes by " << usersize
4450                  << dendl;
4451       }
4452     }
4453     remove_snap_mapped_object(t, p.first);
4454   }
4455   int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4456   ceph_assert(r == 0);
4457 }
4458
4459 int PrimaryLogPG::trim_object(
4460   bool first, const hobject_t &coid, snapid_t snap_to_trim,
4461   PrimaryLogPG::OpContextUPtr *ctxp)
4462 {
4463   *ctxp = NULL;
4464
4465   // load clone info
4466   bufferlist bl;
4467   ObjectContextRef obc = get_object_context(coid, false, NULL);
4468   if (!obc || !obc->ssc || !obc->ssc->exists) {
4469     osd->clog->error() << __func__ << ": Can not trim " << coid
4470       << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4471     return -ENOENT;
4472   }
4473
4474   hobject_t head_oid = coid.get_head();
4475   ObjectContextRef head_obc = get_object_context(head_oid, false);
4476   if (!head_obc) {
4477     osd->clog->error() << __func__ << ": Can not trim " << coid
4478       << " repair needed, no snapset obc for " << head_oid;
4479     return -ENOENT;
4480   }
4481
4482   SnapSet& snapset = obc->ssc->snapset;
4483
4484   object_info_t &coi = obc->obs.oi;
4485   auto citer = snapset.clone_snaps.find(coid.snap);
4486   if (citer == snapset.clone_snaps.end()) {
4487     osd->clog->error() << "No clone_snaps in snapset " << snapset
4488                        << " for object " << coid << "\n";
4489     return -ENOENT;
4490   }
4491   set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4492   if (old_snaps.empty()) {
4493     osd->clog->error() << "No object info snaps for object " << coid;
4494     return -ENOENT;
4495   }
4496
4497   dout(10) << coid << " old_snaps " << old_snaps
4498            << " old snapset " << snapset << dendl;
4499   if (snapset.seq == 0) {
4500     osd->clog->error() << "No snapset.seq for object " << coid;
4501     return -ENOENT;
4502   }
4503
4504   set<snapid_t> new_snaps;
4505   const OSDMapRef& osdmap = get_osdmap();
4506   for (set<snapid_t>::iterator i = old_snaps.begin();
4507        i != old_snaps.end();
4508        ++i) {
4509     if (!osdmap->in_removed_snaps_queue(info.pgid.pgid.pool(), *i) &&
4510         *i != snap_to_trim) {
4511       new_snaps.insert(*i);
4512     }
4513   }
4514
4515   vector<snapid_t>::iterator p = snapset.clones.end();
4516
4517   if (new_snaps.empty()) {
4518     p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4519     if (p == snapset.clones.end()) {
4520       osd->clog->error() << "Snap " << coid.snap << " not in clones";
4521       return -ENOENT;
4522     }
4523   }
4524
4525   OpContextUPtr ctx = simple_opc_create(obc);
4526   ctx->head_obc = head_obc;
4527
4528   if (!ctx->lock_manager.get_snaptrimmer_write(
4529         coid,
4530         obc,
4531         first)) {
4532     close_op_ctx(ctx.release());
4533     dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
4534     return -ENOLCK;
4535   }
4536
4537   if (!ctx->lock_manager.get_snaptrimmer_write(
4538         head_oid,
4539         head_obc,
4540         first)) {
4541     close_op_ctx(ctx.release());
4542     dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4543     return -ENOLCK;
4544   }
4545
4546   ctx->at_version = get_next_version();
4547
4548   PGTransaction *t = ctx->op_t.get();
4549
4550   if (new_snaps.empty()) {
4551     // remove clone
4552     dout(10) << coid << " snaps " << old_snaps << " -> "
4553              << new_snaps << " ... deleting" << dendl;
4554
4555     // ...from snapset
4556     ceph_assert(p != snapset.clones.end());
4557
4558     snapid_t last = coid.snap;
4559     ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4560
4561     if (p != snapset.clones.begin()) {
4562       // not the oldest... merge overlap into next older clone
4563       vector<snapid_t>::iterator n = p - 1;
4564       hobject_t prev_coid = coid;
4565       prev_coid.snap = *n;
4566       bool adjust_prev_bytes = is_present_clone(prev_coid);
4567
4568       if (adjust_prev_bytes)
4569         ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4570
4571       snapset.clone_overlap[*n].intersection_of(
4572         snapset.clone_overlap[*p]);
4573
4574       if (adjust_prev_bytes)
4575         ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4576     }
4577     ctx->delta_stats.num_objects--;
4578     if (coi.is_dirty())
4579       ctx->delta_stats.num_objects_dirty--;
4580     if (coi.is_omap())
4581       ctx->delta_stats.num_objects_omap--;
4582     if (coi.is_whiteout()) {
4583       dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4584       ctx->delta_stats.num_whiteouts--;
4585     }
4586     ctx->delta_stats.num_object_clones--;
4587     if (coi.is_cache_pinned())
4588       ctx->delta_stats.num_objects_pinned--;
4589     if (coi.has_manifest()) {
4590       dec_all_refcount_manifest(coi, ctx.get());
4591       ctx->delta_stats.num_objects_manifest--;
4592     }
4593     obc->obs.exists = false;
4594
4595     snapset.clones.erase(p);
4596     snapset.clone_overlap.erase(last);
4597     snapset.clone_size.erase(last);
4598     snapset.clone_snaps.erase(last);
4599
4600     ctx->log.push_back(
4601       pg_log_entry_t(
4602         pg_log_entry_t::DELETE,
4603         coid,
4604         ctx->at_version,
4605         ctx->obs->oi.version,
4606         0,
4607         osd_reqid_t(),
4608         ctx->mtime,
4609         0)
4610       );
4611     t->remove(coid);
4612     t->update_snaps(
4613       coid,
4614       old_snaps,
4615       new_snaps);
4616
4617     coi = object_info_t(coid);
4618
4619     ctx->at_version.version++;
4620   } else {
4621     // save adjusted snaps for this object
4622     dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4623     snapset.clone_snaps[coid.snap] =
4624       vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4625     // we still do a 'modify' event on this object just to trigger a
4626     // snapmapper.update ... :(
4627
4628     coi.prior_version = coi.version;
4629     coi.version = ctx->at_version;
4630     bl.clear();
4631     encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4632     t->setattr(coid, OI_ATTR, bl);
4633
4634     ctx->log.push_back(
4635       pg_log_entry_t(
4636         pg_log_entry_t::MODIFY,
4637         coid,
4638         coi.version,
4639         coi.prior_version,
4640         0,
4641         osd_reqid_t(),
4642         ctx->mtime,
4643         0)
4644       );
4645     ctx->at_version.version++;
4646
4647     t->update_snaps(
4648       coid,
4649       old_snaps,
4650       new_snaps);
4651   }
4652
4653   // save head snapset
4654   dout(10) << coid << " new snapset " << snapset << " on "
4655            << head_obc->obs.oi << dendl;
4656   if (snapset.clones.empty() &&
4657       (head_obc->obs.oi.is_whiteout() &&
4658        !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4659        !head_obc->obs.oi.is_cache_pinned())) {
4660     // NOTE: this arguably constitutes minor interference with the
4661     // tiering agent if this is a cache tier since a snap trim event
4662     // is effectively evicting a whiteout we might otherwise want to
4663     // keep around.
4664     dout(10) << coid << " removing " << head_oid << dendl;
4665     ctx->log.push_back(
4666       pg_log_entry_t(
4667         pg_log_entry_t::DELETE,
4668         head_oid,
4669         ctx->at_version,
4670         head_obc->obs.oi.version,
4671         0,
4672         osd_reqid_t(),
4673         ctx->mtime,
4674         0)
4675       );
4676     dout(10) << "removing snap head" << dendl;
4677     object_info_t& oi = head_obc->obs.oi;
4678     ctx->delta_stats.num_objects--;
4679     if (oi.is_dirty()) {
4680       ctx->delta_stats.num_objects_dirty--;
4681     }
4682     if (oi.is_omap())
4683       ctx->delta_stats.num_objects_omap--;
4684     if (oi.is_whiteout()) {
4685       dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4686       ctx->delta_stats.num_whiteouts--;
4687     }
4688     if (oi.is_cache_pinned()) {
4689       ctx->delta_stats.num_objects_pinned--;
4690     }
4691     if (oi.has_manifest()) {
4692       ctx->delta_stats.num_objects_manifest--;
4693       dec_all_refcount_manifest(oi, ctx.get());
4694     }
4695     head_obc->obs.exists = false;
4696     head_obc->obs.oi = object_info_t(head_oid);
4697     t->remove(head_oid);
4698   } else {
4699     if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
4700       // filter SnapSet::snaps for the benefit of pre-octopus
4701       // peers. This is perhaps overly conservative in that I'm not
4702       // certain they need this, but let's be conservative here.
4703       dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4704       snapset.filter(pool.info);
4705     } else {
4706       snapset.snaps.clear();
4707     }
4708     dout(10) << coid << " writing updated snapset on " << head_oid
4709              << ", snapset is " << snapset << dendl;
4710     ctx->log.push_back(
4711       pg_log_entry_t(
4712         pg_log_entry_t::MODIFY,
4713         head_oid,
4714         ctx->at_version,
4715         head_obc->obs.oi.version,
4716         0,
4717         osd_reqid_t(),
4718         ctx->mtime,
4719         0)
4720       );
4721
4722     head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4723     head_obc->obs.oi.version = ctx->at_version;
4724
4725     map <string, bufferlist> attrs;
4726     bl.clear();
4727     encode(snapset, bl);
4728     attrs[SS_ATTR] = std::move(bl);
4729
4730     bl.clear();
4731     encode(head_obc->obs.oi, bl,
4732              get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4733     attrs[OI_ATTR] = std::move(bl);
4734     t->setattrs(head_oid, attrs);
4735   }
4736
4737   *ctxp = std::move(ctx);
4738   return 0;
4739 }
4740
4741 void PrimaryLogPG::kick_snap_trim()
4742 {
4743   ceph_assert(is_active());
4744   ceph_assert(is_primary());
4745   if (is_clean() &&
4746       !state_test(PG_STATE_PREMERGE) &&
4747       !snap_trimq.empty()) {
4748     if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4749       dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4750     } else {
4751       dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4752       snap_trimmer_machine.process_event(KickTrim());
4753     }
4754   }
4755 }
4756
4757 void PrimaryLogPG::snap_trimmer_scrub_complete()
4758 {
4759   if (is_primary() && is_active() && is_clean()) {
4760     ceph_assert(!snap_trimq.empty());
4761     snap_trimmer_machine.process_event(ScrubComplete());
4762   }
4763 }
4764
4765 void PrimaryLogPG::snap_trimmer(epoch_t queued)
4766 {
4767   if (recovery_state.is_deleting() || pg_has_reset_since(queued)) {
4768     return;
4769   }
4770
4771   ceph_assert(is_primary());
4772
4773   dout(10) << "snap_trimmer posting" << dendl;
4774   snap_trimmer_machine.process_event(DoSnapWork());
4775   dout(10) << "snap_trimmer complete" << dendl;
4776   return;
4777 }
4778
4779 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4780 {
4781   __u64 v2;
4782
4783   string v2s(xattr.c_str(), xattr.length());
4784   if (v2s.length())
4785     v2 = strtoull(v2s.c_str(), NULL, 10);
4786   else
4787     v2 = 0;
4788
4789   dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4790
4791   switch (op) {
4792   case CEPH_OSD_CMPXATTR_OP_EQ:
4793     return (v1 == v2);
4794   case CEPH_OSD_CMPXATTR_OP_NE:
4795     return (v1 != v2);
4796   case CEPH_OSD_CMPXATTR_OP_GT:
4797     return (v1 > v2);
4798   case CEPH_OSD_CMPXATTR_OP_GTE:
4799     return (v1 >= v2);
4800   case CEPH_OSD_CMPXATTR_OP_LT:
4801     return (v1 < v2);
4802   case CEPH_OSD_CMPXATTR_OP_LTE:
4803     return (v1 <= v2);
4804   default:
4805     return -EINVAL;
4806   }
4807 }
4808
4809 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4810 {
4811   string v2s(xattr.c_str(), xattr.length());
4812
4813   dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4814
4815   switch (op) {
4816   case CEPH_OSD_CMPXATTR_OP_EQ:
4817     return (v1s.compare(v2s) == 0);
4818   case CEPH_OSD_CMPXATTR_OP_NE:
4819     return (v1s.compare(v2s) != 0);
4820   case CEPH_OSD_CMPXATTR_OP_GT:
4821     return (v1s.compare(v2s) > 0);
4822   case CEPH_OSD_CMPXATTR_OP_GTE:
4823     return (v1s.compare(v2s) >= 0);
4824   case CEPH_OSD_CMPXATTR_OP_LT:
4825     return (v1s.compare(v2s) < 0);
4826   case CEPH_OSD_CMPXATTR_OP_LTE:
4827     return (v1s.compare(v2s) <= 0);
4828   default:
4829     return -EINVAL;
4830   }
4831 }
4832
4833 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4834 {
4835   ceph_osd_op& op = osd_op.op;
4836   vector<OSDOp> write_ops(1);
4837   OSDOp& write_op = write_ops[0];
4838   uint64_t write_length = op.writesame.length;
4839   int result = 0;
4840
4841   if (!write_length)
4842     return 0;
4843
4844   if (!op.writesame.data_length || write_length % op.writesame.data_length)
4845     return -EINVAL;
4846
4847   if (op.writesame.data_length != osd_op.indata.length()) {
4848     derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4849     return -EINVAL;
4850   }
4851
4852   while (write_length) {
4853     write_op.indata.append(osd_op.indata);
4854     write_length -= op.writesame.data_length;
4855   }
4856
4857   write_op.op.op = CEPH_OSD_OP_WRITE;
4858   write_op.op.extent.offset = op.writesame.offset;
4859   write_op.op.extent.length = op.writesame.length;
4860   result = do_osd_ops(ctx, write_ops);
4861   if (result < 0)
4862     derr << "do_writesame do_osd_ops failed " << result << dendl;
4863
4864   return result;
4865 }
4866
4867 // ========================================================================
4868 // low level osd ops
4869
4870 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4871 {
4872   dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4873   bufferlist header, vals;
4874   int r = _get_tmap(ctx, &header, &vals);
4875   if (r < 0) {
4876     if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4877       r = 0;
4878     return r;
4879   }
4880
4881   vector<OSDOp> ops(3);
4882
4883   ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4884   ops[0].op.extent.offset = 0;
4885   ops[0].op.extent.length = 0;
4886
4887   ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4888   ops[1].indata = std::move(header);
4889
4890   ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4891   ops[2].indata = std::move(vals);
4892
4893   return do_osd_ops(ctx, ops);
4894 }
4895
4896 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
4897                                  OSDOp& osd_op, bufferlist& bl)
4898 {
4899   // decode
4900   bufferlist header;
4901   map<string, bufferlist> m;
4902   if (bl.length()) {
4903     auto p = bl.cbegin();
4904     decode(header, p);
4905     decode(m, p);
4906     ceph_assert(p.end());
4907   }
4908
4909   // do the update(s)
4910   while (!bp.end()) {
4911     __u8 op;
4912     string key;
4913     decode(op, bp);
4914
4915     switch (op) {
4916     case CEPH_OSD_TMAP_SET: // insert key
4917       {
4918         decode(key, bp);
4919         bufferlist data;
4920         decode(data, bp);
4921         m[key] = data;
4922       }
4923       break;
4924     case CEPH_OSD_TMAP_RM: // remove key
4925       decode(key, bp);
4926       if (!m.count(key)) {
4927         return -ENOENT;
4928       }
4929       m.erase(key);
4930       break;
4931     case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4932       decode(key, bp);
4933       m.erase(key);
4934       break;
4935     case CEPH_OSD_TMAP_HDR: // update header
4936       {
4937         decode(header, bp);
4938       }
4939       break;
4940     default:
4941       return -EINVAL;
4942     }
4943   }
4944
4945   // reencode
4946   bufferlist obl;
4947   encode(header, obl);
4948   encode(m, obl);
4949
4950   // write it out
4951   vector<OSDOp> nops(1);
4952   OSDOp& newop = nops[0];
4953   newop.op.op = CEPH_OSD_OP_WRITEFULL;
4954   newop.op.extent.offset = 0;
4955   newop.op.extent.length = obl.length();
4956   newop.indata = obl;
4957   do_osd_ops(ctx, nops);
4958   return 0;
4959 }
4960
4961 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
4962 {
4963   bufferlist::const_iterator orig_bp = bp;
4964   int result = 0;
4965   if (bp.end()) {
4966     dout(10) << "tmapup is a no-op" << dendl;
4967   } else {
4968     // read the whole object
4969     vector<OSDOp> nops(1);
4970     OSDOp& newop = nops[0];
4971     newop.op.op = CEPH_OSD_OP_READ;
4972     newop.op.extent.offset = 0;
4973     newop.op.extent.length = 0;
4974     result = do_osd_ops(ctx, nops);
4975
4976     dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4977
4978     dout(30) << " starting is \n";
4979     newop.outdata.hexdump(*_dout);
4980     *_dout << dendl;
4981
4982     auto ip = newop.outdata.cbegin();
4983     bufferlist obl;
4984
4985     dout(30) << "the update command is: \n";
4986     osd_op.indata.hexdump(*_dout);
4987     *_dout << dendl;
4988
4989     // header
4990     bufferlist header;
4991     __u32 nkeys = 0;
4992     if (newop.outdata.length()) {
4993       decode(header, ip);
4994       decode(nkeys, ip);
4995     }
4996     dout(10) << "tmapup header " << header.length() << dendl;
4997
4998     if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4999       ++bp;
5000       decode(header, bp);
5001       dout(10) << "tmapup new header " << header.length() << dendl;
5002     }
5003
5004     encode(header, obl);
5005
5006     dout(20) << "tmapup initial nkeys " << nkeys << dendl;
5007
5008     // update keys
5009     bufferlist newkeydata;
5010     string nextkey, last_in_key;
5011     bufferlist nextval;
5012     bool have_next = false;
5013     if (!ip.end()) {
5014       have_next = true;
5015       decode(nextkey, ip);
5016       decode(nextval, ip);
5017     }
5018     while (!bp.end() && !result) {
5019       __u8 op;
5020       string key;
5021       try {
5022         decode(op, bp);
5023         decode(key, bp);
5024       }
5025       catch (ceph::buffer::error& e) {
5026         return -EINVAL;
5027       }
5028       if (key < last_in_key) {
5029         dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
5030                 << "', falling back to an inefficient (unsorted) update" << dendl;
5031         bp = orig_bp;
5032         return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
5033       }
5034       last_in_key = key;
5035
5036       dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
5037
5038       // skip existing intervening keys
5039       bool key_exists = false;
5040       while (have_next && !key_exists) {
5041         dout(20) << "  (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
5042         if (nextkey > key)
5043           break;
5044         if (nextkey < key) {
5045           // copy untouched.
5046           encode(nextkey, newkeydata);
5047           encode(nextval, newkeydata);
5048           dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
5049         } else {
5050           // don't copy; discard old value.  and stop.
5051           dout(20) << "  drop " << nextkey << " " << nextval.length() << dendl;
5052           key_exists = true;
5053           nkeys--;
5054         }
5055         if (!ip.end()) {
5056           decode(nextkey, ip);
5057           decode(nextval, ip);
5058         } else {
5059           have_next = false;
5060         }
5061       }
5062
5063       if (op == CEPH_OSD_TMAP_SET) {
5064         bufferlist val;
5065         try {
5066           decode(val, bp);
5067         }
5068         catch (ceph::buffer::error& e) {
5069           return -EINVAL;
5070         }
5071         encode(key, newkeydata);
5072         encode(val, newkeydata);
5073         dout(20) << "   set " << key << " " << val.length() << dendl;
5074         nkeys++;
5075       } else if (op == CEPH_OSD_TMAP_CREATE) {
5076         if (key_exists) {
5077           return -EEXIST;
5078         }
5079         bufferlist val;
5080         try {
5081           decode(val, bp);
5082         }
5083         catch (ceph::buffer::error& e) {
5084           return -EINVAL;
5085         }
5086         encode(key, newkeydata);
5087         encode(val, newkeydata);
5088         dout(20) << "   create " << key << " " << val.length() << dendl;
5089         nkeys++;
5090       } else if (op == CEPH_OSD_TMAP_RM) {
5091         // do nothing.
5092         if (!key_exists) {
5093           return -ENOENT;
5094         }
5095       } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
5096         // do nothing
5097       } else {
5098         dout(10) << "  invalid tmap op " << (int)op << dendl;
5099         return -EINVAL;
5100       }
5101     }
5102
5103     // copy remaining
5104     if (have_next) {
5105       encode(nextkey, newkeydata);
5106       encode(nextval, newkeydata);
5107       dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
5108     }
5109     if (!ip.end()) {
5110       bufferlist rest;
5111       rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
5112       dout(20) << "  keep trailing " << rest.length()
5113                << " at " << newkeydata.length() << dendl;
5114       newkeydata.claim_append(rest);
5115     }
5116
5117     // encode final key count + key data
5118     dout(20) << "tmapup final nkeys " << nkeys << dendl;
5119     encode(nkeys, obl);
5120     obl.claim_append(newkeydata);
5121
5122     if (0) {
5123       dout(30) << " final is \n";
5124       obl.hexdump(*_dout);
5125       *_dout << dendl;
5126
5127       // sanity check
5128       auto tp = obl.cbegin();
5129       bufferlist h;
5130       decode(h, tp);
5131       map<string,bufferlist> d;
5132       decode(d, tp);
5133       ceph_assert(tp.end());
5134       dout(0) << " **** debug sanity check, looks ok ****" << dendl;
5135     }
5136
5137     // write it out
5138     if (!result) {
5139       dout(20) << "tmapput write " << obl.length() << dendl;
5140       newop.op.op = CEPH_OSD_OP_WRITEFULL;
5141       newop.op.extent.offset = 0;
5142       newop.op.extent.length = obl.length();
5143       newop.indata = obl;
5144       do_osd_ops(ctx, nops);
5145     }
5146   }
5147   return result;
5148 }
5149
5150 static int check_offset_and_length(uint64_t offset, uint64_t length,
5151   uint64_t max, DoutPrefixProvider *dpp)
5152 {
5153   if (offset >= max ||
5154       length > max ||
5155       offset + length > max) {
5156     ldpp_dout(dpp, 10) << __func__ << " "
5157       << "osd_max_object_size: " << max
5158       << "; Hard limit of object size is 4GB." << dendl;
5159     return -EFBIG;
5160   }
5161
5162   return 0;
5163 }
5164
5165 struct FillInVerifyExtent : public Context {
5166   ceph_le64 *r;
5167   int32_t *rval;
5168   bufferlist *outdatap;
5169   std::optional<uint32_t> maybe_crc;
5170   uint64_t size;
5171   OSDService *osd;
5172   hobject_t soid;
5173   uint32_t flags;
5174   FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5175                      std::optional<uint32_t> mc, uint64_t size,
5176                      OSDService *osd, hobject_t soid, uint32_t flags) :
5177     r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5178     size(size), osd(osd), soid(soid), flags(flags) {}
5179   void finish(int len) override {
5180     *r = len;
5181     if (len < 0) {
5182       *rval = len;
5183       return;
5184     }
5185     *rval = 0;
5186
5187     // whole object?  can we verify the checksum?
5188     if (maybe_crc && *r == size) {
5189       uint32_t crc = outdatap->crc32c(-1);
5190       if (maybe_crc != crc) {
5191         osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5192                            << " != expected 0x" << *maybe_crc
5193                            << std::dec << " on " << soid;
5194         if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5195           *rval = -EIO;
5196           *r = 0;
5197         }
5198       }
5199     }
5200   }
5201 };
5202
5203 struct ToSparseReadResult : public Context {
5204   int* result;
5205   bufferlist* data_bl;
5206   uint64_t data_offset;
5207   ceph_le64* len;
5208   ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5209                      ceph_le64* len)
5210     : result(result), data_bl(bl), data_offset(offset),len(len) {}
5211   void finish(int r) override {
5212     if (r < 0) {
5213       *result = r;
5214       return;
5215     }
5216     *result = 0;
5217     *len = r;
5218     bufferlist outdata;
5219     map<uint64_t, uint64_t> extents = {{data_offset, r}};
5220     encode(extents, outdata);
5221     encode_destructively(*data_bl, outdata);
5222     data_bl->swap(outdata);
5223   }
5224 };
5225
5226 template<typename V>
5227 static string list_keys(const map<string, V>& m) {
5228   string s;
5229   for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5230     if (!s.empty()) {
5231       s.push_back(',');
5232     }
5233     s.append(itr->first);
5234   }
5235   return s;
5236 }
5237
5238 template<typename T>
5239 static string list_entries(const T& m) {
5240   string s;
5241   for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5242     if (!s.empty()) {
5243       s.push_back(',');
5244     }
5245     s.append(*itr);
5246   }
5247   return s;
5248 }
5249
5250 void PrimaryLogPG::maybe_create_new_object(
5251   OpContext *ctx,
5252   bool ignore_transaction)
5253 {
5254   ObjectState& obs = ctx->new_obs;
5255   if (!obs.exists) {
5256     ctx->delta_stats.num_objects++;
5257     obs.exists = true;
5258     ceph_assert(!obs.oi.is_whiteout());
5259     obs.oi.new_object();
5260     if (!ignore_transaction)
5261       ctx->op_t->create(obs.oi.soid);
5262   } else if (obs.oi.is_whiteout()) {
5263     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5264     ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5265     --ctx->delta_stats.num_whiteouts;
5266   }
5267 }
5268
5269 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5270   OSDOp& osd_op;
5271
5272   explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5273   }
5274
5275   int execute() override {
5276     return osd_op.rval;
5277   }
5278 };
5279
5280 struct C_ChecksumRead : public Context {
5281   PrimaryLogPG *primary_log_pg;
5282   OSDOp &osd_op;
5283   Checksummer::CSumType csum_type;
5284   bufferlist init_value_bl;
5285   ceph_le64 read_length;
5286   bufferlist read_bl;
5287   Context *fill_extent_ctx;
5288
5289   C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5290                  Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5291                  std::optional<uint32_t> maybe_crc, uint64_t size,
5292                  OSDService *osd, hobject_t soid, uint32_t flags)
5293     : primary_log_pg(primary_log_pg), osd_op(osd_op),
5294       csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5295       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5296                                              &read_bl, maybe_crc, size,
5297                                              osd, soid, flags)) {
5298   }
5299   ~C_ChecksumRead() override {
5300     delete fill_extent_ctx;
5301   }
5302
5303   void finish(int r) override {
5304     fill_extent_ctx->complete(r);
5305     fill_extent_ctx = nullptr;
5306
5307     if (osd_op.rval >= 0) {
5308       bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5309       osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5310                                                     &init_value_bl_it, read_bl);
5311     }
5312   }
5313 };
5314
5315 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
5316                               bufferlist::const_iterator *bl_it)
5317 {
5318   dout(20) << __func__ << dendl;
5319
5320   auto& op = osd_op.op;
5321   if (op.checksum.chunk_size > 0) {
5322     if (op.checksum.length == 0) {
5323       dout(10) << __func__ << ": length required when chunk size provided"
5324                << dendl;
5325       return -EINVAL;
5326     }
5327     if (op.checksum.length % op.checksum.chunk_size != 0) {
5328       dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5329       return -EINVAL;
5330     }
5331   }
5332
5333   auto& oi = ctx->new_obs.oi;
5334   if (op.checksum.offset == 0 && op.checksum.length == 0) {
5335     // zeroed offset+length implies checksum whole object
5336     op.checksum.length = oi.size;
5337   } else if (op.checksum.offset >= oi.size) {
5338     // read size was trimmed to zero, do nothing
5339     // see PrimaryLogPG::do_read
5340     return 0;
5341   } else if (op.extent.offset + op.extent.length > oi.size) {
5342     op.extent.length = oi.size - op.extent.offset;
5343     if (op.checksum.chunk_size > 0 &&
5344         op.checksum.length % op.checksum.chunk_size != 0) {
5345       dout(10) << __func__ << ": length (trimmed to 0x"
5346                << std::hex << op.checksum.length
5347                << ") not aligned to chunk size 0x"
5348                << op.checksum.chunk_size << std::dec
5349                << dendl;
5350       return -EINVAL;
5351     }
5352   }
5353
5354   Checksummer::CSumType csum_type;
5355   switch (op.checksum.type) {
5356   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5357     csum_type = Checksummer::CSUM_XXHASH32;
5358     break;
5359   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5360     csum_type = Checksummer::CSUM_XXHASH64;
5361     break;
5362   case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5363     csum_type = Checksummer::CSUM_CRC32C;
5364     break;
5365   default:
5366     dout(10) << __func__ << ": unknown crc type ("
5367              << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5368     return -EINVAL;
5369   }
5370
5371   size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5372   if (bl_it->get_remaining() < csum_init_value_size) {
5373     dout(10) << __func__ << ": init value not provided" << dendl;
5374     return -EINVAL;
5375   }
5376
5377   bufferlist init_value_bl;
5378   init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5379                           csum_init_value_size);
5380   *bl_it += csum_init_value_size;
5381
5382   if (pool.info.is_erasure() && op.checksum.length > 0) {
5383     // If there is a data digest and it is possible we are reading
5384     // entire object, pass the digest.
5385     std::optional<uint32_t> maybe_crc;
5386     if (oi.is_data_digest() && op.checksum.offset == 0 &&
5387         op.checksum.length >= oi.size) {
5388       maybe_crc = oi.data_digest;
5389     }
5390
5391     // async read
5392     auto& soid = oi.soid;
5393     auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5394                                            std::move(init_value_bl), maybe_crc,
5395                                            oi.size, osd, soid, op.flags);
5396
5397     ctx->pending_async_reads.push_back({
5398       {op.checksum.offset, op.checksum.length, op.flags},
5399       {&checksum_ctx->read_bl, checksum_ctx}});
5400
5401     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5402     ctx->op_finishers[ctx->current_osd_subop_num].reset(
5403       new ReadFinisher(osd_op));
5404     return -EINPROGRESS;
5405   }
5406
5407   // sync read
5408   std::vector<OSDOp> read_ops(1);
5409   auto& read_op = read_ops[0];
5410   if (op.checksum.length > 0) {
5411     read_op.op.op = CEPH_OSD_OP_READ;
5412     read_op.op.flags = op.flags;
5413     read_op.op.extent.offset = op.checksum.offset;
5414     read_op.op.extent.length = op.checksum.length;
5415     read_op.op.extent.truncate_size = 0;
5416     read_op.op.extent.truncate_seq = 0;
5417
5418     int r = do_osd_ops(ctx, read_ops);
5419     if (r < 0) {
5420       derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5421       return r;
5422     }
5423   }
5424
5425   bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5426   return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5427                          read_op.outdata);
5428 }
5429
5430 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5431                                   Checksummer::CSumType csum_type,
5432                                   bufferlist::const_iterator *init_value_bl_it,
5433                                   const bufferlist &read_bl) {
5434   dout(20) << __func__ << dendl;
5435
5436   auto& op = osd_op.op;
5437
5438   if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5439     derr << __func__ << ": bytes read " << read_bl.length() << " != "
5440          << op.checksum.length << dendl;
5441     return -EINVAL;
5442   }
5443
5444   size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5445                               op.checksum.chunk_size : read_bl.length());
5446   uint32_t csum_count = (csum_chunk_size > 0 ?
5447                            read_bl.length() / csum_chunk_size : 0);
5448
5449   bufferlist csum;
5450   bufferptr csum_data;
5451   if (csum_count > 0) {
5452     size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
5453     csum_data = ceph::buffer::create(csum_value_size * csum_count);
5454     csum_data.zero();
5455     csum.append(csum_data);
5456
5457     switch (csum_type) {
5458     case Checksummer::CSUM_XXHASH32:
5459       {
5460         Checksummer::xxhash32::init_value_t init_value;
5461         decode(init_value, *init_value_bl_it);
5462         Checksummer::calculate<Checksummer::xxhash32>(
5463           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5464           &csum_data);
5465       }
5466       break;
5467     case Checksummer::CSUM_XXHASH64:
5468       {
5469         Checksummer::xxhash64::init_value_t init_value;
5470         decode(init_value, *init_value_bl_it);
5471         Checksummer::calculate<Checksummer::xxhash64>(
5472           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5473           &csum_data);
5474       }
5475       break;
5476     case Checksummer::CSUM_CRC32C:
5477       {
5478         Checksummer::crc32c::init_value_t init_value;
5479         decode(init_value, *init_value_bl_it);
5480         Checksummer::calculate<Checksummer::crc32c>(
5481           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5482           &csum_data);
5483       }
5484       break;
5485     default:
5486       break;
5487     }
5488   }
5489
5490   encode(csum_count, osd_op.outdata);
5491   osd_op.outdata.claim_append(csum);
5492   return 0;
5493 }
5494
5495 struct C_ExtentCmpRead : public Context {
5496   PrimaryLogPG *primary_log_pg;
5497   OSDOp &osd_op;
5498   ceph_le64 read_length{};
5499   bufferlist read_bl;
5500   Context *fill_extent_ctx;
5501
5502   C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5503                   std::optional<uint32_t> maybe_crc, uint64_t size,
5504                   OSDService *osd, hobject_t soid, uint32_t flags)
5505     : primary_log_pg(primary_log_pg), osd_op(osd_op),
5506       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5507                                              &read_bl, maybe_crc, size,
5508                                              osd, soid, flags)) {
5509   }
5510   ~C_ExtentCmpRead() override {
5511     delete fill_extent_ctx;
5512   }
5513
5514   void finish(int r) override {
5515     if (r == -ENOENT) {
5516       osd_op.rval = 0;
5517       read_bl.clear();
5518       delete fill_extent_ctx;
5519     } else {
5520       fill_extent_ctx->complete(r);
5521     }
5522     fill_extent_ctx = nullptr;
5523
5524     if (osd_op.rval >= 0) {
5525       osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5526     }
5527   }
5528 };
5529
5530 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5531 {
5532   dout(20) << __func__ << dendl;
5533   ceph_osd_op& op = osd_op.op;
5534
5535   auto& oi = ctx->new_obs.oi;
5536   uint64_t size = oi.size;
5537   if ((oi.truncate_seq < op.extent.truncate_seq) &&
5538       (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5539     size = op.extent.truncate_size;
5540   }
5541
5542   if (op.extent.offset >= size) {
5543     op.extent.length = 0;
5544   } else if (op.extent.offset + op.extent.length > size) {
5545     op.extent.length = size - op.extent.offset;
5546   }
5547
5548   if (op.extent.length == 0) {
5549     dout(20) << __func__ << " zero length extent" << dendl;
5550     return finish_extent_cmp(osd_op, bufferlist{});
5551   } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
5552     dout(20) << __func__ << " object DNE" << dendl;
5553     return finish_extent_cmp(osd_op, {});
5554   } else if (pool.info.is_erasure()) {
5555     // If there is a data digest and it is possible we are reading
5556     // entire object, pass the digest.
5557     std::optional<uint32_t> maybe_crc;
5558     if (oi.is_data_digest() && op.checksum.offset == 0 &&
5559         op.checksum.length >= oi.size) {
5560       maybe_crc = oi.data_digest;
5561     }
5562
5563     // async read
5564     auto& soid = oi.soid;
5565     auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5566                                               osd, soid, op.flags);
5567     ctx->pending_async_reads.push_back({
5568       {op.extent.offset, op.extent.length, op.flags},
5569       {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5570
5571     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5572
5573     ctx->op_finishers[ctx->current_osd_subop_num].reset(
5574       new ReadFinisher(osd_op));
5575     return -EINPROGRESS;
5576   }
5577
5578   // sync read
5579   vector<OSDOp> read_ops(1);
5580   OSDOp& read_op = read_ops[0];
5581
5582   read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5583   read_op.op.extent.offset = op.extent.offset;
5584   read_op.op.extent.length = op.extent.length;
5585   read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5586   read_op.op.extent.truncate_size = op.extent.truncate_size;
5587
5588   int result = do_osd_ops(ctx, read_ops);
5589   if (result < 0) {
5590     derr << __func__ << " failed " << result << dendl;
5591     return result;
5592   }
5593   return finish_extent_cmp(osd_op, read_op.outdata);
5594 }
5595
5596 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5597 {
5598   for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5599     char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5600     if (osd_op.indata[idx] != read_byte) {
5601         return (-MAX_ERRNO - idx);
5602     }
5603   }
5604
5605   return 0;
5606 }
5607
5608 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5609   dout(20) << __func__ << dendl;
5610   auto& op = osd_op.op;
5611   auto& oi = ctx->new_obs.oi;
5612   auto& soid = oi.soid;
5613   __u32 seq = oi.truncate_seq;
5614   uint64_t size = oi.size;
5615   bool trimmed_read = false;
5616
5617   dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5618   dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5619   dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5620   dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5621
5622   // are we beyond truncate_size?
5623   if ( (seq < op.extent.truncate_seq) &&
5624        (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5625        (size > op.extent.truncate_size) )
5626     size = op.extent.truncate_size;
5627
5628   if (op.extent.length == 0) //length is zero mean read the whole object
5629     op.extent.length = size;
5630
5631   if (op.extent.offset >= size) {
5632     op.extent.length = 0;
5633     trimmed_read = true;
5634   } else if (op.extent.offset + op.extent.length > size) {
5635     op.extent.length = size - op.extent.offset;
5636     trimmed_read = true;
5637   }
5638
5639   dout(30) << __func__ << "op.extent.length is now " << op.extent.length << dendl;
5640
5641   // read into a buffer
5642   int result = 0;
5643   if (trimmed_read && op.extent.length == 0) {
5644     // read size was trimmed to zero and it is expected to do nothing
5645     // a read operation of 0 bytes does *not* do nothing, this is why
5646     // the trimmed_read boolean is needed
5647   } else if (pool.info.is_erasure()) {
5648     // The initialisation below is required to silence a false positive
5649     // -Wmaybe-uninitialized warning
5650     std::optional<uint32_t> maybe_crc;
5651     // If there is a data digest and it is possible we are reading
5652     // entire object, pass the digest.  FillInVerifyExtent will
5653     // will check the oi.size again.
5654     if (oi.is_data_digest() && op.extent.offset == 0 &&
5655         op.extent.length >= oi.size)
5656       maybe_crc = oi.data_digest;
5657     ctx->pending_async_reads.push_back(
5658       make_pair(
5659         boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5660         make_pair(&osd_op.outdata,
5661                   new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5662                                          &osd_op.outdata, maybe_crc, oi.size,
5663                                          osd, soid, op.flags))));
5664     dout(10) << " async_read noted for " << soid << dendl;
5665
5666     ctx->op_finishers[ctx->current_osd_subop_num].reset(
5667       new ReadFinisher(osd_op));
5668   } else {
5669     int r = pgbackend->objects_read_sync(
5670       soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5671     // whole object?  can we verify the checksum?
5672     if (r >= 0 && op.extent.offset == 0 &&
5673         (uint64_t)r == oi.size && oi.is_data_digest()) {
5674       uint32_t crc = osd_op.outdata.crc32c(-1);
5675       if (oi.data_digest != crc) {
5676         osd->clog->error() << info.pgid << std::hex
5677                            << " full-object read crc 0x" << crc
5678                            << " != expected 0x" << oi.data_digest
5679                            << std::dec << " on " << soid;
5680         r = -EIO; // try repair later
5681       }
5682     }
5683     if (r == -EIO) {
5684       r = rep_repair_primary_object(soid, ctx);
5685     }
5686     if (r >= 0)
5687       op.extent.length = r;
5688     else if (r == -EAGAIN) {
5689       result = -EAGAIN;
5690     } else {
5691       result = r;
5692       op.extent.length = 0;
5693     }
5694     dout(10) << " read got " << r << " / " << op.extent.length
5695              << " bytes from obj " << soid << dendl;
5696   }
5697   if (result >= 0) {
5698     ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5699     ctx->delta_stats.num_rd++;
5700   }
5701   return result;
5702 }
5703
5704 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
5705   dout(20) << __func__ << dendl;
5706   auto& op = osd_op.op;
5707   auto& oi = ctx->new_obs.oi;
5708   auto& soid = oi.soid;
5709
5710   if (op.extent.truncate_seq) {
5711     dout(0) << "sparse_read does not support truncation sequence " << dendl;
5712     return -EINVAL;
5713   }
5714
5715   ++ctx->num_read;
5716   if (pool.info.is_erasure()) {
5717     // translate sparse read to a normal one if not supported
5718     uint64_t offset = op.extent.offset;
5719     uint64_t length = op.extent.length;
5720     if (offset > oi.size) {
5721       length = 0;
5722     } else if (offset + length > oi.size) {
5723       length = oi.size - offset;
5724     }
5725
5726     if (length > 0) {
5727       ctx->pending_async_reads.push_back(
5728         make_pair(
5729           boost::make_tuple(offset, length, op.flags),
5730           make_pair(
5731             &osd_op.outdata,
5732             new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
5733                                    &op.extent.length))));
5734       dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
5735
5736       ctx->op_finishers[ctx->current_osd_subop_num].reset(
5737         new ReadFinisher(osd_op));
5738     } else {
5739       dout(10) << " sparse read ended up empty for " << soid << dendl;
5740       map<uint64_t, uint64_t> extents;
5741       encode(extents, osd_op.outdata);
5742     }
5743   } else {
5744     // read into a buffer
5745     map<uint64_t, uint64_t> m;
5746     int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5747                                               info.pgid.shard),
5748                                op.extent.offset, op.extent.length, m);
5749     if (r < 0)  {
5750       return r;
5751     }
5752
5753     bufferlist data_bl;
5754     r = pgbackend->objects_readv_sync(soid, std::move(m), op.flags, &data_bl);
5755     if (r == -EIO) {
5756       r = rep_repair_primary_object(soid, ctx);
5757     }
5758     if (r < 0) {
5759       return r;
5760     }
5761
5762     // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5763     // Maybe at first, there is no much whole objects. With continued use, more
5764     // and more whole object exist. So from this point, for spare-read add
5765     // checksum make sense.
5766     if ((uint64_t)r == oi.size && oi.is_data_digest()) {
5767       uint32_t crc = data_bl.crc32c(-1);
5768       if (oi.data_digest != crc) {
5769         osd->clog->error() << info.pgid << std::hex
5770           << " full-object read crc 0x" << crc
5771           << " != expected 0x" << oi.data_digest
5772           << std::dec << " on " << soid;
5773         r = rep_repair_primary_object(soid, ctx);
5774         if (r < 0) {
5775           return r;
5776         }
5777       }
5778     }
5779
5780     op.extent.length = r;
5781
5782     encode(m, osd_op.outdata); // re-encode since it might be modified
5783     ::encode_destructively(data_bl, osd_op.outdata);
5784
5785     dout(10) << " sparse_read got " << r << " bytes from object "
5786              << soid << dendl;
5787   }
5788
5789   ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5790   ctx->delta_stats.num_rd++;
5791   return 0;
5792 }
5793
5794 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5795 {
5796   int result = 0;
5797   SnapSetContext *ssc = ctx->obc->ssc;
5798   ObjectState& obs = ctx->new_obs;
5799   object_info_t& oi = obs.oi;
5800   const hobject_t& soid = oi.soid;
5801   const bool skip_data_digest = osd->store->has_builtin_csum() &&
5802     osd->osd_skip_data_digest;
5803
5804   PGTransaction* t = ctx->op_t.get();
5805
5806   dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5807 #ifdef HAVE_JAEGER
5808   if (ctx->op->osd_parent_span) {
5809     auto do_osd_op_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
5810   }
5811 #endif
5812
5813   ctx->current_osd_subop_num = 0;
5814   for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5815     OSDOp& osd_op = *p;
5816     ceph_osd_op& op = osd_op.op;
5817
5818     OpFinisher* op_finisher = nullptr;
5819     {
5820       auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5821       if (op_finisher_it != ctx->op_finishers.end()) {
5822         op_finisher = op_finisher_it->second.get();
5823       }
5824     }
5825
5826     // TODO: check endianness (ceph_le32 vs uint32_t, etc.)
5827     // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5828     // but the code in this function seems to treat them as native-endian.  What should the
5829     // tracepoints do?
5830     tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5831
5832     dout(10) << "do_osd_op  " << osd_op << dendl;
5833
5834     auto bp = osd_op.indata.cbegin();
5835
5836     // user-visible modifcation?
5837     switch (op.op) {
5838       // non user-visible modifications
5839     case CEPH_OSD_OP_WATCH:
5840     case CEPH_OSD_OP_CACHE_EVICT:
5841     case CEPH_OSD_OP_CACHE_FLUSH:
5842     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5843     case CEPH_OSD_OP_UNDIRTY:
5844     case CEPH_OSD_OP_COPY_FROM:  // we handle user_version update explicitly
5845     case CEPH_OSD_OP_COPY_FROM2:
5846     case CEPH_OSD_OP_CACHE_PIN:
5847     case CEPH_OSD_OP_CACHE_UNPIN:
5848     case CEPH_OSD_OP_SET_REDIRECT:
5849     case CEPH_OSD_OP_SET_CHUNK:
5850     case CEPH_OSD_OP_TIER_PROMOTE:
5851     case CEPH_OSD_OP_TIER_FLUSH:
5852     case CEPH_OSD_OP_TIER_EVICT:
5853       break;
5854     default:
5855       if (op.op & CEPH_OSD_OP_MODE_WR)
5856         ctx->user_modify = true;
5857     }
5858
5859     // munge -1 truncate to 0 truncate
5860     if (ceph_osd_op_uses_extent(op.op) &&
5861         op.extent.truncate_seq == 1 &&
5862         op.extent.truncate_size == (-1ULL)) {
5863       op.extent.truncate_size = 0;
5864       op.extent.truncate_seq = 0;
5865     }
5866
5867     // munge ZERO -> TRUNCATE?  (don't munge to DELETE or we risk hosing attributes)
5868     if (op.op == CEPH_OSD_OP_ZERO &&
5869         obs.exists &&
5870         op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
5871         op.extent.length >= 1 &&
5872         op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
5873         op.extent.offset + op.extent.length >= oi.size) {
5874       if (op.extent.offset >= oi.size) {
5875         // no-op
5876         goto fail;
5877       }
5878       dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5879                << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5880       op.op = CEPH_OSD_OP_TRUNCATE;
5881     }
5882
5883     switch (op.op) {
5884
5885       // --- READS ---
5886
5887     case CEPH_OSD_OP_CMPEXT:
5888       ++ctx->num_read;
5889       tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5890                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5891                  op.extent.length, op.extent.truncate_size,
5892                  op.extent.truncate_seq);
5893
5894       if (op_finisher == nullptr) {
5895         result = do_extent_cmp(ctx, osd_op);
5896       } else {
5897         result = op_finisher->execute();
5898       }
5899       break;
5900
5901     case CEPH_OSD_OP_SYNC_READ:
5902       if (pool.info.is_erasure()) {
5903         result = -EOPNOTSUPP;
5904         break;
5905       }
5906       // fall through
5907     case CEPH_OSD_OP_READ:
5908       ++ctx->num_read;
5909       tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5910                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5911                  op.extent.length, op.extent.truncate_size,
5912                  op.extent.truncate_seq);
5913       if (op_finisher == nullptr) {
5914         if (!ctx->data_off) {
5915           ctx->data_off = op.extent.offset;
5916         }
5917         result = do_read(ctx, osd_op);
5918       } else {
5919         result = op_finisher->execute();
5920       }
5921       break;
5922
5923     case CEPH_OSD_OP_CHECKSUM:
5924       ++ctx->num_read;
5925       {
5926         tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5927                    soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5928                    op.checksum.offset, op.checksum.length,
5929                    op.checksum.chunk_size);
5930
5931         if (op_finisher == nullptr) {
5932           result = do_checksum(ctx, osd_op, &bp);
5933         } else {
5934           result = op_finisher->execute();
5935         }
5936       }
5937       break;
5938
5939     /* map extents */
5940     case CEPH_OSD_OP_MAPEXT:
5941       tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5942       if (pool.info.is_erasure()) {
5943         result = -EOPNOTSUPP;
5944         break;
5945       }
5946       ++ctx->num_read;
5947       {
5948         // read into a buffer
5949         bufferlist bl;
5950         int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5951                                                   info.pgid.shard),
5952                                    op.extent.offset, op.extent.length, bl);
5953         osd_op.outdata = std::move(bl);
5954         if (r < 0)
5955           result = r;
5956         else
5957           ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
5958         ctx->delta_stats.num_rd++;
5959         dout(10) << " map_extents done on object " << soid << dendl;
5960       }
5961       break;
5962
5963     /* map extents */
5964     case CEPH_OSD_OP_SPARSE_READ:
5965       tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5966                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5967                  op.extent.length, op.extent.truncate_size,
5968                  op.extent.truncate_seq);
5969       if (op_finisher == nullptr) {
5970         result = do_sparse_read(ctx, osd_op);
5971       } else {
5972         result = op_finisher->execute();
5973       }
5974       break;
5975
5976     case CEPH_OSD_OP_CALL:
5977       {
5978         string cname, mname;
5979         bufferlist indata;
5980         try {
5981           bp.copy(op.cls.class_len, cname);
5982           bp.copy(op.cls.method_len, mname);
5983           bp.copy(op.cls.indata_len, indata);
5984         } catch (ceph::buffer::error& e) {
5985           dout(10) << "call unable to decode class + method + indata" << dendl;
5986           dout(30) << "in dump: ";
5987           osd_op.indata.hexdump(*_dout);
5988           *_dout << dendl;
5989           result = -EINVAL;
5990           tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5991           break;
5992         }
5993         tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5994
5995         ClassHandler::ClassData *cls;
5996         result = ClassHandler::get_instance().open_class(cname, &cls);
5997         ceph_assert(result == 0);   // init_op_flags() already verified this works.
5998
5999         ClassHandler::ClassMethod *method = cls->get_method(mname);
6000         if (!method) {
6001           dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
6002           result = -EOPNOTSUPP;
6003           break;
6004         }
6005
6006         int flags = method->get_flags();
6007         if (flags & CLS_METHOD_WR)
6008           ctx->user_modify = true;
6009
6010         bufferlist outdata;
6011         dout(10) << "call method " << cname << "." << mname << dendl;
6012         int prev_rd = ctx->num_read;
6013         int prev_wr = ctx->num_write;
6014         result = method->exec((cls_method_context_t)&ctx, indata, outdata);
6015
6016         if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
6017           derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
6018           result = -EIO;
6019           break;
6020         }
6021         if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
6022           derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
6023           result = -EIO;
6024           break;
6025         }
6026
6027         dout(10) << "method called response length=" << outdata.length() << dendl;
6028         op.extent.length = outdata.length();
6029         osd_op.outdata.claim_append(outdata);
6030         dout(30) << "out dump: ";
6031         osd_op.outdata.hexdump(*_dout);
6032         *_dout << dendl;
6033       }
6034       break;
6035
6036     case CEPH_OSD_OP_STAT:
6037       // note: stat does not require RD
6038       {
6039         tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
6040
6041         if (obs.exists && !oi.is_whiteout()) {
6042           encode(oi.size, osd_op.outdata);
6043           encode(oi.mtime, osd_op.outdata);
6044           dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
6045         } else {
6046           result = -ENOENT;
6047           dout(10) << "stat oi object does not exist" << dendl;
6048         }
6049
6050         ctx->delta_stats.num_rd++;
6051       }
6052       break;
6053
6054     case CEPH_OSD_OP_ISDIRTY:
6055       ++ctx->num_read;
6056       {
6057         tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
6058         bool is_dirty = obs.oi.is_dirty();
6059         encode(is_dirty, osd_op.outdata);
6060         ctx->delta_stats.num_rd++;
6061         result = 0;
6062       }
6063       break;
6064
6065     case CEPH_OSD_OP_UNDIRTY:
6066       ++ctx->num_write;
6067       result = 0;
6068       {
6069         tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
6070         if (oi.is_dirty()) {
6071           ctx->undirty = true;  // see make_writeable()
6072           ctx->modify = true;
6073           ctx->delta_stats.num_wr++;
6074         }
6075       }
6076       break;
6077
6078     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6079       ++ctx->num_write;
6080       result = 0;
6081       {
6082         tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
6083         if (ctx->lock_type != RWState::RWNONE) {
6084           dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
6085           result = -EINVAL;
6086           break;
6087         }
6088         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6089           result = -EINVAL;
6090           break;
6091         }
6092         if (!obs.exists) {
6093           result = 0;
6094           break;
6095         }
6096         if (oi.is_cache_pinned()) {
6097           dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
6098           result = -EPERM;
6099           break;
6100         }
6101         if (oi.is_dirty()) {
6102           result = start_flush(ctx->op, ctx->obc, false, NULL, std::nullopt);
6103           if (result == -EINPROGRESS)
6104             result = -EAGAIN;
6105         } else {
6106           result = 0;
6107         }
6108       }
6109       break;
6110
6111     case CEPH_OSD_OP_CACHE_FLUSH:
6112       ++ctx->num_write;
6113       result = 0;
6114       {
6115         tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
6116         if (ctx->lock_type == RWState::RWNONE) {
6117           dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
6118           result = -EINVAL;
6119           break;
6120         }
6121         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6122           result = -EINVAL;
6123           break;
6124         }
6125         if (!obs.exists) {
6126           result = 0;
6127           break;
6128         }
6129         if (oi.is_cache_pinned()) {
6130           dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
6131           result = -EPERM;
6132           break;
6133         }
6134         hobject_t missing;
6135         if (oi.is_dirty()) {
6136           result = start_flush(ctx->op, ctx->obc, true, &missing, std::nullopt);
6137           if (result == -EINPROGRESS)
6138             result = -EAGAIN;
6139         } else {
6140           result = 0;
6141         }
6142         // Check special return value which has set missing_return
6143         if (result == -ENOENT) {
6144           dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
6145           ceph_assert(!missing.is_min());
6146           wait_for_unreadable_object(missing, ctx->op);
6147           // Error code which is used elsewhere when wait_for_unreadable_object() is used
6148           result = -EAGAIN;
6149         }
6150       }
6151       break;
6152
6153     case CEPH_OSD_OP_CACHE_EVICT:
6154       ++ctx->num_write;
6155       result = 0;
6156       {
6157         tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
6158         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE || obs.oi.has_manifest()) {
6159           result = -EINVAL;
6160           break;
6161         }
6162         if (!obs.exists) {
6163           result = 0;
6164           break;
6165         }
6166         if (oi.is_cache_pinned()) {
6167           dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
6168           result = -EPERM;
6169           break;
6170         }
6171         if (oi.is_dirty()) {
6172           result = -EBUSY;
6173           break;
6174         }
6175         if (!oi.watchers.empty()) {
6176           result = -EBUSY;
6177           break;
6178         }
6179         if (soid.snap == CEPH_NOSNAP) {
6180           result = _verify_no_head_clones(soid, ssc->snapset);
6181           if (result < 0)
6182             break;
6183         }
6184         result = _delete_oid(ctx, true, false);
6185         if (result >= 0) {
6186           // mark that this is a cache eviction to avoid triggering normal
6187           // make_writeable() clone creation in finish_ctx()
6188           ctx->cache_operation = true;
6189         }
6190         osd->logger->inc(l_osd_tier_evict);
6191       }
6192       break;
6193
6194     case CEPH_OSD_OP_GETXATTR:
6195       ++ctx->num_read;
6196       {
6197         string aname;
6198         bp.copy(op.xattr.name_len, aname);
6199         tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6200         string name = "_" + aname;
6201         int r = getattr_maybe_cache(
6202           ctx->obc,
6203           name,
6204           &(osd_op.outdata));
6205         if (r >= 0) {
6206           op.xattr.value_len = osd_op.outdata.length();
6207           result = 0;
6208           ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
6209         } else
6210           result = r;
6211
6212         ctx->delta_stats.num_rd++;
6213       }
6214       break;
6215
6216    case CEPH_OSD_OP_GETXATTRS:
6217       ++ctx->num_read;
6218       {
6219         tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6220         map<string, bufferlist> out;
6221         result = getattrs_maybe_cache(
6222           ctx->obc,
6223           &out);
6224
6225         bufferlist bl;
6226         encode(out, bl);
6227         ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
6228         ctx->delta_stats.num_rd++;
6229         osd_op.outdata.claim_append(bl);
6230       }
6231       break;
6232
6233     case CEPH_OSD_OP_CMPXATTR:
6234       ++ctx->num_read;
6235       {
6236         string aname;
6237         bp.copy(op.xattr.name_len, aname);
6238         tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6239         string name = "_" + aname;
6240         name[op.xattr.name_len + 1] = 0;
6241
6242         bufferlist xattr;
6243         result = getattr_maybe_cache(
6244           ctx->obc,
6245           name,
6246           &xattr);
6247         if (result < 0 && result != -EEXIST && result != -ENODATA)
6248           break;
6249
6250         ctx->delta_stats.num_rd++;
6251         ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
6252
6253         switch (op.xattr.cmp_mode) {
6254         case CEPH_OSD_CMPXATTR_MODE_STRING:
6255           {
6256             string val;
6257             bp.copy(op.xattr.value_len, val);
6258             val[op.xattr.value_len] = 0;
6259             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6260                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6261             result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6262           }
6263           break;
6264
6265         case CEPH_OSD_CMPXATTR_MODE_U64:
6266           {
6267             uint64_t u64val;
6268             try {
6269               decode(u64val, bp);
6270             }
6271             catch (ceph::buffer::error& e) {
6272               result = -EINVAL;
6273               goto fail;
6274             }
6275             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6276                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6277             result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6278           }
6279           break;
6280
6281         default:
6282           dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6283           result = -EINVAL;
6284         }
6285
6286         if (!result) {
6287           dout(10) << "comparison returned false" << dendl;
6288           result = -ECANCELED;
6289           break;
6290         }
6291         if (result < 0) {
6292           dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6293           break;
6294         }
6295
6296         dout(10) << "comparison returned true" << dendl;
6297       }
6298       break;
6299
6300     case CEPH_OSD_OP_ASSERT_VER:
6301       ++ctx->num_read;
6302       {
6303         uint64_t ver = op.assert_ver.ver;
6304         tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6305         if (!ver)
6306           result = -EINVAL;
6307         else if (ver < oi.user_version)
6308           result = -ERANGE;
6309         else if (ver > oi.user_version)
6310           result = -EOVERFLOW;
6311       }
6312       break;
6313
6314     case CEPH_OSD_OP_LIST_WATCHERS:
6315       ++ctx->num_read;
6316       {
6317         tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6318         obj_list_watch_response_t resp;
6319
6320         map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6321         for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6322                                        ++oi_iter) {
6323           dout(20) << "key cookie=" << oi_iter->first.first
6324                << " entity=" << oi_iter->first.second << " "
6325                << oi_iter->second << dendl;
6326           ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6327           ceph_assert(oi_iter->first.second.is_client());
6328
6329           watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6330                  oi_iter->second.timeout_seconds, oi_iter->second.addr);
6331           resp.entries.push_back(wi);
6332         }
6333
6334         resp.encode(osd_op.outdata, ctx->get_features());
6335         result = 0;
6336
6337         ctx->delta_stats.num_rd++;
6338         break;
6339       }
6340
6341     case CEPH_OSD_OP_LIST_SNAPS:
6342       ++ctx->num_read;
6343       {
6344         tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6345         obj_list_snap_response_t resp;
6346
6347         if (!ssc) {
6348           ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6349         }
6350         ceph_assert(ssc);
6351         dout(20) << " snapset " << ssc->snapset << dendl;
6352
6353         int clonecount = ssc->snapset.clones.size();
6354         clonecount++;  // for head
6355         resp.clones.reserve(clonecount);
6356         for (auto clone_iter = ssc->snapset.clones.begin();
6357              clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6358           clone_info ci;
6359           ci.cloneid = *clone_iter;
6360
6361           hobject_t clone_oid = soid;
6362           clone_oid.snap = *clone_iter;
6363
6364           auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6365           if (p == ssc->snapset.clone_snaps.end()) {
6366             osd->clog->error() << "osd." << osd->whoami
6367                                << ": inconsistent clone_snaps found for oid "
6368                                << soid << " clone " << *clone_iter
6369                                << " snapset " << ssc->snapset;
6370             result = -EINVAL;
6371             break;
6372           }
6373           for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6374             ci.snaps.push_back(*q);
6375           }
6376
6377           dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6378
6379           map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6380           coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6381           if (coi == ssc->snapset.clone_overlap.end()) {
6382             osd->clog->error() << "osd." << osd->whoami
6383                                << ": inconsistent clone_overlap found for oid "
6384                               << soid << " clone " << *clone_iter;
6385             result = -EINVAL;
6386             break;
6387           }
6388           const interval_set<uint64_t> &o = coi->second;
6389           ci.overlap.reserve(o.num_intervals());
6390           for (interval_set<uint64_t>::const_iterator r = o.begin();
6391                r != o.end(); ++r) {
6392             ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6393                                                          r.get_len()));
6394           }
6395
6396           map<snapid_t, uint64_t>::const_iterator si;
6397           si = ssc->snapset.clone_size.find(ci.cloneid);
6398           if (si == ssc->snapset.clone_size.end()) {
6399             osd->clog->error() << "osd." << osd->whoami
6400                                << ": inconsistent clone_size found for oid "
6401                                << soid << " clone " << *clone_iter;
6402             result = -EINVAL;
6403             break;
6404           }
6405           ci.size = si->second;
6406
6407           resp.clones.push_back(ci);
6408         }
6409         if (result < 0) {
6410           break;
6411         }
6412         if (!ctx->obc->obs.oi.is_whiteout()) {
6413           ceph_assert(obs.exists);
6414           clone_info ci;
6415           ci.cloneid = CEPH_NOSNAP;
6416
6417           //Size for HEAD is oi.size
6418           ci.size = oi.size;
6419
6420           resp.clones.push_back(ci);
6421         }
6422         resp.seq = ssc->snapset.seq;
6423
6424         resp.encode(osd_op.outdata);
6425         result = 0;
6426
6427         ctx->delta_stats.num_rd++;
6428         break;
6429       }
6430
6431    case CEPH_OSD_OP_NOTIFY:
6432       ++ctx->num_read;
6433       {
6434         uint32_t timeout;
6435         bufferlist bl;
6436
6437         try {
6438           uint32_t ver; // obsolete
6439           decode(ver, bp);
6440           decode(timeout, bp);
6441           decode(bl, bp);
6442         } catch (const ceph::buffer::error &e) {
6443           timeout = 0;
6444         }
6445         tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6446         if (!timeout)
6447           timeout = cct->_conf->osd_default_notify_timeout;
6448
6449         notify_info_t n;
6450         n.timeout = timeout;
6451         n.notify_id = osd->get_next_id(get_osdmap_epoch());
6452         n.cookie = op.notify.cookie;
6453         n.bl = bl;
6454         ctx->notifies.push_back(n);
6455
6456         // return our unique notify id to the client
6457         encode(n.notify_id, osd_op.outdata);
6458       }
6459       break;
6460
6461     case CEPH_OSD_OP_NOTIFY_ACK:
6462       ++ctx->num_read;
6463       {
6464         try {
6465           uint64_t notify_id = 0;
6466           uint64_t watch_cookie = 0;
6467           decode(notify_id, bp);
6468           decode(watch_cookie, bp);
6469           bufferlist reply_bl;
6470           if (!bp.end()) {
6471             decode(reply_bl, bp);
6472           }
6473           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6474           OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6475           ctx->notify_acks.push_back(ack);
6476         } catch (const ceph::buffer::error &e) {
6477           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6478           OpContext::NotifyAck ack(
6479             // op.watch.cookie is actually the notify_id for historical reasons
6480             op.watch.cookie
6481             );
6482           ctx->notify_acks.push_back(ack);
6483         }
6484       }
6485       break;
6486
6487     case CEPH_OSD_OP_SETALLOCHINT:
6488       ++ctx->num_write;
6489       result = 0;
6490       {
6491         tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6492         maybe_create_new_object(ctx);
6493         oi.expected_object_size = op.alloc_hint.expected_object_size;
6494         oi.expected_write_size = op.alloc_hint.expected_write_size;
6495         oi.alloc_hint_flags = op.alloc_hint.flags;
6496         t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6497                           op.alloc_hint.expected_write_size,
6498                           op.alloc_hint.flags);
6499       }
6500       break;
6501
6502
6503       // --- WRITES ---
6504
6505       // -- object data --
6506
6507     case CEPH_OSD_OP_WRITE:
6508       ++ctx->num_write;
6509       result = 0;
6510       { // write
6511         __u32 seq = oi.truncate_seq;
6512         tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6513         if (op.extent.length != osd_op.indata.length()) {
6514           result = -EINVAL;
6515           break;
6516         }
6517
6518         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6519           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6520
6521         if (pool.info.requires_aligned_append() &&
6522             (op.extent.offset % pool.info.required_alignment() != 0)) {
6523           result = -EOPNOTSUPP;
6524           break;
6525         }
6526
6527         if (!obs.exists) {
6528           if (pool.info.requires_aligned_append() && op.extent.offset) {
6529             result = -EOPNOTSUPP;
6530             break;
6531           }
6532         } else if (op.extent.offset != oi.size &&
6533                    pool.info.requires_aligned_append()) {
6534           result = -EOPNOTSUPP;
6535           break;
6536         }
6537
6538         if (seq && (seq > op.extent.truncate_seq) &&
6539             (op.extent.offset + op.extent.length > oi.size)) {
6540           // old write, arrived after trimtrunc
6541           op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6542           dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6543                    << ", adjusting write length to " << op.extent.length << dendl;
6544           bufferlist t;
6545           t.substr_of(osd_op.indata, 0, op.extent.length);
6546           osd_op.indata.swap(t);
6547         }
6548         if (op.extent.truncate_seq > seq) {
6549           // write arrives before trimtrunc
6550           if (obs.exists && !oi.is_whiteout()) {
6551             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6552                      << ", truncating to " << op.extent.truncate_size << dendl;
6553             t->truncate(soid, op.extent.truncate_size);
6554             oi.truncate_seq = op.extent.truncate_seq;
6555             oi.truncate_size = op.extent.truncate_size;
6556             if (oi.size > op.extent.truncate_size) {
6557               interval_set<uint64_t> trim;
6558               trim.insert(op.extent.truncate_size,
6559                 oi.size - op.extent.truncate_size);
6560               ctx->modified_ranges.union_of(trim);
6561               ctx->clean_regions.mark_data_region_dirty(op.extent.truncate_size, oi.size - op.extent.truncate_size);
6562               oi.clear_data_digest();
6563             }
6564             if (op.extent.truncate_size != oi.size) {
6565               truncate_update_size_and_usage(ctx->delta_stats,
6566                                              oi,
6567                                              op.extent.truncate_size);
6568             }
6569           } else {
6570             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6571                      << ", but object is new" << dendl;
6572             oi.truncate_seq = op.extent.truncate_seq;
6573             oi.truncate_size = op.extent.truncate_size;
6574           }
6575         }
6576         result = check_offset_and_length(
6577           op.extent.offset, op.extent.length,
6578           static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6579         if (result < 0)
6580           break;
6581
6582         maybe_create_new_object(ctx);
6583
6584         if (op.extent.length == 0) {
6585           if (op.extent.offset > oi.size) {
6586             t->truncate(
6587               soid, op.extent.offset);
6588             truncate_update_size_and_usage(ctx->delta_stats, oi,
6589                                            op.extent.offset);
6590           } else {
6591             t->nop(soid);
6592           }
6593         } else {
6594           t->write(
6595             soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6596         }
6597
6598         if (op.extent.offset == 0 && op.extent.length >= oi.size
6599             && !skip_data_digest) {
6600           obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6601         } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6602           if (skip_data_digest) {
6603             obs.oi.clear_data_digest();
6604           } else {
6605             obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6606           }
6607         } else {
6608           obs.oi.clear_data_digest();
6609         }
6610         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6611                                     op.extent.offset, op.extent.length);
6612         ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6613         dout(10) << "clean_regions modified" << ctx->clean_regions << dendl;
6614       }
6615       break;
6616
6617     case CEPH_OSD_OP_WRITEFULL:
6618       ++ctx->num_write;
6619       result = 0;
6620       { // write full object
6621         tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6622
6623         if (op.extent.length != osd_op.indata.length()) {
6624           result = -EINVAL;
6625           break;
6626         }
6627         result = check_offset_and_length(
6628           0, op.extent.length,
6629           static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6630         if (result < 0)
6631           break;
6632
6633         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6634           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6635
6636         maybe_create_new_object(ctx);
6637         if (pool.info.is_erasure()) {
6638           t->truncate(soid, 0);
6639         } else if (obs.exists && op.extent.length < oi.size) {
6640           t->truncate(soid, op.extent.length);
6641         }
6642         if (op.extent.length) {
6643           t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6644         }
6645         if (!skip_data_digest) {
6646           obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6647         } else {
6648           obs.oi.clear_data_digest();
6649         }
6650         ctx->clean_regions.mark_data_region_dirty(0,
6651           std::max((uint64_t)op.extent.length, oi.size));
6652         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6653             0, op.extent.length, true);
6654       }
6655       break;
6656
6657     case CEPH_OSD_OP_WRITESAME:
6658       ++ctx->num_write;
6659       tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6660       result = do_writesame(ctx, osd_op);
6661       break;
6662
6663     case CEPH_OSD_OP_ROLLBACK :
6664       ++ctx->num_write;
6665       tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6666       result = _rollback_to(ctx, op);
6667       break;
6668
6669     case CEPH_OSD_OP_ZERO:
6670       tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6671       if (pool.info.requires_aligned_append()) {
6672         result = -EOPNOTSUPP;
6673         break;
6674       }
6675       ++ctx->num_write;
6676       { // zero
6677         result = check_offset_and_length(
6678           op.extent.offset, op.extent.length,
6679           static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6680         if (result < 0)
6681           break;
6682
6683         ceph_assert(op.extent.length);
6684         if (obs.exists && !oi.is_whiteout()) {
6685           t->zero(soid, op.extent.offset, op.extent.length);
6686           interval_set<uint64_t> ch;
6687           ch.insert(op.extent.offset, op.extent.length);
6688           ctx->modified_ranges.union_of(ch);
6689           ctx->clean_regions.mark_data_region_dirty(op.extent.offset, op.extent.length);
6690           ctx->delta_stats.num_wr++;
6691           oi.clear_data_digest();
6692         } else {
6693           // no-op
6694         }
6695       }
6696       break;
6697     case CEPH_OSD_OP_CREATE:
6698       ++ctx->num_write;
6699       result = 0;
6700       {
6701         tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
6702         if (obs.exists && !oi.is_whiteout() &&
6703             (op.flags & CEPH_OSD_OP_FLAG_EXCL)) {
6704           result = -EEXIST; /* this is an exclusive create */
6705         } else {
6706           if (osd_op.indata.length()) {
6707             auto p = osd_op.indata.cbegin();
6708             string category;
6709             try {
6710               decode(category, p);
6711             }
6712             catch (ceph::buffer::error& e) {
6713               result = -EINVAL;
6714               goto fail;
6715             }
6716             // category is no longer implemented.
6717           }
6718           maybe_create_new_object(ctx);
6719           t->nop(soid);
6720         }
6721       }
6722       break;
6723
6724     case CEPH_OSD_OP_TRIMTRUNC:
6725       op.extent.offset = op.extent.truncate_size;
6726       // falling through
6727
6728     case CEPH_OSD_OP_TRUNCATE:
6729       tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6730       if (pool.info.requires_aligned_append()) {
6731         result = -EOPNOTSUPP;
6732         break;
6733       }
6734       ++ctx->num_write;
6735       result = 0;
6736       {
6737         // truncate
6738         if (!obs.exists || oi.is_whiteout()) {
6739           dout(10) << " object dne, truncate is a no-op" << dendl;
6740           break;
6741         }
6742
6743         result = check_offset_and_length(
6744           op.extent.offset, op.extent.length,
6745           static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6746         if (result < 0)
6747           break;
6748
6749         if (op.extent.truncate_seq) {
6750           ceph_assert(op.extent.offset == op.extent.truncate_size);
6751           if (op.extent.truncate_seq <= oi.truncate_seq) {
6752             dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6753                      << ", no-op" << dendl;
6754             break; // old
6755           }
6756           dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6757                    << ", truncating" << dendl;
6758           oi.truncate_seq = op.extent.truncate_seq;
6759           oi.truncate_size = op.extent.truncate_size;
6760         }
6761
6762         maybe_create_new_object(ctx);
6763         t->truncate(soid, op.extent.offset);
6764         if (oi.size > op.extent.offset) {
6765           interval_set<uint64_t> trim;
6766           trim.insert(op.extent.offset, oi.size-op.extent.offset);
6767           ctx->modified_ranges.union_of(trim);
6768           ctx->clean_regions.mark_data_region_dirty(op.extent.offset, oi.size - op.extent.offset);
6769         } else if (oi.size < op.extent.offset) {
6770           ctx->clean_regions.mark_data_region_dirty(oi.size, op.extent.offset - oi.size);
6771         }
6772         if (op.extent.offset != oi.size) {
6773           truncate_update_size_and_usage(ctx->delta_stats,
6774                                          oi,
6775                                          op.extent.offset);
6776         }
6777         ctx->delta_stats.num_wr++;
6778         // do no set exists, or we will break above DELETE -> TRUNCATE munging.
6779
6780         oi.clear_data_digest();
6781       }
6782       break;
6783
6784     case CEPH_OSD_OP_DELETE:
6785       ++ctx->num_write;
6786       result = 0;
6787       tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6788       {
6789         result = _delete_oid(ctx, false, ctx->ignore_cache);
6790       }
6791       break;
6792
6793     case CEPH_OSD_OP_WATCH:
6794       ++ctx->num_write;
6795       result = 0;
6796       {
6797         tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6798                    op.watch.cookie, op.watch.op);
6799         if (!obs.exists) {
6800           result = -ENOENT;
6801           break;
6802         }
6803         result = 0;
6804         uint64_t cookie = op.watch.cookie;
6805         entity_name_t entity = ctx->reqid.name;
6806         ObjectContextRef obc = ctx->obc;
6807
6808         dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6809                  << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6810                  << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6811         dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6812         dout(10) << "watch: peer_addr="
6813           << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6814
6815         uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6816         if (op.watch.timeout != 0) {
6817           timeout = op.watch.timeout;
6818         }
6819
6820         watch_info_t w(cookie, timeout,
6821           ctx->op->get_req()->get_connection()->get_peer_addr());
6822         if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6823             op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6824           if (oi.watchers.count(make_pair(cookie, entity))) {
6825             dout(10) << " found existing watch " << w << " by " << entity << dendl;
6826           } else {
6827             dout(10) << " registered new watch " << w << " by " << entity << dendl;
6828             oi.watchers[make_pair(cookie, entity)] = w;
6829             t->nop(soid);  // make sure update the object_info on disk!
6830           }
6831           bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6832           ctx->watch_connects.push_back(make_pair(w, will_ping));
6833         } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6834           if (!oi.watchers.count(make_pair(cookie, entity))) {
6835             result = -ENOTCONN;
6836             break;
6837           }
6838           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6839           ctx->watch_connects.push_back(make_pair(w, true));
6840         } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6841           /* Note: WATCH with PING doesn't cause may_write() to return true,
6842            * so if there is nothing else in the transaction, this is going
6843            * to run do_osd_op_effects, but not write out a log entry */
6844           if (!oi.watchers.count(make_pair(cookie, entity))) {
6845             result = -ENOTCONN;
6846             break;
6847           }
6848           map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6849             obc->watchers.find(make_pair(cookie, entity));
6850           if (p == obc->watchers.end() ||
6851               !p->second->is_connected()) {
6852             // client needs to reconnect
6853             result = -ETIMEDOUT;
6854             break;
6855           }
6856           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6857           p->second->got_ping(ceph_clock_now());
6858           result = 0;
6859         } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6860           map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6861             oi.watchers.find(make_pair(cookie, entity));
6862           if (oi_iter != oi.watchers.end()) {
6863             dout(10) << " removed watch " << oi_iter->second << " by "
6864                      << entity << dendl;
6865             oi.watchers.erase(oi_iter);
6866             t->nop(soid);  // update oi on disk
6867             ctx->watch_disconnects.push_back(
6868               watch_disconnect_t(cookie, entity, false));
6869           } else {
6870             dout(10) << " can't remove: no watch by " << entity << dendl;
6871           }
6872         }
6873       }
6874       break;
6875
6876     case CEPH_OSD_OP_CACHE_PIN:
6877       tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6878       if ((!pool.info.is_tier() ||
6879           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6880         result = -EINVAL;
6881         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6882         break;
6883       }
6884       ++ctx->num_write;
6885       result = 0;
6886       {
6887         if (!obs.exists || oi.is_whiteout()) {
6888           result = -ENOENT;
6889           break;
6890         }
6891
6892         if (!oi.is_cache_pinned()) {
6893           oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6894           ctx->modify = true;
6895           ctx->delta_stats.num_objects_pinned++;
6896           ctx->delta_stats.num_wr++;
6897         }
6898       }
6899       break;
6900
6901     case CEPH_OSD_OP_CACHE_UNPIN:
6902       tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6903       if ((!pool.info.is_tier() ||
6904           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6905         result = -EINVAL;
6906         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6907         break;
6908       }
6909       ++ctx->num_write;
6910       result = 0;
6911       {
6912         if (!obs.exists || oi.is_whiteout()) {
6913           result = -ENOENT;
6914           break;
6915         }
6916
6917         if (oi.is_cache_pinned()) {
6918           oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6919           ctx->modify = true;
6920           ctx->delta_stats.num_objects_pinned--;
6921           ctx->delta_stats.num_wr++;
6922         }
6923       }
6924       break;
6925
6926     case CEPH_OSD_OP_SET_REDIRECT:
6927       ++ctx->num_write;
6928       result = 0;
6929       {
6930         if (pool.info.is_tier()) {
6931           result = -EINVAL;
6932           break;
6933         }
6934         if (!obs.exists) {
6935           result = -ENOENT;
6936           break;
6937         }
6938         if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
6939           result = -EOPNOTSUPP;
6940           break;
6941         }
6942
6943         object_t target_name;
6944         object_locator_t target_oloc;
6945         snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6946         version_t target_version = op.copy_from.src_version;
6947         try {
6948           decode(target_name, bp);
6949           decode(target_oloc, bp);
6950         }
6951         catch (ceph::buffer::error& e) {
6952           result = -EINVAL;
6953           goto fail;
6954         }
6955         pg_t raw_pg;
6956         get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6957         hobject_t target(target_name, target_oloc.key, target_snapid,
6958                 raw_pg.ps(), raw_pg.pool(),
6959                 target_oloc.nspace);
6960         if (target == soid) {
6961           dout(20) << " set-redirect self is invalid" << dendl;
6962           result = -EINVAL;
6963           break;
6964         }
6965
6966         bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6967         bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6968         if (has_reference) {
6969           result = -EINVAL;
6970           dout(5) << " the object is already a manifest " << dendl;
6971           break;
6972         }
6973         if (op_finisher == nullptr && need_reference) {
6974           // start
6975           ctx->op_finishers[ctx->current_osd_subop_num].reset(
6976             new SetManifestFinisher(osd_op));
6977           ManifestOpRef mop = std::make_shared<ManifestOp>(new RefCountCallback(ctx, osd_op));
6978           C_SetManifestRefCountDone* fin = new C_SetManifestRefCountDone(this, mop, soid);
6979           ceph_tid_t tid = refcount_manifest(soid, target,
6980                                               refcount_t::INCREMENT_REF, fin, std::nullopt);
6981           mop->objecter_tid = tid;
6982           manifest_ops[soid] = mop;
6983           ctx->obc->start_block();
6984           result = -EINPROGRESS;
6985         } else {
6986           // finish
6987           if (op_finisher) {
6988             result = op_finisher->execute();
6989             ceph_assert(result == 0);
6990           }
6991
6992           if (!oi.has_manifest() && !oi.manifest.is_redirect())
6993             ctx->delta_stats.num_objects_manifest++;
6994
6995           oi.set_flag(object_info_t::FLAG_MANIFEST);
6996           oi.manifest.redirect_target = target;
6997           oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6998           t->truncate(soid, 0);
6999           ctx->clean_regions.mark_data_region_dirty(0, oi.size);
7000           if (oi.is_omap() && pool.info.supports_omap()) {
7001             t->omap_clear(soid);
7002             obs.oi.clear_omap_digest();
7003             obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7004             ctx->clean_regions.mark_omap_dirty();
7005           }
7006           write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
7007             0, oi.size, false);
7008           ctx->delta_stats.num_bytes -= oi.size;
7009           oi.size = 0;
7010           oi.new_object();
7011           oi.user_version = target_version;
7012           ctx->user_at_version = target_version;
7013           /* rm_attrs */
7014           map<string,bufferlist> rmattrs;
7015           result = getattrs_maybe_cache(ctx->obc, &rmattrs);
7016           if (result < 0) {
7017             dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7018             return result;
7019           }
7020           map<string, bufferlist>::iterator iter;
7021           for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
7022             const string& name = iter->first;
7023             t->rmattr(soid, name);
7024           }
7025           if (!has_reference && need_reference) {
7026             oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
7027           }
7028           dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
7029           if (op_finisher) {
7030             ctx->op_finishers.erase(ctx->current_osd_subop_num);
7031           }
7032         }
7033       }
7034
7035       break;
7036
7037     case CEPH_OSD_OP_SET_CHUNK:
7038       ++ctx->num_write;
7039       result = 0;
7040       {
7041         if (pool.info.is_tier()) {
7042           result = -EINVAL;
7043           break;
7044         }
7045         if (!obs.exists) {
7046           result = -ENOENT;
7047           break;
7048         }
7049         if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7050           result = -EOPNOTSUPP;
7051           break;
7052         }
7053         if (oi.manifest.is_redirect()) {
7054           result = -EINVAL;
7055           goto fail;
7056         }
7057
7058         object_locator_t tgt_oloc;
7059         uint64_t src_offset, src_length, tgt_offset;
7060         object_t tgt_name;
7061         try {
7062           decode(src_offset, bp);
7063           decode(src_length, bp);
7064           decode(tgt_oloc, bp);
7065           decode(tgt_name, bp);
7066           decode(tgt_offset, bp);
7067         }
7068         catch (ceph::buffer::error& e) {
7069           result = -EINVAL;
7070           goto fail;
7071         }
7072
7073         if (!src_length) {
7074           result = -EINVAL;
7075           goto fail;
7076         }
7077         if (src_offset + src_length > oi.size) {
7078           result = -ERANGE;
7079           goto fail;
7080         }
7081         if (!(osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE)) {
7082           result = -EOPNOTSUPP;
7083           break;
7084         }
7085         if (pool.info.is_erasure()) {
7086           result = -EOPNOTSUPP;
7087           break;
7088         }
7089
7090         for (auto &p : oi.manifest.chunk_map) {
7091           interval_set<uint64_t> chunk;
7092           chunk.insert(p.first, p.second.length);
7093           if (chunk.intersects(src_offset, src_length)) {
7094             dout(20) << __func__ << " overlapped !! offset: " << src_offset << " length: " << src_length
7095                     << " chunk_info: " << p << dendl;
7096             result = -EOPNOTSUPP;
7097             goto fail;
7098           }
7099         }
7100
7101         pg_t raw_pg;
7102         chunk_info_t chunk_info;
7103         get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
7104         hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
7105                          raw_pg.ps(), raw_pg.pool(),
7106                          tgt_oloc.nspace);
7107         bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
7108                              (oi.manifest.chunk_map[src_offset].test_flag(chunk_info_t::FLAG_HAS_REFERENCE));
7109         if (has_reference) {
7110           result = -EINVAL;
7111           dout(5) << " the object is already a manifest " << dendl;
7112           break;
7113         }
7114         chunk_info.oid = target;
7115         chunk_info.offset = tgt_offset;
7116         chunk_info.length = src_length;
7117         if (op_finisher == nullptr)  {
7118           // start
7119           ctx->op_finishers[ctx->current_osd_subop_num].reset(
7120             new SetManifestFinisher(osd_op));
7121           object_manifest_t set_chunk;
7122           bool need_inc_ref = false;
7123           set_chunk.chunk_map[src_offset] = chunk_info;
7124           need_inc_ref = inc_refcount_by_set(ctx, set_chunk, osd_op);
7125           if (need_inc_ref) {
7126             result = -EINPROGRESS;
7127             break;
7128           }
7129         }
7130         if (op_finisher) {
7131           result = op_finisher->execute();
7132           ceph_assert(result == 0);
7133         }
7134
7135         oi.manifest.chunk_map[src_offset] = chunk_info;
7136         if (!oi.has_manifest() && !oi.manifest.is_chunked())
7137           ctx->delta_stats.num_objects_manifest++;
7138         oi.set_flag(object_info_t::FLAG_MANIFEST);
7139         oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
7140         if (!has_reference) {
7141           oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
7142         }
7143         ctx->modify = true;
7144         ctx->cache_operation = true;
7145
7146         dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
7147                  << " chunk_info: " << chunk_info << dendl;
7148         if (op_finisher) {
7149           ctx->op_finishers.erase(ctx->current_osd_subop_num);
7150         }
7151       }
7152
7153       break;
7154
7155     case CEPH_OSD_OP_TIER_PROMOTE:
7156       ++ctx->num_write;
7157       result = 0;
7158       {
7159         if (pool.info.is_tier()) {
7160           result = -EINVAL;
7161           break;
7162         }
7163         if (!obs.exists) {
7164           result = -ENOENT;
7165           break;
7166         }
7167         if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7168           result = -EOPNOTSUPP;
7169           break;
7170         }
7171         if (!obs.oi.has_manifest()) {
7172           result = 0;
7173           break;
7174         }
7175
7176         if (op_finisher == nullptr) {
7177           PromoteManifestCallback *cb;
7178           object_locator_t my_oloc;
7179           hobject_t src_hoid;
7180
7181           if (obs.oi.manifest.is_chunked()) {
7182             src_hoid = obs.oi.soid;
7183           } else if (obs.oi.manifest.is_redirect()) {
7184             object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7185             my_oloc = src_oloc;
7186             src_hoid = obs.oi.manifest.redirect_target;
7187           } else {
7188             ceph_abort_msg("unrecognized manifest type");
7189           }
7190           cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7191           ctx->op_finishers[ctx->current_osd_subop_num].reset(
7192             new PromoteFinisher(cb));
7193           unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7194                            CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7195                            CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7196                            CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7197           unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7198           start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7199                      obs.oi.soid.snap == CEPH_NOSNAP,
7200                      src_fadvise_flags, 0);
7201
7202           dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7203           result = -EINPROGRESS;
7204         } else {
7205           result = op_finisher->execute();
7206           ceph_assert(result == 0);
7207           ctx->op_finishers.erase(ctx->current_osd_subop_num);
7208         }
7209       }
7210
7211       break;
7212
7213     case CEPH_OSD_OP_TIER_FLUSH:
7214       ++ctx->num_write;
7215       result = 0;
7216       {
7217         if (pool.info.is_tier()) {
7218           result = -EINVAL;
7219           break;
7220         }
7221         if (!obs.exists) {
7222           result = -ENOENT;
7223           break;
7224         }
7225         if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7226           result = -EOPNOTSUPP;
7227           break;
7228         }
7229         if (!obs.oi.has_manifest()) {
7230           result = 0;
7231           break;
7232         }
7233
7234         if (oi.is_dirty()) {
7235           result = start_flush(ctx->op, ctx->obc, true, NULL, std::nullopt);
7236           if (result == -EINPROGRESS)
7237             result = -EAGAIN;
7238         } else {
7239           result = 0;
7240         }
7241       }
7242
7243       break;
7244
7245     case CEPH_OSD_OP_TIER_EVICT:
7246       ++ctx->num_write;
7247       result = 0;
7248       {
7249         if (pool.info.is_tier()) {
7250           result = -EINVAL;
7251           break;
7252         }
7253         if (!obs.exists) {
7254           result = -ENOENT;
7255           break;
7256         }
7257         if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
7258           result = -EOPNOTSUPP;
7259           break;
7260         }
7261         if (!obs.oi.has_manifest()) {
7262           result = -EINVAL;
7263           break;
7264         }
7265
7266         // The chunks already has a reference, so it is just enough to invoke truncate if necessary
7267         uint64_t chunk_length = 0;
7268         for (auto p : obs.oi.manifest.chunk_map) {
7269           chunk_length += p.second.length;
7270         }
7271         if (chunk_length == obs.oi.size) {
7272           for (auto &p : obs.oi.manifest.chunk_map) {
7273             p.second.set_flag(chunk_info_t::FLAG_MISSING);
7274           }
7275           // punch hole
7276           t->zero(soid, 0, oi.size);
7277           oi.clear_data_digest();
7278           ctx->delta_stats.num_wr++;
7279           ctx->cache_operation = true;
7280         }
7281         osd->logger->inc(l_osd_tier_evict);
7282       }
7283
7284       break;
7285
7286     case CEPH_OSD_OP_UNSET_MANIFEST:
7287       ++ctx->num_write;
7288       result = 0;
7289       {
7290         if (pool.info.is_tier()) {
7291           result = -EINVAL;
7292           break;
7293         }
7294         if (!obs.exists) {
7295           result = -ENOENT;
7296           break;
7297         }
7298         if (!oi.has_manifest()) {
7299           result = -EOPNOTSUPP;
7300           break;
7301         }
7302         if (get_osdmap()->require_osd_release < ceph_release_t::luminous) {
7303           result = -EOPNOTSUPP;
7304           break;
7305         }
7306
7307         dec_all_refcount_manifest(oi, ctx);
7308
7309         oi.clear_flag(object_info_t::FLAG_MANIFEST);
7310         oi.manifest = object_manifest_t();
7311         ctx->delta_stats.num_objects_manifest--;
7312         ctx->delta_stats.num_wr++;
7313         ctx->modify = true;
7314       }
7315
7316       break;
7317
7318       // -- object attrs --
7319
7320     case CEPH_OSD_OP_SETXATTR:
7321       ++ctx->num_write;
7322       result = 0;
7323       {
7324         if (cct->_conf->osd_max_attr_size > 0 &&
7325             op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7326           tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7327           result = -EFBIG;
7328           break;
7329         }
7330         unsigned max_name_len =
7331           std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7332                              cct->_conf->osd_max_attr_name_len);
7333         if (op.xattr.name_len > max_name_len) {
7334           result = -ENAMETOOLONG;
7335           break;
7336         }
7337         maybe_create_new_object(ctx);
7338         string aname;
7339         bp.copy(op.xattr.name_len, aname);
7340         tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7341         string name = "_" + aname;
7342         bufferlist bl;
7343         bp.copy(op.xattr.value_len, bl);
7344         t->setattr(soid, name, bl);
7345         ctx->delta_stats.num_wr++;
7346       }
7347       break;
7348
7349     case CEPH_OSD_OP_RMXATTR:
7350       ++ctx->num_write;
7351       result = 0;
7352       {
7353         string aname;
7354         bp.copy(op.xattr.name_len, aname);
7355         tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7356         if (!obs.exists || oi.is_whiteout()) {
7357           result = -ENOENT;
7358           break;
7359         }
7360         string name = "_" + aname;
7361         t->rmattr(soid, name);
7362         ctx->delta_stats.num_wr++;
7363       }
7364       break;
7365
7366
7367       // -- fancy writers --
7368     case CEPH_OSD_OP_APPEND:
7369       {
7370         tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7371         // just do it inline; this works because we are happy to execute
7372         // fancy op on replicas as well.
7373         vector<OSDOp> nops(1);
7374         OSDOp& newop = nops[0];
7375         newop.op.op = CEPH_OSD_OP_WRITE;
7376         newop.op.extent.offset = oi.size;
7377         newop.op.extent.length = op.extent.length;
7378         newop.op.extent.truncate_seq = oi.truncate_seq;
7379         newop.indata = osd_op.indata;
7380         result = do_osd_ops(ctx, nops);
7381         osd_op.outdata = std::move(newop.outdata);
7382       }
7383       break;
7384
7385     case CEPH_OSD_OP_STARTSYNC:
7386       result = 0;
7387       t->nop(soid);
7388       break;
7389
7390       // -- trivial map --
7391     case CEPH_OSD_OP_TMAPGET:
7392       tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
7393       if (pool.info.is_erasure()) {
7394         result = -EOPNOTSUPP;
7395         break;
7396       }
7397       {
7398         vector<OSDOp> nops(1);
7399         OSDOp& newop = nops[0];
7400         newop.op.op = CEPH_OSD_OP_SYNC_READ;
7401         newop.op.extent.offset = 0;
7402         newop.op.extent.length = 0;
7403         result = do_osd_ops(ctx, nops);
7404         osd_op.outdata = std::move(newop.outdata);
7405       }
7406       break;
7407
7408     case CEPH_OSD_OP_TMAPPUT:
7409       tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
7410       if (pool.info.is_erasure()) {
7411         result = -EOPNOTSUPP;
7412         break;
7413       }
7414       {
7415         //_dout_lock.Lock();
7416         //osd_op.data.hexdump(*_dout);
7417         //_dout_lock.Unlock();
7418
7419         // verify sort order
7420         bool unsorted = false;
7421         if (true) {
7422           bufferlist header;
7423           decode(header, bp);
7424           uint32_t n;
7425           decode(n, bp);
7426           string last_key;
7427           while (n--) {
7428             string key;
7429             decode(key, bp);
7430             dout(10) << "tmapput key " << key << dendl;
7431             bufferlist val;
7432             decode(val, bp);
7433             if (key < last_key) {
7434               dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7435               unsorted = true;
7436               break;
7437             }
7438             last_key = key;
7439           }
7440         }
7441
7442         // write it
7443         vector<OSDOp> nops(1);
7444         OSDOp& newop = nops[0];
7445         newop.op.op = CEPH_OSD_OP_WRITEFULL;
7446         newop.op.extent.offset = 0;
7447         newop.op.extent.length = osd_op.indata.length();
7448         newop.indata = osd_op.indata;
7449
7450         if (unsorted) {
7451           bp = osd_op.indata.begin();
7452           bufferlist header;
7453           map<string, bufferlist> m;
7454           decode(header, bp);
7455           decode(m, bp);
7456           ceph_assert(bp.end());
7457           bufferlist newbl;
7458           encode(header, newbl);
7459           encode(m, newbl);
7460           newop.indata = newbl;
7461         }
7462         result = do_osd_ops(ctx, nops);
7463         ceph_assert(result == 0);
7464       }
7465       break;
7466
7467     case CEPH_OSD_OP_TMAPUP:
7468       tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
7469       if (pool.info.is_erasure()) {
7470         result = -EOPNOTSUPP;
7471         break;
7472       }
7473       ++ctx->num_write;
7474       result = do_tmapup(ctx, bp, osd_op);
7475       break;
7476
7477     case CEPH_OSD_OP_TMAP2OMAP:
7478       ++ctx->num_write;
7479       tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7480       result = do_tmap2omap(ctx, op.tmap2omap.flags);
7481       break;
7482
7483       // OMAP Read ops
7484     case CEPH_OSD_OP_OMAPGETKEYS:
7485       ++ctx->num_read;
7486       {
7487         string start_after;
7488         uint64_t max_return;
7489         try {
7490           decode(start_after, bp);
7491           decode(max_return, bp);
7492         }
7493         catch (ceph::buffer::error& e) {
7494           result = -EINVAL;
7495           tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7496           goto fail;
7497         }
7498         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7499           max_return = cct->_conf->osd_max_omap_entries_per_request;
7500         }
7501         tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7502
7503         bufferlist bl;
7504         uint32_t num = 0;
7505         bool truncated = false;
7506         if (oi.is_omap()) {
7507           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7508             ch, ghobject_t(soid)
7509             );
7510           ceph_assert(iter);
7511           iter->upper_bound(start_after);
7512           for (num = 0; iter->valid(); ++num, iter->next()) {
7513             if (num >= max_return ||
7514                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7515               truncated = true;
7516               break;
7517             }
7518             encode(iter->key(), bl);
7519           }
7520         } // else return empty out_set
7521         encode(num, osd_op.outdata);
7522         osd_op.outdata.claim_append(bl);
7523         encode(truncated, osd_op.outdata);
7524         ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7525         ctx->delta_stats.num_rd++;
7526       }
7527       break;
7528
7529     case CEPH_OSD_OP_OMAPGETVALS:
7530       ++ctx->num_read;
7531       {
7532         string start_after;
7533         uint64_t max_return;
7534         string filter_prefix;
7535         try {
7536           decode(start_after, bp);
7537           decode(max_return, bp);
7538           decode(filter_prefix, bp);
7539         }
7540         catch (ceph::buffer::error& e) {
7541           result = -EINVAL;
7542           tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7543           goto fail;
7544         }
7545         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7546           max_return = cct->_conf->osd_max_omap_entries_per_request;
7547         }
7548         tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7549
7550         uint32_t num = 0;
7551         bool truncated = false;
7552         bufferlist bl;
7553         if (oi.is_omap()) {
7554           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7555             ch, ghobject_t(soid)
7556             );
7557           if (!iter) {
7558             result = -ENOENT;
7559             goto fail;
7560           }
7561           iter->upper_bound(start_after);
7562           if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7563           for (num = 0;
7564                iter->valid() &&
7565                  iter->key().substr(0, filter_prefix.size()) == filter_prefix;
7566                ++num, iter->next()) {
7567             dout(20) << "Found key " << iter->key() << dendl;
7568             if (num >= max_return ||
7569                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7570               truncated = true;
7571               break;
7572             }
7573             encode(iter->key(), bl);
7574             encode(iter->value(), bl);
7575           }
7576         } // else return empty out_set
7577         encode(num, osd_op.outdata);
7578         osd_op.outdata.claim_append(bl);
7579         encode(truncated, osd_op.outdata);
7580         ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7581         ctx->delta_stats.num_rd++;
7582       }
7583       break;
7584
7585     case CEPH_OSD_OP_OMAPGETHEADER:
7586       tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7587       if (!oi.is_omap()) {
7588         // return empty header
7589         break;
7590       }
7591       ++ctx->num_read;
7592       {
7593         osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
7594         ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7595         ctx->delta_stats.num_rd++;
7596       }
7597       break;
7598
7599     case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7600       ++ctx->num_read;
7601       {
7602         set<string> keys_to_get;
7603         try {
7604           decode(keys_to_get, bp);
7605         }
7606         catch (ceph::buffer::error& e) {
7607           result = -EINVAL;
7608           tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7609           goto fail;
7610         }
7611         tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7612         map<string, bufferlist> out;
7613         if (oi.is_omap()) {
7614           osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7615         } // else return empty omap entries
7616         encode(out, osd_op.outdata);
7617         ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7618         ctx->delta_stats.num_rd++;
7619       }
7620       break;
7621
7622     case CEPH_OSD_OP_OMAP_CMP:
7623       ++ctx->num_read;
7624       {
7625         if (!obs.exists || oi.is_whiteout()) {
7626           result = -ENOENT;
7627           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7628           break;
7629         }
7630         map<string, pair<bufferlist, int> > assertions;
7631         try {
7632           decode(assertions, bp);
7633         }
7634         catch (ceph::buffer::error& e) {
7635           result = -EINVAL;
7636           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7637           goto fail;
7638         }
7639         tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7640
7641         map<string, bufferlist> out;
7642
7643         if (oi.is_omap()) {
7644           set<string> to_get;
7645           for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7646                i != assertions.end();
7647                ++i)
7648             to_get.insert(i->first);
7649           int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7650                                               to_get, &out);
7651           if (r < 0) {
7652             result = r;
7653             break;
7654           }
7655         } // else leave out empty
7656
7657         //Should set num_rd_kb based on encode length of map
7658         ctx->delta_stats.num_rd++;
7659
7660         int r = 0;
7661         bufferlist empty;
7662         for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7663              i != assertions.end();
7664              ++i) {
7665           auto out_entry = out.find(i->first);
7666           bufferlist &bl = (out_entry != out.end()) ?
7667             out_entry->second : empty;
7668           switch (i->second.second) {
7669           case CEPH_OSD_CMPXATTR_OP_EQ:
7670             if (!(bl == i->second.first)) {
7671               r = -ECANCELED;
7672             }
7673             break;
7674           case CEPH_OSD_CMPXATTR_OP_LT:
7675             if (!(bl < i->second.first)) {
7676               r = -ECANCELED;
7677             }
7678             break;
7679           case CEPH_OSD_CMPXATTR_OP_GT:
7680             if (!(bl > i->second.first)) {
7681               r = -ECANCELED;
7682             }
7683             break;
7684           default:
7685             r = -EINVAL;
7686             break;
7687           }
7688           if (r < 0)
7689             break;
7690         }
7691         if (r < 0) {
7692           result = r;
7693         }
7694       }
7695       break;
7696
7697       // OMAP Write ops
7698     case CEPH_OSD_OP_OMAPSETVALS:
7699       if (!pool.info.supports_omap()) {
7700         result = -EOPNOTSUPP;
7701         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7702         break;
7703       }
7704       ++ctx->num_write;
7705       result = 0;
7706       {
7707         maybe_create_new_object(ctx);
7708         bufferlist to_set_bl;
7709         try {
7710           decode_str_str_map_to_bl(bp, &to_set_bl);
7711         }
7712         catch (ceph::buffer::error& e) {
7713           result = -EINVAL;
7714           tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7715           goto fail;
7716         }
7717         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7718         if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7719           dout(20) << "setting vals: " << dendl;
7720           map<string,bufferlist> to_set;
7721           bufferlist::const_iterator pt = to_set_bl.begin();
7722           decode(to_set, pt);
7723           for (map<string, bufferlist>::iterator i = to_set.begin();
7724                i != to_set.end();
7725                ++i) {
7726             dout(20) << "\t" << i->first << dendl;
7727           }
7728         }
7729         t->omap_setkeys(soid, to_set_bl);
7730         ctx->clean_regions.mark_omap_dirty();
7731         ctx->delta_stats.num_wr++;
7732         ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7733       }
7734       obs.oi.set_flag(object_info_t::FLAG_OMAP);
7735       obs.oi.clear_omap_digest();
7736       break;
7737
7738     case CEPH_OSD_OP_OMAPSETHEADER:
7739       tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7740       if (!pool.info.supports_omap()) {
7741         result = -EOPNOTSUPP;
7742         break;
7743       }
7744       ++ctx->num_write;
7745       result = 0;
7746       {
7747         maybe_create_new_object(ctx);
7748         t->omap_setheader(soid, osd_op.indata);
7749         ctx->clean_regions.mark_omap_dirty();
7750         ctx->delta_stats.num_wr++;
7751       }
7752       obs.oi.set_flag(object_info_t::FLAG_OMAP);
7753       obs.oi.clear_omap_digest();
7754       break;
7755
7756     case CEPH_OSD_OP_OMAPCLEAR:
7757       tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7758       if (!pool.info.supports_omap()) {
7759         result = -EOPNOTSUPP;
7760         break;
7761       }
7762       ++ctx->num_write;
7763       result = 0;
7764       {
7765         if (!obs.exists || oi.is_whiteout()) {
7766           result = -ENOENT;
7767           break;
7768         }
7769         if (oi.is_omap()) {
7770           t->omap_clear(soid);
7771           ctx->clean_regions.mark_omap_dirty();
7772           ctx->delta_stats.num_wr++;
7773           obs.oi.clear_omap_digest();
7774           obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7775         }
7776       }
7777       break;
7778
7779     case CEPH_OSD_OP_OMAPRMKEYS:
7780       if (!pool.info.supports_omap()) {
7781         result = -EOPNOTSUPP;
7782         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7783         break;
7784       }
7785       ++ctx->num_write;
7786       result = 0;
7787       {
7788         if (!obs.exists || oi.is_whiteout()) {
7789           result = -ENOENT;
7790           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7791           break;
7792         }
7793         bufferlist to_rm_bl;
7794         try {
7795           decode_str_set_to_bl(bp, &to_rm_bl);
7796         }
7797         catch (ceph::buffer::error& e) {
7798           result = -EINVAL;
7799           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7800           goto fail;
7801         }
7802         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7803         t->omap_rmkeys(soid, to_rm_bl);
7804         ctx->clean_regions.mark_omap_dirty();
7805         ctx->delta_stats.num_wr++;
7806       }
7807       obs.oi.clear_omap_digest();
7808       break;
7809
7810     case CEPH_OSD_OP_OMAPRMKEYRANGE:
7811       tracepoint(osd, do_osd_op_pre_omaprmkeyrange, soid.oid.name.c_str(), soid.snap.val);
7812       if (!pool.info.supports_omap()) {
7813         result = -EOPNOTSUPP;
7814         break;
7815       }
7816       ++ctx->num_write;
7817       result = 0;
7818       {
7819         if (!obs.exists || oi.is_whiteout()) {
7820           result = -ENOENT;
7821           break;
7822         }
7823         std::string key_begin, key_end;
7824         try {
7825           decode(key_begin, bp);
7826           decode(key_end, bp);
7827         } catch (ceph::buffer::error& e) {
7828           result = -EINVAL;
7829           goto fail;
7830         }
7831         t->omap_rmkeyrange(soid, key_begin, key_end);
7832         ctx->delta_stats.num_wr++;
7833       }
7834       obs.oi.clear_omap_digest();
7835       break;
7836
7837     case CEPH_OSD_OP_COPY_GET:
7838       ++ctx->num_read;
7839       tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7840                  soid.snap.val);
7841       if (op_finisher == nullptr) {
7842         result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7843       } else {
7844         result = op_finisher->execute();
7845       }
7846       break;
7847
7848     case CEPH_OSD_OP_COPY_FROM:
7849     case CEPH_OSD_OP_COPY_FROM2:
7850       ++ctx->num_write;
7851       result = 0;
7852       {
7853         object_t src_name;
7854         object_locator_t src_oloc;
7855         uint32_t truncate_seq = 0;
7856         uint64_t truncate_size = 0;
7857         bool have_truncate = false;
7858         snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7859         version_t src_version = op.copy_from.src_version;
7860
7861         if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7862             (op.copy_from.flags & ~CEPH_OSD_COPY_FROM_FLAGS)) {
7863           dout(20) << "invalid copy-from2 flags 0x"
7864                   << std::hex << (int)op.copy_from.flags << std::dec << dendl;
7865           result = -EINVAL;
7866           break;
7867         }
7868         try {
7869           decode(src_name, bp);
7870           decode(src_oloc, bp);
7871           // check if client sent us truncate_seq and truncate_size
7872           if ((op.op == CEPH_OSD_OP_COPY_FROM2) &&
7873               (op.copy_from.flags & CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ)) {
7874             decode(truncate_seq, bp);
7875             decode(truncate_size, bp);
7876             have_truncate = true;
7877           }
7878         }
7879         catch (ceph::buffer::error& e) {
7880           result = -EINVAL;
7881           tracepoint(osd,
7882                      do_osd_op_pre_copy_from,
7883                      soid.oid.name.c_str(),
7884                      soid.snap.val,
7885                      "???",
7886                      0,
7887                      "???",
7888                      "???",
7889                      0,
7890                      src_snapid,
7891                      src_version);
7892           goto fail;
7893         }
7894         tracepoint(osd,
7895                    do_osd_op_pre_copy_from,
7896                    soid.oid.name.c_str(),
7897                    soid.snap.val,
7898                    src_name.name.c_str(),
7899                    src_oloc.pool,
7900                    src_oloc.key.c_str(),
7901                    src_oloc.nspace.c_str(),
7902                    src_oloc.hash,
7903                    src_snapid,
7904                    src_version);
7905         if (op_finisher == nullptr) {
7906           // start
7907           pg_t raw_pg;
7908           get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
7909           hobject_t src(src_name, src_oloc.key, src_snapid,
7910                         raw_pg.ps(), raw_pg.pool(),
7911                         src_oloc.nspace);
7912           if (src == soid) {
7913             dout(20) << " copy from self is invalid" << dendl;
7914             result = -EINVAL;
7915             break;
7916           }
7917           CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
7918           if (have_truncate)
7919             cb->set_truncate(truncate_seq, truncate_size);
7920           ctx->op_finishers[ctx->current_osd_subop_num].reset(
7921             new CopyFromFinisher(cb));
7922           start_copy(cb, ctx->obc, src, src_oloc, src_version,
7923                      op.copy_from.flags,
7924                      false,
7925                      op.copy_from.src_fadvise_flags,
7926                      op.flags);
7927           result = -EINPROGRESS;
7928         } else {
7929           // finish
7930           result = op_finisher->execute();
7931           ceph_assert(result == 0);
7932
7933           // COPY_FROM cannot be executed multiple times -- it must restart
7934           ctx->op_finishers.erase(ctx->current_osd_subop_num);
7935         }
7936       }
7937       break;
7938
7939     default:
7940       tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
7941       dout(1) << "unrecognized osd op " << op.op
7942               << " " << ceph_osd_op_name(op.op)
7943               << dendl;
7944       result = -EOPNOTSUPP;
7945     }
7946
7947   fail:
7948     osd_op.rval = result;
7949     tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
7950     if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
7951         result != -EAGAIN && result != -EINPROGRESS)
7952       result = 0;
7953
7954     if (result < 0)
7955       break;
7956   }
7957   if (result < 0) {
7958     dout(10) << __func__ << " error: " << cpp_strerror(result) << dendl;
7959   }
7960   return result;
7961 }
7962
7963 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
7964 {
7965   if (ctx->new_obs.oi.size == 0) {
7966     dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
7967     return -ENODATA;
7968   }
7969   vector<OSDOp> nops(1);
7970   OSDOp &newop = nops[0];
7971   newop.op.op = CEPH_OSD_OP_TMAPGET;
7972   do_osd_ops(ctx, nops);
7973   try {
7974     bufferlist::const_iterator i = newop.outdata.begin();
7975     decode(*header, i);
7976     (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
7977   } catch (...) {
7978     dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
7979              << dendl;
7980     return -EINVAL;
7981   }
7982   dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
7983            << dendl;
7984   return 0;
7985 }
7986
7987 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
7988                                         const SnapSet& ss)
7989 {
7990   // verify that all clones have been evicted
7991   dout(20) << __func__ << " verifying clones are absent "
7992            << ss << dendl;
7993   for (vector<snapid_t>::const_iterator p = ss.clones.begin();
7994        p != ss.clones.end();
7995        ++p) {
7996     hobject_t clone_oid = soid;
7997     clone_oid.snap = *p;
7998     if (is_missing_object(clone_oid))
7999       return -EBUSY;
8000     ObjectContextRef clone_obc = get_object_context(clone_oid, false);
8001     if (clone_obc && clone_obc->obs.exists) {
8002       dout(10) << __func__ << " cannot evict head before clone "
8003                << clone_oid << dendl;
8004       return -EBUSY;
8005     }
8006     if (copy_ops.count(clone_oid)) {
8007       dout(10) << __func__ << " cannot evict head, pending promote on clone "
8008                << clone_oid << dendl;
8009       return -EBUSY;
8010     }
8011   }
8012   return 0;
8013 }
8014
8015 inline int PrimaryLogPG::_delete_oid(
8016   OpContext *ctx,
8017   bool no_whiteout,     // no whiteouts, no matter what.
8018   bool try_no_whiteout) // try not to whiteout
8019 {
8020   SnapSet& snapset = ctx->new_snapset;
8021   ObjectState& obs = ctx->new_obs;
8022   object_info_t& oi = obs.oi;
8023   const hobject_t& soid = oi.soid;
8024   PGTransaction* t = ctx->op_t.get();
8025
8026   // cache: cache: set whiteout on delete?
8027   bool whiteout = false;
8028   if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
8029       && !no_whiteout
8030       && !try_no_whiteout) {
8031     whiteout = true;
8032   }
8033
8034   // in luminous or later, we can't delete the head if there are
8035   // clones. we trust the caller passing no_whiteout has already
8036   // verified they don't exist.
8037   if (!snapset.clones.empty() ||
8038       (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
8039     if (no_whiteout) {
8040       dout(20) << __func__ << " has or will have clones but no_whiteout=1"
8041                << dendl;
8042     } else {
8043       dout(20) << __func__ << " has or will have clones; will whiteout"
8044                << dendl;
8045       whiteout = true;
8046     }
8047   }
8048   dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
8049            << " no_whiteout=" << (int)no_whiteout
8050            << " try_no_whiteout=" << (int)try_no_whiteout
8051            << dendl;
8052   if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
8053     return -ENOENT;
8054
8055   t->remove(soid);
8056
8057   if (oi.size > 0) {
8058     interval_set<uint64_t> ch;
8059     ch.insert(0, oi.size);
8060     ctx->modified_ranges.union_of(ch);
8061     ctx->clean_regions.mark_data_region_dirty(0, oi.size);
8062   }
8063
8064   ctx->clean_regions.mark_omap_dirty();
8065   ctx->delta_stats.num_wr++;
8066   if (soid.is_snap()) {
8067     ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
8068     ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
8069   } else {
8070     ctx->delta_stats.num_bytes -= oi.size;
8071   }
8072   oi.size = 0;
8073   oi.new_object();
8074
8075   // disconnect all watchers
8076   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
8077          oi.watchers.begin();
8078        p != oi.watchers.end();
8079        ++p) {
8080     dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
8081     ctx->watch_disconnects.push_back(
8082       watch_disconnect_t(p->first.first, p->first.second, true));
8083   }
8084   oi.watchers.clear();
8085
8086   if (oi.has_manifest()) {
8087     ctx->delta_stats.num_objects_manifest--;
8088     dec_all_refcount_manifest(oi, ctx);
8089   }
8090
8091   if (whiteout) {
8092     dout(20) << __func__ << " setting whiteout on " << soid << dendl;
8093     oi.set_flag(object_info_t::FLAG_WHITEOUT);
8094     ctx->delta_stats.num_whiteouts++;
8095     t->create(soid);
8096     osd->logger->inc(l_osd_tier_whiteout);
8097     return 0;
8098   }
8099
8100   // delete the head
8101   ctx->delta_stats.num_objects--;
8102   if (soid.is_snap())
8103     ctx->delta_stats.num_object_clones--;
8104   if (oi.is_whiteout()) {
8105     dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
8106     ctx->delta_stats.num_whiteouts--;
8107     oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8108   }
8109   if (oi.is_cache_pinned()) {
8110     ctx->delta_stats.num_objects_pinned--;
8111   }
8112   obs.exists = false;
8113   return 0;
8114 }
8115
8116 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
8117 {
8118   SnapSet& snapset = ctx->new_snapset;
8119   ObjectState& obs = ctx->new_obs;
8120   object_info_t& oi = obs.oi;
8121   const hobject_t& soid = oi.soid;
8122   PGTransaction* t = ctx->op_t.get();
8123   snapid_t snapid = (uint64_t)op.snap.snapid;
8124   hobject_t missing_oid;
8125
8126   dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
8127
8128   ObjectContextRef rollback_to;
8129
8130   int ret = find_object_context(
8131     hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
8132               soid.get_namespace()),
8133     &rollback_to, false, false, &missing_oid);
8134   if (ret == -EAGAIN) {
8135     /* clone must be missing */
8136     ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
8137     dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
8138              << missing_oid << " (requested snapid: ) " << snapid << dendl;
8139     block_write_on_degraded_snap(missing_oid, ctx->op);
8140     return ret;
8141   }
8142   {
8143     ObjectContextRef promote_obc;
8144     cache_result_t tier_mode_result;
8145     if (obs.exists && obs.oi.has_manifest()) {
8146       tier_mode_result =
8147         maybe_handle_manifest_detail(
8148           ctx->op,
8149           true,
8150           rollback_to);
8151     } else {
8152       tier_mode_result =
8153         maybe_handle_cache_detail(
8154           ctx->op,
8155           true,
8156           rollback_to,
8157           ret,
8158           missing_oid,
8159           true,
8160           false,
8161           &promote_obc);
8162     }
8163     switch (tier_mode_result) {
8164     case cache_result_t::NOOP:
8165       break;
8166     case cache_result_t::BLOCKED_PROMOTE:
8167       ceph_assert(promote_obc);
8168       block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8169       return -EAGAIN;
8170     case cache_result_t::BLOCKED_FULL:
8171       block_write_on_full_cache(soid, ctx->op);
8172       return -EAGAIN;
8173     case cache_result_t::REPLIED_WITH_EAGAIN:
8174       ceph_abort_msg("this can't happen, no rollback on replica");
8175     default:
8176       ceph_abort_msg("must promote was set, other values are not valid");
8177       return -EAGAIN;
8178     }
8179   }
8180
8181   if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8182     // there's no snapshot here, or there's no object.
8183     // if there's no snapshot, we delete the object; otherwise, do nothing.
8184     dout(20) << "_rollback_to deleting head on " << soid.oid
8185              << " because got ENOENT|whiteout on find_object_context" << dendl;
8186     if (ctx->obc->obs.oi.watchers.size()) {
8187       // Cannot delete an object with watchers
8188       ret = -EBUSY;
8189     } else {
8190       _delete_oid(ctx, false, false);
8191       ret = 0;
8192     }
8193   } else if (ret) {
8194     // ummm....huh? It *can't* return anything else at time of writing.
8195     ceph_abort_msg("unexpected error code in _rollback_to");
8196   } else { //we got our context, let's use it to do the rollback!
8197     hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8198     if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8199         is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8200       dout(20) << "_rollback_to attempted to roll back to a degraded object "
8201                << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
8202       block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8203       ret = -EAGAIN;
8204     } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8205       // rolling back to the head; we just need to clone it.
8206       ctx->modify = true;
8207     } else {
8208       /* 1) Delete current head
8209        * 2) Clone correct snapshot into head
8210        * 3) Calculate clone_overlaps by following overlaps
8211        *    forward from rollback snapshot */
8212       dout(10) << "_rollback_to deleting " << soid.oid
8213                << " and rolling back to old snap" << dendl;
8214
8215       if (obs.exists) {
8216         t->remove(soid);
8217       }
8218       t->clone(soid, rollback_to_sobject);
8219       t->add_obc(rollback_to);
8220
8221       map<snapid_t, interval_set<uint64_t> >::iterator iter =
8222         snapset.clone_overlap.lower_bound(snapid);
8223       ceph_assert(iter != snapset.clone_overlap.end());
8224       interval_set<uint64_t> overlaps = iter->second;
8225       for ( ;
8226             iter != snapset.clone_overlap.end();
8227             ++iter)
8228         overlaps.intersection_of(iter->second);
8229
8230       if (obs.oi.size > 0) {
8231         interval_set<uint64_t> modified;
8232         modified.insert(0, obs.oi.size);
8233         overlaps.intersection_of(modified);
8234         modified.subtract(overlaps);
8235         ctx->modified_ranges.union_of(modified);
8236       }
8237
8238       // Adjust the cached objectcontext
8239       maybe_create_new_object(ctx, true);
8240       ctx->delta_stats.num_bytes -= obs.oi.size;
8241       ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8242       ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, rollback_to->obs.oi.size));
8243       ctx->clean_regions.mark_omap_dirty();
8244       obs.oi.size = rollback_to->obs.oi.size;
8245       if (rollback_to->obs.oi.is_data_digest())
8246         obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8247       else
8248         obs.oi.clear_data_digest();
8249       if (rollback_to->obs.oi.is_omap_digest())
8250         obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8251       else
8252         obs.oi.clear_omap_digest();
8253
8254       if (rollback_to->obs.oi.is_omap()) {
8255         dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8256         obs.oi.set_flag(object_info_t::FLAG_OMAP);
8257       } else {
8258         dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8259         obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8260       }
8261     }
8262   }
8263   return ret;
8264 }
8265
8266 void PrimaryLogPG::_make_clone(
8267   OpContext *ctx,
8268   PGTransaction* t,
8269   ObjectContextRef obc,
8270   const hobject_t& head, const hobject_t& coid,
8271   object_info_t *poi)
8272 {
8273   bufferlist bv;
8274   encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8275
8276   t->clone(coid, head);
8277   setattr_maybe_cache(obc, t, OI_ATTR, bv);
8278   rmattr_maybe_cache(obc, t, SS_ATTR);
8279 }
8280
8281 void PrimaryLogPG::make_writeable(OpContext *ctx)
8282 {
8283   const hobject_t& soid = ctx->obs->oi.soid;
8284   SnapContext& snapc = ctx->snapc;
8285
8286   // clone?
8287   ceph_assert(soid.snap == CEPH_NOSNAP);
8288   dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8289            << "  snapc=" << snapc << dendl;
8290
8291   bool was_dirty = ctx->obc->obs.oi.is_dirty();
8292   if (ctx->new_obs.exists) {
8293     // we will mark the object dirty
8294     if (ctx->undirty && was_dirty) {
8295       dout(20) << " clearing DIRTY flag" << dendl;
8296       ceph_assert(ctx->new_obs.oi.is_dirty());
8297       ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8298       --ctx->delta_stats.num_objects_dirty;
8299       osd->logger->inc(l_osd_tier_clean);
8300     } else if (!was_dirty && !ctx->undirty) {
8301       dout(20) << " setting DIRTY flag" << dendl;
8302       ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8303       ++ctx->delta_stats.num_objects_dirty;
8304       osd->logger->inc(l_osd_tier_dirty);
8305     }
8306   } else {
8307     if (was_dirty) {
8308       dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8309       ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8310       --ctx->delta_stats.num_objects_dirty;
8311     }
8312   }
8313
8314   if ((ctx->new_obs.exists &&
8315        ctx->new_obs.oi.is_omap()) &&
8316       (!ctx->obc->obs.exists ||
8317        !ctx->obc->obs.oi.is_omap())) {
8318     ++ctx->delta_stats.num_objects_omap;
8319   }
8320   if ((!ctx->new_obs.exists ||
8321        !ctx->new_obs.oi.is_omap()) &&
8322       (ctx->obc->obs.exists &&
8323        ctx->obc->obs.oi.is_omap())) {
8324     --ctx->delta_stats.num_objects_omap;
8325   }
8326
8327   if (ctx->new_snapset.seq > snapc.seq) {
8328     dout(10) << " op snapset is old" << dendl;
8329   }
8330
8331   if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8332       snapc.snaps.size() &&                 // there are snaps
8333       !ctx->cache_operation &&
8334       snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
8335     // clone
8336     hobject_t coid = soid;
8337     coid.snap = snapc.seq;
8338
8339     unsigned l;
8340     for (l = 1;
8341          l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8342          l++) ;
8343
8344     vector<snapid_t> snaps(l);
8345     for (unsigned i=0; i<l; i++)
8346       snaps[i] = snapc.snaps[i];
8347
8348     // prepare clone
8349     object_info_t static_snap_oi(coid);
8350     object_info_t *snap_oi;
8351     if (is_primary()) {
8352       ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8353       ctx->clone_obc->destructor_callback =
8354         new C_PG_ObjectContext(this, ctx->clone_obc.get());
8355       ctx->clone_obc->obs.oi = static_snap_oi;
8356       ctx->clone_obc->obs.exists = true;
8357       ctx->clone_obc->ssc = ctx->obc->ssc;
8358       ctx->clone_obc->ssc->ref++;
8359       if (pool.info.is_erasure())
8360         ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8361       snap_oi = &ctx->clone_obc->obs.oi;
8362       if (ctx->obc->obs.oi.has_manifest()) {
8363         if ((ctx->obc->obs.oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) &&
8364             ctx->obc->obs.oi.manifest.is_redirect()) {
8365           snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8366           snap_oi->manifest.type = object_manifest_t::TYPE_REDIRECT;
8367           snap_oi->manifest.redirect_target = ctx->obc->obs.oi.manifest.redirect_target;
8368         } else if (ctx->obc->obs.oi.manifest.is_chunked()) {
8369           snap_oi->set_flag(object_info_t::FLAG_MANIFEST);
8370           snap_oi->manifest.type = object_manifest_t::TYPE_CHUNKED;
8371           snap_oi->manifest.chunk_map = ctx->obc->obs.oi.manifest.chunk_map;
8372         } else {
8373           ceph_abort_msg("unrecognized manifest type");
8374         }
8375       }
8376       bool got = ctx->lock_manager.get_write_greedy(
8377         coid,
8378         ctx->clone_obc,
8379         ctx->op);
8380       ceph_assert(got);
8381       dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8382     } else {
8383       snap_oi = &static_snap_oi;
8384     }
8385     snap_oi->version = ctx->at_version;
8386     snap_oi->prior_version = ctx->obs->oi.version;
8387     snap_oi->copy_user_bits(ctx->obs->oi);
8388
8389     _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8390
8391     ctx->delta_stats.num_objects++;
8392     if (snap_oi->is_dirty()) {
8393       ctx->delta_stats.num_objects_dirty++;
8394       osd->logger->inc(l_osd_tier_dirty);
8395     }
8396     if (snap_oi->is_omap())
8397       ctx->delta_stats.num_objects_omap++;
8398     if (snap_oi->is_cache_pinned())
8399       ctx->delta_stats.num_objects_pinned++;
8400     if (snap_oi->has_manifest())
8401       ctx->delta_stats.num_objects_manifest++;
8402     ctx->delta_stats.num_object_clones++;
8403     ctx->new_snapset.clones.push_back(coid.snap);
8404     ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
8405     ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8406
8407     // clone_overlap should contain an entry for each clone
8408     // (an empty interval_set if there is no overlap)
8409     ctx->new_snapset.clone_overlap[coid.snap];
8410     if (ctx->obs->oi.size)
8411       ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8412
8413     // log clone
8414     dout(10) << " cloning v " << ctx->obs->oi.version
8415              << " to " << coid << " v " << ctx->at_version
8416              << " snaps=" << snaps
8417              << " snapset=" << ctx->new_snapset << dendl;
8418     ctx->log.push_back(pg_log_entry_t(
8419                          pg_log_entry_t::CLONE, coid, ctx->at_version,
8420                          ctx->obs->oi.version,
8421                          ctx->obs->oi.user_version,
8422                          osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8423     encode(snaps, ctx->log.back().snaps);
8424
8425     ctx->at_version.version++;
8426   }
8427
8428   // update most recent clone_overlap and usage stats
8429   if (ctx->new_snapset.clones.size() > 0) {
8430     // the clone_overlap is difference of range between head and clones.
8431     // we need to check whether the most recent clone exists, if it's
8432     // been evicted, it's not included in the stats, but the clone_overlap
8433     // is still exist in the snapset, so we should update the
8434     // clone_overlap to make it sense.
8435     hobject_t last_clone_oid = soid;
8436     last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8437     interval_set<uint64_t> &newest_overlap =
8438       ctx->new_snapset.clone_overlap.rbegin()->second;
8439     ctx->modified_ranges.intersection_of(newest_overlap);
8440     if (is_present_clone(last_clone_oid)) {
8441       // modified_ranges is still in use by the clone
8442       ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
8443     }
8444     newest_overlap.subtract(ctx->modified_ranges);
8445   }
8446
8447   if (snapc.seq > ctx->new_snapset.seq) {
8448     // update snapset with latest snap context
8449     ctx->new_snapset.seq = snapc.seq;
8450     if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
8451       ctx->new_snapset.snaps = snapc.snaps;
8452     } else {
8453       ctx->new_snapset.snaps.clear();
8454     }
8455   }
8456   dout(20) << "make_writeable " << soid
8457            << " done, snapset=" << ctx->new_snapset << dendl;
8458 }
8459
8460
8461 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8462                                                interval_set<uint64_t>& modified, uint64_t offset,
8463                                                uint64_t length, bool write_full)
8464 {
8465   interval_set<uint64_t> ch;
8466   if (write_full) {
8467     if (oi.size)
8468       ch.insert(0, oi.size);
8469   } else if (length)
8470     ch.insert(offset, length);
8471   modified.union_of(ch);
8472   if (write_full ||
8473       (offset + length > oi.size && length)) {
8474     uint64_t new_size = offset + length;
8475     delta_stats.num_bytes -= oi.size;
8476     delta_stats.num_bytes += new_size;
8477     oi.size = new_size;
8478   }
8479
8480   delta_stats.num_wr++;
8481   delta_stats.num_wr_kb += shift_round_up(length, 10);
8482 }
8483
8484 void PrimaryLogPG::truncate_update_size_and_usage(
8485   object_stat_sum_t& delta_stats,
8486   object_info_t& oi,
8487   uint64_t truncate_size)
8488 {
8489   if (oi.size != truncate_size) {
8490     delta_stats.num_bytes -= oi.size;
8491     delta_stats.num_bytes += truncate_size;
8492     oi.size = truncate_size;
8493   }
8494 }
8495
8496 void PrimaryLogPG::complete_disconnect_watches(
8497   ObjectContextRef obc,
8498   const list<watch_disconnect_t> &to_disconnect)
8499 {
8500   for (list<watch_disconnect_t>::const_iterator i =
8501          to_disconnect.begin();
8502        i != to_disconnect.end();
8503        ++i) {
8504     pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8505     auto watchers_entry = obc->watchers.find(watcher);
8506     if (watchers_entry != obc->watchers.end()) {
8507       WatchRef watch = watchers_entry->second;
8508       dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8509       obc->watchers.erase(watcher);
8510       watch->remove(i->send_disconnect);
8511     } else {
8512       dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8513                << watcher << dendl;
8514     }
8515   }
8516 }
8517
8518 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8519 {
8520   entity_name_t entity = ctx->reqid.name;
8521   dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8522
8523   // disconnects first
8524   complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8525
8526   ceph_assert(conn);
8527
8528   auto session = conn->get_priv();
8529   if (!session)
8530     return;
8531
8532   for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8533        i != ctx->watch_connects.end();
8534        ++i) {
8535     pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8536     dout(15) << "do_osd_op_effects applying watch connect on session "
8537              << session.get() << " watcher " << watcher << dendl;
8538     WatchRef watch;
8539     if (ctx->obc->watchers.count(watcher)) {
8540       dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8541                << dendl;
8542       watch = ctx->obc->watchers[watcher];
8543     } else {
8544       dout(15) << "do_osd_op_effects new watcher " << watcher
8545                << dendl;
8546       watch = Watch::makeWatchRef(
8547         this, osd, ctx->obc, i->first.timeout_seconds,
8548         i->first.cookie, entity, conn->get_peer_addr());
8549       ctx->obc->watchers.insert(
8550         make_pair(
8551           watcher,
8552           watch));
8553     }
8554     watch->connect(conn, i->second);
8555   }
8556
8557   for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8558        p != ctx->notifies.end();
8559        ++p) {
8560     dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8561     ConnectionRef conn(ctx->op->get_req()->get_connection());
8562     NotifyRef notif(
8563       Notify::makeNotifyRef(
8564         conn,
8565         ctx->reqid.name.num(),
8566         p->bl,
8567         p->timeout,
8568         p->cookie,
8569         p->notify_id,
8570         ctx->obc->obs.oi.user_version,
8571         osd));
8572     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8573            ctx->obc->watchers.begin();
8574          i != ctx->obc->watchers.end();
8575          ++i) {
8576       dout(10) << "starting notify on watch " << i->first << dendl;
8577       i->second->start_notify(notif);
8578     }
8579     notif->init();
8580   }
8581
8582   for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8583        p != ctx->notify_acks.end();
8584        ++p) {
8585     if (p->watch_cookie)
8586       dout(10) << "notify_ack " << make_pair(*(p->watch_cookie), p->notify_id) << dendl;
8587     else
8588       dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8589     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8590            ctx->obc->watchers.begin();
8591          i != ctx->obc->watchers.end();
8592          ++i) {
8593       if (i->first.second != entity) continue;
8594       if (p->watch_cookie &&
8595           *(p->watch_cookie) != i->first.first) continue;
8596       dout(10) << "acking notify on watch " << i->first << dendl;
8597       i->second->notify_ack(p->notify_id, p->reply_bl);
8598     }
8599   }
8600 }
8601
8602 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8603 {
8604   ostringstream ss;
8605   ss << "temp_" << info.pgid << "_" << get_role()
8606      << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8607   hobject_t hoid = target.make_temp_hobject(ss.str());
8608   dout(20) << __func__ << " " << hoid << dendl;
8609   return hoid;
8610 }
8611
8612 hobject_t PrimaryLogPG::get_temp_recovery_object(
8613   const hobject_t& target,
8614   eversion_t version)
8615 {
8616   ostringstream ss;
8617   ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
8618      << "_" << version
8619      << "_" << info.history.same_interval_since
8620      << "_" << target.snap;
8621   // pgid + version + interval + snapid is unique, and short
8622   hobject_t hoid = target.make_temp_hobject(ss.str());
8623   dout(20) << __func__ << " " << hoid << dendl;
8624   return hoid;
8625 }
8626
8627 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8628 {
8629   ceph_assert(!ctx->ops->empty());
8630
8631   // valid snap context?
8632   if (!ctx->snapc.is_valid()) {
8633     dout(10) << " invalid snapc " << ctx->snapc << dendl;
8634     return -EINVAL;
8635   }
8636
8637   // prepare the actual mutation
8638   int result = do_osd_ops(ctx, *ctx->ops);
8639   if (result < 0) {
8640     if (ctx->op->may_write() &&
8641         get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8642       // need to save the error code in the pg log, to detect dup ops,
8643       // but do nothing else
8644       ctx->update_log_only = true;
8645     }
8646     return result;
8647   }
8648
8649   // read-op?  write-op noop? done?
8650   if (ctx->op_t->empty() && !ctx->modify) {
8651     if (ctx->pending_async_reads.empty())
8652       unstable_stats.add(ctx->delta_stats);
8653     if (ctx->op->may_write() &&
8654         get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
8655       ctx->update_log_only = true;
8656     }
8657     return result;
8658   }
8659
8660   // check for full
8661   if ((ctx->delta_stats.num_bytes > 0 ||
8662        ctx->delta_stats.num_objects > 0) &&  // FIXME: keys?
8663       pool.info.has_flag(pg_pool_t::FLAG_FULL)) {
8664     auto m = ctx->op->get_req<MOSDOp>();
8665     if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
8666         m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8667       dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8668                << dendl;
8669     } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8670       // they tried, they failed.
8671       dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
8672       return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8673     } else {
8674       // drop request
8675       dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8676       return -EAGAIN;
8677     }
8678   }
8679
8680   const hobject_t& soid = ctx->obs->oi.soid;
8681   // clone, if necessary
8682   if (soid.snap == CEPH_NOSNAP)
8683     make_writeable(ctx);
8684
8685   finish_ctx(ctx,
8686              ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8687              pg_log_entry_t::DELETE,
8688              result);
8689
8690   return result;
8691 }
8692
8693 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, int result)
8694 {
8695   const hobject_t& soid = ctx->obs->oi.soid;
8696   dout(20) << __func__ << " " << soid << " " << ctx
8697            << " op " << pg_log_entry_t::get_op_name(log_op_type)
8698            << dendl;
8699   utime_t now = ceph_clock_now();
8700
8701 #ifdef HAVE_JAEGER
8702   if (ctx->op->osd_parent_span) {
8703     auto finish_ctx_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
8704   }
8705 #endif
8706   // Drop the reference if deduped chunk is modified
8707   if (ctx->new_obs.oi.is_dirty() &&
8708     (ctx->obs->oi.has_manifest() && ctx->obs->oi.manifest.is_chunked()) &&
8709     // If a clone is creating, ignore dropping the reference for manifest object
8710     !ctx->delta_stats.num_object_clones &&
8711     ctx->new_obs.oi.size != 0 && // missing, redirect and delete
8712     !ctx->cache_operation &&
8713     log_op_type != pg_log_entry_t::PROMOTE) {
8714     dec_refcount_by_dirty(ctx);
8715   }
8716
8717   // finish and log the op.
8718   if (ctx->user_modify) {
8719     // update the user_version for any modify ops, except for the watch op
8720     ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8721     /* In order for new clients and old clients to interoperate properly
8722      * when exchanging versions, we need to lower bound the user_version
8723      * (which our new clients pay proper attention to)
8724      * by the at_version (which is all the old clients can ever see). */
8725     if (ctx->at_version.version > ctx->user_at_version)
8726       ctx->user_at_version = ctx->at_version.version;
8727     ctx->new_obs.oi.user_version = ctx->user_at_version;
8728   }
8729   ctx->bytes_written = ctx->op_t->get_bytes_written();
8730
8731   if (ctx->new_obs.exists) {
8732     ctx->new_obs.oi.version = ctx->at_version;
8733     ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8734     ctx->new_obs.oi.last_reqid = ctx->reqid;
8735     if (ctx->mtime != utime_t()) {
8736       ctx->new_obs.oi.mtime = ctx->mtime;
8737       dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8738       ctx->new_obs.oi.local_mtime = now;
8739     } else {
8740       dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8741     }
8742
8743     // object_info_t
8744     map <string, bufferlist> attrs;
8745     bufferlist bv(sizeof(ctx->new_obs.oi));
8746     encode(ctx->new_obs.oi, bv,
8747              get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8748     attrs[OI_ATTR] = std::move(bv);
8749
8750     // snapset
8751     if (soid.snap == CEPH_NOSNAP) {
8752       dout(10) << " final snapset " << ctx->new_snapset
8753                << " in " << soid << dendl;
8754       bufferlist bss;
8755       encode(ctx->new_snapset, bss);
8756       attrs[SS_ATTR] = std::move(bss);
8757     } else {
8758       dout(10) << " no snapset (this is a clone)" << dendl;
8759     }
8760     ctx->op_t->setattrs(soid, attrs);
8761   } else {
8762     // reset cached oi
8763     ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8764   }
8765
8766   // append to log
8767   ctx->log.push_back(
8768     pg_log_entry_t(log_op_type, soid, ctx->at_version,
8769                    ctx->obs->oi.version,
8770                    ctx->user_at_version, ctx->reqid,
8771                    ctx->mtime,
8772                    (ctx->op && ctx->op->allows_returnvec()) ? result : 0));
8773   if (ctx->op && ctx->op->allows_returnvec()) {
8774     // also the per-op values
8775     ctx->log.back().set_op_returns(*ctx->ops);
8776     dout(20) << __func__ << " op_returns " << ctx->log.back().op_returns
8777              << dendl;
8778   }
8779
8780   ctx->log.back().clean_regions = ctx->clean_regions;
8781   dout(20) << __func__ << " object " << soid <<  " marks clean_regions " << ctx->log.back().clean_regions << dendl;
8782
8783   if (soid.snap < CEPH_NOSNAP) {
8784     switch (log_op_type) {
8785     case pg_log_entry_t::MODIFY:
8786     case pg_log_entry_t::PROMOTE:
8787     case pg_log_entry_t::CLEAN:
8788       dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8789                << dendl;
8790       encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
8791       break;
8792     default:
8793       break;
8794     }
8795   }
8796
8797   if (!ctx->extra_reqids.empty()) {
8798     dout(20) << __func__ << "  extra_reqids " << ctx->extra_reqids << " "
8799              << ctx->extra_reqid_return_codes << dendl;
8800     ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
8801     ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
8802   }
8803
8804   // apply new object state.
8805   ctx->obc->obs = ctx->new_obs;
8806
8807   if (soid.is_head() && !ctx->obc->obs.exists) {
8808     ctx->obc->ssc->exists = false;
8809     ctx->obc->ssc->snapset = SnapSet();
8810   } else {
8811     ctx->obc->ssc->exists = true;
8812     ctx->obc->ssc->snapset = ctx->new_snapset;
8813   }
8814 }
8815
8816 void PrimaryLogPG::apply_stats(
8817   const hobject_t &soid,
8818   const object_stat_sum_t &delta_stats) {
8819
8820   recovery_state.apply_op_stats(soid, delta_stats);
8821   for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
8822        i != get_backfill_targets().end();
8823        ++i) {
8824     pg_shard_t bt = *i;
8825     const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
8826     if (soid > pinfo.last_backfill && soid <= last_backfill_started) {
8827       pending_backfill_updates[soid].stats.add(delta_stats);
8828     }
8829   }
8830
8831   m_scrubber->stats_of_handled_objects(delta_stats, soid);
8832 }
8833
8834 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
8835 {
8836   auto m = ctx->op->get_req<MOSDOp>();
8837   ceph_assert(ctx->async_reads_complete());
8838
8839   for (auto p = ctx->ops->begin();
8840     p != ctx->ops->end() && result >= 0; ++p) {
8841     if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
8842       result = p->rval;
8843       break;
8844     }
8845     ctx->bytes_read += p->outdata.length();
8846   }
8847   ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
8848
8849   MOSDOpReply *reply = ctx->reply;
8850   ctx->reply = nullptr;
8851
8852   if (result >= 0) {
8853     if (!ctx->ignore_log_op_stats) {
8854       log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
8855
8856       publish_stats_to_osd();
8857     }
8858
8859     // on read, return the current object version
8860     if (ctx->obs) {
8861       reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
8862     } else {
8863       reply->set_reply_versions(eversion_t(), ctx->user_at_version);
8864     }
8865   } else if (result == -ENOENT) {
8866     // on ENOENT, set a floor for what the next user version will be.
8867     reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
8868   }
8869
8870   reply->set_result(result);
8871   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8872   osd->send_message_osd_client(reply, m->get_connection());
8873   close_op_ctx(ctx);
8874 }
8875
8876 // ========================================================================
8877 // copyfrom
8878
8879 struct C_Copyfrom : public Context {
8880   PrimaryLogPGRef pg;
8881   hobject_t oid;
8882   epoch_t last_peering_reset;
8883   ceph_tid_t tid;
8884   PrimaryLogPG::CopyOpRef cop;  // used for keeping the cop alive
8885   C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8886              const PrimaryLogPG::CopyOpRef& c)
8887     : pg(p), oid(o), last_peering_reset(lpr),
8888       tid(0), cop(c)
8889   {}
8890   void finish(int r) override {
8891     if (r == -ECANCELED)
8892       return;
8893     std::scoped_lock l{*pg};
8894     if (last_peering_reset == pg->get_last_peering_reset()) {
8895       pg->process_copy_chunk(oid, tid, r);
8896       cop.reset();
8897     }
8898   }
8899 };
8900
8901 struct C_CopyFrom_AsyncReadCb : public Context {
8902   OSDOp *osd_op;
8903   object_copy_data_t reply_obj;
8904   uint64_t features;
8905   size_t len;
8906   C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
8907     osd_op(osd_op), features(features), len(0) {}
8908   void finish(int r) override {
8909     osd_op->rval = r;
8910     if (r < 0) {
8911       return;
8912     }
8913
8914     ceph_assert(len > 0);
8915     ceph_assert(len <= reply_obj.data.length());
8916     bufferlist bl;
8917     bl.substr_of(reply_obj.data, 0, len);
8918     reply_obj.data.swap(bl);
8919     encode(reply_obj, osd_op->outdata, features);
8920   }
8921 };
8922
8923 struct C_CopyChunk : public Context {
8924   PrimaryLogPGRef pg;
8925   hobject_t oid;
8926   epoch_t last_peering_reset;
8927   ceph_tid_t tid;
8928   PrimaryLogPG::CopyOpRef cop;  // used for keeping the cop alive
8929   uint64_t offset = 0;
8930   C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8931              const PrimaryLogPG::CopyOpRef& c)
8932     : pg(p), oid(o), last_peering_reset(lpr),
8933       tid(0), cop(c)
8934   {}
8935   void finish(int r) override {
8936     if (r == -ECANCELED)
8937       return;
8938     std::scoped_lock l{*pg};
8939     if (last_peering_reset == pg->get_last_peering_reset()) {
8940       pg->process_copy_chunk_manifest(oid, tid, r, offset);
8941       cop.reset();
8942     }
8943   }
8944 };
8945
8946 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
8947                               OSDOp& osd_op, ObjectContextRef &obc)
8948 {
8949   object_info_t& oi = obc->obs.oi;
8950   hobject_t& soid = oi.soid;
8951   int result = 0;
8952   object_copy_cursor_t cursor;
8953   uint64_t out_max;
8954   try {
8955     decode(cursor, bp);
8956     decode(out_max, bp);
8957   }
8958   catch (ceph::buffer::error& e) {
8959     result = -EINVAL;
8960     return result;
8961   }
8962
8963   const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
8964   uint64_t features = op->get_features();
8965
8966   bool async_read_started = false;
8967   object_copy_data_t _reply_obj;
8968   C_CopyFrom_AsyncReadCb *cb = nullptr;
8969   if (pool.info.is_erasure()) {
8970     cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
8971   }
8972   object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
8973   // size, mtime
8974   reply_obj.size = oi.size;
8975   reply_obj.mtime = oi.mtime;
8976   ceph_assert(obc->ssc);
8977   if (soid.snap < CEPH_NOSNAP) {
8978     auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
8979     ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
8980     reply_obj.snaps = p->second;
8981   } else {
8982     reply_obj.snap_seq = obc->ssc->snapset.seq;
8983   }
8984   if (oi.is_data_digest()) {
8985     reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
8986     reply_obj.data_digest = oi.data_digest;
8987   }
8988   if (oi.is_omap_digest()) {
8989     reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
8990     reply_obj.omap_digest = oi.omap_digest;
8991   }
8992   reply_obj.truncate_seq = oi.truncate_seq;
8993   reply_obj.truncate_size = oi.truncate_size;
8994
8995   // attrs
8996   map<string,bufferlist>& out_attrs = reply_obj.attrs;
8997   if (!cursor.attr_complete) {
8998     result = getattrs_maybe_cache(
8999       ctx->obc,
9000       &out_attrs);
9001     if (result < 0) {
9002       if (cb) {
9003         delete cb;
9004       }
9005       return result;
9006     }
9007     cursor.attr_complete = true;
9008     dout(20) << " got attrs" << dendl;
9009   }
9010
9011   int64_t left = out_max - osd_op.outdata.length();
9012
9013   // data
9014   bufferlist& bl = reply_obj.data;
9015   if (left > 0 && !cursor.data_complete) {
9016     if (cursor.data_offset < oi.size) {
9017       uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
9018       if (cb) {
9019         async_read_started = true;
9020         ctx->pending_async_reads.push_back(
9021           make_pair(
9022             boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
9023             make_pair(&bl, cb)));
9024         cb->len = max_read;
9025
9026         ctx->op_finishers[ctx->current_osd_subop_num].reset(
9027           new ReadFinisher(osd_op));
9028         result = -EINPROGRESS;
9029
9030         dout(10) << __func__ << ": async_read noted for " << soid << dendl;
9031       } else {
9032         result = pgbackend->objects_read_sync(
9033           oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
9034         if (result < 0)
9035           return result;
9036       }
9037       left -= max_read;
9038       cursor.data_offset += max_read;
9039     }
9040     if (cursor.data_offset == oi.size) {
9041       cursor.data_complete = true;
9042       dout(20) << " got data" << dendl;
9043     }
9044     ceph_assert(cursor.data_offset <= oi.size);
9045   }
9046
9047   // omap
9048   uint32_t omap_keys = 0;
9049   if (!pool.info.supports_omap() || !oi.is_omap()) {
9050     cursor.omap_complete = true;
9051   } else {
9052     if (left > 0 && !cursor.omap_complete) {
9053       ceph_assert(cursor.data_complete);
9054       if (cursor.omap_offset.empty()) {
9055         osd->store->omap_get_header(ch, ghobject_t(oi.soid),
9056                                     &reply_obj.omap_header);
9057       }
9058       bufferlist omap_data;
9059       ObjectMap::ObjectMapIterator iter =
9060         osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
9061       ceph_assert(iter);
9062       iter->upper_bound(cursor.omap_offset);
9063       for (; iter->valid(); iter->next()) {
9064         ++omap_keys;
9065         encode(iter->key(), omap_data);
9066         encode(iter->value(), omap_data);
9067         left -= iter->key().length() + 4 + iter->value().length() + 4;
9068         if (left <= 0)
9069           break;
9070       }
9071       if (omap_keys) {
9072         encode(omap_keys, reply_obj.omap_data);
9073         reply_obj.omap_data.claim_append(omap_data);
9074       }
9075       if (iter->valid()) {
9076         cursor.omap_offset = iter->key();
9077       } else {
9078         cursor.omap_complete = true;
9079         dout(20) << " got omap" << dendl;
9080       }
9081     }
9082   }
9083
9084   if (cursor.is_complete()) {
9085     // include reqids only in the final step.  this is a bit fragile
9086     // but it works...
9087     recovery_state.get_pg_log().get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
9088                                        &reply_obj.reqids,
9089                                        &reply_obj.reqid_return_codes);
9090     dout(20) << " got reqids" << dendl;
9091   }
9092
9093   dout(20) << " cursor.is_complete=" << cursor.is_complete()
9094            << " " << out_attrs.size() << " attrs"
9095            << " " << bl.length() << " bytes"
9096            << " " << reply_obj.omap_header.length() << " omap header bytes"
9097            << " " << reply_obj.omap_data.length() << " omap data bytes in "
9098            << omap_keys << " keys"
9099            << " " << reply_obj.reqids.size() << " reqids"
9100            << dendl;
9101   reply_obj.cursor = cursor;
9102   if (!async_read_started) {
9103     encode(reply_obj, osd_op.outdata, features);
9104   }
9105   if (cb && !async_read_started) {
9106     delete cb;
9107   }
9108
9109   if (result > 0) {
9110     result = 0;
9111   }
9112   return result;
9113 }
9114
9115 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
9116                                           OSDOp& osd_op)
9117 {
9118   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9119   uint64_t features = m->get_features();
9120   object_copy_data_t reply_obj;
9121
9122   recovery_state.get_pg_log().get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
9123                                      &reply_obj.reqid_return_codes);
9124   dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
9125   encode(reply_obj, osd_op.outdata, features);
9126   osd_op.rval = -ENOENT;
9127   MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
9128   reply->set_result(-ENOENT);
9129   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
9130   osd->send_message_osd_client(reply, m->get_connection());
9131 }
9132
9133 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
9134                               hobject_t src, object_locator_t oloc,
9135                               version_t version, unsigned flags,
9136                               bool mirror_snapset,
9137                               unsigned src_obj_fadvise_flags,
9138                               unsigned dest_obj_fadvise_flags)
9139 {
9140   const hobject_t& dest = obc->obs.oi.soid;
9141   dout(10) << __func__ << " " << dest
9142            << " from " << src << " " << oloc << " v" << version
9143            << " flags " << flags
9144            << (mirror_snapset ? " mirror_snapset" : "")
9145            << dendl;
9146
9147   ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
9148
9149   // cancel a previous in-progress copy?
9150   if (copy_ops.count(dest)) {
9151     // FIXME: if the src etc match, we could avoid restarting from the
9152     // beginning.
9153     CopyOpRef cop = copy_ops[dest];
9154     vector<ceph_tid_t> tids;
9155     cancel_copy(cop, false, &tids);
9156     osd->objecter->op_cancel(tids, -ECANCELED);
9157   }
9158
9159   CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
9160                            mirror_snapset, src_obj_fadvise_flags,
9161                            dest_obj_fadvise_flags));
9162   copy_ops[dest] = cop;
9163   obc->start_block();
9164
9165   if (!obc->obs.oi.has_manifest()) {
9166     _copy_some(obc, cop);
9167   } else {
9168     if (obc->obs.oi.manifest.is_redirect()) {
9169       _copy_some(obc, cop);
9170     } else if (obc->obs.oi.manifest.is_chunked()) {
9171       auto p = obc->obs.oi.manifest.chunk_map.begin();
9172       _copy_some_manifest(obc, cop, p->first);
9173     } else {
9174       ceph_abort_msg("unrecognized manifest type");
9175     }
9176   }
9177 }
9178
9179 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
9180 {
9181   dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9182
9183   unsigned flags = 0;
9184   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9185     flags |= CEPH_OSD_FLAG_FLUSH;
9186   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9187     flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9188   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9189     flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9190   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9191     flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9192   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9193     flags |= CEPH_OSD_FLAG_RWORDERED;
9194
9195   C_GatherBuilder gather(cct);
9196
9197   if (cop->cursor.is_initial() && cop->mirror_snapset) {
9198     // list snaps too.
9199     ceph_assert(cop->src.snap == CEPH_NOSNAP);
9200     ObjectOperation op;
9201     op.list_snaps(&cop->results.snapset, NULL);
9202     ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9203                                     CEPH_SNAPDIR, NULL,
9204                                     flags, gather.new_sub(), NULL);
9205     cop->objecter_tid2 = tid;
9206   }
9207
9208   ObjectOperation op;
9209   if (cop->results.user_version) {
9210     op.assert_version(cop->results.user_version);
9211   } else {
9212     // we should learn the version after the first chunk, if we didn't know
9213     // it already!
9214     ceph_assert(cop->cursor.is_initial());
9215   }
9216   op.copy_get(&cop->cursor, get_copy_chunk_size(),
9217               &cop->results.object_size, &cop->results.mtime,
9218               &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9219               &cop->results.snaps, &cop->results.snap_seq,
9220               &cop->results.flags,
9221               &cop->results.source_data_digest,
9222               &cop->results.source_omap_digest,
9223               &cop->results.reqids,
9224               &cop->results.reqid_return_codes,
9225               &cop->results.truncate_seq,
9226               &cop->results.truncate_size,
9227               &cop->rval);
9228   op.set_last_op_flags(cop->src_obj_fadvise_flags);
9229
9230   C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9231                                    get_last_peering_reset(), cop);
9232   gather.set_finisher(new C_OnFinisher(fin,
9233                                        osd->get_objecter_finisher(get_pg_shard())));
9234
9235   ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9236                                   cop->src.snap, NULL,
9237                                   flags,
9238                                   gather.new_sub(),
9239                                   // discover the object version if we don't know it yet
9240                                   cop->results.user_version ? NULL : &cop->results.user_version);
9241   fin->tid = tid;
9242   cop->objecter_tid = tid;
9243   gather.activate();
9244 }
9245
9246 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9247 {
9248   dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9249
9250   unsigned flags = 0;
9251   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9252     flags |= CEPH_OSD_FLAG_FLUSH;
9253   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9254     flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9255   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9256     flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9257   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9258     flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9259   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9260     flags |= CEPH_OSD_FLAG_RWORDERED;
9261
9262   int num_chunks = 0;
9263   uint64_t last_offset = 0, chunks_size = 0;
9264   object_manifest_t *manifest = &obc->obs.oi.manifest;
9265   map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9266   for (;iter != manifest->chunk_map.end(); ++iter) {
9267     num_chunks++;
9268     chunks_size += iter->second.length;
9269     last_offset = iter->first;
9270     if (get_copy_chunk_size() < chunks_size) {
9271       break;
9272     }
9273   }
9274
9275   cop->num_chunk = num_chunks;
9276   cop->start_offset = start_offset;
9277   cop->last_offset = last_offset;
9278   dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9279           << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9280           << " last_offset: " << last_offset << dendl;
9281
9282   iter = manifest->chunk_map.find(start_offset);
9283   for (;iter != manifest->chunk_map.end(); ++iter) {
9284     uint64_t obj_offset = iter->first;
9285     uint64_t length = manifest->chunk_map[iter->first].length;
9286     hobject_t soid = manifest->chunk_map[iter->first].oid;
9287     object_locator_t oloc(soid);
9288     CopyCallback * cb = NULL;
9289     CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9290                        cop->results.user_version, cop->flags, cop->mirror_snapset,
9291                        cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9292     sub_cop->cursor.data_offset = obj_offset;
9293     cop->chunk_cops[obj_offset] = sub_cop;
9294
9295     int s = sub_cop->chunk_ops.size();
9296     sub_cop->chunk_ops.resize(s+1);
9297     sub_cop->chunk_ops[s].op.op =  CEPH_OSD_OP_READ;
9298     sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9299     sub_cop->chunk_ops[s].op.extent.length = length;
9300
9301     ObjectOperation op;
9302     op.dup(sub_cop->chunk_ops);
9303
9304     if (cop->results.user_version) {
9305       op.assert_version(cop->results.user_version);
9306     } else {
9307       // we should learn the version after the first chunk, if we didn't know
9308       // it already!
9309       ceph_assert(cop->cursor.is_initial());
9310     }
9311     op.set_last_op_flags(cop->src_obj_fadvise_flags);
9312
9313     C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9314                                      get_last_peering_reset(), cop);
9315     fin->offset = obj_offset;
9316
9317     ceph_tid_t tid = osd->objecter->read(
9318       soid.oid, oloc, op,
9319       sub_cop->src.snap, NULL,
9320       flags,
9321       new C_OnFinisher(fin, osd->get_objecter_finisher(get_pg_shard())),
9322       // discover the object version if we don't know it yet
9323       sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
9324     fin->tid = tid;
9325     sub_cop->objecter_tid = tid;
9326
9327     dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9328             << manifest->chunk_map[iter->first].offset
9329             << " length: " << length << " pool id: " << oloc.pool
9330             << " tid: " << tid << dendl;
9331
9332     if (last_offset < iter->first) {
9333       break;
9334     }
9335   }
9336 }
9337
9338 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9339 {
9340   dout(10) << __func__ << " " << oid << " tid " << tid
9341            << " " << cpp_strerror(r) << dendl;
9342   map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9343   if (p == copy_ops.end()) {
9344     dout(10) << __func__ << " no copy_op found" << dendl;
9345     return;
9346   }
9347   CopyOpRef cop = p->second;
9348   if (tid != cop->objecter_tid) {
9349     dout(10) << __func__ << " tid " << tid << " != cop " << cop
9350              << " tid " << cop->objecter_tid << dendl;
9351     return;
9352   }
9353
9354   if (cop->omap_data.length() || cop->omap_header.length())
9355     cop->results.has_omap = true;
9356
9357   if (r >= 0 && !pool.info.supports_omap() &&
9358       (cop->omap_data.length() || cop->omap_header.length())) {
9359     r = -EOPNOTSUPP;
9360   }
9361   cop->objecter_tid = 0;
9362   cop->objecter_tid2 = 0;  // assume this ordered before us (if it happened)
9363   ObjectContextRef& cobc = cop->obc;
9364
9365   if (r < 0)
9366     goto out;
9367
9368   ceph_assert(cop->rval >= 0);
9369
9370   if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9371     // verify snap hasn't been deleted
9372     vector<snapid_t>::iterator p = cop->results.snaps.begin();
9373     while (p != cop->results.snaps.end()) {
9374       // make best effort to sanitize snaps/clones.
9375       if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), *p)) {
9376         dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9377                  << dendl;
9378         for (vector<snapid_t>::iterator q = p + 1;
9379              q != cop->results.snaps.end();
9380              ++q)
9381           *(q - 1) = *q;
9382         cop->results.snaps.resize(cop->results.snaps.size() - 1);
9383       } else {
9384         ++p;
9385       }
9386     }
9387     if (cop->results.snaps.empty()) {
9388       dout(10) << __func__ << " no more snaps for " << oid << dendl;
9389       r = -ENOENT;
9390       goto out;
9391     }
9392   }
9393
9394   ceph_assert(cop->rval >= 0);
9395
9396   if (!cop->temp_cursor.data_complete) {
9397     cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9398   }
9399   if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9400     if (cop->omap_header.length()) {
9401       cop->results.omap_digest =
9402         cop->omap_header.crc32c(cop->results.omap_digest);
9403     }
9404     if (cop->omap_data.length()) {
9405       bufferlist keys;
9406       keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9407       cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9408     }
9409   }
9410
9411   if (!cop->temp_cursor.attr_complete) {
9412     for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9413          p != cop->attrs.end();
9414          ++p) {
9415       cop->results.attrs[string("_") + p->first] = p->second;
9416     }
9417     cop->attrs.clear();
9418   }
9419
9420   if (!cop->cursor.is_complete()) {
9421     // write out what we have so far
9422     if (cop->temp_cursor.is_initial()) {
9423       ceph_assert(!cop->results.started_temp_obj);
9424       cop->results.started_temp_obj = true;
9425       cop->results.temp_oid = generate_temp_object(oid);
9426       dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9427     }
9428     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9429     OpContextUPtr ctx = simple_opc_create(tempobc);
9430     if (cop->temp_cursor.is_initial()) {
9431       ctx->new_temp_oid = cop->results.temp_oid;
9432     }
9433     _write_copy_chunk(cop, ctx->op_t.get());
9434     simple_opc_submit(std::move(ctx));
9435     dout(10) << __func__ << " fetching more" << dendl;
9436     _copy_some(cobc, cop);
9437     return;
9438   }
9439
9440   // verify digests?
9441   if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9442     dout(20) << __func__ << std::hex
9443       << " got digest: rx data 0x" << cop->results.data_digest
9444       << " omap 0x" << cop->results.omap_digest
9445       << ", source: data 0x" << cop->results.source_data_digest
9446       << " omap 0x" <<  cop->results.source_omap_digest
9447       << std::dec
9448       << " flags " << cop->results.flags
9449       << dendl;
9450   }
9451   if (cop->results.is_data_digest() &&
9452       cop->results.data_digest != cop->results.source_data_digest) {
9453     derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9454          << " != source 0x" << cop->results.source_data_digest << std::dec
9455          << dendl;
9456     osd->clog->error() << info.pgid << " copy from " << cop->src
9457                        << " to " << cop->obc->obs.oi.soid << std::hex
9458                        << " data digest 0x" << cop->results.data_digest
9459                        << " != source 0x" << cop->results.source_data_digest
9460                        << std::dec;
9461     r = -EIO;
9462     goto out;
9463   }
9464   if (cop->results.is_omap_digest() &&
9465       cop->results.omap_digest != cop->results.source_omap_digest) {
9466     derr << __func__ << std::hex
9467          << " omap digest 0x" << cop->results.omap_digest
9468          << " != source 0x" << cop->results.source_omap_digest
9469          << std::dec << dendl;
9470     osd->clog->error() << info.pgid << " copy from " << cop->src
9471                        << " to " << cop->obc->obs.oi.soid << std::hex
9472                        << " omap digest 0x" << cop->results.omap_digest
9473                        << " != source 0x" << cop->results.source_omap_digest
9474                        << std::dec;
9475     r = -EIO;
9476     goto out;
9477   }
9478   if (cct->_conf->osd_debug_inject_copyfrom_error) {
9479     derr << __func__ << " injecting copyfrom failure" << dendl;
9480     r = -EIO;
9481     goto out;
9482   }
9483
9484   cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9485     [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9486       ObjectState& obs = cop->obc->obs;
9487       if (cop->temp_cursor.is_initial()) {
9488         dout(20) << "fill_in_final_tx: writing "
9489                  << "directly to final object" << dendl;
9490         // write directly to final object
9491         cop->results.temp_oid = obs.oi.soid;
9492         _write_copy_chunk(cop, t);
9493       } else {
9494         // finish writing to temp object, then move into place
9495         dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9496         if (obs.oi.has_manifest() && obs.oi.manifest.is_redirect() && obs.exists) {
9497           /* In redirect manifest case, the object exists in the upper tier.
9498            * So, to avoid a conflict when rename() is called, remove existing
9499            * object first
9500            */
9501           t->remove(obs.oi.soid);
9502         }
9503         _write_copy_chunk(cop, t);
9504         t->rename(obs.oi.soid, cop->results.temp_oid);
9505       }
9506       t->setattrs(obs.oi.soid, cop->results.attrs);
9507     });
9508
9509   dout(20) << __func__ << " success; committing" << dendl;
9510
9511  out:
9512   dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9513   CopyCallbackResults results(r, &cop->results);
9514   cop->cb->complete(results);
9515
9516   copy_ops.erase(cobc->obs.oi.soid);
9517   cobc->stop_block();
9518
9519   if (r < 0 && cop->results.started_temp_obj) {
9520     dout(10) << __func__ << " deleting partial temp object "
9521              << cop->results.temp_oid << dendl;
9522     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9523     OpContextUPtr ctx = simple_opc_create(tempobc);
9524     ctx->op_t->remove(cop->results.temp_oid);
9525     ctx->discard_temp_oid = cop->results.temp_oid;
9526     simple_opc_submit(std::move(ctx));
9527   }
9528
9529   // cancel and requeue proxy ops on this object
9530   if (!r) {
9531     cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9532   }
9533
9534   kick_object_context_blocked(cobc);
9535 }
9536
9537 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9538 {
9539   dout(10) << __func__ << " " << oid << " tid " << tid
9540            << " " << cpp_strerror(r) << dendl;
9541   map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9542   if (p == copy_ops.end()) {
9543     dout(10) << __func__ << " no copy_op found" << dendl;
9544     return;
9545   }
9546   CopyOpRef obj_cop = p->second;
9547   CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9548
9549   if (tid != chunk_cop->objecter_tid) {
9550     dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9551              << " tid " << chunk_cop->objecter_tid << dendl;
9552     return;
9553   }
9554
9555   if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9556     r = -EOPNOTSUPP;
9557   }
9558
9559   chunk_cop->objecter_tid = 0;
9560   chunk_cop->objecter_tid2 = 0;  // assume this ordered before us (if it happened)
9561   ObjectContextRef& cobc = obj_cop->obc;
9562   OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9563
9564   if (r < 0) {
9565     obj_cop->failed = true;
9566     goto out;
9567   }
9568
9569   if (obj_cop->failed) {
9570     return;
9571   }
9572   if (!chunk_data.outdata.length()) {
9573     r = -EIO;
9574     obj_cop->failed = true;
9575     goto out;
9576   }
9577
9578   obj_cop->num_chunk--;
9579
9580   /* check all of the copyop are completed */
9581   if (obj_cop->num_chunk) {
9582     dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9583     return;
9584   }
9585
9586   {
9587     OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9588     if (!ctx->lock_manager.take_write_lock(
9589           obj_cop->obc->obs.oi.soid,
9590           obj_cop->obc)) {
9591       // recovery op can take read lock.
9592       // so need to wait for recovery completion
9593       r = -EAGAIN;
9594       obj_cop->failed = true;
9595       close_op_ctx(ctx.release());
9596       goto out;
9597     }
9598     dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9599
9600     PGTransaction *t = ctx->op_t.get();
9601     ObjectState& obs = ctx->new_obs;
9602     for (auto p : obj_cop->chunk_cops) {
9603       OSDOp &sub_chunk = p.second->chunk_ops[0];
9604       t->write(cobc->obs.oi.soid,
9605               p.second->cursor.data_offset,
9606               sub_chunk.outdata.length(),
9607               sub_chunk.outdata,
9608               p.second->dest_obj_fadvise_flags);
9609       dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9610               << " length: " << sub_chunk.outdata.length() << dendl;
9611       write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9612                                   p.second->cursor.data_offset, sub_chunk.outdata.length());
9613       obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9614       ctx->clean_regions.mark_data_region_dirty(p.second->cursor.data_offset, sub_chunk.outdata.length());
9615       sub_chunk.outdata.clear();
9616     }
9617     obs.oi.clear_data_digest();
9618     ctx->at_version = get_next_version();
9619     finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9620     simple_opc_submit(std::move(ctx));
9621
9622     auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9623     /* check remaining work */
9624     if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9625       if (obj_cop->last_offset >= p->first + p->second.length) {
9626         for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9627           if (obj_cop->last_offset < en.first) {
9628             _copy_some_manifest(cobc, obj_cop, en.first);
9629             return;
9630           }
9631         }
9632       }
9633     }
9634   }
9635
9636  out:
9637   dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9638   CopyCallbackResults results(r, &obj_cop->results);
9639   obj_cop->cb->complete(results);
9640
9641   copy_ops.erase(cobc->obs.oi.soid);
9642   cobc->stop_block();
9643
9644   // cancel and requeue proxy ops on this object
9645   if (!r) {
9646     cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9647   }
9648
9649   kick_object_context_blocked(cobc);
9650 }
9651
9652 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9653   vector<ceph_tid_t> tids;
9654   for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9655       it != proxyread_ops.end();) {
9656     if (it->second->soid == oid) {
9657       cancel_proxy_read((it++)->second, &tids);
9658     } else {
9659       ++it;
9660     }
9661   }
9662   for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9663        it != proxywrite_ops.end();) {
9664     if (it->second->soid == oid) {
9665       cancel_proxy_write((it++)->second, &tids);
9666     } else {
9667       ++it;
9668     }
9669   }
9670   osd->objecter->op_cancel(tids, -ECANCELED);
9671   kick_proxy_ops_blocked(oid);
9672 }
9673
9674 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9675 {
9676   dout(20) << __func__ << " " << cop
9677            << " " << cop->attrs.size() << " attrs"
9678            << " " << cop->data.length() << " bytes"
9679            << " " << cop->omap_header.length() << " omap header bytes"
9680            << " " << cop->omap_data.length() << " omap data bytes"
9681            << dendl;
9682   if (!cop->temp_cursor.attr_complete) {
9683     t->create(cop->results.temp_oid);
9684   }
9685   if (!cop->temp_cursor.data_complete) {
9686     ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9687            cop->cursor.data_offset);
9688     if (pool.info.required_alignment() &&
9689         !cop->cursor.data_complete) {
9690       /**
9691        * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9692        * to pick it up on the next pass.
9693        */
9694       ceph_assert(cop->temp_cursor.data_offset %
9695              pool.info.required_alignment() == 0);
9696       if (cop->data.length() % pool.info.required_alignment() != 0) {
9697         uint64_t to_trim =
9698           cop->data.length() % pool.info.required_alignment();
9699         bufferlist bl;
9700         bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9701         cop->data.swap(bl);
9702         cop->cursor.data_offset -= to_trim;
9703         ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9704                cop->cursor.data_offset);
9705       }
9706     }
9707     if (cop->data.length()) {
9708       t->write(
9709         cop->results.temp_oid,
9710         cop->temp_cursor.data_offset,
9711         cop->data.length(),
9712         cop->data,
9713         cop->dest_obj_fadvise_flags);
9714     }
9715     cop->data.clear();
9716   }
9717   if (pool.info.supports_omap()) {
9718     if (!cop->temp_cursor.omap_complete) {
9719       if (cop->omap_header.length()) {
9720         t->omap_setheader(
9721           cop->results.temp_oid,
9722           cop->omap_header);
9723         cop->omap_header.clear();
9724       }
9725       if (cop->omap_data.length()) {
9726         map<string,bufferlist> omap;
9727         bufferlist::const_iterator p = cop->omap_data.begin();
9728         decode(omap, p);
9729         t->omap_setkeys(cop->results.temp_oid, omap);
9730         cop->omap_data.clear();
9731       }
9732     }
9733   } else {
9734     ceph_assert(cop->omap_header.length() == 0);
9735     ceph_assert(cop->omap_data.length() == 0);
9736   }
9737   cop->temp_cursor = cop->cursor;
9738 }
9739
9740 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
9741 {
9742   OpContext *ctx = cb->ctx;
9743   dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
9744
9745   ObjectState& obs = ctx->new_obs;
9746   if (obs.exists) {
9747     dout(20) << __func__ << ": exists, removing" << dendl;
9748     ctx->op_t->remove(obs.oi.soid);
9749   } else {
9750     ctx->delta_stats.num_objects++;
9751     obs.exists = true;
9752   }
9753   if (cb->is_temp_obj_used()) {
9754     ctx->discard_temp_oid = cb->results->temp_oid;
9755   }
9756   cb->results->fill_in_final_tx(ctx->op_t.get());
9757
9758   // CopyFromCallback fills this in for us
9759   obs.oi.user_version = ctx->user_at_version;
9760
9761   if (cb->results->is_data_digest()) {
9762     obs.oi.set_data_digest(cb->results->data_digest);
9763   } else {
9764     obs.oi.clear_data_digest();
9765   }
9766   if (cb->results->is_omap_digest()) {
9767     obs.oi.set_omap_digest(cb->results->omap_digest);
9768   } else {
9769     obs.oi.clear_omap_digest();
9770   }
9771
9772   obs.oi.truncate_seq = cb->truncate_seq;
9773   obs.oi.truncate_size = cb->truncate_size;
9774
9775   obs.oi.mtime = ceph::real_clock::to_timespec(cb->results->mtime);
9776   ctx->mtime = utime_t();
9777
9778   ctx->extra_reqids = cb->results->reqids;
9779   ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
9780
9781   // cache: clear whiteout?
9782   if (obs.oi.is_whiteout()) {
9783     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9784     obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9785     --ctx->delta_stats.num_whiteouts;
9786   }
9787
9788   if (cb->results->has_omap) {
9789     dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9790     obs.oi.set_flag(object_info_t::FLAG_OMAP);
9791     ctx->clean_regions.mark_omap_dirty();
9792   } else {
9793     dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
9794     obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9795   }
9796
9797   interval_set<uint64_t> ch;
9798   if (obs.oi.size > 0)
9799     ch.insert(0, obs.oi.size);
9800   ctx->modified_ranges.union_of(ch);
9801   ctx->clean_regions.mark_data_region_dirty(0, std::max(obs.oi.size, cb->get_data_size()));
9802
9803   if (cb->get_data_size() != obs.oi.size) {
9804     ctx->delta_stats.num_bytes -= obs.oi.size;
9805     obs.oi.size = cb->get_data_size();
9806     ctx->delta_stats.num_bytes += obs.oi.size;
9807   }
9808   ctx->delta_stats.num_wr++;
9809   ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
9810
9811   osd->logger->inc(l_osd_copyfrom);
9812 }
9813
9814 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
9815                                   ObjectContextRef obc)
9816 {
9817   const hobject_t& soid = obc->obs.oi.soid;
9818   dout(10) << __func__ << " " << soid << " r=" << r
9819            << " uv" << results->user_version << dendl;
9820
9821   if (r == -ECANCELED) {
9822     return;
9823   }
9824
9825   if (r != -ENOENT && soid.is_snap()) {
9826     if (results->snaps.empty()) {
9827       // we must have read "snap" content from the head object in the
9828       // base pool.  use snap_seq to construct what snaps should be
9829       // for this clone (what is was before we evicted the clean clone
9830       // from this pool, and what it will be when we flush and the
9831       // clone eventually happens in the base pool).  we want to use
9832       // snaps in (results->snap_seq,soid.snap]
9833       SnapSet& snapset = obc->ssc->snapset;
9834       for (auto p = snapset.clone_snaps.rbegin();
9835            p != snapset.clone_snaps.rend();
9836            ++p) {
9837         for (auto snap : p->second) {
9838           if (snap > soid.snap) {
9839             continue;
9840           }
9841           if (snap <= results->snap_seq) {
9842             break;
9843           }
9844           results->snaps.push_back(snap);
9845         }
9846       }
9847     }
9848
9849     dout(20) << __func__ << " snaps " << results->snaps << dendl;
9850     filter_snapc(results->snaps);
9851
9852     dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
9853     if (results->snaps.empty()) {
9854       dout(20) << __func__
9855                << " snaps are empty, clone is invalid,"
9856                << " setting r to ENOENT" << dendl;
9857       r = -ENOENT;
9858     }
9859   }
9860
9861   if (r < 0 && results->started_temp_obj) {
9862     dout(10) << __func__ << " abort; will clean up partial work" << dendl;
9863     ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
9864     ceph_assert(tempobc);
9865     OpContextUPtr ctx = simple_opc_create(tempobc);
9866     ctx->op_t->remove(results->temp_oid);
9867     simple_opc_submit(std::move(ctx));
9868     results->started_temp_obj = false;
9869   }
9870
9871   if (r == -ENOENT && soid.is_snap()) {
9872     dout(10) << __func__
9873              << ": enoent while trying to promote clone, " << soid
9874              << " must have been trimmed, removing from snapset"
9875              << dendl;
9876     hobject_t head(soid.get_head());
9877     ObjectContextRef obc = get_object_context(head, false);
9878     ceph_assert(obc);
9879
9880     OpContextUPtr tctx = simple_opc_create(obc);
9881     tctx->at_version = get_next_version();
9882     if (get_osdmap()->require_osd_release < ceph_release_t::octopus) {
9883       filter_snapc(tctx->new_snapset.snaps);
9884     } else {
9885       tctx->new_snapset.snaps.clear();
9886     }
9887     vector<snapid_t> new_clones;
9888     map<snapid_t, vector<snapid_t>> new_clone_snaps;
9889     for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
9890          i != tctx->new_snapset.clones.end();
9891          ++i) {
9892       if (*i != soid.snap) {
9893         new_clones.push_back(*i);
9894         auto p = tctx->new_snapset.clone_snaps.find(*i);
9895         if (p != tctx->new_snapset.clone_snaps.end()) {
9896           new_clone_snaps[*i] = p->second;
9897         }
9898       }
9899     }
9900     tctx->new_snapset.clones.swap(new_clones);
9901     tctx->new_snapset.clone_overlap.erase(soid.snap);
9902     tctx->new_snapset.clone_size.erase(soid.snap);
9903     tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
9904
9905     // take RWWRITE lock for duration of our local write.  ignore starvation.
9906     if (!tctx->lock_manager.take_write_lock(
9907           head,
9908           obc)) {
9909       ceph_abort_msg("problem!");
9910     }
9911     dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9912
9913     finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9914
9915     simple_opc_submit(std::move(tctx));
9916     return;
9917   }
9918
9919   bool whiteout = false;
9920   if (r == -ENOENT) {
9921     ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
9922     dout(10) << __func__ << " whiteout " << soid << dendl;
9923     whiteout = true;
9924   }
9925
9926   if (r < 0 && !whiteout) {
9927     derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9928     // pass error to everyone blocked on this object
9929     // FIXME: this is pretty sloppy, but at this point we got
9930     // something unexpected and don't have many other options.
9931     map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9932       waiting_for_blocked_object.find(soid);
9933     if (blocked_iter != waiting_for_blocked_object.end()) {
9934       while (!blocked_iter->second.empty()) {
9935         osd->reply_op_error(blocked_iter->second.front(), r);
9936         blocked_iter->second.pop_front();
9937       }
9938       waiting_for_blocked_object.erase(blocked_iter);
9939     }
9940     return;
9941   }
9942
9943   osd->promote_finish(results->object_size);
9944
9945   OpContextUPtr tctx =  simple_opc_create(obc);
9946   tctx->at_version = get_next_version();
9947
9948   if (!obc->obs.oi.has_manifest()) {
9949     ++tctx->delta_stats.num_objects;
9950   }
9951   if (soid.snap < CEPH_NOSNAP)
9952     ++tctx->delta_stats.num_object_clones;
9953   tctx->new_obs.exists = true;
9954
9955   tctx->extra_reqids = results->reqids;
9956   tctx->extra_reqid_return_codes = results->reqid_return_codes;
9957
9958   if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_redirect()) {
9959     tctx->new_obs.oi.manifest.type = object_manifest_t::TYPE_NONE;
9960     tctx->new_obs.oi.clear_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
9961     tctx->new_obs.oi.clear_flag(object_info_t::FLAG_MANIFEST);
9962     tctx->new_obs.oi.manifest.redirect_target = hobject_t();
9963     tctx->delta_stats.num_objects_manifest--;
9964     if (obc->obs.oi.test_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
9965       dec_all_refcount_manifest(obc->obs.oi, tctx.get());
9966     }
9967   }
9968
9969   if (whiteout) {
9970     // create a whiteout
9971     tctx->op_t->create(soid);
9972     tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
9973     ++tctx->delta_stats.num_whiteouts;
9974     dout(20) << __func__ << " creating whiteout on " << soid << dendl;
9975     osd->logger->inc(l_osd_tier_whiteout);
9976   } else {
9977     if (results->has_omap) {
9978       dout(10) << __func__ << " setting omap flag on " << soid << dendl;
9979       tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
9980       ++tctx->delta_stats.num_objects_omap;
9981     }
9982
9983     results->fill_in_final_tx(tctx->op_t.get());
9984     if (results->started_temp_obj) {
9985       tctx->discard_temp_oid = results->temp_oid;
9986     }
9987     tctx->new_obs.oi.size = results->object_size;
9988     tctx->new_obs.oi.user_version = results->user_version;
9989     tctx->new_obs.oi.mtime = ceph::real_clock::to_timespec(results->mtime);
9990     tctx->mtime = utime_t();
9991     if (results->is_data_digest()) {
9992       tctx->new_obs.oi.set_data_digest(results->data_digest);
9993     } else {
9994       tctx->new_obs.oi.clear_data_digest();
9995     }
9996     if (results->object_size)
9997       tctx->clean_regions.mark_data_region_dirty(0, results->object_size);
9998     if (results->is_omap_digest()) {
9999       tctx->new_obs.oi.set_omap_digest(results->omap_digest);
10000     } else {
10001       tctx->new_obs.oi.clear_omap_digest();
10002     }
10003     if (results->has_omap)
10004         tctx->clean_regions.mark_omap_dirty();
10005     tctx->new_obs.oi.truncate_seq = results->truncate_seq;
10006     tctx->new_obs.oi.truncate_size = results->truncate_size;
10007
10008     if (soid.snap != CEPH_NOSNAP) {
10009       ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
10010       ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
10011       ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
10012              results->object_size);
10013       ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
10014
10015       tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
10016     } else {
10017       tctx->delta_stats.num_bytes += results->object_size;
10018     }
10019   }
10020
10021   if (results->mirror_snapset) {
10022     ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
10023     tctx->new_snapset.from_snap_set(
10024       results->snapset,
10025       get_osdmap()->require_osd_release < ceph_release_t::luminous);
10026   }
10027   dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
10028
10029   // take RWWRITE lock for duration of our local write.  ignore starvation.
10030   if (!tctx->lock_manager.take_write_lock(
10031         obc->obs.oi.soid,
10032         obc)) {
10033     ceph_abort_msg("problem!");
10034   }
10035   dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
10036
10037   finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
10038
10039   simple_opc_submit(std::move(tctx));
10040
10041   osd->logger->inc(l_osd_tier_promote);
10042
10043   if (agent_state &&
10044       agent_state->is_idle())
10045     agent_choose_mode();
10046 }
10047
10048 void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
10049                                             ObjectContextRef obc)
10050 {
10051   const hobject_t& soid = obc->obs.oi.soid;
10052   dout(10) << __func__ << " " << soid << " r=" << r
10053            << " uv" << results->user_version << dendl;
10054
10055   if (r == -ECANCELED || r == -EAGAIN) {
10056     return;
10057   }
10058
10059   if (r < 0) {
10060     derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
10061     // pass error to everyone blocked on this object
10062     // FIXME: this is pretty sloppy, but at this point we got
10063     // something unexpected and don't have many other options.
10064     map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
10065       waiting_for_blocked_object.find(soid);
10066     if (blocked_iter != waiting_for_blocked_object.end()) {
10067       while (!blocked_iter->second.empty()) {
10068         osd->reply_op_error(blocked_iter->second.front(), r);
10069         blocked_iter->second.pop_front();
10070       }
10071       waiting_for_blocked_object.erase(blocked_iter);
10072     }
10073     return;
10074   }
10075
10076   osd->promote_finish(results->object_size);
10077   osd->logger->inc(l_osd_tier_promote);
10078
10079   if (agent_state &&
10080       agent_state->is_idle())
10081     agent_choose_mode();
10082 }
10083
10084 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
10085                                vector<ceph_tid_t> *tids)
10086 {
10087   dout(10) << __func__ << " " << cop->obc->obs.oi.soid
10088            << " from " << cop->src << " " << cop->oloc
10089            << " v" << cop->results.user_version << dendl;
10090
10091   // cancel objecter op, if we can
10092   if (cop->objecter_tid) {
10093     tids->push_back(cop->objecter_tid);
10094     cop->objecter_tid = 0;
10095     if (cop->objecter_tid2) {
10096       tids->push_back(cop->objecter_tid2);
10097       cop->objecter_tid2 = 0;
10098     }
10099   }
10100
10101   copy_ops.erase(cop->obc->obs.oi.soid);
10102   cop->obc->stop_block();
10103
10104   kick_object_context_blocked(cop->obc);
10105   cop->results.should_requeue = requeue;
10106   CopyCallbackResults result(-ECANCELED, &cop->results);
10107   cop->cb->complete(result);
10108
10109   // There may still be an objecter callback referencing this copy op.
10110   // That callback will not need the obc since it's been canceled, and
10111   // we need the obc reference to go away prior to flush.
10112   cop->obc = ObjectContextRef();
10113 }
10114
10115 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
10116 {
10117   dout(10) << __func__ << dendl;
10118   map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
10119   while (p != copy_ops.end()) {
10120     // requeue this op? can I queue up all of them?
10121     cancel_copy((p++)->second, requeue, tids);
10122   }
10123 }
10124
10125
10126 // ========================================================================
10127 // flush
10128 //
10129 // Flush a dirty object in the cache tier by writing it back to the
10130 // base tier.  The sequence looks like:
10131 //
10132 //  * send a copy-from operation to the base tier to copy the current
10133 //    version of the object
10134 //  * base tier will pull the object via (perhaps multiple) copy-get(s)
10135 //  * on completion, we check if the object has been modified.  if so,
10136 //    just reply with -EAGAIN.
10137 //  * try to take a write lock so we can clear the dirty flag.  if this
10138 //    fails, wait and retry
10139 //  * start a repop that clears the bit.
10140 //
10141 // If we have to wait, we will retry by coming back through the
10142 // start_flush method.  We check if a flush is already in progress
10143 // and, if so, try to finish it by rechecking the version and trying
10144 // to clear the dirty bit.
10145 //
10146 // In order for the cache-flush (a write op) to not block the copy-get
10147 // from reading the object, the client *must* set the SKIPRWLOCKS
10148 // flag.
10149 //
10150 // NOTE: normally writes are strictly ordered for the client, but
10151 // flushes are special in that they can be reordered with respect to
10152 // other writes.  In particular, we can't have a flush request block
10153 // an update to the cache pool object!
10154
10155 struct C_Flush : public Context {
10156   PrimaryLogPGRef pg;
10157   hobject_t oid;
10158   epoch_t last_peering_reset;
10159   ceph_tid_t tid;
10160   utime_t start;
10161   C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
10162     : pg(p), oid(o), last_peering_reset(lpr),
10163       tid(0), start(ceph_clock_now())
10164   {}
10165   void finish(int r) override {
10166     if (r == -ECANCELED)
10167       return;
10168     std::scoped_lock locker{*pg};
10169     if (last_peering_reset == pg->get_last_peering_reset()) {
10170       pg->finish_flush(oid, tid, r);
10171       pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
10172     }
10173   }
10174 };
10175
10176 int PrimaryLogPG::start_dedup(OpRequestRef op, ObjectContextRef obc)
10177 {
10178   const object_info_t& oi = obc->obs.oi;
10179   const hobject_t& soid = oi.soid;
10180
10181   ceph_assert(obc->is_blocked());
10182   if (oi.size == 0) {
10183     // evicted
10184     return 0;
10185   }
10186   if (pool.info.get_fingerprint_type() == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10187     dout(0) << " fingerprint algorithm is not set " << dendl;
10188     return -EINVAL;
10189   }
10190
10191   /*
10192    * The operations to make dedup chunks are tracked by a ManifestOp.
10193    * This op will be finished if all the operations are completed.
10194    */
10195   ManifestOpRef mop(std::make_shared<ManifestOp>(nullptr));
10196
10197   // cdc
10198   std::map<uint64_t, bufferlist> chunks;
10199   int r = do_cdc(oi, mop->new_manifest.chunk_map, chunks);
10200   if (r < 0) {
10201     return r;
10202   }
10203   if (!chunks.size()) {
10204     return 0;
10205   }
10206
10207   // chunks issued here are different with chunk_map newly generated
10208   // because the same chunks in previous snap will not be issued
10209   // So, we need two data structures; the first is the issued chunk list to track
10210   // issued operations, and the second is the new chunk_map to update chunk_map after
10211   // all operations are finished
10212   object_ref_delta_t refs;
10213   ObjectContextRef obc_l, obc_g;
10214   get_adjacent_clones(obc, obc_l, obc_g);
10215   // skip if the same content exits in prev snap at same offset
10216   mop->new_manifest.calc_refs_to_inc_on_set(
10217     obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10218     obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10219     refs);
10220
10221   for (auto p : chunks) {
10222     hobject_t target = mop->new_manifest.chunk_map[p.first].oid;
10223     if (refs.find(target) == refs.end()) {
10224       continue;
10225     }
10226     C_SetDedupChunks *fin = new C_SetDedupChunks(this, soid, get_last_peering_reset(), p.first);
10227     ceph_tid_t tid = refcount_manifest(soid, target, refcount_t::CREATE_OR_GET_REF,
10228                             fin, move(chunks[p.first]));
10229     mop->chunks[target] = make_pair(p.first, p.second.length());
10230     mop->num_chunks++;
10231     mop->tids[p.first] = tid;
10232     fin->tid = tid;
10233     dout(10) << __func__ << " oid: " << soid << " tid: " << tid
10234             << " target: " << target << " offset: " << p.first
10235             << " length: " << p.second.length() << dendl;
10236   }
10237
10238   if (mop->tids.size()) {
10239     manifest_ops[soid] = mop;
10240     manifest_ops[soid]->op = op;
10241   } else {
10242     // size == 0
10243     return 0;
10244   }
10245
10246   return -EINPROGRESS;
10247 }
10248
10249 int PrimaryLogPG::do_cdc(const object_info_t& oi,
10250                          std::map<uint64_t, chunk_info_t>& chunk_map,
10251                          std::map<uint64_t, bufferlist>& chunks)
10252 {
10253   string chunk_algo = pool.info.get_dedup_chunk_algorithm_name();
10254   int64_t chunk_size = pool.info.get_dedup_cdc_chunk_size();
10255   uint64_t total_length = 0;
10256
10257   std::unique_ptr<CDC> cdc = CDC::create(chunk_algo, cbits(chunk_size)-1);
10258   if (!cdc) {
10259     dout(0) << __func__ << " unrecognized chunk-algorithm " << dendl;
10260     return -EINVAL;
10261   }
10262
10263   bufferlist bl;
10264   /**
10265    * We disable EC pool as a base tier of distributed dedup.
10266    * The reason why we disallow erasure code pool here is that the EC pool does not support objects_read_sync().
10267    * Therefore, we should change the current implementation totally to make EC pool compatible.
10268    * As s result, we leave this as a future work.
10269    */
10270   int r = pgbackend->objects_read_sync(
10271       oi.soid, 0, oi.size, 0, &bl);
10272   if (r < 0) {
10273     dout(0) << __func__ << " read fail " << oi.soid
10274             << " len: " << oi.size << " r: " << r << dendl;
10275     return r;
10276   }
10277   if (bl.length() != oi.size) {
10278     dout(0) << __func__ << " bl.length: " << bl.length() << " != oi.size: "
10279             << oi.size << " during chunking " << dendl;
10280     return -EIO;
10281   }
10282
10283   dout(10) << __func__ << " oid: " << oi.soid << " len: " << bl.length()
10284            << " oi.size: " << oi.size
10285            << " chunk_size: " << chunk_size << dendl;
10286
10287   vector<pair<uint64_t, uint64_t>> cdc_chunks;
10288   cdc->calc_chunks(bl, &cdc_chunks);
10289
10290   // get fingerprint
10291   for (auto p : cdc_chunks) {
10292     bufferlist chunk;
10293     chunk.substr_of(bl, p.first, p.second);
10294     hobject_t target = get_fpoid_from_chunk(oi.soid, chunk);
10295     chunks[p.first] = move(chunk);
10296     chunk_map[p.first] = chunk_info_t(0, p.second, target);
10297     total_length += p.second;
10298   }
10299   return total_length;
10300 }
10301
10302 hobject_t PrimaryLogPG::get_fpoid_from_chunk(const hobject_t soid, bufferlist& chunk)
10303 {
10304   pg_pool_t::fingerprint_t fp_algo = pool.info.get_fingerprint_type();
10305   if (fp_algo == pg_pool_t::TYPE_FINGERPRINT_NONE) {
10306     return hobject_t();
10307   }
10308   object_t fp_oid = [&fp_algo, &chunk]() -> string {
10309     switch (fp_algo) {
10310       case pg_pool_t::TYPE_FINGERPRINT_SHA1:
10311         return ceph::crypto::digest<ceph::crypto::SHA1>(chunk).to_str();
10312       case pg_pool_t::TYPE_FINGERPRINT_SHA256:
10313         return ceph::crypto::digest<ceph::crypto::SHA256>(chunk).to_str();
10314       case pg_pool_t::TYPE_FINGERPRINT_SHA512:
10315         return ceph::crypto::digest<ceph::crypto::SHA512>(chunk).to_str();
10316       default:
10317         assert(0 == "unrecognized fingerprint type");
10318         return {};
10319     }
10320   }();
10321
10322   pg_t raw_pg;
10323   object_locator_t oloc(soid);
10324   oloc.pool = pool.info.get_dedup_tier();
10325   get_osdmap()->object_locator_to_pg(fp_oid, oloc, raw_pg);
10326   hobject_t target(fp_oid, oloc.key, snapid_t(),
10327                     raw_pg.ps(), raw_pg.pool(),
10328                     oloc.nspace);
10329   return target;
10330 }
10331
10332 int PrimaryLogPG::finish_set_dedup(hobject_t oid, int r, ceph_tid_t tid, uint64_t offset)
10333 {
10334   dout(10) << __func__ << " " << oid << " tid " << tid
10335            << " " << cpp_strerror(r) << dendl;
10336   map<hobject_t,ManifestOpRef>::iterator p = manifest_ops.find(oid);
10337   if (p == manifest_ops.end()) {
10338     dout(10) << __func__ << " no manifest_op found" << dendl;
10339     return -EINVAL;
10340   }
10341   ManifestOpRef mop = p->second;
10342   mop->results[offset] = r;
10343   if (r < 0) {
10344     // if any failure occurs, put a mark on the results to recognize the failure
10345     mop->results[0] = r;
10346   }
10347   if (mop->num_chunks != mop->results.size()) {
10348     // there are on-going works
10349     return -EINPROGRESS;
10350   }
10351   ObjectContextRef obc = get_object_context(oid, false);
10352   if (!obc) {
10353     if (mop->op)
10354       osd->reply_op_error(mop->op, -EINVAL);
10355     return -EINVAL;
10356   }
10357   ceph_assert(obc->is_blocked());
10358   obc->stop_block();
10359   kick_object_context_blocked(obc);
10360   if (mop->results[0] < 0) {
10361     // check if the previous op returns fail
10362     ceph_assert(mop->num_chunks == mop->results.size());
10363     manifest_ops.erase(oid);
10364     osd->reply_op_error(mop->op, mop->results[0]);
10365     return -EIO;
10366   }
10367
10368   if (mop->chunks.size()) {
10369     OpContextUPtr ctx = simple_opc_create(obc);
10370     ceph_assert(ctx);
10371     if (ctx->lock_manager.get_lock_type(
10372           RWState::RWWRITE,
10373           oid,
10374           obc,
10375           mop->op)) {
10376       dout(20) << __func__ << " took write lock" << dendl;
10377     } else if (mop->op) {
10378       dout(10) << __func__ << " waiting on write lock " << mop->op << dendl;
10379       close_op_ctx(ctx.release());
10380       return -EAGAIN;
10381     }
10382
10383     ctx->at_version = get_next_version();
10384     ctx->new_obs = obc->obs;
10385     ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10386
10387     /*
10388     * Let's assume that there is a manifest snapshotted object, and we issue tier_flush() to head.
10389     * head: [0, 2) aaa <-- tier_flush()
10390     * 20:   [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10391     *
10392     * In this case, if the new chunk_map is as follows,
10393     * new_chunk_map : [0, 2) ddd, [6, 2) bbb, [8, 2) ccc
10394     * we should drop aaa from head by using calc_refs_to_drop_on_removal().
10395     * So, the precedure is
10396     *   1. calc_refs_to_drop_on_removal()
10397     *   2. register old references to drop after tier_flush() is committed
10398     *   3. update new chunk_map
10399     */
10400
10401     ObjectCleanRegions c_regions = ctx->clean_regions;
10402     ObjectContextRef cobc = get_prev_clone_obc(obc);
10403     c_regions.mark_fully_dirty();
10404     // CDC was done on entire range of manifest object,
10405     // so the first thing we should do here is to drop the reference to old chunks
10406     ObjectContextRef obc_l, obc_g;
10407     get_adjacent_clones(obc, obc_l, obc_g);
10408     // clear all old references
10409     object_ref_delta_t refs;
10410     ctx->obs->oi.manifest.calc_refs_to_drop_on_removal(
10411       obc_l ? &(obc_l->obs.oi.manifest) : nullptr,
10412       obc_g ? &(obc_g->obs.oi.manifest) : nullptr,
10413       refs);
10414     if (!refs.is_empty()) {
10415       ctx->register_on_commit(
10416         [oid, this, refs](){
10417           dec_refcount(oid, refs);
10418         });
10419     }
10420
10421     // set new references
10422     ctx->new_obs.oi.manifest.chunk_map = mop->new_manifest.chunk_map;
10423
10424     finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10425     simple_opc_submit(std::move(ctx));
10426   }
10427   if (mop->op)
10428     osd->reply_op_error(mop->op, r);
10429
10430   manifest_ops.erase(oid);
10431   return 0;
10432 }
10433
10434 int PrimaryLogPG::start_flush(
10435   OpRequestRef op, ObjectContextRef obc,
10436   bool blocking, hobject_t *pmissing,
10437   std::optional<std::function<void()>> &&on_flush)
10438 {
10439   const object_info_t& oi = obc->obs.oi;
10440   const hobject_t& soid = oi.soid;
10441   dout(10) << __func__ << " " << soid
10442            << " v" << oi.version
10443            << " uv" << oi.user_version
10444            << " " << (blocking ? "blocking" : "non-blocking/best-effort")
10445            << dendl;
10446
10447   bool preoctopus_compat =
10448     get_osdmap()->require_osd_release < ceph_release_t::octopus;
10449   SnapSet snapset;
10450   if (preoctopus_compat) {
10451     // for pre-octopus compatibility, filter SnapSet::snaps.  not
10452     // certain we need this, but let's be conservative.
10453     snapset = obc->ssc->snapset.get_filtered(pool.info);
10454   } else {
10455     // NOTE: change this to a const ref when we remove this compat code
10456     snapset = obc->ssc->snapset;
10457   }
10458
10459   if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10460     // current dedup tier only supports blocking operation
10461     if (!blocking) {
10462       return -EOPNOTSUPP;
10463     }
10464   }
10465
10466   // verify there are no (older) check for dirty clones
10467   {
10468     dout(20) << " snapset " << snapset << dendl;
10469     vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
10470     while (p != snapset.clones.rend() && *p >= soid.snap)
10471       ++p;
10472     if (p != snapset.clones.rend()) {
10473       hobject_t next = soid;
10474       next.snap = *p;
10475       ceph_assert(next.snap < soid.snap);
10476       if (recovery_state.get_pg_log().get_missing().is_missing(next)) {
10477         dout(10) << __func__ << " missing clone is " << next << dendl;
10478         if (pmissing)
10479           *pmissing = next;
10480         return -ENOENT;
10481       }
10482       ObjectContextRef older_obc = get_object_context(next, false);
10483       if (older_obc) {
10484         dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
10485                  << dendl;
10486         if (older_obc->obs.oi.is_dirty()) {
10487           dout(10) << __func__ << " next oldest clone is dirty: "
10488                    << older_obc->obs.oi << dendl;
10489           return -EBUSY;
10490         }
10491       } else {
10492         dout(20) << __func__ << " next oldest clone " << next
10493                  << " is not present; implicitly clean" << dendl;
10494       }
10495     } else {
10496       dout(20) << __func__ << " no older clones" << dendl;
10497     }
10498   }
10499
10500   if (blocking)
10501     obc->start_block();
10502
10503   map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10504   if (p != flush_ops.end()) {
10505     FlushOpRef fop = p->second;
10506     if (fop->op == op) {
10507       // we couldn't take the write lock on a cache-try-flush before;
10508       // now we are trying again for the lock.
10509       return try_flush_mark_clean(fop);
10510     }
10511     if (fop->flushed_version == obc->obs.oi.user_version &&
10512         (fop->blocking || !blocking)) {
10513       // nonblocking can join anything
10514       // blocking can only join a blocking flush
10515       dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10516       if (op)
10517         fop->dup_ops.push_back(op);
10518       return -EAGAIN;   // clean up this ctx; op will retry later
10519     }
10520
10521     // cancel current flush since it will fail anyway, or because we
10522     // are blocking and the existing flush is nonblocking.
10523     dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10524     if (fop->op)
10525       osd->reply_op_error(fop->op, -EBUSY);
10526     while (!fop->dup_ops.empty()) {
10527       osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10528       fop->dup_ops.pop_front();
10529     }
10530     vector<ceph_tid_t> tids;
10531     cancel_flush(fop, false, &tids);
10532     osd->objecter->op_cancel(tids, -ECANCELED);
10533   }
10534
10535   if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10536     int r = start_dedup(op, obc);
10537     if (r != -EINPROGRESS) {
10538       if (blocking)
10539         obc->stop_block();
10540     }
10541     return r;
10542   }
10543
10544   /**
10545    * In general, we need to send a delete and a copyfrom.
10546    * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10547    * where 4 is marked as clean.  To flush 10, we have to:
10548    * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10549    * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10550    *
10551    * There is a complicating case.  Supposed there had been a clone 7
10552    * for snaps [7, 6] which has been trimmed since they no longer exist.
10553    * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head.  When we submit
10554    * the delete, the snap will be promoted to 5, and the head will become
10555    * a whiteout.  When the copy-from goes through, we'll end up with
10556    * 8:[8,4,3,2]:[4(4,3,2)]+head.
10557    *
10558    * Another complication is the case where there is an interval change
10559    * after doing the delete and the flush but before marking the object
10560    * clean.  We'll happily delete head and then recreate it at the same
10561    * sequence number, which works out ok.
10562    */
10563
10564   SnapContext snapc, dsnapc;
10565   if (snapset.seq != 0) {
10566     if (soid.snap == CEPH_NOSNAP) {
10567       snapc = snapset.get_ssc_as_of(snapset.seq);
10568     } else {
10569       snapid_t min_included_snap;
10570       auto p = snapset.clone_snaps.find(soid.snap);
10571       ceph_assert(p != snapset.clone_snaps.end());
10572       min_included_snap = p->second.back();
10573       snapc = snapset.get_ssc_as_of(min_included_snap - 1);
10574     }
10575
10576     snapid_t prev_snapc = 0;
10577     for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
10578          citer != snapset.clones.rend();
10579          ++citer) {
10580       if (*citer < soid.snap) {
10581         prev_snapc = *citer;
10582         break;
10583       }
10584     }
10585
10586     dsnapc = snapset.get_ssc_as_of(prev_snapc);
10587   }
10588
10589   object_locator_t base_oloc(soid);
10590   base_oloc.pool = pool.info.tier_of;
10591
10592   if (dsnapc.seq < snapc.seq) {
10593     ObjectOperation o;
10594     o.remove();
10595     osd->objecter->mutate(
10596       soid.oid,
10597       base_oloc,
10598       o,
10599       dsnapc,
10600       ceph::real_clock::from_ceph_timespec(oi.mtime),
10601       (CEPH_OSD_FLAG_IGNORE_OVERLAY |
10602        CEPH_OSD_FLAG_ENFORCE_SNAPC),
10603       NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
10604   }
10605
10606   FlushOpRef fop(std::make_shared<FlushOp>());
10607   fop->obc = obc;
10608   fop->flushed_version = oi.user_version;
10609   fop->blocking = blocking;
10610   fop->on_flush = std::move(on_flush);
10611   fop->op = op;
10612
10613   ObjectOperation o;
10614   if (oi.is_whiteout()) {
10615     fop->removal = true;
10616     o.remove();
10617   } else {
10618     object_locator_t oloc(soid);
10619     o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
10620                 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
10621                 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
10622                 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
10623                 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
10624                 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
10625
10626     //mean the base tier don't cache data after this
10627     if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
10628       o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
10629   }
10630   C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
10631
10632   ceph_tid_t tid = osd->objecter->mutate(
10633     soid.oid, base_oloc, o, snapc,
10634     ceph::real_clock::from_ceph_timespec(oi.mtime),
10635     CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
10636     new C_OnFinisher(fin,
10637                      osd->get_objecter_finisher(get_pg_shard())));
10638   /* we're under the pg lock and fin->finish() is grabbing that */
10639   fin->tid = tid;
10640   fop->objecter_tid = tid;
10641
10642   flush_ops[soid] = fop;
10643
10644   recovery_state.update_stats(
10645     [&oi](auto &history, auto &stats) {
10646       stats.stats.sum.num_flush++;
10647       stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
10648       return false;
10649     });
10650   return -EINPROGRESS;
10651 }
10652
10653 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
10654 {
10655   dout(10) << __func__ << " " << oid << " tid " << tid
10656            << " " << cpp_strerror(r) << dendl;
10657   map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
10658   if (p == flush_ops.end()) {
10659     dout(10) << __func__ << " no flush_op found" << dendl;
10660     return;
10661   }
10662   FlushOpRef fop = p->second;
10663   if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
10664     dout(10) << __func__ << " tid " << tid << " != fop " << fop
10665              << " tid " << fop->objecter_tid << dendl;
10666     return;
10667   }
10668   ObjectContextRef obc = fop->obc;
10669   fop->objecter_tid = 0;
10670
10671   if (r < 0 && !(r == -ENOENT && fop->removal)) {
10672     if (fop->op)
10673       osd->reply_op_error(fop->op, -EBUSY);
10674     if (fop->blocking) {
10675       obc->stop_block();
10676       kick_object_context_blocked(obc);
10677     }
10678
10679     if (!fop->dup_ops.empty()) {
10680       dout(20) << __func__ << " requeueing dups" << dendl;
10681       requeue_ops(fop->dup_ops);
10682     }
10683     if (fop->on_flush) {
10684       (*(fop->on_flush))();
10685       fop->on_flush = std::nullopt;
10686     }
10687     flush_ops.erase(oid);
10688     return;
10689   }
10690
10691   r = try_flush_mark_clean(fop);
10692   if (r == -EBUSY && fop->op) {
10693     osd->reply_op_error(fop->op, r);
10694   }
10695 }
10696
10697 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
10698 {
10699   ObjectContextRef obc = fop->obc;
10700   const hobject_t& oid = obc->obs.oi.soid;
10701
10702   if (fop->blocking) {
10703     obc->stop_block();
10704     kick_object_context_blocked(obc);
10705   }
10706
10707   if (fop->flushed_version != obc->obs.oi.user_version ||
10708       !obc->obs.exists) {
10709     if (obc->obs.exists)
10710       dout(10) << __func__ << " flushed_version " << fop->flushed_version
10711                << " != current " << obc->obs.oi.user_version
10712                << dendl;
10713     else
10714       dout(10) << __func__ << " object no longer exists" << dendl;
10715
10716     if (!fop->dup_ops.empty()) {
10717       dout(20) << __func__ << " requeueing dups" << dendl;
10718       requeue_ops(fop->dup_ops);
10719     }
10720     if (fop->on_flush) {
10721       (*(fop->on_flush))();
10722       fop->on_flush = std::nullopt;
10723     }
10724     flush_ops.erase(oid);
10725     if (fop->blocking)
10726       osd->logger->inc(l_osd_tier_flush_fail);
10727     else
10728       osd->logger->inc(l_osd_tier_try_flush_fail);
10729     return -EBUSY;
10730   }
10731
10732   if (!fop->blocking &&
10733       m_scrubber->write_blocked_by_scrub(oid)) {
10734     if (fop->op) {
10735       dout(10) << __func__ << " blocked by scrub" << dendl;
10736       requeue_op(fop->op);
10737       requeue_ops(fop->dup_ops);
10738       return -EAGAIN;    // will retry
10739     } else {
10740       osd->logger->inc(l_osd_tier_try_flush_fail);
10741       vector<ceph_tid_t> tids;
10742       cancel_flush(fop, false, &tids);
10743       osd->objecter->op_cancel(tids, -ECANCELED);
10744       return -ECANCELED;
10745     }
10746   }
10747
10748   // successfully flushed, can we evict this object?
10749   if (!obc->obs.oi.has_manifest() && !fop->op &&
10750       agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
10751       agent_maybe_evict(obc, true)) {
10752     osd->logger->inc(l_osd_tier_clean);
10753     if (fop->on_flush) {
10754       (*(fop->on_flush))();
10755       fop->on_flush = std::nullopt;
10756     }
10757     flush_ops.erase(oid);
10758     return 0;
10759   }
10760
10761   dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
10762   OpContextUPtr ctx = simple_opc_create(fop->obc);
10763
10764   // successfully flushed; can we clear the dirty bit?
10765   // try to take the lock manually, since we don't
10766   // have a ctx yet.
10767   if (ctx->lock_manager.get_lock_type(
10768         RWState::RWWRITE,
10769         oid,
10770         obc,
10771         fop->op)) {
10772     dout(20) << __func__ << " took write lock" << dendl;
10773   } else if (fop->op) {
10774     dout(10) << __func__ << " waiting on write lock " << fop->op << " "
10775              << fop->dup_ops << dendl;
10776     // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10777     for (auto op : fop->dup_ops) {
10778       bool locked = ctx->lock_manager.get_lock_type(
10779         RWState::RWWRITE,
10780         oid,
10781         obc,
10782         op);
10783       ceph_assert(!locked);
10784     }
10785     close_op_ctx(ctx.release());
10786     return -EAGAIN;    // will retry
10787   } else {
10788     dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
10789     close_op_ctx(ctx.release());
10790     osd->logger->inc(l_osd_tier_try_flush_fail);
10791     vector<ceph_tid_t> tids;
10792     cancel_flush(fop, false, &tids);
10793     osd->objecter->op_cancel(tids, -ECANCELED);
10794     return -ECANCELED;
10795   }
10796
10797   if (fop->on_flush) {
10798     ctx->register_on_finish(*(fop->on_flush));
10799     fop->on_flush = std::nullopt;
10800   }
10801
10802   ctx->at_version = get_next_version();
10803
10804   ctx->new_obs = obc->obs;
10805   ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10806   --ctx->delta_stats.num_objects_dirty;
10807   if (fop->obc->obs.oi.has_manifest()) {
10808     ceph_assert(obc->obs.oi.manifest.is_chunked());
10809     PGTransaction* t = ctx->op_t.get();
10810     uint64_t chunks_size = 0;
10811     for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10812       chunks_size += p.second.length;
10813     }
10814     if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
10815       t->omap_clear(oid);
10816       ctx->new_obs.oi.clear_omap_digest();
10817       ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10818       ctx->clean_regions.mark_omap_dirty();
10819     }
10820     if (obc->obs.oi.size == chunks_size) {
10821       t->truncate(oid, 0);
10822       interval_set<uint64_t> trim;
10823       trim.insert(0, ctx->new_obs.oi.size);
10824       ctx->modified_ranges.union_of(trim);
10825       truncate_update_size_and_usage(ctx->delta_stats,
10826                                      ctx->new_obs.oi,
10827                                      0);
10828       ctx->clean_regions.mark_data_region_dirty(0, ctx->new_obs.oi.size);
10829       ctx->new_obs.oi.new_object();
10830       for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10831         p.second.set_flag(chunk_info_t::FLAG_MISSING);
10832       }
10833     } else {
10834       for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10835         dout(20) << __func__ << " offset: " << p.second.offset
10836                 << " length: " << p.second.length << dendl;
10837         p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
10838       }
10839     }
10840   }
10841
10842   finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10843
10844   osd->logger->inc(l_osd_tier_clean);
10845
10846   if (!fop->dup_ops.empty() || fop->op) {
10847     dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
10848     list<OpRequestRef> ls;
10849     if (fop->op)
10850       ls.push_back(fop->op);
10851     ls.splice(ls.end(), fop->dup_ops);
10852     requeue_ops(ls);
10853   }
10854
10855   simple_opc_submit(std::move(ctx));
10856
10857   flush_ops.erase(oid);
10858
10859   if (fop->blocking)
10860     osd->logger->inc(l_osd_tier_flush);
10861   else
10862     osd->logger->inc(l_osd_tier_try_flush);
10863
10864   return -EINPROGRESS;
10865 }
10866
10867 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
10868                                 vector<ceph_tid_t> *tids)
10869 {
10870   dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
10871            << fop->objecter_tid << dendl;
10872   if (fop->objecter_tid) {
10873     tids->push_back(fop->objecter_tid);
10874     fop->objecter_tid = 0;
10875   }
10876   if (fop->io_tids.size()) {
10877     for (auto &p : fop->io_tids) {
10878       tids->push_back(p.second);
10879       p.second = 0;
10880     }
10881   }
10882   if (fop->blocking && fop->obc->is_blocked()) {
10883     fop->obc->stop_block();
10884     kick_object_context_blocked(fop->obc);
10885   }
10886   if (requeue) {
10887     if (fop->op)
10888       requeue_op(fop->op);
10889     requeue_ops(fop->dup_ops);
10890   }
10891   if (fop->on_flush) {
10892     (*(fop->on_flush))();
10893     fop->on_flush = std::nullopt;
10894   }
10895   flush_ops.erase(fop->obc->obs.oi.soid);
10896 }
10897
10898 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
10899 {
10900   dout(10) << __func__ << dendl;
10901   map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
10902   while (p != flush_ops.end()) {
10903     cancel_flush((p++)->second, requeue, tids);
10904   }
10905 }
10906
10907 bool PrimaryLogPG::is_present_clone(hobject_t coid)
10908 {
10909   if (!pool.info.allow_incomplete_clones())
10910     return true;
10911   if (is_missing_object(coid))
10912     return true;
10913   ObjectContextRef obc = get_object_context(coid, false);
10914   return obc && obc->obs.exists;
10915 }
10916
10917 // ========================================================================
10918 // rep op gather
10919
10920 class C_OSD_RepopCommit : public Context {
10921   PrimaryLogPGRef pg;
10922   boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
10923 public:
10924   C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
10925     : pg(pg), repop(repop) {}
10926   void finish(int) override {
10927     pg->repop_all_committed(repop.get());
10928   }
10929 };
10930
10931 void PrimaryLogPG::repop_all_committed(RepGather *repop)
10932 {
10933   dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
10934            << dendl;
10935   repop->all_committed = true;
10936   if (!repop->rep_aborted) {
10937     if (repop->v != eversion_t()) {
10938       recovery_state.complete_write(repop->v, repop->pg_local_last_complete);
10939     }
10940     eval_repop(repop);
10941   }
10942 }
10943
10944 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
10945 {
10946   dout(10) << "op_applied version " << applied_version << dendl;
10947   ceph_assert(applied_version != eversion_t());
10948   ceph_assert(applied_version <= info.last_update);
10949   recovery_state.local_write_applied(applied_version);
10950
10951   if (is_primary() && m_scrubber->should_requeue_blocked_ops(recovery_state.get_last_update_applied())) {
10952     osd->queue_scrub_applied_update(this, is_scrub_blocking_ops());
10953   }
10954 }
10955
10956 void PrimaryLogPG::eval_repop(RepGather *repop)
10957 {
10958   #ifdef HAVE_JAEGER
10959   if (repop->op->osd_parent_span) {
10960     auto eval_span = jaeger_tracing::child_span(__func__, repop->op->osd_parent_span);
10961   }
10962  #endif
10963   dout(10) << "eval_repop " << *repop
10964     << (repop->op && repop->op->get_req<MOSDOp>() ? "" : " (no op)") << dendl;
10965
10966   // ondisk?
10967   if (repop->all_committed) {
10968     dout(10) << " commit: " << *repop << dendl;
10969     for (auto p = repop->on_committed.begin();
10970          p != repop->on_committed.end();
10971          repop->on_committed.erase(p++)) {
10972       (*p)();
10973     }
10974     // send dup commits, in order
10975     auto it = waiting_for_ondisk.find(repop->v);
10976     if (it != waiting_for_ondisk.end()) {
10977       ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
10978       for (auto& i : it->second) {
10979         int return_code = repop->r;
10980         if (return_code >= 0) {
10981           return_code = std::get<2>(i);
10982         }
10983         osd->reply_op_error(std::get<0>(i), return_code, repop->v,
10984                             std::get<1>(i), std::get<3>(i));
10985       }
10986       waiting_for_ondisk.erase(it);
10987     }
10988
10989     publish_stats_to_osd();
10990
10991     dout(10) << " removing " << *repop << dendl;
10992     ceph_assert(!repop_queue.empty());
10993     dout(20) << "   q front is " << *repop_queue.front() << dendl;
10994     if (repop_queue.front() == repop) {
10995       RepGather *to_remove = nullptr;
10996       while (!repop_queue.empty() &&
10997              (to_remove = repop_queue.front())->all_committed) {
10998         repop_queue.pop_front();
10999         for (auto p = to_remove->on_success.begin();
11000              p != to_remove->on_success.end();
11001              to_remove->on_success.erase(p++)) {
11002           (*p)();
11003         }
11004         remove_repop(to_remove);
11005       }
11006     }
11007   }
11008 }
11009
11010 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
11011 {
11012   FUNCTRACE(cct);
11013   const hobject_t& soid = ctx->obs->oi.soid;
11014   dout(7) << "issue_repop rep_tid " << repop->rep_tid
11015           << " o " << soid
11016           << dendl;
11017 #ifdef HAVE_JAEGER
11018   if (ctx->op->osd_parent_span) {
11019     auto issue_repop_span = jaeger_tracing::child_span(__func__, ctx->op->osd_parent_span);
11020   }
11021 #endif
11022
11023   repop->v = ctx->at_version;
11024
11025   ctx->op_t->add_obc(ctx->obc);
11026   if (ctx->clone_obc) {
11027     ctx->op_t->add_obc(ctx->clone_obc);
11028   }
11029   if (ctx->head_obc) {
11030     ctx->op_t->add_obc(ctx->head_obc);
11031   }
11032
11033   Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
11034   if (!(ctx->log.empty())) {
11035     ceph_assert(ctx->at_version >= projected_last_update);
11036     projected_last_update = ctx->at_version;
11037   }
11038   for (auto &&entry: ctx->log) {
11039     projected_log.add(entry);
11040   }
11041
11042   recovery_state.pre_submit_op(
11043     soid,
11044     ctx->log,
11045     ctx->at_version);
11046   pgbackend->submit_transaction(
11047     soid,
11048     ctx->delta_stats,
11049     ctx->at_version,
11050     std::move(ctx->op_t),
11051     recovery_state.get_pg_trim_to(),
11052     recovery_state.get_min_last_complete_ondisk(),
11053     std::move(ctx->log),
11054     ctx->updated_hset_history,
11055     on_all_commit,
11056     repop->rep_tid,
11057     ctx->reqid,
11058     ctx->op);
11059 }
11060
11061 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
11062   OpContext *ctx, ObjectContextRef obc,
11063   ceph_tid_t rep_tid)
11064 {
11065   if (ctx->op)
11066     dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
11067   else
11068     dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
11069
11070   RepGather *repop = new RepGather(
11071     ctx, rep_tid, info.last_complete);
11072
11073   repop->start = ceph_clock_now();
11074
11075   repop_queue.push_back(&repop->queue_item);
11076   repop->get();
11077
11078   osd->logger->inc(l_osd_op_wip);
11079
11080   dout(10) << __func__ << ": " << *repop << dendl;
11081   return repop;
11082 }
11083
11084 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
11085   eversion_t version,
11086   int r,
11087   ObcLockManager &&manager,
11088   OpRequestRef &&op,
11089   std::optional<std::function<void(void)> > &&on_complete)
11090 {
11091   RepGather *repop = new RepGather(
11092     std::move(manager),
11093     std::move(op),
11094     std::move(on_complete),
11095     osd->get_tid(),
11096     info.last_complete,
11097     r);
11098   repop->v = version;
11099
11100   repop->start = ceph_clock_now();
11101
11102   repop_queue.push_back(&repop->queue_item);
11103
11104   osd->logger->inc(l_osd_op_wip);
11105
11106   dout(10) << __func__ << ": " << *repop << dendl;
11107   return boost::intrusive_ptr<RepGather>(repop);
11108 }
11109
11110 void PrimaryLogPG::remove_repop(RepGather *repop)
11111 {
11112   dout(20) << __func__ << " " << *repop << dendl;
11113
11114   for (auto p = repop->on_finish.begin();
11115        p != repop->on_finish.end();
11116        repop->on_finish.erase(p++)) {
11117     (*p)();
11118   }
11119
11120   release_object_locks(
11121     repop->lock_manager);
11122   repop->put();
11123
11124   osd->logger->dec(l_osd_op_wip);
11125 }
11126
11127 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
11128 {
11129   dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
11130   ceph_tid_t rep_tid = osd->get_tid();
11131   osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
11132   OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
11133   ctx->op_t.reset(new PGTransaction());
11134   ctx->mtime = ceph_clock_now();
11135   return ctx;
11136 }
11137
11138 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
11139 {
11140   RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
11141   dout(20) << __func__ << " " << repop << dendl;
11142   issue_repop(repop, ctx.get());
11143   eval_repop(repop);
11144   recovery_state.update_trim_to();
11145   repop->put();
11146 }
11147
11148
11149 void PrimaryLogPG::submit_log_entries(
11150   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
11151   ObcLockManager &&manager,
11152   std::optional<std::function<void(void)> > &&_on_complete,
11153   OpRequestRef op,
11154   int r)
11155 {
11156   dout(10) << __func__ << " " << entries << dendl;
11157   ceph_assert(is_primary());
11158
11159   eversion_t version;
11160   if (!entries.empty()) {
11161     ceph_assert(entries.rbegin()->version >= projected_last_update);
11162     version = projected_last_update = entries.rbegin()->version;
11163   }
11164
11165   boost::intrusive_ptr<RepGather> repop;
11166   std::optional<std::function<void(void)> > on_complete;
11167   if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11168     repop = new_repop(
11169       version,
11170       r,
11171       std::move(manager),
11172       std::move(op),
11173       std::move(_on_complete));
11174   } else {
11175     on_complete = std::move(_on_complete);
11176   }
11177
11178   pgbackend->call_write_ordered(
11179     [this, entries, repop, on_complete]() {
11180       ObjectStore::Transaction t;
11181       eversion_t old_last_update = info.last_update;
11182       recovery_state.merge_new_log_entries(
11183         entries, t, recovery_state.get_pg_trim_to(),
11184         recovery_state.get_min_last_complete_ondisk());
11185
11186       set<pg_shard_t> waiting_on;
11187       for (set<pg_shard_t>::const_iterator i = get_acting_recovery_backfill().begin();
11188            i != get_acting_recovery_backfill().end();
11189            ++i) {
11190         pg_shard_t peer(*i);
11191         if (peer == pg_whoami) continue;
11192         ceph_assert(recovery_state.get_peer_missing().count(peer));
11193         ceph_assert(recovery_state.has_peer_info(peer));
11194         if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
11195           ceph_assert(repop);
11196           MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
11197             entries,
11198             spg_t(info.pgid.pgid, i->shard),
11199             pg_whoami.shard,
11200             get_osdmap_epoch(),
11201             get_last_peering_reset(),
11202             repop->rep_tid,
11203             recovery_state.get_pg_trim_to(),
11204             recovery_state.get_min_last_complete_ondisk());
11205           osd->send_message_osd_cluster(
11206             peer.osd, m, get_osdmap_epoch());
11207           waiting_on.insert(peer);
11208         } else {
11209           MOSDPGLog *m = new MOSDPGLog(
11210             peer.shard, pg_whoami.shard,
11211             info.last_update.epoch,
11212             info, get_last_peering_reset());
11213           m->log.log = entries;
11214           m->log.tail = old_last_update;
11215           m->log.head = info.last_update;
11216           osd->send_message_osd_cluster(
11217             peer.osd, m, get_osdmap_epoch());
11218         }
11219       }
11220       ceph_tid_t rep_tid = repop->rep_tid;
11221       waiting_on.insert(pg_whoami);
11222       log_entry_update_waiting_on.insert(
11223         make_pair(
11224           rep_tid,
11225           LogUpdateCtx{std::move(repop), std::move(waiting_on)}
11226           ));
11227       struct OnComplete : public Context {
11228         PrimaryLogPGRef pg;
11229         ceph_tid_t rep_tid;
11230         epoch_t epoch;
11231         OnComplete(
11232           PrimaryLogPGRef pg,
11233           ceph_tid_t rep_tid,
11234           epoch_t epoch)
11235           : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
11236         void finish(int) override {
11237           std::scoped_lock l{*pg};
11238           if (!pg->pg_has_reset_since(epoch)) {
11239             auto it = pg->log_entry_update_waiting_on.find(rep_tid);
11240             ceph_assert(it != pg->log_entry_update_waiting_on.end());
11241             auto it2 = it->second.waiting_on.find(pg->pg_whoami);
11242             ceph_assert(it2 != it->second.waiting_on.end());
11243             it->second.waiting_on.erase(it2);
11244             if (it->second.waiting_on.empty()) {
11245               pg->repop_all_committed(it->second.repop.get());
11246               pg->log_entry_update_waiting_on.erase(it);
11247             }
11248           }
11249         }
11250       };
11251       t.register_on_commit(
11252         new OnComplete{this, rep_tid, get_osdmap_epoch()});
11253       int r = osd->store->queue_transaction(ch, std::move(t), NULL);
11254       ceph_assert(r == 0);
11255       op_applied(info.last_update);
11256     });
11257
11258   recovery_state.update_trim_to();
11259 }
11260
11261 void PrimaryLogPG::cancel_log_updates()
11262 {
11263   // get rid of all the LogUpdateCtx so their references to repops are
11264   // dropped
11265   log_entry_update_waiting_on.clear();
11266 }
11267
11268 // -------------------------------------------------------
11269
11270 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
11271 {
11272   std::scoped_lock l{*this};
11273   pair<hobject_t, ObjectContextRef> i;
11274   while (object_contexts.get_next(i.first, &i)) {
11275     ObjectContextRef obc(i.second);
11276     get_obc_watchers(obc, *ls);
11277   }
11278 }
11279
11280 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
11281 {
11282   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11283          obc->watchers.begin();
11284         j != obc->watchers.end();
11285         ++j) {
11286     obj_watch_item_t owi;
11287
11288     owi.obj = obc->obs.oi.soid;
11289     owi.wi.addr = j->second->get_peer_addr();
11290     owi.wi.name = j->second->get_entity();
11291     owi.wi.cookie = j->second->get_cookie();
11292     owi.wi.timeout_seconds = j->second->get_timeout();
11293
11294     dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
11295       << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
11296
11297     pg_watchers.push_back(owi);
11298   }
11299 }
11300
11301 void PrimaryLogPG::check_blocklisted_watchers()
11302 {
11303   dout(20) << "PrimaryLogPG::check_blocklisted_watchers for pg " << get_pgid() << dendl;
11304   pair<hobject_t, ObjectContextRef> i;
11305   while (object_contexts.get_next(i.first, &i))
11306     check_blocklisted_obc_watchers(i.second);
11307 }
11308
11309 void PrimaryLogPG::check_blocklisted_obc_watchers(ObjectContextRef obc)
11310 {
11311   dout(20) << "PrimaryLogPG::check_blocklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
11312   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
11313          obc->watchers.begin();
11314         k != obc->watchers.end();
11315         ) {
11316     //Advance iterator now so handle_watch_timeout() can erase element
11317     map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
11318     dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
11319     entity_addr_t ea = j->second->get_peer_addr();
11320     dout(30) << "watch: Check entity_addr_t " << ea << dendl;
11321     if (get_osdmap()->is_blocklisted(ea)) {
11322       dout(10) << "watch: Found blocklisted watcher for " << ea << dendl;
11323       ceph_assert(j->second->get_pg() == this);
11324       j->second->unregister_cb();
11325       handle_watch_timeout(j->second);
11326     }
11327   }
11328 }
11329
11330 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
11331 {
11332   ceph_assert(is_primary() && is_active());
11333   auto it_objects = recovery_state.get_pg_log().get_log().objects.find(obc->obs.oi.soid);
11334   ceph_assert((recovering.count(obc->obs.oi.soid) ||
11335           !is_missing_object(obc->obs.oi.soid)) ||
11336          (it_objects != recovery_state.get_pg_log().get_log().objects.end() && // or this is a revert... see recover_primary()
11337           it_objects->second->op ==
11338             pg_log_entry_t::LOST_REVERT &&
11339           it_objects->second->reverting_to ==
11340             obc->obs.oi.version));
11341
11342   dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
11343   ceph_assert(obc->watchers.empty());
11344   // populate unconnected_watchers
11345   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
11346         obc->obs.oi.watchers.begin();
11347        p != obc->obs.oi.watchers.end();
11348        ++p) {
11349     utime_t expire = info.stats.last_became_active;
11350     expire += p->second.timeout_seconds;
11351     dout(10) << "  unconnected watcher " << p->first << " will expire " << expire << dendl;
11352     WatchRef watch(
11353       Watch::makeWatchRef(
11354         this, osd, obc, p->second.timeout_seconds, p->first.first,
11355         p->first.second, p->second.addr));
11356     watch->disconnect();
11357     obc->watchers.insert(
11358       make_pair(
11359         make_pair(p->first.first, p->first.second),
11360         watch));
11361   }
11362   // Look for watchers from blocklisted clients and drop
11363   check_blocklisted_obc_watchers(obc);
11364 }
11365
11366 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
11367 {
11368   ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
11369   dout(10) << "handle_watch_timeout obc " << obc << dendl;
11370
11371   if (!is_active()) {
11372     dout(10) << "handle_watch_timeout not active, no-op" << dendl;
11373     return;
11374   }
11375   if (!obc->obs.exists) {
11376     dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
11377     return;
11378   }
11379   if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
11380     callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
11381       watch->get_delayed_cb()
11382       );
11383     dout(10) << "handle_watch_timeout waiting for degraded on obj "
11384              << obc->obs.oi.soid
11385              << dendl;
11386     return;
11387   }
11388
11389   if (m_scrubber->write_blocked_by_scrub(obc->obs.oi.soid)) {
11390     dout(10) << "handle_watch_timeout waiting for scrub on obj "
11391              << obc->obs.oi.soid
11392              << dendl;
11393     m_scrubber->add_callback(
11394       watch->get_delayed_cb() // This callback!
11395       );
11396     return;
11397   }
11398
11399   OpContextUPtr ctx = simple_opc_create(obc);
11400   ctx->at_version = get_next_version();
11401
11402   object_info_t& oi = ctx->new_obs.oi;
11403   oi.watchers.erase(make_pair(watch->get_cookie(),
11404                               watch->get_entity()));
11405
11406   list<watch_disconnect_t> watch_disconnects = {
11407     watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
11408   };
11409   ctx->register_on_success(
11410     [this, obc, watch_disconnects]() {
11411       complete_disconnect_watches(obc, watch_disconnects);
11412     });
11413
11414
11415   PGTransaction *t = ctx->op_t.get();
11416   ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
11417                                     ctx->at_version,
11418                                     oi.version,
11419                                     0,
11420                                     osd_reqid_t(), ctx->mtime, 0));
11421
11422   oi.prior_version = obc->obs.oi.version;
11423   oi.version = ctx->at_version;
11424   bufferlist bl;
11425   encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11426   t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
11427
11428   // apply new object state.
11429   ctx->obc->obs = ctx->new_obs;
11430
11431   // no ctx->delta_stats
11432   simple_opc_submit(std::move(ctx));
11433 }
11434
11435 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
11436                                                      SnapSetContext *ssc)
11437 {
11438   ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
11439   ceph_assert(obc->destructor_callback == NULL);
11440   obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11441   obc->obs.oi = oi;
11442   obc->obs.exists = false;
11443   obc->ssc = ssc;
11444   if (ssc)
11445     register_snapset_context(ssc);
11446   dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
11447   if (is_active())
11448     populate_obc_watchers(obc);
11449   return obc;
11450 }
11451
11452 ObjectContextRef PrimaryLogPG::get_object_context(
11453   const hobject_t& soid,
11454   bool can_create,
11455   const map<string, bufferlist> *attrs)
11456 {
11457   auto it_objects = recovery_state.get_pg_log().get_log().objects.find(soid);
11458   ceph_assert(
11459     attrs || !recovery_state.get_pg_log().get_missing().is_missing(soid) ||
11460     // or this is a revert... see recover_primary()
11461     (it_objects != recovery_state.get_pg_log().get_log().objects.end() &&
11462       it_objects->second->op ==
11463       pg_log_entry_t::LOST_REVERT));
11464   ObjectContextRef obc = object_contexts.lookup(soid);
11465   osd->logger->inc(l_osd_object_ctx_cache_total);
11466   if (obc) {
11467     osd->logger->inc(l_osd_object_ctx_cache_hit);
11468     dout(10) << __func__ << ": found obc in cache: " << obc
11469              << dendl;
11470   } else {
11471     dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11472     // check disk
11473     bufferlist bv;
11474     if (attrs) {
11475       auto it_oi = attrs->find(OI_ATTR);
11476       ceph_assert(it_oi != attrs->end());
11477       bv = it_oi->second;
11478     } else {
11479       int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11480       if (r < 0) {
11481         if (!can_create) {
11482           dout(10) << __func__ << ": no obc for soid "
11483                    << soid << " and !can_create"
11484                    << dendl;
11485           return ObjectContextRef();   // -ENOENT!
11486         }
11487
11488         dout(10) << __func__ << ": no obc for soid "
11489                  << soid << " but can_create"
11490                  << dendl;
11491         // new object.
11492         object_info_t oi(soid);
11493         SnapSetContext *ssc = get_snapset_context(
11494           soid, true, 0, false);
11495         ceph_assert(ssc);
11496         obc = create_object_context(oi, ssc);
11497         dout(10) << __func__ << ": " << obc << " " << soid
11498                  << " " << obc->rwstate
11499                  << " oi: " << obc->obs.oi
11500                  << " ssc: " << obc->ssc
11501                  << " snapset: " << obc->ssc->snapset << dendl;
11502         return obc;
11503       }
11504     }
11505
11506     object_info_t oi;
11507     try {
11508       bufferlist::const_iterator bliter = bv.begin();
11509       decode(oi, bliter);
11510     } catch (...) {
11511       dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11512       return ObjectContextRef();   // -ENOENT!
11513     }
11514
11515     ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
11516
11517     obc = object_contexts.lookup_or_create(oi.soid);
11518     obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11519     obc->obs.oi = oi;
11520     obc->obs.exists = true;
11521
11522     obc->ssc = get_snapset_context(
11523       soid, true,
11524       soid.has_snapset() ? attrs : 0);
11525
11526     if (is_primary() && is_active())
11527       populate_obc_watchers(obc);
11528
11529     if (pool.info.is_erasure()) {
11530       if (attrs) {
11531         obc->attr_cache = *attrs;
11532       } else {
11533         int r = pgbackend->objects_get_attrs(
11534           soid,
11535           &obc->attr_cache);
11536         ceph_assert(r == 0);
11537       }
11538     }
11539
11540     dout(10) << __func__ << ": creating obc from disk: " << obc
11541              << dendl;
11542   }
11543
11544   // XXX: Caller doesn't expect this
11545   if (obc->ssc == NULL) {
11546     derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11547     return ObjectContextRef();   // -ENOENT!
11548   }
11549
11550   dout(10) << __func__ << ": " << obc << " " << soid
11551            << " " << obc->rwstate
11552            << " oi: " << obc->obs.oi
11553            << " exists: " << (int)obc->obs.exists
11554            << " ssc: " << obc->ssc
11555            << " snapset: " << obc->ssc->snapset << dendl;
11556   return obc;
11557 }
11558
11559 void PrimaryLogPG::context_registry_on_change()
11560 {
11561   pair<hobject_t, ObjectContextRef> i;
11562   while (object_contexts.get_next(i.first, &i)) {
11563     ObjectContextRef obc(i.second);
11564     if (obc) {
11565       for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11566              obc->watchers.begin();
11567            j != obc->watchers.end();
11568            obc->watchers.erase(j++)) {
11569         j->second->discard();
11570       }
11571     }
11572   }
11573 }
11574
11575
11576 /*
11577  * If we return an error, and set *pmissing, then promoting that
11578  * object may help.
11579  *
11580  * If we return -EAGAIN, we will always set *pmissing to the missing
11581  * object to wait for.
11582  *
11583  * If we return an error but do not set *pmissing, then we know the
11584  * object does not exist.
11585  */
11586 int PrimaryLogPG::find_object_context(const hobject_t& oid,
11587                                       ObjectContextRef *pobc,
11588                                       bool can_create,
11589                                       bool map_snapid_to_clone,
11590                                       hobject_t *pmissing)
11591 {
11592   FUNCTRACE(cct);
11593   ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
11594   // want the head?
11595   if (oid.snap == CEPH_NOSNAP) {
11596     ObjectContextRef obc = get_object_context(oid, can_create);
11597     if (!obc) {
11598       if (pmissing)
11599         *pmissing = oid;
11600       return -ENOENT;
11601     }
11602     dout(10) << __func__ << " " << oid
11603        << " @" << oid.snap
11604        << " oi=" << obc->obs.oi
11605        << dendl;
11606     *pobc = obc;
11607
11608     return 0;
11609   }
11610
11611   // we want a snap
11612
11613   hobject_t head = oid.get_head();
11614   SnapSetContext *ssc = get_snapset_context(oid, can_create);
11615   if (!ssc || !(ssc->exists || can_create)) {
11616     dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11617     if (pmissing)
11618       *pmissing = head;  // start by getting the head
11619     if (ssc)
11620       put_snapset_context(ssc);
11621     return -ENOENT;
11622   }
11623
11624   if (map_snapid_to_clone) {
11625     dout(10) << __func__ << " " << oid << " @" << oid.snap
11626              << " snapset " << ssc->snapset
11627              << " map_snapid_to_clone=true" << dendl;
11628     if (oid.snap > ssc->snapset.seq) {
11629       // already must be readable
11630       ObjectContextRef obc = get_object_context(head, false);
11631       dout(10) << __func__ << " " << oid << " @" << oid.snap
11632                << " snapset " << ssc->snapset
11633                << " maps to head" << dendl;
11634       *pobc = obc;
11635       put_snapset_context(ssc);
11636       return (obc && obc->obs.exists) ? 0 : -ENOENT;
11637     } else {
11638       vector<snapid_t>::const_iterator citer = std::find(
11639         ssc->snapset.clones.begin(),
11640         ssc->snapset.clones.end(),
11641         oid.snap);
11642       if (citer == ssc->snapset.clones.end()) {
11643         dout(10) << __func__ << " " << oid << " @" << oid.snap
11644                  << " snapset " << ssc->snapset
11645                  << " maps to nothing" << dendl;
11646         put_snapset_context(ssc);
11647         return -ENOENT;
11648       }
11649
11650       dout(10) << __func__ << " " << oid << " @" << oid.snap
11651                << " snapset " << ssc->snapset
11652                << " maps to " << oid << dendl;
11653
11654       if (recovery_state.get_pg_log().get_missing().is_missing(oid)) {
11655         dout(10) << __func__ << " " << oid << " @" << oid.snap
11656                  << " snapset " << ssc->snapset
11657                  << " " << oid << " is missing" << dendl;
11658         if (pmissing)
11659           *pmissing = oid;
11660         put_snapset_context(ssc);
11661         return -EAGAIN;
11662       }
11663
11664       ObjectContextRef obc = get_object_context(oid, false);
11665       if (!obc || !obc->obs.exists) {
11666         dout(10) << __func__ << " " << oid << " @" << oid.snap
11667                  << " snapset " << ssc->snapset
11668                  << " " << oid << " is not present" << dendl;
11669         if (pmissing)
11670           *pmissing = oid;
11671         put_snapset_context(ssc);
11672         return -ENOENT;
11673       }
11674       dout(10) << __func__ << " " << oid << " @" << oid.snap
11675                << " snapset " << ssc->snapset
11676                << " " << oid << " HIT" << dendl;
11677       *pobc = obc;
11678       put_snapset_context(ssc);
11679       return 0;
11680     }
11681     ceph_abort(); //unreachable
11682   }
11683
11684   dout(10) << __func__ << " " << oid << " @" << oid.snap
11685            << " snapset " << ssc->snapset << dendl;
11686
11687   // head?
11688   if (oid.snap > ssc->snapset.seq) {
11689     ObjectContextRef obc = get_object_context(head, false);
11690     dout(10) << __func__ << " " << head
11691              << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
11692              << " -- HIT " << obc->obs
11693              << dendl;
11694     if (!obc->ssc)
11695       obc->ssc = ssc;
11696     else {
11697       ceph_assert(ssc == obc->ssc);
11698       put_snapset_context(ssc);
11699     }
11700     *pobc = obc;
11701     return 0;
11702   }
11703
11704   // which clone would it be?
11705   unsigned k = 0;
11706   while (k < ssc->snapset.clones.size() &&
11707          ssc->snapset.clones[k] < oid.snap)
11708     k++;
11709   if (k == ssc->snapset.clones.size()) {
11710     dout(10) << __func__ << " no clones with last >= oid.snap "
11711              << oid.snap << " -- DNE" << dendl;
11712     put_snapset_context(ssc);
11713     return -ENOENT;
11714   }
11715   hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
11716                  info.pgid.pool(), oid.get_namespace());
11717
11718   if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
11719     dout(20) << __func__ << " " << soid << " missing, try again later"
11720              << dendl;
11721     if (pmissing)
11722       *pmissing = soid;
11723     put_snapset_context(ssc);
11724     return -EAGAIN;
11725   }
11726
11727   ObjectContextRef obc = get_object_context(soid, false);
11728   if (!obc || !obc->obs.exists) {
11729     if (pmissing)
11730       *pmissing = soid;
11731     put_snapset_context(ssc);
11732     if (is_primary()) {
11733       if (is_degraded_or_backfilling_object(soid)) {
11734         dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
11735         return -EAGAIN;
11736       } else if (is_degraded_on_async_recovery_target(soid)) {
11737         dout(20) << __func__ << " clone is recovering " << soid << dendl;
11738         return -EAGAIN;
11739       } else {
11740         dout(20) << __func__ << " missing clone " << soid << dendl;
11741         return -ENOENT;
11742       }
11743     } else {
11744       dout(20) << __func__ << " replica missing clone" << soid << dendl;
11745       return -ENOENT;
11746     }
11747   }
11748
11749   if (!obc->ssc) {
11750     obc->ssc = ssc;
11751   } else {
11752     ceph_assert(obc->ssc == ssc);
11753     put_snapset_context(ssc);
11754   }
11755   ssc = 0;
11756
11757   // clone
11758   dout(20) << __func__ << " " << soid
11759            << " snapset " << obc->ssc->snapset
11760            << dendl;
11761   snapid_t first, last;
11762   auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
11763   ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
11764   if (p->second.empty()) {
11765     dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
11766     ceph_assert(!cct->_conf->osd_debug_verify_snaps);
11767     return -ENOENT;
11768   }
11769   if (std::find(p->second.begin(), p->second.end(), oid.snap) ==
11770       p->second.end()) {
11771     dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
11772              << " does not contain " << oid.snap << " -- DNE" << dendl;
11773     return -ENOENT;
11774   }
11775   if (get_osdmap()->in_removed_snaps_queue(info.pgid.pgid.pool(), oid.snap)) {
11776     dout(20) << __func__ << " " << soid << " snap " << oid.snap
11777              << " in removed_snaps_queue" << " -- DNE" << dendl;
11778     return -ENOENT;
11779   }
11780   dout(20) << __func__ << " " << soid << " clone_snaps " << p->second
11781            << " contains " << oid.snap << " -- HIT " << obc->obs << dendl;
11782   *pobc = obc;
11783   return 0;
11784 }
11785
11786 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
11787 {
11788   if (obc->ssc)
11789     put_snapset_context(obc->ssc);
11790 }
11791
11792 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
11793 {
11794   object_info_t& oi = obc->obs.oi;
11795
11796   dout(10) << __func__ << " " << oi.soid << dendl;
11797   ceph_assert(!oi.soid.is_snapdir());
11798
11799   object_stat_sum_t stat;
11800   stat.num_objects++;
11801   if (oi.is_dirty())
11802     stat.num_objects_dirty++;
11803   if (oi.is_whiteout())
11804     stat.num_whiteouts++;
11805   if (oi.is_omap())
11806     stat.num_objects_omap++;
11807   if (oi.is_cache_pinned())
11808     stat.num_objects_pinned++;
11809   if (oi.has_manifest())
11810     stat.num_objects_manifest++;
11811
11812   if (oi.soid.is_snap()) {
11813     stat.num_object_clones++;
11814
11815     if (!obc->ssc)
11816       obc->ssc = get_snapset_context(oi.soid, false);
11817     ceph_assert(obc->ssc);
11818     stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
11819   } else {
11820     stat.num_bytes += oi.size;
11821   }
11822
11823   // add it in
11824   pgstat->stats.sum.add(stat);
11825 }
11826
11827 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
11828 {
11829   const hobject_t& soid = obc->obs.oi.soid;
11830   if (obc->is_blocked()) {
11831     dout(10) << __func__ << " " << soid << " still blocked" << dendl;
11832     return;
11833   }
11834
11835   map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
11836   if (p != waiting_for_blocked_object.end()) {
11837     list<OpRequestRef>& ls = p->second;
11838     dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
11839     requeue_ops(ls);
11840     waiting_for_blocked_object.erase(p);
11841   }
11842
11843   map<hobject_t, ObjectContextRef>::iterator i =
11844     objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
11845   if (i != objects_blocked_on_snap_promotion.end()) {
11846     ceph_assert(i->second == obc);
11847     objects_blocked_on_snap_promotion.erase(i);
11848   }
11849
11850   if (obc->requeue_scrub_on_unblock) {
11851
11852     obc->requeue_scrub_on_unblock = false;
11853
11854     dout(20) << __func__ << " requeuing if still active: " << (is_active() ? "yes" : "no") << dendl;
11855
11856     // only requeue if we are still active: we may be unblocking
11857     // because we are resetting for a new peering interval
11858     if (is_active()) {
11859       osd->queue_scrub_unblocking(this, is_scrub_blocking_ops());
11860     }
11861   }
11862 }
11863
11864 SnapSetContext *PrimaryLogPG::get_snapset_context(
11865   const hobject_t& oid,
11866   bool can_create,
11867   const map<string, bufferlist> *attrs,
11868   bool oid_existed)
11869 {
11870   std::lock_guard l(snapset_contexts_lock);
11871   SnapSetContext *ssc;
11872   map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
11873     oid.get_snapdir());
11874   if (p != snapset_contexts.end()) {
11875     if (can_create || p->second->exists) {
11876       ssc = p->second;
11877     } else {
11878       return NULL;
11879     }
11880   } else {
11881     bufferlist bv;
11882     if (!attrs) {
11883       int r = -ENOENT;
11884       if (!(oid.is_head() && !oid_existed)) {
11885         r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
11886       }
11887       if (r < 0 && !can_create)
11888         return NULL;
11889     } else {
11890       auto it_ss = attrs->find(SS_ATTR);
11891       ceph_assert(it_ss != attrs->end());
11892       bv = it_ss->second;
11893     }
11894     ssc = new SnapSetContext(oid.get_snapdir());
11895     _register_snapset_context(ssc);
11896     if (bv.length()) {
11897       bufferlist::const_iterator bvp = bv.begin();
11898       try {
11899         ssc->snapset.decode(bvp);
11900       } catch (const ceph::buffer::error& e) {
11901         dout(0) << __func__ << " Can't decode snapset: " << e.what() << dendl;
11902         return NULL;
11903       }
11904       ssc->exists = true;
11905     } else {
11906       ssc->exists = false;
11907     }
11908   }
11909   ceph_assert(ssc);
11910   ssc->ref++;
11911   return ssc;
11912 }
11913
11914 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
11915 {
11916   std::lock_guard l(snapset_contexts_lock);
11917   --ssc->ref;
11918   if (ssc->ref == 0) {
11919     if (ssc->registered)
11920       snapset_contexts.erase(ssc->oid);
11921     delete ssc;
11922   }
11923 }
11924
11925 /*
11926  * Return values:
11927  *  NONE  - didn't pull anything
11928  *  YES   - pulled what the caller wanted
11929  *  HEAD  - needed to pull head first
11930  */
11931 enum { PULL_NONE, PULL_HEAD, PULL_YES };
11932
11933 int PrimaryLogPG::recover_missing(
11934   const hobject_t &soid, eversion_t v,
11935   int priority,
11936   PGBackend::RecoveryHandle *h)
11937 {
11938   if (recovery_state.get_missing_loc().is_unfound(soid)) {
11939     dout(7) << __func__ << " " << soid
11940             << " v " << v
11941             << " but it is unfound" << dendl;
11942     return PULL_NONE;
11943   }
11944
11945   if (recovery_state.get_missing_loc().is_deleted(soid)) {
11946     start_recovery_op(soid);
11947     ceph_assert(!recovering.count(soid));
11948     recovering.insert(make_pair(soid, ObjectContextRef()));
11949     epoch_t cur_epoch = get_osdmap_epoch();
11950     remove_missing_object(soid, v, new LambdaContext(
11951      [=](int) {
11952        std::scoped_lock locker{*this};
11953        if (!pg_has_reset_since(cur_epoch)) {
11954          bool object_missing = false;
11955          for (const auto& shard : get_acting_recovery_backfill()) {
11956            if (shard == pg_whoami)
11957              continue;
11958            if (recovery_state.get_peer_missing(shard).is_missing(soid)) {
11959              dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
11960              object_missing = true;
11961              break;
11962            }
11963          }
11964          if (!object_missing) {
11965            object_stat_sum_t stat_diff;
11966            stat_diff.num_objects_recovered = 1;
11967            if (scrub_after_recovery)
11968              stat_diff.num_objects_repaired = 1;
11969            on_global_recover(soid, stat_diff, true);
11970          } else {
11971            auto recovery_handle = pgbackend->open_recovery_op();
11972            pgbackend->recover_delete_object(soid, v, recovery_handle);
11973            pgbackend->run_recovery_op(recovery_handle, priority);
11974          }
11975        }
11976      }));
11977     return PULL_YES;
11978   }
11979
11980   // is this a snapped object?  if so, consult the snapset.. we may not need the entire object!
11981   ObjectContextRef obc;
11982   ObjectContextRef head_obc;
11983   if (soid.snap && soid.snap < CEPH_NOSNAP) {
11984     // do we have the head?
11985     hobject_t head = soid.get_head();
11986     if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
11987       if (recovering.count(head)) {
11988         dout(10) << " missing but already recovering head " << head << dendl;
11989         return PULL_NONE;
11990       } else {
11991         int r = recover_missing(
11992           head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need, priority,
11993           h);
11994         if (r != PULL_NONE)
11995           return PULL_HEAD;
11996         return PULL_NONE;
11997       }
11998     }
11999     head_obc = get_object_context(
12000       head,
12001       false,
12002       0);
12003     ceph_assert(head_obc);
12004   }
12005   start_recovery_op(soid);
12006   ceph_assert(!recovering.count(soid));
12007   recovering.insert(make_pair(soid, obc));
12008   int r = pgbackend->recover_object(
12009     soid,
12010     v,
12011     head_obc,
12012     obc,
12013     h);
12014   // This is only a pull which shouldn't return an error
12015   ceph_assert(r >= 0);
12016   return PULL_YES;
12017 }
12018
12019 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
12020                                          eversion_t v, Context *on_complete)
12021 {
12022   dout(20) << __func__ << " " << soid << " " << v << dendl;
12023   ceph_assert(on_complete != nullptr);
12024   // delete locally
12025   ObjectStore::Transaction t;
12026   remove_snap_mapped_object(t, soid);
12027
12028   ObjectRecoveryInfo recovery_info;
12029   recovery_info.soid = soid;
12030   recovery_info.version = v;
12031
12032   epoch_t cur_epoch = get_osdmap_epoch();
12033   t.register_on_complete(new LambdaContext(
12034      [=](int) {
12035        std::unique_lock locker{*this};
12036        if (!pg_has_reset_since(cur_epoch)) {
12037          ObjectStore::Transaction t2;
12038          on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
12039          t2.register_on_complete(on_complete);
12040          int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
12041          ceph_assert(r == 0);
12042          locker.unlock();
12043        } else {
12044          locker.unlock();
12045          on_complete->complete(-EAGAIN);
12046        }
12047      }));
12048   int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
12049   ceph_assert(r == 0);
12050 }
12051
12052 void PrimaryLogPG::finish_degraded_object(const hobject_t oid)
12053 {
12054   dout(10) << __func__ << " " << oid << dendl;
12055   if (callbacks_for_degraded_object.count(oid)) {
12056     list<Context*> contexts;
12057     contexts.swap(callbacks_for_degraded_object[oid]);
12058     callbacks_for_degraded_object.erase(oid);
12059     for (list<Context*>::iterator i = contexts.begin();
12060          i != contexts.end();
12061          ++i) {
12062       (*i)->complete(0);
12063     }
12064   }
12065   map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
12066     oid.get_head());
12067   if (i != objects_blocked_on_degraded_snap.end() &&
12068       i->second == oid.snap)
12069     objects_blocked_on_degraded_snap.erase(i);
12070 }
12071
12072 void PrimaryLogPG::_committed_pushed_object(
12073   epoch_t epoch, eversion_t last_complete)
12074 {
12075   std::scoped_lock locker{*this};
12076   if (!pg_has_reset_since(epoch)) {
12077     recovery_state.recovery_committed_to(last_complete);
12078   } else {
12079     dout(10) << __func__
12080              << " pg has changed, not touching last_complete_ondisk" << dendl;
12081   }
12082 }
12083
12084 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
12085 {
12086   dout(20) << __func__ << dendl;
12087   if (obc) {
12088     dout(20) << "obc = " << *obc << dendl;
12089   }
12090   ceph_assert(active_pushes >= 1);
12091   --active_pushes;
12092
12093   // requeue an active chunky scrub waiting on recovery ops
12094   if (!recovery_state.is_deleting() && active_pushes == 0 &&
12095       m_scrubber->is_scrub_active()) {
12096
12097     osd->queue_scrub_pushes_update(this, is_scrub_blocking_ops());
12098   }
12099 }
12100
12101 void PrimaryLogPG::_applied_recovered_object_replica()
12102 {
12103   dout(20) << __func__ << dendl;
12104   ceph_assert(active_pushes >= 1);
12105   --active_pushes;
12106
12107   // requeue an active scrub waiting on recovery ops
12108   if (!recovery_state.is_deleting() && active_pushes == 0 &&
12109       m_scrubber->is_scrub_active()) {
12110
12111     osd->queue_scrub_replica_pushes(this, m_scrubber->replica_op_priority());
12112   }
12113 }
12114
12115 void PrimaryLogPG::on_failed_pull(
12116   const set<pg_shard_t> &from,
12117   const hobject_t &soid,
12118   const eversion_t &v)
12119 {
12120   dout(20) << __func__ << ": " << soid << dendl;
12121   ceph_assert(recovering.count(soid));
12122   auto obc = recovering[soid];
12123   if (obc) {
12124     list<OpRequestRef> blocked_ops;
12125     obc->drop_recovery_read(&blocked_ops);
12126     requeue_ops(blocked_ops);
12127   }
12128   recovering.erase(soid);
12129   for (auto&& i : from) {
12130     if (i != pg_whoami) { // we'll get it below in primary_error
12131       recovery_state.force_object_missing(i, soid, v);
12132     }
12133   }
12134
12135   dout(0) << __func__ << " " << soid << " from shard " << from
12136           << ", reps on " << recovery_state.get_missing_loc().get_locations(soid)
12137           << " unfound? " << recovery_state.get_missing_loc().is_unfound(soid)
12138           << dendl;
12139   finish_recovery_op(soid);  // close out this attempt,
12140   finish_degraded_object(soid);
12141
12142   if (from.count(pg_whoami)) {
12143     dout(0) << " primary missing oid " << soid << " version " << v << dendl;
12144     primary_error(soid, v);
12145     backfills_in_flight.erase(soid);
12146   }
12147 }
12148
12149 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
12150 {
12151   eversion_t v;
12152   pg_missing_item pmi;
12153   bool is_missing = recovery_state.get_pg_log().get_missing().is_missing(oid, &pmi);
12154   ceph_assert(is_missing);
12155   v = pmi.have;
12156   dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
12157
12158   ceph_assert(!get_acting_recovery_backfill().empty());
12159   for (set<pg_shard_t>::iterator i = get_acting_recovery_backfill().begin();
12160        i != get_acting_recovery_backfill().end();
12161        ++i) {
12162     if (*i == get_primary()) continue;
12163     pg_shard_t peer = *i;
12164     if (!recovery_state.get_peer_missing(peer).is_missing(oid)) {
12165       continue;
12166     }
12167     eversion_t h = recovery_state.get_peer_missing(peer).get_items().at(oid).have;
12168     dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
12169     if (h > v)
12170       v = h;
12171   }
12172
12173   dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
12174   return v;
12175 }
12176
12177 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
12178 {
12179   const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
12180     op->get_req());
12181   ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
12182   ObjectStore::Transaction t;
12183   std::optional<eversion_t> op_trim_to, op_roll_forward_to;
12184   if (m->pg_trim_to != eversion_t())
12185     op_trim_to = m->pg_trim_to;
12186   if (m->pg_roll_forward_to != eversion_t())
12187     op_roll_forward_to = m->pg_roll_forward_to;
12188
12189   dout(20) << __func__
12190            << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
12191
12192   recovery_state.append_log_entries_update_missing(
12193     m->entries, t, op_trim_to, op_roll_forward_to);
12194   eversion_t new_lcod = info.last_complete;
12195
12196   Context *complete = new LambdaContext(
12197     [=](int) {
12198       const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
12199         op->get_req());
12200       std::scoped_lock locker{*this};
12201       if (!pg_has_reset_since(msg->get_epoch())) {
12202         update_last_complete_ondisk(new_lcod);
12203         MOSDPGUpdateLogMissingReply *reply =
12204           new MOSDPGUpdateLogMissingReply(
12205             spg_t(info.pgid.pgid, primary_shard().shard),
12206             pg_whoami.shard,
12207             msg->get_epoch(),
12208             msg->min_epoch,
12209             msg->get_tid(),
12210             new_lcod);
12211         reply->set_priority(CEPH_MSG_PRIO_HIGH);
12212         msg->get_connection()->send_message(reply);
12213       }
12214     });
12215
12216   if (get_osdmap()->require_osd_release >= ceph_release_t::kraken) {
12217     t.register_on_commit(complete);
12218   } else {
12219     /* Hack to work around the fact that ReplicatedBackend sends
12220      * ack+commit if commit happens first
12221      *
12222      * This behavior is no longer necessary, but we preserve it so old
12223      * primaries can keep their repops in order */
12224     if (pool.info.is_erasure()) {
12225       t.register_on_complete(complete);
12226     } else {
12227       t.register_on_commit(complete);
12228     }
12229   }
12230   int tr = osd->store->queue_transaction(
12231     ch,
12232     std::move(t),
12233     nullptr);
12234   ceph_assert(tr == 0);
12235   op_applied(info.last_update);
12236 }
12237
12238 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
12239 {
12240   const MOSDPGUpdateLogMissingReply *m =
12241     static_cast<const MOSDPGUpdateLogMissingReply*>(
12242     op->get_req());
12243   dout(20) << __func__ << " got reply from "
12244            << m->get_from() << dendl;
12245
12246   auto it = log_entry_update_waiting_on.find(m->get_tid());
12247   if (it != log_entry_update_waiting_on.end()) {
12248     if (it->second.waiting_on.count(m->get_from())) {
12249       it->second.waiting_on.erase(m->get_from());
12250       if (m->last_complete_ondisk != eversion_t()) {
12251         update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
12252       }
12253     } else {
12254       osd->clog->error()
12255         << info.pgid << " got reply "
12256         << *m << " from shard we are not waiting for "
12257         << m->get_from();
12258     }
12259
12260     if (it->second.waiting_on.empty()) {
12261       repop_all_committed(it->second.repop.get());
12262       log_entry_update_waiting_on.erase(it);
12263     }
12264   } else {
12265     osd->clog->error()
12266       << info.pgid << " got reply "
12267       << *m << " on unknown tid " << m->get_tid();
12268   }
12269 }
12270
12271 /* Mark all unfound objects as lost.
12272  */
12273 void PrimaryLogPG::mark_all_unfound_lost(
12274   int what,
12275   std::function<void(int,const std::string&,bufferlist&)> on_finish)
12276 {
12277   dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
12278   list<hobject_t> oids;
12279
12280   dout(30) << __func__ << ": log before:\n";
12281   recovery_state.get_pg_log().get_log().print(*_dout);
12282   *_dout << dendl;
12283
12284   mempool::osd_pglog::list<pg_log_entry_t> log_entries;
12285
12286   utime_t mtime = ceph_clock_now();
12287   map<hobject_t, pg_missing_item>::const_iterator m =
12288     recovery_state.get_missing_loc().get_needs_recovery().begin();
12289   map<hobject_t, pg_missing_item>::const_iterator mend =
12290     recovery_state.get_missing_loc().get_needs_recovery().end();
12291
12292   ObcLockManager manager;
12293   eversion_t v = get_next_version();
12294   v.epoch = get_osdmap_epoch();
12295   uint64_t num_unfound = recovery_state.get_missing_loc().num_unfound();
12296   while (m != mend) {
12297     const hobject_t &oid(m->first);
12298     if (!recovery_state.get_missing_loc().is_unfound(oid)) {
12299       // We only care about unfound objects
12300       ++m;
12301       continue;
12302     }
12303
12304     ObjectContextRef obc;
12305     eversion_t prev;
12306
12307     switch (what) {
12308     case pg_log_entry_t::LOST_MARK:
12309       ceph_abort_msg("actually, not implemented yet!");
12310       break;
12311
12312     case pg_log_entry_t::LOST_REVERT:
12313       prev = pick_newest_available(oid);
12314       if (prev > eversion_t()) {
12315         // log it
12316         pg_log_entry_t e(
12317           pg_log_entry_t::LOST_REVERT, oid, v,
12318           m->second.need, 0, osd_reqid_t(), mtime, 0);
12319         e.reverting_to = prev;
12320         e.mark_unrollbackable();
12321         log_entries.push_back(e);
12322         dout(10) << e << dendl;
12323
12324         // we are now missing the new version; recovery code will sort it out.
12325         ++v.version;
12326         ++m;
12327         break;
12328       }
12329
12330     case pg_log_entry_t::LOST_DELETE:
12331       {
12332         pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
12333                          0, osd_reqid_t(), mtime, 0);
12334         if (get_osdmap()->require_osd_release >= ceph_release_t::jewel) {
12335           if (pool.info.require_rollback()) {
12336             e.mod_desc.try_rmobject(v.version);
12337           } else {
12338             e.mark_unrollbackable();
12339           }
12340         } // otherwise, just do what we used to do
12341         dout(10) << e << dendl;
12342         log_entries.push_back(e);
12343         oids.push_back(oid);
12344
12345         // If context found mark object as deleted in case
12346         // of racing with new creation.  This can happen if
12347         // object lost and EIO at primary.
12348         obc = object_contexts.lookup(oid);
12349         if (obc)
12350           obc->obs.exists = false;
12351
12352         ++v.version;
12353         ++m;
12354       }
12355       break;
12356
12357     default:
12358       ceph_abort();
12359     }
12360   }
12361
12362   recovery_state.update_stats(
12363     [](auto &history, auto &stats) {
12364       stats.stats_invalid = true;
12365       return false;
12366     });
12367
12368   submit_log_entries(
12369     log_entries,
12370     std::move(manager),
12371     std::optional<std::function<void(void)> >(
12372       [this, oids, num_unfound, on_finish]() {
12373         if (recovery_state.perform_deletes_during_peering()) {
12374           for (auto oid : oids) {
12375             // clear old locations - merge_new_log_entries will have
12376             // handled rebuilding missing_loc for each of these
12377             // objects if we have the RECOVERY_DELETES flag
12378             recovery_state.object_recovered(oid, object_stat_sum_t());
12379           }
12380         }
12381
12382         if (is_recovery_unfound()) {
12383           queue_peering_event(
12384             PGPeeringEventRef(
12385               std::make_shared<PGPeeringEvent>(
12386               get_osdmap_epoch(),
12387               get_osdmap_epoch(),
12388               PeeringState::DoRecovery())));
12389         } else if (is_backfill_unfound()) {
12390           queue_peering_event(
12391             PGPeeringEventRef(
12392               std::make_shared<PGPeeringEvent>(
12393               get_osdmap_epoch(),
12394               get_osdmap_epoch(),
12395               PeeringState::RequestBackfill())));
12396         } else {
12397           queue_recovery();
12398         }
12399
12400         stringstream ss;
12401         ss << "pg has " << num_unfound
12402            << " objects unfound and apparently lost marking";
12403         string rs = ss.str();
12404         dout(0) << "do_command r=" << 0 << " " << rs << dendl;
12405         osd->clog->info() << rs;
12406         bufferlist empty;
12407         on_finish(0, rs, empty);
12408       }),
12409     OpRequestRef());
12410 }
12411
12412 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
12413 {
12414   ceph_assert(repop_queue.empty());
12415 }
12416
12417 /*
12418  * pg status change notification
12419  */
12420
12421 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
12422 {
12423   list<OpRequestRef> rq;
12424
12425   // apply all repops
12426   while (!repop_queue.empty()) {
12427     RepGather *repop = repop_queue.front();
12428     repop_queue.pop_front();
12429     dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
12430     repop->rep_aborted = true;
12431     repop->on_committed.clear();
12432     repop->on_success.clear();
12433
12434     if (requeue) {
12435       if (repop->op) {
12436         dout(10) << " requeuing " << *repop->op->get_req() << dendl;
12437         rq.push_back(repop->op);
12438         repop->op = OpRequestRef();
12439       }
12440
12441       // also requeue any dups, interleaved into position
12442       auto p = waiting_for_ondisk.find(repop->v);
12443       if (p != waiting_for_ondisk.end()) {
12444         dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
12445         for (auto& i : p->second) {
12446           rq.push_back(std::get<0>(i));
12447         }
12448         waiting_for_ondisk.erase(p);
12449       }
12450     }
12451
12452     remove_repop(repop);
12453   }
12454
12455   ceph_assert(repop_queue.empty());
12456
12457   if (requeue) {
12458     requeue_ops(rq);
12459     if (!waiting_for_ondisk.empty()) {
12460       for (auto& i : waiting_for_ondisk) {
12461         for (auto& j : i.second) {
12462           derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12463                << " waiting on " << i.first << dendl;
12464         }
12465       }
12466       ceph_assert(waiting_for_ondisk.empty());
12467     }
12468   }
12469
12470   waiting_for_ondisk.clear();
12471 }
12472
12473 void PrimaryLogPG::on_flushed()
12474 {
12475   requeue_ops(waiting_for_flush);
12476   if (!is_peered() || !is_primary()) {
12477     pair<hobject_t, ObjectContextRef> i;
12478     while (object_contexts.get_next(i.first, &i)) {
12479       derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
12480     }
12481     ceph_assert(object_contexts.empty());
12482   }
12483 }
12484
12485 void PrimaryLogPG::on_removal(ObjectStore::Transaction &t)
12486 {
12487   dout(10) << __func__ << dendl;
12488
12489   on_shutdown();
12490
12491   t.register_on_commit(new C_DeleteMore(this, get_osdmap_epoch()));
12492 }
12493
12494 void PrimaryLogPG::clear_async_reads()
12495 {
12496   dout(10) << __func__ << dendl;
12497   for(auto& i : in_progress_async_reads) {
12498     dout(10) << "clear ctx: "
12499              << "OpRequestRef " << i.first
12500              << " OpContext " << i.second
12501              << dendl;
12502     close_op_ctx(i.second);
12503   }
12504 }
12505
12506 void PrimaryLogPG::clear_cache()
12507 {
12508   object_contexts.clear();
12509 }
12510
12511 void PrimaryLogPG::on_shutdown()
12512 {
12513   dout(10) << __func__ << dendl;
12514
12515   if (recovery_queued) {
12516     recovery_queued = false;
12517     osd->clear_queued_recovery(this);
12518   }
12519
12520   m_scrubber->scrub_clear_state();
12521
12522   m_scrubber->unreg_next_scrub();
12523
12524   vector<ceph_tid_t> tids;
12525   cancel_copy_ops(false, &tids);
12526   cancel_flush_ops(false, &tids);
12527   cancel_proxy_ops(false, &tids);
12528   cancel_manifest_ops(false, &tids);
12529   osd->objecter->op_cancel(tids, -ECANCELED);
12530
12531   apply_and_flush_repops(false);
12532   cancel_log_updates();
12533   // we must remove PGRefs, so do this this prior to release_backoffs() callers
12534   clear_backoffs();
12535   // clean up snap trim references
12536   snap_trimmer_machine.process_event(Reset());
12537
12538   pgbackend->on_change();
12539
12540   context_registry_on_change();
12541   object_contexts.clear();
12542
12543   clear_async_reads();
12544
12545   osd->remote_reserver.cancel_reservation(info.pgid);
12546   osd->local_reserver.cancel_reservation(info.pgid);
12547
12548   clear_primary_state();
12549   cancel_recovery();
12550
12551   if (is_primary()) {
12552     osd->clear_ready_to_merge(this);
12553   }
12554 }
12555
12556 void PrimaryLogPG::on_activate_complete()
12557 {
12558   check_local();
12559   // waiters
12560   if (!recovery_state.needs_flush()) {
12561     requeue_ops(waiting_for_peered);
12562   } else if (!waiting_for_peered.empty()) {
12563     dout(10) << __func__ << " flushes in progress, moving "
12564              << waiting_for_peered.size()
12565              << " items to waiting_for_flush"
12566              << dendl;
12567     ceph_assert(waiting_for_flush.empty());
12568     waiting_for_flush.swap(waiting_for_peered);
12569   }
12570
12571
12572   // all clean?
12573   if (needs_recovery()) {
12574     dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12575     queue_peering_event(
12576       PGPeeringEventRef(
12577         std::make_shared<PGPeeringEvent>(
12578           get_osdmap_epoch(),
12579           get_osdmap_epoch(),
12580           PeeringState::DoRecovery())));
12581   } else if (needs_backfill()) {
12582     dout(10) << "activate queueing backfill" << dendl;
12583     queue_peering_event(
12584       PGPeeringEventRef(
12585         std::make_shared<PGPeeringEvent>(
12586           get_osdmap_epoch(),
12587           get_osdmap_epoch(),
12588           PeeringState::RequestBackfill())));
12589   } else {
12590     dout(10) << "activate all replicas clean, no recovery" << dendl;
12591     queue_peering_event(
12592       PGPeeringEventRef(
12593         std::make_shared<PGPeeringEvent>(
12594           get_osdmap_epoch(),
12595           get_osdmap_epoch(),
12596           PeeringState::AllReplicasRecovered())));
12597   }
12598
12599   publish_stats_to_osd();
12600
12601   if (get_backfill_targets().size()) {
12602     last_backfill_started = recovery_state.earliest_backfill();
12603     new_backfill = true;
12604     ceph_assert(!last_backfill_started.is_max());
12605     dout(5) << __func__ << ": bft=" << get_backfill_targets()
12606            << " from " << last_backfill_started << dendl;
12607     for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
12608          i != get_backfill_targets().end();
12609          ++i) {
12610       dout(5) << "target shard " << *i
12611              << " from " << recovery_state.get_peer_info(*i).last_backfill
12612              << dendl;
12613     }
12614   }
12615
12616   hit_set_setup();
12617   agent_setup();
12618 }
12619
12620 void PrimaryLogPG::on_change(ObjectStore::Transaction &t)
12621 {
12622   dout(10) << __func__ << dendl;
12623
12624   if (hit_set && hit_set->insert_count() == 0) {
12625     dout(20) << " discarding empty hit_set" << dendl;
12626     hit_set_clear();
12627   }
12628
12629   if (recovery_queued) {
12630     recovery_queued = false;
12631     osd->clear_queued_recovery(this);
12632   }
12633
12634   // requeue everything in the reverse order they should be
12635   // reexamined.
12636   requeue_ops(waiting_for_peered);
12637   requeue_ops(waiting_for_flush);
12638   requeue_ops(waiting_for_active);
12639   requeue_ops(waiting_for_readable);
12640
12641   vector<ceph_tid_t> tids;
12642   cancel_copy_ops(is_primary(), &tids);
12643   cancel_flush_ops(is_primary(), &tids);
12644   cancel_proxy_ops(is_primary(), &tids);
12645   cancel_manifest_ops(is_primary(), &tids);
12646   osd->objecter->op_cancel(tids, -ECANCELED);
12647
12648   // requeue object waiters
12649   for (auto& p : waiting_for_unreadable_object) {
12650     release_backoffs(p.first);
12651   }
12652   if (is_primary()) {
12653     requeue_object_waiters(waiting_for_unreadable_object);
12654   } else {
12655     waiting_for_unreadable_object.clear();
12656   }
12657   for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
12658        p != waiting_for_degraded_object.end();
12659        waiting_for_degraded_object.erase(p++)) {
12660     release_backoffs(p->first);
12661     if (is_primary())
12662       requeue_ops(p->second);
12663     else
12664       p->second.clear();
12665     finish_degraded_object(p->first);
12666   }
12667
12668   // requeues waiting_for_scrub
12669   m_scrubber->scrub_clear_state();
12670
12671   for (auto p = waiting_for_blocked_object.begin();
12672        p != waiting_for_blocked_object.end();
12673        waiting_for_blocked_object.erase(p++)) {
12674     if (is_primary())
12675       requeue_ops(p->second);
12676     else
12677       p->second.clear();
12678   }
12679   for (auto i = callbacks_for_degraded_object.begin();
12680        i != callbacks_for_degraded_object.end();
12681     ) {
12682     finish_degraded_object((i++)->first);
12683   }
12684   ceph_assert(callbacks_for_degraded_object.empty());
12685
12686   if (is_primary()) {
12687     requeue_ops(waiting_for_cache_not_full);
12688   } else {
12689     waiting_for_cache_not_full.clear();
12690   }
12691   objects_blocked_on_cache_full.clear();
12692
12693   for (list<pair<OpRequestRef, OpContext*> >::iterator i =
12694          in_progress_async_reads.begin();
12695        i != in_progress_async_reads.end();
12696        in_progress_async_reads.erase(i++)) {
12697     close_op_ctx(i->second);
12698     if (is_primary())
12699       requeue_op(i->first);
12700   }
12701
12702   // this will requeue ops we were working on but didn't finish, and
12703   // any dups
12704   apply_and_flush_repops(is_primary());
12705   cancel_log_updates();
12706
12707   // do this *after* apply_and_flush_repops so that we catch any newly
12708   // registered watches.
12709   context_registry_on_change();
12710
12711   pgbackend->on_change_cleanup(&t);
12712   m_scrubber->cleanup_store(&t);
12713   pgbackend->on_change();
12714
12715   // clear snap_trimmer state
12716   snap_trimmer_machine.process_event(Reset());
12717
12718   debug_op_order.clear();
12719   unstable_stats.clear();
12720
12721   // we don't want to cache object_contexts through the interval change
12722   // NOTE: we actually assert that all currently live references are dead
12723   // by the time the flush for the next interval completes.
12724   object_contexts.clear();
12725
12726   // should have been cleared above by finishing all of the degraded objects
12727   ceph_assert(objects_blocked_on_degraded_snap.empty());
12728 }
12729
12730 void PrimaryLogPG::plpg_on_role_change()
12731 {
12732   dout(10) << __func__ << dendl;
12733   if (get_role() != 0 && hit_set) {
12734     dout(10) << " clearing hit set" << dendl;
12735     hit_set_clear();
12736   }
12737 }
12738
12739 void PrimaryLogPG::plpg_on_pool_change()
12740 {
12741   dout(10) << __func__ << dendl;
12742   // requeue cache full waiters just in case the cache_mode is
12743   // changing away from writeback mode.  note that if we are not
12744   // active the normal requeuing machinery is sufficient (and properly
12745   // ordered).
12746   if (is_active() &&
12747       pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12748       !waiting_for_cache_not_full.empty()) {
12749     dout(10) << __func__ << " requeuing full waiters (not in writeback) "
12750              << dendl;
12751     requeue_ops(waiting_for_cache_not_full);
12752     objects_blocked_on_cache_full.clear();
12753   }
12754   hit_set_setup();
12755   agent_setup();
12756 }
12757
12758 // clear state.  called on recovery completion AND cancellation.
12759 void PrimaryLogPG::_clear_recovery_state()
12760 {
12761 #ifdef DEBUG_RECOVERY_OIDS
12762   recovering_oids.clear();
12763 #endif
12764   dout(15) << __func__ << " flags: " << m_planned_scrub << dendl;
12765
12766   last_backfill_started = hobject_t();
12767   set<hobject_t>::iterator i = backfills_in_flight.begin();
12768   while (i != backfills_in_flight.end()) {
12769     backfills_in_flight.erase(i++);
12770   }
12771
12772   list<OpRequestRef> blocked_ops;
12773   for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
12774        i != recovering.end();
12775        recovering.erase(i++)) {
12776     if (i->second) {
12777       i->second->drop_recovery_read(&blocked_ops);
12778       requeue_ops(blocked_ops);
12779     }
12780   }
12781   ceph_assert(backfills_in_flight.empty());
12782   pending_backfill_updates.clear();
12783   ceph_assert(recovering.empty());
12784   pgbackend->clear_recovery_state();
12785 }
12786
12787 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
12788 {
12789   dout(20) << __func__ << ": " << soid << dendl;
12790   ceph_assert(recovering.count(soid));
12791   ObjectContextRef obc = recovering[soid];
12792   if (obc) {
12793     list<OpRequestRef> blocked_ops;
12794     obc->drop_recovery_read(&blocked_ops);
12795     requeue_ops(blocked_ops);
12796   }
12797   recovering.erase(soid);
12798   finish_recovery_op(soid);
12799   release_backoffs(soid);
12800   if (waiting_for_degraded_object.count(soid)) {
12801     dout(20) << " kicking degraded waiters on " << soid << dendl;
12802     requeue_ops(waiting_for_degraded_object[soid]);
12803     waiting_for_degraded_object.erase(soid);
12804   }
12805   if (waiting_for_unreadable_object.count(soid)) {
12806     dout(20) << " kicking unreadable waiters on " << soid << dendl;
12807     requeue_ops(waiting_for_unreadable_object[soid]);
12808     waiting_for_unreadable_object.erase(soid);
12809   }
12810   if (is_missing_object(soid))
12811     recovery_state.set_last_requested(0);
12812   finish_degraded_object(soid);
12813 }
12814
12815 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
12816 {
12817   pgbackend->check_recovery_sources(osdmap);
12818 }
12819
12820 bool PrimaryLogPG::start_recovery_ops(
12821   uint64_t max,
12822   ThreadPool::TPHandle &handle,
12823   uint64_t *ops_started)
12824 {
12825   uint64_t& started = *ops_started;
12826   started = 0;
12827   bool work_in_progress = false;
12828   bool recovery_started = false;
12829   ceph_assert(is_primary());
12830   ceph_assert(is_peered());
12831   ceph_assert(!recovery_state.is_deleting());
12832
12833   ceph_assert(recovery_queued);
12834   recovery_queued = false;
12835
12836   if (!state_test(PG_STATE_RECOVERING) &&
12837       !state_test(PG_STATE_BACKFILLING)) {
12838     /* TODO: I think this case is broken and will make do_recovery()
12839      * unhappy since we're returning false */
12840     dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
12841     return have_unfound();
12842   }
12843
12844   const auto &missing = recovery_state.get_pg_log().get_missing();
12845
12846   uint64_t num_unfound = get_num_unfound();
12847
12848   if (!recovery_state.have_missing()) {
12849     recovery_state.local_recovery_complete();
12850   }
12851
12852   if (!missing.have_missing() || // Primary does not have missing
12853       // or all of the missing objects are unfound.
12854       recovery_state.all_missing_unfound()) {
12855     // Recover the replicas.
12856     started = recover_replicas(max, handle, &recovery_started);
12857   }
12858   if (!started) {
12859     // We still have missing objects that we should grab from replicas.
12860     started += recover_primary(max, handle);
12861   }
12862   if (!started && num_unfound != get_num_unfound()) {
12863     // second chance to recovery replicas
12864     started = recover_replicas(max, handle, &recovery_started);
12865   }
12866
12867   if (started || recovery_started)
12868     work_in_progress = true;
12869
12870   bool deferred_backfill = false;
12871   if (recovering.empty() &&
12872       state_test(PG_STATE_BACKFILLING) &&
12873       !get_backfill_targets().empty() && started < max &&
12874       missing.num_missing() == 0 &&
12875       waiting_on_backfill.empty()) {
12876     if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
12877       dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
12878       deferred_backfill = true;
12879     } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
12880                !is_degraded())  {
12881       dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
12882       deferred_backfill = true;
12883     } else if (!recovery_state.is_backfill_reserved()) {
12884       /* DNMNOTE I think this branch is dead */
12885       dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
12886       if (!backfill_reserving) {
12887         dout(10) << "queueing RequestBackfill" << dendl;
12888         backfill_reserving = true;
12889         queue_peering_event(
12890           PGPeeringEventRef(
12891             std::make_shared<PGPeeringEvent>(
12892               get_osdmap_epoch(),
12893               get_osdmap_epoch(),
12894               PeeringState::RequestBackfill())));
12895       }
12896       deferred_backfill = true;
12897     } else {
12898       started += recover_backfill(max - started, handle, &work_in_progress);
12899     }
12900   }
12901
12902   dout(10) << " started " << started << dendl;
12903   osd->logger->inc(l_osd_rop, started);
12904
12905   if (!recovering.empty() ||
12906       work_in_progress || recovery_ops_active > 0 || deferred_backfill)
12907     return !work_in_progress && have_unfound();
12908
12909   ceph_assert(recovering.empty());
12910   ceph_assert(recovery_ops_active == 0);
12911
12912   dout(10) << __func__ << " needs_recovery: "
12913            << recovery_state.get_missing_loc().get_needs_recovery()
12914            << dendl;
12915   dout(10) << __func__ << " missing_loc: "
12916            << recovery_state.get_missing_loc().get_missing_locs()
12917            << dendl;
12918   int unfound = get_num_unfound();
12919   if (unfound) {
12920     dout(10) << " still have " << unfound << " unfound" << dendl;
12921     return true;
12922   }
12923
12924   if (missing.num_missing() > 0) {
12925     // this shouldn't happen!
12926     osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
12927                        << missing.num_missing() << ": " << missing.get_items();
12928     return false;
12929   }
12930
12931   if (needs_recovery()) {
12932     // this shouldn't happen!
12933     // We already checked num_missing() so we must have missing replicas
12934     osd->clog->error() << info.pgid
12935                        << " Unexpected Error: recovery ending with missing replicas";
12936     return false;
12937   }
12938
12939   if (state_test(PG_STATE_RECOVERING)) {
12940     state_clear(PG_STATE_RECOVERING);
12941     state_clear(PG_STATE_FORCED_RECOVERY);
12942     if (needs_backfill()) {
12943       dout(10) << "recovery done, queuing backfill" << dendl;
12944       queue_peering_event(
12945         PGPeeringEventRef(
12946           std::make_shared<PGPeeringEvent>(
12947             get_osdmap_epoch(),
12948             get_osdmap_epoch(),
12949             PeeringState::RequestBackfill())));
12950     } else {
12951       dout(10) << "recovery done, no backfill" << dendl;
12952       state_clear(PG_STATE_FORCED_BACKFILL);
12953       queue_peering_event(
12954         PGPeeringEventRef(
12955           std::make_shared<PGPeeringEvent>(
12956             get_osdmap_epoch(),
12957             get_osdmap_epoch(),
12958             PeeringState::AllReplicasRecovered())));
12959     }
12960   } else { // backfilling
12961     state_clear(PG_STATE_BACKFILLING);
12962     state_clear(PG_STATE_FORCED_BACKFILL);
12963     state_clear(PG_STATE_FORCED_RECOVERY);
12964     dout(10) << "recovery done, backfill done" << dendl;
12965     queue_peering_event(
12966       PGPeeringEventRef(
12967         std::make_shared<PGPeeringEvent>(
12968           get_osdmap_epoch(),
12969           get_osdmap_epoch(),
12970           PeeringState::Backfilled())));
12971   }
12972
12973   return false;
12974 }
12975
12976 /**
12977  * do one recovery op.
12978  * return true if done, false if nothing left to do.
12979  */
12980 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
12981 {
12982   ceph_assert(is_primary());
12983
12984   const auto &missing = recovery_state.get_pg_log().get_missing();
12985
12986   dout(10) << __func__ << " recovering " << recovering.size()
12987            << " in pg,"
12988            << " missing " << missing << dendl;
12989
12990   dout(25) << __func__ << " " << missing.get_items() << dendl;
12991
12992   // look at log!
12993   pg_log_entry_t *latest = 0;
12994   unsigned started = 0;
12995   int skipped = 0;
12996
12997   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12998   map<version_t, hobject_t>::const_iterator p =
12999     missing.get_rmissing().lower_bound(recovery_state.get_pg_log().get_log().last_requested);
13000   while (p != missing.get_rmissing().end()) {
13001     handle.reset_tp_timeout();
13002     hobject_t soid;
13003     version_t v = p->first;
13004
13005     auto it_objects = recovery_state.get_pg_log().get_log().objects.find(p->second);
13006     if (it_objects != recovery_state.get_pg_log().get_log().objects.end()) {
13007       latest = it_objects->second;
13008       ceph_assert(latest->is_update() || latest->is_delete());
13009       soid = latest->soid;
13010     } else {
13011       latest = 0;
13012       soid = p->second;
13013     }
13014     const pg_missing_item& item = missing.get_items().find(p->second)->second;
13015     ++p;
13016
13017     hobject_t head = soid.get_head();
13018
13019     eversion_t need = item.need;
13020
13021     dout(10) << __func__ << " "
13022              << soid << " " << item.need
13023              << (missing.is_missing(soid) ? " (missing)":"")
13024              << (missing.is_missing(head) ? " (missing head)":"")
13025              << (recovering.count(soid) ? " (recovering)":"")
13026              << (recovering.count(head) ? " (recovering head)":"")
13027              << dendl;
13028
13029     if (latest) {
13030       switch (latest->op) {
13031       case pg_log_entry_t::CLONE:
13032         /*
13033          * Handling for this special case removed for now, until we
13034          * can correctly construct an accurate SnapSet from the old
13035          * one.
13036          */
13037         break;
13038
13039       case pg_log_entry_t::LOST_REVERT:
13040         {
13041           if (item.have == latest->reverting_to) {
13042             ObjectContextRef obc = get_object_context(soid, true);
13043
13044             if (obc->obs.oi.version == latest->version) {
13045               // I'm already reverting
13046               dout(10) << " already reverting " << soid << dendl;
13047             } else {
13048               dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
13049               obc->obs.oi.version = latest->version;
13050
13051               ObjectStore::Transaction t;
13052               bufferlist b2;
13053               obc->obs.oi.encode(
13054                 b2,
13055                 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13056               ceph_assert(!pool.info.require_rollback());
13057               t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
13058
13059               recovery_state.recover_got(
13060                 soid,
13061                 latest->version,
13062                 false,
13063                 t);
13064
13065               ++active_pushes;
13066
13067               t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
13068               t.register_on_commit(new C_OSD_CommittedPushedObject(
13069                                      this,
13070                                      get_osdmap_epoch(),
13071                                      info.last_complete));
13072               osd->store->queue_transaction(ch, std::move(t));
13073               continue;
13074             }
13075           } else {
13076             /*
13077              * Pull the old version of the object.  Update missing_loc here to have the location
13078              * of the version we want.
13079              *
13080              * This doesn't use the usual missing_loc paths, but that's okay:
13081              *  - if we have it locally, we hit the case above, and go from there.
13082              *  - if we don't, we always pass through this case during recovery and set up the location
13083              *    properly.
13084              *  - this way we don't need to mangle the missing code to be general about needing an old
13085              *    version...
13086              */
13087             eversion_t alternate_need = latest->reverting_to;
13088             dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
13089
13090             set<pg_shard_t> good_peers;
13091             for (auto p = recovery_state.get_peer_missing().begin();
13092                  p != recovery_state.get_peer_missing().end();
13093                  ++p) {
13094               if (p->second.is_missing(soid, need) &&
13095                   p->second.get_items().at(soid).have == alternate_need) {
13096                 good_peers.insert(p->first);
13097               }
13098             }
13099             recovery_state.set_revert_with_targets(
13100               soid,
13101               good_peers);
13102             dout(10) << " will pull " << alternate_need << " or " << need
13103                      << " from one of "
13104                      << recovery_state.get_missing_loc().get_locations(soid)
13105                      << dendl;
13106           }
13107         }
13108         break;
13109       }
13110     }
13111
13112     if (!recovering.count(soid)) {
13113       if (recovering.count(head)) {
13114         ++skipped;
13115       } else {
13116         int r = recover_missing(
13117           soid, need, get_recovery_op_priority(), h);
13118         switch (r) {
13119         case PULL_YES:
13120           ++started;
13121           break;
13122         case PULL_HEAD:
13123           ++started;
13124         case PULL_NONE:
13125           ++skipped;
13126           break;
13127         default:
13128           ceph_abort();
13129         }
13130         if (started >= max)
13131           break;
13132       }
13133     }
13134
13135     // only advance last_requested if we haven't skipped anything
13136     if (!skipped)
13137       recovery_state.set_last_requested(v);
13138   }
13139
13140   pgbackend->run_recovery_op(h, get_recovery_op_priority());
13141   return started;
13142 }
13143
13144 bool PrimaryLogPG::primary_error(
13145   const hobject_t& soid, eversion_t v)
13146 {
13147   recovery_state.force_object_missing(pg_whoami, soid, v);
13148   bool uhoh = recovery_state.get_missing_loc().is_unfound(soid);
13149   if (uhoh)
13150     osd->clog->error() << info.pgid << " missing primary copy of "
13151                        << soid << ", unfound";
13152   else
13153     osd->clog->error() << info.pgid << " missing primary copy of "
13154                        << soid
13155                        << ", will try copies on "
13156                        << recovery_state.get_missing_loc().get_locations(soid);
13157   return uhoh;
13158 }
13159
13160 int PrimaryLogPG::prep_object_replica_deletes(
13161   const hobject_t& soid, eversion_t v,
13162   PGBackend::RecoveryHandle *h,
13163   bool *work_started)
13164 {
13165   ceph_assert(is_primary());
13166   dout(10) << __func__ << ": on " << soid << dendl;
13167
13168   ObjectContextRef obc = get_object_context(soid, false);
13169   if (obc) {
13170     if (!obc->get_recovery_read()) {
13171       dout(20) << "replica delete delayed on " << soid
13172                << "; could not get rw_manager lock" << dendl;
13173       *work_started = true;
13174       return 0;
13175     } else {
13176       dout(20) << "replica delete got recovery read lock on " << soid
13177                << dendl;
13178     }
13179   }
13180
13181   start_recovery_op(soid);
13182   ceph_assert(!recovering.count(soid));
13183   if (!obc)
13184     recovering.insert(make_pair(soid, ObjectContextRef()));
13185   else
13186     recovering.insert(make_pair(soid, obc));
13187
13188   pgbackend->recover_delete_object(soid, v, h);
13189   return 1;
13190 }
13191
13192 int PrimaryLogPG::prep_object_replica_pushes(
13193   const hobject_t& soid, eversion_t v,
13194   PGBackend::RecoveryHandle *h,
13195   bool *work_started)
13196 {
13197   ceph_assert(is_primary());
13198   dout(10) << __func__ << ": on " << soid << dendl;
13199
13200   if (soid.snap && soid.snap < CEPH_NOSNAP) {
13201     // do we have the head and/or snapdir?
13202     hobject_t head = soid.get_head();
13203     if (recovery_state.get_pg_log().get_missing().is_missing(head)) {
13204       if (recovering.count(head)) {
13205         dout(10) << " missing but already recovering head " << head << dendl;
13206         return 0;
13207       } else {
13208         int r = recover_missing(
13209             head, recovery_state.get_pg_log().get_missing().get_items().find(head)->second.need,
13210             get_recovery_op_priority(), h);
13211         if (r != PULL_NONE)
13212           return 1;
13213         return 0;
13214       }
13215     }
13216   }
13217
13218   // NOTE: we know we will get a valid oloc off of disk here.
13219   ObjectContextRef obc = get_object_context(soid, false);
13220   if (!obc) {
13221     primary_error(soid, v);
13222     return 0;
13223   }
13224
13225   if (!obc->get_recovery_read()) {
13226     dout(20) << "recovery delayed on " << soid
13227              << "; could not get rw_manager lock" << dendl;
13228     *work_started = true;
13229     return 0;
13230   } else {
13231     dout(20) << "recovery got recovery read lock on " << soid
13232              << dendl;
13233   }
13234
13235   start_recovery_op(soid);
13236   ceph_assert(!recovering.count(soid));
13237   recovering.insert(make_pair(soid, obc));
13238
13239   int r = pgbackend->recover_object(
13240     soid,
13241     v,
13242     ObjectContextRef(),
13243     obc, // has snapset context
13244     h);
13245   if (r < 0) {
13246     dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
13247     on_failed_pull({ pg_whoami }, soid, v);
13248     return 0;
13249   }
13250   return 1;
13251 }
13252
13253 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
13254   bool *work_started)
13255 {
13256   dout(10) << __func__ << "(" << max << ")" << dendl;
13257   uint64_t started = 0;
13258
13259   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13260
13261   // this is FAR from an optimal recovery order.  pretty lame, really.
13262   ceph_assert(!get_acting_recovery_backfill().empty());
13263   // choose replicas to recover, replica has the shortest missing list first
13264   // so we can bring it back to normal ASAP
13265   std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
13266     async_by_num_missing;
13267   replicas_by_num_missing.reserve(get_acting_recovery_backfill().size() - 1);
13268   for (auto &p: get_acting_recovery_backfill()) {
13269     if (p == get_primary()) {
13270       continue;
13271     }
13272     auto pm = recovery_state.get_peer_missing().find(p);
13273     ceph_assert(pm != recovery_state.get_peer_missing().end());
13274     auto nm = pm->second.num_missing();
13275     if (nm != 0) {
13276       if (is_async_recovery_target(p)) {
13277         async_by_num_missing.push_back(make_pair(nm, p));
13278       } else {
13279         replicas_by_num_missing.push_back(make_pair(nm, p));
13280       }
13281     }
13282   }
13283   // sort by number of missing objects, in ascending order.
13284   auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
13285                  const std::pair<unsigned int, pg_shard_t> &rhs) {
13286     return lhs.first < rhs.first;
13287   };
13288   // acting goes first
13289   std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
13290   // then async_recovery_targets
13291   std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
13292   replicas_by_num_missing.insert(replicas_by_num_missing.end(),
13293     async_by_num_missing.begin(), async_by_num_missing.end());
13294   for (auto &replica: replicas_by_num_missing) {
13295     pg_shard_t &peer = replica.second;
13296     ceph_assert(peer != get_primary());
13297     auto pm = recovery_state.get_peer_missing().find(peer);
13298     ceph_assert(pm != recovery_state.get_peer_missing().end());
13299     size_t m_sz = pm->second.num_missing();
13300
13301     dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
13302     dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
13303
13304     // oldest first!
13305     const pg_missing_t &m(pm->second);
13306     for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
13307          p != m.get_rmissing().end() && started < max;
13308            ++p) {
13309       handle.reset_tp_timeout();
13310       const hobject_t soid(p->second);
13311
13312       if (recovery_state.get_missing_loc().is_unfound(soid)) {
13313         dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
13314         continue;
13315       }
13316
13317       const pg_info_t &pi = recovery_state.get_peer_info(peer);
13318       if (soid > pi.last_backfill) {
13319         if (!recovering.count(soid)) {
13320           derr << __func__ << ": object " << soid << " last_backfill "
13321                << pi.last_backfill << dendl;
13322           derr << __func__ << ": object added to missing set for backfill, but "
13323                << "is not in recovering, error!" << dendl;
13324           ceph_abort();
13325         }
13326         continue;
13327       }
13328
13329       if (recovering.count(soid)) {
13330         dout(10) << __func__ << ": already recovering " << soid << dendl;
13331         continue;
13332       }
13333
13334       if (recovery_state.get_missing_loc().is_deleted(soid)) {
13335         dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
13336         map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
13337         started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
13338         continue;
13339       }
13340
13341       if (soid.is_snap() &&
13342           recovery_state.get_pg_log().get_missing().is_missing(
13343             soid.get_head())) {
13344         dout(10) << __func__ << ": " << soid.get_head()
13345                  << " still missing on primary" << dendl;
13346         continue;
13347       }
13348
13349       if (recovery_state.get_pg_log().get_missing().is_missing(soid)) {
13350         dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
13351         continue;
13352       }
13353
13354       dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
13355       map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
13356       started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
13357     }
13358   }
13359
13360   pgbackend->run_recovery_op(h, get_recovery_op_priority());
13361   return started;
13362 }
13363
13364 hobject_t PrimaryLogPG::earliest_peer_backfill() const
13365 {
13366   hobject_t e = hobject_t::get_max();
13367   for (const pg_shard_t& peer : get_backfill_targets()) {
13368     const auto iter = peer_backfill_info.find(peer);
13369     ceph_assert(iter != peer_backfill_info.end());
13370     e = std::min(e, iter->second.begin);
13371   }
13372   return e;
13373 }
13374
13375 bool PrimaryLogPG::all_peer_done() const
13376 {
13377   // Primary hasn't got any more objects
13378   ceph_assert(backfill_info.empty());
13379
13380   for (const pg_shard_t& bt : get_backfill_targets()) {
13381     const auto piter = peer_backfill_info.find(bt);
13382     ceph_assert(piter != peer_backfill_info.end());
13383     const BackfillInterval& pbi = piter->second;
13384     // See if peer has more to process
13385     if (!pbi.extends_to_end() || !pbi.empty())
13386         return false;
13387   }
13388   return true;
13389 }
13390
13391 /**
13392  * recover_backfill
13393  *
13394  * Invariants:
13395  *
13396  * backfilled: fully pushed to replica or present in replica's missing set (both
13397  * our copy and theirs).
13398  *
13399  * All objects on a backfill_target in
13400  * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13401  * objects have been actually deleted and all logically-valid objects are replicated.
13402  * There may be PG objects in this interval yet to be backfilled.
13403  *
13404  * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13405  * backfill_targets.  There may be objects on backfill_target(s) yet to be deleted.
13406  *
13407  * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13408  *     backfill_info.begin) in PG are backfilled.  No deleted objects in this
13409  * interval remain on the backfill target.
13410  *
13411  * For a backfill target, all objects <= peer_info[target].last_backfill
13412  * have been backfilled to target
13413  *
13414  * There *MAY* be missing/outdated objects between last_backfill_started and
13415  * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13416  * io created objects since the last scan.  For this reason, we call
13417  * update_range() again before continuing backfill.
13418  */
13419 uint64_t PrimaryLogPG::recover_backfill(
13420   uint64_t max,
13421   ThreadPool::TPHandle &handle, bool *work_started)
13422 {
13423   dout(10) << __func__ << " (" << max << ")"
13424            << " bft=" << get_backfill_targets()
13425            << " last_backfill_started " << last_backfill_started
13426            << (new_backfill ? " new_backfill":"")
13427            << dendl;
13428   ceph_assert(!get_backfill_targets().empty());
13429
13430   // Initialize from prior backfill state
13431   if (new_backfill) {
13432     // on_activate() was called prior to getting here
13433     ceph_assert(last_backfill_started == recovery_state.earliest_backfill());
13434     new_backfill = false;
13435
13436     // initialize BackfillIntervals
13437     for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13438          i != get_backfill_targets().end();
13439          ++i) {
13440       peer_backfill_info[*i].reset(
13441         recovery_state.get_peer_info(*i).last_backfill);
13442     }
13443     backfill_info.reset(last_backfill_started);
13444
13445     backfills_in_flight.clear();
13446     pending_backfill_updates.clear();
13447   }
13448
13449   for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13450        i != get_backfill_targets().end();
13451        ++i) {
13452     dout(10) << "peer osd." << *i
13453            << " info " << recovery_state.get_peer_info(*i)
13454            << " interval " << peer_backfill_info[*i].begin
13455            << "-" << peer_backfill_info[*i].end
13456            << " " << peer_backfill_info[*i].objects.size() << " objects"
13457            << dendl;
13458   }
13459
13460   // update our local interval to cope with recent changes
13461   backfill_info.begin = last_backfill_started;
13462   update_range(&backfill_info, handle);
13463
13464   unsigned ops = 0;
13465   vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13466   set<hobject_t> add_to_stat;
13467
13468   for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13469        i != get_backfill_targets().end();
13470        ++i) {
13471     peer_backfill_info[*i].trim_to(
13472       std::max(
13473         recovery_state.get_peer_info(*i).last_backfill,
13474         last_backfill_started));
13475   }
13476   backfill_info.trim_to(last_backfill_started);
13477
13478   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13479   while (ops < max) {
13480     if (backfill_info.begin <= earliest_peer_backfill() &&
13481         !backfill_info.extends_to_end() && backfill_info.empty()) {
13482       hobject_t next = backfill_info.end;
13483       backfill_info.reset(next);
13484       backfill_info.end = hobject_t::get_max();
13485       update_range(&backfill_info, handle);
13486       backfill_info.trim();
13487     }
13488
13489     dout(20) << "   my backfill interval " << backfill_info << dendl;
13490
13491     bool sent_scan = false;
13492     for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13493          i != get_backfill_targets().end();
13494          ++i) {
13495       pg_shard_t bt = *i;
13496       BackfillInterval& pbi = peer_backfill_info[bt];
13497
13498       dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13499       if (pbi.begin <= backfill_info.begin &&
13500           !pbi.extends_to_end() && pbi.empty()) {
13501         dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
13502         epoch_t e = get_osdmap_epoch();
13503         MOSDPGScan *m = new MOSDPGScan(
13504           MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, get_last_peering_reset(),
13505           spg_t(info.pgid.pgid, bt.shard),
13506           pbi.end, hobject_t());
13507         osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13508         ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
13509         waiting_on_backfill.insert(bt);
13510         sent_scan = true;
13511       }
13512     }
13513
13514     // Count simultaneous scans as a single op and let those complete
13515     if (sent_scan) {
13516       ops++;
13517       start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13518       break;
13519     }
13520
13521     if (backfill_info.empty() && all_peer_done()) {
13522       dout(10) << " reached end for both local and all peers" << dendl;
13523       break;
13524     }
13525
13526     // Get object within set of peers to operate on and
13527     // the set of targets for which that object applies.
13528     hobject_t check = earliest_peer_backfill();
13529
13530     if (check < backfill_info.begin) {
13531
13532       set<pg_shard_t> check_targets;
13533       for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13534            i != get_backfill_targets().end();
13535            ++i) {
13536         pg_shard_t bt = *i;
13537         BackfillInterval& pbi = peer_backfill_info[bt];
13538         if (pbi.begin == check)
13539           check_targets.insert(bt);
13540       }
13541       ceph_assert(!check_targets.empty());
13542
13543       dout(20) << " BACKFILL removing " << check
13544                << " from peers " << check_targets << dendl;
13545       for (set<pg_shard_t>::iterator i = check_targets.begin();
13546            i != check_targets.end();
13547            ++i) {
13548         pg_shard_t bt = *i;
13549         BackfillInterval& pbi = peer_backfill_info[bt];
13550         ceph_assert(pbi.begin == check);
13551
13552         to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13553         pbi.pop_front();
13554       }
13555
13556       last_backfill_started = check;
13557
13558       // Don't increment ops here because deletions
13559       // are cheap and not replied to unlike real recovery_ops,
13560       // and we can't increment ops without requeueing ourself
13561       // for recovery.
13562     } else {
13563       eversion_t& obj_v = backfill_info.objects.begin()->second;
13564
13565       vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13566       for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13567            i != get_backfill_targets().end();
13568            ++i) {
13569         pg_shard_t bt = *i;
13570         BackfillInterval& pbi = peer_backfill_info[bt];
13571         // Find all check peers that have the wrong version
13572         if (check == backfill_info.begin && check == pbi.begin) {
13573           if (pbi.objects.begin()->second != obj_v) {
13574             need_ver_targs.push_back(bt);
13575           } else {
13576             keep_ver_targs.push_back(bt);
13577           }
13578         } else {
13579           const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13580
13581           // Only include peers that we've caught up to their backfill line
13582           // otherwise, they only appear to be missing this object
13583           // because their pbi.begin > backfill_info.begin.
13584           if (backfill_info.begin > pinfo.last_backfill)
13585             missing_targs.push_back(bt);
13586           else
13587             skip_targs.push_back(bt);
13588         }
13589       }
13590
13591       if (!keep_ver_targs.empty()) {
13592         // These peers have version obj_v
13593         dout(20) << " BACKFILL keeping " << check
13594                  << " with ver " << obj_v
13595                  << " on peers " << keep_ver_targs << dendl;
13596         //assert(!waiting_for_degraded_object.count(check));
13597       }
13598       if (!need_ver_targs.empty() || !missing_targs.empty()) {
13599         ObjectContextRef obc = get_object_context(backfill_info.begin, false);
13600         ceph_assert(obc);
13601         if (obc->get_recovery_read()) {
13602           if (!need_ver_targs.empty()) {
13603             dout(20) << " BACKFILL replacing " << check
13604                    << " with ver " << obj_v
13605                    << " to peers " << need_ver_targs << dendl;
13606           }
13607           if (!missing_targs.empty()) {
13608             dout(20) << " BACKFILL pushing " << backfill_info.begin
13609                  << " with ver " << obj_v
13610                  << " to peers " << missing_targs << dendl;
13611           }
13612           vector<pg_shard_t> all_push = need_ver_targs;
13613           all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13614
13615           handle.reset_tp_timeout();
13616           int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13617           if (r < 0) {
13618             *work_started = true;
13619             dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13620             break;
13621           }
13622           ops++;
13623         } else {
13624           *work_started = true;
13625           dout(20) << "backfill blocking on " << backfill_info.begin
13626                    << "; could not get rw_manager lock" << dendl;
13627           break;
13628         }
13629       }
13630       dout(20) << "need_ver_targs=" << need_ver_targs
13631                << " keep_ver_targs=" << keep_ver_targs << dendl;
13632       dout(20) << "backfill_targets=" << get_backfill_targets()
13633                << " missing_targs=" << missing_targs
13634                << " skip_targs=" << skip_targs << dendl;
13635
13636       last_backfill_started = backfill_info.begin;
13637       add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13638       backfill_info.pop_front();
13639       vector<pg_shard_t> check_targets = need_ver_targs;
13640       check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13641       for (vector<pg_shard_t>::iterator i = check_targets.begin();
13642            i != check_targets.end();
13643            ++i) {
13644         pg_shard_t bt = *i;
13645         BackfillInterval& pbi = peer_backfill_info[bt];
13646         pbi.pop_front();
13647       }
13648     }
13649   }
13650
13651   for (set<hobject_t>::iterator i = add_to_stat.begin();
13652        i != add_to_stat.end();
13653        ++i) {
13654     ObjectContextRef obc = get_object_context(*i, false);
13655     ceph_assert(obc);
13656     pg_stat_t stat;
13657     add_object_context_to_pg_stat(obc, &stat);
13658     pending_backfill_updates[*i] = stat;
13659   }
13660   map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
13661   for (unsigned i = 0; i < to_remove.size(); ++i) {
13662     handle.reset_tp_timeout();
13663     const hobject_t& oid = to_remove[i].get<0>();
13664     eversion_t v = to_remove[i].get<1>();
13665     pg_shard_t peer = to_remove[i].get<2>();
13666     MOSDPGBackfillRemove *m;
13667     auto it = reqs.find(peer);
13668     if (it != reqs.end()) {
13669       m = it->second;
13670     } else {
13671       m = reqs[peer] = new MOSDPGBackfillRemove(
13672         spg_t(info.pgid.pgid, peer.shard),
13673         get_osdmap_epoch());
13674     }
13675     m->ls.push_back(make_pair(oid, v));
13676
13677     if (oid <= last_backfill_started)
13678       pending_backfill_updates[oid]; // add empty stat!
13679   }
13680   for (auto p : reqs) {
13681     osd->send_message_osd_cluster(p.first.osd, p.second,
13682                                   get_osdmap_epoch());
13683   }
13684
13685   pgbackend->run_recovery_op(h, get_recovery_op_priority());
13686
13687   hobject_t backfill_pos =
13688     std::min(backfill_info.begin, earliest_peer_backfill());
13689   dout(5) << "backfill_pos is " << backfill_pos << dendl;
13690   for (set<hobject_t>::iterator i = backfills_in_flight.begin();
13691        i != backfills_in_flight.end();
13692        ++i) {
13693     dout(20) << *i << " is still in flight" << dendl;
13694   }
13695
13696   hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
13697     backfill_pos : *(backfills_in_flight.begin());
13698   hobject_t new_last_backfill = recovery_state.earliest_backfill();
13699   dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
13700   for (map<hobject_t, pg_stat_t>::iterator i =
13701          pending_backfill_updates.begin();
13702        i != pending_backfill_updates.end() &&
13703          i->first < next_backfill_to_complete;
13704        pending_backfill_updates.erase(i++)) {
13705     dout(20) << " pending_backfill_update " << i->first << dendl;
13706     ceph_assert(i->first > new_last_backfill);
13707     // carried from a previous round – if we are here, then we had to
13708     // be requeued (by e.g. on_global_recover()) and those operations
13709     // are done.
13710     recovery_state.update_complete_backfill_object_stats(
13711       i->first,
13712       i->second);
13713     new_last_backfill = i->first;
13714   }
13715   dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
13716
13717   ceph_assert(!pending_backfill_updates.empty() ||
13718          new_last_backfill == last_backfill_started);
13719   if (pending_backfill_updates.empty() &&
13720       backfill_pos.is_max()) {
13721     ceph_assert(backfills_in_flight.empty());
13722     new_last_backfill = backfill_pos;
13723     last_backfill_started = backfill_pos;
13724   }
13725   dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
13726
13727   // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13728   // all the backfill targets.  Otherwise, we will move last_backfill up on
13729   // those targets need it and send OP_BACKFILL_PROGRESS to them.
13730   for (set<pg_shard_t>::const_iterator i = get_backfill_targets().begin();
13731        i != get_backfill_targets().end();
13732        ++i) {
13733     pg_shard_t bt = *i;
13734     const pg_info_t& pinfo = recovery_state.get_peer_info(bt);
13735
13736     if (new_last_backfill > pinfo.last_backfill) {
13737       recovery_state.update_peer_last_backfill(bt, new_last_backfill);
13738       epoch_t e = get_osdmap_epoch();
13739       MOSDPGBackfill *m = NULL;
13740       if (pinfo.last_backfill.is_max()) {
13741         m = new MOSDPGBackfill(
13742           MOSDPGBackfill::OP_BACKFILL_FINISH,
13743           e,
13744           get_last_peering_reset(),
13745           spg_t(info.pgid.pgid, bt.shard));
13746         // Use default priority here, must match sub_op priority
13747         start_recovery_op(hobject_t::get_max());
13748       } else {
13749         m = new MOSDPGBackfill(
13750           MOSDPGBackfill::OP_BACKFILL_PROGRESS,
13751           e,
13752           get_last_peering_reset(),
13753           spg_t(info.pgid.pgid, bt.shard));
13754         // Use default priority here, must match sub_op priority
13755       }
13756       m->last_backfill = pinfo.last_backfill;
13757       m->stats = pinfo.stats;
13758       osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13759       dout(10) << " peer " << bt
13760                << " num_objects now " << pinfo.stats.stats.sum.num_objects
13761                << " / " << info.stats.stats.sum.num_objects << dendl;
13762     }
13763   }
13764
13765   if (ops)
13766     *work_started = true;
13767   return ops;
13768 }
13769
13770 int PrimaryLogPG::prep_backfill_object_push(
13771   hobject_t oid, eversion_t v,
13772   ObjectContextRef obc,
13773   vector<pg_shard_t> peers,
13774   PGBackend::RecoveryHandle *h)
13775 {
13776   dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
13777   ceph_assert(!peers.empty());
13778
13779   backfills_in_flight.insert(oid);
13780   recovery_state.prepare_backfill_for_missing(oid, v, peers);
13781
13782   ceph_assert(!recovering.count(oid));
13783
13784   start_recovery_op(oid);
13785   recovering.insert(make_pair(oid, obc));
13786
13787   int r = pgbackend->recover_object(
13788     oid,
13789     v,
13790     ObjectContextRef(),
13791     obc,
13792     h);
13793   if (r < 0) {
13794     dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
13795     on_failed_pull({ pg_whoami }, oid, v);
13796   }
13797   return r;
13798 }
13799
13800 void PrimaryLogPG::update_range(
13801   BackfillInterval *bi,
13802   ThreadPool::TPHandle &handle)
13803 {
13804   int local_min = cct->_conf->osd_backfill_scan_min;
13805   int local_max = cct->_conf->osd_backfill_scan_max;
13806
13807   if (bi->version < info.log_tail) {
13808     dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
13809              << dendl;
13810     bi->version = info.last_update;
13811     scan_range(local_min, local_max, bi, handle);
13812   }
13813
13814   if (bi->version >= projected_last_update) {
13815     dout(10) << __func__<< ": bi is current " << dendl;
13816     ceph_assert(bi->version == projected_last_update);
13817   } else if (bi->version >= info.log_tail) {
13818     if (recovery_state.get_pg_log().get_log().empty() && projected_log.empty()) {
13819       /* Because we don't move log_tail on split, the log might be
13820        * empty even if log_tail != last_update.  However, the only
13821        * way to get here with an empty log is if log_tail is actually
13822        * eversion_t(), because otherwise the entry which changed
13823        * last_update since the last scan would have to be present.
13824        */
13825       ceph_assert(bi->version == eversion_t());
13826       return;
13827     }
13828
13829     dout(10) << __func__<< ": bi is old, (" << bi->version
13830              << ") can be updated with log to projected_last_update "
13831              << projected_last_update << dendl;
13832
13833     auto func = [&](const pg_log_entry_t &e) {
13834       dout(10) << __func__ << ": updating from version " << e.version
13835                << dendl;
13836       const hobject_t &soid = e.soid;
13837       if (soid >= bi->begin &&
13838           soid < bi->end) {
13839         if (e.is_update()) {
13840           dout(10) << __func__ << ": " << e.soid << " updated to version "
13841                    << e.version << dendl;
13842           bi->objects.erase(e.soid);
13843           bi->objects.insert(
13844             make_pair(
13845               e.soid,
13846               e.version));
13847         } else if (e.is_delete()) {
13848           dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
13849           bi->objects.erase(e.soid);
13850         }
13851       }
13852     };
13853     dout(10) << "scanning pg log first" << dendl;
13854     recovery_state.get_pg_log().get_log().scan_log_after(bi->version, func);
13855     dout(10) << "scanning projected log" << dendl;
13856     projected_log.scan_log_after(bi->version, func);
13857     bi->version = projected_last_update;
13858   } else {
13859     ceph_abort_msg("scan_range should have raised bi->version past log_tail");
13860   }
13861 }
13862
13863 void PrimaryLogPG::scan_range(
13864   int min, int max, BackfillInterval *bi,
13865   ThreadPool::TPHandle &handle)
13866 {
13867   ceph_assert(is_locked());
13868   dout(10) << "scan_range from " << bi->begin << dendl;
13869   bi->clear_objects();
13870
13871   vector<hobject_t> ls;
13872   ls.reserve(max);
13873   int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
13874   ceph_assert(r >= 0);
13875   dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
13876   dout(20) << ls << dendl;
13877
13878   for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
13879     handle.reset_tp_timeout();
13880     ObjectContextRef obc;
13881     if (is_primary())
13882       obc = object_contexts.lookup(*p);
13883     if (obc) {
13884       if (!obc->obs.exists) {
13885         /* If the object does not exist here, it must have been removed
13886          * between the collection_list_partial and here.  This can happen
13887          * for the first item in the range, which is usually last_backfill.
13888          */
13889         continue;
13890       }
13891       bi->objects[*p] = obc->obs.oi.version;
13892       dout(20) << "  " << *p << " " << obc->obs.oi.version << dendl;
13893     } else {
13894       bufferlist bl;
13895       int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
13896       /* If the object does not exist here, it must have been removed
13897        * between the collection_list_partial and here.  This can happen
13898        * for the first item in the range, which is usually last_backfill.
13899        */
13900       if (r == -ENOENT)
13901         continue;
13902
13903       ceph_assert(r >= 0);
13904       object_info_t oi(bl);
13905       bi->objects[*p] = oi.version;
13906       dout(20) << "  " << *p << " " << oi.version << dendl;
13907     }
13908   }
13909 }
13910
13911
13912 /** check_local
13913  *
13914  * verifies that stray objects have been deleted
13915  */
13916 void PrimaryLogPG::check_local()
13917 {
13918   dout(10) << __func__ << dendl;
13919
13920   ceph_assert(
13921     info.last_update >=
13922     recovery_state.get_pg_log().get_tail());  // otherwise we need some help!
13923
13924   if (!cct->_conf->osd_debug_verify_stray_on_activate)
13925     return;
13926
13927   // just scan the log.
13928   set<hobject_t> did;
13929   for (list<pg_log_entry_t>::const_reverse_iterator p = recovery_state.get_pg_log().get_log().log.rbegin();
13930        p != recovery_state.get_pg_log().get_log().log.rend();
13931        ++p) {
13932     if (did.count(p->soid))
13933       continue;
13934     did.insert(p->soid);
13935
13936     if (p->is_delete() && !is_missing_object(p->soid)) {
13937       dout(10) << " checking " << p->soid
13938                << " at " << p->version << dendl;
13939       struct stat st;
13940       int r = osd->store->stat(
13941         ch,
13942         ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
13943         &st);
13944       if (r != -ENOENT) {
13945         derr << __func__ << " " << p->soid << " exists, but should have been "
13946              << "deleted" << dendl;
13947         ceph_abort_msg("erroneously present object");
13948       }
13949     } else {
13950       // ignore old(+missing) objects
13951     }
13952   }
13953 }
13954
13955
13956
13957 // ===========================
13958 // hit sets
13959
13960 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
13961 {
13962   ostringstream ss;
13963   ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
13964   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13965                  info.pgid.ps(), info.pgid.pool(),
13966                  cct->_conf->osd_hit_set_namespace);
13967   dout(20) << __func__ << " " << hoid << dendl;
13968   return hoid;
13969 }
13970
13971 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
13972                                                    utime_t end,
13973                                                    bool using_gmt)
13974 {
13975   ostringstream ss;
13976   ss << "hit_set_" << info.pgid.pgid << "_archive_";
13977   if (using_gmt) {
13978     start.gmtime(ss, true /* legacy pre-octopus form */) << "_";
13979     end.gmtime(ss, true /* legacy pre-octopus form */);
13980   } else {
13981     start.localtime(ss, true /* legacy pre-octopus form */) << "_";
13982     end.localtime(ss, true /* legacy pre-octopus form */);
13983   }
13984   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13985                  info.pgid.ps(), info.pgid.pool(),
13986                  cct->_conf->osd_hit_set_namespace);
13987   dout(20) << __func__ << " " << hoid << dendl;
13988   return hoid;
13989 }
13990
13991 void PrimaryLogPG::hit_set_clear()
13992 {
13993   dout(20) << __func__ << dendl;
13994   hit_set.reset();
13995   hit_set_start_stamp = utime_t();
13996 }
13997
13998 void PrimaryLogPG::hit_set_setup()
13999 {
14000   if (!is_active() ||
14001       !is_primary()) {
14002     hit_set_clear();
14003     return;
14004   }
14005
14006   if (is_active() && is_primary() &&
14007       (!pool.info.hit_set_count ||
14008        !pool.info.hit_set_period ||
14009        pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
14010     hit_set_clear();
14011
14012     // only primary is allowed to remove all the hit set objects
14013     hit_set_remove_all();
14014     return;
14015   }
14016
14017   // FIXME: discard any previous data for now
14018   hit_set_create();
14019
14020   // include any writes we know about from the pg log.  this doesn't
14021   // capture reads, but it is better than nothing!
14022   hit_set_apply_log();
14023 }
14024
14025 void PrimaryLogPG::hit_set_remove_all()
14026 {
14027   // If any archives are degraded we skip this
14028   for (auto p = info.hit_set.history.begin();
14029        p != info.hit_set.history.end();
14030        ++p) {
14031     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14032
14033     // Once we hit a degraded object just skip
14034     if (is_degraded_or_backfilling_object(aoid))
14035       return;
14036     if (m_scrubber->write_blocked_by_scrub(aoid))
14037       return;
14038   }
14039
14040   if (!info.hit_set.history.empty()) {
14041     auto p = info.hit_set.history.rbegin();
14042     ceph_assert(p != info.hit_set.history.rend());
14043     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14044     ceph_assert(!is_degraded_or_backfilling_object(oid));
14045     ObjectContextRef obc = get_object_context(oid, false);
14046     ceph_assert(obc);
14047
14048     OpContextUPtr ctx = simple_opc_create(obc);
14049     ctx->at_version = get_next_version();
14050     ctx->updated_hset_history = info.hit_set;
14051     utime_t now = ceph_clock_now();
14052     ctx->mtime = now;
14053     hit_set_trim(ctx, 0);
14054     simple_opc_submit(std::move(ctx));
14055   }
14056
14057   recovery_state.update_hset(pg_hit_set_history_t());
14058   if (agent_state) {
14059     agent_state->discard_hit_sets();
14060   }
14061 }
14062
14063 void PrimaryLogPG::hit_set_create()
14064 {
14065   utime_t now = ceph_clock_now();
14066   // make a copy of the params to modify
14067   HitSet::Params params(pool.info.hit_set_params);
14068
14069   dout(20) << __func__ << " " << params << dendl;
14070   if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
14071     BloomHitSet::Params *p =
14072       static_cast<BloomHitSet::Params*>(params.impl.get());
14073
14074     // convert false positive rate so it holds up across the full period
14075     p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
14076     if (p->get_fpp() <= 0.0)
14077       p->set_fpp(.01);  // fpp cannot be zero!
14078
14079     // if we don't have specified size, estimate target size based on the
14080     // previous bin!
14081     if (p->target_size == 0 && hit_set) {
14082       utime_t dur = now - hit_set_start_stamp;
14083       unsigned unique = hit_set->approx_unique_insert_count();
14084       dout(20) << __func__ << " previous set had approx " << unique
14085                << " unique items over " << dur << " seconds" << dendl;
14086       p->target_size = (double)unique * (double)pool.info.hit_set_period
14087                      / (double)dur;
14088     }
14089     if (p->target_size <
14090         static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
14091       p->target_size = cct->_conf->osd_hit_set_min_size;
14092
14093     if (p->target_size
14094         > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
14095       p->target_size = cct->_conf->osd_hit_set_max_size;
14096
14097     p->seed = now.sec();
14098
14099     dout(10) << __func__ << " target_size " << p->target_size
14100              << " fpp " << p->get_fpp() << dendl;
14101   }
14102   hit_set.reset(new HitSet(params));
14103   hit_set_start_stamp = now;
14104 }
14105
14106 /**
14107  * apply log entries to set
14108  *
14109  * this would only happen after peering, to at least capture writes
14110  * during an interval that was potentially lost.
14111  */
14112 bool PrimaryLogPG::hit_set_apply_log()
14113 {
14114   if (!hit_set)
14115     return false;
14116
14117   eversion_t to = info.last_update;
14118   eversion_t from = info.hit_set.current_last_update;
14119   if (to <= from) {
14120     dout(20) << __func__ << " no update" << dendl;
14121     return false;
14122   }
14123
14124   dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
14125   list<pg_log_entry_t>::const_reverse_iterator p =
14126     recovery_state.get_pg_log().get_log().log.rbegin();
14127   while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > to)
14128     ++p;
14129   while (p != recovery_state.get_pg_log().get_log().log.rend() && p->version > from) {
14130     hit_set->insert(p->soid);
14131     ++p;
14132   }
14133
14134   return true;
14135 }
14136
14137 void PrimaryLogPG::hit_set_persist()
14138 {
14139   dout(10) << __func__  << dendl;
14140   bufferlist bl;
14141   unsigned max = pool.info.hit_set_count;
14142
14143   utime_t now = ceph_clock_now();
14144   hobject_t oid;
14145
14146   // If any archives are degraded we skip this persist request
14147   // account for the additional entry being added below
14148   for (auto p = info.hit_set.history.begin();
14149        p != info.hit_set.history.end();
14150        ++p) {
14151     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14152
14153     // Once we hit a degraded object just skip further trim
14154     if (is_degraded_or_backfilling_object(aoid))
14155       return;
14156     if (m_scrubber->write_blocked_by_scrub(aoid))
14157       return;
14158   }
14159
14160   // If backfill is in progress and we could possibly overlap with the
14161   // hit_set_* objects, back off.  Since these all have
14162   // hobject_t::hash set to pgid.ps(), and those sort first, we can
14163   // look just at that.  This is necessary because our transactions
14164   // may include a modify of the new hit_set *and* a delete of the
14165   // old one, and this may span the backfill boundary.
14166   for (set<pg_shard_t>::const_iterator p = get_backfill_targets().begin();
14167        p != get_backfill_targets().end();
14168        ++p) {
14169     const pg_info_t& pi = recovery_state.get_peer_info(*p);
14170     if (pi.last_backfill == hobject_t() ||
14171         pi.last_backfill.get_hash() == info.pgid.ps()) {
14172       dout(10) << __func__ << " backfill target osd." << *p
14173                << " last_backfill has not progressed past pgid ps"
14174                << dendl;
14175       return;
14176     }
14177   }
14178
14179
14180   pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
14181   new_hset.begin = hit_set_start_stamp;
14182   new_hset.end = now;
14183   oid = get_hit_set_archive_object(
14184     new_hset.begin,
14185     new_hset.end,
14186     new_hset.using_gmt);
14187
14188   // If the current object is degraded we skip this persist request
14189   if (m_scrubber->write_blocked_by_scrub(oid))
14190     return;
14191
14192   hit_set->seal();
14193   encode(*hit_set, bl);
14194   dout(20) << __func__ << " archive " << oid << dendl;
14195
14196   if (agent_state) {
14197     agent_state->add_hit_set(new_hset.begin, hit_set);
14198     uint32_t size = agent_state->hit_set_map.size();
14199     if (size >= pool.info.hit_set_count) {
14200       size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
14201     }
14202     hit_set_in_memory_trim(size);
14203   }
14204
14205   ObjectContextRef obc = get_object_context(oid, true);
14206   OpContextUPtr ctx = simple_opc_create(obc);
14207
14208   ctx->at_version = get_next_version();
14209   ctx->updated_hset_history = info.hit_set;
14210   pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
14211
14212   updated_hit_set_hist.current_last_update = info.last_update;
14213   new_hset.version = ctx->at_version;
14214
14215   updated_hit_set_hist.history.push_back(new_hset);
14216   hit_set_create();
14217
14218   // fabricate an object_info_t and SnapSet
14219   obc->obs.oi.version = ctx->at_version;
14220   obc->obs.oi.mtime = now;
14221   obc->obs.oi.size = bl.length();
14222   obc->obs.exists = true;
14223   obc->obs.oi.set_data_digest(bl.crc32c(-1));
14224
14225   ctx->new_obs = obc->obs;
14226
14227   ctx->new_snapset = obc->ssc->snapset;
14228
14229   ctx->delta_stats.num_objects++;
14230   ctx->delta_stats.num_objects_hit_set_archive++;
14231
14232   ctx->delta_stats.num_bytes += bl.length();
14233   ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
14234
14235   bufferlist bss;
14236   encode(ctx->new_snapset, bss);
14237   bufferlist boi(sizeof(ctx->new_obs.oi));
14238   encode(ctx->new_obs.oi, boi,
14239            get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
14240
14241   ctx->op_t->create(oid);
14242   if (bl.length()) {
14243     ctx->op_t->write(oid, 0, bl.length(), bl, 0);
14244     write_update_size_and_usage(ctx->delta_stats, obc->obs.oi, ctx->modified_ranges,
14245         0, bl.length());
14246     ctx->clean_regions.mark_data_region_dirty(0, bl.length());
14247   }
14248   map <string, bufferlist> attrs;
14249   attrs[OI_ATTR] = std::move(boi);
14250   attrs[SS_ATTR] = std::move(bss);
14251   setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
14252   ctx->log.push_back(
14253     pg_log_entry_t(
14254       pg_log_entry_t::MODIFY,
14255       oid,
14256       ctx->at_version,
14257       eversion_t(),
14258       0,
14259       osd_reqid_t(),
14260       ctx->mtime,
14261       0)
14262     );
14263   ctx->log.back().clean_regions = ctx->clean_regions;
14264
14265   hit_set_trim(ctx, max);
14266
14267   simple_opc_submit(std::move(ctx));
14268 }
14269
14270 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
14271 {
14272   ceph_assert(ctx->updated_hset_history);
14273   pg_hit_set_history_t &updated_hit_set_hist =
14274     *(ctx->updated_hset_history);
14275   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
14276     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
14277     ceph_assert(p != updated_hit_set_hist.history.end());
14278     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14279
14280     ceph_assert(!is_degraded_or_backfilling_object(oid));
14281
14282     dout(20) << __func__ << " removing " << oid << dendl;
14283     ++ctx->at_version.version;
14284     ctx->log.push_back(
14285         pg_log_entry_t(pg_log_entry_t::DELETE,
14286                        oid,
14287                        ctx->at_version,
14288                        p->version,
14289                        0,
14290                        osd_reqid_t(),
14291                        ctx->mtime,
14292                        0));
14293
14294     ctx->op_t->remove(oid);
14295     updated_hit_set_hist.history.pop_front();
14296
14297     ObjectContextRef obc = get_object_context(oid, false);
14298     ceph_assert(obc);
14299     --ctx->delta_stats.num_objects;
14300     --ctx->delta_stats.num_objects_hit_set_archive;
14301     ctx->delta_stats.num_bytes -= obc->obs.oi.size;
14302     ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
14303   }
14304 }
14305
14306 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
14307 {
14308   while (agent_state->hit_set_map.size() > max_in_memory) {
14309     agent_state->remove_oldest_hit_set();
14310   }
14311 }
14312
14313
14314 // =======================================
14315 // cache agent
14316
14317 void PrimaryLogPG::agent_setup()
14318 {
14319   ceph_assert(is_locked());
14320   if (!is_active() ||
14321       !is_primary() ||
14322       state_test(PG_STATE_PREMERGE) ||
14323       pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
14324       pool.info.tier_of < 0 ||
14325       !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
14326     agent_clear();
14327     return;
14328   }
14329   if (!agent_state) {
14330     agent_state.reset(new TierAgentState);
14331
14332     // choose random starting position
14333     agent_state->position = hobject_t();
14334     agent_state->position.pool = info.pgid.pool();
14335     agent_state->position.set_hash(pool.info.get_random_pg_position(
14336       info.pgid.pgid,
14337       rand()));
14338     agent_state->start = agent_state->position;
14339
14340     dout(10) << __func__ << " allocated new state, position "
14341              << agent_state->position << dendl;
14342   } else {
14343     dout(10) << __func__ << " keeping existing state" << dendl;
14344   }
14345
14346   if (info.stats.stats_invalid) {
14347     osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
14348   }
14349
14350   agent_choose_mode();
14351 }
14352
14353 void PrimaryLogPG::agent_clear()
14354 {
14355   agent_stop();
14356   agent_state.reset(NULL);
14357 }
14358
14359 // Return false if no objects operated on since start of object hash space
14360 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
14361 {
14362   std::scoped_lock locker{*this};
14363   if (!agent_state) {
14364     dout(10) << __func__ << " no agent state, stopping" << dendl;
14365     return true;
14366   }
14367
14368   ceph_assert(!recovery_state.is_deleting());
14369
14370   if (agent_state->is_idle()) {
14371     dout(10) << __func__ << " idle, stopping" << dendl;
14372     return true;
14373   }
14374
14375   osd->logger->inc(l_osd_agent_wake);
14376
14377   dout(10) << __func__
14378            << " max " << start_max
14379            << ", flush " << agent_state->get_flush_mode_name()
14380            << ", evict " << agent_state->get_evict_mode_name()
14381            << ", pos " << agent_state->position
14382            << dendl;
14383   ceph_assert(is_primary());
14384   ceph_assert(is_active());
14385
14386   agent_load_hit_sets();
14387
14388   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14389   ceph_assert(base_pool);
14390
14391   int ls_min = 1;
14392   int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
14393
14394   // list some objects.  this conveniently lists clones (oldest to
14395   // newest) before heads... the same order we want to flush in.
14396   //
14397   // NOTE: do not flush the Sequencer.  we will assume that the
14398   // listing we get back is imprecise.
14399   vector<hobject_t> ls;
14400   hobject_t next;
14401   int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
14402                                           &ls, &next);
14403   ceph_assert(r >= 0);
14404   dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
14405   int started = 0;
14406   for (vector<hobject_t>::iterator p = ls.begin();
14407        p != ls.end();
14408        ++p) {
14409     if (p->nspace == cct->_conf->osd_hit_set_namespace) {
14410       dout(20) << __func__ << " skip (hit set) " << *p << dendl;
14411       osd->logger->inc(l_osd_agent_skip);
14412       continue;
14413     }
14414     if (is_degraded_or_backfilling_object(*p)) {
14415       dout(20) << __func__ << " skip (degraded) " << *p << dendl;
14416       osd->logger->inc(l_osd_agent_skip);
14417       continue;
14418     }
14419     if (is_missing_object(p->get_head())) {
14420       dout(20) << __func__ << " skip (missing head) " << *p << dendl;
14421       osd->logger->inc(l_osd_agent_skip);
14422       continue;
14423     }
14424     ObjectContextRef obc = get_object_context(*p, false, NULL);
14425     if (!obc) {
14426       // we didn't flush; we may miss something here.
14427       dout(20) << __func__ << " skip (no obc) " << *p << dendl;
14428       osd->logger->inc(l_osd_agent_skip);
14429       continue;
14430     }
14431     if (!obc->obs.exists) {
14432       dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
14433       osd->logger->inc(l_osd_agent_skip);
14434       continue;
14435     }
14436     if (m_scrubber->range_intersects_scrub(obc->obs.oi.soid,
14437                                obc->obs.oi.soid.get_head())) {
14438       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14439       osd->logger->inc(l_osd_agent_skip);
14440       continue;
14441     }
14442     if (obc->is_blocked()) {
14443       dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14444       osd->logger->inc(l_osd_agent_skip);
14445       continue;
14446     }
14447     if (obc->is_request_pending()) {
14448       dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
14449       osd->logger->inc(l_osd_agent_skip);
14450       continue;
14451     }
14452
14453     // be careful flushing omap to an EC pool.
14454     if (!base_pool->supports_omap() &&
14455         obc->obs.oi.is_omap()) {
14456       dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14457       osd->logger->inc(l_osd_agent_skip);
14458       continue;
14459     }
14460
14461     if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14462         agent_maybe_evict(obc, false))
14463       ++started;
14464     else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14465              agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14466       ++started;
14467       --agent_flush_quota;
14468     }
14469     if (started >= start_max) {
14470       // If finishing early, set "next" to the next object
14471       if (++p != ls.end())
14472         next = *p;
14473       break;
14474     }
14475   }
14476
14477   if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14478     dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14479     agent_state->hist_age = 0;
14480     agent_state->temp_hist.decay();
14481   }
14482
14483   // Total objects operated on so far
14484   int total_started = agent_state->started + started;
14485   bool need_delay = false;
14486
14487   dout(20) << __func__ << " start pos " << agent_state->position
14488     << " next start pos " << next
14489     << " started " << total_started << dendl;
14490
14491   // See if we've made a full pass over the object hash space
14492   // This might check at most ls_max objects a second time to notice that
14493   // we've checked every objects at least once.
14494   if (agent_state->position < agent_state->start &&
14495       next >= agent_state->start) {
14496     dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14497     if (total_started == 0)
14498       need_delay = true;
14499     else
14500       total_started = 0;
14501     agent_state->start = next;
14502   }
14503   agent_state->started = total_started;
14504
14505   // See if we are starting from beginning
14506   if (next.is_max())
14507     agent_state->position = hobject_t();
14508   else
14509     agent_state->position = next;
14510
14511   // Discard old in memory HitSets
14512   hit_set_in_memory_trim(pool.info.hit_set_count);
14513
14514   if (need_delay) {
14515     ceph_assert(agent_state->delaying == false);
14516     agent_delay();
14517     return false;
14518   }
14519   agent_choose_mode();
14520   return true;
14521 }
14522
14523 void PrimaryLogPG::agent_load_hit_sets()
14524 {
14525   if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14526     return;
14527   }
14528
14529   if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14530     dout(10) << __func__ << dendl;
14531     for (auto p = info.hit_set.history.begin();
14532          p != info.hit_set.history.end(); ++p) {
14533       if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14534         dout(10) << __func__ << " loading " << p->begin << "-"
14535                  << p->end << dendl;
14536         if (!pool.info.is_replicated()) {
14537           // FIXME: EC not supported here yet
14538           derr << __func__ << " on non-replicated pool" << dendl;
14539           break;
14540         }
14541
14542         hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14543         if (is_unreadable_object(oid)) {
14544           dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14545           break;
14546         }
14547
14548         ObjectContextRef obc = get_object_context(oid, false);
14549         if (!obc) {
14550           derr << __func__ << ": could not load hitset " << oid << dendl;
14551           break;
14552         }
14553
14554         bufferlist bl;
14555         {
14556           int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
14557           ceph_assert(r >= 0);
14558         }
14559         HitSetRef hs(new HitSet);
14560         bufferlist::const_iterator pbl = bl.begin();
14561         decode(*hs, pbl);
14562         agent_state->add_hit_set(p->begin.sec(), hs);
14563       }
14564     }
14565   }
14566 }
14567
14568 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14569 {
14570   if (!obc->obs.oi.is_dirty()) {
14571     dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14572     osd->logger->inc(l_osd_agent_skip);
14573     return false;
14574   }
14575   if (obc->obs.oi.is_cache_pinned()) {
14576     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14577     osd->logger->inc(l_osd_agent_skip);
14578     return false;
14579   }
14580
14581   utime_t now = ceph_clock_now();
14582   utime_t ob_local_mtime;
14583   if (obc->obs.oi.local_mtime != utime_t()) {
14584     ob_local_mtime = obc->obs.oi.local_mtime;
14585   } else {
14586     ob_local_mtime = obc->obs.oi.mtime;
14587   }
14588   bool evict_mode_full =
14589     (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14590   if (!evict_mode_full &&
14591       obc->obs.oi.soid.snap == CEPH_NOSNAP &&  // snaps immutable; don't delay
14592       (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14593     dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14594     osd->logger->inc(l_osd_agent_skip);
14595     return false;
14596   }
14597
14598   if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14599     dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14600     osd->logger->inc(l_osd_agent_skip);
14601     return false;
14602   }
14603
14604   dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14605
14606   // FIXME: flush anything dirty, regardless of what distribution of
14607   // ages we expect.
14608
14609   hobject_t oid = obc->obs.oi.soid;
14610   osd->agent_start_op(oid);
14611   // no need to capture a pg ref, can't outlive fop or ctx
14612   std::function<void()> on_flush = [this, oid]() {
14613     osd->agent_finish_op(oid);
14614   };
14615
14616   int result = start_flush(
14617     OpRequestRef(), obc, false, NULL,
14618     on_flush);
14619   if (result != -EINPROGRESS) {
14620     on_flush();
14621     dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14622       << " with " << result << dendl;
14623     osd->logger->inc(l_osd_agent_skip);
14624     return false;
14625   }
14626
14627   osd->logger->inc(l_osd_agent_flush);
14628   return true;
14629 }
14630
14631 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14632 {
14633   const hobject_t& soid = obc->obs.oi.soid;
14634   if (!after_flush && obc->obs.oi.is_dirty()) {
14635     dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14636     return false;
14637   }
14638   // This is already checked by agent_work() which passes after_flush = false
14639   if (after_flush && m_scrubber->range_intersects_scrub(soid, soid.get_head())) {
14640       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14641       return false;
14642   }
14643   if (!obc->obs.oi.watchers.empty()) {
14644     dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14645     return false;
14646   }
14647   if (obc->is_blocked()) {
14648     dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14649     return false;
14650   }
14651   if (obc->obs.oi.is_cache_pinned()) {
14652     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14653     return false;
14654   }
14655
14656   if (soid.snap == CEPH_NOSNAP) {
14657     int result = _verify_no_head_clones(soid, obc->ssc->snapset);
14658     if (result < 0) {
14659       dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
14660       return false;
14661     }
14662   }
14663
14664   if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
14665     // is this object old than cache_min_evict_age?
14666     utime_t now = ceph_clock_now();
14667     utime_t ob_local_mtime;
14668     if (obc->obs.oi.local_mtime != utime_t()) {
14669       ob_local_mtime = obc->obs.oi.local_mtime;
14670     } else {
14671       ob_local_mtime = obc->obs.oi.mtime;
14672     }
14673     if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
14674       dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14675       osd->logger->inc(l_osd_agent_skip);
14676       return false;
14677     }
14678     // is this object old and/or cold enough?
14679     int temp = 0;
14680     uint64_t temp_upper = 0, temp_lower = 0;
14681     if (hit_set)
14682       agent_estimate_temp(soid, &temp);
14683     agent_state->temp_hist.add(temp);
14684     agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
14685
14686     dout(20) << __func__
14687              << " temp " << temp
14688              << " pos " << temp_lower << "-" << temp_upper
14689              << ", evict_effort " << agent_state->evict_effort
14690              << dendl;
14691     dout(30) << "agent_state:\n";
14692     Formatter *f = Formatter::create("");
14693     f->open_object_section("agent_state");
14694     agent_state->dump(f);
14695     f->close_section();
14696     f->flush(*_dout);
14697     delete f;
14698     *_dout << dendl;
14699
14700     if (1000000 - temp_upper >= agent_state->evict_effort)
14701       return false;
14702   }
14703
14704   dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
14705   OpContextUPtr ctx = simple_opc_create(obc);
14706
14707   auto null_op_req = OpRequestRef();
14708   if (!ctx->lock_manager.get_lock_type(
14709         RWState::RWWRITE,
14710         obc->obs.oi.soid,
14711         obc,
14712         null_op_req)) {
14713     close_op_ctx(ctx.release());
14714     dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
14715     return false;
14716   }
14717
14718   osd->agent_start_evict_op();
14719   ctx->register_on_finish(
14720     [this]() {
14721       osd->agent_finish_evict_op();
14722     });
14723
14724   ctx->at_version = get_next_version();
14725   ceph_assert(ctx->new_obs.exists);
14726   int r = _delete_oid(ctx.get(), true, false);
14727   if (obc->obs.oi.is_omap())
14728     ctx->delta_stats.num_objects_omap--;
14729   ctx->delta_stats.num_evict++;
14730   ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
14731   if (obc->obs.oi.is_dirty())
14732     --ctx->delta_stats.num_objects_dirty;
14733   ceph_assert(r == 0);
14734   finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
14735   simple_opc_submit(std::move(ctx));
14736   osd->logger->inc(l_osd_tier_evict);
14737   osd->logger->inc(l_osd_agent_evict);
14738   return true;
14739 }
14740
14741 void PrimaryLogPG::agent_stop()
14742 {
14743   dout(20) << __func__ << dendl;
14744   if (agent_state && !agent_state->is_idle()) {
14745     agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
14746     agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14747     osd->agent_disable_pg(this, agent_state->evict_effort);
14748   }
14749 }
14750
14751 void PrimaryLogPG::agent_delay()
14752 {
14753   dout(20) << __func__ << dendl;
14754   if (agent_state && !agent_state->is_idle()) {
14755     ceph_assert(agent_state->delaying == false);
14756     agent_state->delaying = true;
14757     osd->agent_disable_pg(this, agent_state->evict_effort);
14758   }
14759 }
14760
14761 void PrimaryLogPG::agent_choose_mode_restart()
14762 {
14763   dout(20) << __func__ << dendl;
14764   std::scoped_lock locker{*this};
14765   if (agent_state && agent_state->delaying) {
14766     agent_state->delaying = false;
14767     agent_choose_mode(true);
14768   }
14769 }
14770
14771 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
14772 {
14773   bool requeued = false;
14774   // Let delay play out
14775   if (agent_state->delaying) {
14776     dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
14777     return requeued;
14778   }
14779
14780   TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14781   TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
14782   unsigned evict_effort = 0;
14783
14784   if (info.stats.stats_invalid) {
14785     // idle; stats can't be trusted until we scrub.
14786     dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
14787     goto skip_calc;
14788   }
14789
14790   {
14791   uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
14792   ceph_assert(divisor > 0);
14793
14794   // adjust (effective) user objects down based on the number
14795   // of HitSet objects, which should not count toward our total since
14796   // they cannot be flushed.
14797   uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
14798
14799   // also exclude omap objects if ec backing pool
14800   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14801   ceph_assert(base_pool);
14802   if (!base_pool->supports_omap())
14803     unflushable += info.stats.stats.sum.num_objects_omap;
14804
14805   uint64_t num_user_objects = info.stats.stats.sum.num_objects;
14806   if (num_user_objects > unflushable)
14807     num_user_objects -= unflushable;
14808   else
14809     num_user_objects = 0;
14810
14811   uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
14812   uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
14813   num_user_bytes -= unflushable_bytes;
14814   uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
14815   num_user_bytes += num_overhead_bytes;
14816
14817   // also reduce the num_dirty by num_objects_omap
14818   int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
14819   if (!base_pool->supports_omap()) {
14820     if (num_dirty > info.stats.stats.sum.num_objects_omap)
14821       num_dirty -= info.stats.stats.sum.num_objects_omap;
14822     else
14823       num_dirty = 0;
14824   }
14825
14826   dout(10) << __func__
14827            << " flush_mode: "
14828            << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14829            << " evict_mode: "
14830            << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14831            << " num_objects: " << info.stats.stats.sum.num_objects
14832            << " num_bytes: " << info.stats.stats.sum.num_bytes
14833            << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
14834            << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
14835            << " num_dirty: " << num_dirty
14836            << " num_user_objects: " << num_user_objects
14837            << " num_user_bytes: " << num_user_bytes
14838            << " num_overhead_bytes: " << num_overhead_bytes
14839            << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
14840            << " pool.info.target_max_objects: " << pool.info.target_max_objects
14841            << dendl;
14842
14843   // get dirty, full ratios
14844   uint64_t dirty_micro = 0;
14845   uint64_t full_micro = 0;
14846   if (pool.info.target_max_bytes && num_user_objects > 0) {
14847     uint64_t avg_size = num_user_bytes / num_user_objects;
14848     dirty_micro =
14849       num_dirty * avg_size * 1000000 /
14850       std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14851     full_micro =
14852       num_user_objects * avg_size * 1000000 /
14853       std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14854   }
14855   if (pool.info.target_max_objects > 0) {
14856     uint64_t dirty_objects_micro =
14857       num_dirty * 1000000 /
14858       std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14859     if (dirty_objects_micro > dirty_micro)
14860       dirty_micro = dirty_objects_micro;
14861     uint64_t full_objects_micro =
14862       num_user_objects * 1000000 /
14863       std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14864     if (full_objects_micro > full_micro)
14865       full_micro = full_objects_micro;
14866   }
14867   dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
14868            << " full " << ((float)full_micro / 1000000.0)
14869            << dendl;
14870
14871   // flush mode
14872   uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
14873   uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
14874   uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
14875   if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
14876     flush_target += flush_slop;
14877     flush_high_target += flush_slop;
14878   } else {
14879     flush_target -= std::min(flush_target, flush_slop);
14880     flush_high_target -= std::min(flush_high_target, flush_slop);
14881   }
14882
14883   if (dirty_micro > flush_high_target) {
14884     flush_mode = TierAgentState::FLUSH_MODE_HIGH;
14885   } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
14886     flush_mode = TierAgentState::FLUSH_MODE_LOW;
14887   }
14888
14889   // evict mode
14890   uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
14891   uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
14892   if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
14893     evict_target += evict_slop;
14894   else
14895     evict_target -= std::min(evict_target, evict_slop);
14896
14897   if (full_micro > 1000000) {
14898     // evict anything clean
14899     evict_mode = TierAgentState::EVICT_MODE_FULL;
14900     evict_effort = 1000000;
14901   } else if (full_micro > evict_target) {
14902     // set effort in [0..1] range based on where we are between
14903     evict_mode = TierAgentState::EVICT_MODE_SOME;
14904     uint64_t over = full_micro - evict_target;
14905     uint64_t span  = 1000000 - evict_target;
14906     evict_effort = std::max(over * 1000000 / span,
14907                             uint64_t(1000000.0 *
14908                                      cct->_conf->osd_agent_min_evict_effort));
14909
14910     // quantize effort to avoid too much reordering in the agent_queue.
14911     uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
14912     ceph_assert(inc > 0);
14913     uint64_t was = evict_effort;
14914     evict_effort -= evict_effort % inc;
14915     if (evict_effort < inc)
14916       evict_effort = inc;
14917     ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
14918     dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
14919   }
14920   }
14921
14922   skip_calc:
14923   bool old_idle = agent_state->is_idle();
14924   if (flush_mode != agent_state->flush_mode) {
14925     dout(5) << __func__ << " flush_mode "
14926             << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14927             << " -> "
14928             << TierAgentState::get_flush_mode_name(flush_mode)
14929             << dendl;
14930     recovery_state.update_stats(
14931       [=](auto &history, auto &stats) {
14932         if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14933           osd->agent_inc_high_count();
14934           stats.stats.sum.num_flush_mode_high = 1;
14935         } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14936           stats.stats.sum.num_flush_mode_low = 1;
14937         }
14938         if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14939           osd->agent_dec_high_count();
14940           stats.stats.sum.num_flush_mode_high = 0;
14941         } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14942           stats.stats.sum.num_flush_mode_low = 0;
14943         }
14944         return false;
14945       });
14946     agent_state->flush_mode = flush_mode;
14947   }
14948   if (evict_mode != agent_state->evict_mode) {
14949     dout(5) << __func__ << " evict_mode "
14950             << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14951             << " -> "
14952             << TierAgentState::get_evict_mode_name(evict_mode)
14953             << dendl;
14954     if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
14955         is_active()) {
14956       if (op)
14957         requeue_op(op);
14958       requeue_ops(waiting_for_flush);
14959       requeue_ops(waiting_for_active);
14960       requeue_ops(waiting_for_readable);
14961       requeue_ops(waiting_for_scrub);
14962       requeue_ops(waiting_for_cache_not_full);
14963       objects_blocked_on_cache_full.clear();
14964       requeued = true;
14965     }
14966     recovery_state.update_stats(
14967       [=](auto &history, auto &stats) {
14968         if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
14969           stats.stats.sum.num_evict_mode_some = 1;
14970         } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
14971           stats.stats.sum.num_evict_mode_full = 1;
14972         }
14973         if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
14974           stats.stats.sum.num_evict_mode_some = 0;
14975         } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
14976           stats.stats.sum.num_evict_mode_full = 0;
14977         }
14978         return false;
14979       });
14980     agent_state->evict_mode = evict_mode;
14981   }
14982   uint64_t old_effort = agent_state->evict_effort;
14983   if (evict_effort != agent_state->evict_effort) {
14984     dout(5) << __func__ << " evict_effort "
14985             << ((float)agent_state->evict_effort / 1000000.0)
14986             << " -> "
14987             << ((float)evict_effort / 1000000.0)
14988             << dendl;
14989     agent_state->evict_effort = evict_effort;
14990   }
14991
14992   // NOTE: we are using evict_effort as a proxy for *all* agent effort
14993   // (including flush).  This is probably fine (they should be
14994   // correlated) but it is not precisely correct.
14995   if (agent_state->is_idle()) {
14996     if (!restart && !old_idle) {
14997       osd->agent_disable_pg(this, old_effort);
14998     }
14999   } else {
15000     if (restart || old_idle) {
15001       osd->agent_enable_pg(this, agent_state->evict_effort);
15002     } else if (old_effort != agent_state->evict_effort) {
15003       osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
15004     }
15005   }
15006   return requeued;
15007 }
15008
15009 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
15010 {
15011   ceph_assert(hit_set);
15012   ceph_assert(temp);
15013   *temp = 0;
15014   if (hit_set->contains(oid))
15015     *temp = 1000000;
15016   unsigned i = 0;
15017   int last_n = pool.info.hit_set_search_last_n;
15018   for (map<time_t,HitSetRef>::reverse_iterator p =
15019        agent_state->hit_set_map.rbegin(); last_n > 0 &&
15020        p != agent_state->hit_set_map.rend(); ++p, ++i) {
15021     if (p->second->contains(oid)) {
15022       *temp += pool.info.get_grade(i);
15023       --last_n;
15024     }
15025   }
15026 }
15027
15028 // Dup op detection
15029
15030 bool PrimaryLogPG::already_complete(eversion_t v)
15031 {
15032   dout(20) << __func__ << ": " << v << dendl;
15033   for (xlist<RepGather*>::iterator i = repop_queue.begin();
15034        !i.end();
15035        ++i) {
15036     dout(20) << __func__ << ": " << **i << dendl;
15037     // skip copy from temp object ops
15038     if ((*i)->v == eversion_t()) {
15039       dout(20) << __func__ << ": " << **i
15040                << " version is empty" << dendl;
15041       continue;
15042     }
15043     if ((*i)->v > v) {
15044       dout(20) << __func__ << ": " << **i
15045                << " (*i)->v past v" << dendl;
15046       break;
15047     }
15048     if (!(*i)->all_committed) {
15049       dout(20) << __func__ << ": " << **i
15050                << " not committed, returning false"
15051                << dendl;
15052       return false;
15053     }
15054   }
15055   dout(20) << __func__ << ": returning true" << dendl;
15056   return true;
15057 }
15058
15059
15060 // ==========================================================================================
15061 // SCRUB
15062
15063 void PrimaryLogPG::do_replica_scrub_map(OpRequestRef op)
15064 {
15065   dout(15) << __func__ << " is scrub active? " << m_scrubber->is_scrub_active() << dendl;
15066   op->mark_started();
15067
15068   if (!m_scrubber->is_scrub_active()) {
15069     dout(10) << __func__ << " scrub isn't active" << dendl;
15070     return;
15071   }
15072   m_scrubber->map_from_replica(op);
15073 }
15074
15075 bool PrimaryLogPG::_range_available_for_scrub(const hobject_t& begin,
15076                                               const hobject_t& end)
15077 {
15078   pair<hobject_t, ObjectContextRef> next;
15079   next.second = object_contexts.lookup(begin);
15080   next.first = begin;
15081   bool more = true;
15082   while (more && next.first < end) {
15083     if (next.second && next.second->is_blocked()) {
15084       next.second->requeue_scrub_on_unblock = true;
15085       dout(10) << __func__ << ": scrub delayed, "
15086                << next.first << " is blocked"
15087                << dendl;
15088       return false;
15089     }
15090     more = object_contexts.get_next(next.first, &next);
15091   }
15092   return true;
15093 }
15094
15095
15096 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
15097 {
15098   OpRequestRef op = ctx->op;
15099   // Only supports replicated pools
15100   ceph_assert(!pool.info.is_erasure());
15101   ceph_assert(is_primary());
15102
15103   dout(10) << __func__ << " " << soid
15104            << " peers osd.{" << get_acting_recovery_backfill() << "}" << dendl;
15105
15106   if (!is_clean()) {
15107     block_for_clean(soid, op);
15108     return -EAGAIN;
15109   }
15110
15111   ceph_assert(!recovery_state.get_pg_log().get_missing().is_missing(soid));
15112   auto& oi = ctx->new_obs.oi;
15113   eversion_t v = oi.version;
15114
15115   if (primary_error(soid, v)) {
15116     dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15117     // XXX: If we knew that there is no down osd which could include this
15118     // object, it would be nice if we could return EIO here.
15119     // If a "never fail" flag was available, that could be used
15120     // for rbd to NOT return EIO until object marked lost.
15121
15122     // Drop through to save this op in case an osd comes up with the object.
15123   }
15124
15125   // Restart the op after object becomes readable again
15126   waiting_for_unreadable_object[soid].push_back(op);
15127   op->mark_delayed("waiting for missing object");
15128
15129   ceph_assert(is_clean());
15130   state_set(PG_STATE_REPAIR);
15131   state_clear(PG_STATE_CLEAN);
15132   queue_peering_event(
15133       PGPeeringEventRef(
15134         std::make_shared<PGPeeringEvent>(
15135         get_osdmap_epoch(),
15136         get_osdmap_epoch(),
15137         PeeringState::DoRecovery())));
15138
15139   return -EAGAIN;
15140 }
15141
15142 /*---SnapTrimmer Logging---*/
15143 #undef dout_prefix
15144 #define dout_prefix pg->gen_prefix(*_dout)
15145
15146 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15147 {
15148   ldout(pg->cct, 20) << "enter " << state_name << dendl;
15149 }
15150
15151 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15152 {
15153   ldout(pg->cct, 20) << "exit " << state_name << dendl;
15154 }
15155
15156 bool PrimaryLogPG::SnapTrimmer::permit_trim() {
15157   return
15158     pg->is_clean() &&
15159     !pg->m_scrubber->is_scrub_active() &&
15160     !pg->snap_trimq.empty();
15161 }
15162
15163 /*---SnapTrimmer states---*/
15164 #undef dout_prefix
15165 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15166                      << "SnapTrimmer state<" << get_state_name() << ">: ")
15167
15168 /* NotTrimming */
15169 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15170   : my_base(ctx),
15171     NamedState(nullptr, "NotTrimming")
15172 {
15173   context< SnapTrimmer >().log_enter(state_name);
15174 }
15175
15176 void PrimaryLogPG::NotTrimming::exit()
15177 {
15178   context< SnapTrimmer >().log_exit(state_name, enter_time);
15179 }
15180
15181 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15182 {
15183   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15184   ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15185
15186   if (!(pg->is_primary() && pg->is_active())) {
15187     ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15188     return discard_event();
15189   }
15190   if (!pg->is_clean() ||
15191       pg->snap_trimq.empty()) {
15192     ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15193     return discard_event();
15194   }
15195   if (pg->m_scrubber->is_scrub_active()) {
15196     ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
15197     return transit< WaitScrub >();
15198   } else {
15199     return transit< Trimming >();
15200   }
15201 }
15202
15203 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15204 {
15205   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15206   ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15207
15208   pending = nullptr;
15209   if (!context< SnapTrimmer >().can_trim()) {
15210     post_event(KickTrim());
15211     return transit< NotTrimming >();
15212   }
15213
15214   context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15215   ldout(pg->cct, 10) << "NotTrimming: trimming "
15216                      << pg->snap_trimq.range_start()
15217                      << dendl;
15218   return transit< AwaitAsyncWork >();
15219 }
15220
15221 /* AwaitAsyncWork */
15222 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15223   : my_base(ctx),
15224     NamedState(nullptr, "Trimming/AwaitAsyncWork")
15225 {
15226   auto *pg = context< SnapTrimmer >().pg;
15227   context< SnapTrimmer >().log_enter(state_name);
15228   context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15229   pg->state_set(PG_STATE_SNAPTRIM);
15230   pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
15231   pg->publish_stats_to_osd();
15232 }
15233
15234 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15235 {
15236   PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15237   snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15238   auto &in_flight = context<Trimming>().in_flight;
15239   ceph_assert(in_flight.empty());
15240
15241   ceph_assert(pg->is_primary() && pg->is_active());
15242   if (!context< SnapTrimmer >().can_trim()) {
15243     ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15244     post_event(KickTrim());
15245     return transit< NotTrimming >();
15246   }
15247
15248   ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15249
15250   vector<hobject_t> to_trim;
15251   unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15252   to_trim.reserve(max);
15253   int r = pg->snap_mapper.get_next_objects_to_trim(
15254     snap_to_trim,
15255     max,
15256     &to_trim);
15257   if (r != 0 && r != -ENOENT) {
15258     lderr(pg->cct) << "get_next_objects_to_trim returned "
15259                    << cpp_strerror(r) << dendl;
15260     ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15261   } else if (r == -ENOENT) {
15262     // Done!
15263     ldout(pg->cct, 10) << "got ENOENT" << dendl;
15264
15265     pg->snap_trimq.erase(snap_to_trim);
15266
15267     if (pg->snap_trimq_repeat.count(snap_to_trim)) {
15268       ldout(pg->cct, 10) << " removing from snap_trimq_repeat" << dendl;
15269       pg->snap_trimq_repeat.erase(snap_to_trim);
15270     } else {
15271       ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15272                          << " to purged_snaps"
15273                          << dendl;
15274       ObjectStore::Transaction t;
15275       pg->recovery_state.adjust_purged_snaps(
15276         [snap_to_trim](auto &purged_snaps) {
15277           purged_snaps.insert(snap_to_trim);
15278         });
15279       pg->write_if_dirty(t);
15280
15281       ldout(pg->cct, 10) << "purged_snaps now "
15282                          << pg->info.purged_snaps << ", snap_trimq now "
15283                          << pg->snap_trimq << dendl;
15284
15285       int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15286       ceph_assert(tr == 0);
15287
15288       pg->recovery_state.share_pg_info();
15289     }
15290     post_event(KickTrim());
15291     return transit< NotTrimming >();
15292   }
15293   ceph_assert(!to_trim.empty());
15294
15295   for (auto &&object: to_trim) {
15296     // Get next
15297     ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
15298     OpContextUPtr ctx;
15299     int error = pg->trim_object(in_flight.empty(), object, snap_to_trim, &ctx);
15300     if (error) {
15301       if (error == -ENOLCK) {
15302         ldout(pg->cct, 10) << "could not get write lock on obj "
15303                            << object << dendl;
15304       } else {
15305         pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15306         ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15307       }
15308       if (!in_flight.empty()) {
15309         ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15310         return transit< WaitRepops >();
15311       }
15312       if (error == -ENOLCK) {
15313         ldout(pg->cct, 10) << "waiting for it to clear"
15314                            << dendl;
15315         return transit< WaitRWLock >();
15316       } else {
15317         return transit< NotTrimming >();
15318       }
15319     }
15320
15321     in_flight.insert(object);
15322     ctx->register_on_success(
15323       [pg, object, &in_flight]() {
15324         ceph_assert(in_flight.find(object) != in_flight.end());
15325         in_flight.erase(object);
15326         if (in_flight.empty()) {
15327           if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15328             pg->snap_trimmer_machine.process_event(Reset());
15329           } else {
15330             pg->snap_trimmer_machine.process_event(RepopsComplete());
15331           }
15332         }
15333       });
15334
15335     pg->simple_opc_submit(std::move(ctx));
15336   }
15337
15338   return transit< WaitRepops >();
15339 }
15340
15341 void PrimaryLogPG::setattr_maybe_cache(
15342   ObjectContextRef obc,
15343   PGTransaction *t,
15344   const string &key,
15345   bufferlist &val)
15346 {
15347   t->setattr(obc->obs.oi.soid, key, val);
15348 }
15349
15350 void PrimaryLogPG::setattrs_maybe_cache(
15351   ObjectContextRef obc,
15352   PGTransaction *t,
15353   map<string, bufferlist> &attrs)
15354 {
15355   t->setattrs(obc->obs.oi.soid, attrs);
15356 }
15357
15358 void PrimaryLogPG::rmattr_maybe_cache(
15359   ObjectContextRef obc,
15360   PGTransaction *t,
15361   const string &key)
15362 {
15363   t->rmattr(obc->obs.oi.soid, key);
15364 }
15365
15366 int PrimaryLogPG::getattr_maybe_cache(
15367   ObjectContextRef obc,
15368   const string &key,
15369   bufferlist *val)
15370 {
15371   if (pool.info.is_erasure()) {
15372     map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15373     if (i != obc->attr_cache.end()) {
15374       if (val)
15375         *val = i->second;
15376       return 0;
15377     } else {
15378       return -ENODATA;
15379     }
15380   }
15381   return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15382 }
15383
15384 int PrimaryLogPG::getattrs_maybe_cache(
15385   ObjectContextRef obc,
15386   map<string, bufferlist> *out)
15387 {
15388   int r = 0;
15389   ceph_assert(out);
15390   if (pool.info.is_erasure()) {
15391     *out = obc->attr_cache;
15392   } else {
15393     r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15394   }
15395   map<string, bufferlist> tmp;
15396   for (map<string, bufferlist>::iterator i = out->begin();
15397        i != out->end();
15398        ++i) {
15399     if (i->first.size() > 1 && i->first[0] == '_')
15400       tmp[i->first.substr(1, i->first.size())] = std::move(i->second);
15401   }
15402   tmp.swap(*out);
15403   return r;
15404 }
15405
15406 bool PrimaryLogPG::check_failsafe_full() {
15407     return osd->check_failsafe_full(get_dpp());
15408 }
15409
15410 bool PrimaryLogPG::maybe_preempt_replica_scrub(const hobject_t& oid)
15411 {
15412   return m_scrubber->write_blocked_by_scrub(oid);
15413 }
15414
15415 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15416 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15417
15418 #ifdef PG_DEBUG_REFS
15419 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15420 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15421 #endif
15422
15423 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15424 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }