// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/ceph_assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return pg->gen_prefix(*_dout);
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

/**
 * The CopyCallback class defines an interface for completions to the
 * start_copy code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  ~CopyCallback() override {}
};
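
/*
 * A minimal sketch of a CopyCallback implementation (LogOnlyCopyCb is a
 * hypothetical name, not upstream code); the CopyResults fields it reads
 * (object_size, user_version) are the ones CopyFromCallback below uses.
 *
 *   class LogOnlyCopyCb : public PrimaryLogPG::CopyCallback {
 *     void finish(PrimaryLogPG::CopyCallbackResults results) override {
 *       int r = results.get<0>();               // 0, -ECANCELED, or -errno
 *       PrimaryLogPG::CopyResults *cr = results.get<1>();
 *       if (r == 0) {
 *         // consume cr->object_size, cr->user_version, ...
 *       }
 *       delete cr;  // per the contract above, the callback owns cr
 *     }
 *   };
 */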

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

template <typename T>
class PrimaryLogPG::UnlockedBlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  UnlockedBlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
  }
  bool sync_finish(T t) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(t);
    return true;
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_unlocked_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new UnlockedBlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
  bool sync_finish(int r) {
    // we assume here all blessed/wrapped Contexts can complete synchronously.
    c.release()->complete(r);
    return true;
  }
};

Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap_epoch());
}
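
/*
 * Usage sketch for the Blessed* wrappers (C_MyCompletion is illustrative):
 * wrap a deferred completion so it is silently dropped, rather than run,
 * if the PG has been reset (e.g. a new interval) since the wrapper was
 * created.
 *
 *   Context *fin = new C_MyCompletion(...);
 *   t->register_on_commit(bless_context(fin));  // fires only if
 *                                               // !pg_has_reset_since(e)
 *
 * BlessedContext and BlessedGenContext take the PG lock around the check
 * and completion; UnlockedBlessedGenContext performs the same epoch check
 * without touching the lock.
 */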

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object(obc);
    return true;
  }
  void finish(int r) override {
    pg->lock();
    pg->_applied_recovered_object(obc);
    pg->unlock();
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  bool sync_finish(int r) override {
    pg->_applied_recovered_object_replica();
    return true;
  }
  void finish(int r) override {
    pg->lock();
    pg->_applied_recovered_object_replica();
    pg->unlock();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  ceph_assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    ceph_assert(pg->in_progress_async_reads.size());
    ceph_assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
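
/*
 * Sketch of the async-read flow (local names are illustrative): an op
 * handler queues (offset, length, op_flags) triples, each with an output
 * bufferlist and an optional per-read completion, on pending_async_reads;
 * start_async_reads() then hands the whole batch to the backend.
 *
 *   ctx->pending_async_reads.push_back(
 *     make_pair(
 *       boost::make_tuple(off, len, flags),
 *       make_pair(&osd_op.outdata, static_cast<Context*>(nullptr))));
 *   ctx->start_async_reads(pg);
 *   // OnReadComplete -> finish_read() -> execute_ctx(ctx) restarts the
 *   // op once the batch completes.
 */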

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  explicit CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss << dendl;
    auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
    if (p != recovery_info.ss.clone_snaps.end()) {
      snaps.insert(p->second.begin(), p->second.end());
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    } else {
      derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
    }
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    ceph_assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      encode(recovery_info.oi, bl,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ceph_assert(!pool.info.is_erasure());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race. If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;

      bool got = obc->get_recovery_read();
      ceph_assert(got);

      ceph_assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    ceph_assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  ceph_assert(i != recovering.end());

  if (i->second && i->second->rwstate.recovery_read_marker) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    ceph_assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
  missing_loc.add_location(soid, peer);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->queue_recovery_context(this, c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}

void PrimaryLogPG::backfill_add_missing(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

bool PrimaryLogPG::should_send_op(
  pg_shard_t peer,
  const hobject_t &hoid) {
  if (peer == get_primary())
    return true;
  ceph_assert(peer_info.count(peer));
  bool should_send =
    hoid.pool != (int64_t)info.pgid.pool() ||
    hoid <= last_backfill_started ||
    hoid <= peer_info[peer].last_backfill;
  if (!should_send) {
    ceph_assert(is_backfill_targets(peer));
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " beyond std::max(last_backfill_started "
             << ", peer_info[peer].last_backfill "
             << peer_info[peer].last_backfill << ")" << dendl;
    return should_send;
  }
  if (async_recovery_targets.count(peer) && peer_missing[peer].is_missing(hoid)) {
    should_send = false;
    dout(10) << __func__ << " issue_repop shipping empty opt to osd." << peer
             << ", object " << hoid
             << " which is pending recovery in async_recovery_targets" << dendl;
  }
  return should_send;
}
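
/*
 * Worked example (hypothetical objects A < B < C in this pool): say a
 * backfill peer's last_backfill is A and the primary's
 * last_backfill_started is B. An op on A is sent normally; an op between
 * A and B is sent because the primary has already started backfilling
 * past it; an op on C, beyond both watermarks, ships only an empty repop
 * so the peer's log and version state stay in step without data for an
 * object it does not yet have.
 */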


ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  bool work_started = false;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h, &work_started);
    } else {
      prep_object_replica_pushes(soid, v, h, &work_started);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  ceph_assert(!acting_recovery_backfill.empty());
  for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
       i != acting_recovery_backfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    // If an object is missing on an async_recovery_target, return false.
    // This will not block the op and the object is async recovered later.
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      if (async_recovery_targets.count(peer))
        continue;
      else
        return true;
    }
    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

bool PrimaryLogPG::is_degraded_on_async_recovery_target(const hobject_t& soid)
{
  for (auto &i: async_recovery_targets) {
    auto peer_missing_entry = peer_missing.find(i);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid)) {
      dout(30) << __func__ << " " << soid << dendl;
      return true;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  ceph_assert(is_degraded_or_backfilling_object(soid) || is_degraded_on_async_recovery_target(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(oid.is_head());
  ceph_assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  ceph_assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_head(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = pg_log.get_log().head.version;
  hobject_t soid;
  if (!pg_log.get_missing().get_rmissing().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  ceph_assert(!acting_recovery_backfill.empty());
  for (set<pg_shard_t>::iterator it = acting_recovery_backfill.begin();
       it != acting_recovery_backfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    auto it_missing = peer_missing.find(peer);
    if (it_missing != peer_missing.end() &&
        !it_missing->second.get_rmissing().empty()) {
      const auto& min_obj = peer_missing[peer].get_rmissing().begin();
      dout(20) << __func__ << " peer " << peer << " min_version " << min_obj->first
               << " oid " << min_obj->second << dendl;
      if (min_version > min_obj->first) {
        min_version = min_obj->first;
        soid = min_obj->second;
      }
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
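
/*
 * Worked example (hypothetical config values): with
 * osd_max_pg_log_entries = 3000 and
 * osd_force_recovery_pg_log_entries_factor = 1.3, forced recovery kicks
 * in once the approximate log size exceeds 3900 entries. The oldest
 * missing object across the primary and its acting_recovery_backfill
 * peers is then recovered first so the log can be trimmed again.
 */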

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::const_iterator &params) override
  {
    try {
      decode(xattr, params);
      decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  explicit PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::const_iterator &params) override
  {
    try {
      decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  auto iter = xattr_data.cbegin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::const_iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      ceph_assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  ceph_assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}
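
/*
 * Client-side sketch of the filter payload consumed above (all values
 * illustrative): the stream carries the filter type followed by
 * type-specific parameters; for the built-in "plain" filter those are an
 * xattr name and the exact value it must contain.
 *
 *   bufferlist params;
 *   encode(std::string("plain"), params);  // or "parent", or "cls.filt"
 *   encode(std::string("_key"), params);   // xattr to read (hypothetical)
 *   encode(std::string("value"), params);  // required contents (hypothetical)
 *
 * A dotted type such as "cls.filt" (hypothetical) is resolved by opening
 * object class "cls" and asking it for a filter named "filt".
 */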


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    f->dump_unsigned("epoch", get_osdmap_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!async_recovery_targets.empty()) {
      f->open_array_section("async_recovery_targets");
      for (set<pg_shard_t>::iterator p = async_recovery_targets.begin();
           p != async_recovery_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!acting_recovery_backfill.empty()) {
      f->open_array_section("acting_recovery_backfill");
      for (set<pg_shard_t>::iterator p = acting_recovery_backfill.begin();
           p != acting_recovery_backfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.is_erasure()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    ceph_assert(mode == pg_log_entry_t::LOST_REVERT ||
                mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_unfound") {
    hobject_t offset;
    string offset_json;
    bool show_offset = false;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
      show_offset = true;
    }
    f->open_object_section("missing");
    if (show_offset) {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    auto &needs_recovery_map = missing_loc.get_needs_recovery();
    f->dump_int("num_missing", needs_recovery_map.size());
    f->dump_int("num_unfound", get_num_unfound());
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << command;
  return -EINVAL;
}
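
/*
 * The handlers above back the per-PG admin commands, commonly invoked as
 * (CLI spellings given for orientation):
 *
 *   ceph pg <pgid> query
 *   ceph pg <pgid> list_unfound
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 *
 * "revert" is refused for erasure-coded pools, and nothing is marked
 * lost until all possible sources of the unfound objects have been
 * probed or are themselves known to be lost.
 */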

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    auto bp = p->indata.cbegin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0;  // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size
                 << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash()
                   << std::dec << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        encode(response, osd_op.outdata);
        if (filter)
          encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        decode(cname, bp);
        decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      ceph_assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0;  // hmm?
      } else {
        unsigned list_size = std::min<uint64_t>(cct->_conf->osd_max_pgls,
                                                p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        ceph_assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            ceph_assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            ceph_assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        encode(response, osd_op.outdata);
        if (filter)
          encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL;  // hmm?
  }
  auto bp = osd_op->indata.cbegin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  encode(result, osd_op->outdata);
  return r;
}

void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = std::min(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = std::min(pg_log.get_log().approx_size() - target,
                                  cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
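
/*
 * Worked example (hypothetical numbers): with target = 3000 entries and
 * an approximate log size of 3500, num_to_trim = min(500,
 * osd_pg_log_trim_max). If that falls below osd_pg_log_trim_min the trim
 * is deferred so work is batched; otherwise we walk num_to_trim entries
 * from the oldest end and trim to that version, capped at `limit`
 * (min_last_complete_ondisk, itself bounded by can_rollback_to) so
 * entries that may still be needed for rollback or by a lagging replica
 * are never trimmed.
 */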

void PrimaryLogPG::calc_trim_to_aggressive()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }
  // limit pg log trimming up to the can_rollback_to value
  eversion_t limit = std::min(
    pg_log.get_head(),
    pg_log.get_can_rollback_to());
  dout(10) << __func__ << " limit = " << limit << dendl;

  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    dout(10) << __func__ << " approx pg log length = "
             << pg_log.get_log().approx_size() << dendl;
    uint64_t num_to_trim = std::min<uint64_t>(pg_log.get_log().approx_size() - target,
                                              cct->_conf->osd_pg_log_trim_max);
    dout(10) << __func__ << " num_to_trim = " << num_to_trim << dendl;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    auto it = pg_log.get_log().log.begin();  // oldest log entry
    auto rit = pg_log.get_log().log.rbegin();
    eversion_t by_n_to_keep;                      // start from tail
    eversion_t by_n_to_trim = eversion_t::max();  // start from head
    for (size_t i = 0; it != pg_log.get_log().log.end(); ++it, ++rit) {
      i++;
      if (i > target && by_n_to_keep == eversion_t()) {
        by_n_to_keep = rit->version;
      }
      if (i >= num_to_trim && by_n_to_trim == eversion_t::max()) {
        by_n_to_trim = it->version;
      }
      if (by_n_to_keep != eversion_t() &&
          by_n_to_trim != eversion_t::max()) {
        break;
      }
    }

    if (by_n_to_keep == eversion_t()) {
      return;
    }

    pg_trim_to = std::min({by_n_to_keep, by_n_to_trim, limit});
    dout(10) << __func__ << " pg_trim_to now " << pg_trim_to << dendl;
    ceph_assert(pg_trim_to <= pg_log.get_head());
  }
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool,
                           const map<string,string>& ec_profile, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, ec_profile, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
  if (!session)
    return;  // drop it.
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}
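
/*
 * Example (hypothetical ranges): if a client acks a backoff over [A, Z)
 * but this PG currently spans only [D, M), the ack is clamped to [D, M)
 * before Session::ack_backoff is called, so that (for instance, after a
 * PG split) each PG releases only the portion of the backoff it owns.
 */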
1755
1756 void PrimaryLogPG::do_request(
1757 OpRequestRef& op,
1758 ThreadPool::TPHandle &handle)
1759 {
1760 if (op->osd_trace) {
1761 op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1762 op->pg_trace.event("do request");
1763 }
1764 // make sure we have a new enough map
1765 auto p = waiting_for_map.find(op->get_source());
1766 if (p != waiting_for_map.end()) {
1767 // preserve ordering
1768 dout(20) << __func__ << " waiting_for_map "
1769 << p->first << " not empty, queueing" << dendl;
1770 p->second.push_back(op);
1771 op->mark_delayed("waiting_for_map not empty");
1772 return;
1773 }
1774 if (!have_same_or_newer_map(op->min_epoch)) {
1775 dout(20) << __func__ << " min " << op->min_epoch
1776 << ", queue on waiting_for_map " << op->get_source() << dendl;
1777 waiting_for_map[op->get_source()].push_back(op);
1778 op->mark_delayed("op must wait for map");
1779 osd->request_osdmap_update(op->min_epoch);
1780 return;
1781 }
1782
1783 if (can_discard_request(op)) {
1784 return;
1785 }
1786
1787 // pg-wide backoffs
1788 const Message *m = op->get_req();
1789 int msg_type = m->get_type();
1790 if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1791 SessionRef session{static_cast<Session*>(m->get_connection()->get_priv().get())};
1792 if (!session)
1793 return; // drop it.
1794
1795 if (msg_type == CEPH_MSG_OSD_OP) {
1796 if (session->check_backoff(cct, info.pgid,
1797 info.pgid.pgid.get_hobj_start(), m)) {
1798 return;
1799 }
1800
1801 bool backoff =
1802 is_down() ||
1803 is_incomplete() ||
1804 (!is_active() && is_peered());
1805 if (g_conf()->osd_backoff_on_peering && !backoff) {
1806 if (is_peering()) {
1807 backoff = true;
1808 }
1809 }
1810 if (backoff) {
1811 add_pg_backoff(session);
1812 return;
1813 }
1814 }
1815 // pg backoff acks at pg-level
1816 if (msg_type == CEPH_MSG_OSD_BACKOFF) {
1817 const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1818 if (ba->begin != ba->end) {
1819 handle_backoff(op);
1820 return;
1821 }
1822 }
1823 }
1824
1825 if (!is_peered()) {
1826 // Delay unless PGBackend says it's ok
1827 if (pgbackend->can_handle_while_inactive(op)) {
1828 bool handled = pgbackend->handle_message(op);
1829 ceph_assert(handled);
1830 return;
1831 } else {
1832 waiting_for_peered.push_back(op);
1833 op->mark_delayed("waiting for peered");
1834 return;
1835 }
1836 }
1837
1838 if (flushes_in_progress > 0) {
1839 dout(20) << flushes_in_progress
1840 << " flushes_in_progress pending "
1841 << "waiting for flush on " << op << dendl;
1842 waiting_for_flush.push_back(op);
1843 op->mark_delayed("waiting for flush");
1844 return;
1845 }
1846
1847 ceph_assert(is_peered() && flushes_in_progress == 0);
1848 if (pgbackend->handle_message(op))
1849 return;
1850
1851 switch (msg_type) {
1852 case CEPH_MSG_OSD_OP:
1853 case CEPH_MSG_OSD_BACKOFF:
1854 if (!is_active()) {
1855 dout(20) << " peered, not active, waiting for active on " << op << dendl;
1856 waiting_for_active.push_back(op);
1857 op->mark_delayed("waiting for active");
1858 return;
1859 }
1860 switch (msg_type) {
1861 case CEPH_MSG_OSD_OP:
1862 // verify client features
1863 if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1864 !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1865 osd->reply_op_error(op, -EOPNOTSUPP);
1866 return;
1867 }
1868 do_op(op);
1869 break;
1870 case CEPH_MSG_OSD_BACKOFF:
1871 // object-level backoff acks handled in osdop context
1872 handle_backoff(op);
1873 break;
1874 }
1875 break;
1876
1877 case MSG_OSD_PG_SCAN:
1878 do_scan(op, handle);
1879 break;
1880
1881 case MSG_OSD_PG_BACKFILL:
1882 do_backfill(op);
1883 break;
1884
1885 case MSG_OSD_PG_BACKFILL_REMOVE:
1886 do_backfill_remove(op);
1887 break;
1888
1889 case MSG_OSD_SCRUB_RESERVE:
1890 {
1891 const MOSDScrubReserve *m =
1892 static_cast<const MOSDScrubReserve*>(op->get_req());
1893 switch (m->type) {
1894 case MOSDScrubReserve::REQUEST:
1895 handle_scrub_reserve_request(op);
1896 break;
1897 case MOSDScrubReserve::GRANT:
1898 handle_scrub_reserve_grant(op, m->from);
1899 break;
1900 case MOSDScrubReserve::REJECT:
1901 handle_scrub_reserve_reject(op, m->from);
1902 break;
1903 case MOSDScrubReserve::RELEASE:
1904 handle_scrub_reserve_release(op);
1905 break;
1906 }
1907 }
1908 break;
1909
1910 case MSG_OSD_REP_SCRUB:
1911 replica_scrub(op, handle);
1912 break;
1913
1914 case MSG_OSD_REP_SCRUBMAP:
1915 do_replica_scrub_map(op);
1916 break;
1917
1918 case MSG_OSD_PG_UPDATE_LOG_MISSING:
1919 do_update_log_missing(op);
1920 break;
1921
1922 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1923 do_update_log_missing_reply(op);
1924 break;
1925
1926 default:
1927 ceph_abort_msg("bad message type in do_request");
1928 }
1929 }
1930
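// Return the lowest last_backfill bound among all backfill targets;
// everything up to that bound has already been backfilled on every
// target, so backfill must resume from here.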
1931 hobject_t PrimaryLogPG::earliest_backfill() const
1932 {
1933 hobject_t e = hobject_t::get_max();
1934 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
1935 i != backfill_targets.end();
1936 ++i) {
1937 pg_shard_t bt = *i;
1938 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
1939 ceph_assert(iter != peer_info.end());
1940 if (iter->second.last_backfill < e)
1941 e = iter->second.last_backfill;
1942 }
1943 return e;
1944 }
1945
1946 /** do_op - do an op
1947 * pg lock will be held (if multithreaded)
1948 * osd_lock NOT held.
1949 */
1950 void PrimaryLogPG::do_op(OpRequestRef& op)
1951 {
1952 FUNCTRACE(cct);
1953 // NOTE: take a non-const pointer here; we must be careful not to
1954 // change anything that will break other reads on m (operator<<).
1955 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1956 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
1957 if (m->finish_decode()) {
1958 op->reset_desc(); // for TrackedOp
1959 m->clear_payload();
1960 }
1961
1962 dout(20) << __func__ << ": op " << *m << dendl;
1963
1964 hobject_t head = m->get_hobj();
1965 head.snap = CEPH_NOSNAP;
1966
1967 if (!info.pgid.pgid.contains(
1968 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1969 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1970 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1971 << std::hex << head.get_hash() << std::dec << dendl;
1972 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1973 << " op " << *m;
1974 ceph_assert(!cct->_conf->osd_debug_misdirected_ops);
1975 return;
1976 }
1977
1978 bool can_backoff =
1979 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1980 SessionRef session;
1981 if (can_backoff) {
1982 session = static_cast<Session*>(m->get_connection()->get_priv().get());
1983 if (!session.get()) {
1984 dout(10) << __func__ << " no session" << dendl;
1985 return;
1986 }
1987
1988 if (session->check_backoff(cct, info.pgid, head, m)) {
1989 return;
1990 }
1991 }
1992
1993 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1994 // not implemented.
1995 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1996 osd->reply_op_error(op, -EINVAL);
1997 return;
1998 }
1999
2000 if (op->rmw_flags == 0) {
2001 int r = osd->osd->init_op_flags(op);
2002 if (r) {
2003 osd->reply_op_error(op, r);
2004 return;
2005 }
2006 }
2007
2008 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
2009 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
2010 op->may_read() &&
2011 !(op->may_write() || op->may_cache())) {
2012 // balanced reads; any replica will do
2013 if (!(is_primary() || is_replica())) {
2014 osd->handle_misdirected_op(this, op);
2015 return;
2016 }
2017 } else {
2018 // normal case; must be primary
2019 if (!is_primary()) {
2020 osd->handle_misdirected_op(this, op);
2021 return;
2022 }
2023 }
2024
2025 if (!op_has_sufficient_caps(op)) {
2026 osd->reply_op_error(op, -EPERM);
2027 return;
2028 }
2029
2030 if (op->includes_pg_op()) {
2031 return do_pg_op(op);
2032 }
2033
2034 // object name too long?
2035 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
2036 dout(4) << "do_op name is longer than "
2037 << cct->_conf->osd_max_object_name_len
2038 << " bytes" << dendl;
2039 osd->reply_op_error(op, -ENAMETOOLONG);
2040 return;
2041 }
2042 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
2043 dout(4) << "do_op locator is longer than "
2044 << cct->_conf->osd_max_object_name_len
2045 << " bytes" << dendl;
2046 osd->reply_op_error(op, -ENAMETOOLONG);
2047 return;
2048 }
2049 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
2050 dout(4) << "do_op namespace is longer than "
2051 << cct->_conf->osd_max_object_namespace_len
2052 << " bytes" << dendl;
2053 osd->reply_op_error(op, -ENAMETOOLONG);
2054 return;
2055 }
2056
2057 if (int r = osd->store->validate_hobject_key(head)) {
2058 dout(4) << "do_op object " << head << " invalid for backing store: "
2059 << r << dendl;
2060 osd->reply_op_error(op, r);
2061 return;
2062 }
2063
2064 // blacklisted?
2065 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
2066 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
2067 osd->reply_op_error(op, -EBLACKLISTED);
2068 return;
2069 }
2070
2071 // order this op as a write?
2072 bool write_ordered = op->rwordered();
2073
2074 // discard due to cluster full transition? (we discard any op that
2075 // originates before the cluster or pool is marked full; the client
2076 // will resend after the full flag is removed or if they expect the
2077 // op to succeed despite being full). The exceptions are FULL_FORCE and
2078 // FULL_TRY ops, which there is no reason to discard because they
2079 // bypass all full checks anyway. If this op isn't write-ordered,
2080 // we skip this check entirely.
2081 // FIXME: we exclude mds writes for now.
2082 if (write_ordered && !(m->get_source().is_mds() ||
2083 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
2084 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
2085 info.history.last_epoch_marked_full > m->get_map_epoch()) {
2086 dout(10) << __func__ << " discarding op sent before full " << m << " "
2087 << *m << dendl;
2088 return;
2089 }
2090 // The mds should have stopped writing before this point.
2091 // We can't allow the OSD to become non-startable even if the mds
2092 // is still writing as part of file removals.
2093 if (write_ordered && osd->check_failsafe_full(get_dpp()) &&
2094 !m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
2095 dout(10) << __func__ << " fail-safe full check failed, dropping request." << dendl;
2096 return;
2097 }
2098 int64_t poolid = get_pgid().pool();
2099 if (op->may_write()) {
2100
2101 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2102 if (!pi) {
2103 return;
2104 }
2105
2106 // invalid?
2107 if (m->get_snapid() != CEPH_NOSNAP) {
2108 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2109 osd->reply_op_error(op, -EINVAL);
2110 return;
2111 }
2112
2113 // too big?
2114 if (cct->_conf->osd_max_write_size &&
2115 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2116 // journal can't hold commit!
2117 derr << "do_op msg data len " << m->get_data_len()
2118 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2119 << " on " << *m << dendl;
2120 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2121 return;
2122 }
2123 }
2124
2125 dout(10) << "do_op " << *m
2126 << (op->may_write() ? " may_write" : "")
2127 << (op->may_read() ? " may_read" : "")
2128 << (op->may_cache() ? " may_cache" : "")
2129 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2130 << " flags " << ceph_osd_flag_string(m->get_flags())
2131 << dendl;
2132
2133 // missing object?
2134 if (is_unreadable_object(head)) {
2135 if (!is_primary()) {
2136 osd->reply_op_error(op, -EAGAIN);
2137 return;
2138 }
2139 if (can_backoff &&
2140 (g_conf()->osd_backoff_on_degraded ||
2141 (g_conf()->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2142 add_backoff(session, head, head);
2143 maybe_kick_recovery(head);
2144 } else {
2145 wait_for_unreadable_object(head, op);
2146 }
2147 return;
2148 }
2149
2150 if (write_ordered) {
2151 // degraded object?
2152 if (is_degraded_or_backfilling_object(head)) {
2153 if (can_backoff && g_conf()->osd_backoff_on_degraded) {
2154 add_backoff(session, head, head);
2155 maybe_kick_recovery(head);
2156 } else {
2157 wait_for_degraded_object(head, op);
2158 }
2159 return;
2160 }
2161
2162 if (scrubber.is_chunky_scrub_active() && write_blocked_by_scrub(head)) {
2163 dout(20) << __func__ << ": waiting for scrub" << dendl;
2164 waiting_for_scrub.push_back(op);
2165 op->mark_delayed("waiting for scrub");
2166 return;
2167 }
2168
2169 // blocked on snap?
2170 if (auto blocked_iter = objects_blocked_on_degraded_snap.find(head);
2171 blocked_iter != std::end(objects_blocked_on_degraded_snap)) {
2172 hobject_t to_wait_on(head);
2173 to_wait_on.snap = blocked_iter->second;
2174 wait_for_degraded_object(to_wait_on, op);
2175 return;
2176 }
2177 if (auto blocked_snap_promote_iter = objects_blocked_on_snap_promotion.find(head);
2178 blocked_snap_promote_iter != std::end(objects_blocked_on_snap_promotion)) {
2179 wait_for_blocked_object(blocked_snap_promote_iter->second->obs.oi.soid, op);
2180 return;
2181 }
2182 if (objects_blocked_on_cache_full.count(head)) {
2183 block_write_on_full_cache(head, op);
2184 return;
2185 }
2186 }
2187
2188 // dup/resent?
2189 if (op->may_write() || op->may_cache()) {
2190 // warning: we will get back *a* request for this reqid, but not
2191 // necessarily the most recent. this happens with flush and
2192 // promote ops, but we can't possibly have both in our log where
2193 // the original request is still not stable on disk, so for our
2194 // purposes here it doesn't matter which one we get.
2195 eversion_t version;
2196 version_t user_version;
2197 int return_code = 0;
2198 bool got = check_in_progress_op(
2199 m->get_reqid(), &version, &user_version, &return_code);
2200 if (got) {
2201 dout(3) << __func__ << " dup " << m->get_reqid()
2202 << " version " << version << dendl;
2203 if (already_complete(version)) {
2204 osd->reply_op_error(op, return_code, version, user_version);
2205 } else {
2206 dout(10) << " waiting for " << version << " to commit" << dendl;
2207 // always queue ondisk waiters, so that we can requeue if needed
2208 waiting_for_ondisk[version].emplace_back(op, user_version, return_code);
2209 op->mark_delayed("waiting for ondisk");
2210 }
2211 return;
2212 }
2213 }
2214
2215 ObjectContextRef obc;
2216 bool can_create = op->may_write();
2217 hobject_t missing_oid;
2218
2219 // kludge around the fact that LIST_SNAPS ops are sent with the CEPH_SNAPDIR snapid
2220 hobject_t _oid_head;
2221 if (m->get_snapid() == CEPH_SNAPDIR) {
2222 _oid_head = m->get_hobj().get_head();
2223 }
2224 const hobject_t& oid =
2225 m->get_snapid() == CEPH_SNAPDIR ? _oid_head : m->get_hobj();
2226
2227 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2228 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2229 OSDOp& osd_op = *p;
2230
2231 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS) {
2232 if (m->get_snapid() != CEPH_SNAPDIR) {
2233 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2234 osd->reply_op_error(op, -EINVAL);
2235 return;
2236 }
2237 } else {
2238 if (m->get_snapid() == CEPH_SNAPDIR) {
2239 dout(10) << "non-LIST_SNAPS on snapdir" << dendl;
2240 osd->reply_op_error(op, -EINVAL);
2241 return;
2242 }
2243 }
2244 }
2245
2246 // io blocked on obc?
2247 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2248 maybe_await_blocked_head(oid, op)) {
2249 return;
2250 }
2251
2252 int r = find_object_context(
2253 oid, &obc, can_create,
2254 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2255 &missing_oid);
2256
2257 // LIST_SNAPS needs the ssc too
2258 if (obc &&
2259 m->get_snapid() == CEPH_SNAPDIR &&
2260 !obc->ssc) {
2261 obc->ssc = get_snapset_context(oid, true);
2262 }
2263
2264 if (r == -EAGAIN) {
2265 // If we're not the primary for this PG, we just return -EAGAIN.
2266 // Otherwise, we have to wait for the object.
2267 if (is_primary()) {
2268 // missing the specific snap we need; requeue and wait.
2269 ceph_assert(!op->may_write()); // only happens on a read/cache
2270 wait_for_unreadable_object(missing_oid, op);
2271 return;
2272 }
2273 } else if (r == 0) {
2274 if (is_unreadable_object(obc->obs.oi.soid)) {
2275 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2276 << " is unreadable, waiting" << dendl;
2277 wait_for_unreadable_object(obc->obs.oi.soid, op);
2278 return;
2279 }
2280
2281 // degraded object? (the check above was for head; this could be a clone)
2282 if (write_ordered &&
2283 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2284 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2285 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2286 << " is degraded, waiting" << dendl;
2287 wait_for_degraded_object(obc->obs.oi.soid, op);
2288 return;
2289 }
2290 }
2291
2292 bool in_hit_set = false;
2293 if (hit_set) {
2294 if (obc.get()) {
2295 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2296 in_hit_set = true;
2297 } else {
2298 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2299 in_hit_set = true;
2300 }
2301 if (!op->hitset_inserted) {
2302 hit_set->insert(oid);
2303 op->hitset_inserted = true;
2304 if (hit_set->is_full() ||
2305 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2306 hit_set_persist();
2307 }
2308 }
2309 }
2310
2311 if (agent_state) {
2312 if (agent_choose_mode(false, op))
2313 return;
2314 }
2315
2316 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2317 if (maybe_handle_manifest(op,
2318 write_ordered,
2319 obc))
2320 return;
2321 }
2322
2323 if (maybe_handle_cache(op,
2324 write_ordered,
2325 obc,
2326 r,
2327 missing_oid,
2328 false,
2329 in_hit_set))
2330 return;
2331
2332 if (r && (r != -ENOENT || !obc)) {
2333 // copy the reqids for copy get on ENOENT
2334 if (r == -ENOENT &&
2335 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2336 fill_in_copy_get_noent(op, oid, m->ops[0]);
2337 return;
2338 }
2339 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2340 if (op->may_write() &&
2341 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2342 record_write_error(op, oid, nullptr, r);
2343 } else {
2344 osd->reply_op_error(op, r);
2345 }
2346 return;
2347 }
2348
2349 // make sure locator is consistent
2350 object_locator_t oloc(obc->obs.oi.soid);
2351 if (m->get_object_locator() != oloc) {
2352 dout(10) << " provided locator " << m->get_object_locator()
2353 << " != object's " << obc->obs.oi.soid << dendl;
2354 osd->clog->warn() << "bad locator " << m->get_object_locator()
2355 << " on object " << oloc
2356 << " op " << *m;
2357 }
2358
2359 // io blocked on obc?
2360 if (obc->is_blocked() &&
2361 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2362 wait_for_blocked_object(obc->obs.oi.soid, op);
2363 return;
2364 }
2365
2366 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2367
2368 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2369
2370 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2371 dout(20) << __func__ << ": skipping rw locks" << dendl;
2372 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2373 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2374
2375 // verify there is in fact a flush in progress
2376 // FIXME: we could make this a stronger test.
2377 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2378 if (p == flush_ops.end()) {
2379 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2380 reply_ctx(ctx, -EINVAL);
2381 return;
2382 }
2383 } else if (!get_rw_locks(write_ordered, ctx)) {
2384 dout(20) << __func__ << " waiting for rw locks " << dendl;
2385 op->mark_delayed("waiting for rw locks");
2386 close_op_ctx(ctx);
2387 return;
2388 }
2389 dout(20) << __func__ << " obc " << *obc << dendl;
2390
2391 if (r) {
2392 dout(20) << __func__ << " returned an error: " << r << dendl;
2393 close_op_ctx(ctx);
2394 if (op->may_write() &&
2395 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2396 record_write_error(op, oid, nullptr, r);
2397 } else {
2398 osd->reply_op_error(op, r);
2399 }
2400 return;
2401 }
2402
2403 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2404 ctx->ignore_cache = true;
2405 }
2406
2407 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2408 // This object is lost. Reading from it returns an error.
2409 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2410 << " is lost" << dendl;
2411 reply_ctx(ctx, -ENFILE);
2412 return;
2413 }
2414 if (!op->may_write() &&
2415 !op->may_cache() &&
2416 (!obc->obs.exists ||
2417 ((m->get_snapid() != CEPH_SNAPDIR) &&
2418 obc->obs.oi.is_whiteout()))) {
2419 // copy the reqids for copy get on ENOENT
2420 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2421 fill_in_copy_get_noent(op, oid, m->ops[0]);
2422 close_op_ctx(ctx);
2423 return;
2424 }
2425 reply_ctx(ctx, -ENOENT);
2426 return;
2427 }
2428
2429 op->mark_started();
2430
2431 execute_ctx(ctx);
2432 utime_t prepare_latency = ceph_clock_now();
2433 prepare_latency -= op->get_dequeued_time();
2434 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2435 if (op->may_read() && op->may_write()) {
2436 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2437 } else if (op->may_read()) {
2438 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2439 } else if (op->may_write() || op->may_cache()) {
2440 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2441 }
2442
2443 // force recovery of the oldest missing object if too many logs
2444 maybe_force_recovery();
2445 }
2446
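// Decide how an op on a manifest (redirect or chunked) object should be
// serviced: proxy it to the backing object(s), block it behind recovery,
// scrub, a promotion, or a flush, or fall through to normal processing
// (NOOP). Ops that manipulate the manifest itself (SET_REDIRECT,
// SET_CHUNK, TIER_PROMOTE, UNSET_MANIFEST) are always handled locally.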
2447 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2448 OpRequestRef op,
2449 bool write_ordered,
2450 ObjectContextRef obc)
2451 {
2452 ceph_assert(obc);
2453 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2454 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2455 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2456 return cache_result_t::NOOP;
2457 }
2458
2459 // if it is write-ordered and blocked, stop now
2460 if (obc->is_blocked() && write_ordered) {
2461 // we're already doing something with this object
2462 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2463 return cache_result_t::NOOP;
2464 }
2465
2466 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2467 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2468 OSDOp& osd_op = *p;
2469 ceph_osd_op& op = osd_op.op;
2470 if (op.op == CEPH_OSD_OP_SET_REDIRECT ||
2471 op.op == CEPH_OSD_OP_SET_CHUNK ||
2472 op.op == CEPH_OSD_OP_TIER_PROMOTE ||
2473 op.op == CEPH_OSD_OP_UNSET_MANIFEST) {
2474 return cache_result_t::NOOP;
2475 }
2476 }
2477
2478 switch (obc->obs.oi.manifest.type) {
2479 case object_manifest_t::TYPE_REDIRECT:
2480 if (op->may_write() || write_ordered) {
2481 do_proxy_write(op, obc);
2482 } else {
2483 // the object was already promoted (it has data); serve the read locally
2484 if (obc->obs.oi.size != 0) {
2485 return cache_result_t::NOOP;
2486 }
2487 do_proxy_read(op, obc);
2488 }
2489 return cache_result_t::HANDLED_PROXY;
2490 case object_manifest_t::TYPE_CHUNKED:
2491 {
2492 if (can_proxy_chunked_read(op, obc)) {
2493 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2494 if (p != flush_ops.end()) {
2495 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, true);
2496 return cache_result_t::HANDLED_PROXY;
2497 }
2498 do_proxy_chunked_op(op, obc->obs.oi.soid, obc, write_ordered);
2499 return cache_result_t::HANDLED_PROXY;
2500 }
2501
2502 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2503 ceph_assert(m->get_type() == CEPH_MSG_OSD_OP);
2504 hobject_t head = m->get_hobj();
2505
2506 if (is_degraded_or_backfilling_object(head)) {
2507 dout(20) << __func__ << ": " << head << " is degraded, waiting" << dendl;
2508 wait_for_degraded_object(head, op);
2509 return cache_result_t::BLOCKED_RECOVERY;
2510 }
2511
2512 if (write_blocked_by_scrub(head)) {
2513 dout(20) << __func__ << ": waiting for scrub" << dendl;
2514 waiting_for_scrub.push_back(op);
2515 op->mark_delayed("waiting for scrub");
2516 return cache_result_t::BLOCKED_RECOVERY;
2517 }
2518
2519 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2520 if (p.second.is_missing()) {
2521 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2522 const object_locator_t oloc = m->get_object_locator();
2523 promote_object(obc, obc->obs.oi.soid, oloc, op, NULL);
2524 return cache_result_t::BLOCKED_PROMOTE;
2525 }
2526 }
2527
2528 bool all_dirty = true;
2529 for (auto& p : obc->obs.oi.manifest.chunk_map) {
2530 if (!p.second.is_dirty()) {
2531 all_dirty = false;
2532 }
2533 }
2534 if (all_dirty) {
2535 start_flush(OpRequestRef(), obc, true, NULL, boost::none);
2536 }
2537 return cache_result_t::NOOP;
2538 }
2539 default:
2540 ceph_abort_msg("unrecognized manifest type");
2541 }
2542
2543 return cache_result_t::NOOP;
2544 }
2545
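// Completion for a single chunk write issued by do_manifest_flush().
// Runs on the objecter finisher; it retakes the PG lock and feeds the
// per-chunk result back into handle_manifest_flush().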
2546 struct C_ManifestFlush : public Context {
2547 PrimaryLogPGRef pg;
2548 hobject_t oid;
2549 epoch_t lpr;
2550 ceph_tid_t tid;
2551 utime_t start;
2552 uint64_t offset;
2553 uint64_t last_offset;
2554 C_ManifestFlush(PrimaryLogPG *p, hobject_t o, epoch_t e)
2555 : pg(p), oid(o), lpr(e),
2556 tid(0), start(ceph_clock_now())
2557 {}
2558 void finish(int r) override {
2559 if (r == -ECANCELED)
2560 return;
2561 pg->lock();
2562 pg->handle_manifest_flush(oid, tid, r, offset, last_offset, lpr);
2563 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
2564 pg->unlock();
2565 }
2566 };
2567
2568 void PrimaryLogPG::handle_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2569 uint64_t offset, uint64_t last_offset,
2570 epoch_t lpr)
2571 {
2572 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2573 if (p == flush_ops.end()) {
2574 dout(10) << __func__ << " no flush_op found" << dendl;
2575 return;
2576 }
2577 if (p->second->rval < 0) {
2578 return;
2579 }
2580 p->second->io_results[offset] = r;
2581 for (auto &ior: p->second->io_results) {
2582 if (ior.second < 0) {
2583 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2584 p->second->rval = r;
2585 return;
2586 }
2587 }
2588 if (p->second->chunks == p->second->io_results.size()) {
2589 if (lpr == get_last_peering_reset()) {
2590 ceph_assert(p->second->obc);
2591 finish_manifest_flush(oid, tid, r, p->second->obc, last_offset);
2592 }
2593 }
2594 }
2595
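// Set up a FlushOp for a manifest object and issue the first batch of
// chunk writebacks via do_manifest_flush(); on success the op is
// registered in flush_ops and -EINPROGRESS is returned to the caller.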
2596 int PrimaryLogPG::start_manifest_flush(OpRequestRef op, ObjectContextRef obc, bool blocking,
2597 boost::optional<std::function<void()>> &&on_flush)
2598 {
2599 auto p = obc->obs.oi.manifest.chunk_map.begin();
2600 FlushOpRef manifest_fop(std::make_shared<FlushOp>());
2601 manifest_fop->op = op;
2602 manifest_fop->obc = obc;
2603 manifest_fop->flushed_version = obc->obs.oi.user_version;
2604 manifest_fop->blocking = blocking;
2605 manifest_fop->on_flush = std::move(on_flush);
2606 int r = do_manifest_flush(op, obc, manifest_fop, p->first, blocking);
2607 if (r < 0) {
2608 return r;
2609 }
2610
2611 flush_ops[obc->obs.oi.soid] = manifest_fop;
2612 return -EINPROGRESS;
2613 }
2614
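// Write the dirty chunks of a manifest object back to their targets,
// starting at start_offset. The first loop sizes the batch: dirty chunk
// lengths are accumulated until they exceed get_copy_chunk_size(), and
// last_offset records where the batch ends. Each dirty chunk is then read
// locally and written to its target; when the pool has a fingerprint type
// configured, the chunk is instead stored by content hash through the
// "cas" object class and the old chunk's reference count is dropped.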
2615 int PrimaryLogPG::do_manifest_flush(OpRequestRef op, ObjectContextRef obc, FlushOpRef manifest_fop,
2616 uint64_t start_offset, bool block)
2617 {
2618 struct object_manifest_t &manifest = obc->obs.oi.manifest;
2619 hobject_t soid = obc->obs.oi.soid;
2620 ceph_tid_t tid;
2621 SnapContext snapc;
2622 uint64_t max_copy_size = 0, last_offset = 0;
2623
2624 map<uint64_t, chunk_info_t>::iterator iter = manifest.chunk_map.find(start_offset);
2625 ceph_assert(iter != manifest.chunk_map.end());
2626 for (;iter != manifest.chunk_map.end(); ++iter) {
2627 if (iter->second.is_dirty()) {
2628 last_offset = iter->first;
2629 max_copy_size += iter->second.length;
2630 }
2631 if (get_copy_chunk_size() < max_copy_size) {
2632 break;
2633 }
2634 }
2635
2636 iter = manifest.chunk_map.find(start_offset);
2637 for (;iter != manifest.chunk_map.end(); ++iter) {
2638 if (!iter->second.is_dirty()) {
2639 continue;
2640 }
2641 uint64_t tgt_length = iter->second.length;
2642 uint64_t tgt_offset = iter->second.offset;
2643 hobject_t tgt_soid = iter->second.oid;
2644 object_locator_t oloc(tgt_soid);
2645 ObjectOperation obj_op;
2646 bufferlist chunk_data;
2647 int r = pgbackend->objects_read_sync(
2648 soid, iter->first, tgt_length, 0, &chunk_data);
2649 if (r < 0) {
2650 dout(0) << __func__ << " read fail " << " offset: " << tgt_offset
2651 << " len: " << tgt_length << " r: " << r << dendl;
2652 return r;
2653 }
2654 if (!chunk_data.length()) {
2655 return -ENODATA;
2656 }
2657
2658 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
2659 CEPH_OSD_FLAG_RWORDERED;
2660 tgt_length = chunk_data.length();
2661 pg_pool_t::fingerprint_t fp_algo_t = pool.info.get_fingerprint_type();
2662 if (iter->second.has_reference() &&
2663 fp_algo_t != pg_pool_t::TYPE_FINGERPRINT_NONE) {
2664 switch (fp_algo_t) {
2665 case pg_pool_t::TYPE_FINGERPRINT_SHA1:
2666 {
2667 sha1_digest_t sha1r = chunk_data.sha1();
2668 object_t fp_oid = sha1r.to_str();
2669 bufferlist in;
2670 if (fp_oid != tgt_soid.oid) {
2671 // decrement old chunk's reference count
2672 ObjectOperation dec_op;
2673 cls_chunk_refcount_put_op put_call;
2674 ::encode(put_call, in);
2675 dec_op.call("refcount", "chunk_put", in);
2676 // we don't wait for dec_op's completion; the dedup scrub will repair any missed decrement.
2677 tid = osd->objecter->mutate(
2678 tgt_soid.oid, oloc, dec_op, snapc,
2679 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2680 flags, NULL);
2681 in.clear();
2682 }
2683 tgt_soid.oid = fp_oid;
2684 iter->second.oid = tgt_soid;
2685 // add data op
2686 ceph_osd_op osd_op;
2687 osd_op.extent.offset = 0;
2688 osd_op.extent.length = chunk_data.length();
2689 encode(osd_op, in);
2690 encode(soid, in);
2691 in.append(chunk_data);
2692 obj_op.call("cas", "cas_write_or_get", in);
2693 break;
2694 }
2695 default:
2696 ceph_abort_msg("unrecognized fingerprint type");
2697 break;
2698 }
2699 } else {
2700 obj_op.add_data(CEPH_OSD_OP_WRITE, tgt_offset, tgt_length, chunk_data);
2701 }
2702
2703 C_ManifestFlush *fin = new C_ManifestFlush(this, soid, get_last_peering_reset());
2704 fin->offset = iter->first;
2705 fin->last_offset = last_offset;
2706 manifest_fop->chunks++;
2707
2708 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
2709 tid = osd->objecter->mutate(
2710 tgt_soid.oid, oloc, obj_op, snapc,
2711 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
2712 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]));
2713 fin->tid = tid;
2714 manifest_fop->io_tids[iter->first] = tid;
2715
2716 dout(20) << __func__ << " offset: " << tgt_offset << " len: " << tgt_length
2717 << " oid: " << tgt_soid.oid << " ori oid: " << soid.oid.name
2718 << " tid: " << tid << dendl;
2719 if (last_offset < iter->first) {
2720 break;
2721 }
2722 }
2723
2724 return 0;
2725 }
2726
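// Called once a batch of chunk writes has completed: if dirty chunks
// remain beyond last_offset, issue the next do_manifest_flush() batch;
// otherwise complete the flush as a whole via finish_flush().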
2727 void PrimaryLogPG::finish_manifest_flush(hobject_t oid, ceph_tid_t tid, int r,
2728 ObjectContextRef obc, uint64_t last_offset)
2729 {
2730 dout(10) << __func__ << " " << oid << " tid " << tid
2731 << " " << cpp_strerror(r) << " last_offset: " << last_offset << dendl;
2732 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
2733 if (p == flush_ops.end()) {
2734 dout(10) << __func__ << " no flush_op found" << dendl;
2735 return;
2736 }
2737 map<uint64_t, chunk_info_t>::iterator iter =
2738 obc->obs.oi.manifest.chunk_map.find(last_offset);
2739 ceph_assert(iter != obc->obs.oi.manifest.chunk_map.end());
2740 for (;iter != obc->obs.oi.manifest.chunk_map.end(); ++iter) {
2741 if (iter->second.is_dirty() && last_offset < iter->first) {
2742 do_manifest_flush(p->second->op, obc, p->second, iter->first, p->second->blocking);
2743 return;
2744 }
2745 }
2746 finish_flush(oid, tid, r);
2747 }
2748
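// Persist a pg_log ERROR entry for a failed write so that a resend with
// the same reqid is detected as a dup, and send the error reply to the
// client only once the log entry is durable.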
2749 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2750 MOSDOpReply *orig_reply, int r)
2751 {
2752 dout(20) << __func__ << " r=" << r << dendl;
2753 ceph_assert(op->may_write());
2754 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2755 mempool::osd_pglog::list<pg_log_entry_t> entries;
2756 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2757 get_next_version(), eversion_t(), 0,
2758 reqid, utime_t(), r));
2759
2760 struct OnComplete {
2761 PrimaryLogPG *pg;
2762 OpRequestRef op;
2763 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2764 int r;
2765 OnComplete(
2766 PrimaryLogPG *pg,
2767 OpRequestRef op,
2768 MOSDOpReply *orig_reply,
2769 int r)
2770 : pg(pg), op(op),
2771 orig_reply(orig_reply, false /* take over ref */), r(r)
2772 {}
2773 void operator()() {
2774 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2775 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2776 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2777 MOSDOpReply *reply = orig_reply.detach();
2778 if (reply == nullptr) {
2779 reply = new MOSDOpReply(m, r, pg->get_osdmap_epoch(),
2780 flags, true);
2781 }
2782 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2783 pg->osd->send_message_osd_client(reply, m->get_connection());
2784 }
2785 };
2786
2787 ObcLockManager lock_manager;
2788 submit_log_entries(
2789 entries,
2790 std::move(lock_manager),
2791 boost::optional<std::function<void(void)> >(
2792 OnComplete(this, op, orig_reply, r)),
2793 op,
2794 r);
2795 }
2796
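// Core cache-tier dispatch: based on the pool's cache_mode, decide
// whether to handle the op in this tier (NOOP), proxy it to the base
// pool, redirect the client there, or block it behind a promotion or a
// full cache.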
2797 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2798 OpRequestRef op,
2799 bool write_ordered,
2800 ObjectContextRef obc,
2801 int r, hobject_t missing_oid,
2802 bool must_promote,
2803 bool in_hit_set,
2804 ObjectContextRef *promote_obc)
2805 {
2806 // return quickly if caching is not enabled
2807 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2808 return cache_result_t::NOOP;
2809
2810 if (op &&
2811 op->get_req() &&
2812 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2813 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2814 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2815 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2816 return cache_result_t::NOOP;
2817 }
2818
2819 must_promote = must_promote || op->need_promote();
2820
2821 if (obc)
2822 dout(25) << __func__ << " " << obc->obs.oi << " "
2823 << (obc->obs.exists ? "exists" : "DNE")
2824 << " missing_oid " << missing_oid
2825 << " must_promote " << (int)must_promote
2826 << " in_hit_set " << (int)in_hit_set
2827 << dendl;
2828 else
2829 dout(25) << __func__ << " (no obc)"
2830 << " missing_oid " << missing_oid
2831 << " must_promote " << (int)must_promote
2832 << " in_hit_set " << (int)in_hit_set
2833 << dendl;
2834
2835 // if it is write-ordered and blocked, stop now
2836 if (obc.get() && obc->is_blocked() && write_ordered) {
2837 // we're already doing something with this object
2838 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2839 return cache_result_t::NOOP;
2840 }
2841
2842 if (r == -ENOENT && missing_oid == hobject_t()) {
2843 // we know this object is logically absent (e.g., an undefined clone)
2844 return cache_result_t::NOOP;
2845 }
2846
2847 if (obc.get() && obc->obs.exists) {
2848 osd->logger->inc(l_osd_op_cache_hit);
2849 return cache_result_t::NOOP;
2850 }
2851 if (!is_primary()) {
2852 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2853 osd->reply_op_error(op, -EAGAIN);
2854 return cache_result_t::REPLIED_WITH_EAGAIN;
2855 }
2856
2857 if (missing_oid == hobject_t() && obc.get()) {
2858 missing_oid = obc->obs.oi.soid;
2859 }
2860
2861 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2862 const object_locator_t oloc = m->get_object_locator();
2863
2864 if (op->need_skip_handle_cache()) {
2865 return cache_result_t::NOOP;
2866 }
2867
2868 OpRequestRef promote_op;
2869
2870 switch (pool.info.cache_mode) {
2871 case pg_pool_t::CACHEMODE_WRITEBACK:
2872 if (agent_state &&
2873 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2874 if (!op->may_write() && !op->may_cache() &&
2875 !write_ordered && !must_promote) {
2876 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2877 do_proxy_read(op);
2878 return cache_result_t::HANDLED_PROXY;
2879 }
2880 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2881 block_write_on_full_cache(missing_oid, op);
2882 return cache_result_t::BLOCKED_FULL;
2883 }
2884
2885 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2886 promote_object(obc, missing_oid, oloc, op, promote_obc);
2887 return cache_result_t::BLOCKED_PROMOTE;
2888 }
2889
2890 if (op->may_write() || op->may_cache()) {
2891 do_proxy_write(op);
2892
2893 // Promote too?
2894 if (!op->need_skip_promote() &&
2895 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2896 pool.info.min_write_recency_for_promote,
2897 OpRequestRef(),
2898 promote_obc)) {
2899 return cache_result_t::BLOCKED_PROMOTE;
2900 }
2901 return cache_result_t::HANDLED_PROXY;
2902 } else {
2903 do_proxy_read(op);
2904
2905 // Avoid duplicate promotion
2906 if (obc.get() && obc->is_blocked()) {
2907 if (promote_obc)
2908 *promote_obc = obc;
2909 return cache_result_t::BLOCKED_PROMOTE;
2910 }
2911
2912 // Promote too?
2913 if (!op->need_skip_promote()) {
2914 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2915 pool.info.min_read_recency_for_promote,
2916 promote_op, promote_obc);
2917 }
2918
2919 return cache_result_t::HANDLED_PROXY;
2920 }
2921 ceph_abort_msg("unreachable");
2922 return cache_result_t::NOOP;
2923
2924 case pg_pool_t::CACHEMODE_FORWARD:
2925 // FIXME: this mode allows requests to be reordered.
2926 do_cache_redirect(op);
2927 return cache_result_t::HANDLED_REDIRECT;
2928
2929 case pg_pool_t::CACHEMODE_READONLY:
2930 // TODO: clean this case up
2931 if (!obc.get() && r == -ENOENT) {
2932 // we don't have the object and op's a read
2933 promote_object(obc, missing_oid, oloc, op, promote_obc);
2934 return cache_result_t::BLOCKED_PROMOTE;
2935 }
2936 if (!r) { // it must be a write
2937 do_cache_redirect(op);
2938 return cache_result_t::HANDLED_REDIRECT;
2939 }
2940 // crap, there was a failure of some kind
2941 return cache_result_t::NOOP;
2942
2943 case pg_pool_t::CACHEMODE_READFORWARD:
2944 // Do writeback to the cache tier for writes
2945 if (op->may_write() || write_ordered || must_promote) {
2946 if (agent_state &&
2947 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2948 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2949 block_write_on_full_cache(missing_oid, op);
2950 return cache_result_t::BLOCKED_FULL;
2951 }
2952 promote_object(obc, missing_oid, oloc, op, promote_obc);
2953 return cache_result_t::BLOCKED_PROMOTE;
2954 }
2955
2956 // It is a read; redirect (forward) it to the base tier
2957 do_cache_redirect(op);
2958 return cache_result_t::HANDLED_REDIRECT;
2959
2960 case pg_pool_t::CACHEMODE_PROXY:
2961 if (!must_promote) {
2962 if (op->may_write() || op->may_cache() || write_ordered) {
2963 do_proxy_write(op);
2964 return cache_result_t::HANDLED_PROXY;
2965 } else {
2966 do_proxy_read(op);
2967 return cache_result_t::HANDLED_PROXY;
2968 }
2969 }
2970 // ugh, we're forced to promote.
2971 if (agent_state &&
2972 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2973 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2974 block_write_on_full_cache(missing_oid, op);
2975 return cache_result_t::BLOCKED_FULL;
2976 }
2977 promote_object(obc, missing_oid, oloc, op, promote_obc);
2978 return cache_result_t::BLOCKED_PROMOTE;
2979
2980 case pg_pool_t::CACHEMODE_READPROXY:
2981 // Do writeback to the cache tier for writes
2982 if (op->may_write() || write_ordered || must_promote) {
2983 if (agent_state &&
2984 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2985 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2986 block_write_on_full_cache(missing_oid, op);
2987 return cache_result_t::BLOCKED_FULL;
2988 }
2989 promote_object(obc, missing_oid, oloc, op, promote_obc);
2990 return cache_result_t::BLOCKED_PROMOTE;
2991 }
2992
2993 // It is a read; proxy it to the base tier
2994 do_proxy_read(op);
2995 return cache_result_t::HANDLED_PROXY;
2996
2997 default:
2998 ceph_abort_msg("unrecognized cache_mode");
2999 }
3000 return cache_result_t::NOOP;
3001 }
3002
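// Decide whether to promote based on hit-set recency: recency 0 always
// promotes; recency 1 requires the object to be in the current hit set;
// recency N requires hits in the current set plus the N-1 most recent
// archived sets, scanned newest-first and stopping at the first miss.
// For example, with recency 3 an object present in the current set and
// the two newest archived sets is promoted, subject to promote_throttle().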
3003 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
3004 const hobject_t& missing_oid,
3005 const object_locator_t& oloc,
3006 bool in_hit_set,
3007 uint32_t recency,
3008 OpRequestRef promote_op,
3009 ObjectContextRef *promote_obc)
3010 {
3011 dout(20) << __func__ << " missing_oid " << missing_oid
3012 << " in_hit_set " << in_hit_set << dendl;
3013
3014 switch (recency) {
3015 case 0:
3016 break;
3017 case 1:
3018 // Check if in the current hit set
3019 if (in_hit_set) {
3020 break;
3021 } else {
3022 // not promoting
3023 return false;
3024 }
3025 break;
3026 default:
3027 {
3028 unsigned count = (int)in_hit_set;
3029 if (count) {
3030 // Check if in other hit sets
3031 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
3032 for (map<time_t,HitSetRef>::reverse_iterator itor =
3033 agent_state->hit_set_map.rbegin();
3034 itor != agent_state->hit_set_map.rend();
3035 ++itor) {
3036 if (!itor->second->contains(oid)) {
3037 break;
3038 }
3039 ++count;
3040 if (count >= recency) {
3041 break;
3042 }
3043 }
3044 }
3045 if (count >= recency) {
3046 break;
3047 }
3048 return false; // not promoting
3049 }
3050 break;
3051 }
3052
3053 if (osd->promote_throttle()) {
3054 dout(10) << __func__ << " promote throttled" << dendl;
3055 return false;
3056 }
3057 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
3058 return true;
3059 }
3060
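// Bounce the op back to the client: reply with -ENOENT plus a redirect
// hint pointing at the base (tier_of) pool, so the client's objecter
// resends the op there itself.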
3061 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
3062 {
3063 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3064 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
3065 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT, get_osdmap_epoch(),
3066 flags, false);
3067 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
3068 reply->set_redirect(redir);
3069 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
3070 << op << dendl;
3071 m->get_connection()->send_message(reply);
3072 return;
3073 }
3074
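// Completion for a proxied read: runs on the objecter finisher, retakes
// the PG lock, and (if there has been no intervening peering reset)
// hands the result to finish_proxy_read().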
3075 struct C_ProxyRead : public Context {
3076 PrimaryLogPGRef pg;
3077 hobject_t oid;
3078 epoch_t last_peering_reset;
3079 ceph_tid_t tid;
3080 PrimaryLogPG::ProxyReadOpRef prdop;
3081 utime_t start;
3082 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3083 const PrimaryLogPG::ProxyReadOpRef& prd)
3084 : pg(p), oid(o), last_peering_reset(lpr),
3085 tid(0), prdop(prd), start(ceph_clock_now())
3086 {}
3087 void finish(int r) override {
3088 if (prdop->canceled)
3089 return;
3090 pg->lock();
3091 if (prdop->canceled) {
3092 pg->unlock();
3093 return;
3094 }
3095 if (last_peering_reset == pg->get_last_peering_reset()) {
3096 pg->finish_proxy_read(oid, tid, r);
3097 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3098 }
3099 pg->unlock();
3100 }
3101 };
3102
3103 struct C_ProxyChunkRead : public Context {
3104 PrimaryLogPGRef pg;
3105 hobject_t oid;
3106 epoch_t last_peering_reset;
3107 ceph_tid_t tid;
3108 PrimaryLogPG::ProxyReadOpRef prdop;
3109 utime_t start;
3110 ObjectOperation *obj_op;
3111 int op_index = 0;
3112 uint64_t req_offset = 0;
3113 ObjectContextRef obc;
3114 uint64_t req_total_len = 0;
3115 C_ProxyChunkRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3116 const PrimaryLogPG::ProxyReadOpRef& prd)
3117 : pg(p), oid(o), last_peering_reset(lpr),
3118 tid(0), prdop(prd), start(ceph_clock_now()), obj_op(NULL)
3119 {}
3120 void finish(int r) override {
3121 if (prdop->canceled)
3122 return;
3123 pg->lock();
3124 if (prdop->canceled) {
3125 pg->unlock();
3126 return;
3127 }
3128 if (last_peering_reset == pg->get_last_peering_reset()) {
3129 if (r >= 0) {
3130 if (!prdop->ops[op_index].outdata.length()) {
3131 ceph_assert(req_total_len);
3132 bufferlist list;
3133 bufferptr bptr(req_total_len);
3134 list.push_back(std::move(bptr));
3135 prdop->ops[op_index].outdata.append(list);
3136 }
3137 ceph_assert(obj_op);
3138 uint64_t copy_offset;
3139 if (req_offset >= prdop->ops[op_index].op.extent.offset) {
3140 copy_offset = req_offset - prdop->ops[op_index].op.extent.offset;
3141 } else {
3142 copy_offset = 0;
3143 }
3144 prdop->ops[op_index].outdata.copy_in(copy_offset, obj_op->ops[0].outdata.length(),
3145 obj_op->ops[0].outdata.c_str());
3146 }
3147
3148 pg->finish_proxy_read(oid, tid, r);
3149 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
3150 if (obj_op) {
3151 delete obj_op;
3152 }
3153 }
3154 pg->unlock();
3155 }
3156 };
3157
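// Forward a read to the backing object without promoting it: for a
// redirect manifest the target comes from the manifest itself;
// otherwise the op is proxied to the same object in the base (tier_of)
// pool.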
3158 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
3159 {
3160 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
3161 // stash the result in the request's OSDOp vector
3162 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3163 object_locator_t oloc;
3164 hobject_t soid;
3165 /* extensible tier */
3166 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3167 switch (obc->obs.oi.manifest.type) {
3168 case object_manifest_t::TYPE_REDIRECT:
3169 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3170 soid = obc->obs.oi.manifest.redirect_target;
3171 break;
3172 default:
3173 ceph_abort_msg("unrecognized manifest type");
3174 }
3175 } else {
3176 /* proxy */
3177 soid = m->get_hobj();
3178 oloc = object_locator_t(m->get_object_locator());
3179 oloc.pool = pool.info.tier_of;
3180 }
3181 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3182
3183 // pass through some original flags that make sense.
3184 // - leave out redirection and balancing flags since we are
3185 // already proxying through the primary
3186 // - leave off read/write/exec flags that are derived from the op
3187 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3188 CEPH_OSD_FLAG_ORDERSNAP |
3189 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3190 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3191
3192 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
3193
3194 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
3195
3196 ObjectOperation obj_op;
3197 obj_op.dup(prdop->ops);
3198
3199 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
3200 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
3201 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
3202 ceph_osd_op op = obj_op.ops[i].op;
3203 switch (op.op) {
3204 case CEPH_OSD_OP_READ:
3205 case CEPH_OSD_OP_SYNC_READ:
3206 case CEPH_OSD_OP_SPARSE_READ:
3207 case CEPH_OSD_OP_CHECKSUM:
3208 case CEPH_OSD_OP_CMPEXT:
3209 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
3210 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
3211 }
3212 }
3213 }
3214
3215 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
3216 prdop);
3217 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3218 ceph_tid_t tid = osd->objecter->read(
3219 soid.oid, oloc, obj_op,
3220 m->get_snapid(), NULL,
3221 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]),
3222 &prdop->user_version,
3223 &prdop->data_offset,
3224 m->get_features());
3225 fin->tid = tid;
3226 prdop->objecter_tid = tid;
3227 proxyread_ops[tid] = prdop;
3228 in_progress_proxy_ops[soid].push_back(op);
3229 }
3230
3231 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
3232 {
3233 dout(10) << __func__ << " " << oid << " tid " << tid
3234 << " " << cpp_strerror(r) << dendl;
3235
3236 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
3237 if (p == proxyread_ops.end()) {
3238 dout(10) << __func__ << " no proxyread_op found" << dendl;
3239 return;
3240 }
3241 ProxyReadOpRef prdop = p->second;
3242 if (tid != prdop->objecter_tid) {
3243 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
3244 << " tid " << prdop->objecter_tid << dendl;
3245 return;
3246 }
3247 if (oid != prdop->soid) {
3248 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
3249 << " soid " << prdop->soid << dendl;
3250 return;
3251 }
3252 proxyread_ops.erase(tid);
3253
3254 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
3255 if (q == in_progress_proxy_ops.end()) {
3256 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3257 return;
3258 }
3259 ceph_assert(q->second.size());
3260 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
3261 q->second.end(),
3262 prdop->op);
3263 ceph_assert(it != q->second.end());
3264 OpRequestRef op = *it;
3265 q->second.erase(it);
3266 if (q->second.size() == 0) {
3267 in_progress_proxy_ops.erase(oid);
3268 } else if (std::find(q->second.begin(),
3269 q->second.end(),
3270 prdop->op) != q->second.end()) {
3271 /* this op has other proxied (chunked) reads still in flight */
3272 dout(20) << __func__ << " " << oid << " is not completed " << dendl;
3273 return;
3274 }
3275
3276 osd->logger->inc(l_osd_tier_proxy_read);
3277
3278 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3279 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
3280 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3281 ctx->user_at_version = prdop->user_version;
3282 ctx->data_off = prdop->data_offset;
3283 ctx->ignore_log_op_stats = true;
3284 complete_read_ctx(r, ctx);
3285 }
3286
3287 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
3288 {
3289 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
3290 if (p == in_progress_proxy_ops.end())
3291 return;
3292
3293 list<OpRequestRef>& ls = p->second;
3294 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
3295 requeue_ops(ls);
3296 in_progress_proxy_ops.erase(p);
3297 }
3298
3299 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
3300 vector<ceph_tid_t> *tids)
3301 {
3302 dout(10) << __func__ << " " << prdop->soid << dendl;
3303 prdop->canceled = true;
3304
3305 // cancel objecter op, if we can
3306 if (prdop->objecter_tid) {
3307 tids->push_back(prdop->objecter_tid);
3308 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
3309 prdop->ops[i].outdata.clear();
3310 }
3311 proxyread_ops.erase(prdop->objecter_tid);
3312 prdop->objecter_tid = 0;
3313 }
3314 }
3315
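// Cancel all in-flight proxy reads and writes, collecting their
// objecter tids for the caller to cancel, and requeue the original
// client ops if requested.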
3316 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
3317 {
3318 dout(10) << __func__ << dendl;
3319
3320 // cancel proxy reads
3321 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
3322 while (p != proxyread_ops.end()) {
3323 cancel_proxy_read((p++)->second, tids);
3324 }
3325
3326 // cancel proxy writes
3327 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
3328 while (q != proxywrite_ops.end()) {
3329 cancel_proxy_write((q++)->second, tids);
3330 }
3331
3332 if (requeue) {
3333 map<hobject_t, list<OpRequestRef>>::iterator p =
3334 in_progress_proxy_ops.begin();
3335 while (p != in_progress_proxy_ops.end()) {
3336 list<OpRequestRef>& ls = p->second;
3337 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
3338 << " requests" << dendl;
3339 requeue_ops(ls);
3340 in_progress_proxy_ops.erase(p++);
3341 }
3342 } else {
3343 in_progress_proxy_ops.clear();
3344 }
3345 }
3346
3347 struct C_ProxyWrite_Commit : public Context {
3348 PrimaryLogPGRef pg;
3349 hobject_t oid;
3350 epoch_t last_peering_reset;
3351 ceph_tid_t tid;
3352 PrimaryLogPG::ProxyWriteOpRef pwop;
3353 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
3354 const PrimaryLogPG::ProxyWriteOpRef& pw)
3355 : pg(p), oid(o), last_peering_reset(lpr),
3356 tid(0), pwop(pw)
3357 {}
3358 void finish(int r) override {
3359 if (pwop->canceled)
3360 return;
3361 pg->lock();
3362 if (pwop->canceled) {
3363 pg->unlock();
3364 return;
3365 }
3366 if (last_peering_reset == pg->get_last_peering_reset()) {
3367 pg->finish_proxy_write(oid, tid, r);
3368 }
3369 pg->unlock();
3370 }
3371 };
3372
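// Forward a write to the backing object, mirroring do_proxy_read();
// the client reply is sent from finish_proxy_write() once the base
// pool commits.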
3373 void PrimaryLogPG::do_proxy_write(OpRequestRef op, ObjectContextRef obc)
3374 {
3375 // NOTE: non-const because ProxyWriteOp takes a mutable ref
3376 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3377 object_locator_t oloc;
3378 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
3379 hobject_t soid;
3380 /* extensible tier */
3381 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
3382 switch (obc->obs.oi.manifest.type) {
3383 case object_manifest_t::TYPE_REDIRECT:
3384 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
3385 soid = obc->obs.oi.manifest.redirect_target;
3386 break;
3387 default:
3388 ceph_abort_msg("unrecognized manifest type");
3389 }
3390 } else {
3391 /* proxy */
3392 soid = m->get_hobj();
3393 oloc = object_locator_t(m->get_object_locator());
3394 oloc.pool = pool.info.tier_of;
3395 }
3396
3397 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3398 if (!(op->may_write() || op->may_cache())) {
3399 flags |= CEPH_OSD_FLAG_RWORDERED;
3400 }
3401 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3402
3403 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3404 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3405 pwop->mtime = m->get_mtime();
3406
3407 ObjectOperation obj_op;
3408 obj_op.dup(pwop->ops);
3409
3410 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3411 this, soid, get_last_peering_reset(), pwop);
3412 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3413 ceph_tid_t tid = osd->objecter->mutate(
3414 soid.oid, oloc, obj_op, snapc,
3415 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3416 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]),
3417 &pwop->user_version, pwop->reqid);
3418 fin->tid = tid;
3419 pwop->objecter_tid = tid;
3420 proxywrite_ops[tid] = pwop;
3421 in_progress_proxy_ops[soid].push_back(op);
3422 }
3423
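// Split each read op at chunk boundaries and proxy the pieces to their
// backing chunk objects. For example, a read of length 8192 at offset 0
// over 4096-byte chunks becomes two proxied reads, one per chunk.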
3424 void PrimaryLogPG::do_proxy_chunked_op(OpRequestRef op, const hobject_t& missing_oid,
3425 ObjectContextRef obc, bool write_ordered)
3426 {
3427 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3428 OSDOp *osd_op = NULL;
3429 for (unsigned int i = 0; i < m->ops.size(); i++) {
3430 osd_op = &m->ops[i];
3431 uint64_t cursor = osd_op->op.extent.offset;
3432 uint64_t op_length = osd_op->op.extent.offset + osd_op->op.extent.length;
3433 uint64_t chunk_length = 0, chunk_index = 0, req_len = 0;
3434 object_manifest_t *manifest = &obc->obs.oi.manifest;
3435 map <uint64_t, map<uint64_t, uint64_t>> chunk_read;
3436
3437 while (cursor < op_length) {
3438 chunk_index = 0;
3439 chunk_length = 0;
3440 /* find the right chunk position for cursor */
3441 for (auto &p : manifest->chunk_map) {
3442 if (p.first <= cursor && p.first + p.second.length > cursor) {
3443 chunk_length = p.second.length;
3444 chunk_index = p.first;
3445 break;
3446 }
3447 }
3448 /* cursor is not covered by any chunk */
3449 if (!chunk_index && !chunk_length) {
3450 if (cursor == osd_op->op.extent.offset) {
3451 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, this);
3452 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
3453 ctx->data_off = osd_op->op.extent.offset;
3454 ctx->ignore_log_op_stats = true;
3455 complete_read_ctx(0, ctx);
3456 }
3457 break;
3458 }
3459 uint64_t next_length = chunk_length;
3460 /* the size to read -> | op length | */
3461 /* | a chunk | */
3462 if (cursor + next_length > op_length) {
3463 next_length = op_length - cursor;
3464 }
3465 /* the size to read -> | op length | */
3466 /* | a chunk | */
3467 if (cursor + next_length > chunk_index + chunk_length) {
3468 next_length = chunk_index + chunk_length - cursor;
3469 }
3470
3471 chunk_read[cursor] = {{chunk_index, next_length}};
3472 cursor += next_length;
3473 }
3474
3475 req_len = cursor - osd_op->op.extent.offset;
3476 for (auto &p : chunk_read) {
3477 auto chunks = p.second.begin();
3478 dout(20) << __func__ << " chunk_index: " << chunks->first
3479 << " next_length: " << chunks->second << " cursor: "
3480 << p.first << dendl;
3481 do_proxy_chunked_read(op, obc, i, chunks->first, p.first, chunks->second, req_len, write_ordered);
3482 }
3483 }
3484 }
3485
3486 struct RefCountCallback : public Context {
3487 public:
3488 PrimaryLogPG *pg;
3489 PrimaryLogPG::OpContext *ctx;
3490 OSDOp& osd_op;
3491 epoch_t last_peering_reset;
3492
3493 RefCountCallback(PrimaryLogPG *pg, PrimaryLogPG::OpContext *ctx,
3494 OSDOp &osd_op, epoch_t lpr)
3495 : pg(pg), ctx(ctx), osd_op(osd_op), last_peering_reset(lpr)
3496 {}
3497 void finish(int r) override {
3498 pg->lock();
3499 if (last_peering_reset == pg->get_last_peering_reset()) {
3500 if (r >= 0) {
3501 osd_op.rval = 0;
3502 pg->execute_ctx(ctx);
3503 } else {
3504 if (ctx->op) {
3505 pg->osd->reply_op_error(ctx->op, r);
3506 }
3507 pg->close_op_ctx(ctx);
3508 }
3509 }
3510 pg->unlock();
3511 }
3512 };
3513
3514 struct SetManifestFinisher : public PrimaryLogPG::OpFinisher {
3515 OSDOp& osd_op;
3516
3517 explicit SetManifestFinisher(OSDOp& osd_op) : osd_op(osd_op) {
3518 }
3519
3520 int execute() override {
3521 return osd_op.rval;
3522 }
3523 };
3524
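// Adjust the reference count on a dedup chunk by invoking the "cas"
// object class on the target: chunk_get takes a reference, chunk_put
// drops one. cb, if provided, runs on completion via the objecter
// finisher.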
3525 void PrimaryLogPG::refcount_manifest(ObjectContextRef obc, object_locator_t oloc, hobject_t soid,
3526 SnapContext snapc, bool get, Context *cb, uint64_t offset)
3527 {
3528 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY |
3529 CEPH_OSD_FLAG_RWORDERED;
3530
3531 dout(10) << __func__ << " Start refcount for " << soid << dendl;
3532
3533 ObjectOperation obj_op;
3534 bufferlist in;
3535 if (get) {
3536 cls_chunk_refcount_get_op call;
3537 call.source = obc->obs.oi.soid;
3538 ::encode(call, in);
3539 obj_op.call("cas", "chunk_get", in);
3540 } else {
3541 cls_chunk_refcount_put_op call;
3542 call.source = obc->obs.oi.soid;
3543 ::encode(call, in);
3544 obj_op.call("cas", "chunk_put", in);
3545 }
3546
3547 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3548 Context *c;
3549 if (cb) {
3550 c = new C_OnFinisher(cb, osd->objecter_finishers[n]);
3551 } else {
3552 c = NULL;
3553 }
3554
3555 osd->objecter->mutate(
3556 soid.oid, oloc, obj_op, snapc,
3557 ceph::real_clock::from_ceph_timespec(obc->obs.oi.mtime),
3558 flags, c);
3559 }
3560
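// Proxy one chunk-sized piece of a read: translate the client extent
// into the chunk object's own offset space and issue the read against
// the chunk's location; C_ProxyChunkRead copies the returned data into
// the right position of the original op's outdata.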
3561 void PrimaryLogPG::do_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc, int op_index,
3562 uint64_t chunk_index, uint64_t req_offset, uint64_t req_length,
3563 uint64_t req_total_len, bool write_ordered)
3564 {
3565 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3566 object_manifest_t *manifest = &obc->obs.oi.manifest;
3567 if (!manifest->chunk_map.count(chunk_index)) {
3568 return;
3569 }
3570 uint64_t chunk_length = manifest->chunk_map[chunk_index].length;
3571 hobject_t soid = manifest->chunk_map[chunk_index].oid;
3572 hobject_t ori_soid = m->get_hobj();
3573 object_locator_t oloc(soid);
3574 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3575 if (write_ordered) {
3576 flags |= CEPH_OSD_FLAG_RWORDERED;
3577 }
3578
3579 if (!chunk_length || soid == hobject_t()) {
3580 return;
3581 }
3582
3583 /* same as do_proxy_read() */
3584 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
3585 CEPH_OSD_FLAG_ORDERSNAP |
3586 CEPH_OSD_FLAG_ENFORCE_SNAPC |
3587 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
3588
3589 dout(10) << __func__ << " Start chunk proxy read for " << *m
3590 << " index: " << op_index << " oid: " << soid.oid.name << " req_offset: " << req_offset
3591 << " req_length: " << req_length << dendl;
3592
3593 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, ori_soid, m->ops));
3594
3595 ObjectOperation *pobj_op = new ObjectOperation;
3596 OSDOp &osd_op = pobj_op->add_op(m->ops[op_index].op.op);
3597
3598 if (chunk_index <= req_offset) {
3599 osd_op.op.extent.offset = manifest->chunk_map[chunk_index].offset + req_offset - chunk_index;
3600 } else {
3601 ceph_abort_msg("chunk_index > req_offset");
3602 }
3603 osd_op.op.extent.length = req_length;
3604
3605 ObjectOperation obj_op;
3606 obj_op.dup(pobj_op->ops);
3607
3608 C_ProxyChunkRead *fin = new C_ProxyChunkRead(this, ori_soid, get_last_peering_reset(),
3609 prdop);
3610 fin->obj_op = pobj_op;
3611 fin->op_index = op_index;
3612 fin->req_offset = req_offset;
3613 fin->obc = obc;
3614 fin->req_total_len = req_total_len;
3615
3616 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
3617 ceph_tid_t tid = osd->objecter->read(
3618 soid.oid, oloc, obj_op,
3619 m->get_snapid(), NULL,
3620 flags, new C_OnFinisher(fin, osd->objecter_finishers[n]),
3621 &prdop->user_version,
3622 &prdop->data_offset,
3623 m->get_features());
3624 fin->tid = tid;
3625 prdop->objecter_tid = tid;
3626 proxyread_ops[tid] = prdop;
3627 in_progress_proxy_ops[ori_soid].push_back(op);
3628 }
3629
3630 bool PrimaryLogPG::can_proxy_chunked_read(OpRequestRef op, ObjectContextRef obc)
3631 {
3632 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
3633 OSDOp *osd_op = NULL;
3634 bool ret = true;
3635 for (unsigned int i = 0; i < m->ops.size(); i++) {
3636 osd_op = &m->ops[i];
3637 ceph_osd_op op = osd_op->op;
3638 switch (op.op) {
3639 case CEPH_OSD_OP_READ:
3640 case CEPH_OSD_OP_SYNC_READ: {
3641 uint64_t cursor = osd_op->op.extent.offset;
3642 uint64_t remain = osd_op->op.extent.length;
3643
3644 /* requested chunks exist in chunk_map ? */
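// The whole requested range must be covered by chunks that are missing
// locally (i.e. whose data still lives on the remote target object); a
// locally present chunk, or a gap in coverage, means we cannot proxy.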
3645 for (auto &p : obc->obs.oi.manifest.chunk_map) {
3646 if (p.first <= cursor && p.first + p.second.length > cursor) {
3647 if (!p.second.is_missing()) {
3648 return false;
3649 }
3650 if (p.second.length >= remain) {
3651 remain = 0;
3652 break;
3653 } else {
3654 remain = remain - p.second.length;
3655 }
3656 cursor += p.second.length;
3657 }
3658 }
3659
3660 if (remain) {
3661 dout(20) << __func__ << " requested chunks don't exist in chunk_map " << dendl;
3662 return false;
3663 }
3664 continue;
3665 }
3666 default:
3667 return false;
3668 }
3669 }
3670 return ret;
3671 }
3672
3673 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3674 {
3675 dout(10) << __func__ << " " << oid << " tid " << tid
3676 << " " << cpp_strerror(r) << dendl;
3677
3678 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3679 if (p == proxywrite_ops.end()) {
3680 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3681 return;
3682 }
3683 ProxyWriteOpRef pwop = p->second;
3684 ceph_assert(tid == pwop->objecter_tid);
3685 ceph_assert(oid == pwop->soid);
3686
3687 proxywrite_ops.erase(tid);
3688
3689 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3690 if (q == in_progress_proxy_ops.end()) {
3691 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3692 delete pwop->ctx;
3693 pwop->ctx = NULL;
3694 return;
3695 }
3696 list<OpRequestRef>& in_progress_op = q->second;
3697 ceph_assert(in_progress_op.size());
3698 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3699 in_progress_op.end(),
3700 pwop->op);
3701 ceph_assert(it != in_progress_op.end());
3702 in_progress_op.erase(it);
3703 if (in_progress_op.size() == 0) {
3704 in_progress_proxy_ops.erase(oid);
3705 } else if (std::find(in_progress_op.begin(),
3706 in_progress_op.end(),
3707 pwop->op) != in_progress_op.end()) {
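// the same client op is still queued for another in-flight proxied op on
// this object; drop this ctx and defer the reply to the last completion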
3708 if (pwop->ctx)
3709 delete pwop->ctx;
3710 pwop->ctx = NULL;
3711 dout(20) << __func__ << " " << oid << " tid " << tid
3712 << " in_progress_op size: "
3713 << in_progress_op.size() << dendl;
3714 return;
3715 }
3716
3717 osd->logger->inc(l_osd_tier_proxy_write);
3718
3719 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3720 ceph_assert(m != NULL);
3721
3722 if (!pwop->sent_reply) {
3723 // send commit.
3724 MOSDOpReply *reply = pwop->ctx->reply;
3725 if (reply)
3726 pwop->ctx->reply = NULL;
3727 else {
3728 reply = new MOSDOpReply(m, r, get_osdmap_epoch(), 0, true);
3729 reply->set_reply_versions(eversion_t(), pwop->user_version);
3730 }
3731 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3732 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3733 osd->send_message_osd_client(reply, m->get_connection());
3734 pwop->sent_reply = true;
3735 pwop->ctx->op->mark_commit_sent();
3736 }
3737
3738 delete pwop->ctx;
3739 pwop->ctx = NULL;
3740 }
3741
3742 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3743 vector<ceph_tid_t> *tids)
3744 {
3745 dout(10) << __func__ << " " << pwop->soid << dendl;
3746 pwop->canceled = true;
3747
3748 // cancel objecter op, if we can
3749 if (pwop->objecter_tid) {
3750 tids->push_back(pwop->objecter_tid);
3751 delete pwop->ctx;
3752 pwop->ctx = NULL;
3753 proxywrite_ops.erase(pwop->objecter_tid);
3754 pwop->objecter_tid = 0;
3755 }
3756 }
3757
3758 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3759 ObjectContextRef obc;
3760 PrimaryLogPG *pg;
3761 utime_t start;
3762 public:
3763 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3764 : obc(obc_),
3765 pg(pg_),
3766 start(ceph_clock_now()) {}
3767
3768 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3769 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3770 int r = results.get<0>();
3771 pg->finish_promote(r, results_data, obc);
3772 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3773 }
3774 };
3775
3776 class PromoteManifestCallback: public PrimaryLogPG::CopyCallback {
3777 ObjectContextRef obc;
3778 PrimaryLogPG *pg;
3779 utime_t start;
3780 PrimaryLogPG::OpContext *ctx;
3781 PrimaryLogPG::CopyCallbackResults promote_results;
3782 public:
3783 PromoteManifestCallback(ObjectContextRef obc_, PrimaryLogPG *pg_, PrimaryLogPG::OpContext *ctx = NULL)
3784 : obc(obc_),
3785 pg(pg_),
3786 start(ceph_clock_now()), ctx(ctx) {}
3787
3788 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3789 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3790 int r = results.get<0>();
3791 if (ctx) {
3792 promote_results = results;
3793 pg->execute_ctx(ctx);
3794 } else {
3795 pg->finish_promote_manifest(r, results_data, obc);
3796 }
3797 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3798 }
3799 friend struct PromoteFinisher;
3800 };
3801
3802 struct PromoteFinisher : public PrimaryLogPG::OpFinisher {
3803 PromoteManifestCallback *promote_callback;
3804
3805 explicit PromoteFinisher(PromoteManifestCallback *promote_callback)
3806 : promote_callback(promote_callback) {
3807 }
3808
3809 int execute() override {
3810 if (promote_callback->ctx->obc->obs.oi.manifest.is_redirect()) {
3811 promote_callback->ctx->pg->finish_promote(promote_callback->promote_results.get<0>(),
3812 promote_callback->promote_results.get<1>(),
3813 promote_callback->obc);
3814 } else if (promote_callback->ctx->obc->obs.oi.manifest.is_chunked()) {
3815 promote_callback->ctx->pg->finish_promote_manifest(promote_callback->promote_results.get<0>(),
3816 promote_callback->promote_results.get<1>(),
3817 promote_callback->obc);
3818 } else {
3819 ceph_abort_msg("unrecognized manifest type");
3820 }
3821 return 0;
3822 }
3823 };
3824
3825 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3826 const hobject_t& missing_oid,
3827 const object_locator_t& oloc,
3828 OpRequestRef op,
3829 ObjectContextRef *promote_obc)
3830 {
3831 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3832 ceph_assert(hoid != hobject_t());
3833 if (write_blocked_by_scrub(hoid)) {
3834 dout(10) << __func__ << " " << hoid
3835 << " blocked by scrub" << dendl;
3836 if (op) {
3837 waiting_for_scrub.push_back(op);
3838 op->mark_delayed("waiting for scrub");
3839 dout(10) << __func__ << " " << hoid
3840 << " placing op in waiting_for_scrub" << dendl;
3841 } else {
3842 dout(10) << __func__ << " " << hoid
3843 << " no op, dropping on the floor" << dendl;
3844 }
3845 return;
3846 }
3847 if (!obc) { // we need to create an ObjectContext
3848 ceph_assert(missing_oid != hobject_t());
3849 obc = get_object_context(missing_oid, true);
3850 }
3851 if (promote_obc)
3852 *promote_obc = obc;
3853
3854 /*
3855 * If there are in-flight proxy reads for this object, don't use
3856 * DONTNEED until the promote completes.
3857 */
3858 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3859 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3860 if (q == in_progress_proxy_ops.end()) {
3861 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3862 }
3863
3864 CopyCallback *cb;
3865 object_locator_t my_oloc;
3866 hobject_t src_hoid;
3867 if (!obc->obs.oi.has_manifest()) {
3868 my_oloc = oloc;
3869 my_oloc.pool = pool.info.tier_of;
3870 src_hoid = obc->obs.oi.soid;
3871 cb = new PromoteCallback(obc, this);
3872 } else {
3873 if (obc->obs.oi.manifest.is_chunked()) {
3874 src_hoid = obc->obs.oi.soid;
3875 cb = new PromoteManifestCallback(obc, this);
3876 } else if (obc->obs.oi.manifest.is_redirect()) {
3877 object_locator_t src_oloc(obc->obs.oi.manifest.redirect_target);
3878 my_oloc = src_oloc;
3879 src_hoid = obc->obs.oi.manifest.redirect_target;
3880 cb = new PromoteCallback(obc, this);
3881 } else {
3882 ceph_abort_msg("unrecognized manifest type");
3883 }
3884 }
3885
3886 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3887 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3888 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3889 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3890 start_copy(cb, obc, src_hoid, my_oloc, 0, flags,
3891 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3892 src_fadvise_flags, 0);
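// start_copy marks the obc blocked (asserted below); the client op that
// triggered the promote waits on the blocked object until the copy completes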
3893
3894 ceph_assert(obc->is_blocked());
3895
3896 if (op)
3897 wait_for_blocked_object(obc->obs.oi.soid, op);
3898 info.stats.stats.sum.num_promote++;
3899 }
3900
3901 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3902 {
3903 FUNCTRACE(cct);
3904 dout(10) << __func__ << " " << ctx << dendl;
3905 ctx->reset_obs(ctx->obc);
3906 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3907 OpRequestRef op = ctx->op;
3908 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3909 ObjectContextRef obc = ctx->obc;
3910 const hobject_t& soid = obc->obs.oi.soid;
3911
3912 // this method must be idempotent since we may call it several times
3913 // before we finally apply the resulting transaction.
3914 ctx->op_t.reset(new PGTransaction);
3915
3916 if (op->may_write() || op->may_cache()) {
3917 // snap
3918 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3919 pool.info.is_pool_snaps_mode()) {
3920 // use pool's snapc
3921 ctx->snapc = pool.snapc;
3922 } else {
3923 // client specified snapc
3924 ctx->snapc.seq = m->get_snap_seq();
3925 ctx->snapc.snaps = m->get_snaps();
3926 filter_snapc(ctx->snapc.snaps);
3927 }
3928 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3929 ctx->snapc.seq < obc->ssc->snapset.seq) {
3930 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3931 << " < snapset seq " << obc->ssc->snapset.seq
3932 << " on " << obc->obs.oi.soid << dendl;
3933 reply_ctx(ctx, -EOLDSNAPC);
3934 return;
3935 }
3936
3937 // version
3938 ctx->at_version = get_next_version();
3939 ctx->mtime = m->get_mtime();
3940
3941 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3942 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3943 << " snapc " << ctx->snapc
3944 << " snapset " << obc->ssc->snapset
3945 << dendl;
3946 } else {
3947 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3948 << " ov " << obc->obs.oi.version
3949 << dendl;
3950 }
3951
3952 if (!ctx->user_at_version)
3953 ctx->user_at_version = obc->obs.oi.user_version;
3954 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3955
3956 {
3957 #ifdef WITH_LTTNG
3958 osd_reqid_t reqid = ctx->op->get_reqid();
3959 #endif
3960 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3961 reqid.name._num, reqid.tid, reqid.inc);
3962 }
3963
3964 int result = prepare_transaction(ctx);
3965
3966 {
3967 #ifdef WITH_LTTNG
3968 osd_reqid_t reqid = ctx->op->get_reqid();
3969 #endif
3970 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3971 reqid.name._num, reqid.tid, reqid.inc);
3972 }
3973
3974 bool pending_async_reads = !ctx->pending_async_reads.empty();
3975 if (result == -EINPROGRESS || pending_async_reads) {
3976 // come back later.
3977 if (pending_async_reads) {
3978 ceph_assert(pool.info.is_erasure());
3979 in_progress_async_reads.push_back(make_pair(op, ctx));
3980 ctx->start_async_reads(this);
3981 }
3982 return;
3983 }
3984
3985 if (result == -EAGAIN) {
3986 // clean up after the ctx
3987 close_op_ctx(ctx);
3988 return;
3989 }
3990
3991 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3992 // prepare the reply
3993 ctx->reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0,
3994 successful_write);
3995
3996 // Write operations aren't allowed to return a data payload because
3997 // we can't do so reliably. If the client has to resend the request
3998 // and it has already been applied, we will return 0 with no
3999 // payload. Non-deterministic behavior is no good. However, it is
4000 // possible to construct an operation that does a read, does a guard
4001 // check (e.g., CMPXATTR), and then a write. Then we either succeed
4002 // with the write, or return a CMPXATTR and the read value.
4003 if (successful_write) {
4004 // write. normalize the result code.
4005 dout(20) << " zeroing write result code " << result << dendl;
4006 result = 0;
4007 }
4008 ctx->reply->set_result(result);
4009
4010 // read or error?
4011 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
4012 // finish side-effects
4013 if (result >= 0)
4014 do_osd_op_effects(ctx, m->get_connection());
4015
4016 complete_read_ctx(result, ctx);
4017 return;
4018 }
4019
4020 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
4021
4022 ceph_assert(op->may_write() || op->may_cache());
4023
4024 // trim log?
4025 if (hard_limit_pglog())
4026 calc_trim_to_aggressive();
4027 else
4028 calc_trim_to();
4029
4030 // verify that we are doing this in order?
4031 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
4032 !pool.info.is_tier() && !pool.info.has_tiers()) {
4033 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
4034 ceph_tid_t t = m->get_tid();
4035 client_t n = m->get_source().num();
4036 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
4037 if (p == cm.end()) {
4038 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
4039 cm[n] = t;
4040 } else {
4041 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
4042 if (p->second > t) {
4043 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
4044 ceph_abort_msg("out of order op");
4045 }
4046 p->second = t;
4047 }
4048 }
4049
4050 if (ctx->update_log_only) {
4051 if (result >= 0)
4052 do_osd_op_effects(ctx, m->get_connection());
4053
4054 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
4055 // save just what we need from ctx
4056 MOSDOpReply *reply = ctx->reply;
4057 ctx->reply = nullptr;
4058 reply->claim_op_out_data(*ctx->ops);
4059 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
4060 close_op_ctx(ctx);
4061
4062 if (result == -ENOENT) {
4063 reply->set_enoent_reply_versions(info.last_update,
4064 info.last_user_version);
4065 }
4066 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4067 // append to pg log for dup detection - don't save buffers for now
4068 record_write_error(op, soid, reply, result);
4069 return;
4070 }
4071
4072 // no need to capture PG ref, repop cancel will handle that
4073 // Can capture the ctx by pointer, it's owned by the repop
4074 ctx->register_on_commit(
4075 [m, ctx, this](){
4076 if (ctx->op)
4077 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
4078
4079 if (m && !ctx->sent_reply) {
4080 MOSDOpReply *reply = ctx->reply;
4081 if (reply)
4082 ctx->reply = nullptr;
4083 else {
4084 reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, true);
4085 reply->set_reply_versions(ctx->at_version,
4086 ctx->user_at_version);
4087 }
4088 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
4089 dout(10) << " sending reply on " << *m << " " << reply << dendl;
4090 osd->send_message_osd_client(reply, m->get_connection());
4091 ctx->sent_reply = true;
4092 ctx->op->mark_commit_sent();
4093 }
4094 });
4095 ctx->register_on_success(
4096 [ctx, this]() {
4097 do_osd_op_effects(
4098 ctx,
4099 ctx->op ? ctx->op->get_req()->get_connection() :
4100 ConnectionRef());
4101 });
4102 ctx->register_on_finish(
4103 [ctx]() {
4104 delete ctx;
4105 });
4106
4107 // issue replica writes
4108 ceph_tid_t rep_tid = osd->get_tid();
4109
4110 RepGather *repop = new_repop(ctx, obc, rep_tid);
4111
4112 issue_repop(repop, ctx);
4113 eval_repop(repop);
4114 repop->put();
4115 }
4116
4117 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
4118 release_object_locks(ctx->lock_manager);
4119
4120 ctx->op_t.reset();
4121
4122 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
4123 ctx->on_finish.erase(p++)) {
4124 (*p)();
4125 }
4126 delete ctx;
4127 }
4128
4129 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
4130 {
4131 if (ctx->op)
4132 osd->reply_op_error(ctx->op, r);
4133 close_op_ctx(ctx);
4134 }
4135
4136 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
4137 {
4138 if (ctx->op)
4139 osd->reply_op_error(ctx->op, r, v, uv);
4140 close_op_ctx(ctx);
4141 }
4142
4143 void PrimaryLogPG::log_op_stats(const OpRequest& op,
4144 const uint64_t inb,
4145 const uint64_t outb)
4146 {
4147 const MOSDOp* const m = static_cast<const MOSDOp*>(op.get_req());
4148 const utime_t now = ceph_clock_now();
4149
4150 const utime_t latency = now - m->get_recv_stamp();
4151 const utime_t process_latency = now - op.get_dequeued_time();
4152
4153 osd->logger->inc(l_osd_op);
4154
4155 osd->logger->inc(l_osd_op_outb, outb);
4156 osd->logger->inc(l_osd_op_inb, inb);
4157 osd->logger->tinc(l_osd_op_lat, latency);
4158 osd->logger->tinc(l_osd_op_process_lat, process_latency);
4159
4160 if (op.may_read() && op.may_write()) {
4161 osd->logger->inc(l_osd_op_rw);
4162 osd->logger->inc(l_osd_op_rw_inb, inb);
4163 osd->logger->inc(l_osd_op_rw_outb, outb);
4164 osd->logger->tinc(l_osd_op_rw_lat, latency);
4165 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
4166 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
4167 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
4168 } else if (op.may_read()) {
4169 osd->logger->inc(l_osd_op_r);
4170 osd->logger->inc(l_osd_op_r_outb, outb);
4171 osd->logger->tinc(l_osd_op_r_lat, latency);
4172 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
4173 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
4174 } else if (op.may_write() || op.may_cache()) {
4175 osd->logger->inc(l_osd_op_w);
4176 osd->logger->inc(l_osd_op_w_inb, inb);
4177 osd->logger->tinc(l_osd_op_w_lat, latency);
4178 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
4179 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
4180 } else {
4181 ceph_abort();
4182 }
4183
4184 dout(15) << "log_op_stats " << *m
4185 << " inb " << inb
4186 << " outb " << outb
4187 << " lat " << latency << dendl;
4188
4189 if (m_dynamic_perf_stats.is_enabled()) {
4190 m_dynamic_perf_stats.add(osd, info, op, inb, outb, latency);
4191 }
4192 }
4193
4194 void PrimaryLogPG::set_dynamic_perf_stats_queries(
4195 const std::list<OSDPerfMetricQuery> &queries)
4196 {
4197 m_dynamic_perf_stats.set_queries(queries);
4198 }
4199
4200 void PrimaryLogPG::get_dynamic_perf_stats(DynamicPerfStats *stats)
4201 {
4202 std::swap(m_dynamic_perf_stats, *stats);
4203 }
4204
4205 void PrimaryLogPG::do_scan(
4206 OpRequestRef op,
4207 ThreadPool::TPHandle &handle)
4208 {
4209 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
4210 ceph_assert(m->get_type() == MSG_OSD_PG_SCAN);
4211 dout(10) << "do_scan " << *m << dendl;
4212
4213 op->mark_started();
4214
4215 switch (m->op) {
4216 case MOSDPGScan::OP_SCAN_GET_DIGEST:
4217 {
4218 auto dpp = get_dpp();
4219 if (osd->check_backfill_full(dpp)) {
4220 dout(1) << __func__ << ": Canceling backfill: Full." << dendl;
4221 queue_peering_event(
4222 PGPeeringEventRef(
4223 std::make_shared<PGPeeringEvent>(
4224 get_osdmap_epoch(),
4225 get_osdmap_epoch(),
4226 BackfillTooFull())));
4227 return;
4228 }
4229
4230 BackfillInterval bi;
4231 bi.begin = m->begin;
4232 // No need to flush; there won't be any in-progress writes occurring
4233 // past m->begin
4234 scan_range(
4235 cct->_conf->osd_backfill_scan_min,
4236 cct->_conf->osd_backfill_scan_max,
4237 &bi,
4238 handle);
4239 MOSDPGScan *reply = new MOSDPGScan(
4240 MOSDPGScan::OP_SCAN_DIGEST,
4241 pg_whoami,
4242 get_osdmap_epoch(), m->query_epoch,
4243 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
4244 encode(bi.objects, reply->get_data());
4245 osd->send_message_osd_cluster(reply, m->get_connection());
4246 }
4247 break;
4248
4249 case MOSDPGScan::OP_SCAN_DIGEST:
4250 {
4251 pg_shard_t from = m->from;
4252
4253 // Check that from is in backfill_targets vector
4254 ceph_assert(is_backfill_targets(from));
4255
4256 BackfillInterval& bi = peer_backfill_info[from];
4257 bi.begin = m->begin;
4258 bi.end = m->end;
4259 auto p = m->get_data().cbegin();
4260
4261 // take care to preserve ordering!
4262 bi.clear_objects();
4263 ::decode_noclear(bi.objects, p);
4264
4265 if (waiting_on_backfill.erase(from)) {
4266 if (waiting_on_backfill.empty()) {
4267 ceph_assert(peer_backfill_info.size() == backfill_targets.size());
4268 finish_recovery_op(hobject_t::get_max());
4269 }
4270 } else {
4271 // we canceled backfill for a while because a peer was too full, and
4272 // this is an extra response from a peer that was not too full
4273 dout(20) << __func__ << " canceled backfill (too full?)" << dendl;
4274 }
4275 }
4276 break;
4277 }
4278 }
4279
4280 void PrimaryLogPG::do_backfill(OpRequestRef op)
4281 {
4282 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
4283 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL);
4284 dout(10) << "do_backfill " << *m << dendl;
4285
4286 op->mark_started();
4287
4288 switch (m->op) {
4289 case MOSDPGBackfill::OP_BACKFILL_FINISH:
4290 {
4291 ceph_assert(cct->_conf->osd_kill_backfill_at != 1);
4292
4293 MOSDPGBackfill *reply = new MOSDPGBackfill(
4294 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
4295 get_osdmap_epoch(),
4296 m->query_epoch,
4297 spg_t(info.pgid.pgid, get_primary().shard));
4298 reply->set_priority(get_recovery_op_priority());
4299 osd->send_message_osd_cluster(reply, m->get_connection());
4300 queue_peering_event(
4301 PGPeeringEventRef(
4302 std::make_shared<PGPeeringEvent>(
4303 get_osdmap_epoch(),
4304 get_osdmap_epoch(),
4305 RecoveryDone())));
4306 }
4307 // fall-thru
4308
4309 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
4310 {
4311 ceph_assert(cct->_conf->osd_kill_backfill_at != 2);
4312
4313 info.set_last_backfill(m->last_backfill);
4314 // During backfill, submit_push_data() tracks num_bytes, which is needed in
4315 // case backfill stops and restarts. We want to know how many bytes this
4316 // pg is consuming on disk in order to compute the amount of space to
4317 // reserve for new backfill data if it won't otherwise fit.
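// i.e. for OP_BACKFILL_PROGRESS keep the locally tracked num_bytes and take
// the remaining stats from the primary; for OP_BACKFILL_FINISH take the
// primary's stats wholesale.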
4318 if (m->op == MOSDPGBackfill::OP_BACKFILL_PROGRESS) {
4319 dout(0) << __func__ << " primary " << m->stats.stats.sum.num_bytes << " local " << info.stats.stats.sum.num_bytes << dendl;
4320 int64_t bytes = info.stats.stats.sum.num_bytes;
4321 info.stats = m->stats;
4322 info.stats.stats.sum.num_bytes = bytes;
4323 } else {
4324 dout(0) << __func__ << " final " << m->stats.stats.sum.num_bytes << " replaces local " << info.stats.stats.sum.num_bytes << dendl;
4325 info.stats = m->stats;
4326 }
4327
4328 ObjectStore::Transaction t;
4329 dirty_info = true;
4330 write_if_dirty(t);
4331 int tr = osd->store->queue_transaction(ch, std::move(t), NULL);
4332 ceph_assert(tr == 0);
4333 }
4334 break;
4335
4336 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
4337 {
4338 ceph_assert(is_primary());
4339 ceph_assert(cct->_conf->osd_kill_backfill_at != 3);
4340 finish_recovery_op(hobject_t::get_max());
4341 }
4342 break;
4343 }
4344 }
4345
4346 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
4347 {
4348 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
4349 op->get_req());
4350 ceph_assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
4351 dout(7) << __func__ << " " << m->ls << dendl;
4352
4353 op->mark_started();
4354
4355 ObjectStore::Transaction t;
4356 for (auto& p : m->ls) {
4357 if (is_remote_backfilling()) {
4358 struct stat st;
4359 int r = osd->store->stat(ch, ghobject_t(p.first, ghobject_t::NO_GEN,
4360 pg_whoami.shard) , &st);
4361 if (r == 0) {
4362 sub_local_num_bytes(st.st_size);
4363 int64_t usersize;
4364 if (pool.info.is_erasure()) {
4365 bufferlist bv;
4366 int r = osd->store->getattr(
4367 ch,
4368 ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard),
4369 OI_ATTR,
4370 bv);
4371 if (r >= 0) {
4372 object_info_t oi(bv);
4373 usersize = oi.size * pgbackend->get_ec_data_chunk_count();
4374 } else {
4375 dout(0) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4376 << " can't get object info" << dendl;
4377 usersize = 0;
4378 }
4379 } else {
4380 usersize = st.st_size;
4381 }
4382 sub_num_bytes(usersize);
4383 dout(10) << __func__ << " " << ghobject_t(p.first, ghobject_t::NO_GEN, pg_whoami.shard)
4384 << " sub actual data by " << st.st_size
4385 << " sub num_bytes by " << usersize
4386 << dendl;
4387 }
4388 }
4389 remove_snap_mapped_object(t, p.first);
4390 }
4391 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
4392 ceph_assert(r == 0);
4393 }
4394
4395 int PrimaryLogPG::trim_object(
4396 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
4397 {
4398 *ctxp = NULL;
4399
4400 // load clone info
4401 bufferlist bl;
4402 ObjectContextRef obc = get_object_context(coid, false, NULL);
4403 if (!obc || !obc->ssc || !obc->ssc->exists) {
4404 osd->clog->error() << __func__ << ": Cannot trim " << coid
4405 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
4406 return -ENOENT;
4407 }
4408
4409 hobject_t head_oid = coid.get_head();
4410 ObjectContextRef head_obc = get_object_context(head_oid, false);
4411 if (!head_obc) {
4412 osd->clog->error() << __func__ << ": Cannot trim " << coid
4413 << " repair needed, no snapset obc for " << head_oid;
4414 return -ENOENT;
4415 }
4416
4417 SnapSet& snapset = obc->ssc->snapset;
4418
4419 object_info_t &coi = obc->obs.oi;
4420 auto citer = snapset.clone_snaps.find(coid.snap);
4421 if (citer == snapset.clone_snaps.end()) {
4422 osd->clog->error() << "No clone_snaps in snapset " << snapset
4423 << " for object " << coid << "\n";
4424 return -ENOENT;
4425 }
4426 set<snapid_t> old_snaps(citer->second.begin(), citer->second.end());
4427 if (old_snaps.empty()) {
4428 osd->clog->error() << "No object info snaps for object " << coid;
4429 return -ENOENT;
4430 }
4431
4432 dout(10) << coid << " old_snaps " << old_snaps
4433 << " old snapset " << snapset << dendl;
4434 if (snapset.seq == 0) {
4435 osd->clog->error() << "No snapset.seq for object " << coid;
4436 return -ENOENT;
4437 }
4438
4439 set<snapid_t> new_snaps;
4440 for (set<snapid_t>::iterator i = old_snaps.begin();
4441 i != old_snaps.end();
4442 ++i) {
4443 if (!pool.info.is_removed_snap(*i))
4444 new_snaps.insert(*i);
4445 }
4446
4447 vector<snapid_t>::iterator p = snapset.clones.end();
4448
4449 if (new_snaps.empty()) {
4450 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
4451 if (p == snapset.clones.end()) {
4452 osd->clog->error() << "Snap " << coid.snap << " not in clones";
4453 return -ENOENT;
4454 }
4455 }
4456
4457 OpContextUPtr ctx = simple_opc_create(obc);
4458 ctx->head_obc = head_obc;
4459
4460 if (!ctx->lock_manager.get_snaptrimmer_write(
4461 coid,
4462 obc,
4463 first)) {
4464 close_op_ctx(ctx.release());
4465 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
4466 return -ENOLCK;
4467 }
4468
4469 if (!ctx->lock_manager.get_snaptrimmer_write(
4470 head_oid,
4471 head_obc,
4472 first)) {
4473 close_op_ctx(ctx.release());
4474 dout(10) << __func__ << ": Unable to get a wlock on " << head_oid << dendl;
4475 return -ENOLCK;
4476 }
4477
4478 ctx->at_version = get_next_version();
4479
4480 PGTransaction *t = ctx->op_t.get();
4481
4482 if (new_snaps.empty()) {
4483 // remove clone
4484 dout(10) << coid << " snaps " << old_snaps << " -> "
4485 << new_snaps << " ... deleting" << dendl;
4486
4487 // ...from snapset
4488 ceph_assert(p != snapset.clones.end());
4489
4490 snapid_t last = coid.snap;
4491 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
4492
4493 if (p != snapset.clones.begin()) {
4494 // not the oldest... merge overlap into next older clone
4495 vector<snapid_t>::iterator n = p - 1;
4496 hobject_t prev_coid = coid;
4497 prev_coid.snap = *n;
4498 bool adjust_prev_bytes = is_present_clone(prev_coid);
4499
4500 if (adjust_prev_bytes)
4501 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
4502
4503 snapset.clone_overlap[*n].intersection_of(
4504 snapset.clone_overlap[*p]);
4505
4506 if (adjust_prev_bytes)
4507 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
4508 }
4509 ctx->delta_stats.num_objects--;
4510 if (coi.is_dirty())
4511 ctx->delta_stats.num_objects_dirty--;
4512 if (coi.is_omap())
4513 ctx->delta_stats.num_objects_omap--;
4514 if (coi.is_whiteout()) {
4515 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
4516 ctx->delta_stats.num_whiteouts--;
4517 }
4518 ctx->delta_stats.num_object_clones--;
4519 if (coi.is_cache_pinned())
4520 ctx->delta_stats.num_objects_pinned--;
4521 if (coi.has_manifest())
4522 ctx->delta_stats.num_objects_manifest--;
4523 obc->obs.exists = false;
4524
4525 snapset.clones.erase(p);
4526 snapset.clone_overlap.erase(last);
4527 snapset.clone_size.erase(last);
4528 snapset.clone_snaps.erase(last);
4529
4530 ctx->log.push_back(
4531 pg_log_entry_t(
4532 pg_log_entry_t::DELETE,
4533 coid,
4534 ctx->at_version,
4535 ctx->obs->oi.version,
4536 0,
4537 osd_reqid_t(),
4538 ctx->mtime,
4539 0)
4540 );
4541 t->remove(coid);
4542 t->update_snaps(
4543 coid,
4544 old_snaps,
4545 new_snaps);
4546
4547 coi = object_info_t(coid);
4548
4549 ctx->at_version.version++;
4550 } else {
4551 // save adjusted snaps for this object
4552 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
4553 snapset.clone_snaps[coid.snap] =
4554 vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
4555 // we still do a 'modify' event on this object just to trigger a
4556 // snapmapper.update ... :(
4557
4558 coi.prior_version = coi.version;
4559 coi.version = ctx->at_version;
4560 bl.clear();
4561 encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4562 t->setattr(coid, OI_ATTR, bl);
4563
4564 ctx->log.push_back(
4565 pg_log_entry_t(
4566 pg_log_entry_t::MODIFY,
4567 coid,
4568 coi.version,
4569 coi.prior_version,
4570 0,
4571 osd_reqid_t(),
4572 ctx->mtime,
4573 0)
4574 );
4575 ctx->at_version.version++;
4576
4577 t->update_snaps(
4578 coid,
4579 old_snaps,
4580 new_snaps);
4581 }
4582
4583 // save head snapset
4584 dout(10) << coid << " new snapset " << snapset << " on "
4585 << head_obc->obs.oi << dendl;
4586 if (snapset.clones.empty() &&
4587 (head_obc->obs.oi.is_whiteout() &&
4588 !(head_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
4589 !head_obc->obs.oi.is_cache_pinned())) {
4590 // NOTE: this arguably constitutes minor interference with the
4591 // tiering agent if this is a cache tier since a snap trim event
4592 // is effectively evicting a whiteout we might otherwise want to
4593 // keep around.
4594 dout(10) << coid << " removing " << head_oid << dendl;
4595 ctx->log.push_back(
4596 pg_log_entry_t(
4597 pg_log_entry_t::DELETE,
4598 head_oid,
4599 ctx->at_version,
4600 head_obc->obs.oi.version,
4601 0,
4602 osd_reqid_t(),
4603 ctx->mtime,
4604 0)
4605 );
4606 derr << "removing snap head" << dendl;
4607 object_info_t& oi = head_obc->obs.oi;
4608 ctx->delta_stats.num_objects--;
4609 if (oi.is_dirty()) {
4610 ctx->delta_stats.num_objects_dirty--;
4611 }
4612 if (oi.is_omap())
4613 ctx->delta_stats.num_objects_omap--;
4614 if (oi.is_whiteout()) {
4615 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
4616 ctx->delta_stats.num_whiteouts--;
4617 }
4618 if (oi.is_cache_pinned()) {
4619 ctx->delta_stats.num_objects_pinned--;
4620 }
4621 if (coi.has_manifest())
4622 ctx->delta_stats.num_objects_manifest--;
4623 head_obc->obs.exists = false;
4624 head_obc->obs.oi = object_info_t(head_oid);
4625 t->remove(head_oid);
4626 } else {
4627 dout(10) << coid << " filtering snapset on " << head_oid << dendl;
4628 snapset.filter(pool.info);
4629 dout(10) << coid << " writing updated snapset on " << head_oid
4630 << ", snapset is " << snapset << dendl;
4631 ctx->log.push_back(
4632 pg_log_entry_t(
4633 pg_log_entry_t::MODIFY,
4634 head_oid,
4635 ctx->at_version,
4636 head_obc->obs.oi.version,
4637 0,
4638 osd_reqid_t(),
4639 ctx->mtime,
4640 0)
4641 );
4642
4643 head_obc->obs.oi.prior_version = head_obc->obs.oi.version;
4644 head_obc->obs.oi.version = ctx->at_version;
4645
4646 map <string, bufferlist> attrs;
4647 bl.clear();
4648 encode(snapset, bl);
4649 attrs[SS_ATTR].claim(bl);
4650
4651 bl.clear();
4652 encode(head_obc->obs.oi, bl,
4653 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4654 attrs[OI_ATTR].claim(bl);
4655 t->setattrs(head_oid, attrs);
4656 }
4657
4658 *ctxp = std::move(ctx);
4659 return 0;
4660 }
4661
4662 void PrimaryLogPG::kick_snap_trim()
4663 {
4664 ceph_assert(is_active());
4665 ceph_assert(is_primary());
4666 if (is_clean() &&
4667 !state_test(PG_STATE_PREMERGE) &&
4668 !snap_trimq.empty()) {
4669 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOSNAPTRIM)) {
4670 dout(10) << __func__ << ": nosnaptrim set, not kicking" << dendl;
4671 } else {
4672 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
4673 snap_trimmer_machine.process_event(KickTrim());
4674 }
4675 }
4676 }
4677
4678 void PrimaryLogPG::snap_trimmer_scrub_complete()
4679 {
4680 if (is_primary() && is_active() && is_clean()) {
4681 ceph_assert(!snap_trimq.empty());
4682 snap_trimmer_machine.process_event(ScrubComplete());
4683 }
4684 }
4685
4686 void PrimaryLogPG::snap_trimmer(epoch_t queued)
4687 {
4688 if (deleting || pg_has_reset_since(queued)) {
4689 return;
4690 }
4691
4692 ceph_assert(is_primary());
4693
4694 dout(10) << "snap_trimmer posting" << dendl;
4695 snap_trimmer_machine.process_event(DoSnapWork());
4696 dout(10) << "snap_trimmer complete" << dendl;
4697 return;
4698 }
4699
4700 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4701 {
4702 __u64 v2;
4703
4704 string v2s(xattr.c_str(), xattr.length());
4705 if (v2s.length())
4706 v2 = strtoull(v2s.c_str(), NULL, 10);
4707 else
4708 v2 = 0;
4709
4710 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4711
4712 switch (op) {
4713 case CEPH_OSD_CMPXATTR_OP_EQ:
4714 return (v1 == v2);
4715 case CEPH_OSD_CMPXATTR_OP_NE:
4716 return (v1 != v2);
4717 case CEPH_OSD_CMPXATTR_OP_GT:
4718 return (v1 > v2);
4719 case CEPH_OSD_CMPXATTR_OP_GTE:
4720 return (v1 >= v2);
4721 case CEPH_OSD_CMPXATTR_OP_LT:
4722 return (v1 < v2);
4723 case CEPH_OSD_CMPXATTR_OP_LTE:
4724 return (v1 <= v2);
4725 default:
4726 return -EINVAL;
4727 }
4728 }
4729
4730 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4731 {
4732 string v2s(xattr.c_str(), xattr.length());
4733
4734 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4735
4736 switch (op) {
4737 case CEPH_OSD_CMPXATTR_OP_EQ:
4738 return (v1s.compare(v2s) == 0);
4739 case CEPH_OSD_CMPXATTR_OP_NE:
4740 return (v1s.compare(v2s) != 0);
4741 case CEPH_OSD_CMPXATTR_OP_GT:
4742 return (v1s.compare(v2s) > 0);
4743 case CEPH_OSD_CMPXATTR_OP_GTE:
4744 return (v1s.compare(v2s) >= 0);
4745 case CEPH_OSD_CMPXATTR_OP_LT:
4746 return (v1s.compare(v2s) < 0);
4747 case CEPH_OSD_CMPXATTR_OP_LTE:
4748 return (v1s.compare(v2s) <= 0);
4749 default:
4750 return -EINVAL;
4751 }
4752 }
4753
4754 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4755 {
4756 ceph_osd_op& op = osd_op.op;
4757 vector<OSDOp> write_ops(1);
4758 OSDOp& write_op = write_ops[0];
4759 uint64_t write_length = op.writesame.length;
4760 int result = 0;
4761
4762 if (!write_length)
4763 return 0;
4764
4765 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4766 return -EINVAL;
4767
4768 if (op.writesame.data_length != osd_op.indata.length()) {
4769 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4770 return -EINVAL;
4771 }
4772
4773 while (write_length) {
4774 write_op.indata.append(osd_op.indata);
4775 write_length -= op.writesame.data_length;
4776 }
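// write_op.indata now holds the client-supplied pattern repeated
// write_length / data_length times, i.e. exactly op.writesame.length bytes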
4777
4778 write_op.op.op = CEPH_OSD_OP_WRITE;
4779 write_op.op.extent.offset = op.writesame.offset;
4780 write_op.op.extent.length = op.writesame.length;
4781 result = do_osd_ops(ctx, write_ops);
4782 if (result < 0)
4783 derr << "do_writesame do_osd_ops failed " << result << dendl;
4784
4785 return result;
4786 }
4787
4788 // ========================================================================
4789 // low level osd ops
4790
4791 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4792 {
4793 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4794 bufferlist header, vals;
4795 int r = _get_tmap(ctx, &header, &vals);
4796 if (r < 0) {
4797 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4798 r = 0;
4799 return r;
4800 }
4801
4802 vector<OSDOp> ops(3);
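// rewrite the object in three steps: truncate away the tmap payload, then
// install the tmap header as the omap header and the tmap key/value pairs
// as omap entries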
4803
4804 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4805 ops[0].op.extent.offset = 0;
4806 ops[0].op.extent.length = 0;
4807
4808 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4809 ops[1].indata.claim(header);
4810
4811 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4812 ops[2].indata.claim(vals);
4813
4814 return do_osd_ops(ctx, ops);
4815 }
4816
4817 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::const_iterator& bp,
4818 OSDOp& osd_op, bufferlist& bl)
4819 {
4820 // decode
4821 bufferlist header;
4822 map<string, bufferlist> m;
4823 if (bl.length()) {
4824 auto p = bl.cbegin();
4825 decode(header, p);
4826 decode(m, p);
4827 ceph_assert(p.end());
4828 }
4829
4830 // do the update(s)
4831 while (!bp.end()) {
4832 __u8 op;
4833 string key;
4834 decode(op, bp);
4835
4836 switch (op) {
4837 case CEPH_OSD_TMAP_SET: // insert key
4838 {
4839 decode(key, bp);
4840 bufferlist data;
4841 decode(data, bp);
4842 m[key] = data;
4843 }
4844 break;
4845 case CEPH_OSD_TMAP_RM: // remove key
4846 decode(key, bp);
4847 if (!m.count(key)) {
4848 return -ENOENT;
4849 }
4850 m.erase(key);
4851 break;
4852 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4853 decode(key, bp);
4854 m.erase(key);
4855 break;
4856 case CEPH_OSD_TMAP_HDR: // update header
4857 {
4858 decode(header, bp);
4859 }
4860 break;
4861 default:
4862 return -EINVAL;
4863 }
4864 }
4865
4866 // reencode
4867 bufferlist obl;
4868 encode(header, obl);
4869 encode(m, obl);
4870
4871 // write it out
4872 vector<OSDOp> nops(1);
4873 OSDOp& newop = nops[0];
4874 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4875 newop.op.extent.offset = 0;
4876 newop.op.extent.length = obl.length();
4877 newop.indata = obl;
4878 do_osd_ops(ctx, nops);
4879 osd_op.outdata.claim(newop.outdata);
4880 return 0;
4881 }
4882
4883 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::const_iterator& bp, OSDOp& osd_op)
4884 {
4885 bufferlist::const_iterator orig_bp = bp;
4886 int result = 0;
4887 if (bp.end()) {
4888 dout(10) << "tmapup is a no-op" << dendl;
4889 } else {
4890 // read the whole object
4891 vector<OSDOp> nops(1);
4892 OSDOp& newop = nops[0];
4893 newop.op.op = CEPH_OSD_OP_READ;
4894 newop.op.extent.offset = 0;
4895 newop.op.extent.length = 0;
4896 result = do_osd_ops(ctx, nops);
4897
4898 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4899
4900 dout(30) << " starting is \n";
4901 newop.outdata.hexdump(*_dout);
4902 *_dout << dendl;
4903
4904 auto ip = newop.outdata.cbegin();
4905 bufferlist obl;
4906
4907 dout(30) << "the update command is: \n";
4908 osd_op.indata.hexdump(*_dout);
4909 *_dout << dendl;
4910
4911 // header
4912 bufferlist header;
4913 __u32 nkeys = 0;
4914 if (newop.outdata.length()) {
4915 decode(header, ip);
4916 decode(nkeys, ip);
4917 }
4918 dout(10) << "tmapup header " << header.length() << dendl;
4919
4920 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4921 ++bp;
4922 decode(header, bp);
4923 dout(10) << "tmapup new header " << header.length() << dendl;
4924 }
4925
4926 encode(header, obl);
4927
4928 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4929
4930 // update keys
4931 bufferlist newkeydata;
4932 string nextkey, last_in_key;
4933 bufferlist nextval;
4934 bool have_next = false;
4935 if (!ip.end()) {
4936 have_next = true;
4937 decode(nextkey, ip);
4938 decode(nextval, ip);
4939 }
4940 while (!bp.end() && !result) {
4941 __u8 op;
4942 string key;
4943 try {
4944 decode(op, bp);
4945 decode(key, bp);
4946 }
4947 catch (buffer::error& e) {
4948 return -EINVAL;
4949 }
4950 if (key < last_in_key) {
4951 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4952 << "', falling back to an inefficient (unsorted) update" << dendl;
4953 bp = orig_bp;
4954 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4955 }
4956 last_in_key = key;
4957
4958 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4959
4960 // skip existing intervening keys
4961 bool key_exists = false;
4962 while (have_next && !key_exists) {
4963 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4964 if (nextkey > key)
4965 break;
4966 if (nextkey < key) {
4967 // copy untouched.
4968 encode(nextkey, newkeydata);
4969 encode(nextval, newkeydata);
4970 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4971 } else {
4972 // don't copy; discard old value. and stop.
4973 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4974 key_exists = true;
4975 nkeys--;
4976 }
4977 if (!ip.end()) {
4978 decode(nextkey, ip);
4979 decode(nextval, ip);
4980 } else {
4981 have_next = false;
4982 }
4983 }
4984
4985 if (op == CEPH_OSD_TMAP_SET) {
4986 bufferlist val;
4987 try {
4988 decode(val, bp);
4989 }
4990 catch (buffer::error& e) {
4991 return -EINVAL;
4992 }
4993 encode(key, newkeydata);
4994 encode(val, newkeydata);
4995 dout(20) << " set " << key << " " << val.length() << dendl;
4996 nkeys++;
4997 } else if (op == CEPH_OSD_TMAP_CREATE) {
4998 if (key_exists) {
4999 return -EEXIST;
5000 }
5001 bufferlist val;
5002 try {
5003 decode(val, bp);
5004 }
5005 catch (buffer::error& e) {
5006 return -EINVAL;
5007 }
5008 encode(key, newkeydata);
5009 encode(val, newkeydata);
5010 dout(20) << " create " << key << " " << val.length() << dendl;
5011 nkeys++;
5012 } else if (op == CEPH_OSD_TMAP_RM) {
5013 // do nothing.
5014 if (!key_exists) {
5015 return -ENOENT;
5016 }
5017 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
5018 // do nothing
5019 } else {
5020 dout(10) << " invalid tmap op " << (int)op << dendl;
5021 return -EINVAL;
5022 }
5023 }
5024
5025 // copy remaining
5026 if (have_next) {
5027 encode(nextkey, newkeydata);
5028 encode(nextval, newkeydata);
5029 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
5030 }
5031 if (!ip.end()) {
5032 bufferlist rest;
5033 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
5034 dout(20) << " keep trailing " << rest.length()
5035 << " at " << newkeydata.length() << dendl;
5036 newkeydata.claim_append(rest);
5037 }
5038
5039 // encode final key count + key data
5040 dout(20) << "tmapup final nkeys " << nkeys << dendl;
5041 encode(nkeys, obl);
5042 obl.claim_append(newkeydata);
5043
5044 if (0) {
5045 dout(30) << " final is \n";
5046 obl.hexdump(*_dout);
5047 *_dout << dendl;
5048
5049 // sanity check
5050 auto tp = obl.cbegin();
5051 bufferlist h;
5052 decode(h, tp);
5053 map<string,bufferlist> d;
5054 decode(d, tp);
5055 ceph_assert(tp.end());
5056 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
5057 }
5058
5059 // write it out
5060 if (!result) {
5061 dout(20) << "tmapput write " << obl.length() << dendl;
5062 newop.op.op = CEPH_OSD_OP_WRITEFULL;
5063 newop.op.extent.offset = 0;
5064 newop.op.extent.length = obl.length();
5065 newop.indata = obl;
5066 do_osd_ops(ctx, nops);
5067 osd_op.outdata.claim(newop.outdata);
5068 }
5069 }
5070 return result;
5071 }
5072
5073 static int check_offset_and_length(uint64_t offset, uint64_t length,
5074 uint64_t max, DoutPrefixProvider *dpp)
5075 {
5076 if (offset >= max ||
5077 length > max ||
5078 offset + length > max) {
5079 ldpp_dout(dpp, 10) << __func__ << " "
5080 << "osd_max_object_size: " << max
5081 << "; Hard limit of object size is 4GB." << dendl;
5082 return -EFBIG;
5083 }
5084
5085 return 0;
5086 }
5087
5088 struct FillInVerifyExtent : public Context {
5089 ceph_le64 *r;
5090 int32_t *rval;
5091 bufferlist *outdatap;
5092 boost::optional<uint32_t> maybe_crc;
5093 uint64_t size;
5094 OSDService *osd;
5095 hobject_t soid;
5096 __le32 flags;
5097 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
5098 boost::optional<uint32_t> mc, uint64_t size,
5099 OSDService *osd, hobject_t soid, __le32 flags) :
5100 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
5101 size(size), osd(osd), soid(soid), flags(flags) {}
5102 void finish(int len) override {
5103 *r = len;
5104 if (len < 0) {
5105 *rval = len;
5106 return;
5107 }
5108 *rval = 0;
5109
5110 // whole object? can we verify the checksum?
5111 if (maybe_crc && *r == size) {
5112 uint32_t crc = outdatap->crc32c(-1);
5113 if (maybe_crc != crc) {
5114 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
5115 << " != expected 0x" << *maybe_crc
5116 << std::dec << " on " << soid;
5117 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
5118 *rval = -EIO;
5119 *r = 0;
5120 }
5121 }
5122 }
5123 }
5124 };
5125
5126 struct ToSparseReadResult : public Context {
5127 int* result;
5128 bufferlist* data_bl;
5129 uint64_t data_offset;
5130 ceph_le64* len;
5131 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
5132 ceph_le64* len)
5133 : result(result), data_bl(bl), data_offset(offset), len(len) {}
5134 void finish(int r) override {
5135 if (r < 0) {
5136 *result = r;
5137 return;
5138 }
5139 *result = 0;
5140 *len = r;
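// repackage the flat read as a sparse-read style reply: a single-extent
// map {data_offset -> length} followed by the data itself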
5141 bufferlist outdata;
5142 map<uint64_t, uint64_t> extents = {{data_offset, r}};
5143 encode(extents, outdata);
5144 ::encode_destructively(*data_bl, outdata);
5145 data_bl->swap(outdata);
5146 }
5147 };
5148
5149 template<typename V>
5150 static string list_keys(const map<string, V>& m) {
5151 string s;
5152 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5153 if (!s.empty()) {
5154 s.push_back(',');
5155 }
5156 s.append(itr->first);
5157 }
5158 return s;
5159 }
5160
5161 template<typename T>
5162 static string list_entries(const T& m) {
5163 string s;
5164 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
5165 if (!s.empty()) {
5166 s.push_back(',');
5167 }
5168 s.append(*itr);
5169 }
5170 return s;
5171 }
5172
5173 void PrimaryLogPG::maybe_create_new_object(
5174 OpContext *ctx,
5175 bool ignore_transaction)
5176 {
5177 ObjectState& obs = ctx->new_obs;
5178 if (!obs.exists) {
5179 ctx->delta_stats.num_objects++;
5180 obs.exists = true;
5181 ceph_assert(!obs.oi.is_whiteout());
5182 obs.oi.new_object();
5183 if (!ignore_transaction)
5184 ctx->op_t->create(obs.oi.soid);
5185 } else if (obs.oi.is_whiteout()) {
5186 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
5187 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
5188 --ctx->delta_stats.num_whiteouts;
5189 }
5190 }
5191
5192 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
5193 OSDOp& osd_op;
5194
5195 explicit ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
5196 }
5197
5198 int execute() override {
5199 return osd_op.rval;
5200 }
5201 };
5202
5203 struct C_ChecksumRead : public Context {
5204 PrimaryLogPG *primary_log_pg;
5205 OSDOp &osd_op;
5206 Checksummer::CSumType csum_type;
5207 bufferlist init_value_bl;
5208 ceph_le64 read_length;
5209 bufferlist read_bl;
5210 Context *fill_extent_ctx;
5211
5212 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5213 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
5214 boost::optional<uint32_t> maybe_crc, uint64_t size,
5215 OSDService *osd, hobject_t soid, __le32 flags)
5216 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5217 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
5218 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5219 &read_bl, maybe_crc, size,
5220 osd, soid, flags)) {
5221 }
5222 ~C_ChecksumRead() override {
5223 delete fill_extent_ctx;
5224 }
5225
5226 void finish(int r) override {
5227 fill_extent_ctx->complete(r);
5228 fill_extent_ctx = nullptr;
5229
5230 if (osd_op.rval >= 0) {
5231 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5232 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
5233 &init_value_bl_it, read_bl);
5234 }
5235 }
5236 };
5237
5238 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
5239 bufferlist::const_iterator *bl_it)
5240 {
5241 dout(20) << __func__ << dendl;
5242
5243 auto& op = osd_op.op;
5244 if (op.checksum.chunk_size > 0) {
5245 if (op.checksum.length == 0) {
5246 dout(10) << __func__ << ": length required when chunk size provided"
5247 << dendl;
5248 return -EINVAL;
5249 }
5250 if (op.checksum.length % op.checksum.chunk_size != 0) {
5251 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
5252 return -EINVAL;
5253 }
5254 }
5255
5256 auto& oi = ctx->new_obs.oi;
5257 if (op.checksum.offset == 0 && op.checksum.length == 0) {
5258 // zeroed offset+length implies checksum whole object
5259 op.checksum.length = oi.size;
5260 } else if (op.checksum.offset >= oi.size) {
5261 // read size was trimmed to zero, do nothing
5262 // see PrimaryLogPG::do_read
5263 return 0;
5264 } else if (op.extent.offset + op.extent.length > oi.size) {
5265 op.extent.length = oi.size - op.extent.offset;
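// note: ceph_osd_op is a union, so op.extent.{offset,length} alias
// op.checksum.{offset,length}; this trims the checksum range to oi.size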
5266 if (op.checksum.chunk_size > 0 &&
5267 op.checksum.length % op.checksum.chunk_size != 0) {
5268 dout(10) << __func__ << ": length (trimmed to 0x"
5269 << std::hex << op.checksum.length
5270 << ") not aligned to chunk size 0x"
5271 << op.checksum.chunk_size << std::dec
5272 << dendl;
5273 return -EINVAL;
5274 }
5275 }
5276
5277 Checksummer::CSumType csum_type;
5278 switch (op.checksum.type) {
5279 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
5280 csum_type = Checksummer::CSUM_XXHASH32;
5281 break;
5282 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
5283 csum_type = Checksummer::CSUM_XXHASH64;
5284 break;
5285 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
5286 csum_type = Checksummer::CSUM_CRC32C;
5287 break;
5288 default:
5289 dout(10) << __func__ << ": unknown crc type ("
5290 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
5291 return -EINVAL;
5292 }
5293
5294 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
5295 if (bl_it->get_remaining() < csum_init_value_size) {
5296 dout(10) << __func__ << ": init value not provided" << dendl;
5297 return -EINVAL;
5298 }
5299
5300 bufferlist init_value_bl;
5301 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
5302 csum_init_value_size);
5303 bl_it->advance(csum_init_value_size);
5304
5305 if (pool.info.is_erasure() && op.checksum.length > 0) {
5306 // If there is a data digest and it is possible we are reading
5307 // entire object, pass the digest.
5308 boost::optional<uint32_t> maybe_crc;
5309 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5310 op.checksum.length >= oi.size) {
5311 maybe_crc = oi.data_digest;
5312 }
5313
5314 // async read
5315 auto& soid = oi.soid;
5316 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
5317 std::move(init_value_bl), maybe_crc,
5318 oi.size, osd, soid, op.flags);
5319
5320 ctx->pending_async_reads.push_back({
5321 {op.checksum.offset, op.checksum.length, op.flags},
5322 {&checksum_ctx->read_bl, checksum_ctx}});
5323
5324 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5325 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5326 new ReadFinisher(osd_op));
5327 return -EINPROGRESS;
5328 }
5329
5330 // sync read
5331 std::vector<OSDOp> read_ops(1);
5332 auto& read_op = read_ops[0];
5333 if (op.checksum.length > 0) {
5334 read_op.op.op = CEPH_OSD_OP_READ;
5335 read_op.op.flags = op.flags;
5336 read_op.op.extent.offset = op.checksum.offset;
5337 read_op.op.extent.length = op.checksum.length;
5338 read_op.op.extent.truncate_size = 0;
5339 read_op.op.extent.truncate_seq = 0;
5340
5341 int r = do_osd_ops(ctx, read_ops);
5342 if (r < 0) {
5343 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
5344 return r;
5345 }
5346 }
5347
5348 bufferlist::const_iterator init_value_bl_it = init_value_bl.begin();
5349 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
5350 read_op.outdata);
5351 }
5352
5353 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
5354 Checksummer::CSumType csum_type,
5355 bufferlist::const_iterator *init_value_bl_it,
5356 const bufferlist &read_bl) {
5357 dout(20) << __func__ << dendl;
5358
5359 auto& op = osd_op.op;
5360
5361 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
5362 derr << __func__ << ": bytes read " << read_bl.length() << " != "
5363 << op.checksum.length << dendl;
5364 return -EINVAL;
5365 }
5366
5367 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
5368 op.checksum.chunk_size : read_bl.length());
5369 uint32_t csum_count = (csum_chunk_size > 0 ?
5370 read_bl.length() / csum_chunk_size : 0);
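// reply layout: csum_count followed by csum_count fixed-size checksum
// values, one per chunk (a single value covering the whole read when
// chunk_size == 0)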
5371
5372 bufferlist csum;
5373 bufferptr csum_data;
5374 if (csum_count > 0) {
5375 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
5376 csum_data = buffer::create(csum_value_size * csum_count);
5377 csum_data.zero();
5378 csum.append(csum_data);
5379
5380 switch (csum_type) {
5381 case Checksummer::CSUM_XXHASH32:
5382 {
5383 Checksummer::xxhash32::init_value_t init_value;
5384 decode(init_value, *init_value_bl_it);
5385 Checksummer::calculate<Checksummer::xxhash32>(
5386 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5387 &csum_data);
5388 }
5389 break;
5390 case Checksummer::CSUM_XXHASH64:
5391 {
5392 Checksummer::xxhash64::init_value_t init_value;
5393 decode(init_value, *init_value_bl_it);
5394 Checksummer::calculate<Checksummer::xxhash64>(
5395 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5396 &csum_data);
5397 }
5398 break;
5399 case Checksummer::CSUM_CRC32C:
5400 {
5401 Checksummer::crc32c::init_value_t init_value;
5402 decode(init_value, *init_value_bl_it);
5403 Checksummer::calculate<Checksummer::crc32c>(
5404 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
5405 &csum_data);
5406 }
5407 break;
5408 default:
5409 break;
5410 }
5411 }
5412
5413 encode(csum_count, osd_op.outdata);
5414 osd_op.outdata.claim_append(csum);
5415 return 0;
5416 }
5417
5418 struct C_ExtentCmpRead : public Context {
5419 PrimaryLogPG *primary_log_pg;
5420 OSDOp &osd_op;
5421 ceph_le64 read_length{};
5422 bufferlist read_bl;
5423 Context *fill_extent_ctx;
5424
5425 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
5426 boost::optional<uint32_t> maybe_crc, uint64_t size,
5427 OSDService *osd, hobject_t soid, __le32 flags)
5428 : primary_log_pg(primary_log_pg), osd_op(osd_op),
5429 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
5430 &read_bl, maybe_crc, size,
5431 osd, soid, flags)) {
5432 }
5433 ~C_ExtentCmpRead() override {
5434 delete fill_extent_ctx;
5435 }
5436
5437 void finish(int r) override {
5438 if (r == -ENOENT) {
5439 osd_op.rval = 0;
5440 read_bl.clear();
5441 delete fill_extent_ctx;
5442 } else {
5443 fill_extent_ctx->complete(r);
5444 }
5445 fill_extent_ctx = nullptr;
5446
5447 if (osd_op.rval >= 0) {
5448 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
5449 }
5450 }
5451 };
5452
5453 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
5454 {
5455 dout(20) << __func__ << dendl;
5456 ceph_osd_op& op = osd_op.op;
5457
5458 auto& oi = ctx->new_obs.oi;
5459 uint64_t size = oi.size;
5460 if ((oi.truncate_seq < op.extent.truncate_seq) &&
5461 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
5462 size = op.extent.truncate_size;
5463 }
5464
5465 if (op.extent.offset >= size) {
5466 op.extent.length = 0;
5467 } else if (op.extent.offset + op.extent.length > size) {
5468 op.extent.length = size - op.extent.offset;
5469 }
5470
5471 if (op.extent.length == 0) {
5472 dout(20) << __func__ << " zero length extent" << dendl;
5473 return finish_extent_cmp(osd_op, bufferlist{});
5474 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
5475 dout(20) << __func__ << " object DNE" << dendl;
5476 return finish_extent_cmp(osd_op, {});
5477 } else if (pool.info.is_erasure()) {
5478 // If there is a data digest and it is possible we are reading
5479 // entire object, pass the digest.
5480 boost::optional<uint32_t> maybe_crc;
5481 if (oi.is_data_digest() && op.checksum.offset == 0 &&
5482 op.checksum.length >= oi.size) {
5483 maybe_crc = oi.data_digest;
5484 }
5485
5486 // async read
5487 auto& soid = oi.soid;
5488 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
5489 osd, soid, op.flags);
5490 ctx->pending_async_reads.push_back({
5491 {op.extent.offset, op.extent.length, op.flags},
5492 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
5493
5494 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
5495
5496 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5497 new ReadFinisher(osd_op));
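    // Returning -EINPROGRESS parks this op; when the async read completes,
    // the op is re-executed and the ReadFinisher above finishes the
    // comparison via the op_finisher path.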
5498 return -EINPROGRESS;
5499 }
5500
5501 // sync read
5502 vector<OSDOp> read_ops(1);
5503 OSDOp& read_op = read_ops[0];
5504
5505 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
5506 read_op.op.extent.offset = op.extent.offset;
5507 read_op.op.extent.length = op.extent.length;
5508 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
5509 read_op.op.extent.truncate_size = op.extent.truncate_size;
5510
5511 int result = do_osd_ops(ctx, read_ops);
5512 if (result < 0) {
5513 derr << __func__ << " failed " << result << dendl;
5514 return result;
5515 }
5516 return finish_extent_cmp(osd_op, read_op.outdata);
5517 }
5518
5519 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
5520 {
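  // Compare the client-supplied buffer byte-for-byte against what was read;
  // a short read is implicitly zero-padded. On the first mismatch, the
  // offset is folded into the error code as (-MAX_ERRNO - idx) so the
  // caller can recover the mismatching byte position.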
5521 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
5522 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
5523 if (osd_op.indata[idx] != read_byte) {
5524 return (-MAX_ERRNO - idx);
5525 }
5526 }
5527
5528 return 0;
5529 }
5530
5531 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
5532 dout(20) << __func__ << dendl;
5533 auto& op = osd_op.op;
5534 auto& oi = ctx->new_obs.oi;
5535 auto& soid = oi.soid;
5536 __u32 seq = oi.truncate_seq;
5537 uint64_t size = oi.size;
5538 bool trimmed_read = false;
5539
5540 dout(30) << __func__ << " oi.size: " << oi.size << dendl;
5541 dout(30) << __func__ << " oi.truncate_seq: " << oi.truncate_seq << dendl;
5542 dout(30) << __func__ << " op.extent.truncate_seq: " << op.extent.truncate_seq << dendl;
5543 dout(30) << __func__ << " op.extent.truncate_size: " << op.extent.truncate_size << dendl;
5544
5545 // are we beyond truncate_size?
5546 if ( (seq < op.extent.truncate_seq) &&
5547 (op.extent.offset + op.extent.length > op.extent.truncate_size) &&
5548 (size > op.extent.truncate_size) )
5549 size = op.extent.truncate_size;
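  // Illustrative example: if the object was truncated to 4096 bytes with
  // truncate_seq 2, a read of 0~8192 still carrying truncate_seq 1 must not
  // see data past 4096, so the effective size is clamped here.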
5550
5551 if (op.extent.length == 0) // a length of zero means read the whole object
5552 op.extent.length = size;
5553
5554 if (op.extent.offset >= size) {
5555 op.extent.length = 0;
5556 trimmed_read = true;
5557 } else if (op.extent.offset + op.extent.length > size) {
5558 op.extent.length = size - op.extent.offset;
5559 trimmed_read = true;
5560 }
5561
5562 dout(30) << __func__ << " op.extent.length is now " << op.extent.length << dendl;
5563
5564 // read into a buffer
5565 int result = 0;
5566 if (trimmed_read && op.extent.length == 0) {
5567 // the read was trimmed to zero, so we deliberately do nothing here.
5568 // a client-requested read of 0 bytes means "read the whole object"
5569 // (see above), which is why the trimmed_read flag is needed
5570 } else if (pool.info.is_erasure()) {
5571 // The initialisation below is required to silence a false positive
5572 // -Wmaybe-uninitialized warning
5573 boost::optional<uint32_t> maybe_crc = boost::make_optional(false, uint32_t());
5574 // If there is a data digest and it is possible we are reading
5575 // entire object, pass the digest. FillInVerifyExtent will
5576 // check the oi.size again.
5577 if (oi.is_data_digest() && op.extent.offset == 0 &&
5578 op.extent.length >= oi.size)
5579 maybe_crc = oi.data_digest;
5580 ctx->pending_async_reads.push_back(
5581 make_pair(
5582 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
5583 make_pair(&osd_op.outdata,
5584 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
5585 &osd_op.outdata, maybe_crc, oi.size,
5586 osd, soid, op.flags))));
5587 dout(10) << " async_read noted for " << soid << dendl;
5588
5589 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5590 new ReadFinisher(osd_op));
5591 } else {
5592 int r = pgbackend->objects_read_sync(
5593 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
5594 // whole object? can we verify the checksum?
5595 if (r >= 0 && op.extent.offset == 0 &&
5596 (uint64_t)r == oi.size && oi.is_data_digest()) {
5597 uint32_t crc = osd_op.outdata.crc32c(-1);
5598 if (oi.data_digest != crc) {
5599 osd->clog->error() << info.pgid << std::hex
5600 << " full-object read crc 0x" << crc
5601 << " != expected 0x" << oi.data_digest
5602 << std::dec << " on " << soid;
5603 r = -EIO; // try repair later
5604 }
5605 }
5606 if (r == -EIO) {
5607 r = rep_repair_primary_object(soid, ctx);
5608 }
5609 if (r >= 0)
5610 op.extent.length = r;
5611 else if (r == -EAGAIN) {
5612 result = -EAGAIN;
5613 } else {
5614 result = r;
5615 op.extent.length = 0;
5616 }
5617 dout(10) << " read got " << r << " / " << op.extent.length
5618 << " bytes from obj " << soid << dendl;
5619 }
5620 if (result >= 0) {
5621 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5622 ctx->delta_stats.num_rd++;
5623 }
5624 return result;
5625 }
5626
5627 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
5628 dout(20) << __func__ << dendl;
5629 auto& op = osd_op.op;
5630 auto& oi = ctx->new_obs.oi;
5631 auto& soid = oi.soid;
5632
5633 if (op.extent.truncate_seq) {
5634 dout(0) << "sparse_read does not support a truncation sequence" << dendl;
5635 return -EINVAL;
5636 }
5637
5638 ++ctx->num_read;
5639 if (pool.info.is_erasure()) {
5640 // translate sparse read to a normal one if not supported
5641 uint64_t offset = op.extent.offset;
5642 uint64_t length = op.extent.length;
5643 if (offset > oi.size) {
5644 length = 0;
5645 } else if (offset + length > oi.size) {
5646 length = oi.size - offset;
5647 }
5648
5649 if (length > 0) {
5650 ctx->pending_async_reads.push_back(
5651 make_pair(
5652 boost::make_tuple(offset, length, op.flags),
5653 make_pair(
5654 &osd_op.outdata,
5655 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
5656 &op.extent.length))));
5657 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
5658
5659 ctx->op_finishers[ctx->current_osd_subop_num].reset(
5660 new ReadFinisher(osd_op));
5661 } else {
5662 dout(10) << " sparse read ended up empty for " << soid << dendl;
5663 map<uint64_t, uint64_t> extents;
5664 encode(extents, osd_op.outdata);
5665 }
5666 } else {
5667 // read into a buffer
5668 map<uint64_t, uint64_t> m;
5669 uint32_t total_read = 0;
5670 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5671 info.pgid.shard),
5672 op.extent.offset, op.extent.length, m);
5673 if (r < 0) {
5674 return r;
5675 }
5676
5677 map<uint64_t, uint64_t>::iterator miter;
5678 bufferlist data_bl;
5679 uint64_t last = op.extent.offset;
5680 for (miter = m.begin(); miter != m.end(); ++miter) {
5681 // verify hole?
5682 if (cct->_conf->osd_verify_sparse_read_holes &&
5683 last < miter->first) {
5684 bufferlist t;
5685 uint64_t len = miter->first - last;
5686 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5687 if (r < 0) {
5688 osd->clog->error() << coll << " " << soid
5689 << " sparse-read failed to read: "
5690 << r;
5691 } else if (!t.is_zero()) {
5692 osd->clog->error() << coll << " " << soid
5693 << " sparse-read found data in hole "
5694 << last << "~" << len;
5695 }
5696 }
5697
5698 bufferlist tmpbl;
5699 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
5700 op.flags, &tmpbl);
5701 if (r == -EIO) {
5702 r = rep_repair_primary_object(soid, ctx);
5703 }
5704 if (r < 0) {
5705 return r;
5706 }
5707
5708 // this usually happens when we get an extent that exceeds the actual
5709 // object size
5710 if (r < (int)miter->second)
5711 miter->second = r;
5712 total_read += r;
5713 dout(10) << "sparse-read " << miter->first << "@" << miter->second
5714 << dendl;
5715 data_bl.claim_append(tmpbl);
5716 last = miter->first + r;
5717 }
5718
5719 // verify trailing hole?
5720 if (cct->_conf->osd_verify_sparse_read_holes) {
5721 uint64_t end = std::min<uint64_t>(op.extent.offset + op.extent.length,
5722 oi.size);
5723 if (last < end) {
5724 bufferlist t;
5725 uint64_t len = end - last;
5726 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5727 if (r < 0) {
5728 osd->clog->error() << coll << " " << soid
5729 << " sparse-read failed to read: " << r;
5730 } else if (!t.is_zero()) {
5731 osd->clog->error() << coll << " " << soid
5732 << " sparse-read found data in hole "
5733 << last << "~" << len;
5734 }
5735 }
5736 }
5737
5738 // Why does SPARSE_READ need a checksum? In fact, librbd always uses
5739 // sparse-read. At first there may not be many whole objects, but with
5740 // continued use more and more whole objects come to exist, so verifying
5741 // the data digest on sparse-read makes sense.
5742 if (total_read == oi.size && oi.is_data_digest()) {
5743 uint32_t crc = data_bl.crc32c(-1);
5744 if (oi.data_digest != crc) {
5745 osd->clog->error() << info.pgid << std::hex
5746 << " full-object read crc 0x" << crc
5747 << " != expected 0x" << oi.data_digest
5748 << std::dec << " on " << soid;
5749 r = rep_repair_primary_object(soid, ctx);
5750 if (r < 0) {
5751 return r;
5752 }
5753 }
5754 }
5755
5756 op.extent.length = total_read;
5757
5758 encode(m, osd_op.outdata); // re-encode since it might be modified
5759 ::encode_destructively(data_bl, osd_op.outdata);
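    // The reply is thus the (possibly trimmed) extent map followed by the
    // concatenated data for those extents.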
5760
5761 dout(10) << " sparse_read got " << total_read << " bytes from object "
5762 << soid << dendl;
5763 }
5764
5765 ctx->delta_stats.num_rd_kb += shift_round_up(op.extent.length, 10);
5766 ctx->delta_stats.num_rd++;
5767 return 0;
5768 }
5769
5770 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5771 {
5772 int result = 0;
5773 SnapSetContext *ssc = ctx->obc->ssc;
5774 ObjectState& obs = ctx->new_obs;
5775 object_info_t& oi = obs.oi;
5776 const hobject_t& soid = oi.soid;
5777 const bool skip_data_digest = osd->store->has_builtin_csum() &&
5778 osd->osd_skip_data_digest;
5779
5780 PGTransaction* t = ctx->op_t.get();
5781
5782 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5783
5784 ctx->current_osd_subop_num = 0;
5785 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5786 OSDOp& osd_op = *p;
5787 ceph_osd_op& op = osd_op.op;
5788
5789 OpFinisher* op_finisher = nullptr;
5790 {
5791 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5792 if (op_finisher_it != ctx->op_finishers.end()) {
5793 op_finisher = op_finisher_it->second.get();
5794 }
5795 }
5796
5797 // TODO: check endianness (__le32 vs uint32_t, etc.)
5798 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5799 // but the code in this function seems to treat them as native-endian. What should the
5800 // tracepoints do?
5801 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5802
5803 dout(10) << "do_osd_op " << osd_op << dendl;
5804
5805 auto bp = osd_op.indata.cbegin();
5806
5807 // user-visible modification?
5808 switch (op.op) {
5809 // non user-visible modifications
5810 case CEPH_OSD_OP_WATCH:
5811 case CEPH_OSD_OP_CACHE_EVICT:
5812 case CEPH_OSD_OP_CACHE_FLUSH:
5813 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5814 case CEPH_OSD_OP_UNDIRTY:
5815 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5816 case CEPH_OSD_OP_CACHE_PIN:
5817 case CEPH_OSD_OP_CACHE_UNPIN:
5818 case CEPH_OSD_OP_SET_REDIRECT:
5819 case CEPH_OSD_OP_TIER_PROMOTE:
5820 break;
5821 default:
5822 if (op.op & CEPH_OSD_OP_MODE_WR)
5823 ctx->user_modify = true;
5824 }
5825
5826 // munge -1 truncate to 0 truncate
5827 if (ceph_osd_op_uses_extent(op.op) &&
5828 op.extent.truncate_seq == 1 &&
5829 op.extent.truncate_size == (-1ULL)) {
5830 op.extent.truncate_size = 0;
5831 op.extent.truncate_seq = 0;
5832 }
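    // (a truncate_seq of 1 with truncate_size -1 denotes "no truncation";
    // normalize it to the seq 0 / size 0 form the code below expects.)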
5833
5834 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5835 if (op.op == CEPH_OSD_OP_ZERO &&
5836 obs.exists &&
5837 op.extent.offset < static_cast<Option::size_t>(osd->osd_max_object_size) &&
5838 op.extent.length >= 1 &&
5839 op.extent.length <= static_cast<Option::size_t>(osd->osd_max_object_size) &&
5840 op.extent.offset + op.extent.length >= oi.size) {
5841 if (op.extent.offset >= oi.size) {
5842 // no-op
5843 goto fail;
5844 }
5845 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5846 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5847 op.op = CEPH_OSD_OP_TRUNCATE;
5848 }
5849
5850 switch (op.op) {
5851
5852 // --- READS ---
5853
5854 case CEPH_OSD_OP_CMPEXT:
5855 ++ctx->num_read;
5856 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5857 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5858 op.extent.length, op.extent.truncate_size,
5859 op.extent.truncate_seq);
5860
5861 if (op_finisher == nullptr) {
5862 result = do_extent_cmp(ctx, osd_op);
5863 } else {
5864 result = op_finisher->execute();
5865 }
5866 break;
5867
5868 case CEPH_OSD_OP_SYNC_READ:
5869 if (pool.info.is_erasure()) {
5870 result = -EOPNOTSUPP;
5871 break;
5872 }
5873 // fall through
5874 case CEPH_OSD_OP_READ:
5875 ++ctx->num_read;
5876 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5877 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5878 op.extent.length, op.extent.truncate_size,
5879 op.extent.truncate_seq);
5880 if (op_finisher == nullptr) {
5881 if (!ctx->data_off) {
5882 ctx->data_off = op.extent.offset;
5883 }
5884 result = do_read(ctx, osd_op);
5885 } else {
5886 result = op_finisher->execute();
5887 }
5888 break;
5889
5890 case CEPH_OSD_OP_CHECKSUM:
5891 ++ctx->num_read;
5892 {
5893 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5894 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5895 op.checksum.offset, op.checksum.length,
5896 op.checksum.chunk_size);
5897
5898 if (op_finisher == nullptr) {
5899 result = do_checksum(ctx, osd_op, &bp);
5900 } else {
5901 result = op_finisher->execute();
5902 }
5903 }
5904 break;
5905
5906 /* map extents */
5907 case CEPH_OSD_OP_MAPEXT:
5908 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5909 if (pool.info.is_erasure()) {
5910 result = -EOPNOTSUPP;
5911 break;
5912 }
5913 ++ctx->num_read;
5914 {
5915 // read into a buffer
5916 bufferlist bl;
5917 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5918 info.pgid.shard),
5919 op.extent.offset, op.extent.length, bl);
5920 osd_op.outdata.claim(bl);
5921 if (r < 0)
5922 result = r;
5923 else
5924 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
5925 ctx->delta_stats.num_rd++;
5926 dout(10) << " map_extents done on object " << soid << dendl;
5927 }
5928 break;
5929
5930 /* map extents */
5931 case CEPH_OSD_OP_SPARSE_READ:
5932 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5933 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5934 op.extent.length, op.extent.truncate_size,
5935 op.extent.truncate_seq);
5936 if (op_finisher == nullptr) {
5937 result = do_sparse_read(ctx, osd_op);
5938 } else {
5939 result = op_finisher->execute();
5940 }
5941 break;
5942
5943 case CEPH_OSD_OP_CALL:
5944 {
5945 string cname, mname;
5946 bufferlist indata;
5947 try {
5948 bp.copy(op.cls.class_len, cname);
5949 bp.copy(op.cls.method_len, mname);
5950 bp.copy(op.cls.indata_len, indata);
5951 } catch (buffer::error& e) {
5952 dout(10) << "call unable to decode class + method + indata" << dendl;
5953 dout(30) << "in dump: ";
5954 osd_op.indata.hexdump(*_dout);
5955 *_dout << dendl;
5956 result = -EINVAL;
5957 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5958 break;
5959 }
5960 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5961
5962 ClassHandler::ClassData *cls;
5963 result = osd->class_handler->open_class(cname, &cls);
5964 ceph_assert(result == 0); // init_op_flags() already verified this works.
5965
5966 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5967 if (!method) {
5968 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5969 result = -EOPNOTSUPP;
5970 break;
5971 }
5972
5973 int flags = method->get_flags();
5974 if (flags & CLS_METHOD_WR)
5975 ctx->user_modify = true;
5976
5977 bufferlist outdata;
5978 dout(10) << "call method " << cname << "." << mname << dendl;
5979 int prev_rd = ctx->num_read;
5980 int prev_wr = ctx->num_write;
5981 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5982
5983 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5984 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5985 result = -EIO;
5986 break;
5987 }
5988 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5989 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5990 result = -EIO;
5991 break;
5992 }
5993
5994 dout(10) << "method called response length=" << outdata.length() << dendl;
5995 op.extent.length = outdata.length();
5996 osd_op.outdata.claim_append(outdata);
5997 dout(30) << "out dump: ";
5998 osd_op.outdata.hexdump(*_dout);
5999 *_dout << dendl;
6000 }
6001 break;
6002
6003 case CEPH_OSD_OP_STAT:
6004 // note: stat does not require RD
6005 {
6006 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
6007
6008 if (obs.exists && !oi.is_whiteout()) {
6009 encode(oi.size, osd_op.outdata);
6010 encode(oi.mtime, osd_op.outdata);
6011 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
6012 } else {
6013 result = -ENOENT;
6014 dout(10) << "stat oi object does not exist" << dendl;
6015 }
6016
6017 ctx->delta_stats.num_rd++;
6018 }
6019 break;
6020
6021 case CEPH_OSD_OP_ISDIRTY:
6022 ++ctx->num_read;
6023 {
6024 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
6025 bool is_dirty = obs.oi.is_dirty();
6026 encode(is_dirty, osd_op.outdata);
6027 ctx->delta_stats.num_rd++;
6028 result = 0;
6029 }
6030 break;
6031
6032 case CEPH_OSD_OP_UNDIRTY:
6033 ++ctx->num_write;
6034 {
6035 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
6036 if (oi.is_dirty()) {
6037 ctx->undirty = true; // see make_writeable()
6038 ctx->modify = true;
6039 ctx->delta_stats.num_wr++;
6040 }
6041 result = 0;
6042 }
6043 break;
6044
6045 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
6046 ++ctx->num_write;
6047 {
6048 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
6049 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
6050 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
6051 result = -EINVAL;
6052 break;
6053 }
6054 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6055 result = -EINVAL;
6056 break;
6057 }
6058 if (!obs.exists) {
6059 result = 0;
6060 break;
6061 }
6062 if (oi.is_cache_pinned()) {
6063 dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
6064 result = -EPERM;
6065 break;
6066 }
6067 if (oi.is_dirty()) {
6068 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
6069 if (result == -EINPROGRESS)
6070 result = -EAGAIN;
6071 } else {
6072 result = 0;
6073 }
6074 }
6075 break;
6076
6077 case CEPH_OSD_OP_CACHE_FLUSH:
6078 ++ctx->num_write;
6079 {
6080 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
6081 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
6082 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
6083 result = -EINVAL;
6084 break;
6085 }
6086 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6087 result = -EINVAL;
6088 break;
6089 }
6090 if (!obs.exists) {
6091 result = 0;
6092 break;
6093 }
6094 if (oi.is_cache_pinned()) {
6095 dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
6096 result = -EPERM;
6097 break;
6098 }
6099 hobject_t missing;
6100 if (oi.is_dirty()) {
6101 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
6102 if (result == -EINPROGRESS)
6103 result = -EAGAIN;
6104 } else {
6105 result = 0;
6106 }
6107 // Check for the special return value, for which start_flush() has set 'missing'
6108 if (result == -ENOENT) {
6109 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
6110 ceph_assert(!missing.is_min());
6111 wait_for_unreadable_object(missing, ctx->op);
6112 // Error code which is used elsewhere when wait_for_unreadable_object() is used
6113 result = -EAGAIN;
6114 }
6115 }
6116 break;
6117
6118 case CEPH_OSD_OP_CACHE_EVICT:
6119 ++ctx->num_write;
6120 {
6121 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
6122 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
6123 result = -EINVAL;
6124 break;
6125 }
6126 if (!obs.exists) {
6127 result = 0;
6128 break;
6129 }
6130 if (oi.is_cache_pinned()) {
6131 dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
6132 result = -EPERM;
6133 break;
6134 }
6135 if (oi.is_dirty()) {
6136 result = -EBUSY;
6137 break;
6138 }
6139 if (!oi.watchers.empty()) {
6140 result = -EBUSY;
6141 break;
6142 }
6143 if (soid.snap == CEPH_NOSNAP) {
6144 result = _verify_no_head_clones(soid, ssc->snapset);
6145 if (result < 0)
6146 break;
6147 }
6148 result = _delete_oid(ctx, true, false);
6149 if (result >= 0) {
6150 // mark that this is a cache eviction to avoid triggering normal
6151 // make_writeable() clone creation in finish_ctx()
6152 ctx->cache_evict = true;
6153 }
6154 osd->logger->inc(l_osd_tier_evict);
6155 }
6156 break;
6157
6158 case CEPH_OSD_OP_GETXATTR:
6159 ++ctx->num_read;
6160 {
6161 string aname;
6162 bp.copy(op.xattr.name_len, aname);
6163 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6164 string name = "_" + aname;
6165 int r = getattr_maybe_cache(
6166 ctx->obc,
6167 name,
6168 &(osd_op.outdata));
6169 if (r >= 0) {
6170 op.xattr.value_len = osd_op.outdata.length();
6171 result = 0;
6172 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
6173 } else
6174 result = r;
6175
6176 ctx->delta_stats.num_rd++;
6177 }
6178 break;
6179
6180 case CEPH_OSD_OP_GETXATTRS:
6181 ++ctx->num_read;
6182 {
6183 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
6184 map<string, bufferlist> out;
6185 result = getattrs_maybe_cache(
6186 ctx->obc,
6187 &out);
6188
6189 bufferlist bl;
6190 encode(out, bl);
6191 ctx->delta_stats.num_rd_kb += shift_round_up(bl.length(), 10);
6192 ctx->delta_stats.num_rd++;
6193 osd_op.outdata.claim_append(bl);
6194 }
6195 break;
6196
6197 case CEPH_OSD_OP_CMPXATTR:
6198 ++ctx->num_read;
6199 {
6200 string aname;
6201 bp.copy(op.xattr.name_len, aname);
6202 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6203 string name = "_" + aname;
6204 name[op.xattr.name_len + 1] = 0;
6205
6206 bufferlist xattr;
6207 result = getattr_maybe_cache(
6208 ctx->obc,
6209 name,
6210 &xattr);
6211 if (result < 0 && result != -EEXIST && result != -ENODATA)
6212 break;
6213
6214 ctx->delta_stats.num_rd++;
6215 ctx->delta_stats.num_rd_kb += shift_round_up(xattr.length(), 10);
6216
6217 switch (op.xattr.cmp_mode) {
6218 case CEPH_OSD_CMPXATTR_MODE_STRING:
6219 {
6220 string val;
6221 bp.copy(op.xattr.value_len, val);
6222 val[op.xattr.value_len] = 0;
6223 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
6224 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6225 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
6226 }
6227 break;
6228
6229 case CEPH_OSD_CMPXATTR_MODE_U64:
6230 {
6231 uint64_t u64val;
6232 try {
6233 decode(u64val, bp);
6234 }
6235 catch (buffer::error& e) {
6236 result = -EINVAL;
6237 goto fail;
6238 }
6239 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
6240 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
6241 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
6242 }
6243 break;
6244
6245 default:
6246 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
6247 result = -EINVAL;
6248 }
6249
6250 if (!result) {
6251 dout(10) << "comparison returned false" << dendl;
6252 result = -ECANCELED;
6253 break;
6254 }
6255 if (result < 0) {
6256 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
6257 break;
6258 }
6259
6260 dout(10) << "comparison returned true" << dendl;
6261 }
6262 break;
6263
6264 case CEPH_OSD_OP_ASSERT_VER:
6265 ++ctx->num_read;
6266 {
6267 uint64_t ver = op.assert_ver.ver;
6268 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
6269 if (!ver)
6270 result = -EINVAL;
6271 else if (ver < oi.user_version)
6272 result = -ERANGE;
6273 else if (ver > oi.user_version)
6274 result = -EOVERFLOW;
6275 }
6276 break;
6277
6278 case CEPH_OSD_OP_LIST_WATCHERS:
6279 ++ctx->num_read;
6280 {
6281 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
6282 obj_list_watch_response_t resp;
6283
6284 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
6285 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
6286 ++oi_iter) {
6287 dout(20) << "key cookie=" << oi_iter->first.first
6288 << " entity=" << oi_iter->first.second << " "
6289 << oi_iter->second << dendl;
6290 ceph_assert(oi_iter->first.first == oi_iter->second.cookie);
6291 ceph_assert(oi_iter->first.second.is_client());
6292
6293 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
6294 oi_iter->second.timeout_seconds, oi_iter->second.addr);
6295 resp.entries.push_back(wi);
6296 }
6297
6298 resp.encode(osd_op.outdata, ctx->get_features());
6299 result = 0;
6300
6301 ctx->delta_stats.num_rd++;
6302 break;
6303 }
6304
6305 case CEPH_OSD_OP_LIST_SNAPS:
6306 ++ctx->num_read;
6307 {
6308 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
6309 obj_list_snap_response_t resp;
6310
6311 if (!ssc) {
6312 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
6313 }
6314 ceph_assert(ssc);
6315 dout(20) << " snapset " << ssc->snapset << dendl;
6316
6317 int clonecount = ssc->snapset.clones.size();
6318 clonecount++; // for head
6319 resp.clones.reserve(clonecount);
6320 for (auto clone_iter = ssc->snapset.clones.begin();
6321 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
6322 clone_info ci;
6323 ci.cloneid = *clone_iter;
6324
6325 hobject_t clone_oid = soid;
6326 clone_oid.snap = *clone_iter;
6327
6328 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
6329 if (p == ssc->snapset.clone_snaps.end()) {
6330 osd->clog->error() << "osd." << osd->whoami
6331 << ": inconsistent clone_snaps found for oid "
6332 << soid << " clone " << *clone_iter
6333 << " snapset " << ssc->snapset;
6334 result = -EINVAL;
6335 break;
6336 }
6337 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
6338 ci.snaps.push_back(*q);
6339 }
6340
6341 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
6342
6343 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
6344 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
6345 if (coi == ssc->snapset.clone_overlap.end()) {
6346 osd->clog->error() << "osd." << osd->whoami
6347 << ": inconsistent clone_overlap found for oid "
6348 << soid << " clone " << *clone_iter;
6349 result = -EINVAL;
6350 break;
6351 }
6352 const interval_set<uint64_t> &o = coi->second;
6353 ci.overlap.reserve(o.num_intervals());
6354 for (interval_set<uint64_t>::const_iterator r = o.begin();
6355 r != o.end(); ++r) {
6356 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
6357 r.get_len()));
6358 }
6359
6360 map<snapid_t, uint64_t>::const_iterator si;
6361 si = ssc->snapset.clone_size.find(ci.cloneid);
6362 if (si == ssc->snapset.clone_size.end()) {
6363 osd->clog->error() << "osd." << osd->whoami
6364 << ": inconsistent clone_size found for oid "
6365 << soid << " clone " << *clone_iter;
6366 result = -EINVAL;
6367 break;
6368 }
6369 ci.size = si->second;
6370
6371 resp.clones.push_back(ci);
6372 }
6373 if (result < 0) {
6374 break;
6375 }
6376 if (!ctx->obc->obs.oi.is_whiteout()) {
6377 ceph_assert(obs.exists);
6378 clone_info ci;
6379 ci.cloneid = CEPH_NOSNAP;
6380
6381 // Size for HEAD is oi.size
6382 ci.size = oi.size;
6383
6384 resp.clones.push_back(ci);
6385 }
6386 resp.seq = ssc->snapset.seq;
6387
6388 resp.encode(osd_op.outdata);
6389 result = 0;
6390
6391 ctx->delta_stats.num_rd++;
6392 break;
6393 }
6394
6395 case CEPH_OSD_OP_NOTIFY:
6396 ++ctx->num_read;
6397 {
6398 uint32_t timeout;
6399 bufferlist bl;
6400
6401 try {
6402 uint32_t ver; // obsolete
6403 decode(ver, bp);
6404 decode(timeout, bp);
6405 decode(bl, bp);
6406 } catch (const buffer::error &e) {
6407 timeout = 0;
6408 }
6409 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
6410 if (!timeout)
6411 timeout = cct->_conf->osd_default_notify_timeout;
6412
6413 notify_info_t n;
6414 n.timeout = timeout;
6415 n.notify_id = osd->get_next_id(get_osdmap_epoch());
6416 n.cookie = op.watch.cookie;
6417 n.bl = bl;
6418 ctx->notifies.push_back(n);
6419
6420 // return our unique notify id to the client
6421 encode(n.notify_id, osd_op.outdata);
6422 }
6423 break;
6424
6425 case CEPH_OSD_OP_NOTIFY_ACK:
6426 ++ctx->num_read;
6427 {
6428 try {
6429 uint64_t notify_id = 0;
6430 uint64_t watch_cookie = 0;
6431 decode(notify_id, bp);
6432 decode(watch_cookie, bp);
6433 bufferlist reply_bl;
6434 if (!bp.end()) {
6435 decode(reply_bl, bp);
6436 }
6437 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
6438 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
6439 ctx->notify_acks.push_back(ack);
6440 } catch (const buffer::error &e) {
6441 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
6442 OpContext::NotifyAck ack(
6443 // op.watch.cookie is actually the notify_id for historical reasons
6444 op.watch.cookie
6445 );
6446 ctx->notify_acks.push_back(ack);
6447 }
6448 }
6449 break;
6450
6451 case CEPH_OSD_OP_SETALLOCHINT:
6452 ++ctx->num_write;
6453 {
6454 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
6455 maybe_create_new_object(ctx);
6456 oi.expected_object_size = op.alloc_hint.expected_object_size;
6457 oi.expected_write_size = op.alloc_hint.expected_write_size;
6458 oi.alloc_hint_flags = op.alloc_hint.flags;
6459 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
6460 op.alloc_hint.expected_write_size,
6461 op.alloc_hint.flags);
6462 result = 0;
6463 }
6464 break;
6465
6466
6467 // --- WRITES ---
6468
6469 // -- object data --
6470
6471 case CEPH_OSD_OP_WRITE:
6472 ++ctx->num_write;
6473 { // write
6474 __u32 seq = oi.truncate_seq;
6475 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6476 if (op.extent.length != osd_op.indata.length()) {
6477 result = -EINVAL;
6478 break;
6479 }
6480
6481 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6482 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6483
6484 if (pool.info.requires_aligned_append() &&
6485 (op.extent.offset % pool.info.required_alignment() != 0)) {
6486 result = -EOPNOTSUPP;
6487 break;
6488 }
6489
6490 if (!obs.exists) {
6491 if (pool.info.requires_aligned_append() && op.extent.offset) {
6492 result = -EOPNOTSUPP;
6493 break;
6494 }
6495 } else if (op.extent.offset != oi.size &&
6496 pool.info.requires_aligned_append()) {
6497 result = -EOPNOTSUPP;
6498 break;
6499 }
6500
6501 if (seq && (seq > op.extent.truncate_seq) &&
6502 (op.extent.offset + op.extent.length > oi.size)) {
6503 // old write, arrived after trimtrunc
6504 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
6505 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
6506 << ", adjusting write length to " << op.extent.length << dendl;
6507 bufferlist t;
6508 t.substr_of(osd_op.indata, 0, op.extent.length);
6509 osd_op.indata.swap(t);
6510 }
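      // Illustrative example: object size 4096 with truncate_seq 2; a
      // straggler write of 0~8192 carrying truncate_seq 1 arrives. Only
      // the first 4096 bytes are still meaningful, so both the length and
      // the indata buffer are trimmed to match.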
6511 if (op.extent.truncate_seq > seq) {
6512 // write arrives before trimtrunc
6513 if (obs.exists && !oi.is_whiteout()) {
6514 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6515 << ", truncating to " << op.extent.truncate_size << dendl;
6516 t->truncate(soid, op.extent.truncate_size);
6517 oi.truncate_seq = op.extent.truncate_seq;
6518 oi.truncate_size = op.extent.truncate_size;
6519 if (oi.size > op.extent.truncate_size) {
6520 interval_set<uint64_t> trim;
6521 trim.insert(op.extent.truncate_size,
6522 oi.size - op.extent.truncate_size);
6523 ctx->modified_ranges.union_of(trim);
6524 }
6525 if (op.extent.truncate_size != oi.size) {
6526 truncate_update_size_and_usage(ctx->delta_stats,
6527 oi,
6528 op.extent.truncate_size);
6529 }
6530 } else {
6531 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
6532 << ", but object is new" << dendl;
6533 oi.truncate_seq = op.extent.truncate_seq;
6534 oi.truncate_size = op.extent.truncate_size;
6535 }
6536 }
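      // (When the write carries a newer truncate_seq than the object, the
      // pending truncate is applied above before the write proceeds below.)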
6537 result = check_offset_and_length(
6538 op.extent.offset, op.extent.length,
6539 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6540 if (result < 0)
6541 break;
6542
6543 maybe_create_new_object(ctx);
6544
6545 if (op.extent.length == 0) {
6546 if (op.extent.offset > oi.size) {
6547 t->truncate(
6548 soid, op.extent.offset);
6549 } else {
6550 t->nop(soid);
6551 }
6552 } else {
6553 t->write(
6554 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
6555 }
6556
6557 if (op.extent.offset == 0 && op.extent.length >= oi.size
6558 && !skip_data_digest) {
6559 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6560 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
6561 if (skip_data_digest) {
6562 obs.oi.clear_data_digest();
6563 } else {
6564 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
6565 }
6566 } else {
6567 obs.oi.clear_data_digest();
6568 }
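      // i.e. a full overwrite recomputes the digest from scratch, an exact
      // append extends it incrementally (crc32c of the new data seeded with
      // the old digest), and any other partial overwrite invalidates it.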
6569 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6570 op.extent.offset, op.extent.length);
6571
6572 }
6573 break;
6574
6575 case CEPH_OSD_OP_WRITEFULL:
6576 ++ctx->num_write;
6577 { // write full object
6578 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
6579
6580 if (op.extent.length != osd_op.indata.length()) {
6581 result = -EINVAL;
6582 break;
6583 }
6584 result = check_offset_and_length(
6585 0, op.extent.length,
6586 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6587 if (result < 0)
6588 break;
6589
6590 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
6591 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
6592
6593 maybe_create_new_object(ctx);
6594 if (pool.info.is_erasure()) {
6595 t->truncate(soid, 0);
6596 } else if (obs.exists && op.extent.length < oi.size) {
6597 t->truncate(soid, op.extent.length);
6598 }
6599 if (op.extent.length) {
6600 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
6601 }
6602 if (!skip_data_digest) {
6603 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
6604 } else {
6605 obs.oi.clear_data_digest();
6606 }
6607
6608 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
6609 0, op.extent.length, true);
6610 }
6611 break;
6612
6613 case CEPH_OSD_OP_WRITESAME:
6614 ++ctx->num_write;
6615 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
6616 result = do_writesame(ctx, osd_op);
6617 break;
6618
6619 case CEPH_OSD_OP_ROLLBACK :
6620 ++ctx->num_write;
6621 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
6622 result = _rollback_to(ctx, op);
6623 break;
6624
6625 case CEPH_OSD_OP_ZERO:
6626 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
6627 if (pool.info.requires_aligned_append()) {
6628 result = -EOPNOTSUPP;
6629 break;
6630 }
6631 ++ctx->num_write;
6632 { // zero
6633 result = check_offset_and_length(
6634 op.extent.offset, op.extent.length,
6635 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6636 if (result < 0)
6637 break;
6638
6639 ceph_assert(op.extent.length);
6640 if (obs.exists && !oi.is_whiteout()) {
6641 t->zero(soid, op.extent.offset, op.extent.length);
6642 interval_set<uint64_t> ch;
6643 ch.insert(op.extent.offset, op.extent.length);
6644 ctx->modified_ranges.union_of(ch);
6645 ctx->delta_stats.num_wr++;
6646 oi.clear_data_digest();
6647 } else {
6648 // no-op
6649 }
6650 }
6651 break;
6652 case CEPH_OSD_OP_CREATE:
6653 ++ctx->num_write;
6654 {
6655 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
6656 int flags = le32_to_cpu(op.flags);
6657 if (obs.exists && !oi.is_whiteout() &&
6658 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
6659 result = -EEXIST; /* this is an exclusive create */
6660 } else {
6661 if (osd_op.indata.length()) {
6662 auto p = osd_op.indata.cbegin();
6663 string category;
6664 try {
6665 decode(category, p);
6666 }
6667 catch (buffer::error& e) {
6668 result = -EINVAL;
6669 goto fail;
6670 }
6671 // category is no longer implemented.
6672 }
6673 if (result >= 0) {
6674 maybe_create_new_object(ctx);
6675 t->nop(soid);
6676 }
6677 }
6678 }
6679 break;
6680
6681 case CEPH_OSD_OP_TRIMTRUNC:
6682 op.extent.offset = op.extent.truncate_size;
6683 // fall through
6684
6685 case CEPH_OSD_OP_TRUNCATE:
6686 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6687 if (pool.info.requires_aligned_append()) {
6688 result = -EOPNOTSUPP;
6689 break;
6690 }
6691 ++ctx->num_write;
6692 {
6693 // truncate
6694 if (!obs.exists || oi.is_whiteout()) {
6695 dout(10) << " object dne, truncate is a no-op" << dendl;
6696 break;
6697 }
6698
6699 result = check_offset_and_length(
6700 op.extent.offset, op.extent.length,
6701 static_cast<Option::size_t>(osd->osd_max_object_size), get_dpp());
6702 if (result < 0)
6703 break;
6704
6705 if (op.extent.truncate_seq) {
6706 ceph_assert(op.extent.offset == op.extent.truncate_size);
6707 if (op.extent.truncate_seq <= oi.truncate_seq) {
6708 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6709 << ", no-op" << dendl;
6710 break; // old
6711 }
6712 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6713 << ", truncating" << dendl;
6714 oi.truncate_seq = op.extent.truncate_seq;
6715 oi.truncate_size = op.extent.truncate_size;
6716 }
6717
6718 maybe_create_new_object(ctx);
6719 t->truncate(soid, op.extent.offset);
6720 if (oi.size > op.extent.offset) {
6721 interval_set<uint64_t> trim;
6722 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6723 ctx->modified_ranges.union_of(trim);
6724 }
6725 if (op.extent.offset != oi.size) {
6726 truncate_update_size_and_usage(ctx->delta_stats,
6727 oi,
6728 op.extent.offset);
6729 }
6730 ctx->delta_stats.num_wr++;
6731 // do not set exists, or we will break the above ZERO -> TRUNCATE munging.
6732
6733 oi.clear_data_digest();
6734 }
6735 break;
6736
6737 case CEPH_OSD_OP_DELETE:
6738 ++ctx->num_write;
6739 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6740 {
6741 if (oi.has_manifest()) {
6742 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE) && oi.manifest.is_redirect()) {
6743 ctx->register_on_commit(
6744 [oi, ctx, this](){
6745 object_locator_t target_oloc(oi.manifest.redirect_target);
6746 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
6747 SnapContext(), false, NULL, 0);
6748 });
6749 } else if (oi.manifest.is_chunked()) {
6750 ctx->register_on_commit(
6751 [oi, ctx, this](){
6752 for (auto p : oi.manifest.chunk_map) {
6753 if (p.second.has_reference()) {
6754 object_locator_t target_oloc(p.second.oid);
6755 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
6756 SnapContext(), false, NULL, p.first);
6757 }
6758 }
6759 });
6760 }
6761 }
6762 result = _delete_oid(ctx, false, ctx->ignore_cache);
6763 }
6764 break;
6765
6766 case CEPH_OSD_OP_WATCH:
6767 ++ctx->num_write;
6768 {
6769 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6770 op.watch.cookie, op.watch.op);
6771 if (!obs.exists) {
6772 result = -ENOENT;
6773 break;
6774 }
6775 uint64_t cookie = op.watch.cookie;
6776 entity_name_t entity = ctx->reqid.name;
6777 ObjectContextRef obc = ctx->obc;
6778
6779 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6780 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6781 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6782 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6783 dout(10) << "watch: peer_addr="
6784 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6785
6786 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6787 if (op.watch.timeout != 0) {
6788 timeout = op.watch.timeout;
6789 }
6790
6791 watch_info_t w(cookie, timeout,
6792 ctx->op->get_req()->get_connection()->get_peer_addr());
6793 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6794 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6795 if (oi.watchers.count(make_pair(cookie, entity))) {
6796 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6797 } else {
6798 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6799 oi.watchers[make_pair(cookie, entity)] = w;
6800 t->nop(soid); // make sure we update the object_info on disk!
6801 }
6802 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6803 ctx->watch_connects.push_back(make_pair(w, will_ping));
6804 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6805 if (!oi.watchers.count(make_pair(cookie, entity))) {
6806 result = -ENOTCONN;
6807 break;
6808 }
6809 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6810 ctx->watch_connects.push_back(make_pair(w, true));
6811 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6812 /* Note: WATCH with PING doesn't cause may_write() to return true,
6813 * so if there is nothing else in the transaction, this is going
6814 * to run do_osd_op_effects, but not write out a log entry */
6815 if (!oi.watchers.count(make_pair(cookie, entity))) {
6816 result = -ENOTCONN;
6817 break;
6818 }
6819 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6820 obc->watchers.find(make_pair(cookie, entity));
6821 if (p == obc->watchers.end() ||
6822 !p->second->is_connected()) {
6823 // client needs to reconnect
6824 result = -ETIMEDOUT;
6825 break;
6826 }
6827 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6828 p->second->got_ping(ceph_clock_now());
6829 result = 0;
6830 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6831 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6832 oi.watchers.find(make_pair(cookie, entity));
6833 if (oi_iter != oi.watchers.end()) {
6834 dout(10) << " removed watch " << oi_iter->second << " by "
6835 << entity << dendl;
6836 oi.watchers.erase(oi_iter);
6837 t->nop(soid); // update oi on disk
6838 ctx->watch_disconnects.push_back(
6839 watch_disconnect_t(cookie, entity, false));
6840 } else {
6841 dout(10) << " can't remove: no watch by " << entity << dendl;
6842 }
6843 }
6844 }
6845 break;
6846
6847 case CEPH_OSD_OP_CACHE_PIN:
6848 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6849 if ((!pool.info.is_tier() ||
6850 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6851 result = -EINVAL;
6852 dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
6853 break;
6854 }
6855 ++ctx->num_write;
6856 {
6857 if (!obs.exists || oi.is_whiteout()) {
6858 result = -ENOENT;
6859 break;
6860 }
6861
6862 if (!oi.is_cache_pinned()) {
6863 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6864 ctx->modify = true;
6865 ctx->delta_stats.num_objects_pinned++;
6866 ctx->delta_stats.num_wr++;
6867 }
6868 result = 0;
6869 }
6870 break;
6871
6872 case CEPH_OSD_OP_CACHE_UNPIN:
6873 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6874 if ((!pool.info.is_tier() ||
6875 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6876 result = -EINVAL;
6877 dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
6878 break;
6879 }
6880 ++ctx->num_write;
6881 {
6882 if (!obs.exists || oi.is_whiteout()) {
6883 result = -ENOENT;
6884 break;
6885 }
6886
6887 if (oi.is_cache_pinned()) {
6888 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6889 ctx->modify = true;
6890 ctx->delta_stats.num_objects_pinned--;
6891 ctx->delta_stats.num_wr++;
6892 }
6893 result = 0;
6894 }
6895 break;
6896
6897 case CEPH_OSD_OP_SET_REDIRECT:
6898 ++ctx->num_write;
6899 {
6900 if (pool.info.is_tier()) {
6901 result = -EINVAL;
6902 break;
6903 }
6904 if (!obs.exists) {
6905 result = -ENOENT;
6906 break;
6907 }
6908 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6909 result = -EOPNOTSUPP;
6910 break;
6911 }
6912
6913 object_t target_name;
6914 object_locator_t target_oloc;
6915 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6916 version_t target_version = op.copy_from.src_version;
6917 try {
6918 decode(target_name, bp);
6919 decode(target_oloc, bp);
6920 }
6921 catch (buffer::error& e) {
6922 result = -EINVAL;
6923 goto fail;
6924 }
6925 pg_t raw_pg;
6926 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6927 hobject_t target(target_name, target_oloc.key, target_snapid,
6928 raw_pg.ps(), raw_pg.pool(),
6929 target_oloc.nspace);
6930 if (target == soid) {
6931 dout(20) << " set-redirect to self is invalid" << dendl;
6932 result = -EINVAL;
6933 break;
6934 }
6935
6936 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
6937 bool has_reference = (oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6938 if (has_reference) {
6939 result = -EINVAL;
6940 dout(5) << " the object is already a manifest " << dendl;
6941 break;
6942 }
6943 if (op_finisher == nullptr && need_reference) {
6944 // start
6945 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6946 new SetManifestFinisher(osd_op));
6947 RefCountCallback *fin = new RefCountCallback(
6948 this, ctx, osd_op, get_last_peering_reset());
6949 refcount_manifest(ctx->obc, target_oloc, target, SnapContext(),
6950 true, fin, 0);
6951 result = -EINPROGRESS;
6952 } else {
6953 // finish
6954 if (op_finisher) {
6955 result = op_finisher->execute();
6956 ceph_assert(result == 0);
6957 }
6958
6959 if (!oi.has_manifest() && !oi.manifest.is_redirect())
6960 ctx->delta_stats.num_objects_manifest++;
6961
6962 oi.set_flag(object_info_t::FLAG_MANIFEST);
6963 oi.manifest.redirect_target = target;
6964 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6965 t->truncate(soid, 0);
6966 if (oi.is_omap() && pool.info.supports_omap()) {
6967 t->omap_clear(soid);
6968 obs.oi.clear_omap_digest();
6969 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6970 }
6971 ctx->delta_stats.num_bytes -= oi.size;
6972 oi.size = 0;
6973 oi.new_object();
6974 oi.user_version = target_version;
6975 ctx->user_at_version = target_version;
6976 /* rm_attrs */
6977 map<string,bufferlist> rmattrs;
6978 result = getattrs_maybe_cache(ctx->obc, &rmattrs);
6979 if (result < 0) {
6980 return result;
6981 }
6982 map<string, bufferlist>::iterator iter;
6983 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6984 const string& name = iter->first;
6985 t->rmattr(soid, name);
6986 }
6987 if (!has_reference && need_reference) {
6988 oi.set_flag(object_info_t::FLAG_REDIRECT_HAS_REFERENCE);
6989 }
6990 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6991 if (op_finisher) {
6992 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6993 }
6994 }
6995 }
6996
6997 break;
6998
6999 case CEPH_OSD_OP_SET_CHUNK:
7000 ++ctx->num_write;
7001 {
7002 if (pool.info.is_tier()) {
7003 result = -EINVAL;
7004 break;
7005 }
7006 if (!obs.exists) {
7007 result = -ENOENT;
7008 break;
7009 }
7010 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7011 result = -EOPNOTSUPP;
7012 break;
7013 }
7014
7015 object_locator_t tgt_oloc;
7016 uint64_t src_offset, src_length, tgt_offset;
7017 object_t tgt_name;
7018 try {
7019 decode(src_offset, bp);
7020 decode(src_length, bp);
7021 decode(tgt_oloc, bp);
7022 decode(tgt_name, bp);
7023 decode(tgt_offset, bp);
7024 }
7025 catch (buffer::error& e) {
7026 result = -EINVAL;
7027 goto fail;
7028 }
7029
7030 if (!src_length) {
7031 result = -EINVAL;
7032 goto fail;
7033 }
7034
7035 for (auto &p : oi.manifest.chunk_map) {
7036 if ((p.first <= src_offset && p.first + p.second.length > src_offset) ||
7037 (p.first > src_offset && p.first <= src_offset + src_length)) {
7038 dout(20) << __func__ << " overlapping chunk; offset: " << src_offset << " length: " << src_length
7039 << " chunk_info: " << p << dendl;
7040 result = -EOPNOTSUPP;
7041 goto fail;
7042 }
7043 }
7044
7045 if (!oi.manifest.is_chunked()) {
7046 oi.manifest.clear();
7047 }
7048
7049 pg_t raw_pg;
7050 chunk_info_t chunk_info;
7051 get_osdmap()->object_locator_to_pg(tgt_name, tgt_oloc, raw_pg);
7052 hobject_t target(tgt_name, tgt_oloc.key, snapid_t(),
7053 raw_pg.ps(), raw_pg.pool(),
7054 tgt_oloc.nspace);
7055 bool need_reference = (osd_op.op.flags & CEPH_OSD_OP_FLAG_WITH_REFERENCE);
7056 bool has_reference = (oi.manifest.chunk_map.find(src_offset) != oi.manifest.chunk_map.end()) &&
7057 (oi.manifest.chunk_map[src_offset].flags & chunk_info_t::FLAG_HAS_REFERENCE);
7058 if (has_reference) {
7059 result = -EINVAL;
7060 dout(5) << " the object is already a manifest " << dendl;
7061 break;
7062 }
7063 if (op_finisher == nullptr && need_reference) {
7064 // start
7065 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7066 new SetManifestFinisher(osd_op));
7067 RefCountCallback *fin = new RefCountCallback(
7068 this, ctx, osd_op, get_last_peering_reset());
7069 refcount_manifest(ctx->obc, tgt_oloc, target, SnapContext(),
7070 true, fin, src_offset);
7071 result = -EINPROGRESS;
7072 } else {
7073 if (op_finisher) {
7074 result = op_finisher->execute();
7075 ceph_assert(result == 0);
7076 }
7077
7078 chunk_info_t chunk_info;
7079 chunk_info.set_flag(chunk_info_t::FLAG_MISSING);
7080 chunk_info.oid = target;
7081 chunk_info.offset = tgt_offset;
7082 chunk_info.length= src_length;
7083 oi.manifest.chunk_map[src_offset] = chunk_info;
7084 if (!oi.has_manifest() && !oi.manifest.is_chunked())
7085 ctx->delta_stats.num_objects_manifest++;
7086 oi.set_flag(object_info_t::FLAG_MANIFEST);
7087 oi.manifest.type = object_manifest_t::TYPE_CHUNKED;
7088 if (!has_reference && need_reference) {
7089 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_REFERENCE);
7090 }
7091 if (need_reference && pool.info.get_fingerprint_type() != pg_pool_t::TYPE_FINGERPRINT_NONE) {
7092 oi.manifest.chunk_map[src_offset].set_flag(chunk_info_t::FLAG_HAS_FINGERPRINT);
7093 }
7094 ctx->modify = true;
7095
7096 dout(10) << "set-chunked oid:" << oi.soid << " user_version: " << oi.user_version
7097 << " chunk_info: " << chunk_info << dendl;
7098 if (op_finisher) {
7099 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7100 }
7101 }
7102 }
7103
7104 break;
7105
7106 case CEPH_OSD_OP_TIER_PROMOTE:
7107 ++ctx->num_write;
7108 {
7109 if (pool.info.is_tier()) {
7110 result = -EINVAL;
7111 break;
7112 }
7113 if (!obs.exists) {
7114 result = -ENOENT;
7115 break;
7116 }
7117 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7118 result = -EOPNOTSUPP;
7119 break;
7120 }
7121 if (!obs.oi.has_manifest()) {
7122 result = 0;
7123 break;
7124 }
7125
7126 if (op_finisher == nullptr) {
7127 PromoteManifestCallback *cb;
7128 object_locator_t my_oloc;
7129 hobject_t src_hoid;
7130
7131 if (obs.oi.manifest.is_chunked()) {
7132 src_hoid = obs.oi.soid;
7133 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7134 } else if (obs.oi.manifest.is_redirect()) {
7135 object_locator_t src_oloc(obs.oi.manifest.redirect_target);
7136 my_oloc = src_oloc;
7137 src_hoid = obs.oi.manifest.redirect_target;
7138 cb = new PromoteManifestCallback(ctx->obc, this, ctx);
7139 } else {
7140 ceph_abort_msg("unrecognized manifest type");
7141 }
7142 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7143 new PromoteFinisher(cb));
7144 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
7145 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
7146 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
7147 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
7148 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
7149 start_copy(cb, ctx->obc, src_hoid, my_oloc, 0, flags,
7150 obs.oi.soid.snap == CEPH_NOSNAP,
7151 src_fadvise_flags, 0);
7152
7153 dout(10) << "tier-promote oid:" << oi.soid << " manifest: " << obs.oi.manifest << dendl;
7154 result = -EINPROGRESS;
7155 } else {
7156 result = op_finisher->execute();
7157 ceph_assert(result == 0);
7158 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7159 }
7160 }
7161
7162 break;
7163
7164 case CEPH_OSD_OP_UNSET_MANIFEST:
7165 ++ctx->num_write;
7166 {
7167 if (pool.info.is_tier()) {
7168 result = -EINVAL;
7169 break;
7170 }
7171 if (!obs.exists) {
7172 result = -ENOENT;
7173 break;
7174 }
7175 if (!oi.has_manifest()) {
7176 result = -EOPNOTSUPP;
7177 break;
7178 }
7179 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7180 result = -EOPNOTSUPP;
7181 break;
7182 }
7183
7184 if (oi.manifest.is_redirect()) {
7185 if ((oi.flags & object_info_t::FLAG_REDIRECT_HAS_REFERENCE)) {
7186 ctx->register_on_commit(
7187 [oi, ctx, this](){
7188 object_locator_t target_oloc(oi.manifest.redirect_target);
7189 refcount_manifest(ctx->obc, target_oloc, oi.manifest.redirect_target,
7190 SnapContext(), false, NULL, 0);
7191 });
7192 }
7193 } else if (oi.manifest.is_chunked()) {
7194 ctx->register_on_commit(
7195 [oi, ctx, this](){
7196 for (auto p : oi.manifest.chunk_map) {
7197 if (p.second.flags & chunk_info_t::FLAG_HAS_REFERENCE) {
7198 object_locator_t target_oloc(p.second.oid);
7199 refcount_manifest(ctx->obc, target_oloc, p.second.oid,
7200 SnapContext(), false, NULL, p.first);
7201 }
7202 }
7203 });
7204 } else {
7205 ceph_abort_msg("unrecognized manifest type");
7206 }
7207
7208 oi.clear_flag(object_info_t::FLAG_MANIFEST);
7209 oi.manifest = object_manifest_t();
7210 ctx->delta_stats.num_objects_manifest--;
7211 ctx->delta_stats.num_wr++;
7212 ctx->modify = true;
7213 }
7214
7215 break;
7216
7217 // -- object attrs --
7218
7219 case CEPH_OSD_OP_SETXATTR:
7220 ++ctx->num_write;
7221 {
7222 if (cct->_conf->osd_max_attr_size > 0 &&
7223 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
7224 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
7225 result = -EFBIG;
7226 break;
7227 }
7228 unsigned max_name_len =
7229 std::min<uint64_t>(osd->store->get_max_attr_name_length(),
7230 cct->_conf->osd_max_attr_name_len);
7231 if (op.xattr.name_len > max_name_len) {
7232 result = -ENAMETOOLONG;
7233 break;
7234 }
7235 maybe_create_new_object(ctx);
7236 string aname;
7237 bp.copy(op.xattr.name_len, aname);
7238 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7239 string name = "_" + aname;
7240 bufferlist bl;
7241 bp.copy(op.xattr.value_len, bl);
7242 t->setattr(soid, name, bl);
7243 ctx->delta_stats.num_wr++;
7244 }
7245 break;
7246
7247 case CEPH_OSD_OP_RMXATTR:
7248 ++ctx->num_write;
7249 {
7250 string aname;
7251 bp.copy(op.xattr.name_len, aname);
7252 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
7253 if (!obs.exists || oi.is_whiteout()) {
7254 result = -ENOENT;
7255 break;
7256 }
7257 string name = "_" + aname;
7258 t->rmattr(soid, name);
7259 ctx->delta_stats.num_wr++;
7260 }
7261 break;
7262
7263
7264 // -- fancy writers --
7265 case CEPH_OSD_OP_APPEND:
7266 {
7267 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
7268 // just do it inline; this works because we are happy to execute
7269 // fancy ops on replicas as well.
7270 vector<OSDOp> nops(1);
7271 OSDOp& newop = nops[0];
7272 newop.op.op = CEPH_OSD_OP_WRITE;
7273 newop.op.extent.offset = oi.size;
7274 newop.op.extent.length = op.extent.length;
7275 newop.op.extent.truncate_seq = oi.truncate_seq;
7276 newop.indata = osd_op.indata;
7277 result = do_osd_ops(ctx, nops);
7278 osd_op.outdata.claim(newop.outdata);
7279 }
7280 break;
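// APPEND is implemented by rewriting the op as a plain WRITE at offset
// oi.size with the object's current truncate_seq, so size growth and
// modified-range accounting are shared with the ordinary write path
// rather than duplicated here.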
7281
7282 case CEPH_OSD_OP_STARTSYNC:
7283 t->nop(soid);
7284 break;
7285
7286 // -- trivial map --
7287 case CEPH_OSD_OP_TMAPGET:
7288 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
7289 if (pool.info.is_erasure()) {
7290 result = -EOPNOTSUPP;
7291 break;
7292 }
7293 {
7294 vector<OSDOp> nops(1);
7295 OSDOp& newop = nops[0];
7296 newop.op.op = CEPH_OSD_OP_SYNC_READ;
7297 newop.op.extent.offset = 0;
7298 newop.op.extent.length = 0;
7299 do_osd_ops(ctx, nops);
7300 osd_op.outdata.claim(newop.outdata);
7301 }
7302 break;
7303
7304 case CEPH_OSD_OP_TMAPPUT:
7305 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
7306 if (pool.info.is_erasure()) {
7307 result = -EOPNOTSUPP;
7308 break;
7309 }
7310 {
7311 //_dout_lock.Lock();
7312 //osd_op.data.hexdump(*_dout);
7313 //_dout_lock.Unlock();
7314
7315 // verify sort order
7316 bool unsorted = false;
7317 if (true) {
7318 bufferlist header;
7319 decode(header, bp);
7320 uint32_t n;
7321 decode(n, bp);
7322 string last_key;
7323 while (n--) {
7324 string key;
7325 decode(key, bp);
7326 dout(10) << "tmapput key " << key << dendl;
7327 bufferlist val;
7328 decode(val, bp);
7329 if (key < last_key) {
7330 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
7331 unsorted = true;
7332 break;
7333 }
7334 last_key = key;
7335 }
7336 }
7337
7338 // write it
7339 vector<OSDOp> nops(1);
7340 OSDOp& newop = nops[0];
7341 newop.op.op = CEPH_OSD_OP_WRITEFULL;
7342 newop.op.extent.offset = 0;
7343 newop.op.extent.length = osd_op.indata.length();
7344 newop.indata = osd_op.indata;
7345
7346 if (unsorted) {
7347 bp = osd_op.indata.begin();
7348 bufferlist header;
7349 map<string, bufferlist> m;
7350 decode(header, bp);
7351 decode(m, bp);
7352 ceph_assert(bp.end());
7353 bufferlist newbl;
7354 encode(header, newbl);
7355 encode(m, newbl);
7356 newop.indata = newbl;
7357 }
7358 result = do_osd_ops(ctx, nops);
7359 ceph_assert(result == 0);
7360 }
7361 break;
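// The TMAP payload, as implied by the decode calls above, is a single
// buffer holding: header (bufferlist), u32 n, then n key/value pairs --
// exactly the encoding of a bufferlist followed by a
// map<string,bufferlist>. A minimal sketch of building a well-ordered
// payload client-side (illustrative only, not a helper in this file):
//
//   bufferlist header, payload;
//   std::map<std::string, bufferlist> kv;  // std::map iterates in key order
//   encode(header, payload);
//   encode(kv, payload);                   // u32 size + sorted pairs
//
// This is why the resort branch above can simply decode into a map and
// re-encode: the map's sorted iteration order restores the invariant.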
7362
7363 case CEPH_OSD_OP_TMAPUP:
7364 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
7365 if (pool.info.is_erasure()) {
7366 result = -EOPNOTSUPP;
7367 break;
7368 }
7369 ++ctx->num_write;
7370 result = do_tmapup(ctx, bp, osd_op);
7371 break;
7372
7373 case CEPH_OSD_OP_TMAP2OMAP:
7374 ++ctx->num_write;
7375 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
7376 result = do_tmap2omap(ctx, op.tmap2omap.flags);
7377 break;
7378
7379 // OMAP Read ops
7380 case CEPH_OSD_OP_OMAPGETKEYS:
7381 ++ctx->num_read;
7382 {
7383 string start_after;
7384 uint64_t max_return;
7385 try {
7386 decode(start_after, bp);
7387 decode(max_return, bp);
7388 }
7389 catch (buffer::error& e) {
7390 result = -EINVAL;
7391 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
7392 goto fail;
7393 }
7394 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7395 max_return = cct->_conf->osd_max_omap_entries_per_request;
7396 }
7397 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
7398
7399 bufferlist bl;
7400 uint32_t num = 0;
7401 bool truncated = false;
7402 if (oi.is_omap()) {
7403 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7404 ch, ghobject_t(soid)
7405 );
7406 ceph_assert(iter);
7407 iter->upper_bound(start_after);
7408 for (num = 0; iter->valid(); ++num, iter->next()) {
7409 if (num >= max_return ||
7410 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7411 truncated = true;
7412 break;
7413 }
7414 encode(iter->key(), bl);
7415 }
7416 } // else return empty out_set
7417 encode(num, osd_op.outdata);
7418 osd_op.outdata.claim_append(bl);
7419 encode(truncated, osd_op.outdata);
7420 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7421 ctx->delta_stats.num_rd++;
7422 }
7423 break;
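// Reply layout assembled above: u32 num, num encoded keys, then a bool
// truncated flag. Results are clamped to osd_max_omap_entries_per_request
// entries and osd_max_omap_bytes_per_request bytes, so a caller must
// loop, feeding the last key it saw back in as start_after until
// truncated comes back false. A hedged decode sketch (names illustrative):
//
//   auto p = reply_bl.cbegin();
//   uint32_t num;
//   decode(num, p);
//   for (uint32_t i = 0; i < num; ++i) {
//     std::string key;
//     decode(key, p);     // keys arrive in omap (lexicographic) order
//   }
//   bool truncated;
//   decode(truncated, p);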
7424
7425 case CEPH_OSD_OP_OMAPGETVALS:
7426 ++ctx->num_read;
7427 {
7428 string start_after;
7429 uint64_t max_return;
7430 string filter_prefix;
7431 try {
7432 decode(start_after, bp);
7433 decode(max_return, bp);
7434 decode(filter_prefix, bp);
7435 }
7436 catch (buffer::error& e) {
7437 result = -EINVAL;
7438 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
7439 goto fail;
7440 }
7441 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
7442 max_return = cct->_conf->osd_max_omap_entries_per_request;
7443 }
7444 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
7445
7446 uint32_t num = 0;
7447 bool truncated = false;
7448 bufferlist bl;
7449 if (oi.is_omap()) {
7450 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
7451 ch, ghobject_t(soid)
7452 );
7453 if (!iter) {
7454 result = -ENOENT;
7455 goto fail;
7456 }
7457 iter->upper_bound(start_after);
7458 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
7459 for (num = 0;
7460 iter->valid() &&
7461 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
7462 ++num, iter->next()) {
7463 dout(20) << "Found key " << iter->key() << dendl;
7464 if (num >= max_return ||
7465 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
7466 truncated = true;
7467 break;
7468 }
7469 encode(iter->key(), bl);
7470 encode(iter->value(), bl);
7471 }
7472 } // else return empty out_set
7473 encode(num, osd_op.outdata);
7474 osd_op.outdata.claim_append(bl);
7475 encode(truncated, osd_op.outdata);
7476 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7477 ctx->delta_stats.num_rd++;
7478 }
7479 break;
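// Iterator positioning above: upper_bound(start_after) resumes strictly
// after the last key the client saw, and when filter_prefix sorts after
// start_after, lower_bound(filter_prefix) jumps directly to the first key
// that could carry the prefix -- anything earlier cannot start with it.
// The scan then terminates at the first key whose leading bytes differ
// from filter_prefix, which is safe because omap keys are ordered
// lexicographically, so matching keys form one contiguous run.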
7480
7481 case CEPH_OSD_OP_OMAPGETHEADER:
7482 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
7483 if (!oi.is_omap()) {
7484 // return empty header
7485 break;
7486 }
7487 ++ctx->num_read;
7488 {
7489 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
7490 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7491 ctx->delta_stats.num_rd++;
7492 }
7493 break;
7494
7495 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
7496 ++ctx->num_read;
7497 {
7498 set<string> keys_to_get;
7499 try {
7500 decode(keys_to_get, bp);
7501 }
7502 catch (buffer::error& e) {
7503 result = -EINVAL;
7504 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
7505 goto fail;
7506 }
7507 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
7508 map<string, bufferlist> out;
7509 if (oi.is_omap()) {
7510 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
7511 } // else return empty omap entries
7512 encode(out, osd_op.outdata);
7513 ctx->delta_stats.num_rd_kb += shift_round_up(osd_op.outdata.length(), 10);
7514 ctx->delta_stats.num_rd++;
7515 }
7516 break;
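// The reply here is just an encoded map<string,bufferlist>; keys that are
// not present in the object's omap are (as far as omap_get_values() is
// concerned) simply absent from the map rather than reported as errors or
// empty values, so callers must diff the reply against their request.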
7517
7518 case CEPH_OSD_OP_OMAP_CMP:
7519 ++ctx->num_read;
7520 {
7521 if (!obs.exists || oi.is_whiteout()) {
7522 result = -ENOENT;
7523 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7524 break;
7525 }
7526 map<string, pair<bufferlist, int> > assertions;
7527 try {
7528 decode(assertions, bp);
7529 }
7530 catch (buffer::error& e) {
7531 result = -EINVAL;
7532 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
7533 goto fail;
7534 }
7535 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
7536
7537 map<string, bufferlist> out;
7538
7539 if (oi.is_omap()) {
7540 set<string> to_get;
7541 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7542 i != assertions.end();
7543 ++i)
7544 to_get.insert(i->first);
7545 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
7546 to_get, &out);
7547 if (r < 0) {
7548 result = r;
7549 break;
7550 }
7551 } // else leave out empty
7552
7553 // should set num_rd_kb based on the encoded length of the map
7554 ctx->delta_stats.num_rd++;
7555
7556 int r = 0;
7557 bufferlist empty;
7558 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
7559 i != assertions.end();
7560 ++i) {
7561 auto out_entry = out.find(i->first);
7562 bufferlist &bl = (out_entry != out.end()) ?
7563 out_entry->second : empty;
7564 switch (i->second.second) {
7565 case CEPH_OSD_CMPXATTR_OP_EQ:
7566 if (!(bl == i->second.first)) {
7567 r = -ECANCELED;
7568 }
7569 break;
7570 case CEPH_OSD_CMPXATTR_OP_LT:
7571 if (!(bl < i->second.first)) {
7572 r = -ECANCELED;
7573 }
7574 break;
7575 case CEPH_OSD_CMPXATTR_OP_GT:
7576 if (!(bl > i->second.first)) {
7577 r = -ECANCELED;
7578 }
7579 break;
7580 default:
7581 r = -EINVAL;
7582 break;
7583 }
7584 if (r < 0)
7585 break;
7586 }
7587 if (r < 0) {
7588 result = r;
7589 }
7590 }
7591 break;
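// OMAP_CMP input, per the decode above: map<key, pair<expected-value,
// comparison-op>> where the op is one of CEPH_OSD_CMPXATTR_OP_{EQ,LT,GT}.
// A key missing from the object compares as an empty bufferlist, and the
// first failing assertion fails the whole op with -ECANCELED. A sketch of
// building the request payload (client-side, illustrative):
//
//   std::map<std::string, std::pair<bufferlist, int>> assertions;
//   bufferlist expected;
//   expected.append("v1");
//   assertions["mykey"] = std::make_pair(expected, CEPH_OSD_CMPXATTR_OP_EQ);
//   bufferlist indata;
//   encode(assertions, indata);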
7592
7593 // OMAP Write ops
7594 case CEPH_OSD_OP_OMAPSETVALS:
7595 if (!pool.info.supports_omap()) {
7596 result = -EOPNOTSUPP;
7597 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7598 break;
7599 }
7600 ++ctx->num_write;
7601 {
7602 maybe_create_new_object(ctx);
7603 bufferlist to_set_bl;
7604 try {
7605 decode_str_str_map_to_bl(bp, &to_set_bl);
7606 }
7607 catch (buffer::error& e) {
7608 result = -EINVAL;
7609 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7610 goto fail;
7611 }
7612 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
7613 if (cct->_conf->subsys.should_gather<dout_subsys, 20>()) {
7614 dout(20) << "setting vals: " << dendl;
7615 map<string,bufferlist> to_set;
7616 bufferlist::const_iterator pt = to_set_bl.begin();
7617 decode(to_set, pt);
7618 for (map<string, bufferlist>::iterator i = to_set.begin();
7619 i != to_set.end();
7620 ++i) {
7621 dout(20) << "\t" << i->first << dendl;
7622 }
7623 }
7624 t->omap_setkeys(soid, to_set_bl);
7625 ctx->delta_stats.num_wr++;
7626 ctx->delta_stats.num_wr_kb += shift_round_up(to_set_bl.length(), 10);
7627 }
7628 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7629 obs.oi.clear_omap_digest();
7630 break;
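// decode_str_str_map_to_bl() above effectively re-extracts the raw
// encoded map bytes instead of decoding into a std::map, so the keys can
// be handed to the transaction as-is with no decode/re-encode round trip;
// the full decode only happens under debug level 20. Note the bookkeeping
// common to all omap writes here: FLAG_OMAP is set and the omap digest
// invalidated.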
7631
7632 case CEPH_OSD_OP_OMAPSETHEADER:
7633 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
7634 if (!pool.info.supports_omap()) {
7635 result = -EOPNOTSUPP;
7636 break;
7637 }
7638 ++ctx->num_write;
7639 {
7640 maybe_create_new_object(ctx);
7641 t->omap_setheader(soid, osd_op.indata);
7642 ctx->delta_stats.num_wr++;
7643 }
7644 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7645 obs.oi.clear_omap_digest();
7646 break;
7647
7648 case CEPH_OSD_OP_OMAPCLEAR:
7649 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
7650 if (!pool.info.supports_omap()) {
7651 result = -EOPNOTSUPP;
7652 break;
7653 }
7654 ++ctx->num_write;
7655 {
7656 if (!obs.exists || oi.is_whiteout()) {
7657 result = -ENOENT;
7658 break;
7659 }
7660 if (oi.is_omap()) {
7661 t->omap_clear(soid);
7662 ctx->delta_stats.num_wr++;
7663 obs.oi.clear_omap_digest();
7664 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7665 }
7666 }
7667 break;
7668
7669 case CEPH_OSD_OP_OMAPRMKEYS:
7670 if (!pool.info.supports_omap()) {
7671 result = -EOPNOTSUPP;
7672 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7673 break;
7674 }
7675 ++ctx->num_write;
7676 {
7677 if (!obs.exists || oi.is_whiteout()) {
7678 result = -ENOENT;
7679 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7680 break;
7681 }
7682 bufferlist to_rm_bl;
7683 try {
7684 decode_str_set_to_bl(bp, &to_rm_bl);
7685 }
7686 catch (buffer::error& e) {
7687 result = -EINVAL;
7688 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7689 goto fail;
7690 }
7691 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
7692 t->omap_rmkeys(soid, to_rm_bl);
7693 ctx->delta_stats.num_wr++;
7694 }
7695 obs.oi.clear_omap_digest();
7696 break;
7697
7698 case CEPH_OSD_OP_COPY_GET:
7699 ++ctx->num_read;
7700 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
7701 soid.snap.val);
7702 if (op_finisher == nullptr) {
7703 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
7704 } else {
7705 result = op_finisher->execute();
7706 }
7707 break;
7708
7709 case CEPH_OSD_OP_COPY_FROM:
7710 ++ctx->num_write;
7711 {
7712 object_t src_name;
7713 object_locator_t src_oloc;
7714 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
7715 version_t src_version = op.copy_from.src_version;
7716 try {
7717 decode(src_name, bp);
7718 decode(src_oloc, bp);
7719 }
7720 catch (buffer::error& e) {
7721 result = -EINVAL;
7722 tracepoint(osd,
7723 do_osd_op_pre_copy_from,
7724 soid.oid.name.c_str(),
7725 soid.snap.val,
7726 "???",
7727 0,
7728 "???",
7729 "???",
7730 0,
7731 src_snapid,
7732 src_version);
7733 goto fail;
7734 }
7735 tracepoint(osd,
7736 do_osd_op_pre_copy_from,
7737 soid.oid.name.c_str(),
7738 soid.snap.val,
7739 src_name.name.c_str(),
7740 src_oloc.pool,
7741 src_oloc.key.c_str(),
7742 src_oloc.nspace.c_str(),
7743 src_oloc.hash,
7744 src_snapid,
7745 src_version);
7746 if (op_finisher == nullptr) {
7747 // start
7748 pg_t raw_pg;
7749 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
7750 hobject_t src(src_name, src_oloc.key, src_snapid,
7751 raw_pg.ps(), raw_pg.pool(),
7752 src_oloc.nspace);
7753 if (src == soid) {
7754 dout(20) << " copy from self is invalid" << dendl;
7755 result = -EINVAL;
7756 break;
7757 }
7758 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
7759 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7760 new CopyFromFinisher(cb));
7761 start_copy(cb, ctx->obc, src, src_oloc, src_version,
7762 op.copy_from.flags,
7763 false,
7764 op.copy_from.src_fadvise_flags,
7765 op.flags);
7766 result = -EINPROGRESS;
7767 } else {
7768 // finish
7769 result = op_finisher->execute();
7770 ceph_assert(result == 0);
7771
7772 // COPY_FROM cannot be executed multiple times -- it must restart
7773 ctx->op_finishers.erase(ctx->current_osd_subop_num);
7774 }
7775 }
7776 break;
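// COPY_FROM shows the two-phase pattern used by the asynchronous ops in
// this switch: the first execution kicks off start_copy(), registers an
// OpFinisher under the current subop index, and returns -EINPROGRESS so
// the request is parked; when the copy completes the op is re-executed,
// finds the finisher, and runs execute() to apply the result. The
// finisher is then erased because, per the comment above, a replayed
// COPY_FROM must restart from scratch rather than re-run its finisher.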
7777
7778 default:
7779 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
7780 dout(1) << "unrecognized osd op " << op.op
7781 << " " << ceph_osd_op_name(op.op)
7782 << dendl;
7783 result = -EOPNOTSUPP;
7784 }
7785
7786 fail:
7787 osd_op.rval = result;
7788 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
7789 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK) &&
7790 result != -EAGAIN && result != -EINPROGRESS)
7791 result = 0;
7792
7793 if (result < 0)
7794 break;
7795 }
7796 return result;
7797 }
7798
7799 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
7800 {
7801 if (ctx->new_obs.oi.size == 0) {
7802 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
7803 return -ENODATA;
7804 }
7805 vector<OSDOp> nops(1);
7806 OSDOp &newop = nops[0];
7807 newop.op.op = CEPH_OSD_OP_TMAPGET;
7808 do_osd_ops(ctx, nops);
7809 try {
7810 bufferlist::const_iterator i = newop.outdata.begin();
7811 decode(*header, i);
7812 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
7813 } catch (...) {
7814 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
7815 << dendl;
7816 return -EINVAL;
7817 }
7818 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
7819 << dendl;
7820 return 0;
7821 }
7822
7823 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
7824 const SnapSet& ss)
7825 {
7826 // verify that all clones have been evicted
7827 dout(20) << __func__ << " verifying clones are absent "
7828 << ss << dendl;
7829 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
7830 p != ss.clones.end();
7831 ++p) {
7832 hobject_t clone_oid = soid;
7833 clone_oid.snap = *p;
7834 if (is_missing_object(clone_oid))
7835 return -EBUSY;
7836 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
7837 if (clone_obc && clone_obc->obs.exists) {
7838 dout(10) << __func__ << " cannot evict head before clone "
7839 << clone_oid << dendl;
7840 return -EBUSY;
7841 }
7842 if (copy_ops.count(clone_oid)) {
7843 dout(10) << __func__ << " cannot evict head, pending promote on clone "
7844 << clone_oid << dendl;
7845 return -EBUSY;
7846 }
7847 }
7848 return 0;
7849 }
7850
7851 inline int PrimaryLogPG::_delete_oid(
7852 OpContext *ctx,
7853 bool no_whiteout, // no whiteouts, no matter what.
7854 bool try_no_whiteout) // try not to whiteout
7855 {
7856 SnapSet& snapset = ctx->new_snapset;
7857 ObjectState& obs = ctx->new_obs;
7858 object_info_t& oi = obs.oi;
7859 const hobject_t& soid = oi.soid;
7860 PGTransaction* t = ctx->op_t.get();
7861
7862 // cache: set whiteout on delete?
7863 bool whiteout = false;
7864 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
7865 && !no_whiteout
7866 && !try_no_whiteout) {
7867 whiteout = true;
7868 }
7869
7870 // in luminous or later, we can't delete the head if there are
7871 // clones. we trust the caller passing no_whiteout has already
7872 // verified they don't exist.
7873 if (!snapset.clones.empty() ||
7874 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
7875 if (no_whiteout) {
7876 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
7877 << dendl;
7878 } else {
7879 dout(20) << __func__ << " has or will have clones; will whiteout"
7880 << dendl;
7881 whiteout = true;
7882 }
7883 }
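// Net effect of the checks above, summarized:
//   - cache pool with no overriding flags        -> whiteout, not delete
//   - clones exist (or this snapc will make one) -> whiteout forced,
//     unless the caller set no_whiteout, in which case it is trusted to
//     have verified (e.g. via _verify_no_head_clones()) that none remain
//   - otherwise                                  -> real delete below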
7884 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
7885 << " no_whiteout=" << (int)no_whiteout
7886 << " try_no_whiteout=" << (int)try_no_whiteout
7887 << dendl;
7888 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
7889 return -ENOENT;
7890
7891 t->remove(soid);
7892
7893 if (oi.size > 0) {
7894 interval_set<uint64_t> ch;
7895 ch.insert(0, oi.size);
7896 ctx->modified_ranges.union_of(ch);
7897 }
7898
7899 ctx->delta_stats.num_wr++;
7900 if (soid.is_snap()) {
7901 ceph_assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
7902 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
7903 } else {
7904 ctx->delta_stats.num_bytes -= oi.size;
7905 }
7906 oi.size = 0;
7907 oi.new_object();
7908
7909 // disconnect all watchers
7910 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
7911 oi.watchers.begin();
7912 p != oi.watchers.end();
7913 ++p) {
7914 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
7915 ctx->watch_disconnects.push_back(
7916 watch_disconnect_t(p->first.first, p->first.second, true));
7917 }
7918 oi.watchers.clear();
7919
7920 if (whiteout) {
7921 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
7922 oi.set_flag(object_info_t::FLAG_WHITEOUT);
7923 ctx->delta_stats.num_whiteouts++;
7924 t->create(soid);
7925 osd->logger->inc(l_osd_tier_whiteout);
7926 return 0;
7927 }
7928
7929 // delete the head
7930 ctx->delta_stats.num_objects--;
7931 if (soid.is_snap())
7932 ctx->delta_stats.num_object_clones--;
7933 if (oi.is_whiteout()) {
7934 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
7935 ctx->delta_stats.num_whiteouts--;
7936 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
7937 }
7938 if (oi.is_cache_pinned()) {
7939 ctx->delta_stats.num_objects_pinned--;
7940 }
7941 if (oi.has_manifest()) {
7942 ctx->delta_stats.num_objects_manifest--;
7943 }
7944 obs.exists = false;
7945 return 0;
7946 }
7947
7948 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
7949 {
7950 SnapSet& snapset = ctx->new_snapset;
7951 ObjectState& obs = ctx->new_obs;
7952 object_info_t& oi = obs.oi;
7953 const hobject_t& soid = oi.soid;
7954 PGTransaction* t = ctx->op_t.get();
7955 snapid_t snapid = (uint64_t)op.snap.snapid;
7956 hobject_t missing_oid;
7957
7958 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7959
7960 ObjectContextRef rollback_to;
7961
7962 int ret = find_object_context(
7963 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7964 soid.get_namespace()),
7965 &rollback_to, false, false, &missing_oid);
7966 if (ret == -EAGAIN) {
7967 /* clone must be missing */
7968 ceph_assert(is_degraded_or_backfilling_object(missing_oid) || is_degraded_on_async_recovery_target(missing_oid));
7969 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7970 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7971 block_write_on_degraded_snap(missing_oid, ctx->op);
7972 return ret;
7973 }
7974 {
7975 ObjectContextRef promote_obc;
7976 cache_result_t tier_mode_result;
7977 if (obs.exists && obs.oi.has_manifest()) {
7978 tier_mode_result =
7979 maybe_handle_manifest_detail(
7980 ctx->op,
7981 true,
7982 rollback_to);
7983 } else {
7984 tier_mode_result =
7985 maybe_handle_cache_detail(
7986 ctx->op,
7987 true,
7988 rollback_to,
7989 ret,
7990 missing_oid,
7991 true,
7992 false,
7993 &promote_obc);
7994 }
7995 switch (tier_mode_result) {
7996 case cache_result_t::NOOP:
7997 break;
7998 case cache_result_t::BLOCKED_PROMOTE:
7999 ceph_assert(promote_obc);
8000 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
8001 return -EAGAIN;
8002 case cache_result_t::BLOCKED_FULL:
8003 block_write_on_full_cache(soid, ctx->op);
8004 return -EAGAIN;
8005 case cache_result_t::REPLIED_WITH_EAGAIN:
8006 ceph_abort_msg("this can't happen, no rollback on replica");
8007 default:
8008 ceph_abort_msg("must promote was set, other values are not valid");
8009 return -EAGAIN;
8010 }
8011 }
8012
8013 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
8014 // there's no snapshot here, or there's no object.
8015 // if there's no snapshot, we delete the object; otherwise, do nothing.
8016 dout(20) << "_rollback_to deleting head on " << soid.oid
8017 << " because got ENOENT|whiteout on find_object_context" << dendl;
8018 if (ctx->obc->obs.oi.watchers.size()) {
8019 // Cannot delete an object with watchers
8020 ret = -EBUSY;
8021 } else {
8022 _delete_oid(ctx, false, false);
8023 ret = 0;
8024 }
8025 } else if (ret) {
8026 // ummm....huh? It *can't* return anything else at time of writing.
8027 ceph_abort_msg("unexpected error code in _rollback_to");
8028 } else { //we got our context, let's use it to do the rollback!
8029 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
8030 if (is_degraded_or_backfilling_object(rollback_to_sobject) ||
8031 is_degraded_on_async_recovery_target(rollback_to_sobject)) {
8032 dout(20) << "_rollback_to attempted to roll back to a degraded object "
8033 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
8034 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
8035 ret = -EAGAIN;
8036 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
8037 // rolling back to the head; we just need to clone it.
8038 ctx->modify = true;
8039 } else {
8040 /* 1) Delete current head
8041 * 2) Clone correct snapshot into head
8042 * 3) Calculate clone_overlaps by following overlaps
8043 * forward from rollback snapshot */
8044 dout(10) << "_rollback_to deleting " << soid.oid
8045 << " and rolling back to old snap" << dendl;
8046
8047 if (obs.exists) {
8048 t->remove(soid);
8049 }
8050 t->clone(soid, rollback_to_sobject);
8051 t->add_obc(rollback_to);
8052
8053 map<snapid_t, interval_set<uint64_t> >::iterator iter =
8054 snapset.clone_overlap.lower_bound(snapid);
8055 ceph_assert(iter != snapset.clone_overlap.end());
8056 interval_set<uint64_t> overlaps = iter->second;
8057 for ( ;
8058 iter != snapset.clone_overlap.end();
8059 ++iter)
8060 overlaps.intersection_of(iter->second);
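// clone_overlap[s] records where clone s is identical to the next newer
// object (the next clone, or head), so intersecting the entries from the
// rollback target forward yields the ranges where head already matches
// the target. Illustrative example: clone_overlap = {10: [0,100),
// 20: [0,50)}; rolling back to snap 10 gives [0,100) intersect [0,50)
// = [0,50), and only bytes outside that range count as modified below.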
8061
8062 if (obs.oi.size > 0) {
8063 interval_set<uint64_t> modified;
8064 modified.insert(0, obs.oi.size);
8065 overlaps.intersection_of(modified);
8066 modified.subtract(overlaps);
8067 ctx->modified_ranges.union_of(modified);
8068 }
8069
8070 // Adjust the cached objectcontext
8071 maybe_create_new_object(ctx, true);
8072 ctx->delta_stats.num_bytes -= obs.oi.size;
8073 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
8074 obs.oi.size = rollback_to->obs.oi.size;
8075 if (rollback_to->obs.oi.is_data_digest())
8076 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
8077 else
8078 obs.oi.clear_data_digest();
8079 if (rollback_to->obs.oi.is_omap_digest())
8080 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
8081 else
8082 obs.oi.clear_omap_digest();
8083
8084 if (rollback_to->obs.oi.is_omap()) {
8085 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8086 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8087 } else {
8088 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8089 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8090 }
8091 }
8092 }
8093 return ret;
8094 }
8095
8096 void PrimaryLogPG::_make_clone(
8097 OpContext *ctx,
8098 PGTransaction* t,
8099 ObjectContextRef obc,
8100 const hobject_t& head, const hobject_t& coid,
8101 object_info_t *poi)
8102 {
8103 bufferlist bv;
8104 encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8105
8106 t->clone(coid, head);
8107 setattr_maybe_cache(obc, t, OI_ATTR, bv);
8108 rmattr_maybe_cache(obc, t, SS_ATTR);
8109 }
8110
8111 void PrimaryLogPG::make_writeable(OpContext *ctx)
8112 {
8113 const hobject_t& soid = ctx->obs->oi.soid;
8114 SnapContext& snapc = ctx->snapc;
8115
8116 // clone?
8117 ceph_assert(soid.snap == CEPH_NOSNAP);
8118 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
8119 << " snapc=" << snapc << dendl;
8120
8121 bool was_dirty = ctx->obc->obs.oi.is_dirty();
8122 if (ctx->new_obs.exists) {
8123 // we will mark the object dirty
8124 if (ctx->undirty && was_dirty) {
8125 dout(20) << " clearing DIRTY flag" << dendl;
8126 ceph_assert(ctx->new_obs.oi.is_dirty());
8127 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8128 --ctx->delta_stats.num_objects_dirty;
8129 osd->logger->inc(l_osd_tier_clean);
8130 } else if (!was_dirty && !ctx->undirty) {
8131 dout(20) << " setting DIRTY flag" << dendl;
8132 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
8133 ++ctx->delta_stats.num_objects_dirty;
8134 osd->logger->inc(l_osd_tier_dirty);
8135 }
8136 } else {
8137 if (was_dirty) {
8138 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
8139 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8140 --ctx->delta_stats.num_objects_dirty;
8141 }
8142 }
8143
8144 if ((ctx->new_obs.exists &&
8145 ctx->new_obs.oi.is_omap()) &&
8146 (!ctx->obc->obs.exists ||
8147 !ctx->obc->obs.oi.is_omap())) {
8148 ++ctx->delta_stats.num_objects_omap;
8149 }
8150 if ((!ctx->new_obs.exists ||
8151 !ctx->new_obs.oi.is_omap()) &&
8152 (ctx->obc->obs.exists &&
8153 ctx->obc->obs.oi.is_omap())) {
8154 --ctx->delta_stats.num_objects_omap;
8155 }
8156
8157 if (ctx->new_snapset.seq > snapc.seq) {
8158 dout(10) << " op snapset is old" << dendl;
8159 }
8160
8161 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
8162 snapc.snaps.size() && // there are snaps
8163 !ctx->cache_evict &&
8164 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
8165 // clone
8166 hobject_t coid = soid;
8167 coid.snap = snapc.seq;
8168
8169 unsigned l;
8170 for (l = 1;
8171 l < snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq;
8172 l++) ;
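// snapc.snaps is ordered newest-first, so this counts the leading entries
// newer than the snapset's seq -- exactly the snapshots the new clone
// must be recorded under. E.g. snapc.snaps = [8,7,5] with
// new_snapset.seq = 6 yields l == 2, and the clone gets snaps = [8,7].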
8173
8174 vector<snapid_t> snaps(l);
8175 for (unsigned i=0; i<l; i++)
8176 snaps[i] = snapc.snaps[i];
8177
8178 // prepare clone
8179 object_info_t static_snap_oi(coid);
8180 object_info_t *snap_oi;
8181 if (is_primary()) {
8182 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
8183 ctx->clone_obc->destructor_callback =
8184 new C_PG_ObjectContext(this, ctx->clone_obc.get());
8185 ctx->clone_obc->obs.oi = static_snap_oi;
8186 ctx->clone_obc->obs.exists = true;
8187 ctx->clone_obc->ssc = ctx->obc->ssc;
8188 ctx->clone_obc->ssc->ref++;
8189 if (pool.info.is_erasure())
8190 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
8191 snap_oi = &ctx->clone_obc->obs.oi;
8192 bool got = ctx->lock_manager.get_write_greedy(
8193 coid,
8194 ctx->clone_obc,
8195 ctx->op);
8196 ceph_assert(got);
8197 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
8198 } else {
8199 snap_oi = &static_snap_oi;
8200 }
8201 snap_oi->version = ctx->at_version;
8202 snap_oi->prior_version = ctx->obs->oi.version;
8203 snap_oi->copy_user_bits(ctx->obs->oi);
8204
8205 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
8206
8207 ctx->delta_stats.num_objects++;
8208 if (snap_oi->is_dirty()) {
8209 ctx->delta_stats.num_objects_dirty++;
8210 osd->logger->inc(l_osd_tier_dirty);
8211 }
8212 if (snap_oi->is_omap())
8213 ctx->delta_stats.num_objects_omap++;
8214 if (snap_oi->is_cache_pinned())
8215 ctx->delta_stats.num_objects_pinned++;
8216 if (snap_oi->has_manifest())
8217 ctx->delta_stats.num_objects_manifest++;
8218 ctx->delta_stats.num_object_clones++;
8219 ctx->new_snapset.clones.push_back(coid.snap);
8220 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
8221 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
8222
8223 // clone_overlap should contain an entry for each clone
8224 // (an empty interval_set if there is no overlap)
8225 ctx->new_snapset.clone_overlap[coid.snap];
8226 if (ctx->obs->oi.size)
8227 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
8228
8229 // log clone
8230 dout(10) << " cloning v " << ctx->obs->oi.version
8231 << " to " << coid << " v " << ctx->at_version
8232 << " snaps=" << snaps
8233 << " snapset=" << ctx->new_snapset << dendl;
8234 ctx->log.push_back(pg_log_entry_t(
8235 pg_log_entry_t::CLONE, coid, ctx->at_version,
8236 ctx->obs->oi.version,
8237 ctx->obs->oi.user_version,
8238 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
8239 encode(snaps, ctx->log.back().snaps);
8240
8241 ctx->at_version.version++;
8242 }
8243
8244 // update most recent clone_overlap and usage stats
8245 if (ctx->new_snapset.clones.size() > 0) {
8246 // clone_overlap tracks the byte ranges shared between head and the
8247 // most recent clone. if that clone has been evicted, it no longer
8248 // contributes to the stats, but its clone_overlap entry still exists
8249 // in the snapset, so we must keep the entry consistent with this
8250 // write regardless.
8251 hobject_t last_clone_oid = soid;
8252 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
8253 interval_set<uint64_t> &newest_overlap =
8254 ctx->new_snapset.clone_overlap.rbegin()->second;
8255 ctx->modified_ranges.intersection_of(newest_overlap);
8256 if (is_present_clone(last_clone_oid)) {
8257 // modified_ranges is still in use by the clone
8258 ctx->delta_stats.num_bytes += ctx->modified_ranges.size();
8259 }
8260 newest_overlap.subtract(ctx->modified_ranges);
8261 }
8262
8263 if (snapc.seq > ctx->new_snapset.seq) {
8264 // update snapset with latest snap context
8265 ctx->new_snapset.seq = snapc.seq;
8266 ctx->new_snapset.snaps = snapc.snaps;
8267 }
8268 dout(20) << "make_writeable " << soid
8269 << " done, snapset=" << ctx->new_snapset << dendl;
8270 }
8271
8272
8273 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
8274 interval_set<uint64_t>& modified, uint64_t offset,
8275 uint64_t length, bool write_full)
8276 {
8277 interval_set<uint64_t> ch;
8278 if (write_full) {
8279 if (oi.size)
8280 ch.insert(0, oi.size);
8281 } else if (length)
8282 ch.insert(offset, length);
8283 modified.union_of(ch);
8284 if (write_full ||
8285 (offset + length > oi.size && length)) {
8286 uint64_t new_size = offset + length;
8287 delta_stats.num_bytes -= oi.size;
8288 delta_stats.num_bytes += new_size;
8289 oi.size = new_size;
8290 }
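// Example of the accounting above: writing 8192 bytes at offset 4096 into
// an 8192-byte object gives new_size = 12288 and adjusts num_bytes by
// -8192 + 12288 = +4096; a write wholly inside the existing extent leaves
// oi.size and num_bytes untouched.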
8291
8292 if (oi.has_manifest() && oi.manifest.is_chunked()) {
8293 for (auto &p : oi.manifest.chunk_map) {
8294 if ((p.first <= offset && p.first + p.second.length > offset) ||
8295 (p.first > offset && p.first <= offset + length)) {
8296 p.second.clear_flag(chunk_info_t::FLAG_MISSING);
8297 p.second.set_flag(chunk_info_t::FLAG_DIRTY);
8298 }
8299 }
8300 }
8301 delta_stats.num_wr++;
8302 delta_stats.num_wr_kb += shift_round_up(length, 10);
8303 }
8304
8305 void PrimaryLogPG::truncate_update_size_and_usage(
8306 object_stat_sum_t& delta_stats,
8307 object_info_t& oi,
8308 uint64_t truncate_size)
8309 {
8310 if (oi.size != truncate_size) {
8311 delta_stats.num_bytes -= oi.size;
8312 delta_stats.num_bytes += truncate_size;
8313 oi.size = truncate_size;
8314 }
8315 }
8316
8317 void PrimaryLogPG::complete_disconnect_watches(
8318 ObjectContextRef obc,
8319 const list<watch_disconnect_t> &to_disconnect)
8320 {
8321 for (list<watch_disconnect_t>::const_iterator i =
8322 to_disconnect.begin();
8323 i != to_disconnect.end();
8324 ++i) {
8325 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
8326 auto watchers_entry = obc->watchers.find(watcher);
8327 if (watchers_entry != obc->watchers.end()) {
8328 WatchRef watch = watchers_entry->second;
8329 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
8330 obc->watchers.erase(watcher);
8331 watch->remove(i->send_disconnect);
8332 } else {
8333 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
8334 << watcher << dendl;
8335 }
8336 }
8337 }
8338
8339 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
8340 {
8341 entity_name_t entity = ctx->reqid.name;
8342 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
8343
8344 // disconnects first
8345 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
8346
8347 ceph_assert(conn);
8348
8349 auto session = conn->get_priv();
8350 if (!session)
8351 return;
8352
8353 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
8354 i != ctx->watch_connects.end();
8355 ++i) {
8356 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
8357 dout(15) << "do_osd_op_effects applying watch connect on session "
8358 << session.get() << " watcher " << watcher << dendl;
8359 WatchRef watch;
8360 if (ctx->obc->watchers.count(watcher)) {
8361 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
8362 << dendl;
8363 watch = ctx->obc->watchers[watcher];
8364 } else {
8365 dout(15) << "do_osd_op_effects new watcher " << watcher
8366 << dendl;
8367 watch = Watch::makeWatchRef(
8368 this, osd, ctx->obc, i->first.timeout_seconds,
8369 i->first.cookie, entity, conn->get_peer_addr());
8370 ctx->obc->watchers.insert(
8371 make_pair(
8372 watcher,
8373 watch));
8374 }
8375 watch->connect(conn, i->second);
8376 }
8377
8378 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
8379 p != ctx->notifies.end();
8380 ++p) {
8381 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
8382 ConnectionRef conn(ctx->op->get_req()->get_connection());
8383 NotifyRef notif(
8384 Notify::makeNotifyRef(
8385 conn,
8386 ctx->reqid.name.num(),
8387 p->bl,
8388 p->timeout,
8389 p->cookie,
8390 p->notify_id,
8391 ctx->obc->obs.oi.user_version,
8392 osd));
8393 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8394 ctx->obc->watchers.begin();
8395 i != ctx->obc->watchers.end();
8396 ++i) {
8397 dout(10) << "starting notify on watch " << i->first << dendl;
8398 i->second->start_notify(notif);
8399 }
8400 notif->init();
8401 }
8402
8403 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
8404 p != ctx->notify_acks.end();
8405 ++p) {
8406 if (p->watch_cookie)
8407 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
8408 else
8409 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
8410 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
8411 ctx->obc->watchers.begin();
8412 i != ctx->obc->watchers.end();
8413 ++i) {
8414 if (i->first.second != entity) continue;
8415 if (p->watch_cookie &&
8416 p->watch_cookie.get() != i->first.first) continue;
8417 dout(10) << "acking notify on watch " << i->first << dendl;
8418 i->second->notify_ack(p->notify_id, p->reply_bl);
8419 }
8420 }
8421 }
8422
8423 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
8424 {
8425 ostringstream ss;
8426 ss << "temp_" << info.pgid << "_" << get_role()
8427 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
8428 hobject_t hoid = target.make_temp_hobject(ss.str());
8429 dout(20) << __func__ << " " << hoid << dendl;
8430 return hoid;
8431 }
8432
8433 hobject_t PrimaryLogPG::get_temp_recovery_object(
8434 const hobject_t& target,
8435 eversion_t version)
8436 {
8437 ostringstream ss;
8438 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
8439 << "_" << version
8440 << "_" << info.history.same_interval_since
8441 << "_" << target.snap;
8442 // pgid + version + interval + snapid is unique, and short
8443 hobject_t hoid = target.make_temp_hobject(ss.str());
8444 dout(20) << __func__ << " " << hoid << dendl;
8445 return hoid;
8446 }
8447
8448 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
8449 {
8450 ceph_assert(!ctx->ops->empty());
8451
8452 // valid snap context?
8453 if (!ctx->snapc.is_valid()) {
8454 dout(10) << " invalid snapc " << ctx->snapc << dendl;
8455 return -EINVAL;
8456 }
8457
8458 // prepare the actual mutation
8459 int result = do_osd_ops(ctx, *ctx->ops);
8460 if (result < 0) {
8461 if (ctx->op->may_write() &&
8462 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
8463 // need to save the error code in the pg log, to detect dup ops,
8464 // but do nothing else
8465 ctx->update_log_only = true;
8466 }
8467 return result;
8468 }
8469
8470 // read-op? write-op noop? done?
8471 if (ctx->op_t->empty() && !ctx->modify) {
8472 if (ctx->pending_async_reads.empty())
8473 unstable_stats.add(ctx->delta_stats);
8474 if (ctx->op->may_write() &&
8475 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
8476 ctx->update_log_only = true;
8477 }
8478 return result;
8479 }
8480
8481 // check for full
8482 if ((ctx->delta_stats.num_bytes > 0 ||
8483 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
8484 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
8485 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
8486 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
8487 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
8488 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
8489 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
8490 << dendl;
8491 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
8492 // they tried, they failed.
8493 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
8494 return pool.info.has_flag(pg_pool_t::FLAG_FULL_QUOTA) ? -EDQUOT : -ENOSPC;
8495 } else {
8496 // drop request
8497 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
8498 return -EAGAIN;
8499 }
8500 }
8501
8502 const hobject_t& soid = ctx->obs->oi.soid;
8503 // clone, if necessary
8504 if (soid.snap == CEPH_NOSNAP)
8505 make_writeable(ctx);
8506
8507 finish_ctx(ctx,
8508 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
8509 pg_log_entry_t::DELETE);
8510
8511 return result;
8512 }
8513
8514 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type)
8515 {
8516 const hobject_t& soid = ctx->obs->oi.soid;
8517 dout(20) << __func__ << " " << soid << " " << ctx
8518 << " op " << pg_log_entry_t::get_op_name(log_op_type)
8519 << dendl;
8520 utime_t now = ceph_clock_now();
8521
8522 // finish and log the op.
8523 if (ctx->user_modify) {
8524 // update the user_version for any modify ops, except for the watch op
8525 ctx->user_at_version = std::max(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
8526 /* In order for new clients and old clients to interoperate properly
8527 * when exchanging versions, we need to lower bound the user_version
8528 * (which our new clients pay proper attention to)
8529 * by the at_version (which is all the old clients can ever see). */
8530 if (ctx->at_version.version > ctx->user_at_version)
8531 ctx->user_at_version = ctx->at_version.version;
8532 ctx->new_obs.oi.user_version = ctx->user_at_version;
8533 }
8534 ctx->bytes_written = ctx->op_t->get_bytes_written();
8535
8536 if (ctx->new_obs.exists) {
8537 ctx->new_obs.oi.version = ctx->at_version;
8538 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
8539 ctx->new_obs.oi.last_reqid = ctx->reqid;
8540 if (ctx->mtime != utime_t()) {
8541 ctx->new_obs.oi.mtime = ctx->mtime;
8542 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
8543 ctx->new_obs.oi.local_mtime = now;
8544 } else {
8545 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
8546 }
8547
8548 // object_info_t
8549 map <string, bufferlist> attrs;
8550 bufferlist bv(sizeof(ctx->new_obs.oi));
8551 encode(ctx->new_obs.oi, bv,
8552 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
8553 attrs[OI_ATTR].claim(bv);
8554
8555 // snapset
8556 if (soid.snap == CEPH_NOSNAP) {
8557 dout(10) << " final snapset " << ctx->new_snapset
8558 << " in " << soid << dendl;
8559 bufferlist bss;
8560 encode(ctx->new_snapset, bss);
8561 attrs[SS_ATTR].claim(bss);
8562 } else {
8563 dout(10) << " no snapset (this is a clone)" << dendl;
8564 }
8565 ctx->op_t->setattrs(soid, attrs);
8566 } else {
8567 // reset cached oi
8568 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
8569 }
8570
8571 // append to log
8572 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
8573 ctx->obs->oi.version,
8574 ctx->user_at_version, ctx->reqid,
8575 ctx->mtime, 0));
8576 if (soid.snap < CEPH_NOSNAP) {
8577 switch (log_op_type) {
8578 case pg_log_entry_t::MODIFY:
8579 case pg_log_entry_t::PROMOTE:
8580 case pg_log_entry_t::CLEAN:
8581 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
8582 << dendl;
8583 encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
8584 break;
8585 default:
8586 break;
8587 }
8588 }
8589
8590 if (!ctx->extra_reqids.empty()) {
8591 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << " "
8592 << ctx->extra_reqid_return_codes << dendl;
8593 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
8594 ctx->log.back().extra_reqid_return_codes.swap(ctx->extra_reqid_return_codes);
8595 }
8596
8597 // apply new object state.
8598 ctx->obc->obs = ctx->new_obs;
8599
8600 if (soid.is_head() && !ctx->obc->obs.exists) {
8601 ctx->obc->ssc->exists = false;
8602 ctx->obc->ssc->snapset = SnapSet();
8603 } else {
8604 ctx->obc->ssc->exists = true;
8605 ctx->obc->ssc->snapset = ctx->new_snapset;
8606 }
8607 }
8608
8609 void PrimaryLogPG::apply_stats(
8610 const hobject_t &soid,
8611 const object_stat_sum_t &delta_stats) {
8612
8613 info.stats.stats.add(delta_stats);
8614 info.stats.stats.floor(0);
8615
8616 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
8617 i != backfill_targets.end();
8618 ++i) {
8619 pg_shard_t bt = *i;
8620 pg_info_t& pinfo = peer_info[bt];
8621 if (soid <= pinfo.last_backfill)
8622 pinfo.stats.stats.add(delta_stats);
8623 else if (soid <= last_backfill_started)
8624 pending_backfill_updates[soid].stats.add(delta_stats);
8625 }
8626
8627 if (is_primary() && scrubber.active) {
8628 if (soid < scrubber.start) {
8629 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
8630 << "," << scrubber.end << ")" << dendl;
8631 scrub_cstat.add(delta_stats);
8632 } else {
8633 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
8634 << "," << scrubber.end << ")" << dendl;
8635 }
8636 }
8637 }
8638
8639 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
8640 {
8641 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
8642 ceph_assert(ctx->async_reads_complete());
8643
8644 for (vector<OSDOp>::iterator p = ctx->ops->begin();
8645 p != ctx->ops->end() && result >= 0; ++p) {
8646 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
8647 result = p->rval;
8648 break;
8649 }
8650 ctx->bytes_read += p->outdata.length();
8651 }
8652 ctx->reply->claim_op_out_data(*ctx->ops);
8653 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
8654
8655 MOSDOpReply *reply = ctx->reply;
8656 ctx->reply = nullptr;
8657
8658 if (result >= 0) {
8659 if (!ctx->ignore_log_op_stats) {
8660 log_op_stats(*ctx->op, ctx->bytes_written, ctx->bytes_read);
8661
8662 publish_stats_to_osd();
8663 }
8664
8665 // on read, return the current object version
8666 if (ctx->obs) {
8667 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
8668 } else {
8669 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
8670 }
8671 } else if (result == -ENOENT) {
8672 // on ENOENT, set a floor for what the next user version will be.
8673 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
8674 }
8675
8676 reply->set_result(result);
8677 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8678 osd->send_message_osd_client(reply, m->get_connection());
8679 close_op_ctx(ctx);
8680 }
8681
8682 // ========================================================================
8683 // copyfrom
8684
8685 struct C_Copyfrom : public Context {
8686 PrimaryLogPGRef pg;
8687 hobject_t oid;
8688 epoch_t last_peering_reset;
8689 ceph_tid_t tid;
8690 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8691 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8692 const PrimaryLogPG::CopyOpRef& c)
8693 : pg(p), oid(o), last_peering_reset(lpr),
8694 tid(0), cop(c)
8695 {}
8696 void finish(int r) override {
8697 if (r == -ECANCELED)
8698 return;
8699 pg->lock();
8700 if (last_peering_reset == pg->get_last_peering_reset()) {
8701 pg->process_copy_chunk(oid, tid, r);
8702 cop.reset();
8703 }
8704 pg->unlock();
8705 }
8706 };
8707
8708 struct C_CopyFrom_AsyncReadCb : public Context {
8709 OSDOp *osd_op;
8710 object_copy_data_t reply_obj;
8711 uint64_t features;
8712 size_t len;
8713 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
8714 osd_op(osd_op), features(features), len(0) {}
8715 void finish(int r) override {
8716 osd_op->rval = r;
8717 if (r < 0) {
8718 return;
8719 }
8720
8721 ceph_assert(len > 0);
8722 ceph_assert(len <= reply_obj.data.length());
8723 bufferlist bl;
8724 bl.substr_of(reply_obj.data, 0, len);
8725 reply_obj.data.swap(bl);
8726 encode(reply_obj, osd_op->outdata, features);
8727 }
8728 };
8729
8730 struct C_CopyChunk : public Context {
8731 PrimaryLogPGRef pg;
8732 hobject_t oid;
8733 epoch_t last_peering_reset;
8734 ceph_tid_t tid;
8735 PrimaryLogPG::CopyOpRef cop; // used for keeping the cop alive
8736 uint64_t offset = 0;
8737 C_CopyChunk(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
8738 const PrimaryLogPG::CopyOpRef& c)
8739 : pg(p), oid(o), last_peering_reset(lpr),
8740 tid(0), cop(c)
8741 {}
8742 void finish(int r) override {
8743 if (r == -ECANCELED)
8744 return;
8745 pg->lock();
8746 if (last_peering_reset == pg->get_last_peering_reset()) {
8747 pg->process_copy_chunk_manifest(oid, tid, r, offset);
8748 cop.reset();
8749 }
8750 pg->unlock();
8751 }
8752 };
8753
8754 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::const_iterator& bp,
8755 OSDOp& osd_op, ObjectContextRef &obc)
8756 {
8757 object_info_t& oi = obc->obs.oi;
8758 hobject_t& soid = oi.soid;
8759 int result = 0;
8760 object_copy_cursor_t cursor;
8761 uint64_t out_max;
8762 try {
8763 decode(cursor, bp);
8764 decode(out_max, bp);
8765 }
8766 catch (buffer::error& e) {
8767 result = -EINVAL;
8768 return result;
8769 }
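// COPY_GET is cursor-driven: the client sends (cursor, out_max) and the
// reply is filled in a fixed order -- attrs, then object data, then omap
// header/keys -- until the out_max byte budget runs out, with the
// advanced cursor echoed back for the next round; reqids are attached
// only on the final chunk. A hedged sketch of the driving loop
// (illustrative; compare _copy_some() below):
//
//   object_copy_cursor_t cursor;              // starts in is_initial()
//   while (!cursor.is_complete()) {
//     ObjectOperation op;
//     op.copy_get(&cursor, chunk_size, ...);  // out-params elided
//     /* submit, then consume attrs/data/omap from the reply */
//   }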
8770
8771 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
8772 uint64_t features = op->get_features();
8773
8774 bool async_read_started = false;
8775 object_copy_data_t _reply_obj;
8776 C_CopyFrom_AsyncReadCb *cb = nullptr;
8777 if (pool.info.is_erasure()) {
8778 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
8779 }
8780 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
8781 // size, mtime
8782 reply_obj.size = oi.size;
8783 reply_obj.mtime = oi.mtime;
8784 ceph_assert(obc->ssc);
8785 if (soid.snap < CEPH_NOSNAP) {
8786 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
8787 ceph_assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
8788 reply_obj.snaps = p->second;
8789 } else {
8790 reply_obj.snap_seq = obc->ssc->snapset.seq;
8791 }
8792 if (oi.is_data_digest()) {
8793 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
8794 reply_obj.data_digest = oi.data_digest;
8795 }
8796 if (oi.is_omap_digest()) {
8797 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
8798 reply_obj.omap_digest = oi.omap_digest;
8799 }
8800 reply_obj.truncate_seq = oi.truncate_seq;
8801 reply_obj.truncate_size = oi.truncate_size;
8802
8803 // attrs
8804 map<string,bufferlist>& out_attrs = reply_obj.attrs;
8805 if (!cursor.attr_complete) {
8806 result = getattrs_maybe_cache(
8807 ctx->obc,
8808 &out_attrs);
8809 if (result < 0) {
8810 if (cb) {
8811 delete cb;
8812 }
8813 return result;
8814 }
8815 cursor.attr_complete = true;
8816 dout(20) << " got attrs" << dendl;
8817 }
8818
8819 int64_t left = out_max - osd_op.outdata.length();
8820
8821 // data
8822 bufferlist& bl = reply_obj.data;
8823 if (left > 0 && !cursor.data_complete) {
8824 if (cursor.data_offset < oi.size) {
8825 uint64_t max_read = std::min(oi.size - cursor.data_offset, (uint64_t)left);
8826 if (cb) {
8827 async_read_started = true;
8828 ctx->pending_async_reads.push_back(
8829 make_pair(
8830 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
8831 make_pair(&bl, cb)));
8832 cb->len = max_read;
8833
8834 ctx->op_finishers[ctx->current_osd_subop_num].reset(
8835 new ReadFinisher(osd_op));
8836 result = -EINPROGRESS;
8837
8838 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
8839 } else {
8840 result = pgbackend->objects_read_sync(
8841 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
8842 if (result < 0)
8843 return result;
8844 }
8845 left -= max_read;
8846 cursor.data_offset += max_read;
8847 }
8848 if (cursor.data_offset == oi.size) {
8849 cursor.data_complete = true;
8850 dout(20) << " got data" << dendl;
8851 }
8852 ceph_assert(cursor.data_offset <= oi.size);
8853 }
8854
8855 // omap
8856 uint32_t omap_keys = 0;
8857 if (!pool.info.supports_omap() || !oi.is_omap()) {
8858 cursor.omap_complete = true;
8859 } else {
8860 if (left > 0 && !cursor.omap_complete) {
8861 ceph_assert(cursor.data_complete);
8862 if (cursor.omap_offset.empty()) {
8863 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
8864 &reply_obj.omap_header);
8865 }
8866 bufferlist omap_data;
8867 ObjectMap::ObjectMapIterator iter =
8868 osd->store->get_omap_iterator(ch, ghobject_t(oi.soid));
8869 ceph_assert(iter);
8870 iter->upper_bound(cursor.omap_offset);
8871 for (; iter->valid(); iter->next()) {
8872 ++omap_keys;
8873 encode(iter->key(), omap_data);
8874 encode(iter->value(), omap_data);
8875 left -= iter->key().length() + 4 + iter->value().length() + 4;
8876 if (left <= 0)
8877 break;
8878 }
8879 if (omap_keys) {
8880 encode(omap_keys, reply_obj.omap_data);
8881 reply_obj.omap_data.claim_append(omap_data);
8882 }
8883 if (iter->valid()) {
8884 cursor.omap_offset = iter->key();
8885 } else {
8886 cursor.omap_complete = true;
8887 dout(20) << " got omap" << dendl;
8888 }
8889 }
8890 }
8891
8892 if (cursor.is_complete()) {
8893 // include reqids only in the final step. this is a bit fragile
8894 // but it works...
8895 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10,
8896 &reply_obj.reqids,
8897 &reply_obj.reqid_return_codes);
8898 dout(20) << " got reqids" << dendl;
8899 }
8900
8901 dout(20) << " cursor.is_complete=" << cursor.is_complete()
8902 << " " << out_attrs.size() << " attrs"
8903 << " " << bl.length() << " bytes"
8904 << " " << reply_obj.omap_header.length() << " omap header bytes"
8905 << " " << reply_obj.omap_data.length() << " omap data bytes in "
8906 << omap_keys << " keys"
8907 << " " << reply_obj.reqids.size() << " reqids"
8908 << dendl;
8909 reply_obj.cursor = cursor;
8910 if (!async_read_started) {
8911 encode(reply_obj, osd_op.outdata, features);
8912 }
8913 if (cb && !async_read_started) {
8914 delete cb;
8915 }
8916
8917 if (result > 0) {
8918 result = 0;
8919 }
8920 return result;
8921 }
8922
8923 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8924 OSDOp& osd_op)
8925 {
8926 // NOTE: we take non-const ref here for claim_op_out_data below; we must
8927 // be careful not to modify anything else that will upset a racing
8928 // operator<<
8929 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
8930 uint64_t features = m->get_features();
8931 object_copy_data_t reply_obj;
8932
8933 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids,
8934 &reply_obj.reqid_return_codes);
8935 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8936 encode(reply_obj, osd_op.outdata, features);
8937 osd_op.rval = -ENOENT;
8938 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap_epoch(), 0, false);
8939 reply->claim_op_out_data(m->ops);
8940 reply->set_result(-ENOENT);
8941 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8942 osd->send_message_osd_client(reply, m->get_connection());
8943 }
8944
8945 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8946 hobject_t src, object_locator_t oloc,
8947 version_t version, unsigned flags,
8948 bool mirror_snapset,
8949 unsigned src_obj_fadvise_flags,
8950 unsigned dest_obj_fadvise_flags)
8951 {
8952 const hobject_t& dest = obc->obs.oi.soid;
8953 dout(10) << __func__ << " " << dest
8954 << " from " << src << " " << oloc << " v" << version
8955 << " flags " << flags
8956 << (mirror_snapset ? " mirror_snapset" : "")
8957 << dendl;
8958
8959 ceph_assert(!mirror_snapset || src.snap == CEPH_NOSNAP);
8960
8961 // cancel a previous in-progress copy?
8962 if (copy_ops.count(dest)) {
8963 // FIXME: if the src etc match, we could avoid restarting from the
8964 // beginning.
8965 CopyOpRef cop = copy_ops[dest];
8966 vector<ceph_tid_t> tids;
8967 cancel_copy(cop, false, &tids);
8968 osd->objecter->op_cancel(tids, -ECANCELED);
8969 }
8970
8971 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8972 mirror_snapset, src_obj_fadvise_flags,
8973 dest_obj_fadvise_flags));
8974 copy_ops[dest] = cop;
8975 obc->start_block();
8976
8977 if (!obc->obs.oi.has_manifest()) {
8978 _copy_some(obc, cop);
8979 } else {
8980 if (obc->obs.oi.manifest.is_redirect()) {
8981 _copy_some(obc, cop);
8982 } else if (obc->obs.oi.manifest.is_chunked()) {
8983 auto p = obc->obs.oi.manifest.chunk_map.begin();
8984 _copy_some_manifest(obc, cop, p->first);
8985 } else {
8986 ceph_abort_msg("unrecognized manifest type");
8987 }
8988 }
8989 }
8990
8991 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8992 {
8993 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
8994
8995 unsigned flags = 0;
8996 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8997 flags |= CEPH_OSD_FLAG_FLUSH;
8998 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8999 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9000 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9001 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9002 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9003 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9004 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9005 flags |= CEPH_OSD_FLAG_RWORDERED;
9006
9007 C_GatherBuilder gather(cct);
9008
9009 if (cop->cursor.is_initial() && cop->mirror_snapset) {
9010 // list snaps too.
9011 ceph_assert(cop->src.snap == CEPH_NOSNAP);
9012 ObjectOperation op;
9013 op.list_snaps(&cop->results.snapset, NULL);
9014 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9015 CEPH_SNAPDIR, NULL,
9016 flags, gather.new_sub(), NULL);
9017 cop->objecter_tid2 = tid;
9018 }
9019
9020 ObjectOperation op;
9021 if (cop->results.user_version) {
9022 op.assert_version(cop->results.user_version);
9023 } else {
9024 // we should learn the version after the first chunk, if we didn't know
9025 // it already!
9026 ceph_assert(cop->cursor.is_initial());
9027 }
9028 op.copy_get(&cop->cursor, get_copy_chunk_size(),
9029 &cop->results.object_size, &cop->results.mtime,
9030 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
9031 &cop->results.snaps, &cop->results.snap_seq,
9032 &cop->results.flags,
9033 &cop->results.source_data_digest,
9034 &cop->results.source_omap_digest,
9035 &cop->results.reqids,
9036 &cop->results.reqid_return_codes,
9037 &cop->results.truncate_seq,
9038 &cop->results.truncate_size,
9039 &cop->rval);
9040 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9041
9042 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
9043 get_last_peering_reset(), cop);
9044 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
9045 gather.set_finisher(new C_OnFinisher(fin,
9046 osd->objecter_finishers[n]));
9047
9048 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
9049 cop->src.snap, NULL,
9050 flags,
9051 gather.new_sub(),
9052 // discover the object version if we don't know it yet
9053 cop->results.user_version ? NULL : &cop->results.user_version);
9054 fin->tid = tid;
9055 cop->objecter_tid = tid;
9056 gather.activate();
9057 }
9058
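/*
 * Chunked-manifest counterpart of _copy_some(): walk chunk_map from
 * start_offset, batching chunks until their combined size exceeds
 * get_copy_chunk_size(), and issue one objecter read per chunk. Each
 * read completes via C_CopyChunk into process_copy_chunk_manifest().
 */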
9059 void PrimaryLogPG::_copy_some_manifest(ObjectContextRef obc, CopyOpRef cop, uint64_t start_offset)
9060 {
9061 dout(10) << __func__ << " " << *obc << " " << cop << dendl;
9062
9063 unsigned flags = 0;
9064 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
9065 flags |= CEPH_OSD_FLAG_FLUSH;
9066 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
9067 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
9068 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
9069 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
9070 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
9071 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
9072 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
9073 flags |= CEPH_OSD_FLAG_RWORDERED;
9074
9075 int num_chunks = 0;
9076 uint64_t last_offset = 0, chunks_size = 0;
9077 object_manifest_t *manifest = &obc->obs.oi.manifest;
9078 map<uint64_t, chunk_info_t>::iterator iter = manifest->chunk_map.find(start_offset);
9079 for (;iter != manifest->chunk_map.end(); ++iter) {
9080 num_chunks++;
9081 chunks_size += iter->second.length;
9082 last_offset = iter->first;
9083 if (get_copy_chunk_size() < chunks_size) {
9084 break;
9085 }
9086 }
9087
9088 cop->num_chunk = num_chunks;
9089 cop->start_offset = start_offset;
9090 cop->last_offset = last_offset;
9091 dout(20) << __func__ << " oid " << obc->obs.oi.soid << " num_chunks: " << num_chunks
9092 << " start_offset: " << start_offset << " chunks_size: " << chunks_size
9093 << " last_offset: " << last_offset << dendl;
9094
9095 iter = manifest->chunk_map.find(start_offset);
9096 for (;iter != manifest->chunk_map.end(); ++iter) {
9097 uint64_t obj_offset = iter->first;
9098 uint64_t length = manifest->chunk_map[iter->first].length;
9099 hobject_t soid = manifest->chunk_map[iter->first].oid;
9100 object_locator_t oloc(soid);
9101 CopyCallback * cb = NULL;
9102 CopyOpRef sub_cop(std::make_shared<CopyOp>(cb, ObjectContextRef(), cop->src, oloc,
9103 cop->results.user_version, cop->flags, cop->mirror_snapset,
9104 cop->src_obj_fadvise_flags, cop->dest_obj_fadvise_flags));
9105 sub_cop->cursor.data_offset = obj_offset;
9106 cop->chunk_cops[obj_offset] = sub_cop;
9107
9108 int s = sub_cop->chunk_ops.size();
9109 sub_cop->chunk_ops.resize(s+1);
9110 sub_cop->chunk_ops[s].op.op = CEPH_OSD_OP_READ;
9111 sub_cop->chunk_ops[s].op.extent.offset = manifest->chunk_map[iter->first].offset;
9112 sub_cop->chunk_ops[s].op.extent.length = length;
9113
9114 ObjectOperation op;
9115 op.dup(sub_cop->chunk_ops);
9116
9117 dout(20) << __func__ << " tgt_oid: " << soid.oid << " tgt_offset: "
9118 << manifest->chunk_map[iter->first].offset
9119 << " length: " << length << " pool id: " << oloc.pool << dendl;
9120
9121 if (cop->results.user_version) {
9122 op.assert_version(cop->results.user_version);
9123 } else {
9124 // we should learn the version after the first chunk, if we didn't know
9125 // it already!
9126 ceph_assert(cop->cursor.is_initial());
9127 }
9128 op.set_last_op_flags(cop->src_obj_fadvise_flags);
9129
9130 C_CopyChunk *fin = new C_CopyChunk(this, obc->obs.oi.soid,
9131 get_last_peering_reset(), cop);
9132 fin->offset = obj_offset;
9133 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
9134
9135 ceph_tid_t tid = osd->objecter->read(soid.oid, oloc, op,
9136 sub_cop->src.snap, NULL,
9137 flags,
9138 new C_OnFinisher(fin, osd->objecter_finishers[n]),
9139 // discover the object version if we don't know it yet
9140 sub_cop->results.user_version ? NULL : &sub_cop->results.user_version);
9141 fin->tid = tid;
9142 sub_cop->objecter_tid = tid;
9143 if (last_offset < iter->first) {
9144 break;
9145 }
9146 }
9147 }
9148
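/*
 * Completion for a single copy-get chunk: fold the received data and omap
 * into the running digests, stash attrs, and either write the partial
 * result to a temp object and fetch more, or, on the final chunk, verify
 * the digests against the source and prepare fill_in_final_tx to commit.
 */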
9149 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
9150 {
9151 dout(10) << __func__ << " " << oid << " tid " << tid
9152 << " " << cpp_strerror(r) << dendl;
9153 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9154 if (p == copy_ops.end()) {
9155 dout(10) << __func__ << " no copy_op found" << dendl;
9156 return;
9157 }
9158 CopyOpRef cop = p->second;
9159 if (tid != cop->objecter_tid) {
9160 dout(10) << __func__ << " tid " << tid << " != cop " << cop
9161 << " tid " << cop->objecter_tid << dendl;
9162 return;
9163 }
9164
9165 if (cop->omap_data.length() || cop->omap_header.length())
9166 cop->results.has_omap = true;
9167
9168 if (r >= 0 && !pool.info.supports_omap() &&
9169 (cop->omap_data.length() || cop->omap_header.length())) {
9170 r = -EOPNOTSUPP;
9171 }
9172 cop->objecter_tid = 0;
9173 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9174 ObjectContextRef& cobc = cop->obc;
9175
9176 if (r < 0)
9177 goto out;
9178
9179 ceph_assert(cop->rval >= 0);
9180
9181 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
9182 // verify snap hasn't been deleted
9183 vector<snapid_t>::iterator p = cop->results.snaps.begin();
9184 while (p != cop->results.snaps.end()) {
9185 if (pool.info.is_removed_snap(*p)) {
9186 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
9187 << dendl;
9188 for (vector<snapid_t>::iterator q = p + 1;
9189 q != cop->results.snaps.end();
9190 ++q)
9191 *(q - 1) = *q;
9192 cop->results.snaps.resize(cop->results.snaps.size() - 1);
9193 } else {
9194 ++p;
9195 }
9196 }
9197 if (cop->results.snaps.empty()) {
9198 dout(10) << __func__ << " no more snaps for " << oid << dendl;
9199 r = -ENOENT;
9200 goto out;
9201 }
9202 }
9203
9204 ceph_assert(cop->rval >= 0);
9205
9206 if (!cop->temp_cursor.data_complete) {
9207 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
9208 }
9209 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
9210 if (cop->omap_header.length()) {
9211 cop->results.omap_digest =
9212 cop->omap_header.crc32c(cop->results.omap_digest);
9213 }
9214 if (cop->omap_data.length()) {
9215 bufferlist keys;
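// omap_data is an encoded map<string,bufferlist>; skip the leading
// 4-byte key count so the digest covers only the keys and values.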
9216 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
9217 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
9218 }
9219 }
9220
9221 if (!cop->temp_cursor.attr_complete) {
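// copy-get returns user xattrs with the ObjectStore's '_' prefix already
// stripped; re-add it here so the attrs can later be applied verbatim.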
9222 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
9223 p != cop->attrs.end();
9224 ++p) {
9225 cop->results.attrs[string("_") + p->first] = p->second;
9226 }
9227 cop->attrs.clear();
9228 }
9229
9230 if (!cop->cursor.is_complete()) {
9231 // write out what we have so far
9232 if (cop->temp_cursor.is_initial()) {
9233 ceph_assert(!cop->results.started_temp_obj);
9234 cop->results.started_temp_obj = true;
9235 cop->results.temp_oid = generate_temp_object(oid);
9236 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
9237 }
9238 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9239 OpContextUPtr ctx = simple_opc_create(tempobc);
9240 if (cop->temp_cursor.is_initial()) {
9241 ctx->new_temp_oid = cop->results.temp_oid;
9242 }
9243 _write_copy_chunk(cop, ctx->op_t.get());
9244 simple_opc_submit(std::move(ctx));
9245 dout(10) << __func__ << " fetching more" << dendl;
9246 _copy_some(cobc, cop);
9247 return;
9248 }
9249
9250 // verify digests?
9251 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
9252 dout(20) << __func__ << std::hex
9253 << " got digest: rx data 0x" << cop->results.data_digest
9254 << " omap 0x" << cop->results.omap_digest
9255 << ", source: data 0x" << cop->results.source_data_digest
9256 << " omap 0x" << cop->results.source_omap_digest
9257 << std::dec
9258 << " flags " << cop->results.flags
9259 << dendl;
9260 }
9261 if (cop->results.is_data_digest() &&
9262 cop->results.data_digest != cop->results.source_data_digest) {
9263 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
9264 << " != source 0x" << cop->results.source_data_digest << std::dec
9265 << dendl;
9266 osd->clog->error() << info.pgid << " copy from " << cop->src
9267 << " to " << cop->obc->obs.oi.soid << std::hex
9268 << " data digest 0x" << cop->results.data_digest
9269 << " != source 0x" << cop->results.source_data_digest
9270 << std::dec;
9271 r = -EIO;
9272 goto out;
9273 }
9274 if (cop->results.is_omap_digest() &&
9275 cop->results.omap_digest != cop->results.source_omap_digest) {
9276 derr << __func__ << std::hex
9277 << " omap digest 0x" << cop->results.omap_digest
9278 << " != source 0x" << cop->results.source_omap_digest
9279 << std::dec << dendl;
9280 osd->clog->error() << info.pgid << " copy from " << cop->src
9281 << " to " << cop->obc->obs.oi.soid << std::hex
9282 << " omap digest 0x" << cop->results.omap_digest
9283 << " != source 0x" << cop->results.source_omap_digest
9284 << std::dec;
9285 r = -EIO;
9286 goto out;
9287 }
9288 if (cct->_conf->osd_debug_inject_copyfrom_error) {
9289 derr << __func__ << " injecting copyfrom failure" << dendl;
9290 r = -EIO;
9291 goto out;
9292 }
9293
9294 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
9295 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
9296 ObjectState& obs = cop->obc->obs;
9297 if (cop->temp_cursor.is_initial()) {
9298 dout(20) << "fill_in_final_tx: writing "
9299 << "directly to final object" << dendl;
9300 // write directly to final object
9301 cop->results.temp_oid = obs.oi.soid;
9302 _write_copy_chunk(cop, t);
9303 } else {
9304 // finish writing to temp object, then move into place
9305 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
9306 _write_copy_chunk(cop, t);
9307 t->rename(obs.oi.soid, cop->results.temp_oid);
9308 }
9309 t->setattrs(obs.oi.soid, cop->results.attrs);
9310 });
9311
9312 dout(20) << __func__ << " success; committing" << dendl;
9313
9314 out:
9315 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9316 CopyCallbackResults results(r, &cop->results);
9317 cop->cb->complete(results);
9318
9319 copy_ops.erase(cobc->obs.oi.soid);
9320 cobc->stop_block();
9321
9322 if (r < 0 && cop->results.started_temp_obj) {
9323 dout(10) << __func__ << " deleting partial temp object "
9324 << cop->results.temp_oid << dendl;
9325 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
9326 OpContextUPtr ctx = simple_opc_create(tempobc);
9327 ctx->op_t->remove(cop->results.temp_oid);
9328 ctx->discard_temp_oid = cop->results.temp_oid;
9329 simple_opc_submit(std::move(ctx));
9330 }
9331
9332 // cancel and requeue proxy ops on this object
9333 if (!r) {
9334 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9335 }
9336
9337 kick_object_context_blocked(cobc);
9338 }
9339
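/*
 * Completion for one chunk read of a chunked-manifest copy. Any error
 * fails the whole object copy; once the last outstanding chunk arrives we
 * take the write lock, commit every fetched chunk in one transaction, and
 * either start the next chunk batch or fall through to finish the op.
 */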
9340 void PrimaryLogPG::process_copy_chunk_manifest(hobject_t oid, ceph_tid_t tid, int r, uint64_t offset)
9341 {
9342 dout(10) << __func__ << " " << oid << " tid " << tid
9343 << " " << cpp_strerror(r) << dendl;
9344 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
9345 if (p == copy_ops.end()) {
9346 dout(10) << __func__ << " no copy_op found" << dendl;
9347 return;
9348 }
9349 CopyOpRef obj_cop = p->second;
9350 CopyOpRef chunk_cop = obj_cop->chunk_cops[offset];
9351
9352 if (tid != chunk_cop->objecter_tid) {
9353 dout(10) << __func__ << " tid " << tid << " != cop " << chunk_cop
9354 << " tid " << chunk_cop->objecter_tid << dendl;
9355 return;
9356 }
9357
9358 if (chunk_cop->omap_data.length() || chunk_cop->omap_header.length()) {
9359 r = -EOPNOTSUPP;
9360 }
9361
9362 chunk_cop->objecter_tid = 0;
9363 chunk_cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
9364 ObjectContextRef& cobc = obj_cop->obc;
9365 OSDOp &chunk_data = chunk_cop->chunk_ops[0];
9366
9367 if (r < 0) {
9368 obj_cop->failed = true;
9369 goto out;
9370 }
9371
9372 if (obj_cop->failed) {
9373 return;
9374 }
9375 if (!chunk_data.outdata.length()) {
9376 r = -EIO;
9377 obj_cop->failed = true;
9378 goto out;
9379 }
9380
9381 obj_cop->num_chunk--;
9382
9383 /* check whether all of the copy ops have completed */
9384 if (obj_cop->num_chunk) {
9385 dout(20) << __func__ << " num_chunk: " << obj_cop->num_chunk << dendl;
9386 return;
9387 }
9388
9389 {
9390 OpContextUPtr ctx = simple_opc_create(obj_cop->obc);
9391 if (!ctx->lock_manager.take_write_lock(
9392 obj_cop->obc->obs.oi.soid,
9393 obj_cop->obc)) {
9394 // a recovery op may hold the read lock,
9395 // so we need to wait for recovery to complete
9396 r = -EAGAIN;
9397 obj_cop->failed = true;
9398 close_op_ctx(ctx.release());
9399 goto out;
9400 }
9401 dout(20) << __func__ << " took lock on obc, " << obj_cop->obc->rwstate << dendl;
9402
9403 PGTransaction *t = ctx->op_t.get();
9404 ObjectState& obs = ctx->new_obs;
9405 for (auto p : obj_cop->chunk_cops) {
9406 OSDOp &sub_chunk = p.second->chunk_ops[0];
9407 t->write(cobc->obs.oi.soid,
9408 p.second->cursor.data_offset,
9409 sub_chunk.outdata.length(),
9410 sub_chunk.outdata,
9411 p.second->dest_obj_fadvise_flags);
9412 dout(20) << __func__ << " offset: " << p.second->cursor.data_offset
9413 << " length: " << sub_chunk.outdata.length() << dendl;
9414 write_update_size_and_usage(ctx->delta_stats, obs.oi, ctx->modified_ranges,
9415 p.second->cursor.data_offset, sub_chunk.outdata.length());
9416 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_DIRTY);
9417 obs.oi.manifest.chunk_map[p.second->cursor.data_offset].clear_flag(chunk_info_t::FLAG_MISSING);
9418 sub_chunk.outdata.clear();
9419 }
9420 obs.oi.clear_data_digest();
9421 ctx->at_version = get_next_version();
9422 finish_ctx(ctx.get(), pg_log_entry_t::PROMOTE);
9423 simple_opc_submit(std::move(ctx));
9424
9425 auto p = cobc->obs.oi.manifest.chunk_map.rbegin();
9426 /* check remaining work */
9427 if (p != cobc->obs.oi.manifest.chunk_map.rend()) {
9428 if (obj_cop->last_offset >= p->first + p->second.length) {
9429 for (auto &en : cobc->obs.oi.manifest.chunk_map) {
9430 if (obj_cop->last_offset < en.first) {
9431 _copy_some_manifest(cobc, obj_cop, en.first);
9432 return;
9433 }
9434 }
9435 }
9436 }
9437 }
9438
9439 out:
9440 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
9441 CopyCallbackResults results(r, &obj_cop->results);
9442 obj_cop->cb->complete(results);
9443
9444 copy_ops.erase(cobc->obs.oi.soid);
9445 cobc->stop_block();
9446
9447 // cancel and requeue proxy ops on this object
9448 if (!r) {
9449 cancel_and_requeue_proxy_ops(cobc->obs.oi.soid);
9450 }
9451
9452 kick_object_context_blocked(cobc);
9453 }
9454
9455 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
9456 vector<ceph_tid_t> tids;
9457 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
9458 it != proxyread_ops.end();) {
9459 if (it->second->soid == oid) {
9460 cancel_proxy_read((it++)->second, &tids);
9461 } else {
9462 ++it;
9463 }
9464 }
9465 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
9466 it != proxywrite_ops.end();) {
9467 if (it->second->soid == oid) {
9468 cancel_proxy_write((it++)->second, &tids);
9469 } else {
9470 ++it;
9471 }
9472 }
9473 osd->objecter->op_cancel(tids, -ECANCELED);
9474 kick_proxy_ops_blocked(oid);
9475 }
9476
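/*
 * Stage everything fetched so far into the given transaction against
 * results.temp_oid: create on the first pass, append data (trimming any
 * unaligned tail for pools requiring alignment), and set the omap header
 * and keys; finally advance temp_cursor to match cursor.
 */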
9477 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
9478 {
9479 dout(20) << __func__ << " " << cop
9480 << " " << cop->attrs.size() << " attrs"
9481 << " " << cop->data.length() << " bytes"
9482 << " " << cop->omap_header.length() << " omap header bytes"
9483 << " " << cop->omap_data.length() << " omap data bytes"
9484 << dendl;
9485 if (!cop->temp_cursor.attr_complete) {
9486 t->create(cop->results.temp_oid);
9487 }
9488 if (!cop->temp_cursor.data_complete) {
9489 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9490 cop->cursor.data_offset);
9491 if (pool.info.required_alignment() &&
9492 !cop->cursor.data_complete) {
9493 /**
9494 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
9495 * to pick it up on the next pass.
9496 */
9497 ceph_assert(cop->temp_cursor.data_offset %
9498 pool.info.required_alignment() == 0);
9499 if (cop->data.length() % pool.info.required_alignment() != 0) {
9500 uint64_t to_trim =
9501 cop->data.length() % pool.info.required_alignment();
9502 bufferlist bl;
9503 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
9504 cop->data.swap(bl);
9505 cop->cursor.data_offset -= to_trim;
9506 ceph_assert(cop->data.length() + cop->temp_cursor.data_offset ==
9507 cop->cursor.data_offset);
9508 }
9509 }
9510 if (cop->data.length()) {
9511 t->write(
9512 cop->results.temp_oid,
9513 cop->temp_cursor.data_offset,
9514 cop->data.length(),
9515 cop->data,
9516 cop->dest_obj_fadvise_flags);
9517 }
9518 cop->data.clear();
9519 }
9520 if (pool.info.supports_omap()) {
9521 if (!cop->temp_cursor.omap_complete) {
9522 if (cop->omap_header.length()) {
9523 t->omap_setheader(
9524 cop->results.temp_oid,
9525 cop->omap_header);
9526 cop->omap_header.clear();
9527 }
9528 if (cop->omap_data.length()) {
9529 map<string,bufferlist> omap;
9530 bufferlist::const_iterator p = cop->omap_data.begin();
9531 decode(omap, p);
9532 t->omap_setkeys(cop->results.temp_oid, omap);
9533 cop->omap_data.clear();
9534 }
9535 }
9536 } else {
9537 ceph_assert(cop->omap_header.length() == 0);
9538 ceph_assert(cop->omap_data.length() == 0);
9539 }
9540 cop->temp_cursor = cop->cursor;
9541 }
9542
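/*
 * Apply a completed copy-from to the destination object state: remove any
 * existing object, adopt the source's digests, truncate_seq/size, reqids
 * and omap flag, clear a cache whiteout if present, and update usage stats.
 */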
9543 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
9544 {
9545 OpContext *ctx = cb->ctx;
9546 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
9547
9548 ObjectState& obs = ctx->new_obs;
9549 if (obs.exists) {
9550 dout(20) << __func__ << ": exists, removing" << dendl;
9551 ctx->op_t->remove(obs.oi.soid);
9552 } else {
9553 ctx->delta_stats.num_objects++;
9554 obs.exists = true;
9555 }
9556 if (cb->is_temp_obj_used()) {
9557 ctx->discard_temp_oid = cb->results->temp_oid;
9558 }
9559 cb->results->fill_in_final_tx(ctx->op_t.get());
9560
9561 // CopyFromCallback fills this in for us
9562 obs.oi.user_version = ctx->user_at_version;
9563
9564 if (cb->results->is_data_digest()) {
9565 obs.oi.set_data_digest(cb->results->data_digest);
9566 } else {
9567 obs.oi.clear_data_digest();
9568 }
9569 if (cb->results->is_omap_digest()) {
9570 obs.oi.set_omap_digest(cb->results->omap_digest);
9571 } else {
9572 obs.oi.clear_omap_digest();
9573 }
9574
9575 obs.oi.truncate_seq = cb->results->truncate_seq;
9576 obs.oi.truncate_size = cb->results->truncate_size;
9577
9578 ctx->extra_reqids = cb->results->reqids;
9579 ctx->extra_reqid_return_codes = cb->results->reqid_return_codes;
9580
9581 // cache: clear whiteout?
9582 if (obs.oi.is_whiteout()) {
9583 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
9584 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
9585 --ctx->delta_stats.num_whiteouts;
9586 }
9587
9588 if (cb->results->has_omap) {
9589 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
9590 obs.oi.set_flag(object_info_t::FLAG_OMAP);
9591 } else {
9592 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
9593 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
9594 }
9595
9596 interval_set<uint64_t> ch;
9597 if (obs.oi.size > 0)
9598 ch.insert(0, obs.oi.size);
9599 ctx->modified_ranges.union_of(ch);
9600
9601 if (cb->get_data_size() != obs.oi.size) {
9602 ctx->delta_stats.num_bytes -= obs.oi.size;
9603 obs.oi.size = cb->get_data_size();
9604 ctx->delta_stats.num_bytes += obs.oi.size;
9605 }
9606 ctx->delta_stats.num_wr++;
9607 ctx->delta_stats.num_wr_kb += shift_round_up(obs.oi.size, 10);
9608
9609 osd->logger->inc(l_osd_copyfrom);
9610 }
9611
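/*
 * Finish promoting an object into this (cache) tier. Snap handling: the
 * clone's snaps may need to be reconstructed from snap_seq and filtered;
 * -ENOENT on a clone means it was trimmed and is removed from the snapset;
 * -ENOENT on a head becomes a whiteout. Otherwise commit the copied data
 * via fill_in_final_tx and update the tiering agent.
 */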
9612 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
9613 ObjectContextRef obc)
9614 {
9615 const hobject_t& soid = obc->obs.oi.soid;
9616 dout(10) << __func__ << " " << soid << " r=" << r
9617 << " uv" << results->user_version << dendl;
9618
9619 if (r == -ECANCELED) {
9620 return;
9621 }
9622
9623 if (r != -ENOENT && soid.is_snap()) {
9624 if (results->snaps.empty()) {
9625 // we must have read "snap" content from the head object in
9626 // the base pool. use snap_seq to construct what snaps should
9627 // be for this clone (what it was before we evicted the clean
9628 // clone from this pool, and what it will be when we flush and
9629 // the clone eventually happens in the base pool).
9630 SnapSet& snapset = obc->ssc->snapset;
9631 vector<snapid_t>::iterator p = snapset.snaps.begin();
9632 while (p != snapset.snaps.end() && *p > soid.snap)
9633 ++p;
9634 while (p != snapset.snaps.end() && *p > results->snap_seq) {
9635 results->snaps.push_back(*p);
9636 ++p;
9637 }
9638 }
9639
9640 dout(20) << __func__ << " snaps " << results->snaps << dendl;
9641 filter_snapc(results->snaps);
9642
9643 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
9644 if (results->snaps.empty()) {
9645 dout(20) << __func__
9646 << " snaps are empty, clone is invalid,"
9647 << " setting r to ENOENT" << dendl;
9648 r = -ENOENT;
9649 }
9650 }
9651
9652 if (r < 0 && results->started_temp_obj) {
9653 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
9654 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
9655 ceph_assert(tempobc);
9656 OpContextUPtr ctx = simple_opc_create(tempobc);
9657 ctx->op_t->remove(results->temp_oid);
9658 simple_opc_submit(std::move(ctx));
9659 results->started_temp_obj = false;
9660 }
9661
9662 if (r == -ENOENT && soid.is_snap()) {
9663 dout(10) << __func__
9664 << ": enoent while trying to promote clone, " << soid
9665 << " must have been trimmed, removing from snapset"
9666 << dendl;
9667 hobject_t head(soid.get_head());
9668 ObjectContextRef obc = get_object_context(head, false);
9669 ceph_assert(obc);
9670
9671 OpContextUPtr tctx = simple_opc_create(obc);
9672 tctx->at_version = get_next_version();
9673 filter_snapc(tctx->new_snapset.snaps);
9674 vector<snapid_t> new_clones;
9675 map<snapid_t, vector<snapid_t>> new_clone_snaps;
9676 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
9677 i != tctx->new_snapset.clones.end();
9678 ++i) {
9679 if (*i != soid.snap) {
9680 new_clones.push_back(*i);
9681 auto p = tctx->new_snapset.clone_snaps.find(*i);
9682 if (p != tctx->new_snapset.clone_snaps.end()) {
9683 new_clone_snaps[*i] = p->second;
9684 }
9685 }
9686 }
9687 tctx->new_snapset.clones.swap(new_clones);
9688 tctx->new_snapset.clone_overlap.erase(soid.snap);
9689 tctx->new_snapset.clone_size.erase(soid.snap);
9690 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
9691
9692 // take RWWRITE lock for duration of our local write. ignore starvation.
9693 if (!tctx->lock_manager.take_write_lock(
9694 head,
9695 obc)) {
9696 ceph_abort_msg("problem!");
9697 }
9698 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9699
9700 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9701
9702 simple_opc_submit(std::move(tctx));
9703 return;
9704 }
9705
9706 bool whiteout = false;
9707 if (r == -ENOENT) {
9708 ceph_assert(soid.snap == CEPH_NOSNAP); // snap case is above
9709 dout(10) << __func__ << " whiteout " << soid << dendl;
9710 whiteout = true;
9711 }
9712
9713 if (r < 0 && !whiteout) {
9714 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9715 // pass error to everyone blocked on this object
9716 // FIXME: this is pretty sloppy, but at this point we got
9717 // something unexpected and don't have many other options.
9718 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9719 waiting_for_blocked_object.find(soid);
9720 if (blocked_iter != waiting_for_blocked_object.end()) {
9721 while (!blocked_iter->second.empty()) {
9722 osd->reply_op_error(blocked_iter->second.front(), r);
9723 blocked_iter->second.pop_front();
9724 }
9725 waiting_for_blocked_object.erase(blocked_iter);
9726 }
9727 return;
9728 }
9729
9730 osd->promote_finish(results->object_size);
9731
9732 OpContextUPtr tctx = simple_opc_create(obc);
9733 tctx->at_version = get_next_version();
9734
9735 if (!obc->obs.oi.has_manifest()) {
9736 ++tctx->delta_stats.num_objects;
9737 }
9738 if (soid.snap < CEPH_NOSNAP)
9739 ++tctx->delta_stats.num_object_clones;
9740 tctx->new_obs.exists = true;
9741
9742 tctx->extra_reqids = results->reqids;
9743 tctx->extra_reqid_return_codes = results->reqid_return_codes;
9744
9745 if (whiteout) {
9746 // create a whiteout
9747 tctx->op_t->create(soid);
9748 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
9749 ++tctx->delta_stats.num_whiteouts;
9750 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
9751 osd->logger->inc(l_osd_tier_whiteout);
9752 } else {
9753 if (results->has_omap) {
9754 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
9755 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
9756 ++tctx->delta_stats.num_objects_omap;
9757 }
9758
9759 results->fill_in_final_tx(tctx->op_t.get());
9760 if (results->started_temp_obj) {
9761 tctx->discard_temp_oid = results->temp_oid;
9762 }
9763 tctx->new_obs.oi.size = results->object_size;
9764 tctx->new_obs.oi.user_version = results->user_version;
9765 if (results->is_data_digest()) {
9766 tctx->new_obs.oi.set_data_digest(results->data_digest);
9767 } else {
9768 tctx->new_obs.oi.clear_data_digest();
9769 }
9770 if (results->is_omap_digest()) {
9771 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
9772 } else {
9773 tctx->new_obs.oi.clear_omap_digest();
9774 }
9775 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
9776 tctx->new_obs.oi.truncate_size = results->truncate_size;
9777
9778 if (soid.snap != CEPH_NOSNAP) {
9779 ceph_assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
9780 ceph_assert(obc->ssc->snapset.clone_size.count(soid.snap));
9781 ceph_assert(obc->ssc->snapset.clone_size[soid.snap] ==
9782 results->object_size);
9783 ceph_assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
9784
9785 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
9786 } else {
9787 tctx->delta_stats.num_bytes += results->object_size;
9788 }
9789 }
9790
9791 if (results->mirror_snapset) {
9792 ceph_assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
9793 tctx->new_snapset.from_snap_set(
9794 results->snapset,
9795 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
9796 }
9797 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
9798
9799 // take RWWRITE lock for duration of our local write. ignore starvation.
9800 if (!tctx->lock_manager.take_write_lock(
9801 obc->obs.oi.soid,
9802 obc)) {
9803 ceph_abort_msg("problem!");
9804 }
9805 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
9806
9807 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
9808
9809 simple_opc_submit(std::move(tctx));
9810
9811 osd->logger->inc(l_osd_tier_promote);
9812
9813 if (agent_state &&
9814 agent_state->is_idle())
9815 agent_choose_mode();
9816 }
9817
9818 void PrimaryLogPG::finish_promote_manifest(int r, CopyResults *results,
9819 ObjectContextRef obc)
9820 {
9821 const hobject_t& soid = obc->obs.oi.soid;
9822 dout(10) << __func__ << " " << soid << " r=" << r
9823 << " uv" << results->user_version << dendl;
9824
9825 if (r == -ECANCELED || r == -EAGAIN) {
9826 return;
9827 }
9828
9829 if (r < 0) {
9830 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
9831 // pass error to everyone blocked on this object
9832 // FIXME: this is pretty sloppy, but at this point we got
9833 // something unexpected and don't have many other options.
9834 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
9835 waiting_for_blocked_object.find(soid);
9836 if (blocked_iter != waiting_for_blocked_object.end()) {
9837 while (!blocked_iter->second.empty()) {
9838 osd->reply_op_error(blocked_iter->second.front(), r);
9839 blocked_iter->second.pop_front();
9840 }
9841 waiting_for_blocked_object.erase(blocked_iter);
9842 }
9843 return;
9844 }
9845
9846 osd->promote_finish(results->object_size);
9847 osd->logger->inc(l_osd_tier_promote);
9848
9849 if (agent_state &&
9850 agent_state->is_idle())
9851 agent_choose_mode();
9852 }
9853
9854 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
9855 vector<ceph_tid_t> *tids)
9856 {
9857 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
9858 << " from " << cop->src << " " << cop->oloc
9859 << " v" << cop->results.user_version << dendl;
9860
9861 // cancel objecter op, if we can
9862 if (cop->objecter_tid) {
9863 tids->push_back(cop->objecter_tid);
9864 cop->objecter_tid = 0;
9865 if (cop->objecter_tid2) {
9866 tids->push_back(cop->objecter_tid2);
9867 cop->objecter_tid2 = 0;
9868 }
9869 }
9870
9871 copy_ops.erase(cop->obc->obs.oi.soid);
9872 cop->obc->stop_block();
9873
9874 kick_object_context_blocked(cop->obc);
9875 cop->results.should_requeue = requeue;
9876 CopyCallbackResults result(-ECANCELED, &cop->results);
9877 cop->cb->complete(result);
9878
9879 // There may still be an objecter callback referencing this copy op.
9880 // That callback will not need the obc since it's been canceled, and
9881 // we need the obc reference to go away prior to flush.
9882 cop->obc = ObjectContextRef();
9883 }
9884
9885 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
9886 {
9887 dout(10) << __func__ << dendl;
9888 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
9889 while (p != copy_ops.end()) {
9890 // requeue this op? can I queue up all of them?
9891 cancel_copy((p++)->second, requeue, tids);
9892 }
9893 }
9894
9895
9896 // ========================================================================
9897 // flush
9898 //
9899 // Flush a dirty object in the cache tier by writing it back to the
9900 // base tier. The sequence looks like:
9901 //
9902 // * send a copy-from operation to the base tier to copy the current
9903 // version of the object
9904 // * base tier will pull the object via (perhaps multiple) copy-get(s)
9905 // * on completion, we check if the object has been modified. if so,
9906 // just reply with -EAGAIN.
9907 // * try to take a write lock so we can clear the dirty flag. if this
9908 // fails, wait and retry
9909 // * start a repop that clears the bit.
9910 //
9911 // If we have to wait, we will retry by coming back through the
9912 // start_flush method. We check if a flush is already in progress
9913 // and, if so, try to finish it by rechecking the version and trying
9914 // to clear the dirty bit.
9915 //
9916 // In order for the cache-flush (a write op) to not block the copy-get
9917 // from reading the object, the client *must* set the SKIPRWLOCKS
9918 // flag.
9919 //
9920 // NOTE: normally writes are strictly ordered for the client, but
9921 // flushes are special in that they can be reordered with respect to
9922 // other writes. In particular, we can't have a flush request block
9923 // an update to the cache pool object!
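//
// Purely as a hedged illustration (client-side librados names, not used
// in this file), a try-flush honoring the SKIPRWLOCKS requirement above
// could look like:
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   ioctx.aio_operate(oid, completion, &op,
//                     librados::OPERATION_IGNORE_CACHE |
//                     librados::OPERATION_SKIPRWLOCKS, nullptr);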
9924
9925 struct C_Flush : public Context {
9926 PrimaryLogPGRef pg;
9927 hobject_t oid;
9928 epoch_t last_peering_reset;
9929 ceph_tid_t tid;
9930 utime_t start;
9931 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
9932 : pg(p), oid(o), last_peering_reset(lpr),
9933 tid(0), start(ceph_clock_now())
9934 {}
9935 void finish(int r) override {
9936 if (r == -ECANCELED)
9937 return;
9938 pg->lock();
9939 if (last_peering_reset == pg->get_last_peering_reset()) {
9940 pg->finish_flush(oid, tid, r);
9941 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
9942 }
9943 pg->unlock();
9944 }
9945 };
9946
9947 int PrimaryLogPG::start_flush(
9948 OpRequestRef op, ObjectContextRef obc,
9949 bool blocking, hobject_t *pmissing,
9950 boost::optional<std::function<void()>> &&on_flush)
9951 {
9952 const object_info_t& oi = obc->obs.oi;
9953 const hobject_t& soid = oi.soid;
9954 dout(10) << __func__ << " " << soid
9955 << " v" << oi.version
9956 << " uv" << oi.user_version
9957 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
9958 << dendl;
9959
9960 // get a filtered snapset: strip any snaps that have since been removed
9961 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
9962
9963 // check that there are no older dirty clones
9964 {
9965 dout(20) << " snapset " << snapset << dendl;
9966 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
9967 while (p != snapset.clones.rend() && *p >= soid.snap)
9968 ++p;
9969 if (p != snapset.clones.rend()) {
9970 hobject_t next = soid;
9971 next.snap = *p;
9972 ceph_assert(next.snap < soid.snap);
9973 if (pg_log.get_missing().is_missing(next)) {
9974 dout(10) << __func__ << " missing clone is " << next << dendl;
9975 if (pmissing)
9976 *pmissing = next;
9977 return -ENOENT;
9978 }
9979 ObjectContextRef older_obc = get_object_context(next, false);
9980 if (older_obc) {
9981 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
9982 << dendl;
9983 if (older_obc->obs.oi.is_dirty()) {
9984 dout(10) << __func__ << " next oldest clone is dirty: "
9985 << older_obc->obs.oi << dendl;
9986 return -EBUSY;
9987 }
9988 } else {
9989 dout(20) << __func__ << " next oldest clone " << next
9990 << " is not present; implicitly clean" << dendl;
9991 }
9992 } else {
9993 dout(20) << __func__ << " no older clones" << dendl;
9994 }
9995 }
9996
9997 if (blocking)
9998 obc->start_block();
9999
10000 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
10001 if (p != flush_ops.end()) {
10002 FlushOpRef fop = p->second;
10003 if (fop->op == op) {
10004 // we couldn't take the write lock on a cache-try-flush before;
10005 // now we are trying again for the lock.
10006 return try_flush_mark_clean(fop);
10007 }
10008 if (fop->flushed_version == obc->obs.oi.user_version &&
10009 (fop->blocking || !blocking)) {
10010 // nonblocking can join anything
10011 // blocking can only join a blocking flush
10012 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
10013 if (op)
10014 fop->dup_ops.push_back(op);
10015 return -EAGAIN; // clean up this ctx; op will retry later
10016 }
10017
10018 // cancel current flush since it will fail anyway, or because we
10019 // are blocking and the existing flush is nonblocking.
10020 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
10021 if (fop->op)
10022 osd->reply_op_error(fop->op, -EBUSY);
10023 while (!fop->dup_ops.empty()) {
10024 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
10025 fop->dup_ops.pop_front();
10026 }
10027 vector<ceph_tid_t> tids;
10028 cancel_flush(fop, false, &tids);
10029 osd->objecter->op_cancel(tids, -ECANCELED);
10030 }
10031
10032 if (obc->obs.oi.has_manifest() && obc->obs.oi.manifest.is_chunked()) {
10033 int r = start_manifest_flush(op, obc, blocking, std::move(on_flush));
10034 if (r != -EINPROGRESS) {
10035 if (blocking)
10036 obc->stop_block();
10037 }
10038 return r;
10039 }
10040
10041 /**
10042 * In general, we need to send a delete and a copyfrom.
10043 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
10044 * where 4 is marked as clean. To flush 10, we have to:
10045 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
10046 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
10047 *
10048 * There is a complicating case. Suppose there had been a clone 7 for
10049 * snaps [7, 6] which has since been trimmed because they no longer exist.
10050 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
10051 * the delete, the snap will be promoted to 5, and the head will become
10052 * a whiteout. When the copy-from goes through, we'll end up with
10053 * 8:[8,4,3,2]:[4(4,3,2)]+head.
10054 *
10055 * Another complication is the case where there is an interval change
10056 * after doing the delete and the flush but before marking the object
10057 * clean. We'll happily delete head and then recreate it at the same
10058 * sequence number, which works out ok.
10059 */
10060
10061 SnapContext snapc, dsnapc;
10062 if (snapset.seq != 0) {
10063 if (soid.snap == CEPH_NOSNAP) {
10064 snapc.seq = snapset.seq;
10065 snapc.snaps = snapset.snaps;
10066 } else {
10067 snapid_t min_included_snap;
10068 auto p = snapset.clone_snaps.find(soid.snap);
10069 ceph_assert(p != snapset.clone_snaps.end());
10070 min_included_snap = p->second.back();
10071 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
10072 }
10073
10074 snapid_t prev_snapc = 0;
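// find the newest clone strictly older than the one being flushed; the
// preceding delete is issued in that clone's snap context (dsnapc) so it
// logically lands between the two clones.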
10075 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
10076 citer != snapset.clones.rend();
10077 ++citer) {
10078 if (*citer < soid.snap) {
10079 prev_snapc = *citer;
10080 break;
10081 }
10082 }
10083
10084 dsnapc = snapset.get_ssc_as_of(prev_snapc);
10085 }
10086
10087 object_locator_t base_oloc(soid);
10088 base_oloc.pool = pool.info.tier_of;
10089
10090 if (dsnapc.seq < snapc.seq) {
10091 ObjectOperation o;
10092 o.remove();
10093 osd->objecter->mutate(
10094 soid.oid,
10095 base_oloc,
10096 o,
10097 dsnapc,
10098 ceph::real_clock::from_ceph_timespec(oi.mtime),
10099 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
10100 CEPH_OSD_FLAG_ENFORCE_SNAPC),
10101 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
10102 }
10103
10104 FlushOpRef fop(std::make_shared<FlushOp>());
10105 fop->obc = obc;
10106 fop->flushed_version = oi.user_version;
10107 fop->blocking = blocking;
10108 fop->on_flush = std::move(on_flush);
10109 fop->op = op;
10110
10111 ObjectOperation o;
10112 if (oi.is_whiteout()) {
10113 fop->removal = true;
10114 o.remove();
10115 } else {
10116 object_locator_t oloc(soid);
10117 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
10118 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
10119 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
10120 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
10121 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
10122 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
10123
10124 // hint that the base tier need not cache the data after this flush
10125 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
10126 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
10127 }
10128 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
10129
10130 unsigned n = info.pgid.hash_to_shard(osd->m_objecter_finishers);
10131 ceph_tid_t tid = osd->objecter->mutate(
10132 soid.oid, base_oloc, o, snapc,
10133 ceph::real_clock::from_ceph_timespec(oi.mtime),
10134 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
10135 new C_OnFinisher(fin,
10136 osd->objecter_finishers[n]));
10137 /* we're under the pg lock, and fin->finish() grabs that same lock, so it can't run before fin->tid is set */
10138 fin->tid = tid;
10139 fop->objecter_tid = tid;
10140
10141 flush_ops[soid] = fop;
10142 info.stats.stats.sum.num_flush++;
10143 info.stats.stats.sum.num_flush_kb += shift_round_up(oi.size, 10);
10144 return -EINPROGRESS;
10145 }
10146
10147 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
10148 {
10149 dout(10) << __func__ << " " << oid << " tid " << tid
10150 << " " << cpp_strerror(r) << dendl;
10151 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
10152 if (p == flush_ops.end()) {
10153 dout(10) << __func__ << " no flush_op found" << dendl;
10154 return;
10155 }
10156 FlushOpRef fop = p->second;
10157 if (tid != fop->objecter_tid && !fop->obc->obs.oi.has_manifest()) {
10158 dout(10) << __func__ << " tid " << tid << " != fop " << fop
10159 << " tid " << fop->objecter_tid << dendl;
10160 return;
10161 }
10162 ObjectContextRef obc = fop->obc;
10163 fop->objecter_tid = 0;
10164
10165 if (r < 0 && !(r == -ENOENT && fop->removal)) {
10166 if (fop->op)
10167 osd->reply_op_error(fop->op, -EBUSY);
10168 if (fop->blocking) {
10169 obc->stop_block();
10170 kick_object_context_blocked(obc);
10171 }
10172
10173 if (!fop->dup_ops.empty()) {
10174 dout(20) << __func__ << " requeueing dups" << dendl;
10175 requeue_ops(fop->dup_ops);
10176 }
10177 if (fop->on_flush) {
10178 (*(fop->on_flush))();
10179 fop->on_flush = boost::none;
10180 }
10181 flush_ops.erase(oid);
10182 return;
10183 }
10184
10185 r = try_flush_mark_clean(fop);
10186 if (r == -EBUSY && fop->op) {
10187 osd->reply_op_error(fop->op, r);
10188 }
10189 }
10190
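/*
 * Try to mark the object clean now that the flush has completed: bail with
 * -EBUSY if the object changed (or vanished) mid-flush, retry later
 * (-EAGAIN) when scrub or the write lock blocks an op we can requeue, and
 * otherwise submit a repop clearing FLAG_DIRTY -- possibly evicting the
 * now-clean object (returning 0) or punching out flushed manifest chunks.
 */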
10191 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
10192 {
10193 ObjectContextRef obc = fop->obc;
10194 const hobject_t& oid = obc->obs.oi.soid;
10195
10196 if (fop->blocking) {
10197 obc->stop_block();
10198 kick_object_context_blocked(obc);
10199 }
10200
10201 if (fop->flushed_version != obc->obs.oi.user_version ||
10202 !obc->obs.exists) {
10203 if (obc->obs.exists)
10204 dout(10) << __func__ << " flushed_version " << fop->flushed_version
10205 << " != current " << obc->obs.oi.user_version
10206 << dendl;
10207 else
10208 dout(10) << __func__ << " object no longer exists" << dendl;
10209
10210 if (!fop->dup_ops.empty()) {
10211 dout(20) << __func__ << " requeueing dups" << dendl;
10212 requeue_ops(fop->dup_ops);
10213 }
10214 if (fop->on_flush) {
10215 (*(fop->on_flush))();
10216 fop->on_flush = boost::none;
10217 }
10218 flush_ops.erase(oid);
10219 if (fop->blocking)
10220 osd->logger->inc(l_osd_tier_flush_fail);
10221 else
10222 osd->logger->inc(l_osd_tier_try_flush_fail);
10223 return -EBUSY;
10224 }
10225
10226 if (!fop->blocking &&
10227 write_blocked_by_scrub(oid)) {
10228 if (fop->op) {
10229 dout(10) << __func__ << " blocked by scrub" << dendl;
10230 requeue_op(fop->op);
10231 requeue_ops(fop->dup_ops);
10232 return -EAGAIN; // will retry
10233 } else {
10234 osd->logger->inc(l_osd_tier_try_flush_fail);
10235 vector<ceph_tid_t> tids;
10236 cancel_flush(fop, false, &tids);
10237 osd->objecter->op_cancel(tids, -ECANCELED);
10238 return -ECANCELED;
10239 }
10240 }
10241
10242 // successfully flushed, can we evict this object?
10243 if (!obc->obs.oi.has_manifest() && !fop->op &&
10244 agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
10245 agent_maybe_evict(obc, true)) {
10246 osd->logger->inc(l_osd_tier_clean);
10247 if (fop->on_flush) {
10248 (*(fop->on_flush))();
10249 fop->on_flush = boost::none;
10250 }
10251 flush_ops.erase(oid);
10252 return 0;
10253 }
10254
10255 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
10256 OpContextUPtr ctx = simple_opc_create(fop->obc);
10257
10258 // successfully flushed; can we clear the dirty bit?
10259 // try to take the lock manually, since we don't
10260 // have a ctx yet.
10261 if (ctx->lock_manager.get_lock_type(
10262 ObjectContext::RWState::RWWRITE,
10263 oid,
10264 obc,
10265 fop->op)) {
10266 dout(20) << __func__ << " took write lock" << dendl;
10267 } else if (fop->op) {
10268 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
10269 << fop->dup_ops << dendl;
10270 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
10271 for (auto op : fop->dup_ops) {
10272 bool locked = ctx->lock_manager.get_lock_type(
10273 ObjectContext::RWState::RWWRITE,
10274 oid,
10275 obc,
10276 op);
10277 ceph_assert(!locked);
10278 }
10279 close_op_ctx(ctx.release());
10280 return -EAGAIN; // will retry
10281 } else {
10282 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
10283 close_op_ctx(ctx.release());
10284 osd->logger->inc(l_osd_tier_try_flush_fail);
10285 vector<ceph_tid_t> tids;
10286 cancel_flush(fop, false, &tids);
10287 osd->objecter->op_cancel(tids, -ECANCELED);
10288 return -ECANCELED;
10289 }
10290
10291 if (fop->on_flush) {
10292 ctx->register_on_finish(*(fop->on_flush));
10293 fop->on_flush = boost::none;
10294 }
10295
10296 ctx->at_version = get_next_version();
10297
10298 ctx->new_obs = obc->obs;
10299 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
10300 --ctx->delta_stats.num_objects_dirty;
10301 if (fop->obc->obs.oi.has_manifest()) {
10302 ceph_assert(obc->obs.oi.manifest.is_chunked());
10303 PGTransaction* t = ctx->op_t.get();
10304 uint64_t chunks_size = 0;
10305 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10306 chunks_size += p.second.length;
10307 }
10308 if (ctx->new_obs.oi.is_omap() && pool.info.supports_omap()) {
10309 t->omap_clear(oid);
10310 ctx->new_obs.oi.clear_omap_digest();
10311 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_OMAP);
10312 }
10313 if (obc->obs.oi.size == chunks_size) {
10314 t->truncate(oid, 0);
10315 interval_set<uint64_t> trim;
10316 trim.insert(0, ctx->new_obs.oi.size);
10317 ctx->modified_ranges.union_of(trim);
10318 truncate_update_size_and_usage(ctx->delta_stats,
10319 ctx->new_obs.oi,
10320 0);
10321 ctx->new_obs.oi.new_object();
10322 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10323 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10324 p.second.set_flag(chunk_info_t::FLAG_MISSING);
10325 }
10326 } else {
10327 for (auto &p : ctx->new_obs.oi.manifest.chunk_map) {
10328 if (p.second.is_dirty()) {
10329 dout(20) << __func__ << " offset: " << p.second.offset
10330 << " length: " << p.second.length << dendl;
10331 p.second.clear_flag(chunk_info_t::FLAG_DIRTY);
10332 p.second.clear_flag(chunk_info_t::FLAG_MISSING); // CLEAN
10333 }
10334 }
10335 }
10336 }
10337
10338 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
10339
10340 osd->logger->inc(l_osd_tier_clean);
10341
10342 if (!fop->dup_ops.empty() || fop->op) {
10343 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
10344 list<OpRequestRef> ls;
10345 if (fop->op)
10346 ls.push_back(fop->op);
10347 ls.splice(ls.end(), fop->dup_ops);
10348 requeue_ops(ls);
10349 }
10350
10351 simple_opc_submit(std::move(ctx));
10352
10353 flush_ops.erase(oid);
10354
10355 if (fop->blocking)
10356 osd->logger->inc(l_osd_tier_flush);
10357 else
10358 osd->logger->inc(l_osd_tier_try_flush);
10359
10360 return -EINPROGRESS;
10361 }
10362
10363 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
10364 vector<ceph_tid_t> *tids)
10365 {
10366 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
10367 << fop->objecter_tid << dendl;
10368 if (fop->objecter_tid) {
10369 tids->push_back(fop->objecter_tid);
10370 fop->objecter_tid = 0;
10371 }
10372 if (fop->io_tids.size()) {
10373 for (auto &p : fop->io_tids) {
10374 tids->push_back(p.second);
10375 p.second = 0;
10376 }
10377 }
10378 if (fop->blocking && fop->obc->is_blocked()) {
10379 fop->obc->stop_block();
10380 kick_object_context_blocked(fop->obc);
10381 }
10382 if (requeue) {
10383 if (fop->op)
10384 requeue_op(fop->op);
10385 requeue_ops(fop->dup_ops);
10386 }
10387 if (fop->on_flush) {
10388 (*(fop->on_flush))();
10389 fop->on_flush = boost::none;
10390 }
10391 flush_ops.erase(fop->obc->obs.oi.soid);
10392 }
10393
10394 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
10395 {
10396 dout(10) << __func__ << dendl;
10397 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
10398 while (p != flush_ops.end()) {
10399 cancel_flush((p++)->second, requeue, tids);
10400 }
10401 }
10402
10403 bool PrimaryLogPG::is_present_clone(hobject_t coid)
10404 {
10405 if (!pool.info.allow_incomplete_clones())
10406 return true;
10407 if (is_missing_object(coid))
10408 return true;
10409 ObjectContextRef obc = get_object_context(coid, false);
10410 return obc && obc->obs.exists;
10411 }
10412
10413 // ========================================================================
10414 // rep op gather
10415
10416 class C_OSD_RepopCommit : public Context {
10417 PrimaryLogPGRef pg;
10418 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
10419 public:
10420 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
10421 : pg(pg), repop(repop) {}
10422 void finish(int) override {
10423 pg->repop_all_committed(repop.get());
10424 }
10425 };
10426
10427 void PrimaryLogPG::repop_all_committed(RepGather *repop)
10428 {
10429 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
10430 << dendl;
10431 repop->all_committed = true;
10432 if (!repop->rep_aborted) {
10433 if (repop->v != eversion_t()) {
10434 last_update_ondisk = repop->v;
10435 last_complete_ondisk = repop->pg_local_last_complete;
10436 }
10437 eval_repop(repop);
10438 }
10439 }
10440
10441 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
10442 {
10443 dout(10) << "op_applied version " << applied_version << dendl;
10444 ceph_assert(applied_version != eversion_t());
10445 ceph_assert(applied_version <= info.last_update);
10446 last_update_applied = applied_version;
10447 if (is_primary()) {
10448 if (scrubber.active) {
10449 if (last_update_applied >= scrubber.subset_last_update) {
10450 requeue_scrub(ops_blocked_by_scrub());
10451 }
10452 } else {
10453 ceph_assert(scrubber.start == scrubber.end);
10454 }
10455 }
10456 }
10457
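/*
 * If this repop has fully committed, run its on_committed callbacks, send
 * any queued duplicate-op replies in order, and retire every all-committed
 * repop at the front of repop_queue so clients observe commit order.
 */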
10458 void PrimaryLogPG::eval_repop(RepGather *repop)
10459 {
10460 const MOSDOp *m = NULL;
10461 if (repop->op)
10462 m = static_cast<const MOSDOp *>(repop->op->get_req());
10463
10464 if (m)
10465 dout(10) << "eval_repop " << *repop << dendl;
10466 else
10467 dout(10) << "eval_repop " << *repop << " (no op)" << dendl;
10468
10469 // ondisk?
10470 if (repop->all_committed) {
10471 dout(10) << " commit: " << *repop << dendl;
10472 for (auto p = repop->on_committed.begin();
10473 p != repop->on_committed.end();
10474 repop->on_committed.erase(p++)) {
10475 (*p)();
10476 }
10477 // send dup commits, in order
10478 auto it = waiting_for_ondisk.find(repop->v);
10479 if (it != waiting_for_ondisk.end()) {
10480 ceph_assert(waiting_for_ondisk.begin()->first == repop->v);
10481 for (auto& i : it->second) {
10482 int return_code = repop->r;
10483 if (return_code >= 0) {
10484 return_code = std::get<2>(i);
10485 }
10486 osd->reply_op_error(std::get<0>(i), return_code, repop->v,
10487 std::get<1>(i));
10488 }
10489 waiting_for_ondisk.erase(it);
10490 }
10491
10492 publish_stats_to_osd();
10493 calc_min_last_complete_ondisk();
10494
10495 dout(10) << " removing " << *repop << dendl;
10496 ceph_assert(!repop_queue.empty());
10497 dout(20) << " q front is " << *repop_queue.front() << dendl;
10498 if (repop_queue.front() == repop) {
10499 RepGather *to_remove = nullptr;
10500 while (!repop_queue.empty() &&
10501 (to_remove = repop_queue.front())->all_committed) {
10502 repop_queue.pop_front();
10503 for (auto p = to_remove->on_success.begin();
10504 p != to_remove->on_success.end();
10505 to_remove->on_success.erase(p++)) {
10506 (*p)();
10507 }
10508 remove_repop(to_remove);
10509 }
10510 }
10511 }
10512 }
10513
10514 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
10515 {
10516 FUNCTRACE(cct);
10517 const hobject_t& soid = ctx->obs->oi.soid;
10518 dout(7) << "issue_repop rep_tid " << repop->rep_tid
10519 << " o " << soid
10520 << dendl;
10521
10522 repop->v = ctx->at_version;
10523 if (ctx->at_version > eversion_t()) {
10524 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
10525 i != acting_recovery_backfill.end();
10526 ++i) {
10527 if (*i == get_primary()) continue;
10528 pg_info_t &pinfo = peer_info[*i];
10529 // keep peer_info up to date
10530 if (pinfo.last_complete == pinfo.last_update)
10531 pinfo.last_complete = ctx->at_version;
10532 pinfo.last_update = ctx->at_version;
10533 }
10534 }
10535
10536 ctx->op_t->add_obc(ctx->obc);
10537 if (ctx->clone_obc) {
10538 ctx->op_t->add_obc(ctx->clone_obc);
10539 }
10540 if (ctx->head_obc) {
10541 ctx->op_t->add_obc(ctx->head_obc);
10542 }
10543
10544 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
10545 if (!(ctx->log.empty())) {
10546 ceph_assert(ctx->at_version >= projected_last_update);
10547 projected_last_update = ctx->at_version;
10548 }
10549 for (auto &&entry: ctx->log) {
10550 projected_log.add(entry);
10551 }
10552
10553 bool requires_missing_loc = false;
10554 for (set<pg_shard_t>::iterator i = async_recovery_targets.begin();
10555 i != async_recovery_targets.end();
10556 ++i) {
10557 if (*i == get_primary() || !peer_missing[*i].is_missing(soid)) continue;
10558 requires_missing_loc = true;
10559 for (auto &&entry: ctx->log) {
10560 peer_missing[*i].add_next_event(entry);
10561 }
10562 }
10563
10564 if (requires_missing_loc) {
10565 for (auto &&entry: ctx->log) {
10566 dout(30) << __func__ << " missing_loc before: "
10567 << missing_loc.get_locations(entry.soid) << dendl;
10568 missing_loc.add_missing(entry.soid, entry.version,
10569 eversion_t(), entry.is_delete());
10570 // clear out missing_loc
10571 missing_loc.clear_location(entry.soid);
10572 for (auto &i: actingset) {
10573 if (!peer_missing[i].is_missing(entry.soid))
10574 missing_loc.add_location(entry.soid, i);
10575 }
10576 dout(30) << __func__ << " missing_loc after: "
10577 << missing_loc.get_locations(entry.soid) << dendl;
10578 }
10579 }
10580
10581 pgbackend->submit_transaction(
10582 soid,
10583 ctx->delta_stats,
10584 ctx->at_version,
10585 std::move(ctx->op_t),
10586 pg_trim_to,
10587 min_last_complete_ondisk,
10588 ctx->log,
10589 ctx->updated_hset_history,
10590 on_all_commit,
10591 repop->rep_tid,
10592 ctx->reqid,
10593 ctx->op);
10594 }
10595
10596 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
10597 OpContext *ctx, ObjectContextRef obc,
10598 ceph_tid_t rep_tid)
10599 {
10600 if (ctx->op)
10601 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
10602 else
10603 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
10604
10605 RepGather *repop = new RepGather(
10606 ctx, rep_tid, info.last_complete);
10607
10608 repop->start = ceph_clock_now();
10609
10610 repop_queue.push_back(&repop->queue_item);
10611 repop->get();
10612
10613 osd->logger->inc(l_osd_op_wip);
10614
10615 dout(10) << __func__ << ": " << *repop << dendl;
10616 return repop;
10617 }
10618
10619 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
10620 eversion_t version,
10621 int r,
10622 ObcLockManager &&manager,
10623 OpRequestRef &&op,
10624 boost::optional<std::function<void(void)> > &&on_complete)
10625 {
10626 RepGather *repop = new RepGather(
10627 std::move(manager),
10628 std::move(op),
10629 std::move(on_complete),
10630 osd->get_tid(),
10631 info.last_complete,
10632 r);
10633 repop->v = version;
10634
10635 repop->start = ceph_clock_now();
10636
10637 repop_queue.push_back(&repop->queue_item);
10638
10639 osd->logger->inc(l_osd_op_wip);
10640
10641 dout(10) << __func__ << ": " << *repop << dendl;
10642 return boost::intrusive_ptr<RepGather>(repop);
10643 }
10644
10645 void PrimaryLogPG::remove_repop(RepGather *repop)
10646 {
10647 dout(20) << __func__ << " " << *repop << dendl;
10648
10649 for (auto p = repop->on_finish.begin();
10650 p != repop->on_finish.end();
10651 repop->on_finish.erase(p++)) {
10652 (*p)();
10653 }
10654
10655 release_object_locks(
10656 repop->lock_manager);
10657 repop->put();
10658
10659 osd->logger->dec(l_osd_op_wip);
10660 }
10661
10662 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
10663 {
10664 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
10665 ceph_tid_t rep_tid = osd->get_tid();
10666 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
10667 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
10668 ctx->op_t.reset(new PGTransaction());
10669 ctx->mtime = ceph_clock_now();
10670 return ctx;
10671 }
10672
10673 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
10674 {
10675 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
10676 dout(20) << __func__ << " " << repop << dendl;
10677 issue_repop(repop, ctx.get());
10678 eval_repop(repop);
10679 if (hard_limit_pglog())
10680 calc_trim_to_aggressive();
10681 else
10682 calc_trim_to();
10683 repop->put();
10684 }
10685
10686
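/*
 * Durably record log entries that are not tied to an object write (e.g.
 * marking unfound objects lost), replicating them to the acting/recovery/
 * backfill shards via MOSDPGUpdateLogMissing (full MOSDPGLog pre-jewel)
 * and completing once all shards, including ourselves, commit.
 */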
10687 void PrimaryLogPG::submit_log_entries(
10688 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
10689 ObcLockManager &&manager,
10690 boost::optional<std::function<void(void)> > &&_on_complete,
10691 OpRequestRef op,
10692 int r)
10693 {
10694 dout(10) << __func__ << " " << entries << dendl;
10695 ceph_assert(is_primary());
10696
10697 eversion_t version;
10698 if (!entries.empty()) {
10699 ceph_assert(entries.rbegin()->version >= projected_last_update);
10700 version = projected_last_update = entries.rbegin()->version;
10701 }
10702
10703 boost::intrusive_ptr<RepGather> repop;
10704 boost::optional<std::function<void(void)> > on_complete;
10705 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10706 repop = new_repop(
10707 version,
10708 r,
10709 std::move(manager),
10710 std::move(op),
10711 std::move(_on_complete));
10712 } else {
10713 on_complete = std::move(_on_complete);
10714 }
10715
10716 pgbackend->call_write_ordered(
10717 [this, entries, repop, on_complete]() {
10718 ObjectStore::Transaction t;
10719 eversion_t old_last_update = info.last_update;
10720 merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
10721
10722
10723 set<pg_shard_t> waiting_on;
10724 for (set<pg_shard_t>::const_iterator i = acting_recovery_backfill.begin();
10725 i != acting_recovery_backfill.end();
10726 ++i) {
10727 pg_shard_t peer(*i);
10728 if (peer == pg_whoami) continue;
10729 ceph_assert(peer_missing.count(peer));
10730 ceph_assert(peer_info.count(peer));
10731 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10732 ceph_assert(repop);
10733 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
10734 entries,
10735 spg_t(info.pgid.pgid, i->shard),
10736 pg_whoami.shard,
10737 get_osdmap_epoch(),
10738 last_peering_reset,
10739 repop->rep_tid,
10740 pg_trim_to,
10741 min_last_complete_ondisk);
10742 osd->send_message_osd_cluster(
10743 peer.osd, m, get_osdmap_epoch());
10744 waiting_on.insert(peer);
10745 } else {
10746 MOSDPGLog *m = new MOSDPGLog(
10747 peer.shard, pg_whoami.shard,
10748 info.last_update.epoch,
10749 info, last_peering_reset);
10750 m->log.log = entries;
10751 m->log.tail = old_last_update;
10752 m->log.head = info.last_update;
10753 osd->send_message_osd_cluster(
10754 peer.osd, m, get_osdmap_epoch());
10755 }
10756 }
10757 ceph_tid_t rep_tid = repop->rep_tid;
10758 waiting_on.insert(pg_whoami);
10759 log_entry_update_waiting_on.insert(
10760 make_pair(
10761 rep_tid,
10762 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
10763 ));
10764 struct OnComplete : public Context {
10765 PrimaryLogPGRef pg;
10766 ceph_tid_t rep_tid;
10767 epoch_t epoch;
10768 OnComplete(
10769 PrimaryLogPGRef pg,
10770 ceph_tid_t rep_tid,
10771 epoch_t epoch)
10772 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
10773 void finish(int) override {
10774 pg->lock();
10775 if (!pg->pg_has_reset_since(epoch)) {
10776 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
10777 ceph_assert(it != pg->log_entry_update_waiting_on.end());
10778 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
10779 ceph_assert(it2 != it->second.waiting_on.end());
10780 it->second.waiting_on.erase(it2);
10781 if (it->second.waiting_on.empty()) {
10782 pg->repop_all_committed(it->second.repop.get());
10783 pg->log_entry_update_waiting_on.erase(it);
10784 }
10785 }
10786 pg->unlock();
10787 }
10788 };
10789 t.register_on_commit(
10790 new OnComplete{this, rep_tid, get_osdmap_epoch()});
10791 int r = osd->store->queue_transaction(ch, std::move(t), NULL);
10792 ceph_assert(r == 0);
10793 op_applied(info.last_update);
10794 });
10795
10796 if (hard_limit_pglog())
10797 calc_trim_to_aggressive();
10798 else
10799 calc_trim_to();
10800 }
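// Sketch of the (post-jewel) round trip that submit_log_entries() drives,
// illustrative only:
//
//   primary                                 replica
//   merge_new_log_entries()
//   MOSDPGUpdateLogMissing       ------>    do_update_log_missing():
//                                             append_log_entries_update_missing()
//   do_update_log_missing_reply() <------   MOSDPGUpdateLogMissingReply
//
// The local commit (OnComplete above) erases pg_whoami from waiting_on;
// once waiting_on drains, repop_all_committed() fires for the repop.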
10801
10802 void PrimaryLogPG::cancel_log_updates()
10803 {
10804 // get rid of all the LogUpdateCtx so their references to repops are
10805 // dropped
10806 log_entry_update_waiting_on.clear();
10807 }
10808
10809 // -------------------------------------------------------
10810
10811 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> *ls)
10812 {
10813 lock();
10814 pair<hobject_t, ObjectContextRef> i;
10815 while (object_contexts.get_next(i.first, &i)) {
10816 ObjectContextRef obc(i.second);
10817 get_obc_watchers(obc, *ls);
10818 }
10819 unlock();
10820 }
10821
10822 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
10823 {
10824 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10825 obc->watchers.begin();
10826 j != obc->watchers.end();
10827 ++j) {
10828 obj_watch_item_t owi;
10829
10830 owi.obj = obc->obs.oi.soid;
10831 owi.wi.addr = j->second->get_peer_addr();
10832 owi.wi.name = j->second->get_entity();
10833 owi.wi.cookie = j->second->get_cookie();
10834 owi.wi.timeout_seconds = j->second->get_timeout();
10835
10836 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
10837 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
10838
10839 pg_watchers.push_back(owi);
10840 }
10841 }
10842
10843 void PrimaryLogPG::check_blacklisted_watchers()
10844 {
10845 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
10846 pair<hobject_t, ObjectContextRef> i;
10847 while (object_contexts.get_next(i.first, &i))
10848 check_blacklisted_obc_watchers(i.second);
10849 }
10850
10851 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
10852 {
10853 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
10854 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
10855 obc->watchers.begin();
10856 k != obc->watchers.end();
10857 ) {
10858 // Advance iterator now so handle_watch_timeout() can erase the element
10859 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
10860 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
10861 entity_addr_t ea = j->second->get_peer_addr();
10862 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
10863 if (get_osdmap()->is_blacklisted(ea)) {
10864 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
10865 ceph_assert(j->second->get_pg() == this);
10866 j->second->unregister_cb();
10867 handle_watch_timeout(j->second);
10868 }
10869 }
10870 }
10871
10872 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
10873 {
10874 ceph_assert(is_active());
10875 auto it_objects = pg_log.get_log().objects.find(obc->obs.oi.soid);
10876 ceph_assert((recovering.count(obc->obs.oi.soid) ||
10877 !is_missing_object(obc->obs.oi.soid)) ||
10878 (it_objects != pg_log.get_log().objects.end() && // or this is a revert... see recover_primary()
10879 it_objects->second->op ==
10880 pg_log_entry_t::LOST_REVERT &&
10881 it_objects->second->reverting_to ==
10882 obc->obs.oi.version));
10883
10884 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
10885 ceph_assert(obc->watchers.empty());
10886 // populate unconnected_watchers
10887 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
10888 obc->obs.oi.watchers.begin();
10889 p != obc->obs.oi.watchers.end();
10890 ++p) {
10891 utime_t expire = info.stats.last_became_active;
10892 expire += p->second.timeout_seconds;
10893 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
10894 WatchRef watch(
10895 Watch::makeWatchRef(
10896 this, osd, obc, p->second.timeout_seconds, p->first.first,
10897 p->first.second, p->second.addr));
10898 watch->disconnect();
10899 obc->watchers.insert(
10900 make_pair(
10901 make_pair(p->first.first, p->first.second),
10902 watch));
10903 }
10904 // Look for watchers from blacklisted clients and drop them
10905 check_blacklisted_obc_watchers(obc);
10906 }
10907
10908 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
10909 {
10910 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
10911 dout(10) << "handle_watch_timeout obc " << obc << dendl;
10912
10913 if (!is_active()) {
10914 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
10915 return;
10916 }
10917 if (!obc->obs.exists) {
10918 dout(10) << __func__ << " object " << obc->obs.oi.soid << " dne" << dendl;
10919 return;
10920 }
10921 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
10922 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
10923 watch->get_delayed_cb()
10924 );
10925 dout(10) << "handle_watch_timeout waiting for degraded on obj "
10926 << obc->obs.oi.soid
10927 << dendl;
10928 return;
10929 }
10930
10931 if (write_blocked_by_scrub(obc->obs.oi.soid)) {
10932 dout(10) << "handle_watch_timeout waiting for scrub on obj "
10933 << obc->obs.oi.soid
10934 << dendl;
10935 scrubber.add_callback(
10936 watch->get_delayed_cb() // re-run this watch timeout once the scrub completes
10937 );
10938 return;
10939 }
10940
10941 OpContextUPtr ctx = simple_opc_create(obc);
10942 ctx->at_version = get_next_version();
10943
10944 object_info_t& oi = ctx->new_obs.oi;
10945 oi.watchers.erase(make_pair(watch->get_cookie(),
10946 watch->get_entity()));
10947
10948 list<watch_disconnect_t> watch_disconnects = {
10949 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
10950 };
10951 ctx->register_on_success(
10952 [this, obc, watch_disconnects]() {
10953 complete_disconnect_watches(obc, watch_disconnects);
10954 });
10955
10956
10957 PGTransaction *t = ctx->op_t.get();
10958 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
10959 ctx->at_version,
10960 oi.version,
10961 0,
10962 osd_reqid_t(), ctx->mtime, 0));
10963
10964 oi.prior_version = obc->obs.oi.version;
10965 oi.version = ctx->at_version;
10966 bufferlist bl;
10967 encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
10968 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
10969
10970 // apply new object state.
10971 ctx->obc->obs = ctx->new_obs;
10972
10973 // no ctx->delta_stats
10974 simple_opc_submit(std::move(ctx));
10975 }
10976
10977 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
10978 SnapSetContext *ssc)
10979 {
10980 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
10981 ceph_assert(obc->destructor_callback == NULL);
10982 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
10983 obc->obs.oi = oi;
10984 obc->obs.exists = false;
10985 obc->ssc = ssc;
10986 if (ssc)
10987 register_snapset_context(ssc);
10988 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
10989 if (is_active())
10990 populate_obc_watchers(obc);
10991 return obc;
10992 }
10993
10994 ObjectContextRef PrimaryLogPG::get_object_context(
10995 const hobject_t& soid,
10996 bool can_create,
10997 const map<string, bufferlist> *attrs)
10998 {
10999 auto it_objects = pg_log.get_log().objects.find(soid);
11000 ceph_assert(
11001 attrs || !pg_log.get_missing().is_missing(soid) ||
11002 // or this is a revert... see recover_primary()
11003 (it_objects != pg_log.get_log().objects.end() &&
11004 it_objects->second->op ==
11005 pg_log_entry_t::LOST_REVERT));
11006 ObjectContextRef obc = object_contexts.lookup(soid);
11007 osd->logger->inc(l_osd_object_ctx_cache_total);
11008 if (obc) {
11009 osd->logger->inc(l_osd_object_ctx_cache_hit);
11010 dout(10) << __func__ << ": found obc in cache: " << obc
11011 << dendl;
11012 } else {
11013 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
11014 // check disk
11015 bufferlist bv;
11016 if (attrs) {
11017 auto it_oi = attrs->find(OI_ATTR);
11018 ceph_assert(it_oi != attrs->end());
11019 bv = it_oi->second;
11020 } else {
11021 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
11022 if (r < 0) {
11023 if (!can_create) {
11024 dout(10) << __func__ << ": no obc for soid "
11025 << soid << " and !can_create"
11026 << dendl;
11027 return ObjectContextRef(); // -ENOENT!
11028 }
11029
11030 dout(10) << __func__ << ": no obc for soid "
11031 << soid << " but can_create"
11032 << dendl;
11033 // new object.
11034 object_info_t oi(soid);
11035 SnapSetContext *ssc = get_snapset_context(
11036 soid, true, 0, false);
11037 ceph_assert(ssc);
11038 obc = create_object_context(oi, ssc);
11039 dout(10) << __func__ << ": " << obc << " " << soid
11040 << " " << obc->rwstate
11041 << " oi: " << obc->obs.oi
11042 << " ssc: " << obc->ssc
11043 << " snapset: " << obc->ssc->snapset << dendl;
11044 return obc;
11045 }
11046 }
11047
11048 object_info_t oi;
11049 try {
11050 bufferlist::const_iterator bliter = bv.begin();
11051 decode(oi, bliter);
11052 } catch (...) {
11053 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
11054 return ObjectContextRef(); // -ENOENT!
11055 }
11056
11057 ceph_assert(oi.soid.pool == (int64_t)info.pgid.pool());
11058
11059 obc = object_contexts.lookup_or_create(oi.soid);
11060 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
11061 obc->obs.oi = oi;
11062 obc->obs.exists = true;
11063
11064 obc->ssc = get_snapset_context(
11065 soid, true,
11066 soid.has_snapset() ? attrs : 0);
11067
11068 if (is_active())
11069 populate_obc_watchers(obc);
11070
11071 if (pool.info.is_erasure()) {
11072 if (attrs) {
11073 obc->attr_cache = *attrs;
11074 } else {
11075 int r = pgbackend->objects_get_attrs(
11076 soid,
11077 &obc->attr_cache);
11078 ceph_assert(r == 0);
11079 }
11080 }
11081
11082 dout(10) << __func__ << ": creating obc from disk: " << obc
11083 << dendl;
11084 }
11085
11086 // XXX: Caller doesn't expect this
11087 if (obc->ssc == NULL) {
11088 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
11089 return ObjectContextRef(); // -ENOENT!
11090 }
11091
11092 dout(10) << __func__ << ": " << obc << " " << soid
11093 << " " << obc->rwstate
11094 << " oi: " << obc->obs.oi
11095 << " exists: " << (int)obc->obs.exists
11096 << " ssc: " << obc->ssc
11097 << " snapset: " << obc->ssc->snapset << dendl;
11098 return obc;
11099 }
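// Resolution order used by get_object_context() above (an illustrative
// sketch, not part of the build):
//
//   1. object_contexts.lookup(soid)            in-memory cache hit
//   2. caller-supplied attrs[OI_ATTR]          e.g. during recovery
//   3. pgbackend->objects_get_attr(OI_ATTR)    read object_info_t from disk
//
// If all three miss: can_create yields a fresh obc with obs.exists = false,
// otherwise a null ref is returned and callers treat it as -ENOENT.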
11100
11101 void PrimaryLogPG::context_registry_on_change()
11102 {
11103 pair<hobject_t, ObjectContextRef> i;
11104 while (object_contexts.get_next(i.first, &i)) {
11105 ObjectContextRef obc(i.second);
11106 if (obc) {
11107 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
11108 obc->watchers.begin();
11109 j != obc->watchers.end();
11110 obc->watchers.erase(j++)) {
11111 j->second->discard();
11112 }
11113 }
11114 }
11115 }
11116
11117
11118 /*
11119 * If we return an error, and set *pmissing, then promoting that
11120 * object may help.
11121 *
11122 * If we return -EAGAIN, we will always set *pmissing to the missing
11123 * object to wait for.
11124 *
11125 * If we return an error but do not set *pmissing, then we know the
11126 * object does not exist.
11127 */
11128 int PrimaryLogPG::find_object_context(const hobject_t& oid,
11129 ObjectContextRef *pobc,
11130 bool can_create,
11131 bool map_snapid_to_clone,
11132 hobject_t *pmissing)
11133 {
11134 FUNCTRACE(cct);
11135 ceph_assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
11136 // want the head?
11137 if (oid.snap == CEPH_NOSNAP) {
11138 ObjectContextRef obc = get_object_context(oid, can_create);
11139 if (!obc) {
11140 if (pmissing)
11141 *pmissing = oid;
11142 return -ENOENT;
11143 }
11144 dout(10) << __func__ << " " << oid
11145 << " @" << oid.snap
11146 << " oi=" << obc->obs.oi
11147 << dendl;
11148 *pobc = obc;
11149
11150 return 0;
11151 }
11152
11153 hobject_t head = oid.get_head();
11154
11155 // we want a snap
11156 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
11157 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
11158 return -ENOENT;
11159 }
11160
11161 SnapSetContext *ssc = get_snapset_context(oid, can_create);
11162 if (!ssc || !(ssc->exists || can_create)) {
11163 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
11164 if (pmissing)
11165 *pmissing = head; // start by getting the head
11166 if (ssc)
11167 put_snapset_context(ssc);
11168 return -ENOENT;
11169 }
11170
11171 if (map_snapid_to_clone) {
11172 dout(10) << __func__ << " " << oid << " @" << oid.snap
11173 << " snapset " << ssc->snapset
11174 << " map_snapid_to_clone=true" << dendl;
11175 if (oid.snap > ssc->snapset.seq) {
11176 // the head must already be readable
11177 ObjectContextRef obc = get_object_context(head, false);
11178 dout(10) << __func__ << " " << oid << " @" << oid.snap
11179 << " snapset " << ssc->snapset
11180 << " maps to head" << dendl;
11181 *pobc = obc;
11182 put_snapset_context(ssc);
11183 return (obc && obc->obs.exists) ? 0 : -ENOENT;
11184 } else {
11185 vector<snapid_t>::const_iterator citer = std::find(
11186 ssc->snapset.clones.begin(),
11187 ssc->snapset.clones.end(),
11188 oid.snap);
11189 if (citer == ssc->snapset.clones.end()) {
11190 dout(10) << __func__ << " " << oid << " @" << oid.snap
11191 << " snapset " << ssc->snapset
11192 << " maps to nothing" << dendl;
11193 put_snapset_context(ssc);
11194 return -ENOENT;
11195 }
11196
11197 dout(10) << __func__ << " " << oid << " @" << oid.snap
11198 << " snapset " << ssc->snapset
11199 << " maps to " << oid << dendl;
11200
11201 if (pg_log.get_missing().is_missing(oid)) {
11202 dout(10) << __func__ << " " << oid << " @" << oid.snap
11203 << " snapset " << ssc->snapset
11204 << " " << oid << " is missing" << dendl;
11205 if (pmissing)
11206 *pmissing = oid;
11207 put_snapset_context(ssc);
11208 return -EAGAIN;
11209 }
11210
11211 ObjectContextRef obc = get_object_context(oid, false);
11212 if (!obc || !obc->obs.exists) {
11213 dout(10) << __func__ << " " << oid << " @" << oid.snap
11214 << " snapset " << ssc->snapset
11215 << " " << oid << " is not present" << dendl;
11216 if (pmissing)
11217 *pmissing = oid;
11218 put_snapset_context(ssc);
11219 return -ENOENT;
11220 }
11221 dout(10) << __func__ << " " << oid << " @" << oid.snap
11222 << " snapset " << ssc->snapset
11223 << " " << oid << " HIT" << dendl;
11224 *pobc = obc;
11225 put_snapset_context(ssc);
11226 return 0;
11227 }
11228 ceph_abort(); //unreachable
11229 }
11230
11231 dout(10) << __func__ << " " << oid << " @" << oid.snap
11232 << " snapset " << ssc->snapset << dendl;
11233
11234 // head?
11235 if (oid.snap > ssc->snapset.seq) {
11236 ObjectContextRef obc = get_object_context(head, false);
11237 dout(10) << __func__ << " " << head
11238 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
11239 << " -- HIT " << obc->obs
11240 << dendl;
11241 if (!obc->ssc)
11242 obc->ssc = ssc;
11243 else {
11244 ceph_assert(ssc == obc->ssc);
11245 put_snapset_context(ssc);
11246 }
11247 *pobc = obc;
11248 return 0;
11249 }
11250
11251 // which clone would it be?
11252 unsigned k = 0;
11253 while (k < ssc->snapset.clones.size() &&
11254 ssc->snapset.clones[k] < oid.snap)
11255 k++;
11256 if (k == ssc->snapset.clones.size()) {
11257 dout(10) << __func__ << " no clones with last >= oid.snap "
11258 << oid.snap << " -- DNE" << dendl;
11259 put_snapset_context(ssc);
11260 return -ENOENT;
11261 }
11262 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
11263 info.pgid.pool(), oid.get_namespace());
11264
11265 if (pg_log.get_missing().is_missing(soid)) {
11266 dout(20) << __func__ << " " << soid << " missing, try again later"
11267 << dendl;
11268 if (pmissing)
11269 *pmissing = soid;
11270 put_snapset_context(ssc);
11271 return -EAGAIN;
11272 }
11273
11274 ObjectContextRef obc = get_object_context(soid, false);
11275 if (!obc || !obc->obs.exists) {
11276 if (pmissing)
11277 *pmissing = soid;
11278 put_snapset_context(ssc);
11279 if (is_degraded_or_backfilling_object(soid)) {
11280 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
11281 return -EAGAIN;
11282 } else if (is_degraded_on_async_recovery_target(soid)) {
11283 dout(20) << __func__ << " clone is recovering " << soid << dendl;
11284 return -EAGAIN;
11285 } else {
11286 dout(20) << __func__ << " missing clone " << soid << dendl;
11287 return -ENOENT;
11288 }
11289 }
11290
11291 if (!obc->ssc) {
11292 obc->ssc = ssc;
11293 } else {
11294 ceph_assert(obc->ssc == ssc);
11295 put_snapset_context(ssc);
11296 }
11297 ssc = 0;
11298
11299 // clone
11300 dout(20) << __func__ << " " << soid
11301 << " snapset " << obc->ssc->snapset
11302 << dendl;
11303 snapid_t first, last;
11304 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
11305 ceph_assert(p != obc->ssc->snapset.clone_snaps.end());
11306 if (p->second.empty()) {
11307 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
11308 ceph_assert(!cct->_conf->osd_debug_verify_snaps);
11309 return -ENOENT;
11310 }
11311 first = p->second.back();
11312 last = p->second.front();
11313 if (first <= oid.snap) {
11314 dout(20) << __func__ << " " << soid << " [" << first << "," << last
11315 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
11316 *pobc = obc;
11317 return 0;
11318 } else {
11319 dout(20) << __func__ << " " << soid << " [" << first << "," << last
11320 << "] does not contain " << oid.snap << " -- DNE" << dendl;
11321 return -ENOENT;
11322 }
11323 }
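// How a caller is expected to interpret find_object_context() per the
// contract documented above; an illustrative sketch, not a verbatim caller:
//
//   hobject_t missing_oid;
//   ObjectContextRef obc;
//   int r = find_object_context(oid, &obc, can_create, false, &missing_oid);
//   if (r == -EAGAIN) {
//     // missing_oid is always set: wait for it to be recovered
//   } else if (r < 0 && missing_oid != hobject_t()) {
//     // promoting/recovering missing_oid may help
//   } else if (r < 0) {
//     // the object definitely does not exist
//   }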
11324
11325 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
11326 {
11327 if (obc->ssc)
11328 put_snapset_context(obc->ssc);
11329 }
11330
11331 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
11332 {
11333 object_info_t& oi = obc->obs.oi;
11334
11335 dout(10) << __func__ << " " << oi.soid << dendl;
11336 ceph_assert(!oi.soid.is_snapdir());
11337
11338 object_stat_sum_t stat;
11339 stat.num_objects++;
11340 if (oi.is_dirty())
11341 stat.num_objects_dirty++;
11342 if (oi.is_whiteout())
11343 stat.num_whiteouts++;
11344 if (oi.is_omap())
11345 stat.num_objects_omap++;
11346 if (oi.is_cache_pinned())
11347 stat.num_objects_pinned++;
11348 if (oi.has_manifest())
11349 stat.num_objects_manifest++;
11350
11351 if (oi.soid.is_snap()) {
11352 stat.num_object_clones++;
11353
11354 if (!obc->ssc)
11355 obc->ssc = get_snapset_context(oi.soid, false);
11356 ceph_assert(obc->ssc);
11357 stat.num_bytes += obc->ssc->snapset.get_clone_bytes(oi.soid.snap);
11358 } else {
11359 stat.num_bytes += oi.size;
11360 }
11361
11362 // add it in
11363 pgstat->stats.sum.add(stat);
11364 }
11365
11366 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
11367 {
11368 const hobject_t& soid = obc->obs.oi.soid;
11369 if (obc->is_blocked()) {
11370 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
11371 return;
11372 }
11373
11374 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
11375 if (p != waiting_for_blocked_object.end()) {
11376 list<OpRequestRef>& ls = p->second;
11377 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
11378 requeue_ops(ls);
11379 waiting_for_blocked_object.erase(p);
11380 }
11381
11382 map<hobject_t, ObjectContextRef>::iterator i =
11383 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
11384 if (i != objects_blocked_on_snap_promotion.end()) {
11385 ceph_assert(i->second == obc);
11386 objects_blocked_on_snap_promotion.erase(i);
11387 }
11388
11389 if (obc->requeue_scrub_on_unblock) {
11390 obc->requeue_scrub_on_unblock = false;
11391 requeue_scrub();
11392 }
11393 }
11394
11395 SnapSetContext *PrimaryLogPG::get_snapset_context(
11396 const hobject_t& oid,
11397 bool can_create,
11398 const map<string, bufferlist> *attrs,
11399 bool oid_existed)
11400 {
11401 std::lock_guard l(snapset_contexts_lock);
11402 SnapSetContext *ssc;
11403 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
11404 oid.get_snapdir());
11405 if (p != snapset_contexts.end()) {
11406 if (can_create || p->second->exists) {
11407 ssc = p->second;
11408 } else {
11409 return NULL;
11410 }
11411 } else {
11412 bufferlist bv;
11413 if (!attrs) {
11414 int r = -ENOENT;
11415 if (!(oid.is_head() && !oid_existed)) {
11416 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
11417 }
11418 if (r < 0 && !can_create)
11419 return NULL;
11420 } else {
11421 auto it_ss = attrs->find(SS_ATTR);
11422 ceph_assert(it_ss != attrs->end());
11423 bv = it_ss->second;
11424 }
11425 ssc = new SnapSetContext(oid.get_snapdir());
11426 _register_snapset_context(ssc);
11427 if (bv.length()) {
11428 bufferlist::const_iterator bvp = bv.begin();
11429 try {
11430 ssc->snapset.decode(bvp);
11431 } catch (buffer::error& e) {
11432 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
11433 return NULL;
11434 }
11435 ssc->exists = true;
11436 } else {
11437 ssc->exists = false;
11438 }
11439 }
11440 ceph_assert(ssc);
11441 ssc->ref++;
11442 return ssc;
11443 }
11444
11445 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
11446 {
11447 std::lock_guard l(snapset_contexts_lock);
11448 --ssc->ref;
11449 if (ssc->ref == 0) {
11450 if (ssc->registered)
11451 snapset_contexts.erase(ssc->oid);
11452 delete ssc;
11453 }
11454 }
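// SnapSetContext is manually refcounted under snapset_contexts_lock: every
// successful get_snapset_context() must be balanced by put_snapset_context(),
// or the reference handed to an obc (obc->ssc), which is released later via
// object_context_destructor_callback(). Illustrative sketch:
//
//   SnapSetContext *ssc = get_snapset_context(oid, false);
//   if (ssc) {
//     // ... read ssc->snapset ...
//     put_snapset_context(ssc);  // balances the ref taken above
//   }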
11455
11456 /*
11457 * Return values:
11458 * NONE - didn't pull anything
11459 * YES - pulled what the caller wanted
11460 * HEAD - needed to pull head first
11461 */
11462 enum { PULL_NONE, PULL_HEAD, PULL_YES };
11463
11464 int PrimaryLogPG::recover_missing(
11465 const hobject_t &soid, eversion_t v,
11466 int priority,
11467 PGBackend::RecoveryHandle *h)
11468 {
11469 if (missing_loc.is_unfound(soid)) {
11470 dout(7) << __func__ << " " << soid
11471 << " v " << v
11472 << " but it is unfound" << dendl;
11473 return PULL_NONE;
11474 }
11475
11476 if (missing_loc.is_deleted(soid)) {
11477 start_recovery_op(soid);
11478 ceph_assert(!recovering.count(soid));
11479 recovering.insert(make_pair(soid, ObjectContextRef()));
11480 epoch_t cur_epoch = get_osdmap_epoch();
11481 remove_missing_object(soid, v, new FunctionContext(
11482 [=](int) {
11483 lock();
11484 if (!pg_has_reset_since(cur_epoch)) {
11485 bool object_missing = false;
11486 for (const auto& shard : acting_recovery_backfill) {
11487 if (shard == pg_whoami)
11488 continue;
11489 if (peer_missing[shard].is_missing(soid)) {
11490 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
11491 object_missing = true;
11492 break;
11493 }
11494 }
11495 if (!object_missing) {
11496 object_stat_sum_t stat_diff;
11497 stat_diff.num_objects_recovered = 1;
11498 if (scrub_after_recovery)
11499 stat_diff.num_objects_repaired = 1;
11500 on_global_recover(soid, stat_diff, true);
11501 } else {
11502 auto recovery_handle = pgbackend->open_recovery_op();
11503 pgbackend->recover_delete_object(soid, v, recovery_handle);
11504 pgbackend->run_recovery_op(recovery_handle, priority);
11505 }
11506 }
11507 unlock();
11508 }));
11509 return PULL_YES;
11510 }
11511
11512 // is this a snapped object? if so, consult the snapset; we may not need the entire object!
11513 ObjectContextRef obc;
11514 ObjectContextRef head_obc;
11515 if (soid.snap && soid.snap < CEPH_NOSNAP) {
11516 // do we have the head?
11517 hobject_t head = soid.get_head();
11518 if (pg_log.get_missing().is_missing(head)) {
11519 if (recovering.count(head)) {
11520 dout(10) << " missing but already recovering head " << head << dendl;
11521 return PULL_NONE;
11522 } else {
11523 int r = recover_missing(
11524 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
11525 h);
11526 if (r != PULL_NONE)
11527 return PULL_HEAD;
11528 return PULL_NONE;
11529 }
11530 }
11531 head_obc = get_object_context(
11532 head,
11533 false,
11534 0);
11535 ceph_assert(head_obc);
11536 }
11537 start_recovery_op(soid);
11538 ceph_assert(!recovering.count(soid));
11539 recovering.insert(make_pair(soid, obc));
11540 int r = pgbackend->recover_object(
11541 soid,
11542 v,
11543 head_obc,
11544 obc,
11545 h);
11546 // This is only a pull, which shouldn't return an error
11547 ceph_assert(r >= 0);
11548 return PULL_YES;
11549 }
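// Illustrative sketch of how the PULL_* results above drive a caller such
// as recover_primary() (not a verbatim excerpt): PULL_HEAD means the head
// object was queued instead, so the clone must be retried once it arrives.
//
//   switch (recover_missing(soid, v, prio, h)) {
//   case PULL_YES:  ++started; break;  // pull for soid is in flight
//   case PULL_HEAD: ++started; break;  // head queued first; clone retried later
//   case PULL_NONE: break;             // unfound, or head already recovering
//   }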
11550
11551 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
11552 eversion_t v, Context *on_complete)
11553 {
11554 dout(20) << __func__ << " " << soid << " " << v << dendl;
11555 ceph_assert(on_complete != nullptr);
11556 // delete locally
11557 ObjectStore::Transaction t;
11558 remove_snap_mapped_object(t, soid);
11559
11560 ObjectRecoveryInfo recovery_info;
11561 recovery_info.soid = soid;
11562 recovery_info.version = v;
11563
11564 epoch_t cur_epoch = get_osdmap_epoch();
11565 t.register_on_complete(new FunctionContext(
11566 [=](int) {
11567 lock();
11568 if (!pg_has_reset_since(cur_epoch)) {
11569 ObjectStore::Transaction t2;
11570 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
11571 t2.register_on_complete(on_complete);
11572 int r = osd->store->queue_transaction(ch, std::move(t2), nullptr);
11573 ceph_assert(r == 0);
11574 unlock();
11575 } else {
11576 unlock();
11577 on_complete->complete(-EAGAIN);
11578 }
11579 }));
11580 int r = osd->store->queue_transaction(ch, std::move(t), nullptr);
11581 ceph_assert(r == 0);
11582 }
11583
11584 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
11585 {
11586 dout(10) << __func__ << " " << oid << dendl;
11587 if (callbacks_for_degraded_object.count(oid)) {
11588 list<Context*> contexts;
11589 contexts.swap(callbacks_for_degraded_object[oid]);
11590 callbacks_for_degraded_object.erase(oid);
11591 for (list<Context*>::iterator i = contexts.begin();
11592 i != contexts.end();
11593 ++i) {
11594 (*i)->complete(0);
11595 }
11596 }
11597 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
11598 oid.get_head());
11599 if (i != objects_blocked_on_degraded_snap.end() &&
11600 i->second == oid.snap)
11601 objects_blocked_on_degraded_snap.erase(i);
11602 }
11603
11604 void PrimaryLogPG::_committed_pushed_object(
11605 epoch_t epoch, eversion_t last_complete)
11606 {
11607 lock();
11608 if (!pg_has_reset_since(epoch)) {
11609 dout(10) << __func__ << " last_complete " << last_complete << " now ondisk" << dendl;
11610 last_complete_ondisk = last_complete;
11611
11612 if (last_complete_ondisk == info.last_update) {
11613 if (!is_primary()) {
11614 // Either we are a replica or a backfill target.
11615 // We are fully up to date; tell the primary!
11616 osd->send_message_osd_cluster(
11617 get_primary().osd,
11618 new MOSDPGTrim(
11619 get_osdmap_epoch(),
11620 spg_t(info.pgid.pgid, get_primary().shard),
11621 last_complete_ondisk),
11622 get_osdmap_epoch());
11623 } else {
11624 calc_min_last_complete_ondisk();
11625 }
11626 }
11627
11628 } else {
11629 dout(10) << __func__ << " pg has changed, not touching last_complete_ondisk" << dendl;
11630 }
11631
11632 unlock();
11633 }
11634
11635 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
11636 {
11637 dout(20) << __func__ << dendl;
11638 if (obc) {
11639 dout(20) << "obc = " << *obc << dendl;
11640 }
11641 ceph_assert(active_pushes >= 1);
11642 --active_pushes;
11643
11644 // requeue an active chunky scrub waiting on recovery ops
11645 if (!deleting && active_pushes == 0
11646 && scrubber.is_chunky_scrub_active()) {
11647 requeue_scrub(ops_blocked_by_scrub());
11648 }
11649 }
11650
11651 void PrimaryLogPG::_applied_recovered_object_replica()
11652 {
11653 dout(20) << __func__ << dendl;
11654 ceph_assert(active_pushes >= 1);
11655 --active_pushes;
11656
11657 // requeue an active chunky scrub waiting on recovery ops
11658 if (!deleting && active_pushes == 0 &&
11659 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
11660 scrubber.active_rep_scrub->get_req())->chunky) {
11661 auto& op = scrubber.active_rep_scrub;
11662 osd->enqueue_back(
11663 OpQueueItem(
11664 unique_ptr<OpQueueItem::OpQueueable>(new PGOpItem(info.pgid, op)),
11665 op->get_req()->get_cost(),
11666 op->get_req()->get_priority(),
11667 op->get_req()->get_recv_stamp(),
11668 op->get_req()->get_source().num(),
11669 get_osdmap_epoch()));
11670 scrubber.active_rep_scrub.reset();
11671 }
11672 }
11673
11674 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
11675 {
11676 dout(10) << "got missing " << oid << " v " << v << dendl;
11677 pg_log.recover_got(oid, v, info);
11678 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
11679 dout(10) << "last_complete now " << info.last_complete
11680 << " log.complete_to " << pg_log.get_log().complete_to->version
11681 << dendl;
11682 } else {
11683 dout(10) << "last_complete now " << info.last_complete
11684 << " log.complete_to at end" << dendl;
11685 // below is not true in the repair case.
11686 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
11687 ceph_assert(info.last_complete == info.last_update);
11688 }
11689 }
11690
11691 void PrimaryLogPG::primary_failed(const hobject_t &soid)
11692 {
11693 list<pg_shard_t> fl = { pg_whoami };
11694 failed_push(fl, soid);
11695 }
11696
11697 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
11698 {
11699 dout(20) << __func__ << ": " << soid << dendl;
11700 ceph_assert(recovering.count(soid));
11701 auto obc = recovering[soid];
11702 if (obc) {
11703 list<OpRequestRef> blocked_ops;
11704 obc->drop_recovery_read(&blocked_ops);
11705 requeue_ops(blocked_ops);
11706 }
11707 recovering.erase(soid);
11708 for (auto&& i : from)
11709 missing_loc.remove_location(soid, i);
11710 dout(0) << __func__ << " " << soid << " from shard " << from
11711 << ", reps on " << missing_loc.get_locations(soid)
11712 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
11713 finish_recovery_op(soid); // close out this attempt
11714 }
11715
11716 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
11717 {
11718 eversion_t v;
11719 pg_missing_item pmi;
11720 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
11721 ceph_assert(is_missing);
11722 v = pmi.have;
11723 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
11724
11725 ceph_assert(!acting_recovery_backfill.empty());
11726 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
11727 i != acting_recovery_backfill.end();
11728 ++i) {
11729 if (*i == get_primary()) continue;
11730 pg_shard_t peer = *i;
11731 if (!peer_missing[peer].is_missing(oid)) {
11732 continue;
11733 }
11734 eversion_t h = peer_missing[peer].get_items().at(oid).have;
11735 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
11736 if (h > v)
11737 v = h;
11738 }
11739
11740 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
11741 return v;
11742 }
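// Worked example for pick_newest_available() (illustrative): if the local
// missing entry records have = 5'10 while two peers report have = 5'12 and
// have = 5'8 for the same oid, the chosen revert target is 5'12, i.e. the
// newest version that some shard in acting_recovery_backfill still has.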
11743
11744 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
11745 {
11746 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
11747 op->get_req());
11748 ceph_assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
11749 ObjectStore::Transaction t;
11750 boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
11751 if (m->pg_trim_to != eversion_t())
11752 op_trim_to = m->pg_trim_to;
11753 if (m->pg_roll_forward_to != eversion_t())
11754 op_roll_forward_to = m->pg_roll_forward_to;
11755
11756 dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
11757
11758 append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
11759 eversion_t new_lcod = info.last_complete;
11760
11761 Context *complete = new FunctionContext(
11762 [=](int) {
11763 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
11764 op->get_req());
11765 lock();
11766 if (!pg_has_reset_since(msg->get_epoch())) {
11767 update_last_complete_ondisk(new_lcod);
11768 MOSDPGUpdateLogMissingReply *reply =
11769 new MOSDPGUpdateLogMissingReply(
11770 spg_t(info.pgid.pgid, primary_shard().shard),
11771 pg_whoami.shard,
11772 msg->get_epoch(),
11773 msg->min_epoch,
11774 msg->get_tid(),
11775 new_lcod);
11776 reply->set_priority(CEPH_MSG_PRIO_HIGH);
11777 msg->get_connection()->send_message(reply);
11778 }
11779 unlock();
11780 });
11781
11782 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
11783 t.register_on_commit(complete);
11784 } else {
11785 /* Hack to work around the fact that ReplicatedBackend sends
11786 * ack+commit if commit happens first
11787 *
11788 * This behavior is no longer necessary, but we preserve it so old
11789 * primaries can keep their repops in order */
11790 if (pool.info.is_erasure()) {
11791 t.register_on_complete(complete);
11792 } else {
11793 t.register_on_commit(complete);
11794 }
11795 }
11796 int tr = osd->store->queue_transaction(
11797 ch,
11798 std::move(t),
11799 nullptr);
11800 ceph_assert(tr == 0);
11801 op_applied(info.last_update);
11802 }
11803
11804 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
11805 {
11806 const MOSDPGUpdateLogMissingReply *m =
11807 static_cast<const MOSDPGUpdateLogMissingReply*>(
11808 op->get_req());
11809 dout(20) << __func__ << " got reply from "
11810 << m->get_from() << dendl;
11811
11812 auto it = log_entry_update_waiting_on.find(m->get_tid());
11813 if (it != log_entry_update_waiting_on.end()) {
11814 if (it->second.waiting_on.count(m->get_from())) {
11815 it->second.waiting_on.erase(m->get_from());
11816 if (m->last_complete_ondisk != eversion_t()) {
11817 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
11818 }
11819 } else {
11820 osd->clog->error()
11821 << info.pgid << " got reply "
11822 << *m << " from shard we are not waiting for "
11823 << m->get_from();
11824 }
11825
11826 if (it->second.waiting_on.empty()) {
11827 repop_all_committed(it->second.repop.get());
11828 log_entry_update_waiting_on.erase(it);
11829 }
11830 } else {
11831 osd->clog->error()
11832 << info.pgid << " got reply "
11833 << *m << " on unknown tid " << m->get_tid();
11834 }
11835 }
11836
11837 /* Mark all unfound objects as lost.
11838 */
11839 void PrimaryLogPG::mark_all_unfound_lost(
11840 int what,
11841 ConnectionRef con,
11842 ceph_tid_t tid)
11843 {
11844 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
11845 list<hobject_t> oids;
11846
11847 dout(30) << __func__ << ": log before:\n";
11848 pg_log.get_log().print(*_dout);
11849 *_dout << dendl;
11850
11851 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
11852
11853 utime_t mtime = ceph_clock_now();
11854 map<hobject_t, pg_missing_item>::const_iterator m =
11855 missing_loc.get_needs_recovery().begin();
11856 map<hobject_t, pg_missing_item>::const_iterator mend =
11857 missing_loc.get_needs_recovery().end();
11858
11859 ObcLockManager manager;
11860 eversion_t v = get_next_version();
11861 v.epoch = get_osdmap_epoch();
11862 uint64_t num_unfound = missing_loc.num_unfound();
11863 while (m != mend) {
11864 const hobject_t &oid(m->first);
11865 if (!missing_loc.is_unfound(oid)) {
11866 // We only care about unfound objects
11867 ++m;
11868 continue;
11869 }
11870
11871 ObjectContextRef obc;
11872 eversion_t prev;
11873
11874 switch (what) {
11875 case pg_log_entry_t::LOST_MARK:
11876 ceph_abort_msg("actually, not implemented yet!");
11877 break;
11878
11879 case pg_log_entry_t::LOST_REVERT:
11880 prev = pick_newest_available(oid);
11881 if (prev > eversion_t()) {
11882 // log it
11883 pg_log_entry_t e(
11884 pg_log_entry_t::LOST_REVERT, oid, v,
11885 m->second.need, 0, osd_reqid_t(), mtime, 0);
11886 e.reverting_to = prev;
11887 e.mark_unrollbackable();
11888 log_entries.push_back(e);
11889 dout(10) << e << dendl;
11890
11891 // we are now missing the new version; recovery code will sort it out.
11892 ++v.version;
11893 ++m;
11894 break;
11895 }
11896
11897 case pg_log_entry_t::LOST_DELETE:
11898 {
11899 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
11900 0, osd_reqid_t(), mtime, 0);
11901 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
11902 if (pool.info.require_rollback()) {
11903 e.mod_desc.try_rmobject(v.version);
11904 } else {
11905 e.mark_unrollbackable();
11906 }
11907 } // otherwise, just do what we used to do
11908 dout(10) << e << dendl;
11909 log_entries.push_back(e);
11910 oids.push_back(oid);
11911
11912 // If a context is found, mark the object as deleted in case
11913 // we are racing with a new creation. This can happen if the
11914 // object was lost and the primary saw EIO.
11915 obc = object_contexts.lookup(oid);
11916 if (obc)
11917 obc->obs.exists = false;
11918
11919 ++v.version;
11920 ++m;
11921 }
11922 break;
11923
11924 default:
11925 ceph_abort();
11926 }
11927 }
11928
11929 info.stats.stats_invalid = true;
11930
11931 submit_log_entries(
11932 log_entries,
11933 std::move(manager),
11934 boost::optional<std::function<void(void)> >(
11935 [this, oids, con, num_unfound, tid]() {
11936 if (perform_deletes_during_peering()) {
11937 for (auto oid : oids) {
11938 // clear old locations - merge_new_log_entries will have
11939 // handled rebuilding missing_loc for each of these
11940 // objects if we have the RECOVERY_DELETES flag
11941 missing_loc.recovered(oid);
11942 }
11943 }
11944
11945 if (is_recovery_unfound()) {
11946 queue_peering_event(
11947 PGPeeringEventRef(
11948 std::make_shared<PGPeeringEvent>(
11949 get_osdmap_epoch(),
11950 get_osdmap_epoch(),
11951 DoRecovery())));
11952 } else if (is_backfill_unfound()) {
11953 queue_peering_event(
11954 PGPeeringEventRef(
11955 std::make_shared<PGPeeringEvent>(
11956 get_osdmap_epoch(),
11957 get_osdmap_epoch(),
11958 RequestBackfill())));
11959 } else {
11960 queue_recovery();
11961 }
11962
11963 stringstream ss;
11964 ss << "pg has " << num_unfound
11965 << " objects unfound and apparently lost marking";
11966 string rs = ss.str();
11967 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
11968 osd->clog->info() << rs;
11969 if (con) {
11970 MCommandReply *reply = new MCommandReply(0, rs);
11971 reply->set_tid(tid);
11972 con->send_message(reply);
11973 }
11974 }),
11975 OpRequestRef());
11976 }
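// Summary of the modes handled above (illustrative): LOST_REVERT reverts
// each unfound object to the newest version still available on some shard
// (pick_newest_available()); when no prior version exists it falls through
// to LOST_DELETE, which logs a delete so the object is removed everywhere.
// LOST_MARK remains unimplemented and aborts.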
11977
11978 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
11979 {
11980 ceph_assert(repop_queue.empty());
11981 }
11982
11983 /*
11984 * pg status change notification
11985 */
11986
11987 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
11988 {
11989 list<OpRequestRef> rq;
11990
11991 // apply all repops
11992 while (!repop_queue.empty()) {
11993 RepGather *repop = repop_queue.front();
11994 repop_queue.pop_front();
11995 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11996 repop->rep_aborted = true;
11997 repop->on_committed.clear();
11998 repop->on_success.clear();
11999
12000 if (requeue) {
12001 if (repop->op) {
12002 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
12003 rq.push_back(repop->op);
12004 repop->op = OpRequestRef();
12005 }
12006
12007 // also requeue any dups, interleaved into position
12008 auto p = waiting_for_ondisk.find(repop->v);
12009 if (p != waiting_for_ondisk.end()) {
12010 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
12011 for (auto& i : p->second) {
12012 rq.push_back(std::get<0>(i));
12013 }
12014 waiting_for_ondisk.erase(p);
12015 }
12016 }
12017
12018 remove_repop(repop);
12019 }
12020
12021 ceph_assert(repop_queue.empty());
12022
12023 if (requeue) {
12024 requeue_ops(rq);
12025 if (!waiting_for_ondisk.empty()) {
12026 for (auto& i : waiting_for_ondisk) {
12027 for (auto& j : i.second) {
12028 derr << __func__ << ": op " << *(std::get<0>(j)->get_req())
12029 << " waiting on " << i.first << dendl;
12030 }
12031 }
12032 ceph_assert(waiting_for_ondisk.empty());
12033 }
12034 }
12035
12036 waiting_for_ondisk.clear();
12037 }
12038
12039 void PrimaryLogPG::on_flushed()
12040 {
12041 ceph_assert(flushes_in_progress > 0);
12042 flushes_in_progress--;
12043 if (flushes_in_progress == 0) {
12044 requeue_ops(waiting_for_flush);
12045 }
12046 if (!is_peered() || !is_primary()) {
12047 pair<hobject_t, ObjectContextRef> i;
12048 while (object_contexts.get_next(i.first, &i)) {
12049 derr << __func__ << ": object " << i.first << " obc still alive" << dendl;
12050 }
12051 ceph_assert(object_contexts.empty());
12052 }
12053 }
12054
12055 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
12056 {
12057 dout(10) << __func__ << dendl;
12058
12059 // adjust info to backfill
12060 info.set_last_backfill(hobject_t());
12061 pg_log.reset_backfill();
12062 dirty_info = true;
12063
12064 // clear log
12065 PGLogEntryHandler rollbacker{this, t};
12066 pg_log.roll_forward(&rollbacker);
12067
12068 on_shutdown();
12069 }
12070
12071 void PrimaryLogPG::clear_async_reads()
12072 {
12073 dout(10) << __func__ << dendl;
12074 for(auto& i : in_progress_async_reads) {
12075 dout(10) << "clear ctx: "
12076 << "OpRequestRef " << i.first
12077 << " OpContext " << i.second
12078 << dendl;
12079 close_op_ctx(i.second);
12080 }
12081 }
12082
12083 void PrimaryLogPG::clear_cache()
12084 {
12085 object_contexts.clear();
12086 }
12087
12088 void PrimaryLogPG::on_shutdown()
12089 {
12090 dout(10) << __func__ << dendl;
12091
12092 // handles queue races
12093 deleting = true;
12094
12095 if (recovery_queued) {
12096 recovery_queued = false;
12097 osd->clear_queued_recovery(this);
12098 }
12099
12100 clear_scrub_reserved();
12101 scrub_clear_state();
12102
12103 unreg_next_scrub();
12104
12105 vector<ceph_tid_t> tids;
12106 cancel_copy_ops(false, &tids);
12107 cancel_flush_ops(false, &tids);
12108 cancel_proxy_ops(false, &tids);
12109 osd->objecter->op_cancel(tids, -ECANCELED);
12110
12111 apply_and_flush_repops(false);
12112 cancel_log_updates();
12113 // we must remove PGRefs, so do this prior to the release_backoffs() callers
12114 clear_backoffs();
12115 // clean up snap trim references
12116 snap_trimmer_machine.process_event(Reset());
12117
12118 pgbackend->on_change();
12119
12120 context_registry_on_change();
12121 object_contexts.clear();
12122
12123 clear_async_reads();
12124
12125 osd->remote_reserver.cancel_reservation(info.pgid);
12126 osd->local_reserver.cancel_reservation(info.pgid);
12127
12128 clear_primary_state();
12129 cancel_recovery();
12130
12131 if (is_primary()) {
12132 osd->clear_ready_to_merge(this);
12133 }
12134 }
12135
12136 void PrimaryLogPG::on_activate()
12137 {
12138 // all clean?
12139 if (needs_recovery()) {
12140 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
12141 queue_peering_event(
12142 PGPeeringEventRef(
12143 std::make_shared<PGPeeringEvent>(
12144 get_osdmap_epoch(),
12145 get_osdmap_epoch(),
12146 DoRecovery())));
12147 } else if (needs_backfill()) {
12148 dout(10) << "activate queueing backfill" << dendl;
12149 queue_peering_event(
12150 PGPeeringEventRef(
12151 std::make_shared<PGPeeringEvent>(
12152 get_osdmap_epoch(),
12153 get_osdmap_epoch(),
12154 RequestBackfill())));
12155 } else {
12156 dout(10) << "activate all replicas clean, no recovery" << dendl;
12157 eio_errors_to_process = false;
12158 queue_peering_event(
12159 PGPeeringEventRef(
12160 std::make_shared<PGPeeringEvent>(
12161 get_osdmap_epoch(),
12162 get_osdmap_epoch(),
12163 AllReplicasRecovered())));
12164 }
12165
12166 publish_stats_to_osd();
12167
12168 if (!backfill_targets.empty()) {
12169 last_backfill_started = earliest_backfill();
12170 new_backfill = true;
12171 ceph_assert(!last_backfill_started.is_max());
12172 dout(5) << __func__ << ": bft=" << backfill_targets
12173 << " from " << last_backfill_started << dendl;
12174 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12175 i != backfill_targets.end();
12176 ++i) {
12177 dout(5) << "target shard " << *i
12178 << " from " << peer_info[*i].last_backfill
12179 << dendl;
12180 }
12181 }
12182
12183 hit_set_setup();
12184 agent_setup();
12185 }
12186
12187 void PrimaryLogPG::_on_new_interval()
12188 {
12189 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
12190 if (!pg_log.get_missing().may_include_deletes &&
12191 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
12192 pg_log.rebuild_missing_set_with_deletes(osd->store, ch, info);
12193 }
12194 ceph_assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
12195 }
12196
12197 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
12198 {
12199 dout(10) << __func__ << dendl;
12200
12201 if (hit_set && hit_set->insert_count() == 0) {
12202 dout(20) << " discarding empty hit_set" << dendl;
12203 hit_set_clear();
12204 }
12205
12206 if (recovery_queued) {
12207 recovery_queued = false;
12208 osd->clear_queued_recovery(this);
12209 }
12210
12211 // requeue everything in the reverse of the order in which it
12212 // should be reexamined.
12213 requeue_ops(waiting_for_peered);
12214 requeue_ops(waiting_for_flush);
12215 requeue_ops(waiting_for_active);
12216
12217 clear_scrub_reserved();
12218
12219 vector<ceph_tid_t> tids;
12220 cancel_copy_ops(is_primary(), &tids);
12221 cancel_flush_ops(is_primary(), &tids);
12222 cancel_proxy_ops(is_primary(), &tids);
12223 osd->objecter->op_cancel(tids, -ECANCELED);
12224
12225 // requeue object waiters
12226 for (auto& p : waiting_for_unreadable_object) {
12227 release_backoffs(p.first);
12228 }
12229 if (is_primary()) {
12230 requeue_object_waiters(waiting_for_unreadable_object);
12231 } else {
12232 waiting_for_unreadable_object.clear();
12233 }
12234 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
12235 p != waiting_for_degraded_object.end();
12236 waiting_for_degraded_object.erase(p++)) {
12237 release_backoffs(p->first);
12238 if (is_primary())
12239 requeue_ops(p->second);
12240 else
12241 p->second.clear();
12242 finish_degraded_object(p->first);
12243 }
12244
12245 // requeues waiting_for_scrub
12246 scrub_clear_state();
12247
12248 for (auto p = waiting_for_blocked_object.begin();
12249 p != waiting_for_blocked_object.end();
12250 waiting_for_blocked_object.erase(p++)) {
12251 if (is_primary())
12252 requeue_ops(p->second);
12253 else
12254 p->second.clear();
12255 }
12256 for (auto i = callbacks_for_degraded_object.begin();
12257 i != callbacks_for_degraded_object.end();
12258 ) {
12259 finish_degraded_object((i++)->first);
12260 }
12261 ceph_assert(callbacks_for_degraded_object.empty());
12262
12263 if (is_primary()) {
12264 requeue_ops(waiting_for_cache_not_full);
12265 } else {
12266 waiting_for_cache_not_full.clear();
12267 }
12268 objects_blocked_on_cache_full.clear();
12269
12270 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
12271 in_progress_async_reads.begin();
12272 i != in_progress_async_reads.end();
12273 in_progress_async_reads.erase(i++)) {
12274 close_op_ctx(i->second);
12275 if (is_primary())
12276 requeue_op(i->first);
12277 }
12278
12279 // this will requeue ops we were working on but didn't finish, and
12280 // any dups
12281 apply_and_flush_repops(is_primary());
12282 cancel_log_updates();
12283
12284 // do this *after* apply_and_flush_repops so that we catch any newly
12285 // registered watches.
12286 context_registry_on_change();
12287
12288 pgbackend->on_change_cleanup(t);
12289 scrubber.cleanup_store(t);
12290 pgbackend->on_change();
12291
12292 // clear snap_trimmer state
12293 snap_trimmer_machine.process_event(Reset());
12294
12295 debug_op_order.clear();
12296 unstable_stats.clear();
12297
12298 // we don't want to cache object_contexts through the interval change
12299 // NOTE: we actually assert that all currently live references are dead
12300 // by the time the flush for the next interval completes.
12301 object_contexts.clear();
12302
12303 // should have been cleared above by finishing all of the degraded objects
12304 ceph_assert(objects_blocked_on_degraded_snap.empty());
12305 }
12306
12307 void PrimaryLogPG::on_role_change()
12308 {
12309 dout(10) << __func__ << dendl;
12310 if (get_role() != 0 && hit_set) {
12311 dout(10) << " clearing hit set" << dendl;
12312 hit_set_clear();
12313 }
12314 }
12315
12316 void PrimaryLogPG::on_pool_change()
12317 {
12318 dout(10) << __func__ << dendl;
12319 // requeue cache full waiters just in case the cache_mode is
12320 // changing away from writeback mode. note that if we are not
12321 // active the normal requeuing machinery is sufficient (and properly
12322 // ordered).
12323 if (is_active() &&
12324 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
12325 !waiting_for_cache_not_full.empty()) {
12326 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
12327 << dendl;
12328 requeue_ops(waiting_for_cache_not_full);
12329 objects_blocked_on_cache_full.clear();
12330 }
12331 hit_set_setup();
12332 agent_setup();
12333 }
12334
12335 // clear state. called on recovery completion AND cancellation.
12336 void PrimaryLogPG::_clear_recovery_state()
12337 {
12338 missing_loc.clear();
12339 #ifdef DEBUG_RECOVERY_OIDS
12340 recovering_oids.clear();
12341 #endif
12342 last_backfill_started = hobject_t();
12343 set<hobject_t>::iterator i = backfills_in_flight.begin();
12344 while (i != backfills_in_flight.end()) {
12345 ceph_assert(recovering.count(*i));
12346 backfills_in_flight.erase(i++);
12347 }
12348
12349 list<OpRequestRef> blocked_ops;
12350 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
12351 i != recovering.end();
12352 recovering.erase(i++)) {
12353 if (i->second) {
12354 i->second->drop_recovery_read(&blocked_ops);
12355 requeue_ops(blocked_ops);
12356 }
12357 }
12358 ceph_assert(backfills_in_flight.empty());
12359 pending_backfill_updates.clear();
12360 ceph_assert(recovering.empty());
12361 pgbackend->clear_recovery_state();
12362 }
12363
12364 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
12365 {
12366 dout(20) << __func__ << ": " << soid << dendl;
12367 ceph_assert(recovering.count(soid));
12368 ObjectContextRef obc = recovering[soid];
12369 if (obc) {
12370 list<OpRequestRef> blocked_ops;
12371 obc->drop_recovery_read(&blocked_ops);
12372 requeue_ops(blocked_ops);
12373 }
12374 recovering.erase(soid);
12375 finish_recovery_op(soid);
12376 release_backoffs(soid);
12377 if (waiting_for_degraded_object.count(soid)) {
12378 dout(20) << " kicking degraded waiters on " << soid << dendl;
12379 requeue_ops(waiting_for_degraded_object[soid]);
12380 waiting_for_degraded_object.erase(soid);
12381 }
12382 if (waiting_for_unreadable_object.count(soid)) {
12383 dout(20) << " kicking unreadable waiters on " << soid << dendl;
12384 requeue_ops(waiting_for_unreadable_object[soid]);
12385 waiting_for_unreadable_object.erase(soid);
12386 }
12387 if (is_missing_object(soid))
12388 pg_log.set_last_requested(0); // get recover_primary to start over
12389 finish_degraded_object(soid);
12390 }
12391
12392 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
12393 {
12394 /*
12395 * check that any peers we are planning to (or currently) pulling
12396 * objects from are dealt with.
12397 */
12398 missing_loc.check_recovery_sources(osdmap);
12399 pgbackend->check_recovery_sources(osdmap);
12400
12401 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
12402 i != peer_log_requested.end();
12403 ) {
12404 if (!osdmap->is_up(i->osd)) {
12405 dout(10) << "peer_log_requested removing " << *i << dendl;
12406 peer_log_requested.erase(i++);
12407 } else {
12408 ++i;
12409 }
12410 }
12411
12412 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
12413 i != peer_missing_requested.end();
12414 ) {
12415 if (!osdmap->is_up(i->osd)) {
12416 dout(10) << "peer_missing_requested removing " << *i << dendl;
12417 peer_missing_requested.erase(i++);
12418 } else {
12419 ++i;
12420 }
12421 }
12422 }
12423
12424 bool PrimaryLogPG::start_recovery_ops(
12425 uint64_t max,
12426 ThreadPool::TPHandle &handle,
12427 uint64_t *ops_started)
12428 {
12429 uint64_t& started = *ops_started;
12430 started = 0;
12431 bool work_in_progress = false;
12432 bool recovery_started = false;
12433 ceph_assert(is_primary());
12434 ceph_assert(is_peered());
12435 ceph_assert(!is_deleting());
12436
12437 ceph_assert(recovery_queued);
12438 recovery_queued = false;
12439
12440 if (!state_test(PG_STATE_RECOVERING) &&
12441 !state_test(PG_STATE_BACKFILLING)) {
12442 /* TODO: I think this case is broken and will make do_recovery()
12443 * unhappy since we're returning false */
12444 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
12445 return have_unfound();
12446 }
12447
12448 const auto &missing = pg_log.get_missing();
12449
12450 unsigned int num_missing = missing.num_missing();
12451 uint64_t num_unfound = get_num_unfound();
12452
12453 if (num_missing == 0) {
12454 info.last_complete = info.last_update;
12455 }
12456
12457 if (num_missing == num_unfound) {
12458 // All of the missing objects we have are unfound.
12459 // Recover the replicas.
12460 started = recover_replicas(max, handle, &recovery_started);
12461 }
12462 if (!started) {
12463 // We still have missing objects that we should grab from replicas.
12464 started += recover_primary(max, handle);
12465 }
12466 if (!started && num_unfound != get_num_unfound()) {
12467 // second chance to recover replicas
12468 started = recover_replicas(max, handle, &recovery_started);
12469 }
12470
12471 if (started || recovery_started)
12472 work_in_progress = true;
12473
12474 bool deferred_backfill = false;
12475 if (recovering.empty() &&
12476 state_test(PG_STATE_BACKFILLING) &&
12477 !backfill_targets.empty() && started < max &&
12478 missing.num_missing() == 0 &&
12479 waiting_on_backfill.empty()) {
12480 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
12481 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
12482 deferred_backfill = true;
12483 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
12484 !is_degraded()) {
12485 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
12486 deferred_backfill = true;
12487 } else if (!backfill_reserved) {
12488 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
12489 if (!backfill_reserving) {
12490 dout(10) << "queueing RequestBackfill" << dendl;
12491 backfill_reserving = true;
12492 queue_peering_event(
12493 PGPeeringEventRef(
12494 std::make_shared<PGPeeringEvent>(
12495 get_osdmap_epoch(),
12496 get_osdmap_epoch(),
12497 RequestBackfill())));
12498 }
12499 deferred_backfill = true;
12500 } else {
12501 started += recover_backfill(max - started, handle, &work_in_progress);
12502 }
12503 }
12504
12505 dout(10) << " started " << started << dendl;
12506 osd->logger->inc(l_osd_rop, started);
12507
12508 if (!recovering.empty() ||
12509 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
12510 return !work_in_progress && have_unfound();
12511
12512 ceph_assert(recovering.empty());
12513 ceph_assert(recovery_ops_active == 0);
12514
12515 dout(10) << __func__ << " needs_recovery: "
12516 << missing_loc.get_needs_recovery()
12517 << dendl;
12518 dout(10) << __func__ << " missing_loc: "
12519 << missing_loc.get_missing_locs()
12520 << dendl;
12521 int unfound = get_num_unfound();
12522 if (unfound) {
12523 dout(10) << " still have " << unfound << " unfound" << dendl;
12524 return true;
12525 }
12526
12527 if (missing.num_missing() > 0) {
12528 // this shouldn't happen!
12529 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
12530 << missing.num_missing() << ": " << missing.get_items();
12531 return false;
12532 }
12533
12534 if (needs_recovery()) {
12535 // this shouldn't happen!
12536 // We already checked num_missing() so we must have missing replicas
12537 osd->clog->error() << info.pgid
12538 << " Unexpected Error: recovery ending with missing replicas";
12539 return false;
12540 }
12541
12542 if (state_test(PG_STATE_RECOVERING)) {
12543 state_clear(PG_STATE_RECOVERING);
12544 state_clear(PG_STATE_FORCED_RECOVERY);
12545 if (needs_backfill()) {
12546 dout(10) << "recovery done, queuing backfill" << dendl;
12547 queue_peering_event(
12548 PGPeeringEventRef(
12549 std::make_shared<PGPeeringEvent>(
12550 get_osdmap_epoch(),
12551 get_osdmap_epoch(),
12552 RequestBackfill())));
12553 } else {
12554 dout(10) << "recovery done, no backfill" << dendl;
12555 eio_errors_to_process = false;
12556 state_clear(PG_STATE_FORCED_BACKFILL);
12557 queue_peering_event(
12558 PGPeeringEventRef(
12559 std::make_shared<PGPeeringEvent>(
12560 get_osdmap_epoch(),
12561 get_osdmap_epoch(),
12562 AllReplicasRecovered())));
12563 }
12564 } else { // backfilling
12565 state_clear(PG_STATE_BACKFILLING);
12566 state_clear(PG_STATE_FORCED_BACKFILL);
12567 state_clear(PG_STATE_FORCED_RECOVERY);
12568 dout(10) << "recovery done, backfill done" << dendl;
12569 eio_errors_to_process = false;
12570 queue_peering_event(
12571 PGPeeringEventRef(
12572 std::make_shared<PGPeeringEvent>(
12573 get_osdmap_epoch(),
12574 get_osdmap_epoch(),
12575 Backfilled())));
12576 }
12577
12578 return false;
12579 }
12580
12581 /**
12582 * start up to max recovery ops for objects missing on the primary.
12583 * return the number of ops started.
12584 */
12585 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
12586 {
12587 ceph_assert(is_primary());
12588
12589 const auto &missing = pg_log.get_missing();
12590
12591 dout(10) << __func__ << " recovering " << recovering.size()
12592 << " in pg,"
12593 << " missing " << missing << dendl;
12594
12595 dout(25) << __func__ << " " << missing.get_items() << dendl;
12596
12597 // look at log!
12598 pg_log_entry_t *latest = 0;
12599 unsigned started = 0;
12600 int skipped = 0;
12601
12602 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12603 map<version_t, hobject_t>::const_iterator p =
12604 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
12605 while (p != missing.get_rmissing().end()) {
12606 handle.reset_tp_timeout();
12607 hobject_t soid;
12608 version_t v = p->first;
12609
12610 auto it_objects = pg_log.get_log().objects.find(p->second);
12611 if (it_objects != pg_log.get_log().objects.end()) {
12612 latest = it_objects->second;
12613 ceph_assert(latest->is_update() || latest->is_delete());
12614 soid = latest->soid;
12615 } else {
12616 latest = 0;
12617 soid = p->second;
12618 }
12619 const pg_missing_item& item = missing.get_items().find(p->second)->second;
12620 ++p;
12621
12622 hobject_t head = soid.get_head();
12623
12624 eversion_t need = item.need;
12625
12626 dout(10) << __func__ << " "
12627 << soid << " " << item.need
12628 << (missing.is_missing(soid) ? " (missing)":"")
12629 << (missing.is_missing(head) ? " (missing head)":"")
12630 << (recovering.count(soid) ? " (recovering)":"")
12631 << (recovering.count(head) ? " (recovering head)":"")
12632 << dendl;
12633
12634 if (latest) {
12635 switch (latest->op) {
12636 case pg_log_entry_t::CLONE:
12637 /*
12638 * Handling for this special case removed for now, until we
12639 * can correctly construct an accurate SnapSet from the old
12640 * one.
12641 */
12642 break;
12643
12644 case pg_log_entry_t::LOST_REVERT:
12645 {
12646 if (item.have == latest->reverting_to) {
12647 ObjectContextRef obc = get_object_context(soid, true);
12648
12649 if (obc->obs.oi.version == latest->version) {
12650 // I'm already reverting
12651 dout(10) << " already reverting " << soid << dendl;
12652 } else {
12653 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
12654 obc->obs.oi.version = latest->version;
12655
12656 ObjectStore::Transaction t;
12657 bufferlist b2;
12658 obc->obs.oi.encode(
12659 b2,
12660 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12661 ceph_assert(!pool.info.require_rollback());
12662 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
12663
12664 recover_got(soid, latest->version);
12665 missing_loc.add_location(soid, pg_whoami);
12666
12667 ++active_pushes;
12668
12669 t.register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
12670 t.register_on_commit(new C_OSD_CommittedPushedObject(
12671 this,
12672 get_osdmap_epoch(),
12673 info.last_complete));
12674 osd->store->queue_transaction(ch, std::move(t));
12675 continue;
12676 }
12677 } else {
12678 /*
12679 * Pull the old version of the object. Update missing_loc here to have the location
12680 * of the version we want.
12681 *
12682 * This doesn't use the usual missing_loc paths, but that's okay:
12683 * - if we have it locally, we hit the case above, and go from there.
12684 * - if we don't, we always pass through this case during recovery and set up the location
12685 * properly.
12686 * - this way we don't need to mangle the missing code to be general about needing an old
12687 * version...
12688 */
12689 eversion_t alternate_need = latest->reverting_to;
12690 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
12691
12692 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
12693 p != peer_missing.end();
12694 ++p)
12695 if (p->second.is_missing(soid, need) &&
12696 p->second.get_items().at(soid).have == alternate_need) {
12697 missing_loc.add_location(soid, p->first);
12698 }
12699 dout(10) << " will pull " << alternate_need << " or " << need
12700 << " from one of " << missing_loc.get_locations(soid)
12701 << dendl;
12702 }
12703 }
12704 break;
12705 }
12706 }
12707
12708 if (!recovering.count(soid)) {
12709 if (recovering.count(head)) {
12710 ++skipped;
12711 } else {
12712 int r = recover_missing(
12713 soid, need, get_recovery_op_priority(), h);
12714 switch (r) {
12715 case PULL_YES:
12716 ++started;
12717 break;
12718 case PULL_HEAD:
12719 ++started; // fall through: the head was pulled, this object is still skipped
12720 case PULL_NONE:
12721 ++skipped;
12722 break;
12723 default:
12724 ceph_abort();
12725 }
12726 if (started >= max)
12727 break;
12728 }
12729 }
12730
12731 // only advance last_requested if we haven't skipped anything
12732 if (!skipped)
12733 pg_log.set_last_requested(v);
12734 }
12735
12736 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12737 return started;
12738 }
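// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the last_requested cursor used by recover_primary()
// above. rmissing is keyed by version, so a pass resumes at the cursor via
// lower_bound(), and the cursor only advances while nothing has been
// skipped. Names below are hypothetical.
#if 0
#include <cstdint>
#include <functional>
#include <map>
#include <string>

using version_t = uint64_t;

version_t request_some(
    const std::map<version_t, std::string>& rmissing,  // version -> oid
    version_t last_requested, unsigned max,
    const std::function<bool(const std::string&)>& busy) {
  unsigned started = 0, skipped = 0;
  for (auto p = rmissing.lower_bound(last_requested);
       p != rmissing.end() && started < max; ) {
    version_t v = p->first;
    const std::string& oid = p->second;
    ++p;
    if (busy(oid)) {
      ++skipped;
      continue;
    }
    ++started;
    if (!skipped)
      last_requested = v;  // safe: every version <= v has been handled
  }
  return last_requested;
}
#endif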
12739
12740 bool PrimaryLogPG::primary_error(
12741 const hobject_t& soid, eversion_t v)
12742 {
12743 pg_log.missing_add(soid, v, eversion_t());
12744 pg_log.set_last_requested(0);
12745 missing_loc.remove_location(soid, pg_whoami);
12746 bool uhoh = true;
12747 ceph_assert(!acting_recovery_backfill.empty());
12748 for (set<pg_shard_t>::iterator i = acting_recovery_backfill.begin();
12749 i != acting_recovery_backfill.end();
12750 ++i) {
12751 if (*i == get_primary()) continue;
12752 pg_shard_t peer = *i;
12753 if (!peer_missing[peer].is_missing(soid, v)) {
12754 missing_loc.add_location(soid, peer);
12755 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
12756 << ", there should be a copy on shard " << peer << dendl;
12757 uhoh = false;
12758 }
12759 }
12760 if (uhoh)
12761 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
12762 else
12763 osd->clog->error() << info.pgid << " missing primary copy of " << soid
12764 << ", will try copies on " << missing_loc.get_locations(soid);
12765 return uhoh;
12766 }
12767
12768 int PrimaryLogPG::prep_object_replica_deletes(
12769 const hobject_t& soid, eversion_t v,
12770 PGBackend::RecoveryHandle *h,
12771 bool *work_started)
12772 {
12773 ceph_assert(is_primary());
12774 dout(10) << __func__ << ": on " << soid << dendl;
12775
12776 ObjectContextRef obc = get_object_context(soid, false);
12777 if (obc) {
12778 if (!obc->get_recovery_read()) {
12779 dout(20) << "replica delete delayed on " << soid
12780 << "; could not get rw_manager lock" << dendl;
12781 *work_started = true;
12782 return 0;
12783 } else {
12784 dout(20) << "replica delete got recovery read lock on " << soid
12785 << dendl;
12786 }
12787 }
12788
12789 start_recovery_op(soid);
12790 ceph_assert(!recovering.count(soid));
12791 if (!obc)
12792 recovering.insert(make_pair(soid, ObjectContextRef()));
12793 else
12794 recovering.insert(make_pair(soid, obc));
12795
12796 pgbackend->recover_delete_object(soid, v, h);
12797 return 1;
12798 }
12799
12800 int PrimaryLogPG::prep_object_replica_pushes(
12801 const hobject_t& soid, eversion_t v,
12802 PGBackend::RecoveryHandle *h,
12803 bool *work_started)
12804 {
12805 ceph_assert(is_primary());
12806 dout(10) << __func__ << ": on " << soid << dendl;
12807
12808 // NOTE: we know we will get a valid oloc off disk here.
12809 ObjectContextRef obc = get_object_context(soid, false);
12810 if (!obc) {
12811 primary_error(soid, v);
12812 return 0;
12813 }
12814
12815 if (!obc->get_recovery_read()) {
12816 dout(20) << "recovery delayed on " << soid
12817 << "; could not get rw_manager lock" << dendl;
12818 *work_started = true;
12819 return 0;
12820 } else {
12821 dout(20) << "recovery got recovery read lock on " << soid
12822 << dendl;
12823 }
12824
12825 start_recovery_op(soid);
12826 ceph_assert(!recovering.count(soid));
12827 recovering.insert(make_pair(soid, obc));
12828
12829 /* We need this in case there is an in-progress write on the object. In fact,
12830 * the only possible write is an update to the xattr due to a lost_revert --
12831 * a client write would be blocked since the object is degraded.
12832 * In almost all cases, therefore, this lock should be uncontended.
12833 */
12834 int r = pgbackend->recover_object(
12835 soid,
12836 v,
12837 ObjectContextRef(),
12838 obc, // has snapset context
12839 h);
12840 if (r < 0) {
12841 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
12842 primary_failed(soid);
12843 primary_error(soid, v);
12844 return 0;
12845 }
12846 return 1;
12847 }
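// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the try-lock-or-defer shape of get_recovery_read()
// as used above. Recovery must not block behind an in-flight write, so a
// contended lock defers the push rather than waiting. std::shared_mutex
// stands in for the object's rw manager; names are hypothetical.
#if 0
#include <shared_mutex>

bool try_start_push(std::shared_mutex& obj_lock, bool* work_started) {
  if (!obj_lock.try_lock_shared()) {
    *work_started = true;   // a writer holds it; retry on the next pass
    return false;
  }
  // ... queue the push; the completion callback would release the lock ...
  obj_lock.unlock_shared();
  return true;
}
#endif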
12848
12849 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle,
12850 bool *work_started)
12851 {
12852 dout(10) << __func__ << "(" << max << ")" << dendl;
12853 uint64_t started = 0;
12854
12855 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12856
12857 // this is FAR from an optimal recovery order. pretty lame, really.
12858 ceph_assert(!acting_recovery_backfill.empty());
12859 // choose replicas to recover: the replica with the shortest missing list
12860 // goes first, so we can bring it back to normal ASAP
12861 std::vector<std::pair<unsigned int, pg_shard_t>> replicas_by_num_missing,
12862 async_by_num_missing;
12863 replicas_by_num_missing.reserve(acting_recovery_backfill.size() - 1);
12864 for (auto &p: acting_recovery_backfill) {
12865 if (p == get_primary()) {
12866 continue;
12867 }
12868 auto pm = peer_missing.find(p);
12869 ceph_assert(pm != peer_missing.end());
12870 auto nm = pm->second.num_missing();
12871 if (nm != 0) {
12872 if (async_recovery_targets.count(p)) {
12873 async_by_num_missing.push_back(make_pair(nm, p));
12874 } else {
12875 replicas_by_num_missing.push_back(make_pair(nm, p));
12876 }
12877 }
12878 }
12879 // sort by number of missing objects, in ascending order.
12880 auto func = [](const std::pair<unsigned int, pg_shard_t> &lhs,
12881 const std::pair<unsigned int, pg_shard_t> &rhs) {
12882 return lhs.first < rhs.first;
12883 };
12884 // acting goes first
12885 std::sort(replicas_by_num_missing.begin(), replicas_by_num_missing.end(), func);
12886 // then async_recovery_targets
12887 std::sort(async_by_num_missing.begin(), async_by_num_missing.end(), func);
12888 replicas_by_num_missing.insert(replicas_by_num_missing.end(),
12889 async_by_num_missing.begin(), async_by_num_missing.end());
12890 for (auto &replica: replicas_by_num_missing) {
12891 pg_shard_t &peer = replica.second;
12892 ceph_assert(peer != get_primary());
12893 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
12894 ceph_assert(pm != peer_missing.end());
12895 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
12896 ceph_assert(pi != peer_info.end());
12897 size_t m_sz = pm->second.num_missing();
12898
12899 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
12900 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
12901
12902 // oldest first!
12903 const pg_missing_t &m(pm->second);
12904 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
12905 p != m.get_rmissing().end() && started < max;
12906 ++p) {
12907 handle.reset_tp_timeout();
12908 const hobject_t soid(p->second);
12909
12910 if (missing_loc.is_unfound(soid)) {
12911 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
12912 continue;
12913 }
12914
12915 if (soid > pi->second.last_backfill) {
12916 if (!recovering.count(soid)) {
12917 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
12918 derr << __func__ << ": object added to missing set for backfill, but "
12919 << "is not in recovering, error!" << dendl;
12920 ceph_abort();
12921 }
12922 continue;
12923 }
12924
12925 if (recovering.count(soid)) {
12926 dout(10) << __func__ << ": already recovering " << soid << dendl;
12927 continue;
12928 }
12929
12930 if (missing_loc.is_deleted(soid)) {
12931 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
12932 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12933 started += prep_object_replica_deletes(soid, r->second.need, h, work_started);
12934 continue;
12935 }
12936
12937 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
12938 dout(10) << __func__ << ": " << soid.get_head()
12939 << " still missing on primary" << dendl;
12940 continue;
12941 }
12942
12943 if (pg_log.get_missing().is_missing(soid)) {
12944 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
12945 continue;
12946 }
12947
12948 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
12949 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
12950 started += prep_object_replica_pushes(soid, r->second.need, h, work_started);
12951 }
12952 }
12953
12954 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12955 return started;
12956 }
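// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the replica ordering built in recover_replicas()
// above. Shards are sorted by ascending missing-count so the
// closest-to-clean replica heals first, and acting shards are recovered
// before async recovery targets. Types are simplified and names are
// hypothetical.
#if 0
#include <algorithm>
#include <utility>
#include <vector>

using shard_t = int;
using by_missing_t = std::vector<std::pair<unsigned, shard_t>>;  // {num_missing, shard}

std::vector<shard_t> order_replicas(by_missing_t acting, by_missing_t async) {
  auto cmp = [](const auto& l, const auto& r) { return l.first < r.first; };
  std::sort(acting.begin(), acting.end(), cmp);
  std::sort(async.begin(), async.end(), cmp);
  acting.insert(acting.end(), async.begin(), async.end());  // acting goes first
  std::vector<shard_t> out;
  out.reserve(acting.size());
  for (const auto& p : acting)
    out.push_back(p.second);
  return out;
}
#endif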
12957
12958 hobject_t PrimaryLogPG::earliest_peer_backfill() const
12959 {
12960 hobject_t e = hobject_t::get_max();
12961 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
12962 i != backfill_targets.end();
12963 ++i) {
12964 pg_shard_t peer = *i;
12965 map<pg_shard_t, BackfillInterval>::const_iterator iter =
12966 peer_backfill_info.find(peer);
12967 ceph_assert(iter != peer_backfill_info.end());
12968 if (iter->second.begin < e)
12969 e = iter->second.begin;
12970 }
12971 return e;
12972 }
12973
12974 bool PrimaryLogPG::all_peer_done() const
12975 {
12976 // Primary hasn't got any more objects
12977 ceph_assert(backfill_info.empty());
12978
12979 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
12980 i != backfill_targets.end();
12981 ++i) {
12982 pg_shard_t bt = *i;
12983 map<pg_shard_t, BackfillInterval>::const_iterator piter =
12984 peer_backfill_info.find(bt);
12985 ceph_assert(piter != peer_backfill_info.end());
12986 const BackfillInterval& pbi = piter->second;
12987 // See if peer has more to process
12988 if (!pbi.extends_to_end() || !pbi.empty())
12989 return false;
12990 }
12991 return true;
12992 }
12993
12994 /**
12995 * recover_backfill
12996 *
12997 * Invariants:
12998 *
12999 * backfilled: fully pushed to replica or present in replica's missing set (both
13000 * our copy and theirs).
13001 *
13002 * All objects on a backfill_target in
13003 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
13004 * objects have been actually deleted and all logically-valid objects are replicated.
13005 * There may be PG objects in this interval yet to be backfilled.
13006 *
13007 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
13008 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
13009 *
13010 * For a backfill target, all objects < std::min(peer_backfill_info[target].begin,
13011 * backfill_info.begin) in PG are backfilled. No deleted objects in this
13012 * interval remain on the backfill target.
13013 *
13014 * For a backfill target, all objects <= peer_info[target].last_backfill
13015 * have been backfilled to target
13016 *
13017 * There *MAY* be missing/outdated objects between last_backfill_started and
13018 * std::min(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
13019 * io created objects since the last scan. For this reason, we call
13020 * update_range() again before continuing backfill.
13021 */
13022 uint64_t PrimaryLogPG::recover_backfill(
13023 uint64_t max,
13024 ThreadPool::TPHandle &handle, bool *work_started)
13025 {
13026 dout(10) << __func__ << " (" << max << ")"
13027 << " bft=" << backfill_targets
13028 << " last_backfill_started " << last_backfill_started
13029 << (new_backfill ? " new_backfill":"")
13030 << dendl;
13031 ceph_assert(!backfill_targets.empty());
13032
13033 // Initialize from prior backfill state
13034 if (new_backfill) {
13035 // on_activate() was called prior to getting here
13036 ceph_assert(last_backfill_started == earliest_backfill());
13037 new_backfill = false;
13038
13039 // initialize BackfillIntervals
13040 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13041 i != backfill_targets.end();
13042 ++i) {
13043 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
13044 }
13045 backfill_info.reset(last_backfill_started);
13046
13047 backfills_in_flight.clear();
13048 pending_backfill_updates.clear();
13049 }
13050
13051 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13052 i != backfill_targets.end();
13053 ++i) {
13054 dout(10) << "peer osd." << *i
13055 << " info " << peer_info[*i]
13056 << " interval " << peer_backfill_info[*i].begin
13057 << "-" << peer_backfill_info[*i].end
13058 << " " << peer_backfill_info[*i].objects.size() << " objects"
13059 << dendl;
13060 }
13061
13062 // update our local interval to cope with recent changes
13063 backfill_info.begin = last_backfill_started;
13064 update_range(&backfill_info, handle);
13065
13066 unsigned ops = 0;
13067 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
13068 set<hobject_t> add_to_stat;
13069
13070 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13071 i != backfill_targets.end();
13072 ++i) {
13073 peer_backfill_info[*i].trim_to(
13074 std::max(peer_info[*i].last_backfill, last_backfill_started));
13075 }
13076 backfill_info.trim_to(last_backfill_started);
13077
13078 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
13079 while (ops < max) {
13080 if (backfill_info.begin <= earliest_peer_backfill() &&
13081 !backfill_info.extends_to_end() && backfill_info.empty()) {
13082 hobject_t next = backfill_info.end;
13083 backfill_info.reset(next);
13084 backfill_info.end = hobject_t::get_max();
13085 update_range(&backfill_info, handle);
13086 backfill_info.trim();
13087 }
13088
13089 dout(20) << " my backfill interval " << backfill_info << dendl;
13090
13091 bool sent_scan = false;
13092 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13093 i != backfill_targets.end();
13094 ++i) {
13095 pg_shard_t bt = *i;
13096 BackfillInterval& pbi = peer_backfill_info[bt];
13097
13098 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
13099 if (pbi.begin <= backfill_info.begin &&
13100 !pbi.extends_to_end() && pbi.empty()) {
13101 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
13102 epoch_t e = get_osdmap_epoch();
13103 MOSDPGScan *m = new MOSDPGScan(
13104 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
13105 spg_t(info.pgid.pgid, bt.shard),
13106 pbi.end, hobject_t());
13107 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13108 ceph_assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
13109 waiting_on_backfill.insert(bt);
13110 sent_scan = true;
13111 }
13112 }
13113
13114 // Count simultaneous scans as a single op and let those complete
13115 if (sent_scan) {
13116 ops++;
13117 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
13118 break;
13119 }
13120
13121 if (backfill_info.empty() && all_peer_done()) {
13122 dout(10) << " reached end for both local and all peers" << dendl;
13123 break;
13124 }
13125
13126 // Get the earliest object among the peers to operate on, and
13127 // the set of targets to which that object applies.
13128 hobject_t check = earliest_peer_backfill();
13129
13130 if (check < backfill_info.begin) {
13131
13132 set<pg_shard_t> check_targets;
13133 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13134 i != backfill_targets.end();
13135 ++i) {
13136 pg_shard_t bt = *i;
13137 BackfillInterval& pbi = peer_backfill_info[bt];
13138 if (pbi.begin == check)
13139 check_targets.insert(bt);
13140 }
13141 ceph_assert(!check_targets.empty());
13142
13143 dout(20) << " BACKFILL removing " << check
13144 << " from peers " << check_targets << dendl;
13145 for (set<pg_shard_t>::iterator i = check_targets.begin();
13146 i != check_targets.end();
13147 ++i) {
13148 pg_shard_t bt = *i;
13149 BackfillInterval& pbi = peer_backfill_info[bt];
13150 ceph_assert(pbi.begin == check);
13151
13152 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
13153 pbi.pop_front();
13154 }
13155
13156 last_backfill_started = check;
13157
13158 // Don't increment ops here: deletions are cheap and, unlike
13159 // real recovery ops, are not replied to, and we can't
13160 // increment ops without requeueing ourselves
13161 // for recovery.
13162 } else {
13163 eversion_t& obj_v = backfill_info.objects.begin()->second;
13164
13165 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
13166 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13167 i != backfill_targets.end();
13168 ++i) {
13169 pg_shard_t bt = *i;
13170 BackfillInterval& pbi = peer_backfill_info[bt];
13171 // Find all check peers that have the wrong version
13172 if (check == backfill_info.begin && check == pbi.begin) {
13173 if (pbi.objects.begin()->second != obj_v) {
13174 need_ver_targs.push_back(bt);
13175 } else {
13176 keep_ver_targs.push_back(bt);
13177 }
13178 } else {
13179 pg_info_t& pinfo = peer_info[bt];
13180
13181 // Only include peers whose backfill line we've caught up to;
13182 // otherwise they only appear to be missing this object
13183 // because their pbi.begin > backfill_info.begin.
13184 if (backfill_info.begin > pinfo.last_backfill)
13185 missing_targs.push_back(bt);
13186 else
13187 skip_targs.push_back(bt);
13188 }
13189 }
13190
13191 if (!keep_ver_targs.empty()) {
13192 // These peers have version obj_v
13193 dout(20) << " BACKFILL keeping " << check
13194 << " with ver " << obj_v
13195 << " on peers " << keep_ver_targs << dendl;
13196 //assert(!waiting_for_degraded_object.count(check));
13197 }
13198 if (!need_ver_targs.empty() || !missing_targs.empty()) {
13199 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
13200 ceph_assert(obc);
13201 if (obc->get_recovery_read()) {
13202 if (!need_ver_targs.empty()) {
13203 dout(20) << " BACKFILL replacing " << check
13204 << " with ver " << obj_v
13205 << " to peers " << need_ver_targs << dendl;
13206 }
13207 if (!missing_targs.empty()) {
13208 dout(20) << " BACKFILL pushing " << backfill_info.begin
13209 << " with ver " << obj_v
13210 << " to peers " << missing_targs << dendl;
13211 }
13212 vector<pg_shard_t> all_push = need_ver_targs;
13213 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
13214
13215 handle.reset_tp_timeout();
13216 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
13217 if (r < 0) {
13218 *work_started = true;
13219 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
13220 break;
13221 }
13222 ops++;
13223 } else {
13224 *work_started = true;
13225 dout(20) << "backfill blocking on " << backfill_info.begin
13226 << "; could not get rw_manager lock" << dendl;
13227 break;
13228 }
13229 }
13230 dout(20) << "need_ver_targs=" << need_ver_targs
13231 << " keep_ver_targs=" << keep_ver_targs << dendl;
13232 dout(20) << "backfill_targets=" << backfill_targets
13233 << " missing_targs=" << missing_targs
13234 << " skip_targs=" << skip_targs << dendl;
13235
13236 last_backfill_started = backfill_info.begin;
13237 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
13238 backfill_info.pop_front();
13239 vector<pg_shard_t> check_targets = need_ver_targs;
13240 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
13241 for (vector<pg_shard_t>::iterator i = check_targets.begin();
13242 i != check_targets.end();
13243 ++i) {
13244 pg_shard_t bt = *i;
13245 BackfillInterval& pbi = peer_backfill_info[bt];
13246 pbi.pop_front();
13247 }
13248 }
13249 }
13250
13251 hobject_t backfill_pos =
13252 std::min(backfill_info.begin, earliest_peer_backfill());
13253
13254 for (set<hobject_t>::iterator i = add_to_stat.begin();
13255 i != add_to_stat.end();
13256 ++i) {
13257 ObjectContextRef obc = get_object_context(*i, false);
13258 ceph_assert(obc);
13259 pg_stat_t stat;
13260 add_object_context_to_pg_stat(obc, &stat);
13261 pending_backfill_updates[*i] = stat;
13262 }
13263 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
13264 for (unsigned i = 0; i < to_remove.size(); ++i) {
13265 handle.reset_tp_timeout();
13266 const hobject_t& oid = to_remove[i].get<0>();
13267 eversion_t v = to_remove[i].get<1>();
13268 pg_shard_t peer = to_remove[i].get<2>();
13269 MOSDPGBackfillRemove *m;
13270 auto it = reqs.find(peer);
13271 if (it != reqs.end()) {
13272 m = it->second;
13273 } else {
13274 m = reqs[peer] = new MOSDPGBackfillRemove(
13275 spg_t(info.pgid.pgid, peer.shard),
13276 get_osdmap_epoch());
13277 }
13278 m->ls.push_back(make_pair(oid, v));
13279
13280 if (oid <= last_backfill_started)
13281 pending_backfill_updates[oid]; // add empty stat!
13282 }
13283 for (auto p : reqs) {
13284 osd->send_message_osd_cluster(p.first.osd, p.second,
13285 get_osdmap_epoch());
13286 }
13287
13288 pgbackend->run_recovery_op(h, get_recovery_op_priority());
13289
13290 dout(5) << "backfill_pos is " << backfill_pos << dendl;
13291 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
13292 i != backfills_in_flight.end();
13293 ++i) {
13294 dout(20) << *i << " is still in flight" << dendl;
13295 }
13296
13297 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
13298 backfill_pos : *(backfills_in_flight.begin());
13299 hobject_t new_last_backfill = earliest_backfill();
13300 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
13301 for (map<hobject_t, pg_stat_t>::iterator i =
13302 pending_backfill_updates.begin();
13303 i != pending_backfill_updates.end() &&
13304 i->first < next_backfill_to_complete;
13305 pending_backfill_updates.erase(i++)) {
13306 dout(20) << " pending_backfill_update " << i->first << dendl;
13307 ceph_assert(i->first > new_last_backfill);
13308 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
13309 j != backfill_targets.end();
13310 ++j) {
13311 pg_shard_t bt = *j;
13312 pg_info_t& pinfo = peer_info[bt];
13313 // Add stats to all peers that were missing the object
13314 if (i->first > pinfo.last_backfill)
13315 pinfo.stats.add(i->second);
13316 }
13317 new_last_backfill = i->first;
13318 }
13319 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
13320
13321 ceph_assert(!pending_backfill_updates.empty() ||
13322 new_last_backfill == last_backfill_started);
13323 if (pending_backfill_updates.empty() &&
13324 backfill_pos.is_max()) {
13325 ceph_assert(backfills_in_flight.empty());
13326 new_last_backfill = backfill_pos;
13327 last_backfill_started = backfill_pos;
13328 }
13329 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
13330
13331 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
13332 // all the backfill targets. Otherwise, we will move last_backfill up on
13333 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
13334 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
13335 i != backfill_targets.end();
13336 ++i) {
13337 pg_shard_t bt = *i;
13338 pg_info_t& pinfo = peer_info[bt];
13339
13340 if (new_last_backfill > pinfo.last_backfill) {
13341 pinfo.set_last_backfill(new_last_backfill);
13342 epoch_t e = get_osdmap_epoch();
13343 MOSDPGBackfill *m = NULL;
13344 if (pinfo.last_backfill.is_max()) {
13345 m = new MOSDPGBackfill(
13346 MOSDPGBackfill::OP_BACKFILL_FINISH,
13347 e,
13348 last_peering_reset,
13349 spg_t(info.pgid.pgid, bt.shard));
13350 // Use default priority here, must match sub_op priority
13351 /* pinfo.stats might be wrong if we did log-based recovery on the
13352 * backfilled portion in addition to continuing backfill.
13353 */
13354 pinfo.stats = info.stats;
13355 start_recovery_op(hobject_t::get_max());
13356 } else {
13357 m = new MOSDPGBackfill(
13358 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
13359 e,
13360 last_peering_reset,
13361 spg_t(info.pgid.pgid, bt.shard));
13362 // Use default priority here, must match sub_op priority
13363 }
13364 m->last_backfill = pinfo.last_backfill;
13365 m->stats = pinfo.stats;
13366 osd->send_message_osd_cluster(bt.osd, m, get_osdmap_epoch());
13367 dout(10) << " peer " << bt
13368 << " num_objects now " << pinfo.stats.stats.sum.num_objects
13369 << " / " << info.stats.stats.sum.num_objects << dendl;
13370 }
13371 }
13372
13373 if (ops)
13374 *work_started = true;
13375 return ops;
13376 }
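// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the position arithmetic recover_backfill() relies
// on above. Everything strictly below min(backfill_info.begin, earliest
// peer begin) is fully backfilled, so that minimum bounds how far
// last_backfill may advance (in-flight pushes aside). Plain ints stand in
// for hobject_t, with INT_MAX as hobject_t::get_max(); names are
// hypothetical.
#if 0
#include <algorithm>
#include <climits>
#include <vector>

int backfill_pos(int local_begin, const std::vector<int>& peer_begins) {
  int e = INT_MAX;                    // earliest_peer_backfill()
  for (int b : peer_begins)
    e = std::min(e, b);
  return std::min(local_begin, e);    // everything below this is done
}
#endif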
13377
13378 int PrimaryLogPG::prep_backfill_object_push(
13379 hobject_t oid, eversion_t v,
13380 ObjectContextRef obc,
13381 vector<pg_shard_t> peers,
13382 PGBackend::RecoveryHandle *h)
13383 {
13384 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
13385 ceph_assert(!peers.empty());
13386
13387 backfills_in_flight.insert(oid);
13388 for (unsigned int i = 0 ; i < peers.size(); ++i) {
13389 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
13390 ceph_assert(bpm != peer_missing.end());
13391 bpm->second.add(oid, eversion_t(), eversion_t(), false);
13392 }
13393
13394 ceph_assert(!recovering.count(oid));
13395
13396 start_recovery_op(oid);
13397 recovering.insert(make_pair(oid, obc));
13398
13399 // We need to take the read_lock here in order to flush in-progress writes
13400 int r = pgbackend->recover_object(
13401 oid,
13402 v,
13403 ObjectContextRef(),
13404 obc,
13405 h);
13406 if (r < 0) {
13407 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
13408 primary_failed(oid);
13409 primary_error(oid, v);
13410 backfills_in_flight.erase(oid);
13411 missing_loc.add_missing(oid, v, eversion_t());
13412 }
13413 return r;
13414 }
13415
13416 void PrimaryLogPG::update_range(
13417 BackfillInterval *bi,
13418 ThreadPool::TPHandle &handle)
13419 {
13420 int local_min = cct->_conf->osd_backfill_scan_min;
13421 int local_max = cct->_conf->osd_backfill_scan_max;
13422
13423 if (bi->version < info.log_tail) {
13424 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
13425 << dendl;
13426 bi->version = info.last_update;
13427 scan_range(local_min, local_max, bi, handle);
13428 }
13429
13430 if (bi->version >= projected_last_update) {
13431 dout(10) << __func__<< ": bi is current " << dendl;
13432 ceph_assert(bi->version == projected_last_update);
13433 } else if (bi->version >= info.log_tail) {
13434 if (pg_log.get_log().empty() && projected_log.empty()) {
13435 /* Because we don't move log_tail on split, the log might be
13436 * empty even if log_tail != last_update. However, the only
13437 * way to get here with an empty log is if log_tail is actually
13438 * eversion_t(), because otherwise the entry which changed
13439 * last_update since the last scan would have to be present.
13440 */
13441 ceph_assert(bi->version == eversion_t());
13442 return;
13443 }
13444
13445 dout(10) << __func__<< ": bi is old, (" << bi->version
13446 << ") can be updated with log to projected_last_update "
13447 << projected_last_update << dendl;
13448
13449 auto func = [&](const pg_log_entry_t &e) {
13450 dout(10) << __func__ << ": updating from version " << e.version
13451 << dendl;
13452 const hobject_t &soid = e.soid;
13453 if (soid >= bi->begin &&
13454 soid < bi->end) {
13455 if (e.is_update()) {
13456 dout(10) << __func__ << ": " << e.soid << " updated to version "
13457 << e.version << dendl;
13458 bi->objects.erase(e.soid);
13459 bi->objects.insert(
13460 make_pair(
13461 e.soid,
13462 e.version));
13463 } else if (e.is_delete()) {
13464 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
13465 bi->objects.erase(e.soid);
13466 }
13467 }
13468 };
13469 dout(10) << "scanning pg log first" << dendl;
13470 pg_log.get_log().scan_log_after(bi->version, func);
13471 dout(10) << "scanning projected log" << dendl;
13472 projected_log.scan_log_after(bi->version, func);
13473 bi->version = projected_last_update;
13474 } else {
13475 ceph_abort_msg("scan_range should have raised bi->version past log_tail");
13476 }
13477 }
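// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the log-replay step of update_range() above.
// Entries inside the scanned interval overwrite or erase the recorded
// object version; everything else is ignored. Names are hypothetical.
#if 0
#include <map>
#include <string>

struct LogEntry {
  std::string oid;
  int version;
  bool is_delete;
};

void apply_to_interval(std::map<std::string, int>& objects,
                       const std::string& begin, const std::string& end,
                       const LogEntry& e) {
  if (e.oid < begin || !(e.oid < end))
    return;                      // outside [begin, end): nothing to do
  if (e.is_delete)
    objects.erase(e.oid);        // removed since the scan
  else
    objects[e.oid] = e.version;  // newest version wins
}
#endif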
13478
13479 void PrimaryLogPG::scan_range(
13480 int min, int max, BackfillInterval *bi,
13481 ThreadPool::TPHandle &handle)
13482 {
13483 ceph_assert(is_locked());
13484 dout(10) << "scan_range from " << bi->begin << dendl;
13485 bi->clear_objects();
13486
13487 vector<hobject_t> ls;
13488 ls.reserve(max);
13489 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
13490 ceph_assert(r >= 0);
13491 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
13492 dout(20) << ls << dendl;
13493
13494 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
13495 handle.reset_tp_timeout();
13496 ObjectContextRef obc;
13497 if (is_primary())
13498 obc = object_contexts.lookup(*p);
13499 if (obc) {
13500 bi->objects[*p] = obc->obs.oi.version;
13501 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
13502 } else {
13503 bufferlist bl;
13504 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
13505
13506 /* If the object does not exist here, it must have been removed
13507 * between the collection_list_partial and here. This can happen
13508 * for the first item in the range, which is usually last_backfill.
13509 */
13510 if (r == -ENOENT)
13511 continue;
13512
13513 ceph_assert(r >= 0);
13514 object_info_t oi(bl);
13515 bi->objects[*p] = oi.version;
13516 dout(20) << " " << *p << " " << oi.version << dendl;
13517 }
13518 }
13519 }
13520
13521
13522 /** check_local
13523 *
13524 * verifies that stray objects have been deleted
13525 */
13526 void PrimaryLogPG::check_local()
13527 {
13528 dout(10) << __func__ << dendl;
13529
13530 ceph_assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
13531
13532 if (!cct->_conf->osd_debug_verify_stray_on_activate)
13533 return;
13534
13535 // just scan the log.
13536 set<hobject_t> did;
13537 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
13538 p != pg_log.get_log().log.rend();
13539 ++p) {
13540 if (did.count(p->soid))
13541 continue;
13542 did.insert(p->soid);
13543
13544 if (p->is_delete() && !is_missing_object(p->soid)) {
13545 dout(10) << " checking " << p->soid
13546 << " at " << p->version << dendl;
13547 struct stat st;
13548 int r = osd->store->stat(
13549 ch,
13550 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
13551 &st);
13552 if (r != -ENOENT) {
13553 derr << __func__ << " " << p->soid << " exists, but should have been "
13554 << "deleted" << dendl;
13555 ceph_abort_msg("erroneously present object");
13556 }
13557 } else {
13558 // ignore old(+missing) objects
13559 }
13560 }
13561 }
13562
13563
13564
13565 // ===========================
13566 // hit sets
13567
13568 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
13569 {
13570 ostringstream ss;
13571 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
13572 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13573 info.pgid.ps(), info.pgid.pool(),
13574 cct->_conf->osd_hit_set_namespace);
13575 dout(20) << __func__ << " " << hoid << dendl;
13576 return hoid;
13577 }
13578
13579 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
13580 utime_t end,
13581 bool using_gmt)
13582 {
13583 ostringstream ss;
13584 ss << "hit_set_" << info.pgid.pgid << "_archive_";
13585 if (using_gmt) {
13586 start.gmtime(ss) << "_";
13587 end.gmtime(ss);
13588 } else {
13589 start.localtime(ss) << "_";
13590 end.localtime(ss);
13591 }
13592 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
13593 info.pgid.ps(), info.pgid.pool(),
13594 cct->_conf->osd_hit_set_namespace);
13595 dout(20) << __func__ << " " << hoid << dendl;
13596 return hoid;
13597 }
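// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the archive naming scheme above, with
// std::put_time standing in for utime_t's gmtime()/localtime() stream
// helpers (the real timestamp format differs). The "1.0" pgid is a
// hypothetical placeholder.
#if 0
#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>

std::string archive_name(std::time_t start, std::time_t end, bool use_gmt) {
  std::ostringstream ss;
  ss << "hit_set_" << "1.0" << "_archive_";
  std::tm ts = use_gmt ? *std::gmtime(&start) : *std::localtime(&start);
  std::tm te = use_gmt ? *std::gmtime(&end) : *std::localtime(&end);
  ss << std::put_time(&ts, "%Y-%m-%d %H:%M:%S") << "_"
     << std::put_time(&te, "%Y-%m-%d %H:%M:%S");
  return ss.str();
}
#endif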
13598
13599 void PrimaryLogPG::hit_set_clear()
13600 {
13601 dout(20) << __func__ << dendl;
13602 hit_set.reset();
13603 hit_set_start_stamp = utime_t();
13604 }
13605
13606 void PrimaryLogPG::hit_set_setup()
13607 {
13608 if (!is_active() ||
13609 !is_primary()) {
13610 hit_set_clear();
13611 return;
13612 }
13613
13614 if (is_active() && is_primary() &&
13615 (!pool.info.hit_set_count ||
13616 !pool.info.hit_set_period ||
13617 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
13618 hit_set_clear();
13619
13620 // only primary is allowed to remove all the hit set objects
13621 hit_set_remove_all();
13622 return;
13623 }
13624
13625 // FIXME: discard any previous data for now
13626 hit_set_create();
13627
13628 // include any writes we know about from the pg log. this doesn't
13629 // capture reads, but it is better than nothing!
13630 hit_set_apply_log();
13631 }
13632
13633 void PrimaryLogPG::hit_set_remove_all()
13634 {
13635 // If any archives are degraded we skip this
13636 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13637 p != info.hit_set.history.end();
13638 ++p) {
13639 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13640
13641 // Once we hit a degraded object just skip
13642 if (is_degraded_or_backfilling_object(aoid))
13643 return;
13644 if (write_blocked_by_scrub(aoid))
13645 return;
13646 }
13647
13648 if (!info.hit_set.history.empty()) {
13649 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
13650 ceph_assert(p != info.hit_set.history.rend());
13651 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13652 ceph_assert(!is_degraded_or_backfilling_object(oid));
13653 ObjectContextRef obc = get_object_context(oid, false);
13654 ceph_assert(obc);
13655
13656 OpContextUPtr ctx = simple_opc_create(obc);
13657 ctx->at_version = get_next_version();
13658 ctx->updated_hset_history = info.hit_set;
13659 utime_t now = ceph_clock_now();
13660 ctx->mtime = now;
13661 hit_set_trim(ctx, 0);
13662 simple_opc_submit(std::move(ctx));
13663 }
13664
13665 info.hit_set = pg_hit_set_history_t();
13666 if (agent_state) {
13667 agent_state->discard_hit_sets();
13668 }
13669 }
13670
13671 void PrimaryLogPG::hit_set_create()
13672 {
13673 utime_t now = ceph_clock_now();
13674 // make a copy of the params to modify
13675 HitSet::Params params(pool.info.hit_set_params);
13676
13677 dout(20) << __func__ << " " << params << dendl;
13678 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
13679 BloomHitSet::Params *p =
13680 static_cast<BloomHitSet::Params*>(params.impl.get());
13681
13682 // convert false positive rate so it holds up across the full period
13683 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
13684 if (p->get_fpp() <= 0.0)
13685 p->set_fpp(.01); // fpp cannot be zero!
13686
13687 // if we don't have a specified size, estimate target size based on the
13688 // previous bin!
13689 if (p->target_size == 0 && hit_set) {
13690 utime_t dur = now - hit_set_start_stamp;
13691 unsigned unique = hit_set->approx_unique_insert_count();
13692 dout(20) << __func__ << " previous set had approx " << unique
13693 << " unique items over " << dur << " seconds" << dendl;
13694 p->target_size = (double)unique * (double)pool.info.hit_set_period
13695 / (double)dur;
13696 }
13697 if (p->target_size <
13698 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
13699 p->target_size = cct->_conf->osd_hit_set_min_size;
13700
13701 if (p->target_size
13702 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
13703 p->target_size = cct->_conf->osd_hit_set_max_size;
13704
13705 p->seed = now.sec();
13706
13707 dout(10) << __func__ << " target_size " << p->target_size
13708 << " fpp " << p->get_fpp() << dendl;
13709 }
13710 hit_set.reset(new HitSet(params));
13711 hit_set_start_stamp = now;
13712 }
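// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the bloom parameter scaling in hit_set_create()
// above. The pool-wide false-positive budget is split across
// hit_set_count sets, and target_size is extrapolated from the previous
// set's unique-insert rate, then clamped. Names are hypothetical.
#if 0
#include <algorithm>
#include <cstdint>

struct BloomParams {
  double fpp;
  uint64_t target_size;
};

BloomParams scale_bloom(double pool_fpp, unsigned hit_set_count,
                        unsigned prev_unique, double prev_secs,
                        double period_secs,
                        uint64_t min_size, uint64_t max_size) {
  BloomParams p;
  p.fpp = pool_fpp / hit_set_count;
  if (p.fpp <= 0.0)
    p.fpp = 0.01;                                  // fpp cannot be zero
  // e.g. 1000 unique inserts over 300s with a 600s period -> 2000 slots
  p.target_size =
      static_cast<uint64_t>(prev_unique * period_secs / prev_secs);
  p.target_size = std::clamp(p.target_size, min_size, max_size);
  return p;
}
#endif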
13713
13714 /**
13715 * apply log entries to set
13716 *
13717 * this would only happen after peering, to at least capture writes
13718 * during an interval that was potentially lost.
13719 */
13720 bool PrimaryLogPG::hit_set_apply_log()
13721 {
13722 if (!hit_set)
13723 return false;
13724
13725 eversion_t to = info.last_update;
13726 eversion_t from = info.hit_set.current_last_update;
13727 if (to <= from) {
13728 dout(20) << __func__ << " no update" << dendl;
13729 return false;
13730 }
13731
13732 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
13733 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
13734 while (p != pg_log.get_log().log.rend() && p->version > to)
13735 ++p;
13736 while (p != pg_log.get_log().log.rend() && p->version > from) {
13737 hit_set->insert(p->soid);
13738 ++p;
13739 }
13740
13741 return true;
13742 }
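// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the half-open replay window (from, to] walked
// newest-first in hit_set_apply_log() above. Names are hypothetical.
#if 0
#include <list>
#include <set>
#include <string>

struct Entry {
  int version;
  std::string oid;
};

void replay(const std::list<Entry>& log,  // ordered oldest..newest
            int from, int to, std::set<std::string>& hits) {
  auto p = log.rbegin();
  while (p != log.rend() && p->version > to)
    ++p;                     // skip entries newer than the window
  while (p != log.rend() && p->version > from) {
    hits.insert(p->oid);     // record each write in (from, to]
    ++p;
  }
}
#endif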
13743
13744 void PrimaryLogPG::hit_set_persist()
13745 {
13746 dout(10) << __func__ << dendl;
13747 bufferlist bl;
13748 unsigned max = pool.info.hit_set_count;
13749
13750 utime_t now = ceph_clock_now();
13751 hobject_t oid;
13752
13753 // If any archives are degraded we skip this persist request
13754 // account for the additional entry being added below
13755 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13756 p != info.hit_set.history.end();
13757 ++p) {
13758 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13759
13760 // Once we hit a degraded object just skip further trim
13761 if (is_degraded_or_backfilling_object(aoid))
13762 return;
13763 if (write_blocked_by_scrub(aoid))
13764 return;
13765 }
13766
13767 // If backfill is in progress and we could possibly overlap with the
13768 // hit_set_* objects, back off. Since these all have
13769 // hobject_t::hash set to pgid.ps(), and those sort first, we can
13770 // look just at that. This is necessary because our transactions
13771 // may include a modify of the new hit_set *and* a delete of the
13772 // old one, and this may span the backfill boundary.
13773 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
13774 p != backfill_targets.end();
13775 ++p) {
13776 ceph_assert(peer_info.count(*p));
13777 const pg_info_t& pi = peer_info[*p];
13778 if (pi.last_backfill == hobject_t() ||
13779 pi.last_backfill.get_hash() == info.pgid.ps()) {
13780 dout(10) << __func__ << " backfill target osd." << *p
13781 << " last_backfill has not progressed past pgid ps"
13782 << dendl;
13783 return;
13784 }
13785 }
13786
13787
13788 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
13789 new_hset.begin = hit_set_start_stamp;
13790 new_hset.end = now;
13791 oid = get_hit_set_archive_object(
13792 new_hset.begin,
13793 new_hset.end,
13794 new_hset.using_gmt);
13795
13796 // If writes to the current object are blocked by scrub we skip this persist request
13797 if (write_blocked_by_scrub(oid))
13798 return;
13799
13800 hit_set->seal();
13801 encode(*hit_set, bl);
13802 dout(20) << __func__ << " archive " << oid << dendl;
13803
13804 if (agent_state) {
13805 agent_state->add_hit_set(new_hset.begin, hit_set);
13806 uint32_t size = agent_state->hit_set_map.size();
13807 if (size >= pool.info.hit_set_count) {
13808 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
13809 }
13810 hit_set_in_memory_trim(size);
13811 }
13812
13813 ObjectContextRef obc = get_object_context(oid, true);
13814 OpContextUPtr ctx = simple_opc_create(obc);
13815
13816 ctx->at_version = get_next_version();
13817 ctx->updated_hset_history = info.hit_set;
13818 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
13819
13820 updated_hit_set_hist.current_last_update = info.last_update;
13821 new_hset.version = ctx->at_version;
13822
13823 updated_hit_set_hist.history.push_back(new_hset);
13824 hit_set_create();
13825
13826 // fabricate an object_info_t and SnapSet
13827 obc->obs.oi.version = ctx->at_version;
13828 obc->obs.oi.mtime = now;
13829 obc->obs.oi.size = bl.length();
13830 obc->obs.exists = true;
13831 obc->obs.oi.set_data_digest(bl.crc32c(-1));
13832
13833 ctx->new_obs = obc->obs;
13834
13835 ctx->new_snapset = obc->ssc->snapset;
13836
13837 ctx->delta_stats.num_objects++;
13838 ctx->delta_stats.num_objects_hit_set_archive++;
13839
13840 ctx->delta_stats.num_bytes += bl.length();
13841 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
13842
13843 bufferlist bss;
13844 encode(ctx->new_snapset, bss);
13845 bufferlist boi(sizeof(ctx->new_obs.oi));
13846 encode(ctx->new_obs.oi, boi,
13847 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
13848
13849 ctx->op_t->create(oid);
13850 if (bl.length()) {
13851 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
13852 }
13853 map <string, bufferlist> attrs;
13854 attrs[OI_ATTR].claim(boi);
13855 attrs[SS_ATTR].claim(bss);
13856 setattrs_maybe_cache(ctx->obc, ctx->op_t.get(), attrs);
13857 ctx->log.push_back(
13858 pg_log_entry_t(
13859 pg_log_entry_t::MODIFY,
13860 oid,
13861 ctx->at_version,
13862 eversion_t(),
13863 0,
13864 osd_reqid_t(),
13865 ctx->mtime,
13866 0)
13867 );
13868
13869 hit_set_trim(ctx, max);
13870
13871 simple_opc_submit(std::move(ctx));
13872 }
13873
13874 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
13875 {
13876 ceph_assert(ctx->updated_hset_history);
13877 pg_hit_set_history_t &updated_hit_set_hist =
13878 *(ctx->updated_hset_history);
13879 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
13880 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
13881 ceph_assert(p != updated_hit_set_hist.history.end());
13882 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13883
13884 ceph_assert(!is_degraded_or_backfilling_object(oid));
13885
13886 dout(20) << __func__ << " removing " << oid << dendl;
13887 ++ctx->at_version.version;
13888 ctx->log.push_back(
13889 pg_log_entry_t(pg_log_entry_t::DELETE,
13890 oid,
13891 ctx->at_version,
13892 p->version,
13893 0,
13894 osd_reqid_t(),
13895 ctx->mtime,
13896 0));
13897
13898 ctx->op_t->remove(oid);
13899 updated_hit_set_hist.history.pop_front();
13900
13901 ObjectContextRef obc = get_object_context(oid, false);
13902 ceph_assert(obc);
13903 --ctx->delta_stats.num_objects;
13904 --ctx->delta_stats.num_objects_hit_set_archive;
13905 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
13906 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
13907 }
13908 }
13909
13910 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
13911 {
13912 while (agent_state->hit_set_map.size() > max_in_memory) {
13913 agent_state->remove_oldest_hit_set();
13914 }
13915 }
13916
13917
13918 // =======================================
13919 // cache agent
13920
13921 void PrimaryLogPG::agent_setup()
13922 {
13923 ceph_assert(is_locked());
13924 if (!is_active() ||
13925 !is_primary() ||
13926 state_test(PG_STATE_PREMERGE) ||
13927 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13928 pool.info.tier_of < 0 ||
13929 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13930 agent_clear();
13931 return;
13932 }
13933 if (!agent_state) {
13934 agent_state.reset(new TierAgentState);
13935
13936 // choose random starting position
13937 agent_state->position = hobject_t();
13938 agent_state->position.pool = info.pgid.pool();
13939 agent_state->position.set_hash(pool.info.get_random_pg_position(
13940 info.pgid.pgid,
13941 rand()));
13942 agent_state->start = agent_state->position;
13943
13944 dout(10) << __func__ << " allocated new state, position "
13945 << agent_state->position << dendl;
13946 } else {
13947 dout(10) << __func__ << " keeping existing state" << dendl;
13948 }
13949
13950 if (info.stats.stats_invalid) {
13951 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13952 }
13953
13954 agent_choose_mode();
13955 }
13956
13957 void PrimaryLogPG::agent_clear()
13958 {
13959 agent_stop();
13960 agent_state.reset(NULL);
13961 }
13962
13963 // Return false if no objects were operated on since the start of the object hash space
13964 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13965 {
13966 lock();
13967 if (!agent_state) {
13968 dout(10) << __func__ << " no agent state, stopping" << dendl;
13969 unlock();
13970 return true;
13971 }
13972
13973 ceph_assert(!deleting);
13974
13975 if (agent_state->is_idle()) {
13976 dout(10) << __func__ << " idle, stopping" << dendl;
13977 unlock();
13978 return true;
13979 }
13980
13981 osd->logger->inc(l_osd_agent_wake);
13982
13983 dout(10) << __func__
13984 << " max " << start_max
13985 << ", flush " << agent_state->get_flush_mode_name()
13986 << ", evict " << agent_state->get_evict_mode_name()
13987 << ", pos " << agent_state->position
13988 << dendl;
13989 ceph_assert(is_primary());
13990 ceph_assert(is_active());
13991
13992 agent_load_hit_sets();
13993
13994 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13995 ceph_assert(base_pool);
13996
13997 int ls_min = 1;
13998 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13999
14000 // list some objects. this conveniently lists clones (oldest to
14001 // newest) before heads... the same order we want to flush in.
14002 //
14003 // NOTE: do not flush the Sequencer. We will assume that the
14004 // listing we get back is imprecise.
14005 vector<hobject_t> ls;
14006 hobject_t next;
14007 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
14008 &ls, &next);
14009 ceph_assert(r >= 0);
14010 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
14011 int started = 0;
14012 for (vector<hobject_t>::iterator p = ls.begin();
14013 p != ls.end();
14014 ++p) {
14015 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
14016 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
14017 osd->logger->inc(l_osd_agent_skip);
14018 continue;
14019 }
14020 if (is_degraded_or_backfilling_object(*p)) {
14021 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
14022 osd->logger->inc(l_osd_agent_skip);
14023 continue;
14024 }
14025 if (is_missing_object(p->get_head())) {
14026 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
14027 osd->logger->inc(l_osd_agent_skip);
14028 continue;
14029 }
14030 ObjectContextRef obc = get_object_context(*p, false, NULL);
14031 if (!obc) {
14032 // we didn't flush; we may miss something here.
14033 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
14034 osd->logger->inc(l_osd_agent_skip);
14035 continue;
14036 }
14037 if (!obc->obs.exists) {
14038 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
14039 osd->logger->inc(l_osd_agent_skip);
14040 continue;
14041 }
14042 if (range_intersects_scrub(obc->obs.oi.soid,
14043 obc->obs.oi.soid.get_head())) {
14044 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
14045 osd->logger->inc(l_osd_agent_skip);
14046 continue;
14047 }
14048 if (obc->is_blocked()) {
14049 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14050 osd->logger->inc(l_osd_agent_skip);
14051 continue;
14052 }
14053 if (obc->is_request_pending()) {
14054 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
14055 osd->logger->inc(l_osd_agent_skip);
14056 continue;
14057 }
14058
14059 // be careful flushing omap to an EC pool.
14060 if (!base_pool->supports_omap() &&
14061 obc->obs.oi.is_omap()) {
14062 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
14063 osd->logger->inc(l_osd_agent_skip);
14064 continue;
14065 }
14066
14067 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
14068 agent_maybe_evict(obc, false))
14069 ++started;
14070 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
14071 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
14072 ++started;
14073 --agent_flush_quota;
14074 }
14075 if (started >= start_max) {
14076 // If finishing early, set "next" to the next object
14077 if (++p != ls.end())
14078 next = *p;
14079 break;
14080 }
14081 }
14082
14083 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
14084 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
14085 agent_state->hist_age = 0;
14086 agent_state->temp_hist.decay();
14087 }
14088
14089 // Total objects operated on so far
14090 int total_started = agent_state->started + started;
14091 bool need_delay = false;
14092
14093 dout(20) << __func__ << " start pos " << agent_state->position
14094 << " next start pos " << next
14095 << " started " << total_started << dendl;
14096
14097 // See if we've made a full pass over the object hash space
14098 // This might check at most ls_max objects a second time to notice that
14099 // we've checked every object at least once.
14100 if (agent_state->position < agent_state->start &&
14101 next >= agent_state->start) {
14102 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
14103 if (total_started == 0)
14104 need_delay = true;
14105 else
14106 total_started = 0;
14107 agent_state->start = next;
14108 }
14109 agent_state->started = total_started;
14110
14111 // See if we are starting from the beginning
14112 if (next.is_max())
14113 agent_state->position = hobject_t();
14114 else
14115 agent_state->position = next;
14116
14117 // Discard old in-memory HitSets
14118 hit_set_in_memory_trim(pool.info.hit_set_count);
14119
14120 if (need_delay) {
14121 ceph_assert(agent_state->delaying == false);
14122 agent_delay();
14123 unlock();
14124 return false;
14125 }
14126 agent_choose_mode();
14127 unlock();
14128 return true;
14129 }
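// Illustrative sketch (editor addition, not part of the original source,
// disabled with #if 0): the lap accounting at the end of agent_work()
// above. The agent walks the hashed keyspace circularly; a lap completes
// when the cursor was below the recorded start and the next position
// reaches or passes it, and a lap that started zero operations asks for a
// delay instead of spinning. Names are hypothetical.
#if 0
struct AgentPos {
  unsigned position = 0;  // where this call began
  unsigned start = 0;     // where the current lap began
  unsigned started = 0;   // ops started so far this lap
};

// Returns true when the agent should delay before the next pass.
bool account_pass(AgentPos& s, unsigned next, unsigned started_now) {
  unsigned total = s.started + started_now;
  if (s.position < s.start && next >= s.start) {  // wrapped past start
    s.start = next;
    if (total == 0)
      return true;                                // full idle lap: delay
    total = 0;                                    // begin a new lap
  }
  s.started = total;
  return false;
}
#endif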
14130
14131 void PrimaryLogPG::agent_load_hit_sets()
14132 {
14133 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
14134 return;
14135 }
14136
14137 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
14138 dout(10) << __func__ << dendl;
14139 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
14140 p != info.hit_set.history.end(); ++p) {
14141 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
14142 dout(10) << __func__ << " loading " << p->begin << "-"
14143 << p->end << dendl;
14144 if (!pool.info.is_replicated()) {
14145 // FIXME: EC not supported here yet
14146 derr << __func__ << " on non-replicated pool" << dendl;
14147 break;
14148 }
14149
14150 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
14151 if (is_unreadable_object(oid)) {
14152 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
14153 break;
14154 }
14155
14156 ObjectContextRef obc = get_object_context(oid, false);
14157 if (!obc) {
14158 derr << __func__ << ": could not load hitset " << oid << dendl;
14159 break;
14160 }
14161
14162 bufferlist bl;
14163 {
14164 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
14165 ceph_assert(r >= 0);
14166 }
14167 HitSetRef hs(new HitSet);
14168 bufferlist::const_iterator pbl = bl.begin();
14169 decode(*hs, pbl);
14170 agent_state->add_hit_set(p->begin.sec(), hs);
14171 }
14172 }
14173 }
14174 }
14175
14176 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
14177 {
14178 if (!obc->obs.oi.is_dirty()) {
14179 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
14180 osd->logger->inc(l_osd_agent_skip);
14181 return false;
14182 }
14183 if (obc->obs.oi.is_cache_pinned()) {
14184 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14185 osd->logger->inc(l_osd_agent_skip);
14186 return false;
14187 }
14188
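  // Age is judged against local_mtime when available: it is stamped with
  // the OSD's own clock at write time, so agent decisions are not skewed
  // by possibly inaccurate client-supplied mtimes (mtime is only a
  // fallback).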
14189 utime_t now = ceph_clock_now();
14190 utime_t ob_local_mtime;
14191 if (obc->obs.oi.local_mtime != utime_t()) {
14192 ob_local_mtime = obc->obs.oi.local_mtime;
14193 } else {
14194 ob_local_mtime = obc->obs.oi.mtime;
14195 }
14196 bool evict_mode_full =
14197 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
14198 if (!evict_mode_full &&
14199 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
14200 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
14201 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14202 osd->logger->inc(l_osd_agent_skip);
14203 return false;
14204 }
14205
14206 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
14207 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
14208 osd->logger->inc(l_osd_agent_skip);
14209 return false;
14210 }
14211
14212 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
14213
14214 // FIXME: for now we flush anything dirty, regardless of what
14215 // distribution of ages we expect.
14216
14217 hobject_t oid = obc->obs.oi.soid;
14218 osd->agent_start_op(oid);
14219 // no need to capture a pg ref, can't outlive fop or ctx
14220 std::function<void()> on_flush = [this, oid]() {
14221 osd->agent_finish_op(oid);
14222 };
14223
14224 int result = start_flush(
14225 OpRequestRef(), obc, false, NULL,
14226 on_flush);
14227 if (result != -EINPROGRESS) {
14228 on_flush();
14229 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
14230 << " with " << result << dendl;
14231 osd->logger->inc(l_osd_agent_skip);
14232 return false;
14233 }
14234
14235 osd->logger->inc(l_osd_agent_flush);
14236 return true;
14237 }
14238
14239 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
14240 {
14241 const hobject_t& soid = obc->obs.oi.soid;
14242 if (!after_flush && obc->obs.oi.is_dirty()) {
14243 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
14244 return false;
14245 }
14246 if (!obc->obs.oi.watchers.empty()) {
14247 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
14248 return false;
14249 }
14250 if (obc->is_blocked()) {
14251 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
14252 return false;
14253 }
14254 if (obc->obs.oi.is_cache_pinned()) {
14255 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
14256 return false;
14257 }
14258
14259 if (soid.snap == CEPH_NOSNAP) {
14260 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
14261 if (result < 0) {
14262 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
14263 return false;
14264 }
14265 }
14266
14267 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
14268 // is this object older than cache_min_evict_age?
14269 utime_t now = ceph_clock_now();
14270 utime_t ob_local_mtime;
14271 if (obc->obs.oi.local_mtime != utime_t()) {
14272 ob_local_mtime = obc->obs.oi.local_mtime;
14273 } else {
14274 ob_local_mtime = obc->obs.oi.mtime;
14275 }
14276 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
14277 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
14278 osd->logger->inc(l_osd_agent_skip);
14279 return false;
14280 }
14281 // is this object old and/or cold enough?
14282 int temp = 0;
14283 uint64_t temp_upper = 0, temp_lower = 0;
14284 if (hit_set)
14285 agent_estimate_temp(soid, &temp);
14286 agent_state->temp_hist.add(temp);
14287 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
14288
14289 dout(20) << __func__
14290 << " temp " << temp
14291 << " pos " << temp_lower << "-" << temp_upper
14292 << ", evict_effort " << agent_state->evict_effort
14293 << dendl;
14294 dout(30) << "agent_state:\n";
14295 Formatter *f = Formatter::create("");
14296 f->open_object_section("agent_state");
14297 agent_state->dump(f);
14298 f->close_section();
14299 f->flush(*_dout);
14300 delete f;
14301 *_dout << dendl;
14302
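    // Decision sketch: temp_upper is this object's upper percentile (in
    // millionths) within the temperature histogram, so 1000000 - temp_upper
    // is the fraction of recorded objects at least as warm. Keep the object
    // unless that fraction falls within evict_effort; e.g. with
    // temp_upper = 300000, the object survives unless evict_effort
    // exceeds 700000 (0.7).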
14303 if (1000000 - temp_upper >= agent_state->evict_effort)
14304 return false;
14305 }
14306
14307 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
14308 OpContextUPtr ctx = simple_opc_create(obc);
14309
14310 auto null_op_req = OpRequestRef();
14311 if (!ctx->lock_manager.get_lock_type(
14312 ObjectContext::RWState::RWWRITE,
14313 obc->obs.oi.soid,
14314 obc,
14315 null_op_req)) {
14316 close_op_ctx(ctx.release());
14317 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
14318 return false;
14319 }
14320
14321 osd->agent_start_evict_op();
14322 ctx->register_on_finish(
14323 [this]() {
14324 osd->agent_finish_evict_op();
14325 });
14326
14327 ctx->at_version = get_next_version();
14328 ceph_assert(ctx->new_obs.exists);
14329 int r = _delete_oid(ctx.get(), true, false);
14330 if (obc->obs.oi.is_omap())
14331 ctx->delta_stats.num_objects_omap--;
14332 ctx->delta_stats.num_evict++;
14333 ctx->delta_stats.num_evict_kb += shift_round_up(obc->obs.oi.size, 10);
14334 if (obc->obs.oi.is_dirty())
14335 --ctx->delta_stats.num_objects_dirty;
14336 ceph_assert(r == 0);
14337 finish_ctx(ctx.get(), pg_log_entry_t::DELETE);
14338 simple_opc_submit(std::move(ctx));
14339 osd->logger->inc(l_osd_tier_evict);
14340 osd->logger->inc(l_osd_agent_evict);
14341 return true;
14342 }
14343
14344 void PrimaryLogPG::agent_stop()
14345 {
14346 dout(20) << __func__ << dendl;
14347 if (agent_state && !agent_state->is_idle()) {
14348 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
14349 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14350 osd->agent_disable_pg(this, agent_state->evict_effort);
14351 }
14352 }
14353
14354 void PrimaryLogPG::agent_delay()
14355 {
14356 dout(20) << __func__ << dendl;
14357 if (agent_state && !agent_state->is_idle()) {
14358 ceph_assert(agent_state->delaying == false);
14359 agent_state->delaying = true;
14360 osd->agent_disable_pg(this, agent_state->evict_effort);
14361 }
14362 }
14363
14364 void PrimaryLogPG::agent_choose_mode_restart()
14365 {
14366 dout(20) << __func__ << dendl;
14367 lock();
14368 if (agent_state && agent_state->delaying) {
14369 agent_state->delaying = false;
14370 agent_choose_mode(true);
14371 }
14372 unlock();
14373 }
14374
14375 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
14376 {
14377 bool requeued = false;
14378 // Let delay play out
14379 if (agent_state->delaying) {
14380 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
14381 return requeued;
14382 }
14383
14384 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
14385 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
14386 unsigned evict_effort = 0;
14387
14388 if (info.stats.stats_invalid) {
14389 // idle; stats can't be trusted until we scrub.
14390 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
14391 goto skip_calc;
14392 }
14393
14394 {
14395 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
14396 ceph_assert(divisor > 0);
14397
14398 // adjust (effective) user objects down based on the number
14399 // of HitSet objects, which should not count toward our total since
14400 // they cannot be flushed.
14401 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
14402
14403 // also exclude omap objects if the backing pool is EC
14404 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
14405 ceph_assert(base_pool);
14406 if (!base_pool->supports_omap())
14407 unflushable += info.stats.stats.sum.num_objects_omap;
14408
14409 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
14410 if (num_user_objects > unflushable)
14411 num_user_objects -= unflushable;
14412 else
14413 num_user_objects = 0;
14414
14415 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
14416 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
14417 num_user_bytes -= unflushable_bytes;
14418 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
14419 num_user_bytes += num_overhead_bytes;
14420
14421 // also reduce the num_dirty by num_objects_omap
14422 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
14423 if (!base_pool->supports_omap()) {
14424 if (num_dirty > info.stats.stats.sum.num_objects_omap)
14425 num_dirty -= info.stats.stats.sum.num_objects_omap;
14426 else
14427 num_dirty = 0;
14428 }
14429
14430 dout(10) << __func__
14431 << " flush_mode: "
14432 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14433 << " evict_mode: "
14434 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14435 << " num_objects: " << info.stats.stats.sum.num_objects
14436 << " num_bytes: " << info.stats.stats.sum.num_bytes
14437 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
14438 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
14439 << " num_dirty: " << num_dirty
14440 << " num_user_objects: " << num_user_objects
14441 << " num_user_bytes: " << num_user_bytes
14442 << " num_overhead_bytes: " << num_overhead_bytes
14443 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
14444 << " pool.info.target_max_objects: " << pool.info.target_max_objects
14445 << dendl;
14446
14447 // get dirty, full ratios
14448 uint64_t dirty_micro = 0;
14449 uint64_t full_micro = 0;
14450 if (pool.info.target_max_bytes && num_user_objects > 0) {
14451 uint64_t avg_size = num_user_bytes / num_user_objects;
14452 dirty_micro =
14453 num_dirty * avg_size * 1000000 /
14454 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14455 full_micro =
14456 num_user_objects * avg_size * 1000000 /
14457 std::max<uint64_t>(pool.info.target_max_bytes / divisor, 1);
14458 }
14459 if (pool.info.target_max_objects > 0) {
14460 uint64_t dirty_objects_micro =
14461 num_dirty * 1000000 /
14462 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14463 if (dirty_objects_micro > dirty_micro)
14464 dirty_micro = dirty_objects_micro;
14465 uint64_t full_objects_micro =
14466 num_user_objects * 1000000 /
14467 std::max<uint64_t>(pool.info.target_max_objects / divisor, 1);
14468 if (full_objects_micro > full_micro)
14469 full_micro = full_objects_micro;
14470 }
14471 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
14472 << " full " << ((float)full_micro / 1000000.0)
14473 << dendl;
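    // Worked example with illustrative numbers: target_max_bytes = 64 GiB
    // over divisor = 64 PGs gives a 1 GiB per-PG target. With
    // num_user_objects = 1000 and num_user_bytes = 512 MiB, avg_size is
    // 512 KiB; num_dirty = 400 then yields
    //   dirty_micro = 400 * 512 KiB * 1000000 / 1 GiB  ~= 195000 (0.195)
    //   full_micro  = 1000 * 512 KiB * 1000000 / 1 GiB ~= 488000 (0.488)
    // target_max_objects feeds the same ratios by object count, and the
    // larger of the byte- and object-based figures wins.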
14474
14475 // flush mode
14476 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
14477 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
14478 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
14479 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
14480 flush_target += flush_slop;
14481 flush_high_target += flush_slop;
14482 } else {
14483 flush_target -= std::min(flush_target, flush_slop);
14484 flush_high_target -= std::min(flush_high_target, flush_slop);
14485 }
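    // The slop produces hysteresis: e.g. with flush_target = 400000 (0.4)
    // and osd_agent_slop = 0.02, an idle agent only starts flushing once
    // dirty_micro passes 408000, while an already-active agent keeps
    // flushing until it drops below 392000, preventing rapid mode flapping.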
14486
14487 if (dirty_micro > flush_high_target) {
14488 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
14489 } else if (dirty_micro > flush_target || (!flush_target && num_dirty > 0)) {
14490 flush_mode = TierAgentState::FLUSH_MODE_LOW;
14491 }
14492
14493 // evict mode
14494 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
14495 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
14496 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
14497 evict_target += evict_slop;
14498 else
14499 evict_target -= std::min(evict_target, evict_slop);
14500
14501 if (full_micro > 1000000) {
14502 // evict anything clean
14503 evict_mode = TierAgentState::EVICT_MODE_FULL;
14504 evict_effort = 1000000;
14505 } else if (full_micro > evict_target) {
14506 // set effort in [0..1] range based on where we are between the evict target and full
14507 evict_mode = TierAgentState::EVICT_MODE_SOME;
14508 uint64_t over = full_micro - evict_target;
14509 uint64_t span = 1000000 - evict_target;
14510 evict_effort = std::max(over * 1000000 / span,
14511 uint64_t(1000000.0 *
14512 cct->_conf->osd_agent_min_evict_effort));
14513
14514 // quantize effort to avoid too much reordering in the agent_queue.
14515 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
14516 ceph_assert(inc > 0);
14517 uint64_t was = evict_effort;
14518 evict_effort -= evict_effort % inc;
14519 if (evict_effort < inc)
14520 evict_effort = inc;
14521 ceph_assert(evict_effort >= inc && evict_effort <= 1000000);
14522 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
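      // Worked example: with evict_target = 800000 and full_micro = 850000,
      // over = 50000 and span = 200000, so the raw effort is 250000 (0.25),
      // already above a 0.1 osd_agent_min_evict_effort floor. Quantizing
      // with osd_agent_quantize_effort at 0.1 (inc = 100000) rounds it down
      // to 200000, keeping agent_queue ordering coarse and stable.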
14523 }
14524 }
14525
14526 skip_calc:
14527 bool old_idle = agent_state->is_idle();
14528 if (flush_mode != agent_state->flush_mode) {
14529 dout(5) << __func__ << " flush_mode "
14530 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
14531 << " -> "
14532 << TierAgentState::get_flush_mode_name(flush_mode)
14533 << dendl;
14534 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14535 osd->agent_inc_high_count();
14536 info.stats.stats.sum.num_flush_mode_high = 1;
14537 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14538 info.stats.stats.sum.num_flush_mode_low = 1;
14539 }
14540 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
14541 osd->agent_dec_high_count();
14542 info.stats.stats.sum.num_flush_mode_high = 0;
14543 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
14544 info.stats.stats.sum.num_flush_mode_low = 0;
14545 }
14546 agent_state->flush_mode = flush_mode;
14547 }
14548 if (evict_mode != agent_state->evict_mode) {
14549 dout(5) << __func__ << " evict_mode "
14550 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
14551 << " -> "
14552 << TierAgentState::get_evict_mode_name(evict_mode)
14553 << dendl;
14554 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
14555 is_active()) {
14556 if (op)
14557 requeue_op(op);
14558 requeue_ops(waiting_for_flush);
14559 requeue_ops(waiting_for_active);
14560 requeue_ops(waiting_for_scrub);
14561 requeue_ops(waiting_for_cache_not_full);
14562 objects_blocked_on_cache_full.clear();
14563 requeued = true;
14564 }
14565 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
14566 info.stats.stats.sum.num_evict_mode_some = 1;
14567 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
14568 info.stats.stats.sum.num_evict_mode_full = 1;
14569 }
14570 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
14571 info.stats.stats.sum.num_evict_mode_some = 0;
14572 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
14573 info.stats.stats.sum.num_evict_mode_full = 0;
14574 }
14575 agent_state->evict_mode = evict_mode;
14576 }
14577 uint64_t old_effort = agent_state->evict_effort;
14578 if (evict_effort != agent_state->evict_effort) {
14579 dout(5) << __func__ << " evict_effort "
14580 << ((float)agent_state->evict_effort / 1000000.0)
14581 << " -> "
14582 << ((float)evict_effort / 1000000.0)
14583 << dendl;
14584 agent_state->evict_effort = evict_effort;
14585 }
14586
14587 // NOTE: we are using evict_effort as a proxy for *all* agent effort
14588 // (including flush). This is probably fine (they should be
14589 // correlated) but it is not precisely correct.
14590 if (agent_state->is_idle()) {
14591 if (!restart && !old_idle) {
14592 osd->agent_disable_pg(this, old_effort);
14593 }
14594 } else {
14595 if (restart || old_idle) {
14596 osd->agent_enable_pg(this, agent_state->evict_effort);
14597 } else if (old_effort != agent_state->evict_effort) {
14598 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
14599 }
14600 }
14601 return requeued;
14602 }
14603
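// Estimate an object's "temperature" from hit-set membership: presence in
// the live hit_set is worth 1000000 outright, and each of the most recent
// hit_set_search_last_n archived HitSets that contains the object adds
// pool.info.get_grade(i) for its age rank i. For instance, an object seen
// in the live set and in the newest archived set scores
// 1000000 + get_grade(0).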
14604 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
14605 {
14606 ceph_assert(hit_set);
14607 ceph_assert(temp);
14608 *temp = 0;
14609 if (hit_set->contains(oid))
14610 *temp = 1000000;
14611 unsigned i = 0;
14612 int last_n = pool.info.hit_set_search_last_n;
14613 for (map<time_t,HitSetRef>::reverse_iterator p =
14614 agent_state->hit_set_map.rbegin(); last_n > 0 &&
14615 p != agent_state->hit_set_map.rend(); ++p, ++i) {
14616 if (p->second->contains(oid)) {
14617 *temp += pool.info.get_grade(i);
14618 --last_n;
14619 }
14620 }
14621 }
14622
14623 // Dup op detection
14624
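// already_complete(v): the repop_queue is ordered by version, so walk it
// in order; ops with an empty version (copy-from-temp) are skipped, and
// the scan can stop at the first entry past v. A duplicate client op at
// version v is complete iff every queued repop at or below v has been
// committed on all replicas (all_committed).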
14625 bool PrimaryLogPG::already_complete(eversion_t v)
14626 {
14627 dout(20) << __func__ << ": " << v << dendl;
14628 for (xlist<RepGather*>::iterator i = repop_queue.begin();
14629 !i.end();
14630 ++i) {
14631 dout(20) << __func__ << ": " << **i << dendl;
14632 // skip copy from temp object ops
14633 if ((*i)->v == eversion_t()) {
14634 dout(20) << __func__ << ": " << **i
14635 << " version is empty" << dendl;
14636 continue;
14637 }
14638 if ((*i)->v > v) {
14639 dout(20) << __func__ << ": " << **i
14640 << " (*i)->v past v" << dendl;
14641 break;
14642 }
14643 if (!(*i)->all_committed) {
14644 dout(20) << __func__ << ": " << **i
14645 << " not committed, returning false"
14646 << dendl;
14647 return false;
14648 }
14649 }
14650 dout(20) << __func__ << ": returning true" << dendl;
14651 return true;
14652 }
14653
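// already_ack(v): note that the loop below only skips empty-version
// entries or breaks past v and never returns false, so the function
// answers true unconditionally; it is kept structurally parallel to
// already_complete().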
14654 bool PrimaryLogPG::already_ack(eversion_t v)
14655 {
14656 dout(20) << __func__ << ": " << v << dendl;
14657 for (xlist<RepGather*>::iterator i = repop_queue.begin();
14658 !i.end();
14659 ++i) {
14660 // skip copy from temp object ops
14661 if ((*i)->v == eversion_t()) {
14662 dout(20) << __func__ << ": " << **i
14663 << " version is empty" << dendl;
14664 continue;
14665 }
14666 if ((*i)->v > v) {
14667 dout(20) << __func__ << ": " << **i
14668 << " (*i)->v past v" << dendl;
14669 break;
14670 }
14671 }
14672 dout(20) << __func__ << ": returning true" << dendl;
14673 return true;
14674 }
14675
14676
14677 // ==========================================================================================
14678 // SCRUB
14679
14680
14681 bool PrimaryLogPG::_range_available_for_scrub(
14682 const hobject_t &begin, const hobject_t &end)
14683 {
14684 pair<hobject_t, ObjectContextRef> next;
14685 next.second = object_contexts.lookup(begin);
14686 next.first = begin;
14687 bool more = true;
14688 while (more && next.first < end) {
14689 if (next.second && next.second->is_blocked()) {
14690 next.second->requeue_scrub_on_unblock = true;
14691 dout(10) << __func__ << ": scrub delayed, "
14692 << next.first << " is blocked"
14693 << dendl;
14694 return false;
14695 }
14696 more = object_contexts.get_next(next.first, &next);
14697 }
14698 return true;
14699 }
14700
14701 static bool doing_clones(const boost::optional<SnapSet> &snapset,
14702 const vector<snapid_t>::reverse_iterator &curclone) {
14703 return snapset && curclone != snapset.get().clones.rend();
14704 }
14705
14706 void PrimaryLogPG::log_missing(unsigned missing,
14707 const boost::optional<hobject_t> &head,
14708 LogChannelRef clog,
14709 const spg_t &pgid,
14710 const char *func,
14711 const char *mode,
14712 bool allow_incomplete_clones)
14713 {
14714 ceph_assert(head);
14715 if (allow_incomplete_clones) {
14716 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
14717 << " skipped " << missing << " clone(s) in cache tier" << dendl;
14718 } else {
14719 clog->info() << mode << " " << pgid << " " << head.get()
14720 << " : " << missing << " missing clone(s)";
14721 }
14722 }
14723
14724 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
14725 const boost::optional<SnapSet> &snapset,
14726 LogChannelRef clog,
14727 const spg_t &pgid,
14728 const char *mode,
14729 bool allow_incomplete_clones,
14730 boost::optional<snapid_t> target,
14731 vector<snapid_t>::reverse_iterator *curclone,
14732 inconsistent_snapset_wrapper &e)
14733 {
14734 ceph_assert(head);
14735 ceph_assert(snapset);
14736 unsigned missing = 0;
14737
14738 // NOTE: clones are iterated in descending order, hence the **curclone > target test here
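  // Example: with snapset clones [1, 2, 4, 6], *curclone visits 6, 4, 2, 1
  // in turn. Given target = 4, clone 6 is counted (and logged unless
  // incomplete clones are allowed) and the iterator stops at 4; with
  // target = all_clones (boost::none) every remaining clone is consumed.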
14739 hobject_t next_clone(head.get());
14740 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
14741 ++missing;
14742 // it is okay to be missing one or more clones in a cache tier.
14743 // skip higher-numbered clones in the list.
14744 if (!allow_incomplete_clones) {
14745 next_clone.snap = **curclone;
14746 clog->error() << mode << " " << pgid << " " << head.get()
14747 << " : expected clone " << next_clone << " " << missing
14748 << " missing";
14749 ++scrubber.shallow_errors;
14750 e.set_clone_missing(next_clone.snap);
14751 }
14752 // Clones are descending
14753 ++(*curclone);
14754 }
14755 return missing;
14756 }
14757
14758 /*
14759 * Validate consistency of the object info and snap sets.
14760 *
14761 * We are effectively comparing two lists. The main loop is over
14762 * scrubmap.objects, but each object is compared against multiple
14763 * snapset.clones lists; between clone lists we expect a head.
14764 *
14765 * Example
14766 *
14767 * objects expected
14768 * ======= =======
14769 * obj1 snap 1 head, unexpected obj1 snap 1
14770 * obj2 head head, match
14771 * [SnapSet clones 6 4 2 1]
14772 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
14773 * obj2 snap 6 obj2 snap 6, match
14774 * obj2 snap 4 obj2 snap 4, match
14775 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), match
14776 * [Snapset clones 3 1]
14777 * obj3 snap 3 obj3 snap 3 match
14778 * obj3 snap 1 obj3 snap 1 match
14779 * obj4 head head, match
14780 * [Snapset clones 4]
14781 * EOL obj4 snap 4, (expected)
14782 */
14783 void PrimaryLogPG::scrub_snapshot_metadata(
14784 ScrubMap &scrubmap,
14785 const map<hobject_t,
14786 pair<boost::optional<uint32_t>,
14787 boost::optional<uint32_t>>> &missing_digest)
14788 {
14789 dout(10) << __func__ << dendl;
14790
14791 bool repair = state_test(PG_STATE_REPAIR);
14792 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14793 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14794 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
14795
14796 // traverse in reverse order.
14797 boost::optional<hobject_t> head;
14798 boost::optional<SnapSet> snapset; // If this is initialized, head (above) will be too
14799 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
14800 unsigned missing = 0;
14801 inconsistent_snapset_wrapper soid_error, head_error;
14802 unsigned soid_error_count = 0;
14803
14804 for (map<hobject_t,ScrubMap::object>::reverse_iterator
14805 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
14806 const hobject_t& soid = p->first;
14807 ceph_assert(!soid.is_snapdir());
14808 soid_error = inconsistent_snapset_wrapper{soid};
14809 object_stat_sum_t stat;
14810 boost::optional<object_info_t> oi;
14811
14812 stat.num_objects++;
14813
14814 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14815 stat.num_objects_hit_set_archive++;
14816
14817 if (soid.is_snap()) {
14818 // it's a clone
14819 stat.num_object_clones++;
14820 }
14821
14822 // basic checks.
14823 if (p->second.attrs.count(OI_ATTR) == 0) {
14824 oi = boost::none;
14825 osd->clog->error() << mode << " " << info.pgid << " " << soid
14826 << " : no '" << OI_ATTR << "' attr";
14827 ++scrubber.shallow_errors;
14828 soid_error.set_info_missing();
14829 } else {
14830 bufferlist bv;
14831 bv.push_back(p->second.attrs[OI_ATTR]);
14832 try {
14833 oi = object_info_t(); // Initialize optional<> before decoding into it
14834 oi.get().decode(bv);
14835 } catch (buffer::error& e) {
14836 oi = boost::none;
14837 osd->clog->error() << mode << " " << info.pgid << " " << soid
14838 << " : can't decode '" << OI_ATTR << "' attr " << e.what();
14839 ++scrubber.shallow_errors;
14840 soid_error.set_info_corrupted();
14841 soid_error.set_info_missing(); // Not available too
14842 }
14843 }
14844
14845 if (oi) {
14846 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
14847 osd->clog->error() << mode << " " << info.pgid << " " << soid
14848 << " : on disk size (" << p->second.size
14849 << ") does not match object info size ("
14850 << oi->size << ") adjusted for ondisk to ("
14851 << pgbackend->be_get_ondisk_size(oi->size)
14852 << ")";
14853 soid_error.set_size_mismatch();
14854 ++scrubber.shallow_errors;
14855 }
14856
14857 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
14858
14859 // A clone's num_bytes will be added later, once we have the snapset
14860 if (!soid.is_snap()) {
14861 stat.num_bytes += oi->size;
14862 }
14863 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
14864 stat.num_bytes_hit_set_archive += oi->size;
14865
14866 if (oi->is_dirty())
14867 ++stat.num_objects_dirty;
14868 if (oi->is_whiteout())
14869 ++stat.num_whiteouts;
14870 if (oi->is_omap())
14871 ++stat.num_objects_omap;
14872 if (oi->is_cache_pinned())
14873 ++stat.num_objects_pinned;
14874 if (oi->has_manifest())
14875 ++stat.num_objects_manifest;
14876 }
14877
14878 // Check for any problems while processing clones
14879 if (doing_clones(snapset, curclone)) {
14880 boost::optional<snapid_t> target;
14881 // Expecting an object with snap for current head
14882 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
14883
14884 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
14885 << soid << " while processing " << head.get() << dendl;
14886
14887 target = all_clones;
14888 } else {
14889 ceph_assert(soid.is_snap());
14890 target = soid.snap;
14891 }
14892
14893 // Log any clones we were expecting to be there up to target
14894 // This will add to missing, but will be a no-op if soid.snap == *curclone.
14895 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14896 pool.info.allow_incomplete_clones(), target, &curclone,
14897 head_error);
14898 }
14899 bool expected;
14900 // Check doing_clones() again in case we ran process_clones_to()
14901 if (doing_clones(snapset, curclone)) {
14902 // A head would have processed all clones above
14903 // or all greater than *curclone.
14904 ceph_assert(soid.is_snap() && *curclone <= soid.snap);
14905
14906 // After the processing above, the clone's snap should match the expected curclone
14907 expected = (*curclone == soid.snap);
14908 } else {
14909 // If we aren't doing clones any longer, then we expect a head
14910 expected = soid.has_snapset();
14911 }
14912 if (!expected) {
14913 // If we couldn't read the head's snapset, just ignore clones
14914 if (head && !snapset) {
14915 osd->clog->error() << mode << " " << info.pgid << " " << soid
14916 << " : clone ignored due to missing snapset";
14917 } else {
14918 osd->clog->error() << mode << " " << info.pgid << " " << soid
14919 << " : is an unexpected clone";
14920 }
14921 ++scrubber.shallow_errors;
14922 soid_error.set_headless();
14923 scrubber.store->add_snap_error(pool.id, soid_error);
14924 ++soid_error_count;
14925 if (head && soid.get_head() == head->get_head())
14926 head_error.set_clone(soid.snap);
14927 continue;
14928 }
14929
14930 // new snapset?
14931 if (soid.has_snapset()) {
14932
14933 if (missing) {
14934 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14935 pool.info.allow_incomplete_clones());
14936 }
14937
14938 // Save previous head error information
14939 if (head && (head_error.errors || soid_error_count))
14940 scrubber.store->add_snap_error(pool.id, head_error);
14941 // Set this as a new head object
14942 head = soid;
14943 missing = 0;
14944 head_error = soid_error;
14945 soid_error_count = 0;
14946
14947 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14948
14949 if (p->second.attrs.count(SS_ATTR) == 0) {
14950 osd->clog->error() << mode << " " << info.pgid << " " << soid
14951 << " : no '" << SS_ATTR << "' attr";
14952 ++scrubber.shallow_errors;
14953 snapset = boost::none;
14954 head_error.set_snapset_missing();
14955 } else {
14956 bufferlist bl;
14957 bl.push_back(p->second.attrs[SS_ATTR]);
14958 auto blp = bl.cbegin();
14959 try {
14960 snapset = SnapSet(); // Initialize optional<> before decoding into it
14961 decode(snapset.get(), blp);
14962 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14963 } catch (buffer::error& e) {
14964 snapset = boost::none;
14965 osd->clog->error() << mode << " " << info.pgid << " " << soid
14966 << " : can't decode '" << SS_ATTR << "' attr " << e.what();
14967 ++scrubber.shallow_errors;
14968 head_error.set_snapset_corrupted();
14969 }
14970 }
14971
14972 if (snapset) {
14973 // what will be next?
14974 curclone = snapset->clones.rbegin();
14975
14976 if (!snapset->clones.empty()) {
14977 dout(20) << " snapset " << snapset.get() << dendl;
14978 if (snapset->seq == 0) {
14979 osd->clog->error() << mode << " " << info.pgid << " " << soid
14980 << " : snaps.seq not set";
14981 ++scrubber.shallow_errors;
14982 head_error.set_snapset_error();
14983 }
14984 }
14985 }
14986 } else {
14987 ceph_assert(soid.is_snap());
14988 ceph_assert(head);
14989 ceph_assert(snapset);
14990 ceph_assert(soid.snap == *curclone);
14991
14992 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14993
14994 if (snapset->clone_size.count(soid.snap) == 0) {
14995 osd->clog->error() << mode << " " << info.pgid << " " << soid
14996 << " : is missing in clone_size";
14997 ++scrubber.shallow_errors;
14998 soid_error.set_size_mismatch();
14999 } else {
15000 if (oi && oi->size != snapset->clone_size[soid.snap]) {
15001 osd->clog->error() << mode << " " << info.pgid << " " << soid
15002 << " : size " << oi->size << " != clone_size "
15003 << snapset->clone_size[*curclone];
15004 ++scrubber.shallow_errors;
15005 soid_error.set_size_mismatch();
15006 }
15007
15008 if (snapset->clone_overlap.count(soid.snap) == 0) {
15009 osd->clog->error() << mode << " " << info.pgid << " " << soid
15010 << " : is missing in clone_overlap";
15011 ++scrubber.shallow_errors;
15012 soid_error.set_size_mismatch();
15013 } else {
15014 // This checking is based on get_clone_bytes(). The first 2 asserts
15015 // can't happen because we know we have a clone_size and
15016 // a clone_overlap. Now we check that the interval_set won't
15017 // cause the last assert.
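        // Example: clone_size = 4096 with overlap extents of length 3000
        // and 2000: the first subtraction leaves 1096, the second would
        // underflow (2000 > 1096), so the overlap is flagged as a bad
        // interval_set instead of tripping the assert inside
        // get_clone_bytes().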
15018 uint64_t size = snapset->clone_size.find(soid.snap)->second;
15019 const interval_set<uint64_t> &overlap =
15020 snapset->clone_overlap.find(soid.snap)->second;
15021 bool bad_interval_set = false;
15022 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
15023 i != overlap.end(); ++i) {
15024 if (size < i.get_len()) {
15025 bad_interval_set = true;
15026 break;
15027 }
15028 size -= i.get_len();
15029 }
15030
15031 if (bad_interval_set) {
15032 osd->clog->error() << mode << " " << info.pgid << " " << soid
15033 << " : bad interval_set in clone_overlap";
15034 ++scrubber.shallow_errors;
15035 soid_error.set_size_mismatch();
15036 } else {
15037 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
15038 }
15039 }
15040 }
15041
15042 // what's next?
15043 ++curclone;
15044 if (soid_error.errors) {
15045 scrubber.store->add_snap_error(pool.id, soid_error);
15046 ++soid_error_count;
15047 }
15048 }
15049
15050 scrub_cstat.add(stat);
15051 }
15052
15053 if (doing_clones(snapset, curclone)) {
15054 dout(10) << __func__ << " " << mode << " " << info.pgid
15055 << " No more objects while processing " << head.get() << dendl;
15056
15057 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
15058 pool.info.allow_incomplete_clones(), all_clones, &curclone,
15059 head_error);
15060 }
15061 // There could be missing clones found by the test above, or even
15062 // from before we dropped out of the loop for the last head.
15063 if (missing) {
15064 log_missing(missing, head, osd->clog, info.pgid, __func__,
15065 mode, pool.info.allow_incomplete_clones());
15066 }
15067 if (head && (head_error.errors || soid_error_count))
15068 scrubber.store->add_snap_error(pool.id, head_error);
15069
15070 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
15071 ceph_assert(!p->first.is_snapdir());
15072 dout(10) << __func__ << " recording digests for " << p->first << dendl;
15073 ObjectContextRef obc = get_object_context(p->first, false);
15074 if (!obc) {
15075 osd->clog->error() << info.pgid << " " << mode
15076 << " cannot get object context for object "
15077 << p->first;
15078 continue;
15079 } else if (obc->obs.oi.soid != p->first) {
15080 osd->clog->error() << info.pgid << " " << mode
15081 << " " << p->first
15082 << " : object has a valid oi attr with a mismatched name, "
15083 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
15084 continue;
15085 }
15086 OpContextUPtr ctx = simple_opc_create(obc);
15087 ctx->at_version = get_next_version();
15088 ctx->mtime = utime_t(); // do not update mtime
15089 if (p->second.first) {
15090 ctx->new_obs.oi.set_data_digest(*p->second.first);
15091 } else {
15092 ctx->new_obs.oi.clear_data_digest();
15093 }
15094 if (p->second.second) {
15095 ctx->new_obs.oi.set_omap_digest(*p->second.second);
15096 } else {
15097 ctx->new_obs.oi.clear_omap_digest();
15098 }
15099 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
15100
15101 ctx->register_on_success(
15102 [this]() {
15103 dout(20) << "updating scrub digest" << dendl;
15104 if (--scrubber.num_digest_updates_pending == 0) {
15105 requeue_scrub();
15106 }
15107 });
15108
15109 simple_opc_submit(std::move(ctx));
15110 ++scrubber.num_digest_updates_pending;
15111 }
15112
15113 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
15114 }
15115
15116 void PrimaryLogPG::_scrub_clear_state()
15117 {
15118 scrub_cstat = object_stat_collection_t();
15119 }
15120
15121 void PrimaryLogPG::_scrub_finish()
15122 {
15123 bool repair = state_test(PG_STATE_REPAIR);
15124 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
15125 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
15126
15127 if (info.stats.stats_invalid) {
15128 info.stats.stats = scrub_cstat;
15129 info.stats.stats_invalid = false;
15130
15131 if (agent_state)
15132 agent_choose_mode();
15133 }
15134
15135 dout(10) << mode << " got "
15136 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15137 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15138 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15139 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15140 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15141 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15142 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15143 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15144 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
15145 << dendl;
15146
15147 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
15148 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
15149 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
15150 !info.stats.dirty_stats_invalid) ||
15151 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
15152 !info.stats.omap_stats_invalid) ||
15153 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
15154 !info.stats.pin_stats_invalid) ||
15155 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
15156 !info.stats.hitset_stats_invalid) ||
15157 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
15158 !info.stats.hitset_bytes_stats_invalid) ||
15159 (scrub_cstat.sum.num_objects_manifest != info.stats.stats.sum.num_objects_manifest &&
15160 !info.stats.manifest_stats_invalid) ||
15161 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
15162 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
15163 osd->clog->error() << info.pgid << " " << mode
15164 << " : stat mismatch, got "
15165 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
15166 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
15167 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
15168 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
15169 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
15170 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
15171 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
15172 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
15173 << scrub_cstat.sum.num_objects_manifest << "/" << info.stats.stats.sum.num_objects_manifest << " manifest objects, "
15174 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
15175 ++scrubber.shallow_errors;
15176
15177 if (repair) {
15178 ++scrubber.fixed;
15179 info.stats.stats = scrub_cstat;
15180 info.stats.dirty_stats_invalid = false;
15181 info.stats.omap_stats_invalid = false;
15182 info.stats.hitset_stats_invalid = false;
15183 info.stats.hitset_bytes_stats_invalid = false;
15184 info.stats.pin_stats_invalid = false;
15185 info.stats.manifest_stats_invalid = false;
15186 publish_stats_to_osd();
15187 share_pg_info();
15188 }
15189 }
15190 // Clear object context cache to get repair information
15191 if (repair)
15192 object_contexts.clear();
15193 }
15194
15195 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
15196 {
15197 return osd->check_osdmap_full(missing_on);
15198 }
15199
15200 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpContext *ctx)
15201 {
15202 OpRequestRef op = ctx->op;
15203 // Only supports replicated pools
15204 ceph_assert(!pool.info.is_erasure());
15205 ceph_assert(is_primary());
15206
15207 dout(10) << __func__ << " " << soid
15208 << " peers osd.{" << acting_recovery_backfill << "}" << dendl;
15209
15210 if (!is_clean()) {
15211 block_for_clean(soid, op);
15212 return -EAGAIN;
15213 }
15214
15215 ceph_assert(!pg_log.get_missing().is_missing(soid));
15216 auto& oi = ctx->new_obs.oi;
15217 eversion_t v = oi.version;
15218
15219 missing_loc.add_missing(soid, v, eversion_t());
15220 if (primary_error(soid, v)) {
15221 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
15222 // XXX: If we knew that no down OSD could contain this object, it
15223 // would be nice to return EIO here.
15224 // If a "never fail" flag were available, rbd could use it to NOT
15225 // return EIO until the object is marked lost.
15226
15227 // Drop through to save this op in case an OSD comes up with the object.
15228 }
15229
15230 // Restart the op after object becomes readable again
15231 waiting_for_unreadable_object[soid].push_back(op);
15232 op->mark_delayed("waiting for missing object");
15233
15234 if (!eio_errors_to_process) {
15235 eio_errors_to_process = true;
15236 ceph_assert(is_clean());
15237 state_set(PG_STATE_REPAIR);
15238 queue_peering_event(
15239 PGPeeringEventRef(
15240 std::make_shared<PGPeeringEvent>(
15241 get_osdmap_epoch(),
15242 get_osdmap_epoch(),
15243 DoRecovery())));
15244 } else {
15245 // A prior error must have already cleared clean state and queued recovery
15246 // or a map change has triggered re-peering.
15247 // We deliberately do not kick recovery inline via maybe_kick_recovery(soid);
15248 dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
15249 }
15250
15251 return -EAGAIN;
15252 }
15253
15254 /*---SnapTrimmer Logging---*/
15255 #undef dout_prefix
15256 #define dout_prefix pg->gen_prefix(*_dout)
15257
15258 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
15259 {
15260 ldout(pg->cct, 20) << "enter " << state_name << dendl;
15261 }
15262
15263 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
15264 {
15265 ldout(pg->cct, 20) << "exit " << state_name << dendl;
15266 }
15267
15268 /*---SnapTrimmer states---*/
15269 #undef dout_prefix
15270 #define dout_prefix (context< SnapTrimmer >().pg->gen_prefix(*_dout) \
15271 << "SnapTrimmer state<" << get_state_name() << ">: ")
15272
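// Transition sketch for the states visible below: NotTrimming reacts to
// KickTrim and moves to WaitScrub (if a scrub is active) or Trimming;
// inside Trimming, WaitReservation advances to AwaitAsyncWork once
// SnapTrimReserved arrives, and AwaitAsyncWork either finishes the snap
// (back to NotTrimming) or issues trims and waits in WaitRepops.
// WaitRWLock covers the -ENOLCK path when an object's write lock is
// unavailable. (The state definitions live in PrimaryLogPG.h.)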
15273 /* NotTrimming */
15274 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
15275 : my_base(ctx),
15276 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
15277 {
15278 context< SnapTrimmer >().log_enter(state_name);
15279 }
15280
15281 void PrimaryLogPG::NotTrimming::exit()
15282 {
15283 context< SnapTrimmer >().log_exit(state_name, enter_time);
15284 }
15285
15286 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
15287 {
15288 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15289 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
15290
15291 if (!(pg->is_primary() && pg->is_active())) {
15292 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
15293 return discard_event();
15294 }
15295 if (!pg->is_clean() ||
15296 pg->snap_trimq.empty()) {
15297 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
15298 return discard_event();
15299 }
15300 if (pg->scrubber.active) {
15301 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
15302 return transit< WaitScrub >();
15303 } else {
15304 return transit< Trimming >();
15305 }
15306 }
15307
15308 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
15309 {
15310 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
15311 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
15312
15313 pending = nullptr;
15314 if (!context< SnapTrimmer >().can_trim()) {
15315 post_event(KickTrim());
15316 return transit< NotTrimming >();
15317 }
15318
15319 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
15320 ldout(pg->cct, 10) << "NotTrimming: trimming "
15321 << pg->snap_trimq.range_start()
15322 << dendl;
15323 return transit< AwaitAsyncWork >();
15324 }
15325
15326 /* AwaitAsyncWork */
15327 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
15328 : my_base(ctx),
15329 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
15330 {
15331 auto *pg = context< SnapTrimmer >().pg;
15332 context< SnapTrimmer >().log_enter(state_name);
15333 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
15334 pg->state_set(PG_STATE_SNAPTRIM);
15335 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
15336 pg->publish_stats_to_osd();
15337 }
15338
15339 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
15340 {
15341 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
15342 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
15343 auto &in_flight = context<Trimming>().in_flight;
15344 ceph_assert(in_flight.empty());
15345
15346 ceph_assert(pg->is_primary() && pg->is_active());
15347 if (!context< SnapTrimmer >().can_trim()) {
15348 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
15349 post_event(KickTrim());
15350 return transit< NotTrimming >();
15351 }
15352
15353 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
15354
15355 vector<hobject_t> to_trim;
15356 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
15357 to_trim.reserve(max);
15358 int r = pg->snap_mapper.get_next_objects_to_trim(
15359 snap_to_trim,
15360 max,
15361 &to_trim);
15362 if (r != 0 && r != -ENOENT) {
15363 lderr(pg->cct) << "get_next_objects_to_trim returned "
15364 << cpp_strerror(r) << dendl;
15365 ceph_abort_msg("get_next_objects_to_trim returned an invalid code");
15366 } else if (r == -ENOENT) {
15367 // Done!
15368 ldout(pg->cct, 10) << "got ENOENT" << dendl;
15369
15370 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
15371 << " to purged_snaps"
15372 << dendl;
15373 pg->info.purged_snaps.insert(snap_to_trim);
15374 pg->snap_trimq.erase(snap_to_trim);
15375 ldout(pg->cct, 10) << "purged_snaps now "
15376 << pg->info.purged_snaps << ", snap_trimq now "
15377 << pg->snap_trimq << dendl;
15378
15379 ObjectStore::Transaction t;
15380 pg->dirty_big_info = true;
15381 pg->write_if_dirty(t);
15382 int tr = pg->osd->store->queue_transaction(pg->ch, std::move(t), NULL);
15383 ceph_assert(tr == 0);
15384
15385 pg->share_pg_info();
15386 post_event(KickTrim());
15387 return transit< NotTrimming >();
15388 }
15389 ceph_assert(!to_trim.empty());
15390
15391 for (auto &&object: to_trim) {
15392 // Get next
15393 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
15394 OpContextUPtr ctx;
15395 int error = pg->trim_object(in_flight.empty(), object, &ctx);
15396 if (error) {
15397 if (error == -ENOLCK) {
15398 ldout(pg->cct, 10) << "could not get write lock on obj "
15399 << object << dendl;
15400 } else {
15401 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
15402 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
15403 }
15404 if (!in_flight.empty()) {
15405 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
15406 return transit< WaitRepops >();
15407 }
15408 if (error == -ENOLCK) {
15409 ldout(pg->cct, 10) << "waiting for it to clear"
15410 << dendl;
15411 return transit< WaitRWLock >();
15412 } else {
15413 return transit< NotTrimming >();
15414 }
15415 }
15416
15417 in_flight.insert(object);
15418 ctx->register_on_success(
15419 [pg, object, &in_flight]() {
15420 ceph_assert(in_flight.find(object) != in_flight.end());
15421 in_flight.erase(object);
15422 if (in_flight.empty()) {
15423 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
15424 pg->snap_trimmer_machine.process_event(Reset());
15425 } else {
15426 pg->snap_trimmer_machine.process_event(RepopsComplete());
15427 }
15428 }
15429 });
15430
15431 pg->simple_opc_submit(std::move(ctx));
15432 }
15433
15434 return transit< WaitRepops >();
15435 }
15436
15437 void PrimaryLogPG::setattr_maybe_cache(
15438 ObjectContextRef obc,
15439 PGTransaction *t,
15440 const string &key,
15441 bufferlist &val)
15442 {
15443 t->setattr(obc->obs.oi.soid, key, val);
15444 }
15445
15446 void PrimaryLogPG::setattrs_maybe_cache(
15447 ObjectContextRef obc,
15448 PGTransaction *t,
15449 map<string, bufferlist> &attrs)
15450 {
15451 t->setattrs(obc->obs.oi.soid, attrs);
15452 }
15453
15454 void PrimaryLogPG::rmattr_maybe_cache(
15455 ObjectContextRef obc,
15456 PGTransaction *t,
15457 const string &key)
15458 {
15459 t->rmattr(obc->obs.oi.soid, key);
15460 }
15461
15462 int PrimaryLogPG::getattr_maybe_cache(
15463 ObjectContextRef obc,
15464 const string &key,
15465 bufferlist *val)
15466 {
15467 if (pool.info.is_erasure()) {
15468 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
15469 if (i != obc->attr_cache.end()) {
15470 if (val)
15471 *val = i->second;
15472 return 0;
15473 } else {
15474 return -ENODATA;
15475 }
15476 }
15477 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
15478 }
15479
15480 int PrimaryLogPG::getattrs_maybe_cache(
15481 ObjectContextRef obc,
15482 map<string, bufferlist> *out)
15483 {
15484 int r = 0;
15485 ceph_assert(out);
15486 if (pool.info.is_erasure()) {
15487 *out = obc->attr_cache;
15488 } else {
15489 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
15490 }
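  // User xattrs are stored on disk with a leading '_' prefix; keep only
  // those and strip the prefix, which also filters out internal attrs
  // such as the bare "_" object-info key (size == 1) and unprefixed keys
  // like the snapset attr.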
15491 map<string, bufferlist> tmp;
15492 for (map<string, bufferlist>::iterator i = out->begin();
15493 i != out->end();
15494 ++i) {
15495 if (i->first.size() > 1 && i->first[0] == '_')
15496 tmp[i->first.substr(1, i->first.size())].claim(i->second);
15497 }
15498 tmp.swap(*out);
15499 return r;
15500 }
15501
15502 bool PrimaryLogPG::check_failsafe_full() {
15503 return osd->check_failsafe_full(get_dpp());
15504 }
15505
15506 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
15507 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
15508
15509 #ifdef PG_DEBUG_REFS
15510 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
15511 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
15512 #endif
15513
15514 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
15515 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }