// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// virtual destructor so implementations can be deleted via base pointer
  ~CopyCallback() override {}
};
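
/*
 * Usage sketch (hypothetical subclass, not part of this file): a concrete
 * CopyCallback unpacks the (return code, CopyResults*) tuple in finish().
 * CopyFromCallback below is the real in-tree implementation.
 *
 *   struct MyCopyCallback : public PrimaryLogPG::CopyCallback {
 *     void finish(PrimaryLogPG::CopyCallbackResults results) override {
 *       int r = results.get<0>();
 *       PrimaryLogPG::CopyResults *res = results.get<1>();
 *       if (r == 0) {
 *         // inspect res->object_size, res->user_version, ...
 *       }
 *       // per the contract above, the callback owns res
 *       delete res;
 *     }
 *   };
 */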

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}
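
/*
 * "Blessing" usage sketch (hypothetical caller, not from this file): wrap a
 * raw completion so that, when it fires, the PG lock is taken and the
 * callback is silently dropped if the PG has reset since the epoch captured
 * at wrap time.
 *
 *   Context *raw = new C_MyCompletion(...);
 *   Context *safe = pg->bless_context(raw);
 *   t->register_on_applied(safe);  // safe to fire even after a PG reset
 */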

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}
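
/*
 * Async-read flow, as implemented above: op handlers queue
 * (offset, length, flags) extents on pending_async_reads;
 * start_async_reads() hands the whole batch to the PG backend with a
 * single OnReadComplete, and finish_read() re-runs execute_ctx() once
 * every in-flight read has completed. A hypothetical handler would
 * queue a read roughly like this:
 *
 *   ctx->pending_async_reads.push_back(
 *     make_pair(
 *       boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
 *       make_pair(&osd_op.outdata, new SomeCompletion(...))));
 */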

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};

// ======================
// PGBackend::Listener

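/*
 * on_local_recover: invoked by the PG backend once a pushed/pulled object
 * (or delete) has been queued to the local store. It re-adds snap-mapper
 * entries for clones, handles the LOST_REVERT race by rewriting the
 * object_info to the revert event version, rolls the log forward past the
 * recovered version if needed, and on the primary marks the object as
 * recovered and requeues any ops waiting on it. Note the three completions
 * registered on the transaction: an on-applied-sync ondisk-write-unlock,
 * an on-applied notification, and an on-commit notification carrying
 * last_complete.
 */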
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race. If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  if (!is_delete) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}

void PrimaryLogPG::backfill_add_missing(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
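
/*
 * Worked example (illustrative values, not taken from this file): with
 * osd_max_pg_log_entries = 10000 and
 * osd_force_recovery_pg_log_entries_factor = 1.3, the oldest missing
 * object (across the primary and all actingbackfill peers) is kicked
 * into recovery once the PG log grows past ~13000 entries, bounding how
 * long the log can keep growing while objects remain missing.
 */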

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}
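
/*
 * Filter naming sketch (hypothetical names): the built-in types are
 * "parent" and "plain"; any other type must look like "<class>.<filter>",
 * e.g. a type of "hello.sayhi" opens object class "hello" and asks it for
 * its registered filter "sayhi", after which init() consumes the rest of
 * the encoded parameters.
 */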


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const auto &missing = pg_log.get_missing();
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get()); // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << prefix;
  return -EINVAL;
}

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

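        // Merge the backend listing (sentries) with this PG's missing set
        // so objects that are temporarily missing locally still show up;
        // each iteration advances whichever cursor holds the smaller object.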
        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = pg_log.get_log().approx_size() - target;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
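
/*
 * Worked example (illustrative numbers): with target = 3000 and an
 * approximate log size of 3500, num_to_trim = 500. If 500 is below
 * osd_pg_log_trim_min nothing is trimmed this round; otherwise
 * pg_trim_to advances past the 500 oldest entries, clamped to
 * limit = MIN(min_last_complete_ondisk, can_rollback_to).
 */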

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}
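
/*
 * Note the clamping above: the acked [begin, end) range is intersected
 * with the PG's current hobject_t span, presumably so an ack computed
 * against a stale (e.g. pre-split) range never releases backoffs that
 * now belong to a different PG. E.g. if this PG now spans [B, E) and
 * the client acks [A, F), only [B, E) is released.
 */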

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
void PrimaryLogPG::do_op(OpRequestRef& op)
{
  FUNCTRACE();
  // NOTE: take a non-const pointer here; we must be careful not to
  // change anything that will break other reads on m (operator<<).
  MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  if (m->finish_decode()) {
    op->reset_desc();  // for TrackedOp
    m->clear_payload();
  }

  dout(20) << __func__ << ": op " << *m << dendl;

  hobject_t head = m->get_hobj();
  head.snap = CEPH_NOSNAP;

  if (!info.pgid.pgid.contains(
        info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
    derr << __func__ << " " << info.pgid.pgid << " does not contain "
         << head << " pg_num " << pool.info.get_pg_num() << " hash "
         << std::hex << head.get_hash() << std::dec << dendl;
    osd->clog->warn() << info.pgid.pgid << " does not contain " << head
                      << " op " << *m;
    assert(!cct->_conf->osd_debug_misdirected_ops);
    return;
  }

  bool can_backoff =
    m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
  SessionRef session;
  if (can_backoff) {
    session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session.get()) {
      dout(10) << __func__ << " no session" << dendl;
      return;
    }
    session->put();  // get_priv() takes a ref, and so does the intrusive_ptr

    if (session->check_backoff(cct, info.pgid, head, m)) {
      return;
    }
  }

  if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
    // not implemented.
    dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
    osd->reply_op_error(op, -EINVAL);
    return;
  }

  if (op->rmw_flags == 0) {
    int r = osd->osd->init_op_flags(op);
    if (r) {
      osd->reply_op_error(op, r);
      return;
    }
  }

  if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
                         CEPH_OSD_FLAG_LOCALIZE_READS)) &&
      op->may_read() &&
      !(op->may_write() || op->may_cache())) {
    // balanced reads; any replica will do
    if (!(is_primary() || is_replica())) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  } else {
    // normal case; must be primary
    if (!is_primary()) {
      osd->handle_misdirected_op(this, op);
      return;
    }
  }

  if (!op_has_sufficient_caps(op)) {
    osd->reply_op_error(op, -EPERM);
    return;
  }

  if (op->includes_pg_op()) {
    return do_pg_op(op);
  }

  // object name too long?
  if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op name is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
    dout(4) << "do_op locator is longer than "
            << cct->_conf->osd_max_object_name_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }
  if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
    dout(4) << "do_op namespace is longer than "
            << cct->_conf->osd_max_object_namespace_len
            << " bytes" << dendl;
    osd->reply_op_error(op, -ENAMETOOLONG);
    return;
  }

  if (int r = osd->store->validate_hobject_key(head)) {
    dout(4) << "do_op object " << head << " invalid for backing store: "
            << r << dendl;
    osd->reply_op_error(op, r);
    return;
  }

  // blacklisted?
  if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
    dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
    osd->reply_op_error(op, -EBLACKLISTED);
    return;
  }

  // order this op as a write?
  bool write_ordered = op->rwordered();

1971 // discard due to cluster full transition? (we discard any op that
1972 // originates before the cluster or pool is marked full; the client
1973 // will resend after the full flag is removed or if they expect the
1974 // op to succeed despite being full). The exceptions are FULL_FORCE
1975 // and FULL_TRY ops, which there is no reason to discard because they
1976 // bypass all full checks anyway. If this op isn't write-ordered,
1977 // we skip this check.
1978 // FIXME: we exclude mds writes for now.
1979 if (write_ordered && !(m->get_source().is_mds() ||
1980 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1981 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1982 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1983 dout(10) << __func__ << " discarding op sent before full " << m << " "
1984 << *m << dendl;
1985 return;
1986 }
1987 // The MDS should have stopped writing before this point. We can't
1988 // allow the OSD to become non-startable even if the MDS could still
1989 // be writing as part of file removals.
1990 ostringstream ss;
1991 if (write_ordered && osd->check_failsafe_full(ss)) {
1992 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1993 << ss.str()
1994 << dendl;
1995 return;
1996 }
1997 int64_t poolid = get_pgid().pool();
1998 if (op->may_write()) {
1999
2000 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2001 if (!pi) {
2002 return;
2003 }
2004
2005 // invalid?
2006 if (m->get_snapid() != CEPH_NOSNAP) {
2007 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2008 osd->reply_op_error(op, -EINVAL);
2009 return;
2010 }
2011
2012 // too big?
2013 if (cct->_conf->osd_max_write_size &&
2014 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2015 // journal can't hold commit!
2016 derr << "do_op msg data len " << m->get_data_len()
2017 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2018 << " on " << *m << dendl;
2019 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2020 return;
2021 }
2022 }
2023
2024 dout(10) << "do_op " << *m
2025 << (op->may_write() ? " may_write" : "")
2026 << (op->may_read() ? " may_read" : "")
2027 << (op->may_cache() ? " may_cache" : "")
2028 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2029 << " flags " << ceph_osd_flag_string(m->get_flags())
2030 << dendl;
2031
2032 // missing object?
2033 if (is_unreadable_object(head)) {
2034 if (!is_primary()) {
2035 osd->reply_op_error(op, -EAGAIN);
2036 return;
2037 }
2038 if (can_backoff &&
2039 (g_conf->osd_backoff_on_degraded ||
2040 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2041 add_backoff(session, head, head);
2042 maybe_kick_recovery(head);
2043 } else {
2044 wait_for_unreadable_object(head, op);
2045 }
2046 return;
2047 }
2048
2049 // degraded object?
2050 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2051 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2052 add_backoff(session, head, head);
2053 maybe_kick_recovery(head);
2054 } else {
2055 wait_for_degraded_object(head, op);
2056 }
2057 return;
2058 }
2059
2060 if (write_ordered &&
2061 scrubber.write_blocked_by_scrub(head)) {
2062 dout(20) << __func__ << ": waiting for scrub" << dendl;
2063 waiting_for_scrub.push_back(op);
2064 op->mark_delayed("waiting for scrub");
2065 return;
2066 }
2067
2068 // blocked on snap?
2069 map<hobject_t, snapid_t>::iterator blocked_iter =
2070 objects_blocked_on_degraded_snap.find(head);
2071 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2072 hobject_t to_wait_on(head);
2073 to_wait_on.snap = blocked_iter->second;
2074 wait_for_degraded_object(to_wait_on, op);
2075 return;
2076 }
2077 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2078 objects_blocked_on_snap_promotion.find(head);
2079 if (write_ordered &&
2080 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2081 wait_for_blocked_object(
2082 blocked_snap_promote_iter->second->obs.oi.soid,
2083 op);
2084 return;
2085 }
2086 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2087 block_write_on_full_cache(head, op);
2088 return;
2089 }
2090
2091 // missing snapdir?
2092 hobject_t snapdir = head.get_snapdir();
2093
2094 if (is_unreadable_object(snapdir)) {
2095 wait_for_unreadable_object(snapdir, op);
2096 return;
2097 }
2098
2099 // degraded object?
2100 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2101 wait_for_degraded_object(snapdir, op);
2102 return;
2103 }
2104
2105 // dup/resent?
2106 if (op->may_write() || op->may_cache()) {
2107 // warning: we will get back *a* request for this reqid, but not
2108 // necessarily the most recent. this happens with flush and
2109 // promote ops, but we can't possibly have both in our log while
2110 // the original request is still not stable on disk, so it doesn't
2111 // matter which one we get. (dup-check shape is sketched after do_op.)
2112 eversion_t version;
2113 version_t user_version;
2114 int return_code = 0;
2115 bool got = check_in_progress_op(
2116 m->get_reqid(), &version, &user_version, &return_code);
2117 if (got) {
2118 dout(3) << __func__ << " dup " << m->get_reqid()
2119 << " version " << version << dendl;
2120 if (already_complete(version)) {
2121 osd->reply_op_error(op, return_code, version, user_version);
2122 } else {
2123 dout(10) << " waiting for " << version << " to commit" << dendl;
2124 // always queue ondisk waiters, so that we can requeue if needed
2125 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2126 op->mark_delayed("waiting for ondisk");
2127 }
2128 return;
2129 }
2130 }
2131
2132 ObjectContextRef obc;
2133 bool can_create = op->may_write() || op->may_cache();
2134 hobject_t missing_oid;
2135 const hobject_t& oid = m->get_hobj();
2136
2137 // io blocked on obc?
2138 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2139 maybe_await_blocked_snapset(oid, op)) {
2140 return;
2141 }
2142
2143 int r = find_object_context(
2144 oid, &obc, can_create,
2145 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2146 &missing_oid);
2147
2148 if (r == -EAGAIN) {
2149 // If we're not the primary for this PG, the client ends up with
2150 // -EAGAIN; otherwise, we have to wait for the object.
2151 if (is_primary()) {
2152 // missing the specific snap we need; requeue and wait.
2153 assert(!op->may_write()); // only happens on a read/cache
2154 wait_for_unreadable_object(missing_oid, op);
2155 return;
2156 }
2157 } else if (r == 0) {
2158 if (is_unreadable_object(obc->obs.oi.soid)) {
2159 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2160 << " is unreadable, waiting" << dendl;
2161 wait_for_unreadable_object(obc->obs.oi.soid, op);
2162 return;
2163 }
2164
2165 // degraded object? (the check above was for head; this could be a clone)
2166 if (write_ordered &&
2167 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2168 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2169 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2170 << " is degraded, waiting" << dendl;
2171 wait_for_degraded_object(obc->obs.oi.soid, op);
2172 return;
2173 }
2174 }
2175
2176 bool in_hit_set = false;
2177 if (hit_set) {
2178 if (obc.get()) {
2179 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2180 in_hit_set = true;
2181 } else {
2182 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2183 in_hit_set = true;
2184 }
2185 if (!op->hitset_inserted) {
2186 hit_set->insert(oid);
2187 op->hitset_inserted = true;
2188 if (hit_set->is_full() ||
2189 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2190 hit_set_persist();
2191 }
2192 }
2193 }
2194
2195 if (agent_state) {
2196 if (agent_choose_mode(false, op))
2197 return;
2198 }
2199
2200 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2201 if (maybe_handle_manifest(op,
2202 write_ordered,
2203 obc))
2204 return;
2205 }
2206
2207 if (maybe_handle_cache(op,
2208 write_ordered,
2209 obc,
2210 r,
2211 missing_oid,
2212 false,
2213 in_hit_set))
2214 return;
2215
2216 if (r && (r != -ENOENT || !obc)) {
2217 // copy the reqids for copy get on ENOENT
2218 if (r == -ENOENT &&
2219 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2220 fill_in_copy_get_noent(op, oid, m->ops[0]);
2221 return;
2222 }
2223 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2224 if (op->may_write() &&
2225 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2226 record_write_error(op, oid, nullptr, r);
2227 } else {
2228 osd->reply_op_error(op, r);
2229 }
2230 return;
2231 }
2232
2233 // make sure locator is consistent
2234 object_locator_t oloc(obc->obs.oi.soid);
2235 if (m->get_object_locator() != oloc) {
2236 dout(10) << " provided locator " << m->get_object_locator()
2237 << " != object's " << obc->obs.oi.soid << dendl;
2238 osd->clog->warn() << "bad locator " << m->get_object_locator()
2239 << " on object " << oloc
2240 << " op " << *m;
2241 }
2242
2243 // io blocked on obc?
2244 if (obc->is_blocked() &&
2245 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2246 wait_for_blocked_object(obc->obs.oi.soid, op);
2247 return;
2248 }
2249
2250 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2251
2252 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2253 OSDOp& osd_op = *p;
2254
2255 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2256 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2257 m->get_snapid() != CEPH_SNAPDIR) {
2258 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2259 osd->reply_op_error(op, -EINVAL);
2260 return;
2261 }
2262 }
2263
2264 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2265
2266 if (!obc->obs.exists)
2267 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2268
2269 /* Due to obc caching, we might have a cached non-existent snapset_obc
2270 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2271 * do_op pipeline make decisions based on whether snapset_obc is
2272 * populated.
2273 */
2274 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2275 ctx->snapset_obc = ObjectContextRef();
2276
2277 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2278 dout(20) << __func__ << ": skipping rw locks" << dendl;
2279 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2280 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2281
2282 // verify there is in fact a flush in progress
2283 // FIXME: we could make this a stronger test.
2284 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2285 if (p == flush_ops.end()) {
2286 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2287 reply_ctx(ctx, -EINVAL);
2288 return;
2289 }
2290 } else if (!get_rw_locks(write_ordered, ctx)) {
2291 dout(20) << __func__ << " waiting for rw locks " << dendl;
2292 op->mark_delayed("waiting for rw locks");
2293 close_op_ctx(ctx);
2294 return;
2295 }
2296 dout(20) << __func__ << " obc " << *obc << dendl;
2297
2298 if (r) {
2299 dout(20) << __func__ << " returned an error: " << r << dendl;
2300 close_op_ctx(ctx);
2301 if (op->may_write() &&
2302 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2303 record_write_error(op, oid, nullptr, r);
2304 } else {
2305 osd->reply_op_error(op, r);
2306 }
2307 return;
2308 }
2309
2310 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2311 ctx->ignore_cache = true;
2312 }
2313
2314 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2315 // This object is lost. Reading from it returns an error.
2316 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2317 << " is lost" << dendl;
2318 reply_ctx(ctx, -ENFILE);
2319 return;
2320 }
2321 if (!op->may_write() &&
2322 !op->may_cache() &&
2323 (!obc->obs.exists ||
2324 ((m->get_snapid() != CEPH_SNAPDIR) &&
2325 obc->obs.oi.is_whiteout()))) {
2326 // copy the reqids for copy get on ENOENT
2327 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2328 fill_in_copy_get_noent(op, oid, m->ops[0]);
2329 close_op_ctx(ctx);
2330 return;
2331 }
2332 reply_ctx(ctx, -ENOENT);
2333 return;
2334 }
2335
2336 op->mark_started();
2337
2338 execute_ctx(ctx);
2339 utime_t prepare_latency = ceph_clock_now();
2340 prepare_latency -= op->get_dequeued_time();
2341 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2342 if (op->may_read() && op->may_write()) {
2343 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2344 } else if (op->may_read()) {
2345 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2346 } else if (op->may_write() || op->may_cache()) {
2347 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2348 }
2349
2350 // force recovery of the oldest missing object if too many logs
2351 maybe_force_recovery();
2352 }
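// do_op's dup/resent path above relies on check_in_progress_op(): if the
// client reqid was already applied we reply immediately, otherwise the op is
// parked on waiting_for_ondisk. A minimal sketch of that dup-detection shape
// (DupIndex and its reqid pair are hypothetical; the real lookup walks the
// pg log and in-progress repops):
namespace dup_op_sketch {
  using ReqId = std::pair<uint64_t, uint64_t>;  // (client id, tid) stand-in

  struct Result {
    uint64_t version = 0;
    int return_code = 0;
  };

  struct DupIndex {
    std::map<ReqId, Result> seen;  // hypothetical completed-op index
    // Returns true and fills *out if this reqid was already applied; the
    // caller can then resend the cached reply instead of re-executing.
    bool check(const ReqId& rid, Result* out) const {
      auto it = seen.find(rid);
      if (it == seen.end())
        return false;
      *out = it->second;
      return true;
    }
  };
}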
2353
2354 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2355 OpRequestRef op,
2356 bool write_ordered,
2357 ObjectContextRef obc)
2358 {
2359 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2360 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2361 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2362 return cache_result_t::NOOP;
2363 }
2364
2365 if (obc)
2366 dout(10) << __func__ << " " << obc->obs.oi << " "
2367 << (obc->obs.exists ? "exists" : "DNE")
2368 << dendl;
2369
2370 // if it is write-ordered and blocked, stop now
2371 if (obc.get() && obc->is_blocked() && write_ordered) {
2372 // we're already doing something with this object
2373 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2374 return cache_result_t::NOOP;
2375 }
2376
2377 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2378 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2379 OSDOp& osd_op = *p;
2380 ceph_osd_op& op = osd_op.op;
2381 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2382 return cache_result_t::NOOP;
2383 }
2384 }
2385
2386 switch (obc->obs.oi.manifest.type) {
2387 case object_manifest_t::TYPE_REDIRECT:
2388 if (op->may_write() || write_ordered) {
2389 do_proxy_write(op, obc->obs.oi.soid, obc);
2390 } else {
2391 do_proxy_read(op, obc);
2392 }
2393 return cache_result_t::HANDLED_PROXY;
2394 case object_manifest_t::TYPE_CHUNKED:
2395 default:
2396 assert(0 == "unrecognized manifest type");
2397 }
2398
2399 return cache_result_t::NOOP;
2400 }
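// The manifest dispatch above reduces to: REDIRECT manifests get proxied
// (writes when the op is write-ordered, reads otherwise); TYPE_CHUNKED is
// unhandled in this release. A compressed sketch of that decision as a pure
// function (the enums are illustrative):
namespace manifest_sketch {
  enum class ManifestType { REDIRECT, CHUNKED };
  enum class Action { PROXY_WRITE, PROXY_READ, UNSUPPORTED };

  Action choose(ManifestType t, bool may_write, bool write_ordered) {
    if (t == ManifestType::REDIRECT)
      return (may_write || write_ordered) ? Action::PROXY_WRITE
                                          : Action::PROXY_READ;
    return Action::UNSUPPORTED;  // TYPE_CHUNKED asserts out above
  }
}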
2401
2402 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2403 MOSDOpReply *orig_reply, int r)
2404 {
2405 dout(20) << __func__ << " r=" << r << dendl;
2406 assert(op->may_write());
2407 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2408 mempool::osd_pglog::list<pg_log_entry_t> entries;
2409 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2410 get_next_version(), eversion_t(), 0,
2411 reqid, utime_t(), r));
2412
2413 struct OnComplete {
2414 PrimaryLogPG *pg;
2415 OpRequestRef op;
2416 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2417 int r;
2418 OnComplete(
2419 PrimaryLogPG *pg,
2420 OpRequestRef op,
2421 MOSDOpReply *orig_reply,
2422 int r)
2423 : pg(pg), op(op),
2424 orig_reply(orig_reply, false /* take over ref */), r(r)
2425 {}
2426 void operator()() {
2427 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2428 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2429 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2430 MOSDOpReply *reply = orig_reply.detach();
2431 if (reply == nullptr) {
2432 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2433 flags, true);
2434 }
2435 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2436 pg->osd->send_message_osd_client(reply, m->get_connection());
2437 }
2438 };
2439
2440 ObcLockManager lock_manager;
2441 submit_log_entries(
2442 entries,
2443 std::move(lock_manager),
2444 boost::optional<std::function<void(void)> >(
2445 OnComplete(this, op, orig_reply, r)),
2446 op,
2447 r);
2448 }
2449
2450 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2451 OpRequestRef op,
2452 bool write_ordered,
2453 ObjectContextRef obc,
2454 int r, hobject_t missing_oid,
2455 bool must_promote,
2456 bool in_hit_set,
2457 ObjectContextRef *promote_obc)
2458 {
2459 // return quickly if caching is not enabled
2460 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2461 return cache_result_t::NOOP;
2462
2463 if (op &&
2464 op->get_req() &&
2465 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2466 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2467 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2468 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2469 return cache_result_t::NOOP;
2470 }
2471
2472 must_promote = must_promote || op->need_promote();
2473
2474 if (obc)
2475 dout(25) << __func__ << " " << obc->obs.oi << " "
2476 << (obc->obs.exists ? "exists" : "DNE")
2477 << " missing_oid " << missing_oid
2478 << " must_promote " << (int)must_promote
2479 << " in_hit_set " << (int)in_hit_set
2480 << dendl;
2481 else
2482 dout(25) << __func__ << " (no obc)"
2483 << " missing_oid " << missing_oid
2484 << " must_promote " << (int)must_promote
2485 << " in_hit_set " << (int)in_hit_set
2486 << dendl;
2487
2488 // if it is write-ordered and blocked, stop now
2489 if (obc.get() && obc->is_blocked() && write_ordered) {
2490 // we're already doing something with this object
2491 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2492 return cache_result_t::NOOP;
2493 }
2494
2495 if (r == -ENOENT && missing_oid == hobject_t()) {
2496 // we know this object is logically absent (e.g., an undefined clone)
2497 return cache_result_t::NOOP;
2498 }
2499
2500 if (obc.get() && obc->obs.exists) {
2501 osd->logger->inc(l_osd_op_cache_hit);
2502 return cache_result_t::NOOP;
2503 }
2504 if (!is_primary()) {
2505 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2506 osd->reply_op_error(op, -EAGAIN);
2507 return cache_result_t::REPLIED_WITH_EAGAIN;
2508 }
2509
2510 if (missing_oid == hobject_t() && obc.get()) {
2511 missing_oid = obc->obs.oi.soid;
2512 }
2513
2514 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2515 const object_locator_t oloc = m->get_object_locator();
2516
2517 if (op->need_skip_handle_cache()) {
2518 return cache_result_t::NOOP;
2519 }
2520
2521 // older versions do not proxy the feature bits.
2522 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2523 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2524 OpRequestRef promote_op;
2525
2526 switch (pool.info.cache_mode) {
2527 case pg_pool_t::CACHEMODE_WRITEBACK:
2528 if (agent_state &&
2529 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2530 if (!op->may_write() && !op->may_cache() &&
2531 !write_ordered && !must_promote) {
2532 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2533 do_proxy_read(op);
2534 return cache_result_t::HANDLED_PROXY;
2535 }
2536 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2537 block_write_on_full_cache(missing_oid, op);
2538 return cache_result_t::BLOCKED_FULL;
2539 }
2540
2541 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2542 promote_object(obc, missing_oid, oloc, op, promote_obc);
2543 return cache_result_t::BLOCKED_PROMOTE;
2544 }
2545
2546 if (op->may_write() || op->may_cache()) {
2547 if (can_proxy_write) {
2548 do_proxy_write(op, missing_oid);
2549 } else {
2550 // promote if we can't proxy the write
2551 promote_object(obc, missing_oid, oloc, op, promote_obc);
2552 return cache_result_t::BLOCKED_PROMOTE;
2553 }
2554
2555 // Promote too?
2556 if (!op->need_skip_promote() &&
2557 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2558 pool.info.min_write_recency_for_promote,
2559 OpRequestRef(),
2560 promote_obc)) {
2561 return cache_result_t::BLOCKED_PROMOTE;
2562 }
2563 return cache_result_t::HANDLED_PROXY;
2564 } else {
2565 do_proxy_read(op);
2566
2567 // Avoid duplicate promotion
2568 if (obc.get() && obc->is_blocked()) {
2569 if (promote_obc)
2570 *promote_obc = obc;
2571 return cache_result_t::BLOCKED_PROMOTE;
2572 }
2573
2574 // Promote too?
2575 if (!op->need_skip_promote()) {
2576 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2577 pool.info.min_read_recency_for_promote,
2578 promote_op, promote_obc);
2579 }
2580
2581 return cache_result_t::HANDLED_PROXY;
2582 }
2583 assert(0 == "unreachable");
2584 return cache_result_t::NOOP;
2585
2586 case pg_pool_t::CACHEMODE_FORWARD:
2587 // FIXME: this mode allows requests to be reordered.
2588 do_cache_redirect(op);
2589 return cache_result_t::HANDLED_REDIRECT;
2590
2591 case pg_pool_t::CACHEMODE_READONLY:
2592 // TODO: clean this case up
2593 if (!obc.get() && r == -ENOENT) {
2594 // we don't have the object and op's a read
2595 promote_object(obc, missing_oid, oloc, op, promote_obc);
2596 return cache_result_t::BLOCKED_PROMOTE;
2597 }
2598 if (!r) { // it must be a write
2599 do_cache_redirect(op);
2600 return cache_result_t::HANDLED_REDIRECT;
2601 }
2602 // crap, there was a failure of some kind
2603 return cache_result_t::NOOP;
2604
2605 case pg_pool_t::CACHEMODE_READFORWARD:
2606 // Do writeback to the cache tier for writes
2607 if (op->may_write() || write_ordered || must_promote) {
2608 if (agent_state &&
2609 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2610 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2611 block_write_on_full_cache(missing_oid, op);
2612 return cache_result_t::BLOCKED_FULL;
2613 }
2614 promote_object(obc, missing_oid, oloc, op, promote_obc);
2615 return cache_result_t::BLOCKED_PROMOTE;
2616 }
2617
2618 // It is a read; forward (redirect) it to the base pool
2619 do_cache_redirect(op);
2620 return cache_result_t::HANDLED_REDIRECT;
2621
2622 case pg_pool_t::CACHEMODE_PROXY:
2623 if (!must_promote) {
2624 if (op->may_write() || op->may_cache() || write_ordered) {
2625 if (can_proxy_write) {
2626 do_proxy_write(op, missing_oid);
2627 return cache_result_t::HANDLED_PROXY;
2628 }
2629 } else {
2630 do_proxy_read(op);
2631 return cache_result_t::HANDLED_PROXY;
2632 }
2633 }
2634 // ugh, we're forced to promote.
2635 if (agent_state &&
2636 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2637 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2638 block_write_on_full_cache(missing_oid, op);
2639 return cache_result_t::BLOCKED_FULL;
2640 }
2641 promote_object(obc, missing_oid, oloc, op, promote_obc);
2642 return cache_result_t::BLOCKED_PROMOTE;
2643
2644 case pg_pool_t::CACHEMODE_READPROXY:
2645 // Do writeback to the cache tier for writes
2646 if (op->may_write() || write_ordered || must_promote) {
2647 if (agent_state &&
2648 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2649 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2650 block_write_on_full_cache(missing_oid, op);
2651 return cache_result_t::BLOCKED_FULL;
2652 }
2653 promote_object(obc, missing_oid, oloc, op, promote_obc);
2654 return cache_result_t::BLOCKED_PROMOTE;
2655 }
2656
2657 // It is a read; proxy it to the base pool
2658 do_proxy_read(op);
2659 return cache_result_t::HANDLED_PROXY;
2660
2661 default:
2662 assert(0 == "unrecognized cache_mode");
2663 }
2664 return cache_result_t::NOOP;
2665 }
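// maybe_handle_cache_detail above is one large per-cache-mode switch. As a
// much-reduced sketch, here is the skeleton of the WRITEBACK leg only,
// deliberately omitting hit_set/recency handling and the skip-promote flags
// (Outcome and the booleans are illustrative):
namespace cache_mode_sketch {
  enum class Outcome { PROXY, BLOCK_FULL, PROMOTE };

  Outcome writeback(bool cache_full, bool is_write, bool can_proxy_write,
                    bool must_promote) {
    if (cache_full)
      return is_write ? Outcome::BLOCK_FULL : Outcome::PROXY;
    if (must_promote)
      return Outcome::PROMOTE;
    if (is_write && !can_proxy_write)
      return Outcome::PROMOTE;   // can't proxy the write, so promote instead
    return Outcome::PROXY;       // proxy now; a promotion may still be queued
  }
}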
2666
2667 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2668 const hobject_t& missing_oid,
2669 const object_locator_t& oloc,
2670 bool in_hit_set,
2671 uint32_t recency,
2672 OpRequestRef promote_op,
2673 ObjectContextRef *promote_obc)
2674 {
2675 dout(20) << __func__ << " missing_oid " << missing_oid
2676 << " in_hit_set " << in_hit_set << dendl;
2677
2678 switch (recency) {
2679 case 0:
2680 break;
2681 case 1:
2682 // Check if in the current hit set
2683 if (in_hit_set) {
2684 break;
2685 } else {
2686 // not promoting
2687 return false;
2688 }
2689 break;
2690 default:
2691 {
2692 unsigned count = (int)in_hit_set;
2693 if (count) {
2694 // Check if in other hit sets
2695 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2696 for (map<time_t,HitSetRef>::reverse_iterator itor =
2697 agent_state->hit_set_map.rbegin();
2698 itor != agent_state->hit_set_map.rend();
2699 ++itor) {
2700 if (!itor->second->contains(oid)) {
2701 break;
2702 }
2703 ++count;
2704 if (count >= recency) {
2705 break;
2706 }
2707 }
2708 }
2709 if (count >= recency) {
2710 break;
2711 }
2712 return false; // not promoting
2713 }
2714 break;
2715 }
2716
2717 if (osd->promote_throttle()) {
2718 dout(10) << __func__ << " promote throttled" << dendl;
2719 return false;
2720 }
2721 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2722 return true;
2723 }
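// The recency test above wants the object present in the `recency` most
// recent hit sets, counting back from the newest and stopping at the first
// miss. A standalone sketch of that consecutive-count rule (the newest-first
// vector layout is illustrative):
namespace recency_sketch {
  // hit[0] is the current hit set; larger indices are older archived sets.
  bool should_promote(const std::vector<bool>& hit, uint32_t recency) {
    if (recency == 0)
      return true;               // recency 0: promote on any access
    uint32_t count = 0;
    for (bool h : hit) {
      if (!h)
        break;                   // a gap ends the consecutive run
      if (++count >= recency)
        return true;
    }
    return false;
  }
}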
2724
2725 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2726 {
2727 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2728 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2729 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2730 get_osdmap()->get_epoch(), flags, false);
2731 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2732 reply->set_redirect(redir);
2733 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2734 << op << dendl;
2735 m->get_connection()->send_message(reply);
2736 return;
2737 }
2738
2739 struct C_ProxyRead : public Context {
2740 PrimaryLogPGRef pg;
2741 hobject_t oid;
2742 epoch_t last_peering_reset;
2743 ceph_tid_t tid;
2744 PrimaryLogPG::ProxyReadOpRef prdop;
2745 utime_t start;
2746 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2747 const PrimaryLogPG::ProxyReadOpRef& prd)
2748 : pg(p), oid(o), last_peering_reset(lpr),
2749 tid(0), prdop(prd), start(ceph_clock_now())
2750 {}
2751 void finish(int r) override {
2752 if (prdop->canceled)
2753 return;
2754 pg->lock();
2755 if (prdop->canceled) {
2756 pg->unlock();
2757 return;
2758 }
2759 if (last_peering_reset == pg->get_last_peering_reset()) {
2760 pg->finish_proxy_read(oid, tid, r);
2761 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2762 }
2763 pg->unlock();
2764 }
2765 };
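#include <mutex>  // sketch-only dependency

// C_ProxyRead above is an instance of the guarded-completion pattern used
// for objecter callbacks: test the canceled flag before and after taking the
// PG lock, and only deliver the result if no peering reset happened since
// submission. A generic sketch of that pattern (Target is illustrative):
namespace guarded_completion_sketch {
  struct Target {
    std::mutex mtx;
    uint64_t last_reset = 0;     // bumped on every peering reset
    bool canceled = false;
    void deliver(int r) { (void)r; /* act on the result */ }
  };

  void finish(Target& t, uint64_t reset_at_submit, int r) {
    if (t.canceled)              // cheap unlocked early-out
      return;
    std::lock_guard<std::mutex> l(t.mtx);
    if (t.canceled)              // recheck under the lock
      return;
    if (reset_at_submit == t.last_reset)
      t.deliver(r);              // still in the same peering interval
  }
}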
2766
2767 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2768 {
2769 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2770 // stash the result in the request's OSDOp vector
2771 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2772 object_locator_t oloc;
2773 hobject_t soid;
2774 /* extensible tier */
2775 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2776 switch (obc->obs.oi.manifest.type) {
2777 case object_manifest_t::TYPE_REDIRECT:
2778 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2779 soid = obc->obs.oi.manifest.redirect_target;
2780 break;
2781 case object_manifest_t::TYPE_CHUNKED:
2782 default:
2783 assert(0 == "unrecognized manifest type");
2784 }
2785 } else {
2786 /* proxy */
2787 soid = m->get_hobj();
2788 oloc = object_locator_t(m->get_object_locator());
2789 oloc.pool = pool.info.tier_of;
2790 }
2791 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2792
2793 // pass through some original flags that make sense.
2794 // - leave out redirection and balancing flags since we are
2795 // already proxying through the primary
2796 // - leave off read/write/exec flags that are derived from the op
2797 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2798 CEPH_OSD_FLAG_ORDERSNAP |
2799 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2800 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2801
2802 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2803
2804 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2805
2806 ObjectOperation obj_op;
2807 obj_op.dup(prdop->ops);
2808
2809 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2810 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2811 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2812 ceph_osd_op op = obj_op.ops[i].op;
2813 switch (op.op) {
2814 case CEPH_OSD_OP_READ:
2815 case CEPH_OSD_OP_SYNC_READ:
2816 case CEPH_OSD_OP_SPARSE_READ:
2817 case CEPH_OSD_OP_CHECKSUM:
2818 case CEPH_OSD_OP_CMPEXT:
2819 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2820 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2821 }
2822 }
2823 }
2824
2825 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2826 prdop);
2827 ceph_tid_t tid = osd->objecter->read(
2828 soid.oid, oloc, obj_op,
2829 m->get_snapid(), NULL,
2830 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2831 &prdop->user_version,
2832 &prdop->data_offset,
2833 m->get_features());
2834 fin->tid = tid;
2835 prdop->objecter_tid = tid;
2836 proxyread_ops[tid] = prdop;
2837 in_progress_proxy_ops[soid].push_back(op);
2838 }
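// do_proxy_read above builds the objecter flags by OR-ing the cache-bypass
// bits with a whitelist of ordering/snap-related client flags; balancing and
// redirect flags are deliberately dropped since we already proxy through the
// primary. A sketch of that masking (the flag values are illustrative, not
// the wire constants):
namespace proxy_flags_sketch {
  enum : unsigned {
    F_IGNORE_CACHE   = 1u << 0,
    F_IGNORE_OVERLAY = 1u << 1,
    F_RWORDERED      = 1u << 2,
    F_ORDERSNAP      = 1u << 3,
    F_ENFORCE_SNAPC  = 1u << 4,
    F_MAP_SNAP_CLONE = 1u << 5,
  };

  unsigned proxy_read_flags(unsigned client_flags) {
    unsigned flags = F_IGNORE_CACHE | F_IGNORE_OVERLAY;  // always bypass tiering
    flags |= client_flags & (F_RWORDERED | F_ORDERSNAP |
                             F_ENFORCE_SNAPC | F_MAP_SNAP_CLONE);
    return flags;
  }
}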
2839
2840 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2841 {
2842 dout(10) << __func__ << " " << oid << " tid " << tid
2843 << " " << cpp_strerror(r) << dendl;
2844
2845 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2846 if (p == proxyread_ops.end()) {
2847 dout(10) << __func__ << " no proxyread_op found" << dendl;
2848 return;
2849 }
2850 ProxyReadOpRef prdop = p->second;
2851 if (tid != prdop->objecter_tid) {
2852 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2853 << " tid " << prdop->objecter_tid << dendl;
2854 return;
2855 }
2856 if (oid != prdop->soid) {
2857 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2858 << " soid " << prdop->soid << dendl;
2859 return;
2860 }
2861 proxyread_ops.erase(tid);
2862
2863 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2864 if (q == in_progress_proxy_ops.end()) {
2865 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2866 return;
2867 }
2868 assert(q->second.size());
2869 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2870 q->second.end(),
2871 prdop->op);
2872 assert(it != q->second.end());
2873 OpRequestRef op = *it;
2874 q->second.erase(it);
2875 if (q->second.size() == 0) {
2876 in_progress_proxy_ops.erase(oid);
2877 }
2878
2879 osd->logger->inc(l_osd_tier_proxy_read);
2880
2881 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2882 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2883 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2884 ctx->user_at_version = prdop->user_version;
2885 ctx->data_off = prdop->data_offset;
2886 ctx->ignore_log_op_stats = true;
2887 complete_read_ctx(r, ctx);
2888 }
2889
2890 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2891 {
2892 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2893 if (p == in_progress_proxy_ops.end())
2894 return;
2895
2896 list<OpRequestRef>& ls = p->second;
2897 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2898 requeue_ops(ls);
2899 in_progress_proxy_ops.erase(p);
2900 }
2901
2902 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2903 {
2904 dout(10) << __func__ << " " << prdop->soid << dendl;
2905 prdop->canceled = true;
2906
2907 // cancel objecter op, if we can
2908 if (prdop->objecter_tid) {
2909 osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2910 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2911 prdop->ops[i].outdata.clear();
2912 }
2913 proxyread_ops.erase(prdop->objecter_tid);
2914 prdop->objecter_tid = 0;
2915 }
2916 }
2917
2918 void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2919 {
2920 dout(10) << __func__ << dendl;
2921
2922 // cancel proxy reads
2923 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2924 while (p != proxyread_ops.end()) {
2925 cancel_proxy_read((p++)->second);
2926 }
2927
2928 // cancel proxy writes
2929 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2930 while (q != proxywrite_ops.end()) {
2931 cancel_proxy_write((q++)->second);
2932 }
2933
2934 if (requeue) {
2935 map<hobject_t, list<OpRequestRef>>::iterator p =
2936 in_progress_proxy_ops.begin();
2937 while (p != in_progress_proxy_ops.end()) {
2938 list<OpRequestRef>& ls = p->second;
2939 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2940 << " requests" << dendl;
2941 requeue_ops(ls);
2942 in_progress_proxy_ops.erase(p++);
2943 }
2944 } else {
2945 in_progress_proxy_ops.clear();
2946 }
2947 }
2948
2949 struct C_ProxyWrite_Commit : public Context {
2950 PrimaryLogPGRef pg;
2951 hobject_t oid;
2952 epoch_t last_peering_reset;
2953 ceph_tid_t tid;
2954 PrimaryLogPG::ProxyWriteOpRef pwop;
2955 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2956 const PrimaryLogPG::ProxyWriteOpRef& pw)
2957 : pg(p), oid(o), last_peering_reset(lpr),
2958 tid(0), pwop(pw)
2959 {}
2960 void finish(int r) override {
2961 if (pwop->canceled)
2962 return;
2963 pg->lock();
2964 if (pwop->canceled) {
2965 pg->unlock();
2966 return;
2967 }
2968 if (last_peering_reset == pg->get_last_peering_reset()) {
2969 pg->finish_proxy_write(oid, tid, r);
2970 }
2971 pg->unlock();
2972 }
2973 };
2974
2975 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2976 {
2977 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2978 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2979 object_locator_t oloc;
2980 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2981 hobject_t soid;
2982 /* extensible tier */
2983 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2984 switch (obc->obs.oi.manifest.type) {
2985 case object_manifest_t::TYPE_REDIRECT:
2986 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2987 soid = obc->obs.oi.manifest.redirect_target;
2988 break;
2989 case object_manifest_t::TYPE_CHUNKED:
2990 default:
2991 assert(0 == "unrecognized manifest type");
2992 }
2993 } else {
2994 /* proxy */
2995 soid = m->get_hobj();
2996 oloc = object_locator_t(m->get_object_locator());
2997 oloc.pool = pool.info.tier_of;
2998 }
2999
3000 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3001 if (!(op->may_write() || op->may_cache())) {
3002 flags |= CEPH_OSD_FLAG_RWORDERED;
3003 }
3004 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3005
3006 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3007 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3008 pwop->mtime = m->get_mtime();
3009
3010 ObjectOperation obj_op;
3011 obj_op.dup(pwop->ops);
3012
3013 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3014 this, soid, get_last_peering_reset(), pwop);
3015 ceph_tid_t tid = osd->objecter->mutate(
3016 soid.oid, oloc, obj_op, snapc,
3017 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3018 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3019 &pwop->user_version, pwop->reqid);
3020 fin->tid = tid;
3021 pwop->objecter_tid = tid;
3022 proxywrite_ops[tid] = pwop;
3023 in_progress_proxy_ops[soid].push_back(op);
3024 }
3025
3026 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3027 {
3028 dout(10) << __func__ << " " << oid << " tid " << tid
3029 << " " << cpp_strerror(r) << dendl;
3030
3031 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3032 if (p == proxywrite_ops.end()) {
3033 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3034 return;
3035 }
3036 ProxyWriteOpRef pwop = p->second;
3037 assert(tid == pwop->objecter_tid);
3038 assert(oid == pwop->soid);
3039
3040 proxywrite_ops.erase(tid);
3041
3042 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3043 if (q == in_progress_proxy_ops.end()) {
3044 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3045 delete pwop->ctx;
3046 pwop->ctx = NULL;
3047 return;
3048 }
3049 list<OpRequestRef>& in_progress_op = q->second;
3050 assert(in_progress_op.size());
3051 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3052 in_progress_op.end(),
3053 pwop->op);
3054 assert(it != in_progress_op.end());
3055 in_progress_op.erase(it);
3056 if (in_progress_op.size() == 0) {
3057 in_progress_proxy_ops.erase(oid);
3058 }
3059
3060 osd->logger->inc(l_osd_tier_proxy_write);
3061
3062 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3063 assert(m != NULL);
3064
3065 if (!pwop->sent_reply) {
3066 // send commit.
3067 MOSDOpReply *reply = pwop->ctx->reply;
3068 if (reply)
3069 pwop->ctx->reply = NULL;
3070 else {
3071 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3072 reply->set_reply_versions(eversion_t(), pwop->user_version);
3073 }
3074 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3075 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3076 osd->send_message_osd_client(reply, m->get_connection());
3077 pwop->sent_reply = true;
3078 pwop->ctx->op->mark_commit_sent();
3079 }
3080
3081 delete pwop->ctx;
3082 pwop->ctx = NULL;
3083 }
3084
3085 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3086 {
3087 dout(10) << __func__ << " " << pwop->soid << dendl;
3088 pwop->canceled = true;
3089
3090 // cancel objecter op, if we can
3091 if (pwop->objecter_tid) {
3092 osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3093 delete pwop->ctx;
3094 pwop->ctx = NULL;
3095 proxywrite_ops.erase(pwop->objecter_tid);
3096 pwop->objecter_tid = 0;
3097 }
3098 }
3099
3100 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3101 ObjectContextRef obc;
3102 PrimaryLogPG *pg;
3103 utime_t start;
3104 public:
3105 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3106 : obc(obc_),
3107 pg(pg_),
3108 start(ceph_clock_now()) {}
3109
3110 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3111 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3112 int r = results.get<0>();
3113 pg->finish_promote(r, results_data, obc);
3114 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3115 }
3116 };
3117
3118 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3119 const hobject_t& missing_oid,
3120 const object_locator_t& oloc,
3121 OpRequestRef op,
3122 ObjectContextRef *promote_obc)
3123 {
3124 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3125 assert(hoid != hobject_t());
3126 if (scrubber.write_blocked_by_scrub(hoid)) {
3127 dout(10) << __func__ << " " << hoid
3128 << " blocked by scrub" << dendl;
3129 if (op) {
3130 waiting_for_scrub.push_back(op);
3131 op->mark_delayed("waiting for scrub");
3132 dout(10) << __func__ << " " << hoid
3133 << " placing op in waiting_for_scrub" << dendl;
3134 } else {
3135 dout(10) << __func__ << " " << hoid
3136 << " no op, dropping on the floor" << dendl;
3137 }
3138 return;
3139 }
3140 if (!obc) { // we need to create an ObjectContext
3141 assert(missing_oid != hobject_t());
3142 obc = get_object_context(missing_oid, true);
3143 }
3144 if (promote_obc)
3145 *promote_obc = obc;
3146
3147 /*
3148 * If proxy reads are still in flight for this object when the promote
3149 * starts, don't use DONTNEED for the source read.
3150 */
3151 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3152 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3153 if (q == in_progress_proxy_ops.end()) {
3154 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3155 }
3156
3157 PromoteCallback *cb = new PromoteCallback(obc, this);
3158 object_locator_t my_oloc = oloc;
3159 my_oloc.pool = pool.info.tier_of;
3160
3161 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3162 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3163 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3164 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3165 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3166 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3167 src_fadvise_flags, 0);
3168
3169 assert(obc->is_blocked());
3170
3171 if (op)
3172 wait_for_blocked_object(obc->obs.oi.soid, op);
3173 info.stats.stats.sum.num_promote++;
3174 }
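// promote_object above chooses the fadvise flags for the copy-from source
// read: SEQUENTIAL always, plus DONTNEED only when no proxy reads are in
// flight for the object, so the base tier's cache stays warm for them. A
// sketch of that choice (flag values are illustrative):
namespace promote_fadvise_sketch {
  enum : unsigned {
    FADV_SEQUENTIAL = 1u << 0,
    FADV_DONTNEED   = 1u << 1,
  };

  unsigned src_fadvise(bool proxy_reads_in_flight) {
    unsigned f = FADV_SEQUENTIAL;
    if (!proxy_reads_in_flight)
      f |= FADV_DONTNEED;        // no other readers; let the base tier evict
    return f;
  }
}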
3175
3176 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3177 {
3178 FUNCTRACE();
3179 dout(10) << __func__ << " " << ctx << dendl;
3180 ctx->reset_obs(ctx->obc);
3181 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3182 OpRequestRef op = ctx->op;
3183 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3184 ObjectContextRef obc = ctx->obc;
3185 const hobject_t& soid = obc->obs.oi.soid;
3186
3187 // this method must be idempotent since we may call it several times
3188 // before we finally apply the resulting transaction.
3189 ctx->op_t.reset(new PGTransaction);
3190
3191 if (op->may_write() || op->may_cache()) {
3192 // snap
3193 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3194 pool.info.is_pool_snaps_mode()) {
3195 // use pool's snapc
3196 ctx->snapc = pool.snapc;
3197 } else {
3198 // client specified snapc
3199 ctx->snapc.seq = m->get_snap_seq();
3200 ctx->snapc.snaps = m->get_snaps();
3201 filter_snapc(ctx->snapc.snaps);
3202 }
3203 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3204 ctx->snapc.seq < obc->ssc->snapset.seq) {
3205 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3206 << " < snapset seq " << obc->ssc->snapset.seq
3207 << " on " << obc->obs.oi.soid << dendl;
3208 reply_ctx(ctx, -EOLDSNAPC);
3209 return;
3210 }
3211
3212 // version
3213 ctx->at_version = get_next_version();
3214 ctx->mtime = m->get_mtime();
3215
3216 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3217 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3218 << " snapc " << ctx->snapc
3219 << " snapset " << obc->ssc->snapset
3220 << dendl;
3221 } else {
3222 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3223 << " ov " << obc->obs.oi.version
3224 << dendl;
3225 }
3226
3227 if (!ctx->user_at_version)
3228 ctx->user_at_version = obc->obs.oi.user_version;
3229 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3230
3231 if (op->may_read()) {
3232 dout(10) << " taking ondisk_read_lock" << dendl;
3233 obc->ondisk_read_lock();
3234 }
3235
3236 {
3237 #ifdef WITH_LTTNG
3238 osd_reqid_t reqid = ctx->op->get_reqid();
3239 #endif
3240 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3241 reqid.name._num, reqid.tid, reqid.inc);
3242 }
3243
3244 int result = prepare_transaction(ctx);
3245
3246 {
3247 #ifdef WITH_LTTNG
3248 osd_reqid_t reqid = ctx->op->get_reqid();
3249 #endif
3250 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3251 reqid.name._num, reqid.tid, reqid.inc);
3252 }
3253
3254 if (op->may_read()) {
3255 dout(10) << " dropping ondisk_read_lock" << dendl;
3256 obc->ondisk_read_unlock();
3257 }
3258
3259 bool pending_async_reads = !ctx->pending_async_reads.empty();
3260 if (result == -EINPROGRESS || pending_async_reads) {
3261 // come back later.
3262 if (pending_async_reads) {
3263 in_progress_async_reads.push_back(make_pair(op, ctx));
3264 ctx->start_async_reads(this);
3265 }
3266 return;
3267 }
3268
3269 if (result == -EAGAIN) {
3270 // clean up after the ctx
3271 close_op_ctx(ctx);
3272 return;
3273 }
3274
3275 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3276 // prepare the reply
3277 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3278 successful_write);
3279
3280 // Write operations aren't allowed to return a data payload because
3281 // we can't do so reliably. If the client has to resend the request
3282 // and it has already been applied, we will return 0 with no
3283 // payload. Non-deterministic behavior is no good. However, it is
3284 // possible to construct an operation that does a read, does a guard
3285 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3286 // with the write, or return a CMPXATTR and the read value.
3287 if (successful_write) {
3288 // write. normalize the result code.
3289 dout(20) << " zeroing write result code " << result << dendl;
3290 result = 0;
3291 }
3292 ctx->reply->set_result(result);
3293
3294 // read or error?
3295 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3296 // finish side-effects
3297 if (result >= 0)
3298 do_osd_op_effects(ctx, m->get_connection());
3299
3300 complete_read_ctx(result, ctx);
3301 return;
3302 }
3303
3304 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3305
3306 assert(op->may_write() || op->may_cache());
3307
3308 // trim log?
3309 calc_trim_to();
3310
3311 // verify that we are doing this in order?
3312 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3313 !pool.info.is_tier() && !pool.info.has_tiers()) {
3314 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3315 ceph_tid_t t = m->get_tid();
3316 client_t n = m->get_source().num();
3317 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3318 if (p == cm.end()) {
3319 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3320 cm[n] = t;
3321 } else {
3322 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3323 if (p->second > t) {
3324 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3325 assert(0 == "out of order op");
3326 }
3327 p->second = t;
3328 }
3329 }
3330
3331 if (ctx->update_log_only) {
3332 if (result >= 0)
3333 do_osd_op_effects(ctx, m->get_connection());
3334
3335 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3336 // save just what we need from ctx
3337 MOSDOpReply *reply = ctx->reply;
3338 ctx->reply = nullptr;
3339 reply->claim_op_out_data(*ctx->ops);
3340 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3341 close_op_ctx(ctx);
3342
3343 if (result == -ENOENT) {
3344 reply->set_enoent_reply_versions(info.last_update,
3345 info.last_user_version);
3346 }
3347 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3348 // append to pg log for dup detection - don't save buffers for now
3349 record_write_error(op, soid, reply, result);
3350 return;
3351 }
3352
3353 // No need to capture a PG ref; repop cancel will handle that.
3354 // We can capture ctx by pointer; it's owned by the repop.
3355 ctx->register_on_commit(
3356 [m, ctx, this](){
3357 if (ctx->op)
3358 log_op_stats(
3359 ctx);
3360
3361 if (m && !ctx->sent_reply) {
3362 MOSDOpReply *reply = ctx->reply;
3363 if (reply)
3364 ctx->reply = nullptr;
3365 else {
3366 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3367 reply->set_reply_versions(ctx->at_version,
3368 ctx->user_at_version);
3369 }
3370 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3371 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3372 osd->send_message_osd_client(reply, m->get_connection());
3373 ctx->sent_reply = true;
3374 ctx->op->mark_commit_sent();
3375 }
3376 });
3377 ctx->register_on_success(
3378 [ctx, this]() {
3379 do_osd_op_effects(
3380 ctx,
3381 ctx->op ? ctx->op->get_req()->get_connection() :
3382 ConnectionRef());
3383 });
3384 ctx->register_on_finish(
3385 [ctx, this]() {
3386 delete ctx;
3387 });
3388
3389 // issue replica writes
3390 ceph_tid_t rep_tid = osd->get_tid();
3391
3392 RepGather *repop = new_repop(ctx, obc, rep_tid);
3393
3394 issue_repop(repop, ctx);
3395 eval_repop(repop);
3396 repop->put();
3397 }
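// For writes, execute_ctx above picks the snap context: the pool's snapc in
// pool-snaps mode (unless the client set ENFORCE_SNAPC), otherwise the
// client-supplied seq/snaps filtered of deleted snapshots; ORDERSNAP then
// rejects a context older than the object's snapset with -EOLDSNAPC. A
// compact sketch of that selection (types and names are illustrative):
namespace snapc_sketch {
  struct SnapContext {
    uint64_t seq = 0;
    std::vector<uint64_t> snaps;
  };

  // Returns false to signal the -EOLDSNAPC case under ORDERSNAP semantics.
  bool choose_snapc(bool pool_snaps_mode, bool enforce_client_snapc,
                    bool ordersnap, uint64_t snapset_seq,
                    const SnapContext& pool_sc, const SnapContext& client_sc,
                    SnapContext* out) {
    *out = (pool_snaps_mode && !enforce_client_snapc) ? pool_sc : client_sc;
    if (ordersnap && out->seq < snapset_seq)
      return false;              // stale client snapc; make the client resend
    return true;
  }
}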
3398
3399 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3400 release_object_locks(ctx->lock_manager);
3401
3402 ctx->op_t.reset();
3403
3404 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3405 ctx->on_finish.erase(p++)) {
3406 (*p)();
3407 }
3408 delete ctx;
3409 }
3410
3411 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3412 {
3413 if (ctx->op)
3414 osd->reply_op_error(ctx->op, r);
3415 close_op_ctx(ctx);
3416 }
3417
3418 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3419 {
3420 if (ctx->op)
3421 osd->reply_op_error(ctx->op, r, v, uv);
3422 close_op_ctx(ctx);
3423 }
3424
3425 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3426 {
3427 OpRequestRef op = ctx->op;
3428 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3429
3430 utime_t now = ceph_clock_now();
3431 utime_t latency = now;
3432 latency -= ctx->op->get_req()->get_recv_stamp();
3433 utime_t process_latency = now;
3434 process_latency -= ctx->op->get_dequeued_time();
3435
3436 uint64_t inb = ctx->bytes_written;
3437 uint64_t outb = ctx->bytes_read;
3438
3439 osd->logger->inc(l_osd_op);
3440
3441 osd->logger->inc(l_osd_op_outb, outb);
3442 osd->logger->inc(l_osd_op_inb, inb);
3443 osd->logger->tinc(l_osd_op_lat, latency);
3444 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3445
3446 if (op->may_read() && op->may_write()) {
3447 osd->logger->inc(l_osd_op_rw);
3448 osd->logger->inc(l_osd_op_rw_inb, inb);
3449 osd->logger->inc(l_osd_op_rw_outb, outb);
3450 osd->logger->tinc(l_osd_op_rw_lat, latency);
3451 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3452 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3453 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3454 } else if (op->may_read()) {
3455 osd->logger->inc(l_osd_op_r);
3456 osd->logger->inc(l_osd_op_r_outb, outb);
3457 osd->logger->tinc(l_osd_op_r_lat, latency);
3458 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3459 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3460 } else if (op->may_write() || op->may_cache()) {
3461 osd->logger->inc(l_osd_op_w);
3462 osd->logger->inc(l_osd_op_w_inb, inb);
3463 osd->logger->tinc(l_osd_op_w_lat, latency);
3464 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3465 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3466 } else
3467 ceph_abort();
3468
3469 dout(15) << "log_op_stats " << *m
3470 << " inb " << inb
3471 << " outb " << outb
3472 << " lat " << latency << dendl;
3473 }
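#include <chrono>  // sketch-only dependency

// log_op_stats above tracks two latencies per op: end-to-end (now minus the
// message receive stamp) and processing (now minus the dequeue time). A
// sketch of the same split with std::chrono (OpStamps is illustrative):
namespace op_latency_sketch {
  using Clock = std::chrono::steady_clock;

  struct OpStamps {
    Clock::time_point recv;      // when the message arrived
    Clock::time_point dequeued;  // when a worker picked it up
  };

  std::pair<Clock::duration, Clock::duration>
  latencies(const OpStamps& s, Clock::time_point now = Clock::now()) {
    return { now - s.recv,       // queue wait + execution
             now - s.dequeued }; // execution only
  }
}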
3474
3475 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3476 {
3477 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3478 assert(have_same_or_newer_map(m->map_epoch));
3479 assert(m->get_type() == MSG_OSD_SUBOP);
3480 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3481
3482 if (!is_peered()) {
3483 waiting_for_peered.push_back(op);
3484 op->mark_delayed("waiting for active");
3485 return;
3486 }
3487
3488 const OSDOp *first = NULL;
3489 if (m->ops.size() >= 1) {
3490 first = &m->ops[0];
3491 }
3492
3493 if (first) {
3494 switch (first->op.op) {
3495 case CEPH_OSD_OP_DELETE:
3496 sub_op_remove(op);
3497 return;
3498 case CEPH_OSD_OP_SCRUB_RESERVE:
3499 handle_scrub_reserve_request(op);
3500 return;
3501 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3502 handle_scrub_reserve_release(op);
3503 return;
3504 case CEPH_OSD_OP_SCRUB_MAP:
3505 sub_op_scrub_map(op);
3506 return;
3507 }
3508 }
3509 }
3510
3511 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3512 {
3513 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3514 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3515 if (r->ops.size() >= 1) {
3516 const OSDOp& first = r->ops[0];
3517 switch (first.op.op) {
3518 case CEPH_OSD_OP_SCRUB_RESERVE:
3519 {
3520 pg_shard_t from = r->from;
3521 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3522 bool reserved;
3523 ::decode(reserved, p);
3524 if (reserved) {
3525 handle_scrub_reserve_grant(op, from);
3526 } else {
3527 handle_scrub_reserve_reject(op, from);
3528 }
3529 }
3530 return;
3531 }
3532 }
3533 }
3534
3535 void PrimaryLogPG::do_scan(
3536 OpRequestRef op,
3537 ThreadPool::TPHandle &handle)
3538 {
3539 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3540 assert(m->get_type() == MSG_OSD_PG_SCAN);
3541 dout(10) << "do_scan " << *m << dendl;
3542
3543 op->mark_started();
3544
3545 switch (m->op) {
3546 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3547 {
3548 ostringstream ss;
3549 if (osd->check_backfill_full(ss)) {
3550 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3551 queue_peering_event(
3552 CephPeeringEvtRef(
3553 std::make_shared<CephPeeringEvt>(
3554 get_osdmap()->get_epoch(),
3555 get_osdmap()->get_epoch(),
3556 BackfillTooFull())));
3557 return;
3558 }
3559
3560 BackfillInterval bi;
3561 bi.begin = m->begin;
3562 // No need to flush; there won't be any in-progress writes occurring
3563 // past m->begin
3564 scan_range(
3565 cct->_conf->osd_backfill_scan_min,
3566 cct->_conf->osd_backfill_scan_max,
3567 &bi,
3568 handle);
3569 MOSDPGScan *reply = new MOSDPGScan(
3570 MOSDPGScan::OP_SCAN_DIGEST,
3571 pg_whoami,
3572 get_osdmap()->get_epoch(), m->query_epoch,
3573 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3574 ::encode(bi.objects, reply->get_data());
3575 osd->send_message_osd_cluster(reply, m->get_connection());
3576 }
3577 break;
3578
3579 case MOSDPGScan::OP_SCAN_DIGEST:
3580 {
3581 pg_shard_t from = m->from;
3582
3583 // Check that from is in backfill_targets vector
3584 assert(is_backfill_targets(from));
3585
3586 BackfillInterval& bi = peer_backfill_info[from];
3587 bi.begin = m->begin;
3588 bi.end = m->end;
3589 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3590
3591 // take care to preserve ordering!
3592 bi.clear_objects();
3593 ::decode_noclear(bi.objects, p);
3594
3595 if (waiting_on_backfill.erase(from)) {
3596 if (waiting_on_backfill.empty()) {
3597 assert(peer_backfill_info.size() == backfill_targets.size());
3598 finish_recovery_op(hobject_t::get_max());
3599 }
3600 } else {
3601 // we canceled backfill for a while due to a too-full condition, and
3602 // this is an extra response from a non-too-full peer
3603 }
3604 }
3605 break;
3606 }
3607 }
3608
3609 void PrimaryLogPG::do_backfill(OpRequestRef op)
3610 {
3611 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3612 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3613 dout(10) << "do_backfill " << *m << dendl;
3614
3615 op->mark_started();
3616
3617 switch (m->op) {
3618 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3619 {
3620 assert(cct->_conf->osd_kill_backfill_at != 1);
3621
3622 MOSDPGBackfill *reply = new MOSDPGBackfill(
3623 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3624 get_osdmap()->get_epoch(),
3625 m->query_epoch,
3626 spg_t(info.pgid.pgid, get_primary().shard));
3627 reply->set_priority(get_recovery_op_priority());
3628 osd->send_message_osd_cluster(reply, m->get_connection());
3629 queue_peering_event(
3630 CephPeeringEvtRef(
3631 std::make_shared<CephPeeringEvt>(
3632 get_osdmap()->get_epoch(),
3633 get_osdmap()->get_epoch(),
3634 RecoveryDone())));
3635 }
3636 // fall-thru
3637
3638 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3639 {
3640 assert(cct->_conf->osd_kill_backfill_at != 2);
3641
3642 info.set_last_backfill(m->last_backfill);
3643 info.stats = m->stats;
3644
3645 ObjectStore::Transaction t;
3646 dirty_info = true;
3647 write_if_dirty(t);
3648 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3649 assert(tr == 0);
3650 }
3651 break;
3652
3653 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3654 {
3655 assert(is_primary());
3656 assert(cct->_conf->osd_kill_backfill_at != 3);
3657 finish_recovery_op(hobject_t::get_max());
3658 }
3659 break;
3660 }
3661 }
3662
3663 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3664 {
3665 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3666 op->get_req());
3667 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3668 dout(7) << __func__ << " " << m->ls << dendl;
3669
3670 op->mark_started();
3671
3672 ObjectStore::Transaction t;
3673 for (auto& p : m->ls) {
3674 remove_snap_mapped_object(t, p.first);
3675 }
3676 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3677 assert(r == 0);
3678 }
3679
3680 int PrimaryLogPG::trim_object(
3681 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3682 {
3683 *ctxp = NULL;
3684 // load clone info
3685 bufferlist bl;
3686 ObjectContextRef obc = get_object_context(coid, false, NULL);
3687 if (!obc || !obc->ssc || !obc->ssc->exists) {
3688 osd->clog->error() << __func__ << ": Can not trim " << coid
3689 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3690 return -ENOENT;
3691 }
3692
3693 hobject_t snapoid(
3694 coid.oid, coid.get_key(),
3695 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3696 info.pgid.pool(), coid.get_namespace());
3697 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3698 if (!snapset_obc) {
3699 osd->clog->error() << __func__ << ": Can not trim " << coid
3700 << " repair needed, no snapset obc for " << snapoid;
3701 return -ENOENT;
3702 }
3703
3704 SnapSet& snapset = obc->ssc->snapset;
3705
3706 bool legacy = snapset.is_legacy() ||
3707 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3708
3709 object_info_t &coi = obc->obs.oi;
3710 set<snapid_t> old_snaps;
3711 if (legacy) {
3712 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3713 } else {
3714 auto p = snapset.clone_snaps.find(coid.snap);
3715 if (p == snapset.clone_snaps.end()) {
3716 osd->clog->error() << "No clone_snaps in snapset " << snapset
3717 << " for object " << coid << "\n";
3718 return -ENOENT;
3719 }
3720 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3721 snapset.clone_snaps[coid.snap].end());
3722 }
3723 if (old_snaps.empty()) {
3724 osd->clog->error() << "No object info snaps for object " << coid;
3725 return -ENOENT;
3726 }
3727
3728 dout(10) << coid << " old_snaps " << old_snaps
3729 << " old snapset " << snapset << dendl;
3730 if (snapset.seq == 0) {
3731 osd->clog->error() << "No snapset.seq for object " << coid;
3732 return -ENOENT;
3733 }
3734
3735 set<snapid_t> new_snaps;
3736 for (set<snapid_t>::iterator i = old_snaps.begin();
3737 i != old_snaps.end();
3738 ++i) {
3739 if (!pool.info.is_removed_snap(*i))
3740 new_snaps.insert(*i);
3741 }
3742
3743 vector<snapid_t>::iterator p = snapset.clones.end();
3744
3745 if (new_snaps.empty()) {
3746 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3747 if (p == snapset.clones.end()) {
3748 osd->clog->error() << "Snap " << coid.snap << " not in clones";
3749 return -ENOENT;
3750 }
3751 }
3752
3753 OpContextUPtr ctx = simple_opc_create(obc);
3754 ctx->snapset_obc = snapset_obc;
3755
3756 if (!ctx->lock_manager.get_snaptrimmer_write(
3757 coid,
3758 obc,
3759 first)) {
3760 close_op_ctx(ctx.release());
3761 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3762 return -ENOLCK;
3763 }
3764
3765 if (!ctx->lock_manager.get_snaptrimmer_write(
3766 snapoid,
3767 snapset_obc,
3768 first)) {
3769 close_op_ctx(ctx.release());
3770 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3771 return -ENOLCK;
3772 }
3773
3774 ctx->at_version = get_next_version();
3775
3776 PGTransaction *t = ctx->op_t.get();
3777
3778 if (new_snaps.empty()) {
3779 // remove clone
3780 dout(10) << coid << " snaps " << old_snaps << " -> "
3781 << new_snaps << " ... deleting" << dendl;
3782
3783 // ...from snapset
3784 assert(p != snapset.clones.end());
3785
3786 snapid_t last = coid.snap;
3787 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3788
3789 if (p != snapset.clones.begin()) {
3790 // not the oldest... merge overlap into next older clone
3791 vector<snapid_t>::iterator n = p - 1;
3792 hobject_t prev_coid = coid;
3793 prev_coid.snap = *n;
3794 bool adjust_prev_bytes = is_present_clone(prev_coid);
3795
3796 if (adjust_prev_bytes)
3797 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3798
3799 snapset.clone_overlap[*n].intersection_of(
3800 snapset.clone_overlap[*p]);
3801
3802 if (adjust_prev_bytes)
3803 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3804 }
3805 ctx->delta_stats.num_objects--;
3806 if (coi.is_dirty())
3807 ctx->delta_stats.num_objects_dirty--;
3808 if (coi.is_omap())
3809 ctx->delta_stats.num_objects_omap--;
3810 if (coi.is_whiteout()) {
3811 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3812 ctx->delta_stats.num_whiteouts--;
3813 }
3814 ctx->delta_stats.num_object_clones--;
3815 if (coi.is_cache_pinned())
3816 ctx->delta_stats.num_objects_pinned--;
3817 obc->obs.exists = false;
3818
3819 snapset.clones.erase(p);
3820 snapset.clone_overlap.erase(last);
3821 snapset.clone_size.erase(last);
3822 snapset.clone_snaps.erase(last);
3823
3824 ctx->log.push_back(
3825 pg_log_entry_t(
3826 pg_log_entry_t::DELETE,
3827 coid,
3828 ctx->at_version,
3829 ctx->obs->oi.version,
3830 0,
3831 osd_reqid_t(),
3832 ctx->mtime,
3833 0)
3834 );
3835 t->remove(coid);
3836 t->update_snaps(
3837 coid,
3838 old_snaps,
3839 new_snaps);
3840
3841 coi = object_info_t(coid);
3842
3843 ctx->at_version.version++;
3844 } else {
3845 // save adjusted snaps for this object
3846 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3847 if (legacy) {
3848 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3849 } else {
3850 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3851 new_snaps.rend());
3852 // we still do a 'modify' event on this object just to trigger a
3853 // snapmapper.update ... :(
3854 }
3855
3856 coi.prior_version = coi.version;
3857 coi.version = ctx->at_version;
3858 bl.clear();
3859 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3860 t->setattr(coid, OI_ATTR, bl);
3861
3862 ctx->log.push_back(
3863 pg_log_entry_t(
3864 pg_log_entry_t::MODIFY,
3865 coid,
3866 coi.version,
3867 coi.prior_version,
3868 0,
3869 osd_reqid_t(),
3870 ctx->mtime,
3871 0)
3872 );
3873 ctx->at_version.version++;
3874
3875 t->update_snaps(
3876 coid,
3877 old_snaps,
3878 new_snaps);
3879 }
3880
3881 // save head snapset
3882 dout(10) << coid << " new snapset " << snapset << " on "
3883 << snapset_obc->obs.oi << dendl;
3884 if (snapset.clones.empty() &&
3885 (!snapset.head_exists ||
3886 (snapset_obc->obs.oi.is_whiteout() &&
3887 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3888 !snapset_obc->obs.oi.is_cache_pinned()))) {
3889 // NOTE: this arguably constitutes minor interference with the
3890 // tiering agent if this is a cache tier since a snap trim event
3891 // is effectively evicting a whiteout we might otherwise want to
3892 // keep around.
3893 dout(10) << coid << " removing " << snapoid << dendl;
3894 ctx->log.push_back(
3895 pg_log_entry_t(
3896 pg_log_entry_t::DELETE,
3897 snapoid,
3898 ctx->at_version,
3899 ctx->snapset_obc->obs.oi.version,
3900 0,
3901 osd_reqid_t(),
3902 ctx->mtime,
3903 0)
3904 );
3905 if (snapoid.is_head()) {
3906 derr << "removing snap head" << dendl;
3907 object_info_t& oi = ctx->snapset_obc->obs.oi;
3908 ctx->delta_stats.num_objects--;
3909 if (oi.is_dirty()) {
3910 ctx->delta_stats.num_objects_dirty--;
3911 }
3912 if (oi.is_omap())
3913 ctx->delta_stats.num_objects_omap--;
3914 if (oi.is_whiteout()) {
3915 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3916 ctx->delta_stats.num_whiteouts--;
3917 }
3918 if (oi.is_cache_pinned()) {
3919 ctx->delta_stats.num_objects_pinned--;
3920 }
3921 }
3922 ctx->snapset_obc->obs.exists = false;
3923 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3924 t->remove(snapoid);
3925 } else {
3926 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3927 snapset.filter(pool.info);
3928 dout(10) << coid << " writing updated snapset on " << snapoid
3929 << ", snapset is " << snapset << dendl;
3930 ctx->log.push_back(
3931 pg_log_entry_t(
3932 pg_log_entry_t::MODIFY,
3933 snapoid,
3934 ctx->at_version,
3935 ctx->snapset_obc->obs.oi.version,
3936 0,
3937 osd_reqid_t(),
3938 ctx->mtime,
3939 0)
3940 );
3941
3942 ctx->snapset_obc->obs.oi.prior_version =
3943 ctx->snapset_obc->obs.oi.version;
3944 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3945
3946 map <string, bufferlist> attrs;
3947 bl.clear();
3948 ::encode(snapset, bl);
3949 attrs[SS_ATTR].claim(bl);
3950
3951 bl.clear();
3952 ::encode(ctx->snapset_obc->obs.oi, bl,
3953 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3954 attrs[OI_ATTR].claim(bl);
3955 t->setattrs(snapoid, attrs);
3956 }
3957
3958 *ctxp = std::move(ctx);
3959 return 0;
3960 }
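
// Illustration only (not compiled): the clone_overlap merge performed in
// trim_object when a middle clone is removed, in isolation.  The next-older
// clone's recorded overlap can only shrink under intersection:
//
//   interval_set<uint64_t> prev;  // overlap recorded for the older clone
//   interval_set<uint64_t> gone;  // overlap recorded for the trimmed clone
//   prev.insert(0, 4096);
//   gone.insert(2048, 4096);
//   prev.intersection_of(gone);   // prev is now {2048~2048}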
3961
3962 void PrimaryLogPG::kick_snap_trim()
3963 {
3964 assert(is_active());
3965 assert(is_primary());
3966 if (is_clean() && !snap_trimq.empty()) {
3967 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3968 snap_trimmer_machine.process_event(KickTrim());
3969 }
3970 }
3971
3972 void PrimaryLogPG::snap_trimmer_scrub_complete()
3973 {
3974 if (is_primary() && is_active() && is_clean()) {
3975 assert(!snap_trimq.empty());
3976 snap_trimmer_machine.process_event(ScrubComplete());
3977 }
3978 }
3979
3980 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3981 {
3982 if (deleting || pg_has_reset_since(queued)) {
3983 return;
3984 }
3985
3986 assert(is_primary());
3987
3988 dout(10) << "snap_trimmer posting" << dendl;
3989 snap_trimmer_machine.process_event(DoSnapWork());
3990 dout(10) << "snap_trimmer complete" << dendl;
3991 return;
3992 }
3993
3994 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3995 {
3996 __u64 v2;
3997
3998 string v2s(xattr.c_str(), xattr.length());
3999 if (v2s.length())
4000 v2 = strtoull(v2s.c_str(), NULL, 10);
4001 else
4002 v2 = 0;
4003
4004 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4005
4006 switch (op) {
4007 case CEPH_OSD_CMPXATTR_OP_EQ:
4008 return (v1 == v2);
4009 case CEPH_OSD_CMPXATTR_OP_NE:
4010 return (v1 != v2);
4011 case CEPH_OSD_CMPXATTR_OP_GT:
4012 return (v1 > v2);
4013 case CEPH_OSD_CMPXATTR_OP_GTE:
4014 return (v1 >= v2);
4015 case CEPH_OSD_CMPXATTR_OP_LT:
4016 return (v1 < v2);
4017 case CEPH_OSD_CMPXATTR_OP_LTE:
4018 return (v1 <= v2);
4019 default:
4020 return -EINVAL;
4021 }
4022 }
4023
4024 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4025 {
4026 string v2s(xattr.c_str(), xattr.length());
4027
4028 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4029
4030 switch (op) {
4031 case CEPH_OSD_CMPXATTR_OP_EQ:
4032 return (v1s.compare(v2s) == 0);
4033 case CEPH_OSD_CMPXATTR_OP_NE:
4034 return (v1s.compare(v2s) != 0);
4035 case CEPH_OSD_CMPXATTR_OP_GT:
4036 return (v1s.compare(v2s) > 0);
4037 case CEPH_OSD_CMPXATTR_OP_GTE:
4038 return (v1s.compare(v2s) >= 0);
4039 case CEPH_OSD_CMPXATTR_OP_LT:
4040 return (v1s.compare(v2s) < 0);
4041 case CEPH_OSD_CMPXATTR_OP_LTE:
4042 return (v1s.compare(v2s) <= 0);
4043 default:
4044 return -EINVAL;
4045 }
4046 }
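
// Illustration only (not compiled): how a librados client typically drives
// these xattr comparisons.  The object and xattr names are made up; when the
// guard fails, the whole op returns -ECANCELED, matching the CMPXATTR
// handling in do_osd_ops below:
//
//   librados::ObjectWriteOperation wop;
//   wop.cmpxattr("version", CEPH_OSD_CMPXATTR_OP_GTE, (uint64_t)42);
//   wop.setxattr("state", state_bl);        // applied only if the guard holds
//   int r = ioctx.operate("myobject", &wop);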
4047
4048 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4049 {
4050 ceph_osd_op& op = osd_op.op;
4051 vector<OSDOp> write_ops(1);
4052 OSDOp& write_op = write_ops[0];
4053 uint64_t write_length = op.writesame.length;
4054 int result = 0;
4055
4056 if (!write_length)
4057 return 0;
4058
4059 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4060 return -EINVAL;
4061
4062 if (op.writesame.data_length != osd_op.indata.length()) {
4063 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4064 return -EINVAL;
4065 }
4066
4067 while (write_length) {
4068 write_op.indata.append(osd_op.indata);
4069 write_length -= op.writesame.data_length;
4070 }
4071
4072 write_op.op.op = CEPH_OSD_OP_WRITE;
4073 write_op.op.extent.offset = op.writesame.offset;
4074 write_op.op.extent.length = op.writesame.length;
4075 result = do_osd_ops(ctx, write_ops);
4076 if (result < 0)
4077 derr << "do_writesame do_osd_ops failed " << result << dendl;
4078
4079 return result;
4080 }
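
// Illustration only (not compiled): the expansion do_writesame performs.  A
// writesame of length 4096 with a 512-byte pattern becomes one plain
// CEPH_OSD_OP_WRITE whose payload repeats the pattern eight times:
//
//   bufferlist pattern;
//   pattern.append(string(512, 'x'));    // stand-in for osd_op.indata
//   bufferlist expanded;
//   for (uint64_t left = 4096; left; left -= 512)
//     expanded.append(pattern);
//   assert(expanded.length() == 4096);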
4081
4082 // ========================================================================
4083 // low level osd ops
4084
4085 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4086 {
4087 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4088 bufferlist header, vals;
4089 int r = _get_tmap(ctx, &header, &vals);
4090 if (r < 0) {
4091 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4092 r = 0;
4093 return r;
4094 }
4095
4096 vector<OSDOp> ops(3);
4097
4098 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4099 ops[0].op.extent.offset = 0;
4100 ops[0].op.extent.length = 0;
4101
4102 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4103 ops[1].indata.claim(header);
4104
4105 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4106 ops[2].indata.claim(vals);
4107
4108 return do_osd_ops(ctx, ops);
4109 }
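
// Illustration only (not compiled): the legacy tmap object data returned by
// _get_tmap is a header blob followed by a sorted key/value map (the same
// layout do_tmapup_slow decodes below), so the conversion above can hand the
// two pieces straight to OMAPSETHEADER / OMAPSETVALS:
//
//   bufferlist::iterator p = tmap_bl.begin();
//   bufferlist header;
//   map<string, bufferlist> kv;
//   ::decode(header, p);
//   ::decode(kv, p);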
4110
4111 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4112 bufferlist& bl)
4113 {
4114 // decode
4115 bufferlist header;
4116 map<string, bufferlist> m;
4117 if (bl.length()) {
4118 bufferlist::iterator p = bl.begin();
4119 ::decode(header, p);
4120 ::decode(m, p);
4121 assert(p.end());
4122 }
4123
4124 // do the update(s)
4125 while (!bp.end()) {
4126 __u8 op;
4127 string key;
4128 ::decode(op, bp);
4129
4130 switch (op) {
4131 case CEPH_OSD_TMAP_SET: // insert key
4132 {
4133 ::decode(key, bp);
4134 bufferlist data;
4135 ::decode(data, bp);
4136 m[key] = data;
4137 }
4138 break;
4139 case CEPH_OSD_TMAP_RM: // remove key
4140 ::decode(key, bp);
4141 if (!m.count(key)) {
4142 return -ENOENT;
4143 }
4144 m.erase(key);
4145 break;
4146 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4147 ::decode(key, bp);
4148 m.erase(key);
4149 break;
4150 case CEPH_OSD_TMAP_HDR: // update header
4151 {
4152 ::decode(header, bp);
4153 }
4154 break;
4155 default:
4156 return -EINVAL;
4157 }
4158 }
4159
4160 // reencode
4161 bufferlist obl;
4162 ::encode(header, obl);
4163 ::encode(m, obl);
4164
4165 // write it out
4166 vector<OSDOp> nops(1);
4167 OSDOp& newop = nops[0];
4168 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4169 newop.op.extent.offset = 0;
4170 newop.op.extent.length = obl.length();
4171 newop.indata = obl;
4172 do_osd_ops(ctx, nops);
4173 osd_op.outdata.claim(newop.outdata);
4174 return 0;
4175 }
4176
4177 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4178 {
4179 bufferlist::iterator orig_bp = bp;
4180 int result = 0;
4181 if (bp.end()) {
4182 dout(10) << "tmapup is a no-op" << dendl;
4183 } else {
4184 // read the whole object
4185 vector<OSDOp> nops(1);
4186 OSDOp& newop = nops[0];
4187 newop.op.op = CEPH_OSD_OP_READ;
4188 newop.op.extent.offset = 0;
4189 newop.op.extent.length = 0;
4190 result = do_osd_ops(ctx, nops);
4191
4192 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4193
4194 dout(30) << " starting is \n";
4195 newop.outdata.hexdump(*_dout);
4196 *_dout << dendl;
4197
4198 bufferlist::iterator ip = newop.outdata.begin();
4199 bufferlist obl;
4200
4201 dout(30) << "the update command is: \n";
4202 osd_op.indata.hexdump(*_dout);
4203 *_dout << dendl;
4204
4205 // header
4206 bufferlist header;
4207 __u32 nkeys = 0;
4208 if (newop.outdata.length()) {
4209 ::decode(header, ip);
4210 ::decode(nkeys, ip);
4211 }
4212 dout(10) << "tmapup header " << header.length() << dendl;
4213
4214 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4215 ++bp;
4216 ::decode(header, bp);
4217 dout(10) << "tmapup new header " << header.length() << dendl;
4218 }
4219
4220 ::encode(header, obl);
4221
4222 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4223
4224 // update keys
4225 bufferlist newkeydata;
4226 string nextkey, last_in_key;
4227 bufferlist nextval;
4228 bool have_next = false;
4229 if (!ip.end()) {
4230 have_next = true;
4231 ::decode(nextkey, ip);
4232 ::decode(nextval, ip);
4233 }
4234 while (!bp.end() && !result) {
4235 __u8 op;
4236 string key;
4237 try {
4238 ::decode(op, bp);
4239 ::decode(key, bp);
4240 }
4241 catch (buffer::error& e) {
4242 return -EINVAL;
4243 }
4244 if (key < last_in_key) {
4245 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4246 << "', falling back to an inefficient (unsorted) update" << dendl;
4247 bp = orig_bp;
4248 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4249 }
4250 last_in_key = key;
4251
4252 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4253
4254 // skip existing intervening keys
4255 bool key_exists = false;
4256 while (have_next && !key_exists) {
4257 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4258 if (nextkey > key)
4259 break;
4260 if (nextkey < key) {
4261 // copy untouched.
4262 ::encode(nextkey, newkeydata);
4263 ::encode(nextval, newkeydata);
4264 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4265 } else {
4266 // don't copy; discard old value. and stop.
4267 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4268 key_exists = true;
4269 nkeys--;
4270 }
4271 if (!ip.end()) {
4272 ::decode(nextkey, ip);
4273 ::decode(nextval, ip);
4274 } else {
4275 have_next = false;
4276 }
4277 }
4278
4279 if (op == CEPH_OSD_TMAP_SET) {
4280 bufferlist val;
4281 try {
4282 ::decode(val, bp);
4283 }
4284 catch (buffer::error& e) {
4285 return -EINVAL;
4286 }
4287 ::encode(key, newkeydata);
4288 ::encode(val, newkeydata);
4289 dout(20) << " set " << key << " " << val.length() << dendl;
4290 nkeys++;
4291 } else if (op == CEPH_OSD_TMAP_CREATE) {
4292 if (key_exists) {
4293 return -EEXIST;
4294 }
4295 bufferlist val;
4296 try {
4297 ::decode(val, bp);
4298 }
4299 catch (buffer::error& e) {
4300 return -EINVAL;
4301 }
4302 ::encode(key, newkeydata);
4303 ::encode(val, newkeydata);
4304 dout(20) << " create " << key << " " << val.length() << dendl;
4305 nkeys++;
4306 } else if (op == CEPH_OSD_TMAP_RM) {
4307 // do nothing.
4308 if (!key_exists) {
4309 return -ENOENT;
4310 }
4311 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4312 // do nothing
4313 } else {
4314 dout(10) << " invalid tmap op " << (int)op << dendl;
4315 return -EINVAL;
4316 }
4317 }
4318
4319 // copy remaining
4320 if (have_next) {
4321 ::encode(nextkey, newkeydata);
4322 ::encode(nextval, newkeydata);
4323 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4324 }
4325 if (!ip.end()) {
4326 bufferlist rest;
4327 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4328 dout(20) << " keep trailing " << rest.length()
4329 << " at " << newkeydata.length() << dendl;
4330 newkeydata.claim_append(rest);
4331 }
4332
4333 // encode final key count + key data
4334 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4335 ::encode(nkeys, obl);
4336 obl.claim_append(newkeydata);
4337
4338 if (0) {
4339 dout(30) << " final is \n";
4340 obl.hexdump(*_dout);
4341 *_dout << dendl;
4342
4343 // sanity check
4344 bufferlist::iterator tp = obl.begin();
4345 bufferlist h;
4346 ::decode(h, tp);
4347 map<string,bufferlist> d;
4348 ::decode(d, tp);
4349 assert(tp.end());
4350 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4351 }
4352
4353 // write it out
4354 if (!result) {
4355 dout(20) << "tmapput write " << obl.length() << dendl;
4356 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4357 newop.op.extent.offset = 0;
4358 newop.op.extent.length = obl.length();
4359 newop.indata = obl;
4360 do_osd_ops(ctx, nops);
4361 osd_op.outdata.claim(newop.outdata);
4362 }
4363 }
4364 return result;
4365 }
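
// Illustration only (not compiled): the TMAPUP update stream consumed above
// is a sequence of (op byte, key[, value]) records.  A client would build one
// roughly like this; keeping the keys sorted stays on the fast path, while
// unsorted input falls back to do_tmapup_slow:
//
//   bufferlist updates;
//   __u8 c = CEPH_OSD_TMAP_SET;
//   ::encode(c, updates);
//   ::encode(string("alpha"), updates);
//   bufferlist val;
//   val.append("value");
//   ::encode(val, updates);
//   c = CEPH_OSD_TMAP_RM;
//   ::encode(c, updates);
//   ::encode(string("beta"), updates);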
4366
4367 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4368 {
4369 if (offset >= max ||
4370 length > max ||
4371 offset + length > max)
4372 return -EFBIG;
4373
4374 return 0;
4375 }
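
// Illustration only (not compiled): a typical call from the write paths in
// do_osd_ops below, rejecting extents past the configured object size:
//
//   result = check_offset_and_length(
//     op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
//   if (result < 0)
//     break;  // -EFBIG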
4376
4377 struct FillInVerifyExtent : public Context {
4378 ceph_le64 *r;
4379 int32_t *rval;
4380 bufferlist *outdatap;
4381 boost::optional<uint32_t> maybe_crc;
4382 uint64_t size;
4383 OSDService *osd;
4384 hobject_t soid;
4385 __le32 flags;
4386 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4387 boost::optional<uint32_t> mc, uint64_t size,
4388 OSDService *osd, hobject_t soid, __le32 flags) :
4389 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4390 size(size), osd(osd), soid(soid), flags(flags) {}
4391 void finish(int len) override {
4392 *r = len;
4393 if (len < 0) {
4394 *rval = len;
4395 return;
4396 }
4397 *rval = 0;
4398
4399 // whole object? can we verify the checksum?
4400 if (maybe_crc && *r == size) {
4401 uint32_t crc = outdatap->crc32c(-1);
4402 if (maybe_crc != crc) {
4403 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4404 << " != expected 0x" << *maybe_crc
4405 << std::dec << " on " << soid;
4406 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4407 *rval = -EIO;
4408 *r = 0;
4409 }
4410 }
4411 }
4412 }
4413 };
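
// Illustration only (not compiled): the digest convention FillInVerifyExtent
// relies on.  oi.data_digest is a crc32c over the full object contents with
// seed -1, so it can only be verified when the read covered the whole object:
//
//   uint32_t crc = outdatap->crc32c(-1);
//   bool ok = (maybe_crc && *maybe_crc == crc);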
4414
4415 struct ToSparseReadResult : public Context {
4416 int* result;
4417 bufferlist* data_bl;
4418 uint64_t data_offset;
4419 ceph_le64* len;
4420 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4421 ceph_le64* len)
4422 : result(result), data_bl(bl), data_offset(offset),len(len) {}
4423 void finish(int r) override {
4424 if (r < 0) {
4425 *result = r;
4426 return;
4427 }
4428 *result = 0;
4429 *len = r;
4430 bufferlist outdata;
4431 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4432 ::encode(extents, outdata);
4433 ::encode_destructively(*data_bl, outdata);
4434 data_bl->swap(outdata);
4435 }
4436 };
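
// Illustration only (not compiled): ToSparseReadResult leaves the output in
// the standard sparse-read wire format, an extent map followed by the packed
// data, which a reader unpacks like this:
//
//   bufferlist::iterator p = osd_op.outdata.begin();
//   map<uint64_t, uint64_t> extents;  // offset -> length
//   bufferlist data;
//   ::decode(extents, p);
//   ::decode(data, p);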
4437
4438 template<typename V>
4439 static string list_keys(const map<string, V>& m) {
4440 string s;
4441 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4442 if (!s.empty()) {
4443 s.push_back(',');
4444 }
4445 s.append(itr->first);
4446 }
4447 return s;
4448 }
4449
4450 template<typename T>
4451 static string list_entries(const T& m) {
4452 string s;
4453 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4454 if (!s.empty()) {
4455 s.push_back(',');
4456 }
4457 s.append(*itr);
4458 }
4459 return s;
4460 }
4461
4462 void PrimaryLogPG::maybe_create_new_object(
4463 OpContext *ctx,
4464 bool ignore_transaction)
4465 {
4466 ObjectState& obs = ctx->new_obs;
4467 if (!obs.exists) {
4468 ctx->delta_stats.num_objects++;
4469 obs.exists = true;
4470 assert(!obs.oi.is_whiteout());
4471 obs.oi.new_object();
4472 if (!ignore_transaction)
4473 ctx->op_t->create(obs.oi.soid);
4474 } else if (obs.oi.is_whiteout()) {
4475 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4476 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4477 --ctx->delta_stats.num_whiteouts;
4478 }
4479 }
4480
4481 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4482 OSDOp& osd_op;
4483
4484 ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4485 }
4486
4487 int execute() override {
4488 return osd_op.rval;
4489 }
4490 };
4491
4492 struct C_ChecksumRead : public Context {
4493 PrimaryLogPG *primary_log_pg;
4494 OSDOp &osd_op;
4495 Checksummer::CSumType csum_type;
4496 bufferlist init_value_bl;
4497 ceph_le64 read_length;
4498 bufferlist read_bl;
4499 Context *fill_extent_ctx;
4500
4501 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4502 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4503 boost::optional<uint32_t> maybe_crc, uint64_t size,
4504 OSDService *osd, hobject_t soid, __le32 flags)
4505 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4506 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4507 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4508 &read_bl, maybe_crc, size,
4509 osd, soid, flags)) {
4510 }
4511 ~C_ChecksumRead() override {
4512 delete fill_extent_ctx;
4513 }
4514
4515 void finish(int r) override {
4516 fill_extent_ctx->complete(r);
4517 fill_extent_ctx = nullptr;
4518
4519 if (osd_op.rval >= 0) {
4520 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4521 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4522 &init_value_bl_it, read_bl);
4523 }
4524 }
4525 };
4526
4527 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4528 bufferlist::iterator *bl_it)
4529 {
4530 dout(20) << __func__ << dendl;
4531
4532 auto& op = osd_op.op;
4533 if (op.checksum.chunk_size > 0) {
4534 if (op.checksum.length == 0) {
4535 dout(10) << __func__ << ": length required when chunk size provided"
4536 << dendl;
4537 return -EINVAL;
4538 }
4539 if (op.checksum.length % op.checksum.chunk_size != 0) {
4540 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4541 return -EINVAL;
4542 }
4543 }
4544
4545 auto& oi = ctx->new_obs.oi;
4546 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4547 // zeroed offset+length implies checksumming the whole object
4548 op.checksum.length = oi.size;
4549 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4550 return -EOVERFLOW;
4551 }
4552
4553 Checksummer::CSumType csum_type;
4554 switch (op.checksum.type) {
4555 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4556 csum_type = Checksummer::CSUM_XXHASH32;
4557 break;
4558 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4559 csum_type = Checksummer::CSUM_XXHASH64;
4560 break;
4561 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4562 csum_type = Checksummer::CSUM_CRC32C;
4563 break;
4564 default:
4565 dout(10) << __func__ << ": unknown crc type ("
4566 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4567 return -EINVAL;
4568 }
4569
4570 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4571 if (bl_it->get_remaining() < csum_init_value_size) {
4572 dout(10) << __func__ << ": init value not provided" << dendl;
4573 return -EINVAL;
4574 }
4575
4576 bufferlist init_value_bl;
4577 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4578 csum_init_value_size);
4579 bl_it->advance(csum_init_value_size);
4580
4581 if (pool.info.require_rollback() && op.checksum.length > 0) {
4582 // If there is a data digest and it is possible we are reading the
4583 // entire object, pass the digest.
4584 boost::optional<uint32_t> maybe_crc;
4585 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4586 op.checksum.length >= oi.size) {
4587 maybe_crc = oi.data_digest;
4588 }
4589
4590 // async read
4591 auto& soid = oi.soid;
4592 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4593 std::move(init_value_bl), maybe_crc,
4594 oi.size, osd, soid, op.flags);
4595
4596 ctx->pending_async_reads.push_back({
4597 {op.checksum.offset, op.checksum.length, op.flags},
4598 {&checksum_ctx->read_bl, checksum_ctx}});
4599
4600 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4601 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4602 new ReadFinisher(osd_op));
4603 return -EINPROGRESS;
4604 }
4605
4606 // sync read
4607 std::vector<OSDOp> read_ops(1);
4608 auto& read_op = read_ops[0];
4609 if (op.checksum.length > 0) {
4610 read_op.op.op = CEPH_OSD_OP_READ;
4611 read_op.op.flags = op.flags;
4612 read_op.op.extent.offset = op.checksum.offset;
4613 read_op.op.extent.length = op.checksum.length;
4614 read_op.op.extent.truncate_size = 0;
4615 read_op.op.extent.truncate_seq = 0;
4616
4617 int r = do_osd_ops(ctx, read_ops);
4618 if (r < 0) {
4619 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4620 return r;
4621 }
4622 }
4623
4624 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4625 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4626 read_op.outdata);
4627 }
4628
4629 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4630 Checksummer::CSumType csum_type,
4631 bufferlist::iterator *init_value_bl_it,
4632 const bufferlist &read_bl) {
4633 dout(20) << __func__ << dendl;
4634
4635 auto& op = osd_op.op;
4636
4637 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4638 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4639 << op.checksum.length << dendl;
4640 return -EINVAL;
4641 }
4642
4643 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4644 op.checksum.chunk_size : read_bl.length());
4645 uint32_t csum_count = (csum_chunk_size > 0 ?
4646 read_bl.length() / csum_chunk_size : 0);
4647
4648 bufferlist csum;
4649 bufferptr csum_data;
4650 if (csum_count > 0) {
4651 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4652 csum_data = buffer::create(csum_value_size * csum_count);
4653 csum_data.zero();
4654 csum.append(csum_data);
4655
4656 switch (csum_type) {
4657 case Checksummer::CSUM_XXHASH32:
4658 {
4659 Checksummer::xxhash32::init_value_t init_value;
4660 ::decode(init_value, *init_value_bl_it);
4661 Checksummer::calculate<Checksummer::xxhash32>(
4662 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4663 &csum_data);
4664 }
4665 break;
4666 case Checksummer::CSUM_XXHASH64:
4667 {
4668 Checksummer::xxhash64::init_value_t init_value;
4669 ::decode(init_value, *init_value_bl_it);
4670 Checksummer::calculate<Checksummer::xxhash64>(
4671 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4672 &csum_data);
4673 }
4674 break;
4675 case Checksummer::CSUM_CRC32C:
4676 {
4677 Checksummer::crc32c::init_value_t init_value;
4678 ::decode(init_value, *init_value_bl_it);
4679 Checksummer::calculate<Checksummer::crc32c>(
4680 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4681 &csum_data);
4682 }
4683 break;
4684 default:
4685 break;
4686 }
4687 }
4688
4689 ::encode(csum_count, osd_op.outdata);
4690 osd_op.outdata.claim_append(csum);
4691 return 0;
4692 }
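
// Illustration only (not compiled): the CHECKSUM reply assembled above is a
// count followed by the packed checksum values.  For CSUM_CRC32C a client can
// decode it as below; each value is a raw little-endian integer, not an
// individually length-prefixed blob:
//
//   bufferlist::iterator p = osd_op.outdata.begin();
//   uint32_t count;
//   ::decode(count, p);
//   vector<uint32_t> values(count);
//   for (auto& v : values)
//     ::decode(v, p);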
4693
4694 struct C_ExtentCmpRead : public Context {
4695 PrimaryLogPG *primary_log_pg;
4696 OSDOp &osd_op;
4697 ceph_le64 read_length;
4698 bufferlist read_bl;
4699 Context *fill_extent_ctx;
4700
4701 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4702 boost::optional<uint32_t> maybe_crc, uint64_t size,
4703 OSDService *osd, hobject_t soid, __le32 flags)
4704 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4705 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4706 &read_bl, maybe_crc, size,
4707 osd, soid, flags)) {
4708 }
4709 ~C_ExtentCmpRead() override {
4710 delete fill_extent_ctx;
4711 }
4712
4713 void finish(int r) override {
4714 if (r == -ENOENT) {
4715 osd_op.rval = 0;
4716 read_bl.clear();
4717 delete fill_extent_ctx;
4718 } else {
4719 fill_extent_ctx->complete(r);
4720 }
4721 fill_extent_ctx = nullptr;
4722
4723 if (osd_op.rval >= 0) {
4724 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4725 }
4726 }
4727 };
4728
4729 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4730 {
4731 dout(20) << __func__ << dendl;
4732 ceph_osd_op& op = osd_op.op;
4733
4734 auto& oi = ctx->new_obs.oi;
4735 uint64_t size = oi.size;
4736 if ((oi.truncate_seq < op.extent.truncate_seq) &&
4737 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
4738 size = op.extent.truncate_size;
4739 }
4740
4741 if (op.extent.offset >= size) {
4742 op.extent.length = 0;
4743 } else if (op.extent.offset + op.extent.length > size) {
4744 op.extent.length = size - op.extent.offset;
4745 }
4746
4747 if (op.extent.length == 0) {
4748 dout(20) << __func__ << " zero length extent" << dendl;
4749 return finish_extent_cmp(osd_op, bufferlist{});
4750 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4751 dout(20) << __func__ << " object DNE" << dendl;
4752 return finish_extent_cmp(osd_op, {});
4753 } else if (pool.info.require_rollback()) {
4754 // If there is a data digest and it is possible we are reading the
4755 // entire object, pass the digest.
4756 boost::optional<uint32_t> maybe_crc;
4757 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4758 op.checksum.length >= oi.size) {
4759 maybe_crc = oi.data_digest;
4760 }
4761
4762 // async read
4763 auto& soid = oi.soid;
4764 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4765 osd, soid, op.flags);
4766 ctx->pending_async_reads.push_back({
4767 {op.extent.offset, op.extent.length, op.flags},
4768 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4769
4770 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4771
4772 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4773 new ReadFinisher(osd_op));
4774 return -EINPROGRESS;
4775 }
4776
4777 // sync read
4778 vector<OSDOp> read_ops(1);
4779 OSDOp& read_op = read_ops[0];
4780
4781 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4782 read_op.op.extent.offset = op.extent.offset;
4783 read_op.op.extent.length = op.extent.length;
4784 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4785 read_op.op.extent.truncate_size = op.extent.truncate_size;
4786
4787 int result = do_osd_ops(ctx, read_ops);
4788 if (result < 0) {
4789 derr << __func__ << " failed " << result << dendl;
4790 return result;
4791 }
4792 return finish_extent_cmp(osd_op, read_op.outdata);
4793 }
4794
4795 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4796 {
4797 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4798 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4799 if (osd_op.indata[idx] != read_byte) {
4800 return (-MAX_ERRNO - idx);
4801 }
4802 }
4803
4804 return 0;
4805 }
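
// Illustration only (not compiled): a CMPEXT mismatch is reported by folding
// the offset of the first differing byte into the return value; a caller
// recovers it like this:
//
//   int rval = finish_extent_cmp(osd_op, read_bl);
//   if (rval <= -MAX_ERRNO) {
//     uint64_t first_mismatch = (uint64_t)(-rval) - MAX_ERRNO;
//   }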
4806
4807 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4808 dout(20) << __func__ << dendl;
4809 auto& op = osd_op.op;
4810 auto& oi = ctx->new_obs.oi;
4811 auto& soid = oi.soid;
4812 __u32 seq = oi.truncate_seq;
4813 uint64_t size = oi.size;
4814 bool trimmed_read = false;
4815
4816 // are we beyond truncate_size?
4817 if ( (seq < op.extent.truncate_seq) &&
4818 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4819 size = op.extent.truncate_size;
4820
4821 if (op.extent.length == 0) // a length of zero means read the whole object
4822 op.extent.length = size;
4823
4824 if (op.extent.offset >= size) {
4825 op.extent.length = 0;
4826 trimmed_read = true;
4827 } else if (op.extent.offset + op.extent.length > size) {
4828 op.extent.length = size - op.extent.offset;
4829 trimmed_read = true;
4830 }
4831
4832 // read into a buffer
4833 int result = 0;
4834 if (trimmed_read && op.extent.length == 0) {
4835 // the read was trimmed to zero and we are expected to do nothing;
4836 // a read of 0 bytes does *not* do nothing (it reads the whole
4837 // object), which is why the trimmed_read flag is needed
4838 } else if (pool.info.require_rollback()) {
4839 boost::optional<uint32_t> maybe_crc;
4840 // If there is a data digest and it is possible we are reading the
4841 // entire object, pass the digest. FillInVerifyExtent will check
4842 // oi.size again.
4843 if (oi.is_data_digest() && op.extent.offset == 0 &&
4844 op.extent.length >= oi.size)
4845 maybe_crc = oi.data_digest;
4846 ctx->pending_async_reads.push_back(
4847 make_pair(
4848 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4849 make_pair(&osd_op.outdata,
4850 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4851 &osd_op.outdata, maybe_crc, oi.size,
4852 osd, soid, op.flags))));
4853 dout(10) << " async_read noted for " << soid << dendl;
4854
4855 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4856 new ReadFinisher(osd_op));
4857 } else {
4858 int r = pgbackend->objects_read_sync(
4859 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4860 if (r == -EIO) {
4861 r = rep_repair_primary_object(soid, ctx->op);
4862 }
4863 if (r >= 0)
4864 op.extent.length = r;
4865 else {
4866 result = r;
4867 op.extent.length = 0;
4868 }
4869 dout(10) << " read got " << r << " / " << op.extent.length
4870 << " bytes from obj " << soid << dendl;
4871
4872 // whole object? can we verify the checksum?
4873 if (op.extent.length == oi.size && oi.is_data_digest()) {
4874 uint32_t crc = osd_op.outdata.crc32c(-1);
4875 if (oi.data_digest != crc) {
4876 osd->clog->error() << info.pgid << std::hex
4877 << " full-object read crc 0x" << crc
4878 << " != expected 0x" << oi.data_digest
4879 << std::dec << " on " << soid;
4880 // FIXME fall back to replica or something?
4881 result = -EIO;
4882 }
4883 }
4884 }
4885
4886 // XXX for an async read, op.extent.length is the requested length;
4887 // on error it is changed to 0 after the error comes back.
4888 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4889 ctx->delta_stats.num_rd++;
4890 return result;
4891 }
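
// Worked example (illustrative numbers) for the truncate clamping in do_read:
// with oi.size = 100 and oi.truncate_seq = 1, a client read of 40~30 carrying
// truncate_seq = 2 / truncate_size = 50 treats the size as 50 (since
// 40 + 30 > 50) and trims the read to 40~10.  A read of 60~10 would be
// trimmed to length 0 and, thanks to trimmed_read, return no data instead of
// being reinterpreted as "read the whole object".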
4892
4893 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4894 dout(20) << __func__ << dendl;
4895 auto& op = osd_op.op;
4896 auto& oi = ctx->new_obs.oi;
4897 auto& soid = oi.soid;
4898
4899 if (op.extent.truncate_seq) {
4900 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4901 return -EINVAL;
4902 }
4903
4904 ++ctx->num_read;
4905 if (pool.info.ec_pool()) {
4906 // translate sparse read to a normal one if not supported
4907 uint64_t offset = op.extent.offset;
4908 uint64_t length = op.extent.length;
4909 if (offset > oi.size) {
4910 length = 0;
4911 } else if (offset + length > oi.size) {
4912 length = oi.size - offset;
4913 }
4914
4915 if (length > 0) {
4916 ctx->pending_async_reads.push_back(
4917 make_pair(
4918 boost::make_tuple(offset, length, op.flags),
4919 make_pair(
4920 &osd_op.outdata,
4921 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4922 &op.extent.length))));
4923 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4924
4925 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4926 new ReadFinisher(osd_op));
4927 } else {
4928 dout(10) << " sparse read ended up empty for " << soid << dendl;
4929 map<uint64_t, uint64_t> extents;
4930 ::encode(extents, osd_op.outdata);
4931 }
4932 } else {
4933 // read into a buffer
4934 map<uint64_t, uint64_t> m;
4935 uint32_t total_read = 0;
4936 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4937 info.pgid.shard),
4938 op.extent.offset, op.extent.length, m);
4939 if (r < 0) {
4940 return r;
4941 }
4942
4943 map<uint64_t, uint64_t>::iterator miter;
4944 bufferlist data_bl;
4945 uint64_t last = op.extent.offset;
4946 for (miter = m.begin(); miter != m.end(); ++miter) {
4947 // verify hole?
4948 if (cct->_conf->osd_verify_sparse_read_holes &&
4949 last < miter->first) {
4950 bufferlist t;
4951 uint64_t len = miter->first - last;
4952 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4953 if (r < 0) {
4954 osd->clog->error() << coll << " " << soid
4955 << " sparse-read failed to read: "
4956 << r;
4957 } else if (!t.is_zero()) {
4958 osd->clog->error() << coll << " " << soid
4959 << " sparse-read found data in hole "
4960 << last << "~" << len;
4961 }
4962 }
4963
4964 bufferlist tmpbl;
4965 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4966 op.flags, &tmpbl);
4967 if (r == -EIO) {
4968 r = rep_repair_primary_object(soid, ctx->op);
4969 }
4970 if (r < 0) {
4971 return r;
4972 }
4973
4974 // this usually happens when we get an extent that exceeds the actual
4975 // file size
4976 if (r < (int)miter->second)
4977 miter->second = r;
4978 total_read += r;
4979 dout(10) << "sparse-read " << miter->first << "@" << miter->second
4980 << dendl;
4981 data_bl.claim_append(tmpbl);
4982 last = miter->first + r;
4983 }
4984
4985 if (r < 0) {
4986 return r;
4987 }
4988
4989 // verify trailing hole?
4990 if (cct->_conf->osd_verify_sparse_read_holes) {
4991 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4992 if (last < end) {
4993 bufferlist t;
4994 uint64_t len = end - last;
4995 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4996 if (r < 0) {
4997 osd->clog->error() << coll << " " << soid
4998 << " sparse-read failed to read: " << r;
4999 } else if (!t.is_zero()) {
5000 osd->clog->error() << coll << " " << soid
5001 << " sparse-read found data in hole "
5002 << last << "~" << len;
5003 }
5004 }
5005 }
5006
5007 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
5008 // and while there may not be many whole objects at first, more and more
5009 // whole objects accumulate with continued use, so verifying the
5010 // full-object digest on a sparse read makes sense.
5011 if (total_read == oi.size && oi.is_data_digest()) {
5012 uint32_t crc = data_bl.crc32c(-1);
5013 if (oi.data_digest != crc) {
5014 osd->clog->error() << info.pgid << std::hex
5015 << " full-object read crc 0x" << crc
5016 << " != expected 0x" << oi.data_digest
5017 << std::dec << " on " << soid;
5018 // FIXME fall back to replica or something?
5019 return -EIO;
5020 }
5021 }
5022
5023 op.extent.length = total_read;
5024
5025 ::encode(m, osd_op.outdata); // re-encode since it might be modified
5026 ::encode_destructively(data_bl, osd_op.outdata);
5027
5028 dout(10) << " sparse_read got " << total_read << " bytes from object "
5029 << soid << dendl;
5030 }
5031
5032 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5033 ctx->delta_stats.num_rd++;
5034 return 0;
5035 }
5036
5037 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5038 {
5039 int result = 0;
5040 SnapSetContext *ssc = ctx->obc->ssc;
5041 ObjectState& obs = ctx->new_obs;
5042 object_info_t& oi = obs.oi;
5043 const hobject_t& soid = oi.soid;
5044
5045 PGTransaction* t = ctx->op_t.get();
5046
5047 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5048
5049 ctx->current_osd_subop_num = 0;
5050 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5051 OSDOp& osd_op = *p;
5052 ceph_osd_op& op = osd_op.op;
5053
5054 OpFinisher* op_finisher = nullptr;
5055 {
5056 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5057 if (op_finisher_it != ctx->op_finishers.end()) {
5058 op_finisher = op_finisher_it->second.get();
5059 }
5060 }
5061
5062 // TODO: check endianness (__le32 vs uint32_t, etc.)
5063 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5064 // but the code in this function seems to treat them as native-endian. What should the
5065 // tracepoints do?
5066 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5067
5068 dout(10) << "do_osd_op " << osd_op << dendl;
5069
5070 bufferlist::iterator bp = osd_op.indata.begin();
5071
5072 // user-visible modification?
5073 switch (op.op) {
5074 // non user-visible modifications
5075 case CEPH_OSD_OP_WATCH:
5076 case CEPH_OSD_OP_CACHE_EVICT:
5077 case CEPH_OSD_OP_CACHE_FLUSH:
5078 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5079 case CEPH_OSD_OP_UNDIRTY:
5080 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5081 case CEPH_OSD_OP_CACHE_PIN:
5082 case CEPH_OSD_OP_CACHE_UNPIN:
5083 case CEPH_OSD_OP_SET_REDIRECT:
5084 break;
5085 default:
5086 if (op.op & CEPH_OSD_OP_MODE_WR)
5087 ctx->user_modify = true;
5088 }
5089
5090 // munge -1 truncate to 0 truncate
5091 if (ceph_osd_op_uses_extent(op.op) &&
5092 op.extent.truncate_seq == 1 &&
5093 op.extent.truncate_size == (-1ULL)) {
5094 op.extent.truncate_size = 0;
5095 op.extent.truncate_seq = 0;
5096 }
5097
5098 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5099 if (op.op == CEPH_OSD_OP_ZERO &&
5100 obs.exists &&
5101 op.extent.offset < cct->_conf->osd_max_object_size &&
5102 op.extent.length >= 1 &&
5103 op.extent.length <= cct->_conf->osd_max_object_size &&
5104 op.extent.offset + op.extent.length >= oi.size) {
5105 if (op.extent.offset >= oi.size) {
5106 // no-op
5107 goto fail;
5108 }
5109 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5110 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5111 op.op = CEPH_OSD_OP_TRUNCATE;
5112 }
5113
5114 switch (op.op) {
5115
5116 // --- READS ---
5117
5118 case CEPH_OSD_OP_CMPEXT:
5119 ++ctx->num_read;
5120 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5121 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5122 op.extent.length, op.extent.truncate_size,
5123 op.extent.truncate_seq);
5124
5125 if (op_finisher == nullptr) {
5126 result = do_extent_cmp(ctx, osd_op);
5127 } else {
5128 result = op_finisher->execute();
5129 }
5130 break;
5131
5132 case CEPH_OSD_OP_SYNC_READ:
5133 if (pool.info.require_rollback()) {
5134 result = -EOPNOTSUPP;
5135 break;
5136 }
5137 // fall through
5138 case CEPH_OSD_OP_READ:
5139 ++ctx->num_read;
5140 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5141 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5142 op.extent.length, op.extent.truncate_size,
5143 op.extent.truncate_seq);
5144 if (op_finisher == nullptr) {
5145 if (!ctx->data_off) {
5146 ctx->data_off = op.extent.offset;
5147 }
5148 result = do_read(ctx, osd_op);
5149 } else {
5150 result = op_finisher->execute();
5151 }
5152 break;
5153
5154 case CEPH_OSD_OP_CHECKSUM:
5155 ++ctx->num_read;
5156 {
5157 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5158 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5159 op.checksum.offset, op.checksum.length,
5160 op.checksum.chunk_size);
5161
5162 if (op_finisher == nullptr) {
5163 result = do_checksum(ctx, osd_op, &bp);
5164 } else {
5165 result = op_finisher->execute();
5166 }
5167 }
5168 break;
5169
5170 /* map extents */
5171 case CEPH_OSD_OP_MAPEXT:
5172 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5173 if (pool.info.require_rollback()) {
5174 result = -EOPNOTSUPP;
5175 break;
5176 }
5177 ++ctx->num_read;
5178 {
5179 // read into a buffer
5180 bufferlist bl;
5181 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5182 info.pgid.shard),
5183 op.extent.offset, op.extent.length, bl);
5184 osd_op.outdata.claim(bl);
5185 if (r < 0)
5186 result = r;
5187 else
5188 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5189 ctx->delta_stats.num_rd++;
5190 dout(10) << " map_extents done on object " << soid << dendl;
5191 }
5192 break;
5193
5194 /* map extents */
5195 case CEPH_OSD_OP_SPARSE_READ:
5196 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5197 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5198 op.extent.length, op.extent.truncate_size,
5199 op.extent.truncate_seq);
5200 if (op_finisher == nullptr) {
5201 result = do_sparse_read(ctx, osd_op);
5202 } else {
5203 result = op_finisher->execute();
5204 }
5205 break;
5206
5207 case CEPH_OSD_OP_CALL:
5208 {
5209 string cname, mname;
5210 bufferlist indata;
5211 try {
5212 bp.copy(op.cls.class_len, cname);
5213 bp.copy(op.cls.method_len, mname);
5214 bp.copy(op.cls.indata_len, indata);
5215 } catch (buffer::error& e) {
5216 dout(10) << "call unable to decode class + method + indata" << dendl;
5217 dout(30) << "in dump: ";
5218 osd_op.indata.hexdump(*_dout);
5219 *_dout << dendl;
5220 result = -EINVAL;
5221 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5222 break;
5223 }
5224 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5225
5226 ClassHandler::ClassData *cls;
5227 result = osd->class_handler->open_class(cname, &cls);
5228 assert(result == 0); // init_op_flags() already verified this works.
5229
5230 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5231 if (!method) {
5232 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5233 result = -EOPNOTSUPP;
5234 break;
5235 }
5236
5237 int flags = method->get_flags();
5238 if (flags & CLS_METHOD_WR)
5239 ctx->user_modify = true;
5240
5241 bufferlist outdata;
5242 dout(10) << "call method " << cname << "." << mname << dendl;
5243 int prev_rd = ctx->num_read;
5244 int prev_wr = ctx->num_write;
5245 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5246
5247 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5248 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5249 result = -EIO;
5250 break;
5251 }
5252 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5253 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5254 result = -EIO;
5255 break;
5256 }
5257
5258 dout(10) << "method called response length=" << outdata.length() << dendl;
5259 op.extent.length = outdata.length();
5260 osd_op.outdata.claim_append(outdata);
5261 dout(30) << "out dump: ";
5262 osd_op.outdata.hexdump(*_dout);
5263 *_dout << dendl;
5264 }
5265 break;
5266
5267 case CEPH_OSD_OP_STAT:
5268 // note: stat does not require RD
5269 {
5270 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5271
5272 if (obs.exists && !oi.is_whiteout()) {
5273 ::encode(oi.size, osd_op.outdata);
5274 ::encode(oi.mtime, osd_op.outdata);
5275 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5276 } else {
5277 result = -ENOENT;
5278 dout(10) << "stat oi object does not exist" << dendl;
5279 }
5280
5281 ctx->delta_stats.num_rd++;
5282 }
5283 break;
5284
5285 case CEPH_OSD_OP_ISDIRTY:
5286 ++ctx->num_read;
5287 {
5288 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5289 bool is_dirty = obs.oi.is_dirty();
5290 ::encode(is_dirty, osd_op.outdata);
5291 ctx->delta_stats.num_rd++;
5292 result = 0;
5293 }
5294 break;
5295
5296 case CEPH_OSD_OP_UNDIRTY:
5297 ++ctx->num_write;
5298 {
5299 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5300 if (oi.is_dirty()) {
5301 ctx->undirty = true; // see make_writeable()
5302 ctx->modify = true;
5303 ctx->delta_stats.num_wr++;
5304 }
5305 result = 0;
5306 }
5307 break;
5308
5309 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5310 ++ctx->num_write;
5311 {
5312 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5313 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5314 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5315 result = -EINVAL;
5316 break;
5317 }
5318 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5319 result = -EINVAL;
5320 break;
5321 }
5322 if (!obs.exists) {
5323 result = 0;
5324 break;
5325 }
5326 if (oi.is_cache_pinned()) {
5327 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5328 result = -EPERM;
5329 break;
5330 }
5331 if (oi.is_dirty()) {
5332 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5333 if (result == -EINPROGRESS)
5334 result = -EAGAIN;
5335 } else {
5336 result = 0;
5337 }
5338 }
5339 break;
5340
5341 case CEPH_OSD_OP_CACHE_FLUSH:
5342 ++ctx->num_write;
5343 {
5344 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5345 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5346 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5347 result = -EINVAL;
5348 break;
5349 }
5350 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5351 result = -EINVAL;
5352 break;
5353 }
5354 if (!obs.exists) {
5355 result = 0;
5356 break;
5357 }
5358 if (oi.is_cache_pinned()) {
5359 dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
5360 result = -EPERM;
5361 break;
5362 }
5363 hobject_t missing;
5364 if (oi.is_dirty()) {
5365 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5366 if (result == -EINPROGRESS)
5367 result = -EAGAIN;
5368 } else {
5369 result = 0;
5370 }
5371 // Check the special return value for which start_flush has set 'missing'
5372 if (result == -ENOENT) {
5373 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5374 assert(!missing.is_min());
5375 wait_for_unreadable_object(missing, ctx->op);
5376 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5377 result = -EAGAIN;
5378 }
5379 }
5380 break;
5381
5382 case CEPH_OSD_OP_CACHE_EVICT:
5383 ++ctx->num_write;
5384 {
5385 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5386 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5387 result = -EINVAL;
5388 break;
5389 }
5390 if (!obs.exists) {
5391 result = 0;
5392 break;
5393 }
5394 if (oi.is_cache_pinned()) {
5395 dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
5396 result = -EPERM;
5397 break;
5398 }
5399 if (oi.is_dirty()) {
5400 result = -EBUSY;
5401 break;
5402 }
5403 if (!oi.watchers.empty()) {
5404 result = -EBUSY;
5405 break;
5406 }
5407 if (soid.snap == CEPH_NOSNAP) {
5408 result = _verify_no_head_clones(soid, ssc->snapset);
5409 if (result < 0)
5410 break;
5411 }
5412 result = _delete_oid(ctx, true, false);
5413 if (result >= 0) {
5414 // mark that this is a cache eviction to avoid triggering normal
5415 // make_writeable() clone or snapdir object creation in finish_ctx()
5416 ctx->cache_evict = true;
5417 }
5418 osd->logger->inc(l_osd_tier_evict);
5419 }
5420 break;
5421
5422 case CEPH_OSD_OP_GETXATTR:
5423 ++ctx->num_read;
5424 {
5425 string aname;
5426 bp.copy(op.xattr.name_len, aname);
5427 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5428 string name = "_" + aname;
5429 int r = getattr_maybe_cache(
5430 ctx->obc,
5431 name,
5432 &(osd_op.outdata));
5433 if (r >= 0) {
5434 op.xattr.value_len = osd_op.outdata.length();
5435 result = 0;
5436 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5437 } else
5438 result = r;
5439
5440 ctx->delta_stats.num_rd++;
5441 }
5442 break;
5443
5444 case CEPH_OSD_OP_GETXATTRS:
5445 ++ctx->num_read;
5446 {
5447 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5448 map<string, bufferlist> out;
5449 result = getattrs_maybe_cache(
5450 ctx->obc,
5451 &out);
5452
5453 bufferlist bl;
5454 ::encode(out, bl);
5455 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5456 ctx->delta_stats.num_rd++;
5457 osd_op.outdata.claim_append(bl);
5458 }
5459 break;
5460
5461 case CEPH_OSD_OP_CMPXATTR:
5462 ++ctx->num_read;
5463 {
5464 string aname;
5465 bp.copy(op.xattr.name_len, aname);
5466 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5467 string name = "_" + aname;
5468 name[op.xattr.name_len + 1] = 0;
5469
5470 bufferlist xattr;
5471 result = getattr_maybe_cache(
5472 ctx->obc,
5473 name,
5474 &xattr);
5475 if (result < 0 && result != -EEXIST && result != -ENODATA)
5476 break;
5477
5478 ctx->delta_stats.num_rd++;
5479 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5480
5481 switch (op.xattr.cmp_mode) {
5482 case CEPH_OSD_CMPXATTR_MODE_STRING:
5483 {
5484 string val;
5485 bp.copy(op.xattr.value_len, val);
5486 val[op.xattr.value_len] = 0;
5487 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5488 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5489 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5490 }
5491 break;
5492
5493 case CEPH_OSD_CMPXATTR_MODE_U64:
5494 {
5495 uint64_t u64val;
5496 try {
5497 ::decode(u64val, bp);
5498 }
5499 catch (buffer::error& e) {
5500 result = -EINVAL;
5501 goto fail;
5502 }
5503 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5504 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5505 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5506 }
5507 break;
5508
5509 default:
5510 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5511 result = -EINVAL;
5512 }
5513
5514 if (!result) {
5515 dout(10) << "comparison returned false" << dendl;
5516 result = -ECANCELED;
5517 break;
5518 }
5519 if (result < 0) {
5520 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5521 break;
5522 }
5523
5524 dout(10) << "comparison returned true" << dendl;
5525 }
5526 break;
5527
5528 case CEPH_OSD_OP_ASSERT_VER:
5529 ++ctx->num_read;
5530 {
5531 uint64_t ver = op.assert_ver.ver;
5532 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5533 if (!ver)
5534 result = -EINVAL;
5535 else if (ver < oi.user_version)
5536 result = -ERANGE;
5537 else if (ver > oi.user_version)
5538 result = -EOVERFLOW;
5539 }
5540 break;
5541
5542 case CEPH_OSD_OP_LIST_WATCHERS:
5543 ++ctx->num_read;
5544 {
5545 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5546 obj_list_watch_response_t resp;
5547
5548 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5549 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5550 ++oi_iter) {
5551 dout(20) << "key cookie=" << oi_iter->first.first
5552 << " entity=" << oi_iter->first.second << " "
5553 << oi_iter->second << dendl;
5554 assert(oi_iter->first.first == oi_iter->second.cookie);
5555 assert(oi_iter->first.second.is_client());
5556
5557 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5558 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5559 resp.entries.push_back(wi);
5560 }
5561
5562 resp.encode(osd_op.outdata, ctx->get_features());
5563 result = 0;
5564
5565 ctx->delta_stats.num_rd++;
5566 break;
5567 }
5568
5569 case CEPH_OSD_OP_LIST_SNAPS:
5570 ++ctx->num_read;
5571 {
5572 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5573 obj_list_snap_response_t resp;
5574
5575 if (!ssc) {
5576 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5577 }
5578 assert(ssc);
5579
5580 int clonecount = ssc->snapset.clones.size();
5581 if (ssc->snapset.head_exists)
5582 clonecount++;
5583 resp.clones.reserve(clonecount);
5584 for (auto clone_iter = ssc->snapset.clones.begin();
5585 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5586 clone_info ci;
5587 ci.cloneid = *clone_iter;
5588
5589 hobject_t clone_oid = soid;
5590 clone_oid.snap = *clone_iter;
5591
5592 if (!ssc->snapset.is_legacy()) {
5593 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5594 if (p == ssc->snapset.clone_snaps.end()) {
5595 osd->clog->error() << "osd." << osd->whoami
5596 << ": inconsistent clone_snaps found for oid "
5597 << soid << " clone " << *clone_iter
5598 << " snapset " << ssc->snapset;
5599 result = -EINVAL;
5600 break;
5601 }
5602 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5603 ci.snaps.push_back(*q);
5604 }
5605 } else {
5606 /* No need to take a lock here. We are only inspecting state cached
5607 * in the ObjectContext, so we aren't performing an actual read unless
5608 * the clone obc is not already loaded (in which case, it cannot have
5609 * an in progress write). We also do not risk exposing uncommitted
5610 * state since we do have a read lock on the head object or snapdir,
5611 * which we would have to write lock in order to make user visible
5612 * modifications to the snapshot state (snap trim related mutations
5613 * are not user visible).
5614 */
5615 if (is_missing_object(clone_oid)) {
5616 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5617 wait_for_unreadable_object(clone_oid, ctx->op);
5618 result = -EAGAIN;
5619 break;
5620 }
5621
5622 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5623 if (!clone_obc) {
5624 if (maybe_handle_cache(
5625 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5626 // promoting the clone
5627 result = -EAGAIN;
5628 } else {
5629 osd->clog->error() << "osd." << osd->whoami
5630 << ": missing clone " << clone_oid
5631 << " for oid "
5632 << soid;
5633 // should not happen
5634 result = -ENOENT;
5635 }
5636 break;
5637 }
5638 for (vector<snapid_t>::reverse_iterator p =
5639 clone_obc->obs.oi.legacy_snaps.rbegin();
5640 p != clone_obc->obs.oi.legacy_snaps.rend();
5641 ++p) {
5642 ci.snaps.push_back(*p);
5643 }
5644 }
5645
5646 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5647
5648 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5649 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5650 if (coi == ssc->snapset.clone_overlap.end()) {
5651 osd->clog->error() << "osd." << osd->whoami
5652 << ": inconsistent clone_overlap found for oid "
5653 << soid << " clone " << *clone_iter;
5654 result = -EINVAL;
5655 break;
5656 }
5657 const interval_set<uint64_t> &o = coi->second;
5658 ci.overlap.reserve(o.num_intervals());
5659 for (interval_set<uint64_t>::const_iterator r = o.begin();
5660 r != o.end(); ++r) {
5661 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5662 r.get_len()));
5663 }
5664
5665 map<snapid_t, uint64_t>::const_iterator si;
5666 si = ssc->snapset.clone_size.find(ci.cloneid);
5667 if (si == ssc->snapset.clone_size.end()) {
5668 osd->clog->error() << "osd." << osd->whoami
5669 << ": inconsistent clone_size found for oid "
5670 << soid << " clone " << *clone_iter;
5671 result = -EINVAL;
5672 break;
5673 }
5674 ci.size = si->second;
5675
5676 resp.clones.push_back(ci);
5677 }
5678 if (result < 0) {
5679 break;
5680 }
5681 if (ssc->snapset.head_exists &&
5682 !ctx->obc->obs.oi.is_whiteout()) {
5683 assert(obs.exists);
5684 clone_info ci;
5685 ci.cloneid = CEPH_NOSNAP;
5686
5687 //Size for HEAD is oi.size
5688 ci.size = oi.size;
5689
5690 resp.clones.push_back(ci);
5691 }
5692 resp.seq = ssc->snapset.seq;
5693
5694 resp.encode(osd_op.outdata);
5695 result = 0;
5696
5697 ctx->delta_stats.num_rd++;
5698 break;
5699 }
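/*
 * Illustrative sketch (assuming the librados C++ API): the response
 * assembled above is what self-managed-snapshot clients consume via
 *
 *   librados::snap_set_t ss;
 *   int r = ioctx.list_snaps(oid, &ss);
 *
 * (typically with the IoCtx read context set to SNAP_DIR), yielding one
 * clone entry per clone plus a CEPH_NOSNAP entry for the head.
 */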
5700
5701 case CEPH_OSD_OP_NOTIFY:
5702 ++ctx->num_read;
5703 {
5704 uint32_t timeout;
5705 bufferlist bl;
5706
5707 try {
5708 uint32_t ver; // obsolete
5709 ::decode(ver, bp);
5710 ::decode(timeout, bp);
5711 ::decode(bl, bp);
5712 } catch (const buffer::error &e) {
5713 timeout = 0;
5714 }
5715 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5716 if (!timeout)
5717 timeout = cct->_conf->osd_default_notify_timeout;
5718
5719 notify_info_t n;
5720 n.timeout = timeout;
5721 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5722 n.cookie = op.watch.cookie;
5723 n.bl = bl;
5724 ctx->notifies.push_back(n);
5725
5726 // return our unique notify id to the client
5727 ::encode(n.notify_id, osd_op.outdata);
5728 }
5729 break;
5730
5731 case CEPH_OSD_OP_NOTIFY_ACK:
5732 ++ctx->num_read;
5733 {
5734 try {
5735 uint64_t notify_id = 0;
5736 uint64_t watch_cookie = 0;
5737 ::decode(notify_id, bp);
5738 ::decode(watch_cookie, bp);
5739 bufferlist reply_bl;
5740 if (!bp.end()) {
5741 ::decode(reply_bl, bp);
5742 }
5743 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5744 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5745 ctx->notify_acks.push_back(ack);
5746 } catch (const buffer::error &e) {
5747 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5748 OpContext::NotifyAck ack(
5749 // op.watch.cookie is actually the notify_id for historical reasons
5750 op.watch.cookie
5751 );
5752 ctx->notify_acks.push_back(ack);
5753 }
5754 }
5755 break;
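/*
 * Illustrative sketch (assuming the librados C++ API): the NOTIFY op
 * above allocates a cluster-unique notify_id and hands it back in
 * outdata; a client round-trip is roughly
 *
 *   bufferlist payload, reply;
 *   int r = ioctx.notify2(oid, payload, 10000, &reply);  // 10s timeout
 *
 * and each watcher that handles the notify answers with NOTIFY_ACK
 * (librados notify_ack()), which is what populates ctx->notify_acks.
 */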
5756
5757 case CEPH_OSD_OP_SETALLOCHINT:
5758 ++ctx->num_write;
5759 {
5760 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5761 maybe_create_new_object(ctx);
5762 oi.expected_object_size = op.alloc_hint.expected_object_size;
5763 oi.expected_write_size = op.alloc_hint.expected_write_size;
5764 oi.alloc_hint_flags = op.alloc_hint.flags;
5765 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5766 op.alloc_hint.expected_write_size,
5767 op.alloc_hint.flags);
5768 ctx->delta_stats.num_wr++;
5769 result = 0;
5770 }
5771 break;
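/*
 * Illustrative sketch (assuming librados C++): the hint above comes from
 * clients via e.g.
 *
 *   ioctx.set_alloc_hint2(oid, 4 << 20, 4 << 20,
 *                         LIBRADOS_ALLOC_HINT_FLAG_SEQUENTIAL_WRITE);
 *
 * It is purely advisory: the ObjectStore may use it to size blobs or
 * preallocate, but correctness never depends on it, hence the
 * unconditional result = 0.
 */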
5772
5773
5774 // --- WRITES ---
5775
5776 // -- object data --
5777
5778 case CEPH_OSD_OP_WRITE:
5779 ++ctx->num_write;
5780 { // write
5781 __u32 seq = oi.truncate_seq;
5782 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5783 if (op.extent.length != osd_op.indata.length()) {
5784 result = -EINVAL;
5785 break;
5786 }
5787
5788 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5789 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5790
5791 if (pool.info.requires_aligned_append() &&
5792 (op.extent.offset % pool.info.required_alignment() != 0)) {
5793 result = -EOPNOTSUPP;
5794 break;
5795 }
5796
5797 if (!obs.exists) {
5798 if (pool.info.requires_aligned_append() && op.extent.offset) {
5799 result = -EOPNOTSUPP;
5800 break;
5801 }
5802 } else if (op.extent.offset != oi.size &&
5803 pool.info.requires_aligned_append()) {
5804 result = -EOPNOTSUPP;
5805 break;
5806 }
5807
5808 if (seq && (seq > op.extent.truncate_seq) &&
5809 (op.extent.offset + op.extent.length > oi.size)) {
5810 // old write, arrived after trimtrunc
5811 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5812 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5813 << ", adjusting write length to " << op.extent.length << dendl;
5814 bufferlist t;
5815 t.substr_of(osd_op.indata, 0, op.extent.length);
5816 osd_op.indata.swap(t);
5817 }
5818 if (op.extent.truncate_seq > seq) {
5819 // write arrives before trimtrunc
5820 if (obs.exists && !oi.is_whiteout()) {
5821 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5822 << ", truncating to " << op.extent.truncate_size << dendl;
5823 t->truncate(soid, op.extent.truncate_size);
5824 oi.truncate_seq = op.extent.truncate_seq;
5825 oi.truncate_size = op.extent.truncate_size;
5826 if (op.extent.truncate_size != oi.size) {
5827 ctx->delta_stats.num_bytes -= oi.size;
5828 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5829 oi.size = op.extent.truncate_size;
5830 }
5831 } else {
5832 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5833 << ", but object is new" << dendl;
5834 oi.truncate_seq = op.extent.truncate_seq;
5835 oi.truncate_size = op.extent.truncate_size;
5836 }
5837 }
5838 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5839 if (result < 0)
5840 break;
5841
5842 maybe_create_new_object(ctx);
5843
5844 if (op.extent.length == 0) {
5845 if (op.extent.offset > oi.size) {
5846 t->truncate(
5847 soid, op.extent.offset);
5848 } else {
5849 t->nop(soid);
5850 }
5851 } else {
5852 t->write(
5853 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5854 }
5855
5856 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5857 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5858 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5859 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5860 else
5861 obs.oi.clear_data_digest();
5862 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5863 op.extent.offset, op.extent.length);
5864
5865 }
5866 break;
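/*
 * Note on the digest maintenance above (a sketch, relying only on
 * bufferlist::crc32c chaining): a whole-object digest can be extended
 * across a pure append because
 *
 *   uint32_t d = a.crc32c(-1);   // digest of contents a
 *   d = b.crc32c(d);             // == digest of a followed by b
 *
 * so a full overwrite seeds a fresh digest, an append extends it, and
 * any other partial overwrite forces clear_data_digest().
 */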
5867
5868 case CEPH_OSD_OP_WRITEFULL:
5869 ++ctx->num_write;
5870 { // write full object
5871 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5872
5873 if (op.extent.length != osd_op.indata.length()) {
5874 result = -EINVAL;
5875 break;
5876 }
5877 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5878 if (result < 0)
5879 break;
5880
5881 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5882 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5883
5884 maybe_create_new_object(ctx);
5885 if (pool.info.require_rollback()) {
5886 t->truncate(soid, 0);
5887 } else if (obs.exists && op.extent.length < oi.size) {
5888 t->truncate(soid, op.extent.length);
5889 }
5890 if (op.extent.length) {
5891 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5892 }
5893 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5894
5895 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5896 0, op.extent.length, true);
5897 }
5898 break;
5899
5900 case CEPH_OSD_OP_WRITESAME:
5901 ++ctx->num_write;
5902 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5903 result = do_writesame(ctx, osd_op);
5904 break;
5905
5906 case CEPH_OSD_OP_ROLLBACK:
5907 ++ctx->num_write;
5908 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5909 result = _rollback_to(ctx, op);
5910 break;
5911
5912 case CEPH_OSD_OP_ZERO:
5913 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5914 if (pool.info.requires_aligned_append()) {
5915 result = -EOPNOTSUPP;
5916 break;
5917 }
5918 ++ctx->num_write;
5919 { // zero
5920 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5921 if (result < 0)
5922 break;
5923 assert(op.extent.length);
5924 if (obs.exists && !oi.is_whiteout()) {
5925 t->zero(soid, op.extent.offset, op.extent.length);
5926 interval_set<uint64_t> ch;
5927 ch.insert(op.extent.offset, op.extent.length);
5928 ctx->modified_ranges.union_of(ch);
5929 ctx->delta_stats.num_wr++;
5930 oi.clear_data_digest();
5931 } else {
5932 // no-op
5933 }
5934 }
5935 break;
5936 case CEPH_OSD_OP_CREATE:
5937 ++ctx->num_write;
5938 {
5939 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5940 int flags = le32_to_cpu(op.flags);
5941 if (obs.exists && !oi.is_whiteout() &&
5942 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5943 result = -EEXIST; /* this is an exclusive create */
5944 } else {
5945 if (osd_op.indata.length()) {
5946 bufferlist::iterator p = osd_op.indata.begin();
5947 string category;
5948 try {
5949 ::decode(category, p);
5950 }
5951 catch (buffer::error& e) {
5952 result = -EINVAL;
5953 goto fail;
5954 }
5955 // category is no longer implemented.
5956 }
5957 if (result >= 0) {
5958 maybe_create_new_object(ctx);
5959 t->nop(soid);
5960 }
5961 }
5962 }
5963 break;
5964
5965 case CEPH_OSD_OP_TRIMTRUNC:
5966 op.extent.offset = op.extent.truncate_size;
5967 // falling through
5968
5969 case CEPH_OSD_OP_TRUNCATE:
5970 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5971 if (pool.info.requires_aligned_append()) {
5972 result = -EOPNOTSUPP;
5973 break;
5974 }
5975 ++ctx->num_write;
5976 {
5977 // truncate
5978 if (!obs.exists || oi.is_whiteout()) {
5979 dout(10) << " object dne, truncate is a no-op" << dendl;
5980 break;
5981 }
5982
5983 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5984 result = -EFBIG;
5985 break;
5986 }
5987
5988 if (op.extent.truncate_seq) {
5989 assert(op.extent.offset == op.extent.truncate_size);
5990 if (op.extent.truncate_seq <= oi.truncate_seq) {
5991 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5992 << ", no-op" << dendl;
5993 break; // old
5994 }
5995 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5996 << ", truncating" << dendl;
5997 oi.truncate_seq = op.extent.truncate_seq;
5998 oi.truncate_size = op.extent.truncate_size;
5999 }
6000
6001 maybe_create_new_object(ctx);
6002 t->truncate(soid, op.extent.offset);
6003 if (oi.size > op.extent.offset) {
6004 interval_set<uint64_t> trim;
6005 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6006 ctx->modified_ranges.union_of(trim);
6007 }
6008 if (op.extent.offset != oi.size) {
6009 ctx->delta_stats.num_bytes -= oi.size;
6010 ctx->delta_stats.num_bytes += op.extent.offset;
6011 oi.size = op.extent.offset;
6012 }
6013 ctx->delta_stats.num_wr++;
6014 // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
6015
6016 oi.clear_data_digest();
6017 }
6018 break;
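/*
 * Worked example of the truncate_seq ordering (sketch): a client sends
 * WRITE(truncate_seq=1) and then TRIMTRUNC(seq=2). If the truncate is
 * applied first, the stale write later arrives with seq 1 < current 2 and
 * is clipped in the CEPH_OSD_OP_WRITE handler; if the write lands first,
 * the truncate with the larger seq wins here. Both orders converge on the
 * same final object.
 */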
6019
6020 case CEPH_OSD_OP_DELETE:
6021 ++ctx->num_write;
6022 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6023 {
6024 result = _delete_oid(ctx, false, ctx->ignore_cache);
6025 }
6026 break;
6027
6028 case CEPH_OSD_OP_WATCH:
6029 ++ctx->num_write;
6030 {
6031 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6032 op.watch.cookie, op.watch.op);
6033 if (!obs.exists) {
6034 result = -ENOENT;
6035 break;
6036 }
6037 uint64_t cookie = op.watch.cookie;
6038 entity_name_t entity = ctx->reqid.name;
6039 ObjectContextRef obc = ctx->obc;
6040
6041 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6042 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6043 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6044 dout(10) << "watch: oi.user_version=" << oi.user_version << dendl;
6045 dout(10) << "watch: peer_addr="
6046 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6047
6048 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6049 if (op.watch.timeout != 0) {
6050 timeout = op.watch.timeout;
6051 }
6052
6053 watch_info_t w(cookie, timeout,
6054 ctx->op->get_req()->get_connection()->get_peer_addr());
6055 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6056 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6057 if (oi.watchers.count(make_pair(cookie, entity))) {
6058 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6059 } else {
6060 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6061 oi.watchers[make_pair(cookie, entity)] = w;
6062 t->nop(soid); // make sure we update the object_info on disk!
6063 }
6064 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6065 ctx->watch_connects.push_back(make_pair(w, will_ping));
6066 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6067 if (!oi.watchers.count(make_pair(cookie, entity))) {
6068 result = -ENOTCONN;
6069 break;
6070 }
6071 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6072 ctx->watch_connects.push_back(make_pair(w, true));
6073 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6074 /* Note: WATCH with PING doesn't cause may_write() to return true,
6075 * so if there is nothing else in the transaction, this is going
6076 * to run do_osd_op_effects, but not write out a log entry */
6077 if (!oi.watchers.count(make_pair(cookie, entity))) {
6078 result = -ENOTCONN;
6079 break;
6080 }
6081 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6082 obc->watchers.find(make_pair(cookie, entity));
6083 if (p == obc->watchers.end() ||
6084 !p->second->is_connected()) {
6085 // client needs to reconnect
6086 result = -ETIMEDOUT;
6087 break;
6088 }
6089 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6090 p->second->got_ping(ceph_clock_now());
6091 result = 0;
6092 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6093 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6094 oi.watchers.find(make_pair(cookie, entity));
6095 if (oi_iter != oi.watchers.end()) {
6096 dout(10) << " removed watch " << oi_iter->second << " by "
6097 << entity << dendl;
6098 oi.watchers.erase(oi_iter);
6099 t->nop(soid); // update oi on disk
6100 ctx->watch_disconnects.push_back(
6101 watch_disconnect_t(cookie, entity, false));
6102 } else {
6103 dout(10) << " can't remove: no watch by " << entity << dendl;
6104 }
6105 }
6106 }
6107 break;
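/*
 * Illustrative sketch (assuming the librados C++ watch API): the
 * WATCH_OP_WATCH branch above persists the watch in oi.watchers, and the
 * client side looks roughly like
 *
 *   uint64_t cookie;
 *   int r = ioctx.watch2(oid, &cookie, &watch_ctx);  // registers watch
 *   ...                                              // librados pings
 *   ioctx.unwatch2(cookie);                          // WATCH_OP_UNWATCH
 *
 * A WATCH_OP_PING that finds no connected Watch returns -ETIMEDOUT,
 * telling the client to re-register via WATCH_OP_RECONNECT.
 */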
6108
6109 case CEPH_OSD_OP_CACHE_PIN:
6110 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6111 if ((!pool.info.is_tier() ||
6112 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6113 result = -EINVAL;
6114 dout(10) << " pinning an object is only allowed on the cache tier " << dendl;
6115 break;
6116 }
6117 ++ctx->num_write;
6118 {
6119 if (!obs.exists || oi.is_whiteout()) {
6120 result = -ENOENT;
6121 break;
6122 }
6123
6124 if (!oi.is_cache_pinned()) {
6125 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6126 ctx->modify = true;
6127 ctx->delta_stats.num_objects_pinned++;
6128 ctx->delta_stats.num_wr++;
6129 }
6130 result = 0;
6131 }
6132 break;
6133
6134 case CEPH_OSD_OP_CACHE_UNPIN:
6135 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6136 if ((!pool.info.is_tier() ||
6137 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6138 result = -EINVAL;
6139 dout(10) << " unpinning an object is only allowed on the cache tier " << dendl;
6140 break;
6141 }
6142 ++ctx->num_write;
6143 {
6144 if (!obs.exists || oi.is_whiteout()) {
6145 result = -ENOENT;
6146 break;
6147 }
6148
6149 if (oi.is_cache_pinned()) {
6150 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6151 ctx->modify = true;
6152 ctx->delta_stats.num_objects_pinned--;
6153 ctx->delta_stats.num_wr++;
6154 }
6155 result = 0;
6156 }
6157 break;
6158
6159 case CEPH_OSD_OP_SET_REDIRECT:
6160 ++ctx->num_write;
6161 {
6162 if (pool.info.is_tier()) {
6163 result = -EINVAL;
6164 break;
6165 }
6166 if (!obs.exists) {
6167 result = -ENOENT;
6168 break;
6169 }
6170 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6171 result = -EOPNOTSUPP;
6172 break;
6173 }
6174
6175 object_t target_name;
6176 object_locator_t target_oloc;
6177 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6178 version_t target_version = op.copy_from.src_version;
6179 try {
6180 ::decode(target_name, bp);
6181 ::decode(target_oloc, bp);
6182 }
6183 catch (buffer::error& e) {
6184 result = -EINVAL;
6185 goto fail;
6186 }
6187 pg_t raw_pg;
6188 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6189 hobject_t target(target_name, target_oloc.key, target_snapid,
6190 raw_pg.ps(), raw_pg.pool(),
6191 target_oloc.nspace);
6192 if (target == soid) {
6193 dout(20) << " set-redirect self is invalid" << dendl;
6194 result = -EINVAL;
6195 break;
6196 }
6197 oi.set_flag(object_info_t::FLAG_MANIFEST);
6198 oi.manifest.redirect_target = target;
6199 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6200 t->truncate(soid, 0);
6201 if (oi.is_omap() && pool.info.supports_omap()) {
6202 t->omap_clear(soid);
6203 obs.oi.clear_omap_digest();
6204 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6205 }
6206 ctx->delta_stats.num_bytes -= oi.size;
6207 oi.size = 0;
6208 oi.new_object();
6209 oi.user_version = target_version;
6210 ctx->user_at_version = target_version;
6211 /* rm_attrs */
6212 map<string,bufferlist> rmattrs;
6213 result = getattrs_maybe_cache(ctx->obc,
6214 &rmattrs);
6215 if (result < 0) {
6216 return result;
6217 }
6218 map<string, bufferlist>::iterator iter;
6219 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6220 const string& name = iter->first;
6221 t->rmattr(soid, name);
6222 }
6223 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6224 }
6225
6226 break;
6227
6228 // -- object attrs --
6229
6230 case CEPH_OSD_OP_SETXATTR:
6231 ++ctx->num_write;
6232 {
6233 if (cct->_conf->osd_max_attr_size > 0 &&
6234 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6235 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6236 result = -EFBIG;
6237 break;
6238 }
6239 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6240 cct->_conf->osd_max_attr_name_len);
6241 if (op.xattr.name_len > max_name_len) {
6242 result = -ENAMETOOLONG;
6243 break;
6244 }
6245 maybe_create_new_object(ctx);
6246 string aname;
6247 bp.copy(op.xattr.name_len, aname);
6248 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6249 string name = "_" + aname;
6250 bufferlist bl;
6251 bp.copy(op.xattr.value_len, bl);
6252 t->setattr(soid, name, bl);
6253 ctx->delta_stats.num_wr++;
6254 }
6255 break;
6256
6257 case CEPH_OSD_OP_RMXATTR:
6258 ++ctx->num_write;
6259 {
6260 string aname;
6261 bp.copy(op.xattr.name_len, aname);
6262 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6263 if (!obs.exists || oi.is_whiteout()) {
6264 result = -ENOENT;
6265 break;
6266 }
6267 string name = "_" + aname;
6268 t->rmattr(soid, name);
6269 ctx->delta_stats.num_wr++;
6270 }
6271 break;
6272
6273
6274 // -- fancy writers --
6275 case CEPH_OSD_OP_APPEND:
6276 {
6277 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6278 // just do it inline; this works because we are happy to execute
6279 // fancy ops on replicas as well.
6280 vector<OSDOp> nops(1);
6281 OSDOp& newop = nops[0];
6282 newop.op.op = CEPH_OSD_OP_WRITE;
6283 newop.op.extent.offset = oi.size;
6284 newop.op.extent.length = op.extent.length;
6285 newop.op.extent.truncate_seq = oi.truncate_seq;
6286 newop.indata = osd_op.indata;
6287 result = do_osd_ops(ctx, nops);
6288 osd_op.outdata.claim(newop.outdata);
6289 }
6290 break;
6291
6292 case CEPH_OSD_OP_STARTSYNC:
6293 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6294 t->nop(soid);
6295 break;
6296
6297
6298 // -- trivial map --
6299 case CEPH_OSD_OP_TMAPGET:
6300 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6301 if (pool.info.require_rollback()) {
6302 result = -EOPNOTSUPP;
6303 break;
6304 }
6305 {
6306 vector<OSDOp> nops(1);
6307 OSDOp& newop = nops[0];
6308 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6309 newop.op.extent.offset = 0;
6310 newop.op.extent.length = 0;
6311 do_osd_ops(ctx, nops);
6312 osd_op.outdata.claim(newop.outdata);
6313 }
6314 break;
6315
6316 case CEPH_OSD_OP_TMAPPUT:
6317 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6318 if (pool.info.require_rollback()) {
6319 result = -EOPNOTSUPP;
6320 break;
6321 }
6322 {
6327 // verify sort order
6328 bool unsorted = false;
6329 if (true) {
6330 bufferlist header;
6331 ::decode(header, bp);
6332 uint32_t n;
6333 ::decode(n, bp);
6334 string last_key;
6335 while (n--) {
6336 string key;
6337 ::decode(key, bp);
6338 dout(10) << "tmapput key " << key << dendl;
6339 bufferlist val;
6340 ::decode(val, bp);
6341 if (key < last_key) {
6342 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6343 unsorted = true;
6344 break;
6345 }
6346 last_key = key;
6347 }
6348 }
6349
6350 // write it
6351 vector<OSDOp> nops(1);
6352 OSDOp& newop = nops[0];
6353 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6354 newop.op.extent.offset = 0;
6355 newop.op.extent.length = osd_op.indata.length();
6356 newop.indata = osd_op.indata;
6357
6358 if (unsorted) {
6359 bp = osd_op.indata.begin();
6360 bufferlist header;
6361 map<string, bufferlist> m;
6362 ::decode(header, bp);
6363 ::decode(m, bp);
6364 assert(bp.end());
6365 bufferlist newbl;
6366 ::encode(header, newbl);
6367 ::encode(m, newbl);
6368 newop.indata = newbl;
6369 }
6370 result = do_osd_ops(ctx, nops);
6371 assert(result == 0);
6372 }
6373 break;
6374
6375 case CEPH_OSD_OP_TMAPUP:
6376 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6377 if (pool.info.require_rollback()) {
6378 result = -EOPNOTSUPP;
6379 break;
6380 }
6381 ++ctx->num_write;
6382 result = do_tmapup(ctx, bp, osd_op);
6383 break;
6384
6385 case CEPH_OSD_OP_TMAP2OMAP:
6386 ++ctx->num_write;
6387 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6388 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6389 break;
6390
6391 // OMAP Read ops
6392 case CEPH_OSD_OP_OMAPGETKEYS:
6393 ++ctx->num_read;
6394 {
6395 string start_after;
6396 uint64_t max_return;
6397 try {
6398 ::decode(start_after, bp);
6399 ::decode(max_return, bp);
6400 }
6401 catch (buffer::error& e) {
6402 result = -EINVAL;
6403 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6404 goto fail;
6405 }
6406 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6407 max_return = cct->_conf->osd_max_omap_entries_per_request;
6408 }
6409 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6410
6411 bufferlist bl;
6412 uint32_t num = 0;
6413 bool truncated = false;
6414 if (oi.is_omap()) {
6415 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6416 coll, ghobject_t(soid)
6417 );
6418 assert(iter);
6419 iter->upper_bound(start_after);
6420 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6421 if (num >= max_return ||
6422 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6423 truncated = true;
6424 break;
6425 }
6426 ::encode(iter->key(), bl);
6427 }
6428 } // else return empty out_set
6429 ::encode(num, osd_op.outdata);
6430 osd_op.outdata.claim_append(bl);
6431 ::encode(truncated, osd_op.outdata);
6432 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6433 ctx->delta_stats.num_rd++;
6434 }
6435 break;
6436
6437 case CEPH_OSD_OP_OMAPGETVALS:
6438 ++ctx->num_read;
6439 {
6440 string start_after;
6441 uint64_t max_return;
6442 string filter_prefix;
6443 try {
6444 ::decode(start_after, bp);
6445 ::decode(max_return, bp);
6446 ::decode(filter_prefix, bp);
6447 }
6448 catch (buffer::error& e) {
6449 result = -EINVAL;
6450 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6451 goto fail;
6452 }
6453 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6454 max_return = cct->_conf->osd_max_omap_entries_per_request;
6455 }
6456 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6457
6458 uint32_t num = 0;
6459 bool truncated = false;
6460 bufferlist bl;
6461 if (oi.is_omap()) {
6462 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6463 coll, ghobject_t(soid)
6464 );
6465 if (!iter) {
6466 result = -ENOENT;
6467 goto fail;
6468 }
6469 iter->upper_bound(start_after);
6470 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6471 for (num = 0;
6472 iter->valid() &&
6473 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6474 ++num, iter->next(false)) {
6475 dout(20) << "Found key " << iter->key() << dendl;
6476 if (num >= max_return ||
6477 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6478 truncated = true;
6479 break;
6480 }
6481 ::encode(iter->key(), bl);
6482 ::encode(iter->value(), bl);
6483 }
6484 } // else return empty out_set
6485 ::encode(num, osd_op.outdata);
6486 osd_op.outdata.claim_append(bl);
6487 ::encode(truncated, osd_op.outdata);
6488 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6489 ctx->delta_stats.num_rd++;
6490 }
6491 break;
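/*
 * Illustrative pagination sketch (assuming librados C++): the truncated
 * flag encoded above is what clients loop on, roughly
 *
 *   std::string after;
 *   bool more = true;
 *   while (more) {
 *     std::map<std::string, bufferlist> vals;
 *     librados::ObjectReadOperation rd;
 *     rd.omap_get_vals2(after, 1024, &vals, &more, nullptr);
 *     ioctx.operate(oid, &rd, nullptr);
 *     if (!vals.empty())
 *       after = vals.rbegin()->first;
 *   }
 */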
6492
6493 case CEPH_OSD_OP_OMAPGETHEADER:
6494 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6495 if (!oi.is_omap()) {
6496 // return empty header
6497 break;
6498 }
6499 ++ctx->num_read;
6500 {
6501 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6502 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6503 ctx->delta_stats.num_rd++;
6504 }
6505 break;
6506
6507 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6508 ++ctx->num_read;
6509 {
6510 set<string> keys_to_get;
6511 try {
6512 ::decode(keys_to_get, bp);
6513 }
6514 catch (buffer::error& e) {
6515 result = -EINVAL;
6516 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6517 goto fail;
6518 }
6519 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6520 map<string, bufferlist> out;
6521 if (oi.is_omap()) {
6522 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6523 } // else return empty omap entries
6524 ::encode(out, osd_op.outdata);
6525 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6526 ctx->delta_stats.num_rd++;
6527 }
6528 break;
6529
6530 case CEPH_OSD_OP_OMAP_CMP:
6531 ++ctx->num_read;
6532 {
6533 if (!obs.exists || oi.is_whiteout()) {
6534 result = -ENOENT;
6535 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6536 break;
6537 }
6538 map<string, pair<bufferlist, int> > assertions;
6539 try {
6540 ::decode(assertions, bp);
6541 }
6542 catch (buffer::error& e) {
6543 result = -EINVAL;
6544 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6545 goto fail;
6546 }
6547 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6548
6549 map<string, bufferlist> out;
6550
6551 if (oi.is_omap()) {
6552 set<string> to_get;
6553 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6554 i != assertions.end();
6555 ++i)
6556 to_get.insert(i->first);
6557 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6558 to_get, &out);
6559 if (r < 0) {
6560 result = r;
6561 break;
6562 }
6563 } // else leave out empty
6564
6565 //Should set num_rd_kb based on encode length of map
6566 ctx->delta_stats.num_rd++;
6567
6568 int r = 0;
6569 bufferlist empty;
6570 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6571 i != assertions.end();
6572 ++i) {
6573 auto out_entry = out.find(i->first);
6574 bufferlist &bl = (out_entry != out.end()) ?
6575 out_entry->second : empty;
6576 switch (i->second.second) {
6577 case CEPH_OSD_CMPXATTR_OP_EQ:
6578 if (!(bl == i->second.first)) {
6579 r = -ECANCELED;
6580 }
6581 break;
6582 case CEPH_OSD_CMPXATTR_OP_LT:
6583 if (!(bl < i->second.first)) {
6584 r = -ECANCELED;
6585 }
6586 break;
6587 case CEPH_OSD_CMPXATTR_OP_GT:
6588 if (!(bl > i->second.first)) {
6589 r = -ECANCELED;
6590 }
6591 break;
6592 default:
6593 r = -EINVAL;
6594 break;
6595 }
6596 if (r < 0)
6597 break;
6598 }
6599 if (r < 0) {
6600 result = r;
6601 }
6602 }
6603 break;
6604
6605 // OMAP Write ops
6606 case CEPH_OSD_OP_OMAPSETVALS:
6607 if (!pool.info.supports_omap()) {
6608 result = -EOPNOTSUPP;
6609 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6610 break;
6611 }
6612 ++ctx->num_write;
6613 {
6614 maybe_create_new_object(ctx);
6615 bufferlist to_set_bl;
6616 try {
6617 decode_str_str_map_to_bl(bp, &to_set_bl);
6618 }
6619 catch (buffer::error& e) {
6620 result = -EINVAL;
6621 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6622 goto fail;
6623 }
6624 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6625 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6626 dout(20) << "setting vals: " << dendl;
6627 map<string,bufferlist> to_set;
6628 bufferlist::iterator pt = to_set_bl.begin();
6629 ::decode(to_set, pt);
6630 for (map<string, bufferlist>::iterator i = to_set.begin();
6631 i != to_set.end();
6632 ++i) {
6633 dout(20) << "\t" << i->first << dendl;
6634 }
6635 }
6636 t->omap_setkeys(soid, to_set_bl);
6637 ctx->delta_stats.num_wr++;
6638 }
6639 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6640 obs.oi.clear_omap_digest();
6641 break;
6642
6643 case CEPH_OSD_OP_OMAPSETHEADER:
6644 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6645 if (!pool.info.supports_omap()) {
6646 result = -EOPNOTSUPP;
6647 break;
6648 }
6649 ++ctx->num_write;
6650 {
6651 maybe_create_new_object(ctx);
6652 t->omap_setheader(soid, osd_op.indata);
6653 ctx->delta_stats.num_wr++;
6654 }
6655 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6656 obs.oi.clear_omap_digest();
6657 break;
6658
6659 case CEPH_OSD_OP_OMAPCLEAR:
6660 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6661 if (!pool.info.supports_omap()) {
6662 result = -EOPNOTSUPP;
6663 break;
6664 }
6665 ++ctx->num_write;
6666 {
6667 if (!obs.exists || oi.is_whiteout()) {
6668 result = -ENOENT;
6669 break;
6670 }
6671 if (oi.is_omap()) {
6672 t->omap_clear(soid);
6673 ctx->delta_stats.num_wr++;
6674 obs.oi.clear_omap_digest();
6675 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6676 }
6677 }
6678 break;
6679
6680 case CEPH_OSD_OP_OMAPRMKEYS:
6681 if (!pool.info.supports_omap()) {
6682 result = -EOPNOTSUPP;
6683 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6684 break;
6685 }
6686 ++ctx->num_write;
6687 {
6688 if (!obs.exists || oi.is_whiteout()) {
6689 result = -ENOENT;
6690 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6691 break;
6692 }
6693 bufferlist to_rm_bl;
6694 try {
6695 decode_str_set_to_bl(bp, &to_rm_bl);
6696 }
6697 catch (buffer::error& e) {
6698 result = -EINVAL;
6699 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6700 goto fail;
6701 }
6702 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6703 t->omap_rmkeys(soid, to_rm_bl);
6704 ctx->delta_stats.num_wr++;
6705 }
6706 obs.oi.clear_omap_digest();
6707 break;
6708
6709 case CEPH_OSD_OP_COPY_GET:
6710 ++ctx->num_read;
6711 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6712 soid.snap.val);
6713 if (op_finisher == nullptr) {
6714 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6715 } else {
6716 result = op_finisher->execute();
6717 }
6718 break;
6719
6720 case CEPH_OSD_OP_COPY_FROM:
6721 ++ctx->num_write;
6722 {
6723 object_t src_name;
6724 object_locator_t src_oloc;
6725 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6726 version_t src_version = op.copy_from.src_version;
6727 try {
6728 ::decode(src_name, bp);
6729 ::decode(src_oloc, bp);
6730 }
6731 catch (buffer::error& e) {
6732 result = -EINVAL;
6733 tracepoint(osd,
6734 do_osd_op_pre_copy_from,
6735 soid.oid.name.c_str(),
6736 soid.snap.val,
6737 "???",
6738 0,
6739 "???",
6740 "???",
6741 0,
6742 src_snapid,
6743 src_version);
6744 goto fail;
6745 }
6746 tracepoint(osd,
6747 do_osd_op_pre_copy_from,
6748 soid.oid.name.c_str(),
6749 soid.snap.val,
6750 src_name.name.c_str(),
6751 src_oloc.pool,
6752 src_oloc.key.c_str(),
6753 src_oloc.nspace.c_str(),
6754 src_oloc.hash,
6755 src_snapid,
6756 src_version);
6757 if (op_finisher == nullptr) {
6758 // start
6759 pg_t raw_pg;
6760 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6761 hobject_t src(src_name, src_oloc.key, src_snapid,
6762 raw_pg.ps(), raw_pg.pool(),
6763 src_oloc.nspace);
6764 if (src == soid) {
6765 dout(20) << " copy from self is invalid" << dendl;
6766 result = -EINVAL;
6767 break;
6768 }
6769 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6770 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6771 new CopyFromFinisher(cb));
6772 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6773 op.copy_from.flags,
6774 false,
6775 op.copy_from.src_fadvise_flags,
6776 op.flags);
6777 result = -EINPROGRESS;
6778 } else {
6779 // finish
6780 result = op_finisher->execute();
6781 assert(result == 0);
6782
6783 // COPY_FROM cannot be executed multiple times -- it must restart
6784 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6785 }
6786 }
6787 break;
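/*
 * Note on the two-phase shape above (sketch): the first pass queues an
 * async copy via start_copy() and returns -EINPROGRESS; once the copy
 * completes, the op vector is re-executed and this case finds the
 * CopyFromFinisher in ctx->op_finishers, completing synchronously. The
 * finisher is then erased because a restarted COPY_FROM must begin a
 * brand-new copy.
 */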
6788
6789 default:
6790 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6791 dout(1) << "unrecognized osd op " << op.op
6792 << " " << ceph_osd_op_name(op.op)
6793 << dendl;
6794 result = -EOPNOTSUPP;
6795 }
6796
6797 fail:
6798 osd_op.rval = result;
6799 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6800 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6801 result = 0;
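/*
 * Sketch of the FAILOK contract: the op's negative rval was already
 * recorded above, but result is zeroed so the remaining ops in the
 * compound request keep executing instead of aborting at the break below.
 */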
6802
6803 if (result < 0)
6804 break;
6805 }
6806 return result;
6807 }
6808
6809 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6810 {
6811 if (ctx->new_obs.oi.size == 0) {
6812 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6813 return -ENODATA;
6814 }
6815 vector<OSDOp> nops(1);
6816 OSDOp &newop = nops[0];
6817 newop.op.op = CEPH_OSD_OP_TMAPGET;
6818 do_osd_ops(ctx, nops);
6819 try {
6820 bufferlist::iterator i = newop.outdata.begin();
6821 ::decode(*header, i);
6822 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6823 } catch (...) {
6824 dout(20) << "failed to decode tmap for " << ctx->new_obs.oi.soid
6825 << dendl;
6826 return -EINVAL;
6827 }
6828 dout(20) << "successfully decoded tmap for " << ctx->new_obs.oi.soid
6829 << dendl;
6830 return 0;
6831 }
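/*
 * Sketch of the tmap format split apart above: the object data is an
 * encoded header bufferlist followed by string->bufferlist entries, so a
 * standalone decode looks roughly like
 *
 *   bufferlist::iterator p = data.begin();
 *   bufferlist header;
 *   map<string, bufferlist> kv;
 *   ::decode(header, p);
 *   ::decode(kv, p);
 *
 * (the same shape the TMAPPUT re-sort path relies on).
 */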
6832
6833 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6834 const SnapSet& ss)
6835 {
6836 // verify that all clones have been evicted
6837 dout(20) << __func__ << " verifying clones are absent "
6838 << ss << dendl;
6839 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6840 p != ss.clones.end();
6841 ++p) {
6842 hobject_t clone_oid = soid;
6843 clone_oid.snap = *p;
6844 if (is_missing_object(clone_oid))
6845 return -EBUSY;
6846 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6847 if (clone_obc && clone_obc->obs.exists) {
6848 dout(10) << __func__ << " cannot evict head before clone "
6849 << clone_oid << dendl;
6850 return -EBUSY;
6851 }
6852 if (copy_ops.count(clone_oid)) {
6853 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6854 << clone_oid << dendl;
6855 return -EBUSY;
6856 }
6857 }
6858 return 0;
6859 }
6860
6861 inline int PrimaryLogPG::_delete_oid(
6862 OpContext *ctx,
6863 bool no_whiteout, // no whiteouts, no matter what.
6864 bool try_no_whiteout) // try not to whiteout
6865 {
6866 SnapSet& snapset = ctx->new_snapset;
6867 ObjectState& obs = ctx->new_obs;
6868 object_info_t& oi = obs.oi;
6869 const hobject_t& soid = oi.soid;
6870 PGTransaction* t = ctx->op_t.get();
6871
6872 // cache: set whiteout on delete?
6873 bool whiteout = false;
6874 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6875 && !no_whiteout
6876 && !try_no_whiteout) {
6877 whiteout = true;
6878 }
6879 bool legacy;
6880 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6881 legacy = false;
6882 // in luminous or later, we can't delete the head if there are
6883 // clones. we trust the caller passing no_whiteout has already
6884 // verified they don't exist.
6885 if (!snapset.clones.empty() ||
6886 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6887 if (no_whiteout) {
6888 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6889 << dendl;
6890 } else {
6891 dout(20) << __func__ << " has or will have clones; will whiteout"
6892 << dendl;
6893 whiteout = true;
6894 }
6895 }
6896 } else {
6897 legacy = true;
6898 }
6899 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6900 << " no_whiteout=" << (int)no_whiteout
6901 << " try_no_whiteout=" << (int)try_no_whiteout
6902 << dendl;
6903 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6904 return -ENOENT;
6905
6906 t->remove(soid);
6907
6908 if (oi.size > 0) {
6909 interval_set<uint64_t> ch;
6910 ch.insert(0, oi.size);
6911 ctx->modified_ranges.union_of(ch);
6912 }
6913
6914 ctx->delta_stats.num_wr++;
6915 if (soid.is_snap()) {
6916 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6917 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6918 } else {
6919 ctx->delta_stats.num_bytes -= oi.size;
6920 }
6921 oi.size = 0;
6922 oi.new_object();
6923
6924 // disconnect all watchers
6925 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6926 oi.watchers.begin();
6927 p != oi.watchers.end();
6928 ++p) {
6929 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6930 ctx->watch_disconnects.push_back(
6931 watch_disconnect_t(p->first.first, p->first.second, true));
6932 }
6933 oi.watchers.clear();
6934
6935 if (whiteout) {
6936 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6937 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6938 ctx->delta_stats.num_whiteouts++;
6939 t->create(soid);
6940 osd->logger->inc(l_osd_tier_whiteout);
6941 return 0;
6942 }
6943
6944 // delete the head
6945 ctx->delta_stats.num_objects--;
6946 if (soid.is_snap())
6947 ctx->delta_stats.num_object_clones--;
6948 if (oi.is_whiteout()) {
6949 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6950 ctx->delta_stats.num_whiteouts--;
6951 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6952 }
6953 if (oi.is_cache_pinned()) {
6954 ctx->delta_stats.num_objects_pinned--;
6955 }
6956 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6957 snapset.head_exists = false;
6958 }
6959 obs.exists = false;
6960 return 0;
6961 }
6962
6963 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6964 {
6965 SnapSet& snapset = ctx->new_snapset;
6966 ObjectState& obs = ctx->new_obs;
6967 object_info_t& oi = obs.oi;
6968 const hobject_t& soid = oi.soid;
6969 PGTransaction* t = ctx->op_t.get();
6970 snapid_t snapid = (uint64_t)op.snap.snapid;
6971 hobject_t missing_oid;
6972
6973 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6974
6975 ObjectContextRef rollback_to;
6976 int ret = find_object_context(
6977 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6978 soid.get_namespace()),
6979 &rollback_to, false, false, &missing_oid);
6980 if (ret == -EAGAIN) {
6981 /* clone must be missing */
6982 assert(is_degraded_or_backfilling_object(missing_oid));
6983 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
6984 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6985 block_write_on_degraded_snap(missing_oid, ctx->op);
6986 return ret;
6987 }
6988 {
6989 ObjectContextRef promote_obc;
6990 cache_result_t tier_mode_result;
6991 if (obs.exists && obs.oi.has_manifest()) {
6992 tier_mode_result =
6993 maybe_handle_manifest_detail(
6994 ctx->op,
6995 true,
6996 rollback_to);
6997 } else {
6998 tier_mode_result =
6999 maybe_handle_cache_detail(
7000 ctx->op,
7001 true,
7002 rollback_to,
7003 ret,
7004 missing_oid,
7005 true,
7006 false,
7007 &promote_obc);
7008 }
7009 switch (tier_mode_result) {
7010 case cache_result_t::NOOP:
7011 break;
7012 case cache_result_t::BLOCKED_PROMOTE:
7013 assert(promote_obc);
7014 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7015 return -EAGAIN;
7016 case cache_result_t::BLOCKED_FULL:
7017 block_write_on_full_cache(soid, ctx->op);
7018 return -EAGAIN;
7019 case cache_result_t::REPLIED_WITH_EAGAIN:
7020 assert(0 == "this can't happen, no rollback on replica");
7021 default:
7022 assert(0 == "must promote was set, other values are not valid");
7023 return -EAGAIN;
7024 }
7025 }
7026
7027 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7028 // there's no snapshot here, or there's no object.
7029 // if there's no snapshot, we delete the object; otherwise, do nothing.
7030 dout(20) << "_rollback_to deleting head on " << soid.oid
7031 << " because got ENOENT|whiteout on find_object_context" << dendl;
7032 if (ctx->obc->obs.oi.watchers.size()) {
7033 // Cannot delete an object with watchers
7034 ret = -EBUSY;
7035 } else {
7036 _delete_oid(ctx, false, false);
7037 ret = 0;
7038 }
7039 } else if (ret) {
7040 // ummm....huh? It *can't* return anything else at time of writing.
7041 assert(0 == "unexpected error code in _rollback_to");
7042 } else { //we got our context, let's use it to do the rollback!
7043 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7044 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7045 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7046 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7047 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7048 ret = -EAGAIN;
7049 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7050 // rolling back to the head; we just need to clone it.
7051 ctx->modify = true;
7052 } else {
7053 /* 1) Delete current head
7054 * 2) Clone correct snapshot into head
7055 * 3) Calculate clone_overlaps by following overlaps
7056 * forward from rollback snapshot */
7057 dout(10) << "_rollback_to deleting " << soid.oid
7058 << " and rolling back to old snap" << dendl;
7059
7060 if (obs.exists) {
7061 t->remove(soid);
7062 }
7063 t->clone(soid, rollback_to_sobject);
7064 snapset.head_exists = true;
7065 t->add_obc(rollback_to);
7066
7067 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7068 snapset.clone_overlap.lower_bound(snapid);
7069 assert(iter != snapset.clone_overlap.end());
7070 interval_set<uint64_t> overlaps = iter->second;
7071 for ( ;
7072 iter != snapset.clone_overlap.end();
7073 ++iter)
7074 overlaps.intersection_of(iter->second);
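/*
 * Worked example (sketch): rolling back to snap 3 with
 * clone_overlap = {3: [0,4096), 5: [0,1024)}, the intersection above
 * leaves [0,1024): only bytes untouched by *every* later clone still
 * match the rollback source, and everything else is folded into
 * ctx->modified_ranges below.
 */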
7075
7076 if (obs.oi.size > 0) {
7077 interval_set<uint64_t> modified;
7078 modified.insert(0, obs.oi.size);
7079 overlaps.intersection_of(modified);
7080 modified.subtract(overlaps);
7081 ctx->modified_ranges.union_of(modified);
7082 }
7083
7084 // Adjust the cached objectcontext
7085 maybe_create_new_object(ctx, true);
7086 ctx->delta_stats.num_bytes -= obs.oi.size;
7087 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7088 obs.oi.size = rollback_to->obs.oi.size;
7089 if (rollback_to->obs.oi.is_data_digest())
7090 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7091 else
7092 obs.oi.clear_data_digest();
7093 if (rollback_to->obs.oi.is_omap_digest())
7094 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7095 else
7096 obs.oi.clear_omap_digest();
7097
7098 if (rollback_to->obs.oi.is_omap()) {
7099 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7100 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7101 } else {
7102 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7103 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7104 }
7105
7106 snapset.head_exists = true;
7107 }
7108 }
7109 return ret;
7110 }
7111
7112 void PrimaryLogPG::_make_clone(
7113 OpContext *ctx,
7114 PGTransaction* t,
7115 ObjectContextRef obc,
7116 const hobject_t& head, const hobject_t& coid,
7117 object_info_t *poi)
7118 {
7119 bufferlist bv;
7120 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7121
7122 t->clone(coid, head);
7123 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7124 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7125 }
7126
7127 void PrimaryLogPG::make_writeable(OpContext *ctx)
7128 {
7129 const hobject_t& soid = ctx->obs->oi.soid;
7130 SnapContext& snapc = ctx->snapc;
7131
7132 // clone?
7133 assert(soid.snap == CEPH_NOSNAP);
7134 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7135 << " snapc=" << snapc << dendl;
7136
7137 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7138 if (ctx->new_obs.exists) {
7139 // we will mark the object dirty
7140 if (ctx->undirty && was_dirty) {
7141 dout(20) << " clearing DIRTY flag" << dendl;
7142 assert(ctx->new_obs.oi.is_dirty());
7143 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7144 --ctx->delta_stats.num_objects_dirty;
7145 osd->logger->inc(l_osd_tier_clean);
7146 } else if (!was_dirty && !ctx->undirty) {
7147 dout(20) << " setting DIRTY flag" << dendl;
7148 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7149 ++ctx->delta_stats.num_objects_dirty;
7150 osd->logger->inc(l_osd_tier_dirty);
7151 }
7152 } else {
7153 if (was_dirty) {
7154 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7155 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7156 --ctx->delta_stats.num_objects_dirty;
7157 }
7158 }
7159
7160 if ((ctx->new_obs.exists &&
7161 ctx->new_obs.oi.is_omap()) &&
7162 (!ctx->obc->obs.exists ||
7163 !ctx->obc->obs.oi.is_omap())) {
7164 ++ctx->delta_stats.num_objects_omap;
7165 }
7166 if ((!ctx->new_obs.exists ||
7167 !ctx->new_obs.oi.is_omap()) &&
7168 (ctx->obc->obs.exists &&
7169 ctx->obc->obs.oi.is_omap())) {
7170 --ctx->delta_stats.num_objects_omap;
7171 }
7172
7173 // use newer snapc?
7174 if (ctx->new_snapset.seq > snapc.seq) {
7175 snapc.seq = ctx->new_snapset.seq;
7176 snapc.snaps = ctx->new_snapset.snaps;
7177 filter_snapc(snapc.snaps);
7178 dout(10) << " using newer snapc " << snapc << dendl;
7179 }
7180
7181 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7182 snapc.snaps.size() && // there are snaps
7183 !ctx->cache_evict &&
7184 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7185 // clone
7186 hobject_t coid = soid;
7187 coid.snap = snapc.seq;
7188
7189 unsigned l;
7190 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7191
7192 vector<snapid_t> snaps(l);
7193 for (unsigned i=0; i<l; i++)
7194 snaps[i] = snapc.snaps[i];
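/*
 * Worked example (sketch): with snapc.snaps = {8,7,5,3} (newest first)
 * and new_snapset.seq == 5, the loop above stops at l == 2, so the new
 * clone records snaps {8,7}; 5 and 3 are already covered by an earlier
 * clone.
 */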
7195
7196 // prepare clone
7197 object_info_t static_snap_oi(coid);
7198 object_info_t *snap_oi;
7199 if (is_primary()) {
7200 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7201 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7202 ctx->clone_obc->obs.oi = static_snap_oi;
7203 ctx->clone_obc->obs.exists = true;
7204 ctx->clone_obc->ssc = ctx->obc->ssc;
7205 ctx->clone_obc->ssc->ref++;
7206 if (pool.info.require_rollback())
7207 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7208 snap_oi = &ctx->clone_obc->obs.oi;
7209 bool got = ctx->lock_manager.get_write_greedy(
7210 coid,
7211 ctx->clone_obc,
7212 ctx->op);
7213 assert(got);
7214 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7215 } else {
7216 snap_oi = &static_snap_oi;
7217 }
7218 snap_oi->version = ctx->at_version;
7219 snap_oi->prior_version = ctx->obs->oi.version;
7220 snap_oi->copy_user_bits(ctx->obs->oi);
7221
7222 bool legacy = ctx->new_snapset.is_legacy() ||
7223 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7224 if (legacy) {
7225 snap_oi->legacy_snaps = snaps;
7226 }
7227
7228 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7229
7230 ctx->delta_stats.num_objects++;
7231 if (snap_oi->is_dirty()) {
7232 ctx->delta_stats.num_objects_dirty++;
7233 osd->logger->inc(l_osd_tier_dirty);
7234 }
7235 if (snap_oi->is_omap())
7236 ctx->delta_stats.num_objects_omap++;
7237 if (snap_oi->is_cache_pinned())
7238 ctx->delta_stats.num_objects_pinned++;
7239 ctx->delta_stats.num_object_clones++;
7240 ctx->new_snapset.clones.push_back(coid.snap);
7241 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7242 if (!legacy) {
7243 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7244 }
7245
7246 // clone_overlap should contain an entry for each clone
7247 // (an empty interval_set if there is no overlap)
7248 ctx->new_snapset.clone_overlap[coid.snap];
7249 if (ctx->obs->oi.size)
7250 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7251
7252 // log clone
7253 dout(10) << " cloning v " << ctx->obs->oi.version
7254 << " to " << coid << " v " << ctx->at_version
7255 << " snaps=" << snaps
7256 << " snapset=" << ctx->new_snapset << dendl;
7257 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7258 ctx->obs->oi.version,
7259 ctx->obs->oi.user_version,
7260 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7261 ::encode(snaps, ctx->log.back().snaps);
7262
7263 ctx->at_version.version++;
7264 }
7265
7266 // update most recent clone_overlap and usage stats
7267 if (ctx->new_snapset.clones.size() > 0) {
7268 /* we need to check whether the most recent clone exists; if it has been
7269 * evicted, it is not included in the stats */
7270 hobject_t last_clone_oid = soid;
7271 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7272 if (is_present_clone(last_clone_oid)) {
7273 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7274 ctx->modified_ranges.intersection_of(newest_overlap);
7275 // modified_ranges is still in use by the clone
7276 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7277 newest_overlap.subtract(ctx->modified_ranges);
7278 }
7279 }
7280
7281 // update snapset with latest snap context
7282 ctx->new_snapset.seq = snapc.seq;
7283 ctx->new_snapset.snaps = snapc.snaps;
7284 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7285 // pessimistic assumption that this is a net-new legacy SnapSet
7286 ctx->delta_stats.num_legacy_snapsets++;
7287 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7288 } else if (ctx->new_snapset.is_legacy()) {
7289 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7290 }
7291 dout(20) << "make_writeable " << soid
7292 << " done, snapset=" << ctx->new_snapset << dendl;
7293 }
7294
7295
7296 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7297 interval_set<uint64_t>& modified, uint64_t offset,
7298 uint64_t length, bool write_full)
7299 {
7300 interval_set<uint64_t> ch;
7301 if (write_full) {
7302 if (oi.size)
7303 ch.insert(0, oi.size);
7304 } else if (length)
7305 ch.insert(offset, length);
7306 modified.union_of(ch);
7307 if (write_full || offset + length > oi.size) {
7308 uint64_t new_size = offset + length;
7309 delta_stats.num_bytes -= oi.size;
7310 delta_stats.num_bytes += new_size;
7311 oi.size = new_size;
7312 }
7313 delta_stats.num_wr++;
7314 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7315 }
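/*
 * Example (sketch): with oi.size == 4096 and a non-full write of length
 * 4096 at offset 6144, `modified` gains [6144, 10240), oi.size becomes
 * 10240, num_bytes moves by -4096 + 10240 = +6144, and num_wr_kb grows by
 * SHIFT_ROUND_UP(4096, 10) == 4.
 */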
7316
7317 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7318 {
7319 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7320 delta_stats.num_bytes += p.get_len();
7321 }
7322 }
7323
7324 void PrimaryLogPG::complete_disconnect_watches(
7325 ObjectContextRef obc,
7326 const list<watch_disconnect_t> &to_disconnect)
7327 {
7328 for (list<watch_disconnect_t>::const_iterator i =
7329 to_disconnect.begin();
7330 i != to_disconnect.end();
7331 ++i) {
7332 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7333 auto watchers_entry = obc->watchers.find(watcher);
7334 if (watchers_entry != obc->watchers.end()) {
7335 WatchRef watch = watchers_entry->second;
7336 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7337 obc->watchers.erase(watcher);
7338 watch->remove(i->send_disconnect);
7339 } else {
7340 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7341 << watcher << dendl;
7342 }
7343 }
7344 }
7345
7346 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7347 {
7348 entity_name_t entity = ctx->reqid.name;
7349 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7350
7351 // disconnects first
7352 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7353
7354 assert(conn);
7355
7356 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7357 if (!session.get())
7358 return;
7359 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7360
7361 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7362 i != ctx->watch_connects.end();
7363 ++i) {
7364 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7365 dout(15) << "do_osd_op_effects applying watch connect on session "
7366 << session.get() << " watcher " << watcher << dendl;
7367 WatchRef watch;
7368 if (ctx->obc->watchers.count(watcher)) {
7369 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7370 << dendl;
7371 watch = ctx->obc->watchers[watcher];
7372 } else {
7373 dout(15) << "do_osd_op_effects new watcher " << watcher
7374 << dendl;
7375 watch = Watch::makeWatchRef(
7376 this, osd, ctx->obc, i->first.timeout_seconds,
7377 i->first.cookie, entity, conn->get_peer_addr());
7378 ctx->obc->watchers.insert(
7379 make_pair(
7380 watcher,
7381 watch));
7382 }
7383 watch->connect(conn, i->second);
7384 }
7385
7386 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7387 p != ctx->notifies.end();
7388 ++p) {
7389 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7390 ConnectionRef conn(ctx->op->get_req()->get_connection());
7391 NotifyRef notif(
7392 Notify::makeNotifyRef(
7393 conn,
7394 ctx->reqid.name.num(),
7395 p->bl,
7396 p->timeout,
7397 p->cookie,
7398 p->notify_id,
7399 ctx->obc->obs.oi.user_version,
7400 osd));
7401 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7402 ctx->obc->watchers.begin();
7403 i != ctx->obc->watchers.end();
7404 ++i) {
7405 dout(10) << "starting notify on watch " << i->first << dendl;
7406 i->second->start_notify(notif);
7407 }
7408 notif->init();
7409 }
7410
7411 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7412 p != ctx->notify_acks.end();
7413 ++p) {
7414 if (p->watch_cookie)
7415 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7416 else
7417 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7418 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7419 ctx->obc->watchers.begin();
7420 i != ctx->obc->watchers.end();
7421 ++i) {
7422 if (i->first.second != entity) continue;
7423 if (p->watch_cookie &&
7424 p->watch_cookie.get() != i->first.first) continue;
7425 dout(10) << "acking notify on watch " << i->first << dendl;
7426 i->second->notify_ack(p->notify_id, p->reply_bl);
7427 }
7428 }
7429 }
7430
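// Generate a temp object name that is unique per PG, role, and OSD
// instance (monc global_id), plus a per-PG sequence number.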
7431 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7432 {
7433 ostringstream ss;
7434 ss << "temp_" << info.pgid << "_" << get_role()
7435 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7436 hobject_t hoid = target.make_temp_hobject(ss.str());
7437 dout(20) << __func__ << " " << hoid << dendl;
7438 return hoid;
7439 }
7440
7441 hobject_t PrimaryLogPG::get_temp_recovery_object(
7442 const hobject_t& target,
7443 eversion_t version)
7444 {
7445 ostringstream ss;
7446 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7447 << "_" << version
7448 << "_" << info.history.same_interval_since
7449 << "_" << target.snap;
7450 // pgid + version + interval + snapid is unique, and short
7451 hobject_t hoid = target.make_temp_hobject(ss.str());
7452 dout(20) << __func__ << " " << hoid << dendl;
7453 return hoid;
7454 }
7455
7456 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7457 {
7458 assert(!ctx->ops->empty());
7459
7460 const hobject_t& soid = ctx->obs->oi.soid;
7461
7462 // valid snap context?
7463 if (!ctx->snapc.is_valid()) {
7464 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7465 return -EINVAL;
7466 }
7467
7468 // prepare the actual mutation
7469 int result = do_osd_ops(ctx, *ctx->ops);
7470 if (result < 0) {
7471 if (ctx->op->may_write() &&
7472 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7473 // need to save the error code in the pg log, to detect dup ops,
7474 // but do nothing else
7475 ctx->update_log_only = true;
7476 }
7477 return result;
7478 }
7479
7480 // read-op? write-op noop? done?
7481 if (ctx->op_t->empty() && !ctx->modify) {
7482 unstable_stats.add(ctx->delta_stats);
7483 if (ctx->op->may_write() &&
7484 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7485 ctx->update_log_only = true;
7486 }
7487 return result;
7488 }
7489
7490 // check for full
7491 if ((ctx->delta_stats.num_bytes > 0 ||
7492 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7493 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7494 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7495 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7496 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7497 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7498 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7499 << dendl;
7500 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7501 // they tried, they failed.
7502 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7503 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7504 } else {
7505 // drop request
7506 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7507 return -EAGAIN;
7508 }
7509 }
7510
7511 // clone, if necessary
7512 if (soid.snap == CEPH_NOSNAP)
7513 make_writeable(ctx);
7514
7515 finish_ctx(ctx,
7516 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7517 pg_log_entry_t::DELETE);
7518
7519 return result;
7520 }
7521
7522 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7523 {
7524 const hobject_t& soid = ctx->obs->oi.soid;
7525 dout(20) << __func__ << " " << soid << " " << ctx
7526 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7527 << dendl;
7528 utime_t now = ceph_clock_now();
7529
7530 // snapset
7531 bufferlist bss;
7532
7533 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7534 ::encode(ctx->new_snapset, bss);
7535 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7536 !ctx->new_snapset.is_legacy());
7537
7538 if (ctx->new_obs.exists) {
7539 if (!ctx->obs->exists) {
7540 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7541 hobject_t snapoid = soid.get_snapdir();
7542 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7543 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7544 ctx->at_version,
7545 ctx->snapset_obc->obs.oi.version,
7546 0, osd_reqid_t(), ctx->mtime, 0));
7547 ctx->op_t->remove(snapoid);
7548
7549 ctx->at_version.version++;
7550
7551 ctx->snapset_obc->obs.exists = false;
7552 }
7553 }
7554 } else if (!ctx->new_snapset.clones.empty() &&
7555 !ctx->cache_evict &&
7556 !ctx->new_snapset.head_exists &&
7557 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7558 // save snapset on _snap
7559 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7560 info.pgid.pool(), soid.get_namespace());
7561 dout(10) << " final snapset " << ctx->new_snapset
7562 << " in " << snapoid << dendl;
7563 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7564 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7565 ctx->at_version,
7566 eversion_t(),
7567 0, osd_reqid_t(), ctx->mtime, 0));
7568
7569 if (!ctx->snapset_obc)
7570 ctx->snapset_obc = get_object_context(snapoid, true);
7571 bool got = false;
7572 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7573 got = ctx->lock_manager.get_write_greedy(
7574 snapoid,
7575 ctx->snapset_obc,
7576 ctx->op);
7577 } else {
7578 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7579 got = ctx->lock_manager.get_lock_type(
7580 ObjectContext::RWState::RWEXCL,
7581 snapoid,
7582 ctx->snapset_obc,
7583 ctx->op);
7584 }
7585 assert(got);
7586 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7587 ctx->snapset_obc->obs.exists = true;
7588 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7589 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7590 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7591 ctx->snapset_obc->obs.oi.local_mtime = now;
7592
7593 map<string, bufferlist> attrs;
7594 bufferlist bv(sizeof(ctx->new_obs.oi));
7595 ::encode(ctx->snapset_obc->obs.oi, bv,
7596 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7597 ctx->op_t->create(snapoid);
7598 attrs[OI_ATTR].claim(bv);
7599 attrs[SS_ATTR].claim(bss);
7600 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7601 ctx->at_version.version++;
7602 }
7603 }
7604
7605 // finish and log the op.
7606 if (ctx->user_modify) {
7607 // update the user_version for any modify ops, except for the watch op
7608 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7609 /* In order for new clients and old clients to interoperate properly
7610 * when exchanging versions, we need to lower bound the user_version
7611 * (which our new clients pay proper attention to)
7612 * by the at_version (which is all the old clients can ever see). */
7613 if (ctx->at_version.version > ctx->user_at_version)
7614 ctx->user_at_version = ctx->at_version.version;
7615 ctx->new_obs.oi.user_version = ctx->user_at_version;
7616 }
7617 ctx->bytes_written = ctx->op_t->get_bytes_written();
7618
7619 if (ctx->new_obs.exists) {
7620 // on the head object
7621 ctx->new_obs.oi.version = ctx->at_version;
7622 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7623 ctx->new_obs.oi.last_reqid = ctx->reqid;
7624 if (ctx->mtime != utime_t()) {
7625 ctx->new_obs.oi.mtime = ctx->mtime;
7626 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7627 ctx->new_obs.oi.local_mtime = now;
7628 } else {
7629 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7630 }
7631
7632 map <string, bufferlist> attrs;
7633 bufferlist bv(sizeof(ctx->new_obs.oi));
7634 ::encode(ctx->new_obs.oi, bv,
7635 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7636 attrs[OI_ATTR].claim(bv);
7637
7638 if (soid.snap == CEPH_NOSNAP) {
7639 dout(10) << " final snapset " << ctx->new_snapset
7640 << " in " << soid << dendl;
7641 attrs[SS_ATTR].claim(bss);
7642 } else {
7643 dout(10) << " no snapset (this is a clone)" << dendl;
7644 }
7645 ctx->op_t->setattrs(soid, attrs);
7646 } else {
7647 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7648 }
7649
7650 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7651 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7652
7653 // append to log
7654 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7655 ctx->obs->oi.version,
7656 ctx->user_at_version, ctx->reqid,
7657 ctx->mtime, 0));
7658 if (soid.snap < CEPH_NOSNAP) {
7659 switch (log_op_type) {
7660 case pg_log_entry_t::MODIFY:
7661 case pg_log_entry_t::PROMOTE:
7662 case pg_log_entry_t::CLEAN:
7663 if (legacy_snapset) {
7664 dout(20) << __func__ << " encoding legacy_snaps "
7665 << ctx->new_obs.oi.legacy_snaps
7666 << dendl;
7667 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7668 } else {
7669 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7670 << dendl;
7671 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7672 }
7673 break;
7674 default:
7675 break;
7676 }
7677 }
7678
7679 if (!ctx->extra_reqids.empty()) {
7680 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7681 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7682 }
7683
7684 // apply new object state.
7685 ctx->obc->obs = ctx->new_obs;
7686
7687 if (soid.is_head() && !ctx->obc->obs.exists &&
7688 (!maintain_ssc || ctx->cache_evict)) {
7689 ctx->obc->ssc->exists = false;
7690 ctx->obc->ssc->snapset = SnapSet();
7691 } else {
7692 ctx->obc->ssc->exists = true;
7693 ctx->obc->ssc->snapset = ctx->new_snapset;
7694 }
7695 }
7696
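// Fold delta_stats into our own PG stats and into each backfill
// target's stats if the target already has this object
// (soid <= last_backfill); updates for objects still being
// backfilled are parked in pending_backfill_updates.  While a scrub
// is active, writes below the chunk being scrubbed also go into
// scrub_cstat.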
7697 void PrimaryLogPG::apply_stats(
7698 const hobject_t &soid,
7699 const object_stat_sum_t &delta_stats) {
7700
7701 info.stats.stats.add(delta_stats);
7702
7703 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7704 i != backfill_targets.end();
7705 ++i) {
7706 pg_shard_t bt = *i;
7707 pg_info_t& pinfo = peer_info[bt];
7708 if (soid <= pinfo.last_backfill)
7709 pinfo.stats.stats.add(delta_stats);
7710 else if (soid <= last_backfill_started)
7711 pending_backfill_updates[soid].stats.add(delta_stats);
7712 }
7713
7714 if (is_primary() && scrubber.active) {
7715 if (soid < scrubber.start) {
7716 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7717 << "," << scrubber.end << ")" << dendl;
7718 scrub_cstat.add(delta_stats);
7719 } else {
7720 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7721 << "," << scrubber.end << ")" << dendl;
7722 }
7723 }
7724 }
7725
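// Finish a read-only op: fold per-op rvals into the overall result
// (FAILOK ops don't fail the request), attach the out data and reply
// versions, and send the reply to the client.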
7726 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7727 {
7728 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7729 assert(ctx->async_reads_complete());
7730
7731 for (vector<OSDOp>::iterator p = ctx->ops->begin();
7732 p != ctx->ops->end() && result >= 0; ++p) {
7733 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7734 result = p->rval;
7735 break;
7736 }
7737 ctx->bytes_read += p->outdata.length();
7738 }
7739 ctx->reply->claim_op_out_data(*ctx->ops);
7740 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7741
7742 MOSDOpReply *reply = ctx->reply;
7743 ctx->reply = nullptr;
7744
7745 if (result >= 0) {
7746 if (!ctx->ignore_log_op_stats) {
7747 log_op_stats(ctx);
7748 publish_stats_to_osd();
7749 }
7750
7751 // on read, return the current object version
7752 if (ctx->obs) {
7753 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7754 } else {
7755 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7756 }
7757 } else if (result == -ENOENT) {
7758 // on ENOENT, set a floor for what the next user version will be.
7759 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7760 }
7761
7762 reply->set_result(result);
7763 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7764 osd->send_message_osd_client(reply, m->get_connection());
7765 close_op_ctx(ctx);
7766 }
7767
7768 // ========================================================================
7769 // copyfrom
7770
7771 struct C_Copyfrom : public Context {
7772 PrimaryLogPGRef pg;
7773 hobject_t oid;
7774 epoch_t last_peering_reset;
7775 ceph_tid_t tid;
7776 PrimaryLogPG::CopyOpRef cop;
7777 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7778 const PrimaryLogPG::CopyOpRef& c)
7779 : pg(p), oid(o), last_peering_reset(lpr),
7780 tid(0), cop(c)
7781 {}
7782 void finish(int r) override {
7783 if (r == -ECANCELED)
7784 return;
7785 pg->lock();
7786 if (last_peering_reset == pg->get_last_peering_reset()) {
7787 pg->process_copy_chunk(oid, tid, r);
7788 }
7789 pg->unlock();
7790 }
7791 };
7792
7793 struct C_CopyFrom_AsyncReadCb : public Context {
7794 OSDOp *osd_op;
7795 object_copy_data_t reply_obj;
7796 uint64_t features;
7797 size_t len;
7798 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7799 osd_op(osd_op), features(features), len(0) {}
7800 void finish(int r) override {
7801 osd_op->rval = r;
7802 if (r < 0) {
7803 return;
7804 }
7805
7806 assert(len > 0);
7807 assert(len <= reply_obj.data.length());
7808 bufferlist bl;
7809 bl.substr_of(reply_obj.data, 0, len);
7810 reply_obj.data.swap(bl);
7811 ::encode(reply_obj, osd_op->outdata, features);
7812 }
7813 };
7814
7815 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7816 OSDOp& osd_op, ObjectContextRef &obc)
7817 {
7818 object_info_t& oi = obc->obs.oi;
7819 hobject_t& soid = oi.soid;
7820 int result = 0;
7821 object_copy_cursor_t cursor;
7822 uint64_t out_max;
7823 try {
7824 ::decode(cursor, bp);
7825 ::decode(out_max, bp);
7826 }
7827 catch (buffer::error& e) {
7828 result = -EINVAL;
7829 return result;
7830 }
7831
7832 const MOSDOp *op = static_cast<const MOSDOp*>(ctx->op->get_req());
7833 uint64_t features = op->get_features();
7834
7835 bool async_read_started = false;
7836 object_copy_data_t _reply_obj;
7837 C_CopyFrom_AsyncReadCb *cb = NULL;
7838 if (pool.info.require_rollback()) {
7839 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7840 }
7841 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7842 // size, mtime
7843 reply_obj.size = oi.size;
7844 reply_obj.mtime = oi.mtime;
7845 assert(obc->ssc);
7846 if (soid.snap < CEPH_NOSNAP) {
7847 if (obc->ssc->snapset.is_legacy()) {
7848 reply_obj.snaps = oi.legacy_snaps;
7849 } else {
7850 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7851 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7852 reply_obj.snaps = p->second;
7853 }
7854 } else {
7855 reply_obj.snap_seq = obc->ssc->snapset.seq;
7856 }
7857 if (oi.is_data_digest()) {
7858 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7859 reply_obj.data_digest = oi.data_digest;
7860 }
7861 if (oi.is_omap_digest()) {
7862 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7863 reply_obj.omap_digest = oi.omap_digest;
7864 }
7865 reply_obj.truncate_seq = oi.truncate_seq;
7866 reply_obj.truncate_size = oi.truncate_size;
7867
7868 // attrs
7869 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7870 if (!cursor.attr_complete) {
7871 result = getattrs_maybe_cache(
7872 ctx->obc,
7873 &out_attrs);
7874 if (result < 0) {
7875 if (cb) {
7876 delete cb;
7877 }
7878 return result;
7879 }
7880 cursor.attr_complete = true;
7881 dout(20) << " got attrs" << dendl;
7882 }
7883
7884 int64_t left = out_max - osd_op.outdata.length();
7885
7886 // data
7887 bufferlist& bl = reply_obj.data;
7888 if (left > 0 && !cursor.data_complete) {
7889 if (cursor.data_offset < oi.size) {
7890 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7891 if (cb) {
7892 async_read_started = true;
7893 ctx->pending_async_reads.push_back(
7894 make_pair(
7895 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7896 make_pair(&bl, cb)));
7897 cb->len = max_read;
7898
7899 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7900 new ReadFinisher(osd_op));
7901 result = -EINPROGRESS;
7902
7903 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7904 } else {
7905 result = pgbackend->objects_read_sync(
7906 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7907 if (result < 0)
7908 return result;
7909 }
7910 left -= max_read;
7911 cursor.data_offset += max_read;
7912 }
7913 if (cursor.data_offset == oi.size) {
7914 cursor.data_complete = true;
7915 dout(20) << " got data" << dendl;
7916 }
7917 assert(cursor.data_offset <= oi.size);
7918 }
7919
7920 // omap
7921 uint32_t omap_keys = 0;
7922 if (!pool.info.supports_omap() || !oi.is_omap()) {
7923 cursor.omap_complete = true;
7924 } else {
7925 if (left > 0 && !cursor.omap_complete) {
7926 assert(cursor.data_complete);
7927 if (cursor.omap_offset.empty()) {
7928 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7929 &reply_obj.omap_header);
7930 }
7931 bufferlist omap_data;
7932 ObjectMap::ObjectMapIterator iter =
7933 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7934 assert(iter);
7935 iter->upper_bound(cursor.omap_offset);
7936 for (; iter->valid(); iter->next(false)) {
7937 ++omap_keys;
7938 ::encode(iter->key(), omap_data);
7939 ::encode(iter->value(), omap_data);
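// the encoding prepends a 4-byte length header to each key and
// value, hence the "+ 4"s in the space accounting below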
7940 left -= iter->key().length() + 4 + iter->value().length() + 4;
7941 if (left <= 0)
7942 break;
7943 }
7944 if (omap_keys) {
7945 ::encode(omap_keys, reply_obj.omap_data);
7946 reply_obj.omap_data.claim_append(omap_data);
7947 }
7948 if (iter->valid()) {
7949 cursor.omap_offset = iter->key();
7950 } else {
7951 cursor.omap_complete = true;
7952 dout(20) << " got omap" << dendl;
7953 }
7954 }
7955 }
7956
7957 if (cursor.is_complete()) {
7958 // include reqids only in the final step. this is a bit fragile
7959 // but it works...
7960 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7961 dout(20) << " got reqids" << dendl;
7962 }
7963
7964 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7965 << " " << out_attrs.size() << " attrs"
7966 << " " << bl.length() << " bytes"
7967 << " " << reply_obj.omap_header.length() << " omap header bytes"
7968 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7969 << omap_keys << " keys"
7970 << " " << reply_obj.reqids.size() << " reqids"
7971 << dendl;
7972 reply_obj.cursor = cursor;
7973 if (!async_read_started) {
7974 ::encode(reply_obj, osd_op.outdata, features);
7975 }
7976 if (cb && !async_read_started) {
7977 delete cb;
7978 }
7979
7980 if (result > 0) {
7981 result = 0;
7982 }
7983 return result;
7984 }
7985
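// COPY_GET on a nonexistent object: reply -ENOENT, but still include
// the recent reqids from the pg log so the copying side can detect
// dup ops against a since-deleted object.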
7986 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7987 OSDOp& osd_op)
7988 {
7989 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7990 // be careful not to modify anything else that will upset a racing
7991 // operator<<
7992 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7993 uint64_t features = m->get_features();
7994 object_copy_data_t reply_obj;
7995
7996 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7997 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7998 ::encode(reply_obj, osd_op.outdata, features);
7999 osd_op.rval = -ENOENT;
8000 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
8001 reply->claim_op_out_data(m->ops);
8002 reply->set_result(-ENOENT);
8003 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8004 osd->send_message_osd_client(reply, m->get_connection());
8005 }
8006
8007 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8008 hobject_t src, object_locator_t oloc,
8009 version_t version, unsigned flags,
8010 bool mirror_snapset,
8011 unsigned src_obj_fadvise_flags,
8012 unsigned dest_obj_fadvise_flags)
8013 {
8014 const hobject_t& dest = obc->obs.oi.soid;
8015 dout(10) << __func__ << " " << dest
8016 << " from " << src << " " << oloc << " v" << version
8017 << " flags " << flags
8018 << (mirror_snapset ? " mirror_snapset" : "")
8019 << dendl;
8020
8021 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
8022 src.snap == CEPH_SNAPDIR));
8023
8024 // cancel a previous in-progress copy?
8025 if (copy_ops.count(dest)) {
8026 // FIXME: if the src etc match, we could avoid restarting from the
8027 // beginning.
8028 CopyOpRef cop = copy_ops[dest];
8029 cancel_copy(cop, false);
8030 }
8031
8032 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8033 mirror_snapset, src_obj_fadvise_flags,
8034 dest_obj_fadvise_flags));
8035 copy_ops[dest] = cop;
8036 obc->start_block();
8037
8038 _copy_some(obc, cop);
8039 }
8040
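// Issue (or continue) a single copy-get chunk against the source
// object, translating the COPY_FROM flags into objecter op flags.
// With mirror_snapset, the first chunk also lists the source's snaps.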
8041 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8042 {
8043 dout(10) << __func__ << " " << obc << " " << cop << dendl;
8044
8045 unsigned flags = 0;
8046 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8047 flags |= CEPH_OSD_FLAG_FLUSH;
8048 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8049 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8050 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8051 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8052 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8053 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8054 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8055 flags |= CEPH_OSD_FLAG_RWORDERED;
8056
8057 C_GatherBuilder gather(cct);
8058
8059 if (cop->cursor.is_initial() && cop->mirror_snapset) {
8060 // list snaps too.
8061 assert(cop->src.snap == CEPH_NOSNAP);
8062 ObjectOperation op;
8063 op.list_snaps(&cop->results.snapset, NULL);
8064 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8065 CEPH_SNAPDIR, NULL,
8066 flags, gather.new_sub(), NULL);
8067 cop->objecter_tid2 = tid;
8068 }
8069
8070 ObjectOperation op;
8071 if (cop->results.user_version) {
8072 op.assert_version(cop->results.user_version);
8073 } else {
8074 // we should learn the version after the first chunk, if we didn't know
8075 // it already!
8076 assert(cop->cursor.is_initial());
8077 }
8078 op.copy_get(&cop->cursor, get_copy_chunk_size(),
8079 &cop->results.object_size, &cop->results.mtime,
8080 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8081 &cop->results.snaps, &cop->results.snap_seq,
8082 &cop->results.flags,
8083 &cop->results.source_data_digest,
8084 &cop->results.source_omap_digest,
8085 &cop->results.reqids,
8086 &cop->results.truncate_seq,
8087 &cop->results.truncate_size,
8088 &cop->rval);
8089 op.set_last_op_flags(cop->src_obj_fadvise_flags);
8090
8091 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8092 get_last_peering_reset(), cop);
8093 gather.set_finisher(new C_OnFinisher(fin,
8094 &osd->objecter_finisher));
8095
8096 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8097 cop->src.snap, NULL,
8098 flags,
8099 gather.new_sub(),
8100 // discover the object version if we don't know it yet
8101 cop->results.user_version ? NULL : &cop->results.user_version);
8102 fin->tid = tid;
8103 cop->objecter_tid = tid;
8104 gather.activate();
8105 }
8106
8107 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8108 {
8109 dout(10) << __func__ << " " << oid << " tid " << tid
8110 << " " << cpp_strerror(r) << dendl;
8111 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8112 if (p == copy_ops.end()) {
8113 dout(10) << __func__ << " no copy_op found" << dendl;
8114 return;
8115 }
8116 CopyOpRef cop = p->second;
8117 if (tid != cop->objecter_tid) {
8118 dout(10) << __func__ << " tid " << tid << " != cop " << cop
8119 << " tid " << cop->objecter_tid << dendl;
8120 return;
8121 }
8122
8123 if (cop->omap_data.length() || cop->omap_header.length())
8124 cop->results.has_omap = true;
8125
8126 if (r >= 0 && !pool.info.supports_omap() &&
8127 (cop->omap_data.length() || cop->omap_header.length())) {
8128 r = -EOPNOTSUPP;
8129 }
8130 cop->objecter_tid = 0;
8131 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
8132 ObjectContextRef& cobc = cop->obc;
8133
8134 if (r < 0)
8135 goto out;
8136
8137 assert(cop->rval >= 0);
8138
8139 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8140 // verify snap hasn't been deleted
8141 vector<snapid_t>::iterator p = cop->results.snaps.begin();
8142 while (p != cop->results.snaps.end()) {
8143 if (pool.info.is_removed_snap(*p)) {
8144 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8145 << dendl;
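// erase *p in place: shift the remaining snaps left one slot
// and shrink the vector by one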
8146 for (vector<snapid_t>::iterator q = p + 1;
8147 q != cop->results.snaps.end();
8148 ++q)
8149 *(q - 1) = *q;
8150 cop->results.snaps.resize(cop->results.snaps.size() - 1);
8151 } else {
8152 ++p;
8153 }
8154 }
8155 if (cop->results.snaps.empty()) {
8156 dout(10) << __func__ << " no more snaps for " << oid << dendl;
8157 r = -ENOENT;
8158 goto out;
8159 }
8160 }
8161
8162 assert(cop->rval >= 0);
8163
8164 if (!cop->temp_cursor.data_complete) {
8165 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8166 }
8167 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8168 if (cop->omap_header.length()) {
8169 cop->results.omap_digest =
8170 cop->omap_header.crc32c(cop->results.omap_digest);
8171 }
8172 if (cop->omap_data.length()) {
8173 bufferlist keys;
8174 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8175 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8176 }
8177 }
8178
8179 if (!cop->temp_cursor.attr_complete) {
8180 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8181 p != cop->attrs.end();
8182 ++p) {
8183 cop->results.attrs[string("_") + p->first] = p->second;
8184 }
8185 cop->attrs.clear();
8186 }
8187
8188 if (!cop->cursor.is_complete()) {
8189 // write out what we have so far
8190 if (cop->temp_cursor.is_initial()) {
8191 assert(!cop->results.started_temp_obj);
8192 cop->results.started_temp_obj = true;
8193 cop->results.temp_oid = generate_temp_object(oid);
8194 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8195 }
8196 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8197 OpContextUPtr ctx = simple_opc_create(tempobc);
8198 if (cop->temp_cursor.is_initial()) {
8199 ctx->new_temp_oid = cop->results.temp_oid;
8200 }
8201 _write_copy_chunk(cop, ctx->op_t.get());
8202 simple_opc_submit(std::move(ctx));
8203 dout(10) << __func__ << " fetching more" << dendl;
8204 _copy_some(cobc, cop);
8205 return;
8206 }
8207
8208 // verify digests?
8209 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8210 dout(20) << __func__ << std::hex
8211 << " got digest: rx data 0x" << cop->results.data_digest
8212 << " omap 0x" << cop->results.omap_digest
8213 << ", source: data 0x" << cop->results.source_data_digest
8214 << " omap 0x" << cop->results.source_omap_digest
8215 << std::dec
8216 << " flags " << cop->results.flags
8217 << dendl;
8218 }
8219 if (cop->results.is_data_digest() &&
8220 cop->results.data_digest != cop->results.source_data_digest) {
8221 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8222 << " != source 0x" << cop->results.source_data_digest << std::dec
8223 << dendl;
8224 osd->clog->error() << info.pgid << " copy from " << cop->src
8225 << " to " << cop->obc->obs.oi.soid << std::hex
8226 << " data digest 0x" << cop->results.data_digest
8227 << " != source 0x" << cop->results.source_data_digest
8228 << std::dec;
8229 r = -EIO;
8230 goto out;
8231 }
8232 if (cop->results.is_omap_digest() &&
8233 cop->results.omap_digest != cop->results.source_omap_digest) {
8234 derr << __func__ << std::hex
8235 << " omap digest 0x" << cop->results.omap_digest
8236 << " != source 0x" << cop->results.source_omap_digest
8237 << std::dec << dendl;
8238 osd->clog->error() << info.pgid << " copy from " << cop->src
8239 << " to " << cop->obc->obs.oi.soid << std::hex
8240 << " omap digest 0x" << cop->results.omap_digest
8241 << " != source 0x" << cop->results.source_omap_digest
8242 << std::dec;
8243 r = -EIO;
8244 goto out;
8245 }
8246 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8247 derr << __func__ << " injecting copyfrom failure" << dendl;
8248 r = -EIO;
8249 goto out;
8250 }
8251
8252 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8253 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8254 ObjectState& obs = cop->obc->obs;
8255 if (cop->temp_cursor.is_initial()) {
8256 dout(20) << "fill_in_final_tx: writing "
8257 << "directly to final object" << dendl;
8258 // write directly to final object
8259 cop->results.temp_oid = obs.oi.soid;
8260 _write_copy_chunk(cop, t);
8261 } else {
8262 // finish writing to temp object, then move into place
8263 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8264 _write_copy_chunk(cop, t);
8265 t->rename(obs.oi.soid, cop->results.temp_oid);
8266 }
8267 t->setattrs(obs.oi.soid, cop->results.attrs);
8268 });
8269
8270 dout(20) << __func__ << " success; committing" << dendl;
8271
8272 out:
8273 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8274 CopyCallbackResults results(r, &cop->results);
8275 cop->cb->complete(results);
8276
8277 copy_ops.erase(cobc->obs.oi.soid);
8278 cobc->stop_block();
8279
8280 if (r < 0 && cop->results.started_temp_obj) {
8281 dout(10) << __func__ << " deleting partial temp object "
8282 << cop->results.temp_oid << dendl;
8283 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8284 OpContextUPtr ctx = simple_opc_create(tempobc);
8285 ctx->op_t->remove(cop->results.temp_oid);
8286 ctx->discard_temp_oid = cop->results.temp_oid;
8287 simple_opc_submit(std::move(ctx));
8288 }
8289
8290 // cancel and requeue proxy ops on this object
8291 if (!r) {
8292 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8293 it != proxyread_ops.end();) {
8294 if (it->second->soid == cobc->obs.oi.soid) {
8295 cancel_proxy_read((it++)->second);
8296 } else {
8297 ++it;
8298 }
8299 }
8300 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8301 it != proxywrite_ops.end();) {
8302 if (it->second->soid == cobc->obs.oi.soid) {
8303 cancel_proxy_write((it++)->second);
8304 } else {
8305 ++it;
8306 }
8307 }
8308 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8309 }
8310
8311 kick_object_context_blocked(cobc);
8312 }
8313
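// Persist the chunk we just fetched into the (temp) object: create
// it on the first chunk, append the data -- trimmed back to the
// required alignment for pools that need aligned appends (e.g.
// erasure-coded pools) -- and apply any omap header/keys.  Finally,
// advance temp_cursor to match cursor.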
8314 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8315 {
8316 dout(20) << __func__ << " " << cop
8317 << " " << cop->attrs.size() << " attrs"
8318 << " " << cop->data.length() << " bytes"
8319 << " " << cop->omap_header.length() << " omap header bytes"
8320 << " " << cop->omap_data.length() << " omap data bytes"
8321 << dendl;
8322 if (!cop->temp_cursor.attr_complete) {
8323 t->create(cop->results.temp_oid);
8324 }
8325 if (!cop->temp_cursor.data_complete) {
8326 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8327 cop->cursor.data_offset);
8328 if (pool.info.requires_aligned_append() &&
8329 !cop->cursor.data_complete) {
8330 /**
8331 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8332 * to pick it up on the next pass.
8333 */
8334 assert(cop->temp_cursor.data_offset %
8335 pool.info.required_alignment() == 0);
8336 if (cop->data.length() % pool.info.required_alignment() != 0) {
8337 uint64_t to_trim =
8338 cop->data.length() % pool.info.required_alignment();
8339 bufferlist bl;
8340 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8341 cop->data.swap(bl);
8342 cop->cursor.data_offset -= to_trim;
8343 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8344 cop->cursor.data_offset);
8345 }
8346 }
8347 if (cop->data.length()) {
8348 t->write(
8349 cop->results.temp_oid,
8350 cop->temp_cursor.data_offset,
8351 cop->data.length(),
8352 cop->data,
8353 cop->dest_obj_fadvise_flags);
8354 }
8355 cop->data.clear();
8356 }
8357 if (pool.info.supports_omap()) {
8358 if (!cop->temp_cursor.omap_complete) {
8359 if (cop->omap_header.length()) {
8360 t->omap_setheader(
8361 cop->results.temp_oid,
8362 cop->omap_header);
8363 cop->omap_header.clear();
8364 }
8365 if (cop->omap_data.length()) {
8366 map<string,bufferlist> omap;
8367 bufferlist::iterator p = cop->omap_data.begin();
8368 ::decode(omap, p);
8369 t->omap_setkeys(cop->results.temp_oid, omap);
8370 cop->omap_data.clear();
8371 }
8372 }
8373 } else {
8374 assert(cop->omap_header.length() == 0);
8375 assert(cop->omap_data.length() == 0);
8376 }
8377 cop->temp_cursor = cop->cursor;
8378 }
8379
8380 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8381 {
8382 OpContext *ctx = cb->ctx;
8383 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8384
8385 ObjectState& obs = ctx->new_obs;
8386 if (obs.exists) {
8387 dout(20) << __func__ << ": exists, removing" << dendl;
8388 ctx->op_t->remove(obs.oi.soid);
8389 } else {
8390 ctx->delta_stats.num_objects++;
8391 obs.exists = true;
8392 }
8393 if (cb->is_temp_obj_used()) {
8394 ctx->discard_temp_oid = cb->results->temp_oid;
8395 }
8396 cb->results->fill_in_final_tx(ctx->op_t.get());
8397
8398 // CopyFromCallback fills this in for us
8399 obs.oi.user_version = ctx->user_at_version;
8400
8401 obs.oi.set_data_digest(cb->results->data_digest);
8402 obs.oi.set_omap_digest(cb->results->omap_digest);
8403
8404 obs.oi.truncate_seq = cb->results->truncate_seq;
8405 obs.oi.truncate_size = cb->results->truncate_size;
8406
8407 ctx->extra_reqids = cb->results->reqids;
8408
8409 // cache: clear whiteout?
8410 if (obs.oi.is_whiteout()) {
8411 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8412 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8413 --ctx->delta_stats.num_whiteouts;
8414 }
8415
8416 if (cb->results->has_omap) {
8417 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8418 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8419 } else {
8420 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8421 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8422 }
8423
8424 interval_set<uint64_t> ch;
8425 if (obs.oi.size > 0)
8426 ch.insert(0, obs.oi.size);
8427 ctx->modified_ranges.union_of(ch);
8428
8429 if (cb->get_data_size() != obs.oi.size) {
8430 ctx->delta_stats.num_bytes -= obs.oi.size;
8431 obs.oi.size = cb->get_data_size();
8432 ctx->delta_stats.num_bytes += obs.oi.size;
8433 }
8434 ctx->delta_stats.num_wr++;
8435 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8436
8437 osd->logger->inc(l_osd_copyfrom);
8438 }
8439
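// Complete a promotion into this (cache) tier.  -ENOENT from the
// base tier becomes a whiteout for a head object; for a clone it
// means the clone was trimmed, so we prune it from the snapset
// instead.  Otherwise the copied object is written locally under a
// PROMOTE log entry.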
8440 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8441 ObjectContextRef obc)
8442 {
8443 const hobject_t& soid = obc->obs.oi.soid;
8444 dout(10) << __func__ << " " << soid << " r=" << r
8445 << " uv" << results->user_version << dendl;
8446
8447 if (r == -ECANCELED) {
8448 return;
8449 }
8450
8451 if (r != -ENOENT && soid.is_snap()) {
8452 if (results->snaps.empty()) {
8453 // we must have read "snap" content from the head object in
8454 // the base pool. use snap_seq to construct what snaps should
8455 // be for this clone (what it was before we evicted the clean
8456 // clone from this pool, and what it will be when we flush and
8457 // the clone eventually happens in the base pool).
8458 SnapSet& snapset = obc->ssc->snapset;
8459 vector<snapid_t>::iterator p = snapset.snaps.begin();
8460 while (p != snapset.snaps.end() && *p > soid.snap)
8461 ++p;
8462 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8463 results->snaps.push_back(*p);
8464 ++p;
8465 }
8466 }
8467
8468 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8469 filter_snapc(results->snaps);
8470
8471 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8472 if (results->snaps.empty()) {
8473 dout(20) << __func__
8474 << " snaps are empty, clone is invalid,"
8475 << " setting r to ENOENT" << dendl;
8476 r = -ENOENT;
8477 }
8478 }
8479
8480 if (r < 0 && results->started_temp_obj) {
8481 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8482 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8483 assert(tempobc);
8484 OpContextUPtr ctx = simple_opc_create(tempobc);
8485 ctx->op_t->remove(results->temp_oid);
8486 simple_opc_submit(std::move(ctx));
8487 results->started_temp_obj = false;
8488 }
8489
8490 if (r == -ENOENT && soid.is_snap()) {
8491 dout(10) << __func__
8492 << ": enoent while trying to promote clone, " << soid
8493 << " must have been trimmed, removing from snapset"
8494 << dendl;
8495 hobject_t head(soid.get_head());
8496 ObjectContextRef obc = get_object_context(head, false);
8497 assert(obc);
8498
8499 OpContextUPtr tctx = simple_opc_create(obc);
8500 tctx->at_version = get_next_version();
8501 filter_snapc(tctx->new_snapset.snaps);
8502 vector<snapid_t> new_clones;
8503 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8504 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8505 i != tctx->new_snapset.clones.end();
8506 ++i) {
8507 if (*i != soid.snap) {
8508 new_clones.push_back(*i);
8509 auto p = tctx->new_snapset.clone_snaps.find(*i);
8510 if (p != tctx->new_snapset.clone_snaps.end()) {
8511 new_clone_snaps[*i] = p->second;
8512 }
8513 }
8514 }
8515 tctx->new_snapset.clones.swap(new_clones);
8516 tctx->new_snapset.clone_overlap.erase(soid.snap);
8517 tctx->new_snapset.clone_size.erase(soid.snap);
8518 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8519
8520 // take RWWRITE lock for duration of our local write. ignore starvation.
8521 if (!tctx->lock_manager.take_write_lock(
8522 head,
8523 obc)) {
8524 assert(0 == "problem!");
8525 }
8526 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8527
8528 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8529
8530 simple_opc_submit(std::move(tctx));
8531 return;
8532 }
8533
8534 bool whiteout = false;
8535 if (r == -ENOENT) {
8536 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8537 dout(10) << __func__ << " whiteout " << soid << dendl;
8538 whiteout = true;
8539 }
8540
8541 if (r < 0 && !whiteout) {
8542 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8543 // pass error to everyone blocked on this object
8544 // FIXME: this is pretty sloppy, but at this point we got
8545 // something unexpected and don't have many other options.
8546 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8547 waiting_for_blocked_object.find(soid);
8548 if (blocked_iter != waiting_for_blocked_object.end()) {
8549 while (!blocked_iter->second.empty()) {
8550 osd->reply_op_error(blocked_iter->second.front(), r);
8551 blocked_iter->second.pop_front();
8552 }
8553 waiting_for_blocked_object.erase(blocked_iter);
8554 }
8555 return;
8556 }
8557
8558 osd->promote_finish(results->object_size);
8559
8560 OpContextUPtr tctx = simple_opc_create(obc);
8561 tctx->at_version = get_next_version();
8562
8563 ++tctx->delta_stats.num_objects;
8564 if (soid.snap < CEPH_NOSNAP)
8565 ++tctx->delta_stats.num_object_clones;
8566 tctx->new_obs.exists = true;
8567
8568 tctx->extra_reqids = results->reqids;
8569
8570 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8571 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8572
8573 if (whiteout) {
8574 // create a whiteout
8575 tctx->op_t->create(soid);
8576 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8577 ++tctx->delta_stats.num_whiteouts;
8578 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8579 osd->logger->inc(l_osd_tier_whiteout);
8580 } else {
8581 if (results->has_omap) {
8582 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8583 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8584 ++tctx->delta_stats.num_objects_omap;
8585 }
8586
8587 results->fill_in_final_tx(tctx->op_t.get());
8588 if (results->started_temp_obj) {
8589 tctx->discard_temp_oid = results->temp_oid;
8590 }
8591 tctx->new_obs.oi.size = results->object_size;
8592 tctx->new_obs.oi.user_version = results->user_version;
8593 // we don't care whether the source object has data or omap digests
8594 if (results->object_size)
8595 tctx->new_obs.oi.set_data_digest(results->data_digest);
8596 if (results->has_omap)
8597 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8598 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8599 tctx->new_obs.oi.truncate_size = results->truncate_size;
8600
8601 if (soid.snap != CEPH_NOSNAP) {
8602 if (legacy_snapset) {
8603 tctx->new_obs.oi.legacy_snaps = results->snaps;
8604 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8605 } else {
8606 // it's already in the snapset
8607 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8608 }
8609 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8610 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8611 results->object_size);
8612 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8613
8614 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8615 } else {
8616 tctx->delta_stats.num_bytes += results->object_size;
8617 }
8618 }
8619
8620 if (results->mirror_snapset) {
8621 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8622 tctx->new_snapset.from_snap_set(
8623 results->snapset,
8624 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8625 }
8626 tctx->new_snapset.head_exists = true;
8627 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8628
8629 // take RWWRITE lock for duration of our local write. ignore starvation.
8630 if (!tctx->lock_manager.take_write_lock(
8631 obc->obs.oi.soid,
8632 obc)) {
8633 assert(0 == "problem!");
8634 }
8635 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8636
8637 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8638
8639 simple_opc_submit(std::move(tctx));
8640
8641 osd->logger->inc(l_osd_tier_promote);
8642
8643 if (agent_state &&
8644 agent_state->is_idle())
8645 agent_choose_mode();
8646 }
8647
8648 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8649 {
8650 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8651 << " from " << cop->src << " " << cop->oloc
8652 << " v" << cop->results.user_version << dendl;
8653
8654 // cancel objecter op, if we can
8655 if (cop->objecter_tid) {
8656 osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8657 cop->objecter_tid = 0;
8658 if (cop->objecter_tid2) {
8659 osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8660 cop->objecter_tid2 = 0;
8661 }
8662 }
8663
8664 copy_ops.erase(cop->obc->obs.oi.soid);
8665 cop->obc->stop_block();
8666
8667 kick_object_context_blocked(cop->obc);
8668 cop->results.should_requeue = requeue;
8669 CopyCallbackResults result(-ECANCELED, &cop->results);
8670 cop->cb->complete(result);
8671
8672 // There may still be an objecter callback referencing this copy op.
8673 // That callback will not need the obc since it's been canceled, and
8674 // we need the obc reference to go away prior to flush.
8675 cop->obc = ObjectContextRef();
8676 }
8677
8678 void PrimaryLogPG::cancel_copy_ops(bool requeue)
8679 {
8680 dout(10) << __func__ << dendl;
8681 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8682 while (p != copy_ops.end()) {
8683 // requeue this op? can I queue up all of them?
8684 cancel_copy((p++)->second, requeue);
8685 }
8686 }
8687
8688
8689 // ========================================================================
8690 // flush
8691 //
8692 // Flush a dirty object in the cache tier by writing it back to the
8693 // base tier. The sequence looks like:
8694 //
8695 // * send a copy-from operation to the base tier to copy the current
8696 // version of the object
8697 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8698 // * on completion, we check if the object has been modified. if so,
8699 // just reply with -EAGAIN.
8700 // * try to take a write lock so we can clear the dirty flag. if this
8701 // fails, wait and retry
8702 // * start a repop that clears the bit.
8703 //
8704 // If we have to wait, we will retry by coming back through the
8705 // start_flush method. We check if a flush is already in progress
8706 // and, if so, try to finish it by rechecking the version and trying
8707 // to clear the dirty bit.
8708 //
8709 // In order for the cache-flush (a write op) to not block the copy-get
8710 // from reading the object, the client *must* set the SKIPRWLOCKS
8711 // flag.
8712 //
8713 // NOTE: normally writes are strictly ordered for the client, but
8714 // flushes are special in that they can be reordered with respect to
8715 // other writes. In particular, we can't have a flush request block
8716 // an update to the cache pool object!
8717
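// A minimal client-side sketch of the try-flush described above
// (illustrative only, not part of this file; assumes the librados
// C++ API):
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   librados::AioCompletion *c =
//     librados::Rados::aio_create_completion();
//   ioctx.aio_operate("obj", c, &op,
//                     librados::OPERATION_IGNORE_CACHE |
//                     librados::OPERATION_SKIPRWLOCKS, // see NOTE above
//                     nullptr);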
8718 struct C_Flush : public Context {
8719 PrimaryLogPGRef pg;
8720 hobject_t oid;
8721 epoch_t last_peering_reset;
8722 ceph_tid_t tid;
8723 utime_t start;
8724 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8725 : pg(p), oid(o), last_peering_reset(lpr),
8726 tid(0), start(ceph_clock_now())
8727 {}
8728 void finish(int r) override {
8729 if (r == -ECANCELED)
8730 return;
8731 pg->lock();
8732 if (last_peering_reset == pg->get_last_peering_reset()) {
8733 pg->finish_flush(oid, tid, r);
8734 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8735 }
8736 pg->unlock();
8737 }
8738 };
8739
8740 int PrimaryLogPG::start_flush(
8741 OpRequestRef op, ObjectContextRef obc,
8742 bool blocking, hobject_t *pmissing,
8743 boost::optional<std::function<void()>> &&on_flush)
8744 {
8745 const object_info_t& oi = obc->obs.oi;
8746 const hobject_t& soid = oi.soid;
8747 dout(10) << __func__ << " " << soid
8748 << " v" << oi.version
8749 << " uv" << oi.user_version
8750 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8751 << dendl;
8752
8753 // get a filtered snapset; we need to drop snaps that have been removed
8754 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8755
8756 // verify there are no older dirty clones that would still need a flush
8757 {
8758 dout(20) << " snapset " << snapset << dendl;
8759 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8760 while (p != snapset.clones.rend() && *p >= soid.snap)
8761 ++p;
8762 if (p != snapset.clones.rend()) {
8763 hobject_t next = soid;
8764 next.snap = *p;
8765 assert(next.snap < soid.snap);
8766 if (pg_log.get_missing().is_missing(next)) {
8767 dout(10) << __func__ << " missing clone is " << next << dendl;
8768 if (pmissing)
8769 *pmissing = next;
8770 return -ENOENT;
8771 }
8772 ObjectContextRef older_obc = get_object_context(next, false);
8773 if (older_obc) {
8774 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8775 << dendl;
8776 if (older_obc->obs.oi.is_dirty()) {
8777 dout(10) << __func__ << " next oldest clone is dirty: "
8778 << older_obc->obs.oi << dendl;
8779 return -EBUSY;
8780 }
8781 } else {
8782 dout(20) << __func__ << " next oldest clone " << next
8783 << " is not present; implicitly clean" << dendl;
8784 }
8785 } else {
8786 dout(20) << __func__ << " no older clones" << dendl;
8787 }
8788 }
8789
8790 if (blocking)
8791 obc->start_block();
8792
8793 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8794 if (p != flush_ops.end()) {
8795 FlushOpRef fop = p->second;
8796 if (fop->op == op) {
8797 // we couldn't take the write lock on a cache-try-flush before;
8798 // now we are trying again for the lock.
8799 return try_flush_mark_clean(fop);
8800 }
8801 if (fop->flushed_version == obc->obs.oi.user_version &&
8802 (fop->blocking || !blocking)) {
8803 // nonblocking can join anything
8804 // blocking can only join a blocking flush
8805 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8806 if (op)
8807 fop->dup_ops.push_back(op);
8808 return -EAGAIN; // clean up this ctx; op will retry later
8809 }
8810
8811 // cancel current flush since it will fail anyway, or because we
8812 // are blocking and the existing flush is nonblocking.
8813 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8814 if (fop->op)
8815 osd->reply_op_error(fop->op, -EBUSY);
8816 while (!fop->dup_ops.empty()) {
8817 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8818 fop->dup_ops.pop_front();
8819 }
8820 cancel_flush(fop, false);
8821 }
8822
8823 /**
8824 * In general, we need to send a delete and a copyfrom.
8825 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8826 * where 4 is marked as clean. To flush 10, we have to:
8827 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8828 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8829 *
8830 * There is a complicating case. Suppose there had been a clone 7
8831 * for snaps [7, 6] which has since been trimmed because those snaps no longer exist.
8832 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8833 * the delete, the snap will be promoted to 5, and the head will become
8834 * a snapdir. When the copy-from goes through, we'll end up with
8835 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8836 *
8837 * Another complication is the case where there is an interval change
8838 * after doing the delete and the flush but before marking the object
8839 * clean. We'll happily delete head and then recreate it at the same
8840 * sequence number, which works out ok.
8841 */
8842
8843 SnapContext snapc, dsnapc;
8844 if (snapset.seq != 0) {
8845 if (soid.snap == CEPH_NOSNAP) {
8846 snapc.seq = snapset.seq;
8847 snapc.snaps = snapset.snaps;
8848 } else {
8849 snapid_t min_included_snap;
8850 if (snapset.is_legacy()) {
8851 min_included_snap = oi.legacy_snaps.back();
8852 } else {
8853 auto p = snapset.clone_snaps.find(soid.snap);
8854 assert(p != snapset.clone_snaps.end());
8855 min_included_snap = p->second.back();
8856 }
8857 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8858 }
8859
8860 snapid_t prev_snapc = 0;
8861 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8862 citer != snapset.clones.rend();
8863 ++citer) {
8864 if (*citer < soid.snap) {
8865 prev_snapc = *citer;
8866 break;
8867 }
8868 }
8869
8870 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8871 }
8872
8873 object_locator_t base_oloc(soid);
8874 base_oloc.pool = pool.info.tier_of;
8875
8876 if (dsnapc.seq < snapc.seq) {
8877 ObjectOperation o;
8878 o.remove();
8879 osd->objecter->mutate(
8880 soid.oid,
8881 base_oloc,
8882 o,
8883 dsnapc,
8884 ceph::real_clock::from_ceph_timespec(oi.mtime),
8885 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8886 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8887 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8888 }
8889
8890 FlushOpRef fop(std::make_shared<FlushOp>());
8891 fop->obc = obc;
8892 fop->flushed_version = oi.user_version;
8893 fop->blocking = blocking;
8894 fop->on_flush = std::move(on_flush);
8895 fop->op = op;
8896
8897 ObjectOperation o;
8898 if (oi.is_whiteout()) {
8899 fop->removal = true;
8900 o.remove();
8901 } else {
8902 object_locator_t oloc(soid);
8903 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8904 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8905 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8906 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8907 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8908 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8909
8910 // hint that the base tier need not cache this data after the flush
8911 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8912 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8913 }
8914 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8915
8916 ceph_tid_t tid = osd->objecter->mutate(
8917 soid.oid, base_oloc, o, snapc,
8918 ceph::real_clock::from_ceph_timespec(oi.mtime),
8919 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8920 new C_OnFinisher(fin,
8921 &osd->objecter_finisher));
8922 /* we're under the pg lock and fin->finish() is grabbing that */
8923 fin->tid = tid;
8924 fop->objecter_tid = tid;
8925
8926 flush_ops[soid] = fop;
8927 info.stats.stats.sum.num_flush++;
8928 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8929 return -EINPROGRESS;
8930 }
8931
8932 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8933 {
8934 dout(10) << __func__ << " " << oid << " tid " << tid
8935 << " " << cpp_strerror(r) << dendl;
8936 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8937 if (p == flush_ops.end()) {
8938 dout(10) << __func__ << " no flush_op found" << dendl;
8939 return;
8940 }
8941 FlushOpRef fop = p->second;
8942 if (tid != fop->objecter_tid) {
8943 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8944 << " tid " << fop->objecter_tid << dendl;
8945 return;
8946 }
8947 ObjectContextRef obc = fop->obc;
8948 fop->objecter_tid = 0;
8949
8950 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8951 if (fop->op)
8952 osd->reply_op_error(fop->op, -EBUSY);
8953 if (fop->blocking) {
8954 obc->stop_block();
8955 kick_object_context_blocked(obc);
8956 }
8957
8958 if (!fop->dup_ops.empty()) {
8959 dout(20) << __func__ << " requeueing dups" << dendl;
8960 requeue_ops(fop->dup_ops);
8961 }
8962 if (fop->on_flush) {
8963 (*(fop->on_flush))();
8964 fop->on_flush = boost::none;
8965 }
8966 flush_ops.erase(oid);
8967 return;
8968 }
8969
8970 r = try_flush_mark_clean(fop);
8971 if (r == -EBUSY && fop->op) {
8972 osd->reply_op_error(fop->op, r);
8973 }
8974 }
8975
8976 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8977 {
8978 ObjectContextRef obc = fop->obc;
8979 const hobject_t& oid = obc->obs.oi.soid;
8980
8981 if (fop->blocking) {
8982 obc->stop_block();
8983 kick_object_context_blocked(obc);
8984 }
8985
8986 if (fop->flushed_version != obc->obs.oi.user_version ||
8987 !obc->obs.exists) {
8988 if (obc->obs.exists)
8989 dout(10) << __func__ << " flushed_version " << fop->flushed_version
8990 << " != current " << obc->obs.oi.user_version
8991 << dendl;
8992 else
8993 dout(10) << __func__ << " object no longer exists" << dendl;
8994
8995 if (!fop->dup_ops.empty()) {
8996 dout(20) << __func__ << " requeueing dups" << dendl;
8997 requeue_ops(fop->dup_ops);
8998 }
8999 if (fop->on_flush) {
9000 (*(fop->on_flush))();
9001 fop->on_flush = boost::none;
9002 }
9003 flush_ops.erase(oid);
9004 if (fop->blocking)
9005 osd->logger->inc(l_osd_tier_flush_fail);
9006 else
9007 osd->logger->inc(l_osd_tier_try_flush_fail);
9008 return -EBUSY;
9009 }
9010
9011 if (!fop->blocking &&
9012 scrubber.write_blocked_by_scrub(oid)) {
9013 if (fop->op) {
9014 dout(10) << __func__ << " blocked by scrub" << dendl;
9015 requeue_op(fop->op);
9016 requeue_ops(fop->dup_ops);
9017 return -EAGAIN; // will retry
9018 } else {
9019 osd->logger->inc(l_osd_tier_try_flush_fail);
9020 cancel_flush(fop, false);
9021 return -ECANCELED;
9022 }
9023 }
9024
9025 // successfully flushed, can we evict this object?
9026 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9027 agent_maybe_evict(obc, true)) {
9028 osd->logger->inc(l_osd_tier_clean);
9029 if (fop->on_flush) {
9030 (*(fop->on_flush))();
9031 fop->on_flush = boost::none;
9032 }
9033 flush_ops.erase(oid);
9034 return 0;
9035 }
9036
9037 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9038 OpContextUPtr ctx = simple_opc_create(fop->obc);
9039
9040 // successfully flushed; try to clear the dirty bit.
9041 // take the write lock explicitly: this ctx was created internally,
9042 // so no lock was acquired for it during normal op processing.
9043 if (ctx->lock_manager.get_lock_type(
9044 ObjectContext::RWState::RWWRITE,
9045 oid,
9046 obc,
9047 fop->op)) {
9048 dout(20) << __func__ << " took write lock" << dendl;
9049 } else if (fop->op) {
9050 dout(10) << __func__ << " waiting on write lock" << dendl;
9051 close_op_ctx(ctx.release());
9052 requeue_op(fop->op);
9053 requeue_ops(fop->dup_ops);
9054 return -EAGAIN; // will retry
9055 } else {
9056 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9057 close_op_ctx(ctx.release());
9058 osd->logger->inc(l_osd_tier_try_flush_fail);
9059 cancel_flush(fop, false);
9060 return -ECANCELED;
9061 }
9062
9063 if (fop->on_flush) {
9064 ctx->register_on_finish(*(fop->on_flush));
9065 fop->on_flush = boost::none;
9066 }
9067
9068 ctx->at_version = get_next_version();
9069
9070 ctx->new_obs = obc->obs;
9071 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9072 --ctx->delta_stats.num_objects_dirty;
9073
9074 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9075
9076 osd->logger->inc(l_osd_tier_clean);
9077
9078 if (!fop->dup_ops.empty() || fop->op) {
9079 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9080 list<OpRequestRef> ls;
9081 if (fop->op)
9082 ls.push_back(fop->op);
9083 ls.splice(ls.end(), fop->dup_ops);
9084 requeue_ops(ls);
9085 }
9086
9087 simple_opc_submit(std::move(ctx));
9088
9089 flush_ops.erase(oid);
9090
9091 if (fop->blocking)
9092 osd->logger->inc(l_osd_tier_flush);
9093 else
9094 osd->logger->inc(l_osd_tier_try_flush);
9095
9096 return -EINPROGRESS;
9097 }
9098
9099 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
9100 {
9101 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9102 << fop->objecter_tid << dendl;
9103 if (fop->objecter_tid) {
9104 osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
9105 fop->objecter_tid = 0;
9106 }
9107 if (fop->blocking) {
9108 fop->obc->stop_block();
9109 kick_object_context_blocked(fop->obc);
9110 }
9111 if (requeue) {
9112 if (fop->op)
9113 requeue_op(fop->op);
9114 requeue_ops(fop->dup_ops);
9115 }
9116 if (fop->on_flush) {
9117 (*(fop->on_flush))();
9118 fop->on_flush = boost::none;
9119 }
9120 flush_ops.erase(fop->obc->obs.oi.soid);
9121 }
9122
9123 void PrimaryLogPG::cancel_flush_ops(bool requeue)
9124 {
9125 dout(10) << __func__ << dendl;
9126 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9127 while (p != flush_ops.end()) {
9128 cancel_flush((p++)->second, requeue);
9129 }
9130 }
9131
9132 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9133 {
9134 if (!pool.info.allow_incomplete_clones())
9135 return true;
9136 if (is_missing_object(coid))
9137 return true;
9138 ObjectContextRef obc = get_object_context(coid, false);
9139 return obc && obc->obs.exists;
9140 }
9141
9142 // ========================================================================
9143 // rep op gather
9144
9145 class C_OSD_RepopApplied : public Context {
9146 PrimaryLogPGRef pg;
9147 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9148 public:
9149 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9150 : pg(pg), repop(repop) {}
9151 void finish(int) override {
9152 pg->repop_all_applied(repop.get());
9153 }
9154 };
9155
9156
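/*
 * A RepGather completes in two phases: "applied" (the transaction is
 * visible/readable on all shards) and "committed" (durable on all
 * shards).  C_OSD_RepopApplied (above) and C_OSD_RepopCommit (below)
 * each set the corresponding flag and re-run eval_repop(), which only
 * marks the repop done once both all_applied and all_committed are set.
 * Backends that apply at commit time set applies_with_commit, in which
 * case repop_all_committed() sets both flags itself.
 */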
9157 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9158 {
9159 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9160 << dendl;
9161 assert(!repop->applies_with_commit);
9162 repop->all_applied = true;
9163 if (!repop->rep_aborted) {
9164 eval_repop(repop);
9165 }
9166 }
9167
9168 class C_OSD_RepopCommit : public Context {
9169 PrimaryLogPGRef pg;
9170 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9171 public:
9172 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9173 : pg(pg), repop(repop) {}
9174 void finish(int) override {
9175 pg->repop_all_committed(repop.get());
9176 }
9177 };
9178
9179 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9180 {
9181 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9182 << dendl;
9183 repop->all_committed = true;
9184 if (repop->applies_with_commit) {
9185 assert(!repop->all_applied);
9186 repop->all_applied = true;
9187 }
9188
9189 if (!repop->rep_aborted) {
9190 if (repop->v != eversion_t()) {
9191 last_update_ondisk = repop->v;
9192 last_complete_ondisk = repop->pg_local_last_complete;
9193 }
9194 eval_repop(repop);
9195 }
9196 }
9197
9198 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9199 {
9200 dout(10) << "op_applied version " << applied_version << dendl;
9201 if (applied_version == eversion_t())
9202 return;
9203 assert(applied_version > last_update_applied);
9204 assert(applied_version <= info.last_update);
9205 last_update_applied = applied_version;
9206 if (is_primary()) {
9207 if (scrubber.active) {
9208 if (last_update_applied >= scrubber.subset_last_update) {
9209 if (ops_blocked_by_scrub()) {
9210 requeue_scrub(true);
9211 } else {
9212 requeue_scrub(false);
9213 }
9214
9215 }
9216 } else {
9217 assert(scrubber.start == scrubber.end);
9218 }
9219 } else {
9220 if (scrubber.active_rep_scrub) {
9221 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9222 scrubber.active_rep_scrub->get_req())->scrub_to) {
9223 osd->enqueue_back(
9224 info.pgid,
9225 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9226 scrubber.active_rep_scrub = OpRequestRef();
9227 }
9228 }
9229 }
9230 }
9231
9232 void PrimaryLogPG::eval_repop(RepGather *repop)
9233 {
9234 const MOSDOp *m = NULL;
9235 if (repop->op)
9236 m = static_cast<const MOSDOp *>(repop->op->get_req());
9237
9238 if (m)
9239 dout(10) << "eval_repop " << *repop
9240 << (repop->rep_done ? " DONE" : "")
9241 << dendl;
9242 else
9243 dout(10) << "eval_repop " << *repop << " (no op)"
9244 << (repop->rep_done ? " DONE" : "")
9245 << dendl;
9246
9247 if (repop->rep_done)
9248 return;
9249
9250 // ondisk?
9251 if (repop->all_committed) {
9252 dout(10) << " commit: " << *repop << dendl;
9253 for (auto p = repop->on_committed.begin();
9254 p != repop->on_committed.end();
9255 repop->on_committed.erase(p++)) {
9256 (*p)();
9257 }
9258 // send dup commits, in order
9259 if (waiting_for_ondisk.count(repop->v)) {
9260 assert(waiting_for_ondisk.begin()->first == repop->v);
9261 for (list<pair<OpRequestRef, version_t> >::iterator i =
9262 waiting_for_ondisk[repop->v].begin();
9263 i != waiting_for_ondisk[repop->v].end();
9264 ++i) {
9265 osd->reply_op_error(i->first, repop->r, repop->v,
9266 i->second);
9267 }
9268 waiting_for_ondisk.erase(repop->v);
9269 }
9270 }
9271
9272 // applied?
9273 if (repop->all_applied) {
9274 if (repop->applies_with_commit) {
9275 assert(repop->on_applied.empty());
9276 }
9277 dout(10) << " applied: " << *repop << " " << dendl;
9278 for (auto p = repop->on_applied.begin();
9279 p != repop->on_applied.end();
9280 repop->on_applied.erase(p++)) {
9281 (*p)();
9282 }
9283 }
9284
9285 // done.
9286 if (repop->all_applied && repop->all_committed) {
9287 repop->rep_done = true;
9288
9289 publish_stats_to_osd();
9290 calc_min_last_complete_ondisk();
9291
9292 dout(10) << " removing " << *repop << dendl;
9293 assert(!repop_queue.empty());
9294 dout(20) << " q front is " << *repop_queue.front() << dendl;
9295 if (repop_queue.front() != repop) {
9296 if (!repop->applies_with_commit) {
9297 dout(0) << " removing " << *repop << dendl;
9298 dout(0) << " q front is " << *repop_queue.front() << dendl;
9299 assert(repop_queue.front() == repop);
9300 }
9301 } else {
9302 RepGather *to_remove = nullptr;
9303 while (!repop_queue.empty() &&
9304 (to_remove = repop_queue.front())->rep_done) {
9305 repop_queue.pop_front();
9306 for (auto p = to_remove->on_success.begin();
9307 p != to_remove->on_success.end();
9308 to_remove->on_success.erase(p++)) {
9309 (*p)();
9310 }
9311 remove_repop(to_remove);
9312 }
9313 }
9314 }
9315 }
9316
9317 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9318 {
9319 FUNCTRACE();
9320 const hobject_t& soid = ctx->obs->oi.soid;
9321 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9322 << " o " << soid
9323 << dendl;
9324
9325 repop->v = ctx->at_version;
9326 if (ctx->at_version > eversion_t()) {
9327 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9328 i != actingbackfill.end();
9329 ++i) {
9330 if (*i == get_primary()) continue;
9331 pg_info_t &pinfo = peer_info[*i];
9332 // keep peer_info up to date
9333 if (pinfo.last_complete == pinfo.last_update)
9334 pinfo.last_complete = ctx->at_version;
9335 pinfo.last_update = ctx->at_version;
9336 }
9337 }
9338
9339 ctx->obc->ondisk_write_lock();
9340
9341 bool unlock_snapset_obc = false;
9342 ctx->op_t->add_obc(ctx->obc);
9343 if (ctx->clone_obc) {
9344 ctx->clone_obc->ondisk_write_lock();
9345 ctx->op_t->add_obc(ctx->clone_obc);
9346 }
9347 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9348 ctx->obc->obs.oi.soid) {
9349 ctx->snapset_obc->ondisk_write_lock();
9350 unlock_snapset_obc = true;
9351 ctx->op_t->add_obc(ctx->snapset_obc);
9352 }
9353
9354 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9355 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9356 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9357 ctx->obc,
9358 ctx->clone_obc,
9359 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9360 if (!(ctx->log.empty())) {
9361 assert(ctx->at_version >= projected_last_update);
9362 projected_last_update = ctx->at_version;
9363 }
9364 for (auto &&entry: ctx->log) {
9365 projected_log.add(entry);
9366 }
9367 pgbackend->submit_transaction(
9368 soid,
9369 ctx->delta_stats,
9370 ctx->at_version,
9371 std::move(ctx->op_t),
9372 pg_trim_to,
9373 min_last_complete_ondisk,
9374 ctx->log,
9375 ctx->updated_hset_history,
9376 onapplied_sync,
9377 on_all_applied,
9378 on_all_commit,
9379 repop->rep_tid,
9380 ctx->reqid,
9381 ctx->op);
9382 }
9383
9384 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9385 OpContext *ctx, ObjectContextRef obc,
9386 ceph_tid_t rep_tid)
9387 {
9388 if (ctx->op)
9389 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9390 else
9391 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9392
9393 RepGather *repop = new RepGather(
9394 ctx, rep_tid, info.last_complete, false);
9395
9396 repop->start = ceph_clock_now();
9397
9398 repop_queue.push_back(&repop->queue_item);
9399 repop->get();
9400
9401 osd->logger->inc(l_osd_op_wip);
9402
9403 dout(10) << __func__ << ": " << *repop << dendl;
9404 return repop;
9405 }
9406
9407 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9408 eversion_t version,
9409 int r,
9410 ObcLockManager &&manager,
9411 OpRequestRef &&op,
9412 boost::optional<std::function<void(void)> > &&on_complete)
9413 {
9414 RepGather *repop = new RepGather(
9415 std::move(manager),
9416 std::move(op),
9417 std::move(on_complete),
9418 osd->get_tid(),
9419 info.last_complete,
9420 true,
9421 r);
9422 repop->v = version;
9423
9424 repop->start = ceph_clock_now();
9425
9426 repop_queue.push_back(&repop->queue_item);
9427
9428 osd->logger->inc(l_osd_op_wip);
9429
9430 dout(10) << __func__ << ": " << *repop << dendl;
9431 return boost::intrusive_ptr<RepGather>(repop);
9432 }
9433
9434 void PrimaryLogPG::remove_repop(RepGather *repop)
9435 {
9436 dout(20) << __func__ << " " << *repop << dendl;
9437
9438 for (auto p = repop->on_finish.begin();
9439 p != repop->on_finish.end();
9440 repop->on_finish.erase(p++)) {
9441 (*p)();
9442 }
9443
9444 release_object_locks(
9445 repop->lock_manager);
9446 repop->put();
9447
9448 osd->logger->dec(l_osd_op_wip);
9449 }
9450
9451 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9452 {
9453 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9454 ceph_tid_t rep_tid = osd->get_tid();
9455 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9456 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9457 ctx->op_t.reset(new PGTransaction());
9458 ctx->mtime = ceph_clock_now();
9459 return ctx;
9460 }
9461
9462 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9463 {
9464 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9465 dout(20) << __func__ << " " << repop << dendl;
9466 issue_repop(repop, ctx.get());
9467 eval_repop(repop);
9468 calc_trim_to();
9469 repop->put();
9470 }
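// Sketch of the internal-update pattern built on simple_opc_create()/
// simple_opc_submit() (illustrative; try_flush_mark_clean() above and
// handle_watch_timeout() below follow this shape):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   ctx->new_obs = obc->obs;                  // copy, then mutate state
//   ctx->log.push_back(pg_log_entry_t(...));  // describe the change
//   simple_opc_submit(std::move(ctx));        // issue_repop + eval_repop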
9471
9472
9473 void PrimaryLogPG::submit_log_entries(
9474 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9475 ObcLockManager &&manager,
9476 boost::optional<std::function<void(void)> > &&_on_complete,
9477 OpRequestRef op,
9478 int r)
9479 {
9480 dout(10) << __func__ << " " << entries << dendl;
9481 assert(is_primary());
9482
9483 eversion_t version;
9484 if (!entries.empty()) {
9485 assert(entries.rbegin()->version >= projected_last_update);
9486 version = projected_last_update = entries.rbegin()->version;
9487 }
9488
9489 boost::intrusive_ptr<RepGather> repop;
9490 boost::optional<std::function<void(void)> > on_complete;
9491 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9492 repop = new_repop(
9493 version,
9494 r,
9495 std::move(manager),
9496 std::move(op),
9497 std::move(_on_complete));
9498 } else {
9499 on_complete = std::move(_on_complete);
9500 }
9501
9502 pgbackend->call_write_ordered(
9503 [this, entries, repop, on_complete]() {
9504 ObjectStore::Transaction t;
9505 eversion_t old_last_update = info.last_update;
9506 merge_new_log_entries(entries, t);
9507
9508
9509 set<pg_shard_t> waiting_on;
9510 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9511 i != actingbackfill.end();
9512 ++i) {
9513 pg_shard_t peer(*i);
9514 if (peer == pg_whoami) continue;
9515 assert(peer_missing.count(peer));
9516 assert(peer_info.count(peer));
9517 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9518 assert(repop);
9519 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9520 entries,
9521 spg_t(info.pgid.pgid, i->shard),
9522 pg_whoami.shard,
9523 get_osdmap()->get_epoch(),
9524 last_peering_reset,
9525 repop->rep_tid);
9526 osd->send_message_osd_cluster(
9527 peer.osd, m, get_osdmap()->get_epoch());
9528 waiting_on.insert(peer);
9529 } else {
9530 MOSDPGLog *m = new MOSDPGLog(
9531 peer.shard, pg_whoami.shard,
9532 info.last_update.epoch,
9533 info);
9534 m->log.log = entries;
9535 m->log.tail = old_last_update;
9536 m->log.head = info.last_update;
9537 osd->send_message_osd_cluster(
9538 peer.osd, m, get_osdmap()->get_epoch());
9539 }
9540 }
9541 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9542 ceph_tid_t rep_tid = repop->rep_tid;
9543 waiting_on.insert(pg_whoami);
9544 log_entry_update_waiting_on.insert(
9545 make_pair(
9546 rep_tid,
9547 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9548 ));
9549 struct OnComplete : public Context {
9550 PrimaryLogPGRef pg;
9551 ceph_tid_t rep_tid;
9552 epoch_t epoch;
9553 OnComplete(
9554 PrimaryLogPGRef pg,
9555 ceph_tid_t rep_tid,
9556 epoch_t epoch)
9557 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9558 void finish(int) override {
9559 pg->lock();
9560 if (!pg->pg_has_reset_since(epoch)) {
9561 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9562 assert(it != pg->log_entry_update_waiting_on.end());
9563 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9564 assert(it2 != it->second.waiting_on.end());
9565 it->second.waiting_on.erase(it2);
9566 if (it->second.waiting_on.empty()) {
9567 pg->repop_all_committed(it->second.repop.get());
9568 pg->log_entry_update_waiting_on.erase(it);
9569 }
9570 }
9571 pg->unlock();
9572 }
9573 };
9574 t.register_on_commit(
9575 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9576 } else {
9577 if (on_complete) {
9578 struct OnComplete : public Context {
9579 PrimaryLogPGRef pg;
9580 std::function<void(void)> on_complete;
9581 epoch_t epoch;
9582 OnComplete(
9583 PrimaryLogPGRef pg,
9584 const std::function<void(void)> &on_complete,
9585 epoch_t epoch)
9586 : pg(pg),
9587 on_complete(std::move(on_complete)),
9588 epoch(epoch) {}
9589 void finish(int) override {
9590 pg->lock();
9591 if (!pg->pg_has_reset_since(epoch))
9592 on_complete();
9593 pg->unlock();
9594 }
9595 };
9596 t.register_on_complete(
9597 new OnComplete{
9598 this, *on_complete, get_osdmap()->get_epoch()
9599 });
9600 }
9601 }
9602 t.register_on_applied(
9603 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9604 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9605 assert(r == 0);
9606 });
9607 }
9608
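/*
 * submit_log_entries() protocol summary: on clusters with
 * require_osd_release >= jewel, each peer receives an
 * MOSDPGUpdateLogMissing, and the repop completes only once every peer
 * plus the local commit have acked (see do_update_log_missing_reply()).
 * On older clusters the entries are simply broadcast as an MOSDPGLog
 * tail/head segment with no per-peer ack tracking.
 */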
9609 void PrimaryLogPG::cancel_log_updates()
9610 {
9611 // get rid of all the LogUpdateCtx so their references to repops are
9612 // dropped
9613 log_entry_update_waiting_on.clear();
9614 }
9615
9616 // -------------------------------------------------------
9617
9618 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9619 {
9620 pair<hobject_t, ObjectContextRef> i;
9621 while (object_contexts.get_next(i.first, &i)) {
9622 ObjectContextRef obc(i.second);
9623 get_obc_watchers(obc, pg_watchers);
9624 }
9625 }
9626
9627 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9628 {
9629 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9630 obc->watchers.begin();
9631 j != obc->watchers.end();
9632 ++j) {
9633 obj_watch_item_t owi;
9634
9635 owi.obj = obc->obs.oi.soid;
9636 owi.wi.addr = j->second->get_peer_addr();
9637 owi.wi.name = j->second->get_entity();
9638 owi.wi.cookie = j->second->get_cookie();
9639 owi.wi.timeout_seconds = j->second->get_timeout();
9640
9641 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9642 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9643
9644 pg_watchers.push_back(owi);
9645 }
9646 }
9647
9648 void PrimaryLogPG::check_blacklisted_watchers()
9649 {
9650 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9651 pair<hobject_t, ObjectContextRef> i;
9652 while (object_contexts.get_next(i.first, &i))
9653 check_blacklisted_obc_watchers(i.second);
9654 }
9655
9656 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9657 {
9658 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9659 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9660 obc->watchers.begin();
9661 k != obc->watchers.end();
9662 ) {
9663 // Advance the iterator now so handle_watch_timeout() can erase the element
9664 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9665 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9666 entity_addr_t ea = j->second->get_peer_addr();
9667 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9668 if (get_osdmap()->is_blacklisted(ea)) {
9669 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9670 assert(j->second->get_pg() == this);
9671 j->second->unregister_cb();
9672 handle_watch_timeout(j->second);
9673 }
9674 }
9675 }
9676
9677 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9678 {
9679 assert(is_active());
9680 assert((recovering.count(obc->obs.oi.soid) ||
9681 !is_missing_object(obc->obs.oi.soid)) ||
9682 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9683 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9684 pg_log_entry_t::LOST_REVERT &&
9685 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9686 obc->obs.oi.version));
9687
9688 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9689 assert(obc->watchers.empty());
9690 // populate unconnected_watchers
9691 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9692 obc->obs.oi.watchers.begin();
9693 p != obc->obs.oi.watchers.end();
9694 ++p) {
9695 utime_t expire = info.stats.last_became_active;
9696 expire += p->second.timeout_seconds;
9697 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9698 WatchRef watch(
9699 Watch::makeWatchRef(
9700 this, osd, obc, p->second.timeout_seconds, p->first.first,
9701 p->first.second, p->second.addr));
9702 watch->disconnect();
9703 obc->watchers.insert(
9704 make_pair(
9705 make_pair(p->first.first, p->first.second),
9706 watch));
9707 }
9708 // Look for watchers from blacklisted clients and drop them
9709 check_blacklisted_obc_watchers(obc);
9710 }
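// An unconnected watcher registered above expires at
//   info.stats.last_became_active + watch_info_t::timeout_seconds;
// if the client has not reconnected by then, handle_watch_timeout()
// below removes the watch via an internally generated MODIFY op.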
9711
9712 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9713 {
9714 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9715 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9716
9717 if (!is_active()) {
9718 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9719 return;
9720 }
9721 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9722 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9723 watch->get_delayed_cb()
9724 );
9725 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9726 << obc->obs.oi.soid
9727 << dendl;
9728 return;
9729 }
9730
9731 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9732 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9733 << obc->obs.oi.soid
9734 << dendl;
9735 scrubber.add_callback(
9736 watch->get_delayed_cb() // re-runs handle_watch_timeout() once scrub completes
9737 );
9738 return;
9739 }
9740
9741 OpContextUPtr ctx = simple_opc_create(obc);
9742 ctx->at_version = get_next_version();
9743
9744 object_info_t& oi = ctx->new_obs.oi;
9745 oi.watchers.erase(make_pair(watch->get_cookie(),
9746 watch->get_entity()));
9747
9748 list<watch_disconnect_t> watch_disconnects = {
9749 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9750 };
9751 ctx->register_on_success(
9752 [this, obc, watch_disconnects]() {
9753 complete_disconnect_watches(obc, watch_disconnects);
9754 });
9755
9756
9757 PGTransaction *t = ctx->op_t.get();
9758 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9759 ctx->at_version,
9760 oi.version,
9761 0,
9762 osd_reqid_t(), ctx->mtime, 0));
9763
9764 oi.prior_version = obc->obs.oi.version;
9765 oi.version = ctx->at_version;
9766 bufferlist bl;
9767 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9768 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9769
9770 // apply new object state.
9771 ctx->obc->obs = ctx->new_obs;
9772
9773 // no ctx->delta_stats
9774 simple_opc_submit(std::move(ctx));
9775 }
9776
9777 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9778 SnapSetContext *ssc)
9779 {
9780 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9781 assert(obc->destructor_callback == NULL);
9782 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9783 obc->obs.oi = oi;
9784 obc->obs.exists = false;
9785 obc->ssc = ssc;
9786 if (ssc)
9787 register_snapset_context(ssc);
9788 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9789 if (is_active())
9790 populate_obc_watchers(obc);
9791 return obc;
9792 }
9793
9794 ObjectContextRef PrimaryLogPG::get_object_context(
9795 const hobject_t& soid,
9796 bool can_create,
9797 const map<string, bufferlist> *attrs)
9798 {
9799 assert(
9800 attrs || !pg_log.get_missing().is_missing(soid) ||
9801 // or this is a revert... see recover_primary()
9802 (pg_log.get_log().objects.count(soid) &&
9803 pg_log.get_log().objects.find(soid)->second->op ==
9804 pg_log_entry_t::LOST_REVERT));
9805 ObjectContextRef obc = object_contexts.lookup(soid);
9806 osd->logger->inc(l_osd_object_ctx_cache_total);
9807 if (obc) {
9808 osd->logger->inc(l_osd_object_ctx_cache_hit);
9809 dout(10) << __func__ << ": found obc in cache: " << obc
9810 << dendl;
9811 } else {
9812 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9813 // check disk
9814 bufferlist bv;
9815 if (attrs) {
9816 assert(attrs->count(OI_ATTR));
9817 bv = attrs->find(OI_ATTR)->second;
9818 } else {
9819 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9820 if (r < 0) {
9821 if (!can_create) {
9822 dout(10) << __func__ << ": no obc for soid "
9823 << soid << " and !can_create"
9824 << dendl;
9825 return ObjectContextRef(); // -ENOENT!
9826 }
9827
9828 dout(10) << __func__ << ": no obc for soid "
9829 << soid << " but can_create"
9830 << dendl;
9831 // new object.
9832 object_info_t oi(soid);
9833 SnapSetContext *ssc = get_snapset_context(
9834 soid, true, 0, false);
9835 assert(ssc);
9836 obc = create_object_context(oi, ssc);
9837 dout(10) << __func__ << ": " << obc << " " << soid
9838 << " " << obc->rwstate
9839 << " oi: " << obc->obs.oi
9840 << " ssc: " << obc->ssc
9841 << " snapset: " << obc->ssc->snapset << dendl;
9842 return obc;
9843 }
9844 }
9845
9846 object_info_t oi;
9847 try {
9848 bufferlist::iterator bliter = bv.begin();
9849 ::decode(oi, bliter);
9850 } catch (...) {
9851 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9852 return ObjectContextRef(); // -ENOENT!
9853 }
9854
9855 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9856
9857 obc = object_contexts.lookup_or_create(oi.soid);
9858 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9859 obc->obs.oi = oi;
9860 obc->obs.exists = true;
9861
9862 obc->ssc = get_snapset_context(
9863 soid, true,
9864 soid.has_snapset() ? attrs : 0);
9865
9866 if (is_active())
9867 populate_obc_watchers(obc);
9868
9869 if (pool.info.require_rollback()) {
9870 if (attrs) {
9871 obc->attr_cache = *attrs;
9872 } else {
9873 int r = pgbackend->objects_get_attrs(
9874 soid,
9875 &obc->attr_cache);
9876 assert(r == 0);
9877 }
9878 }
9879
9880 dout(10) << __func__ << ": creating obc from disk: " << obc
9881 << dendl;
9882 }
9883
9884 // XXX: Caller doesn't expect this
9885 if (obc->ssc == NULL) {
9886 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9887 return ObjectContextRef(); // -ENOENT!
9888 }
9889
9890 dout(10) << __func__ << ": " << obc << " " << soid
9891 << " " << obc->rwstate
9892 << " oi: " << obc->obs.oi
9893 << " exists: " << (int)obc->obs.exists
9894 << " ssc: " << obc->ssc
9895 << " snapset: " << obc->ssc->snapset << dendl;
9896 return obc;
9897 }
9898
9899 void PrimaryLogPG::context_registry_on_change()
9900 {
9901 pair<hobject_t, ObjectContextRef> i;
9902 while (object_contexts.get_next(i.first, &i)) {
9903 ObjectContextRef obc(i.second);
9904 if (obc) {
9905 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9906 obc->watchers.begin();
9907 j != obc->watchers.end();
9908 obc->watchers.erase(j++)) {
9909 j->second->discard();
9910 }
9911 }
9912 }
9913 }
9914
9915
9916 /*
9917 * If we return an error, and set *pmissing, then promoting that
9918 * object may help.
9919 *
9920 * If we return -EAGAIN, we will always set *pmissing to the missing
9921 * object to wait for.
9922 *
9923 * If we return an error but do not set *pmissing, then we know the
9924 * object does not exist.
9925 */
9926 int PrimaryLogPG::find_object_context(const hobject_t& oid,
9927 ObjectContextRef *pobc,
9928 bool can_create,
9929 bool map_snapid_to_clone,
9930 hobject_t *pmissing)
9931 {
9932 FUNCTRACE();
9933 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9934 // want the head?
9935 if (oid.snap == CEPH_NOSNAP) {
9936 ObjectContextRef obc = get_object_context(oid, can_create);
9937 if (!obc) {
9938 if (pmissing)
9939 *pmissing = oid;
9940 return -ENOENT;
9941 }
9942 dout(10) << "find_object_context " << oid
9943 << " @" << oid.snap
9944 << " oi=" << obc->obs.oi
9945 << dendl;
9946 *pobc = obc;
9947
9948 return 0;
9949 }
9950
9951 hobject_t head = oid.get_head();
9952
9953 // want the snapdir?
9954 if (oid.snap == CEPH_SNAPDIR) {
9955 // return head or snapdir, whichever exists.
9956 ObjectContextRef headobc = get_object_context(head, can_create);
9957 ObjectContextRef obc = headobc;
9958 if (!obc || !obc->obs.exists)
9959 obc = get_object_context(oid, can_create);
9960 if (!obc || !obc->obs.exists) {
9961 // if we have neither, we would want to promote the head.
9962 if (pmissing)
9963 *pmissing = head;
9964 if (pobc)
9965 *pobc = headobc; // may be null
9966 return -ENOENT;
9967 }
9968 dout(10) << "find_object_context " << oid
9969 << " @" << oid.snap
9970 << " oi=" << obc->obs.oi
9971 << dendl;
9972 *pobc = obc;
9973
9974 // always populate ssc for SNAPDIR...
9975 if (!obc->ssc)
9976 obc->ssc = get_snapset_context(
9977 oid, true);
9978 return 0;
9979 }
9980
9981 // we want a snap
9982 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9983 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9984 return -ENOENT;
9985 }
9986
9987 SnapSetContext *ssc = get_snapset_context(oid, can_create);
9988 if (!ssc || !(ssc->exists || can_create)) {
9989 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9990 if (pmissing)
9991 *pmissing = head; // start by getting the head
9992 if (ssc)
9993 put_snapset_context(ssc);
9994 return -ENOENT;
9995 }
9996
9997 if (map_snapid_to_clone) {
9998 dout(10) << "find_object_context " << oid << " @" << oid.snap
9999 << " snapset " << ssc->snapset
10000 << " map_snapid_to_clone=true" << dendl;
10001 if (oid.snap > ssc->snapset.seq) {
10002 // the head must already be readable
10003 ObjectContextRef obc = get_object_context(head, false);
10004 dout(10) << "find_object_context " << oid << " @" << oid.snap
10005 << " snapset " << ssc->snapset
10006 << " maps to head" << dendl;
10007 *pobc = obc;
10008 put_snapset_context(ssc);
10009 return (obc && obc->obs.exists) ? 0 : -ENOENT;
10010 } else {
10011 vector<snapid_t>::const_iterator citer = std::find(
10012 ssc->snapset.clones.begin(),
10013 ssc->snapset.clones.end(),
10014 oid.snap);
10015 if (citer == ssc->snapset.clones.end()) {
10016 dout(10) << "find_object_context " << oid << " @" << oid.snap
10017 << " snapset " << ssc->snapset
10018 << " maps to nothing" << dendl;
10019 put_snapset_context(ssc);
10020 return -ENOENT;
10021 }
10022
10023 dout(10) << "find_object_context " << oid << " @" << oid.snap
10024 << " snapset " << ssc->snapset
10025 << " maps to " << oid << dendl;
10026
10027 if (pg_log.get_missing().is_missing(oid)) {
10028 dout(10) << "find_object_context " << oid << " @" << oid.snap
10029 << " snapset " << ssc->snapset
10030 << " " << oid << " is missing" << dendl;
10031 if (pmissing)
10032 *pmissing = oid;
10033 put_snapset_context(ssc);
10034 return -EAGAIN;
10035 }
10036
10037 ObjectContextRef obc = get_object_context(oid, false);
10038 if (!obc || !obc->obs.exists) {
10039 dout(10) << "find_object_context " << oid << " @" << oid.snap
10040 << " snapset " << ssc->snapset
10041 << " " << oid << " is not present" << dendl;
10042 if (pmissing)
10043 *pmissing = oid;
10044 put_snapset_context(ssc);
10045 return -ENOENT;
10046 }
10047 dout(10) << "find_object_context " << oid << " @" << oid.snap
10048 << " snapset " << ssc->snapset
10049 << " " << oid << " HIT" << dendl;
10050 *pobc = obc;
10051 put_snapset_context(ssc);
10052 return 0;
10053 }
10054 ceph_abort(); //unreachable
10055 }
10056
10057 dout(10) << "find_object_context " << oid << " @" << oid.snap
10058 << " snapset " << ssc->snapset << dendl;
10059
10060 // head?
10061 if (oid.snap > ssc->snapset.seq) {
10062 if (ssc->snapset.head_exists) {
10063 ObjectContextRef obc = get_object_context(head, false);
10064 dout(10) << "find_object_context " << head
10065 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10066 << " -- HIT " << obc->obs
10067 << dendl;
10068 if (!obc->ssc)
10069 obc->ssc = ssc;
10070 else {
10071 assert(ssc == obc->ssc);
10072 put_snapset_context(ssc);
10073 }
10074 *pobc = obc;
10075 return 0;
10076 }
10077 dout(10) << "find_object_context " << head
10078 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10079 << " but head dne -- DNE"
10080 << dendl;
10081 put_snapset_context(ssc);
10082 return -ENOENT;
10083 }
10084
10085 // which clone would it be?
10086 unsigned k = 0;
10087 while (k < ssc->snapset.clones.size() &&
10088 ssc->snapset.clones[k] < oid.snap)
10089 k++;
10090 if (k == ssc->snapset.clones.size()) {
10091 dout(10) << "find_object_context no clones with last >= oid.snap "
10092 << oid.snap << " -- DNE" << dendl;
10093 put_snapset_context(ssc);
10094 return -ENOENT;
10095 }
10096 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10097 info.pgid.pool(), oid.get_namespace());
10098
10099 if (pg_log.get_missing().is_missing(soid)) {
10100 dout(20) << "find_object_context " << soid << " missing, try again later"
10101 << dendl;
10102 if (pmissing)
10103 *pmissing = soid;
10104 put_snapset_context(ssc);
10105 return -EAGAIN;
10106 }
10107
10108 ObjectContextRef obc = get_object_context(soid, false);
10109 if (!obc || !obc->obs.exists) {
10110 if (pmissing)
10111 *pmissing = soid;
10112 put_snapset_context(ssc);
10113 if (is_degraded_or_backfilling_object(soid)) {
10114 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10115 return -EAGAIN;
10116 } else {
10117 dout(20) << __func__ << " missing clone " << soid << dendl;
10118 return -ENOENT;
10119 }
10120 }
10121
10122 if (!obc->ssc) {
10123 obc->ssc = ssc;
10124 } else {
10125 assert(obc->ssc == ssc);
10126 put_snapset_context(ssc);
10127 }
10128 ssc = 0;
10129
10130 // clone
10131 dout(20) << "find_object_context " << soid
10132 << " snapset " << obc->ssc->snapset
10133 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10134 << dendl;
10135 snapid_t first, last;
10136 if (obc->ssc->snapset.is_legacy()) {
10137 first = obc->obs.oi.legacy_snaps.back();
10138 last = obc->obs.oi.legacy_snaps.front();
10139 } else {
10140 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10141 assert(p != obc->ssc->snapset.clone_snaps.end());
10142 first = p->second.back();
10143 last = p->second.front();
10144 }
10145 if (first <= oid.snap) {
10146 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10147 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10148 *pobc = obc;
10149 return 0;
10150 } else {
10151 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10152 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10153 return -ENOENT;
10154 }
10155 }
10156
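// Caller-side sketch (illustrative) of the contract documented above
// find_object_context():
//
//   hobject_t missing;
//   ObjectContextRef obc;
//   int r = find_object_context(oid, &obc, false, false, &missing);
//   if (r == -EAGAIN) {
//     // missing is always set: wait for it to recover, then retry
//   } else if (r < 0 && missing != hobject_t()) {
//     // promoting 'missing' may help
//   } else if (r < 0) {
//     // the object definitely does not exist
//   }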
10157 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10158 {
10159 if (obc->ssc)
10160 put_snapset_context(obc->ssc);
10161 }
10162
10163 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10164 {
10165 object_info_t& oi = obc->obs.oi;
10166
10167 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10168 object_stat_sum_t stat;
10169
10170 stat.num_bytes += oi.size;
10171
10172 if (oi.soid.snap != CEPH_SNAPDIR)
10173 stat.num_objects++;
10174 if (oi.is_dirty())
10175 stat.num_objects_dirty++;
10176 if (oi.is_whiteout())
10177 stat.num_whiteouts++;
10178 if (oi.is_omap())
10179 stat.num_objects_omap++;
10180 if (oi.is_cache_pinned())
10181 stat.num_objects_pinned++;
10182
10183 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10184 stat.num_object_clones++;
10185
10186 if (!obc->ssc)
10187 obc->ssc = get_snapset_context(oi.soid, false);
10188 assert(obc->ssc);
10189
10190 // subtract off clone overlap
10191 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10192 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10193 for (interval_set<uint64_t>::const_iterator r = o.begin();
10194 r != o.end();
10195 ++r) {
10196 stat.num_bytes -= r.get_len();
10197 }
10198 }
10199 }
10200
10201 // add it in
10202 pgstat->stats.sum.add(stat);
10203 }
10204
10205 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10206 {
10207 const hobject_t& soid = obc->obs.oi.soid;
10208 if (obc->is_blocked()) {
10209 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10210 return;
10211 }
10212
10213 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10214 if (p != waiting_for_blocked_object.end()) {
10215 list<OpRequestRef>& ls = p->second;
10216 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10217 requeue_ops(ls);
10218 waiting_for_blocked_object.erase(p);
10219 }
10220
10221 map<hobject_t, ObjectContextRef>::iterator i =
10222 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10223 if (i != objects_blocked_on_snap_promotion.end()) {
10224 assert(i->second == obc);
10225 objects_blocked_on_snap_promotion.erase(i);
10226 }
10227
10228 if (obc->requeue_scrub_on_unblock) {
10229 obc->requeue_scrub_on_unblock = false;
10230 requeue_scrub();
10231 }
10232 }
10233
10234 SnapSetContext *PrimaryLogPG::get_snapset_context(
10235 const hobject_t& oid,
10236 bool can_create,
10237 const map<string, bufferlist> *attrs,
10238 bool oid_existed)
10239 {
10240 Mutex::Locker l(snapset_contexts_lock);
10241 SnapSetContext *ssc;
10242 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10243 oid.get_snapdir());
10244 if (p != snapset_contexts.end()) {
10245 if (can_create || p->second->exists) {
10246 ssc = p->second;
10247 } else {
10248 return NULL;
10249 }
10250 } else {
10251 bufferlist bv;
10252 if (!attrs) {
10253 int r = -ENOENT;
10254 if (!(oid.is_head() && !oid_existed))
10255 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10256 if (r < 0) {
10257 // no SS_ATTR on the head; try the snapdir object
10258 if (!(oid.is_snapdir() && !oid_existed))
10259 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10260 if (r < 0 && !can_create)
10261 return NULL;
10262 }
10263 } else {
10264 assert(attrs->count(SS_ATTR));
10265 bv = attrs->find(SS_ATTR)->second;
10266 }
10267 ssc = new SnapSetContext(oid.get_snapdir());
10268 _register_snapset_context(ssc);
10269 if (bv.length()) {
10270 bufferlist::iterator bvp = bv.begin();
10271 try {
10272 ssc->snapset.decode(bvp);
10273 } catch (buffer::error& e) {
10274 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10275 return NULL;
10276 }
10277 ssc->exists = true;
10278 } else {
10279 ssc->exists = false;
10280 }
10281 }
10282 assert(ssc);
10283 ssc->ref++;
10284 return ssc;
10285 }
10286
10287 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10288 {
10289 Mutex::Locker l(snapset_contexts_lock);
10290 --ssc->ref;
10291 if (ssc->ref == 0) {
10292 if (ssc->registered)
10293 snapset_contexts.erase(ssc->oid);
10294 delete ssc;
10295 }
10296 }
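// SnapSetContext is manually refcounted: every get_snapset_context()
// must be balanced by a put_snapset_context(), unless the reference is
// handed over to an ObjectContext (obc->ssc), in which case
// object_context_destructor_callback() releases it when the obc dies.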
10297
10298 /** pull - request object from a peer
10299 */
10300
10301 /*
10302 * Return values:
10303 * NONE - didn't pull anything
10304 * YES - pulled what the caller wanted
10305 * OTHER - needed to pull something else first (_head or _snapdir)
10306 */
10307 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10308
10309 int PrimaryLogPG::recover_missing(
10310 const hobject_t &soid, eversion_t v,
10311 int priority,
10312 PGBackend::RecoveryHandle *h)
10313 {
10314 if (missing_loc.is_unfound(soid)) {
10315 dout(7) << "pull " << soid
10316 << " v " << v
10317 << " but it is unfound" << dendl;
10318 return PULL_NONE;
10319 }
10320
10321 if (missing_loc.is_deleted(soid)) {
10322 start_recovery_op(soid);
10323 assert(!recovering.count(soid));
10324 recovering.insert(make_pair(soid, ObjectContextRef()));
10325 epoch_t cur_epoch = get_osdmap()->get_epoch();
10326 remove_missing_object(soid, v, new FunctionContext(
10327 [=](int) {
10328 lock();
10329 if (!pg_has_reset_since(cur_epoch)) {
10330 bool object_missing = false;
10331 for (const auto& shard : actingbackfill) {
10332 if (shard == pg_whoami)
10333 continue;
10334 if (peer_missing[shard].is_missing(soid)) {
10335 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10336 object_missing = true;
10337 break;
10338 }
10339 }
10340 if (!object_missing) {
10341 object_stat_sum_t stat_diff;
10342 stat_diff.num_objects_recovered = 1;
10343 on_global_recover(soid, stat_diff, true);
10344 } else {
10345 auto recovery_handle = pgbackend->open_recovery_op();
10346 pgbackend->recover_delete_object(soid, v, recovery_handle);
10347 pgbackend->run_recovery_op(recovery_handle, priority);
10348 }
10349 }
10350 unlock();
10351 }));
10352 return PULL_YES;
10353 }
10354
10355 // is this a snapped object? if so, consult the snapset; we may not need the entire object!
10356 ObjectContextRef obc;
10357 ObjectContextRef head_obc;
10358 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10359 // do we have the head and/or snapdir?
10360 hobject_t head = soid.get_head();
10361 if (pg_log.get_missing().is_missing(head)) {
10362 if (recovering.count(head)) {
10363 dout(10) << " missing but already recovering head " << head << dendl;
10364 return PULL_NONE;
10365 } else {
10366 int r = recover_missing(
10367 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10368 h);
10369 if (r != PULL_NONE)
10370 return PULL_OTHER;
10371 return PULL_NONE;
10372 }
10373 }
10374 head = soid.get_snapdir();
10375 if (pg_log.get_missing().is_missing(head)) {
10376 if (recovering.count(head)) {
10377 dout(10) << " missing but already recovering snapdir " << head << dendl;
10378 return PULL_NONE;
10379 } else {
10380 int r = recover_missing(
10381 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10382 h);
10383 if (r != PULL_NONE)
10384 return PULL_OTHER;
10385 return PULL_NONE;
10386 }
10387 }
10388
10389 // we must have one or the other
10390 head_obc = get_object_context(
10391 soid.get_head(),
10392 false,
10393 0);
10394 if (!head_obc)
10395 head_obc = get_object_context(
10396 soid.get_snapdir(),
10397 false,
10398 0);
10399 assert(head_obc);
10400 }
10401 start_recovery_op(soid);
10402 assert(!recovering.count(soid));
10403 recovering.insert(make_pair(soid, obc));
10404 int r = pgbackend->recover_object(
10405 soid,
10406 v,
10407 head_obc,
10408 obc,
10409 h);
10410 // This is only a pull, which shouldn't return an error
10411 assert(r >= 0);
10412 return PULL_YES;
10413 }
10414
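// Caller-side sketch (illustrative) of the PULL_* contract above:
//
//   switch (recover_missing(soid, need, priority, h)) {
//   case PULL_YES:   ++started; break;  // a pull for soid itself was queued
//   case PULL_OTHER: ++started; break;  // pulled head/snapdir first; soid later
//   case PULL_NONE:  break;             // unfound, or already recovering
//   }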
10415 void PrimaryLogPG::send_remove_op(
10416 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10417 {
10418 ceph_tid_t tid = osd->get_tid();
10419 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10420
10421 dout(10) << "send_remove_op " << oid << " from osd." << peer
10422 << " tid " << tid << dendl;
10423
10424 MOSDSubOp *subop = new MOSDSubOp(
10425 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10426 oid, CEPH_OSD_FLAG_ACK,
10427 get_osdmap()->get_epoch(), tid, v);
10428 subop->ops = vector<OSDOp>(1);
10429 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10430
10431 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10432 }
10433
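/*
 * Remove a missing (deleted) object locally.  Two chained transactions
 * are used: the first removes the object and its snap mapping; its
 * completion callback re-takes the PG lock and, if the PG has not been
 * reset since, queues a second transaction that records the local
 * recovery and finally fires on_complete.  If the PG has reset,
 * on_complete fires with -EAGAIN instead.
 */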
10434 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10435 eversion_t v, Context *on_complete)
10436 {
10437 dout(20) << __func__ << " " << soid << " " << v << dendl;
10438 assert(on_complete != nullptr);
10439 // delete locally
10440 ObjectStore::Transaction t;
10441 remove_snap_mapped_object(t, soid);
10442
10443 ObjectRecoveryInfo recovery_info;
10444 recovery_info.soid = soid;
10445 recovery_info.version = v;
10446
10447 epoch_t cur_epoch = get_osdmap()->get_epoch();
10448 t.register_on_complete(new FunctionContext(
10449 [=](int) {
10450 lock();
10451 if (!pg_has_reset_since(cur_epoch)) {
10452 ObjectStore::Transaction t2;
10453 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10454 t2.register_on_complete(on_complete);
10455 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10456 assert(r == 0);
10457 unlock();
10458 } else {
10459 unlock();
10460 on_complete->complete(-EAGAIN);
10461 }
10462 }));
10463 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10464 assert(r == 0);
10465 }
10466
10467 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10468 {
10469 dout(10) << "finish_degraded_object " << oid << dendl;
10470 if (callbacks_for_degraded_object.count(oid)) {
10471 list<Context*> contexts;
10472 contexts.swap(callbacks_for_degraded_object[oid]);
10473 callbacks_for_degraded_object.erase(oid);
10474 for (list<Context*>::iterator i = contexts.begin();
10475 i != contexts.end();
10476 ++i) {
10477 (*i)->complete(0);
10478 }
10479 }
10480 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10481 oid.get_head());
10482 if (i != objects_blocked_on_degraded_snap.end() &&
10483 i->second == oid.snap)
10484 objects_blocked_on_degraded_snap.erase(i);
10485 }
10486
10487 void PrimaryLogPG::_committed_pushed_object(
10488 epoch_t epoch, eversion_t last_complete)
10489 {
10490 lock();
10491 if (!pg_has_reset_since(epoch)) {
10492 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10493 last_complete_ondisk = last_complete;
10494
10495 if (last_complete_ondisk == info.last_update) {
10496 if (!is_primary()) {
10497 // We are either a replica or a backfill target,
10498 // and we are fully up to date: tell the primary!
10499 osd->send_message_osd_cluster(
10500 get_primary().osd,
10501 new MOSDPGTrim(
10502 get_osdmap()->get_epoch(),
10503 spg_t(info.pgid.pgid, get_primary().shard),
10504 last_complete_ondisk),
10505 get_osdmap()->get_epoch());
10506 } else {
10507 calc_min_last_complete_ondisk();
10508 }
10509 }
10510
10511 } else {
10512 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10513 }
10514
10515 unlock();
10516 }
10517
10518 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10519 {
10520 lock();
10521 dout(20) << __func__ << dendl;
10522 if (obc) {
10523 dout(20) << "obc = " << *obc << dendl;
10524 }
10525 assert(active_pushes >= 1);
10526 --active_pushes;
10527
10528 // requeue an active chunky scrub waiting on recovery ops
10529 if (!deleting && active_pushes == 0
10530 && scrubber.is_chunky_scrub_active()) {
10531 if (ops_blocked_by_scrub()) {
10532 requeue_scrub(true);
10533 } else {
10534 requeue_scrub(false);
10535 }
10536 }
10537 unlock();
10538 }
10539
10540 void PrimaryLogPG::_applied_recovered_object_replica()
10541 {
10542 lock();
10543 dout(20) << __func__ << dendl;
10544 assert(active_pushes >= 1);
10545 --active_pushes;
10546
10547 // requeue an active chunky scrub waiting on recovery ops
10548 if (!deleting && active_pushes == 0 &&
10549 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10550 scrubber.active_rep_scrub->get_req())->chunky) {
10551 osd->enqueue_back(
10552 info.pgid,
10553 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10554 scrubber.active_rep_scrub = OpRequestRef();
10555 }
10556 unlock();
10557 }
10558
10559 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10560 {
10561 dout(10) << "got missing " << oid << " v " << v << dendl;
10562 pg_log.recover_got(oid, v, info);
10563 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10564 dout(10) << "last_complete now " << info.last_complete
10565 << " log.complete_to " << pg_log.get_log().complete_to->version
10566 << dendl;
10567 } else {
10568 dout(10) << "last_complete now " << info.last_complete
10569 << " log.complete_to at end" << dendl;
10570 // the assert below does not hold in the repair case:
10571 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10572 assert(info.last_complete == info.last_update);
10573 }
10574 }
10575
10576 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10577 {
10578 list<pg_shard_t> fl = { pg_whoami };
10579 failed_push(fl, soid);
10580 }
10581
10582 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10583 {
10584 dout(20) << __func__ << ": " << soid << dendl;
10585 assert(recovering.count(soid));
10586 auto obc = recovering[soid];
10587 if (obc) {
10588 list<OpRequestRef> blocked_ops;
10589 obc->drop_recovery_read(&blocked_ops);
10590 requeue_ops(blocked_ops);
10591 }
10592 recovering.erase(soid);
10593 for (auto&& i : from)
10594 missing_loc.remove_location(soid, i);
10595 dout(0) << __func__ << " " << soid << " from shard " << from
10596 << ", reps on " << missing_loc.get_locations(soid)
10597 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10598 finish_recovery_op(soid); // close out this attempt
10599 }
10600
10601 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10602 {
10603 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10604 assert(m->get_type() == MSG_OSD_SUBOP);
10605 dout(7) << "sub_op_remove " << m->poid << dendl;
10606
10607 op->mark_started();
10608
10609 ObjectStore::Transaction t;
10610 remove_snap_mapped_object(t, m->poid);
10611 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10612 assert(r == 0);
10613 }
10614
10615 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10616 {
10617 eversion_t v;
10618 pg_missing_item pmi;
10619 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10620 assert(is_missing);
10621 v = pmi.have;
10622 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10623
10624 assert(!actingbackfill.empty());
10625 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10626 i != actingbackfill.end();
10627 ++i) {
10628 if (*i == get_primary()) continue;
10629 pg_shard_t peer = *i;
10630 if (!peer_missing[peer].is_missing(oid)) {
10631 continue;
10632 }
10633 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10634 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10635 if (h > v)
10636 v = h;
10637 }
10638
10639 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10640 return v;
10641 }
10642
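// Worked example (illustrative): if the local have version for oid is
// 10'5 and two peers report have 10'7 and 10'3, this returns 10'7 --
// the newest version some shard still has, which is what LOST_REVERT
// reverts to.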
10643 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10644 {
10645 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10646 op->get_req());
10647 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10648 ObjectStore::Transaction t;
10649 append_log_entries_update_missing(m->entries, t);
10650
10651 Context *complete = new FunctionContext(
10652 [=](int) {
10653 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10654 op->get_req());
10655 lock();
10656 if (!pg_has_reset_since(msg->get_epoch())) {
10657 MOSDPGUpdateLogMissingReply *reply =
10658 new MOSDPGUpdateLogMissingReply(
10659 spg_t(info.pgid.pgid, primary_shard().shard),
10660 pg_whoami.shard,
10661 msg->get_epoch(),
10662 msg->min_epoch,
10663 msg->get_tid());
10664 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10665 msg->get_connection()->send_message(reply);
10666 }
10667 unlock();
10668 });
10669
10670 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10671 t.register_on_commit(complete);
10672 } else {
10673 /* Hack to work around the fact that ReplicatedBackend sends
10674 * ack+commit if commit happens first
10675 *
10676 * This behavior is no longer necessary, but we preserve it so old
10677 * primaries can keep their repops in order */
10678 if (pool.info.ec_pool()) {
10679 t.register_on_complete(complete);
10680 } else {
10681 t.register_on_commit(complete);
10682 }
10683 }
10684 t.register_on_applied(
10685 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10686 int tr = osd->store->queue_transaction(
10687 osr.get(),
10688 std::move(t),
10689 nullptr);
10690 assert(tr == 0);
10691 }
10692
10693 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10694 {
10695 const MOSDPGUpdateLogMissingReply *m =
10696 static_cast<const MOSDPGUpdateLogMissingReply*>(
10697 op->get_req());
10698 dout(20) << __func__ << " got reply from "
10699 << m->get_from() << dendl;
10700
10701 auto it = log_entry_update_waiting_on.find(m->get_tid());
10702 if (it != log_entry_update_waiting_on.end()) {
10703 if (it->second.waiting_on.count(m->get_from())) {
10704 it->second.waiting_on.erase(m->get_from());
10705 } else {
10706 osd->clog->error()
10707 << info.pgid << " got reply "
10708 << *m << " from shard we are not waiting for "
10709 << m->get_from();
10710 }
10711
10712 if (it->second.waiting_on.empty()) {
10713 repop_all_committed(it->second.repop.get());
10714 log_entry_update_waiting_on.erase(it);
10715 }
10716 } else {
10717 osd->clog->error()
10718 << info.pgid << " got reply "
10719 << *m << " on unknown tid " << m->get_tid();
10720 }
10721 }
10722
10723 /* Mark all unfound objects as lost.
10724 */
10725 void PrimaryLogPG::mark_all_unfound_lost(
10726 int what,
10727 ConnectionRef con,
10728 ceph_tid_t tid)
10729 {
10730 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10731 list<hobject_t> oids;
10732
10733 dout(30) << __func__ << ": log before:\n";
10734 pg_log.get_log().print(*_dout);
10735 *_dout << dendl;
10736
10737 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10738
10739 utime_t mtime = ceph_clock_now();
10740 map<hobject_t, pg_missing_item>::const_iterator m =
10741 missing_loc.get_needs_recovery().begin();
10742 map<hobject_t, pg_missing_item>::const_iterator mend =
10743 missing_loc.get_needs_recovery().end();
10744
10745 ObcLockManager manager;
10746 eversion_t v = get_next_version();
10747 v.epoch = get_osdmap()->get_epoch();
10748 uint64_t num_unfound = missing_loc.num_unfound();
10749 while (m != mend) {
10750 const hobject_t &oid(m->first);
10751 if (!missing_loc.is_unfound(oid)) {
10752 // We only care about unfound objects
10753 ++m;
10754 continue;
10755 }
10756
10757 ObjectContextRef obc;
10758 eversion_t prev;
10759
10760 switch (what) {
10761 case pg_log_entry_t::LOST_MARK:
10762 assert(0 == "actually, not implemented yet!");
10763 break;
10764
10765 case pg_log_entry_t::LOST_REVERT:
10766 prev = pick_newest_available(oid);
10767 if (prev > eversion_t()) {
10768 // log it
10769 pg_log_entry_t e(
10770 pg_log_entry_t::LOST_REVERT, oid, v,
10771 m->second.need, 0, osd_reqid_t(), mtime, 0);
10772 e.reverting_to = prev;
10773 e.mark_unrollbackable();
10774 log_entries.push_back(e);
10775 dout(10) << e << dendl;
10776
10777 // we are now missing the new version; recovery code will sort it out.
10778 ++v.version;
10779 ++m;
10780 break;
10781 }
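      // fall through: no previous version to revert to, so delete instead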
10782
10783 case pg_log_entry_t::LOST_DELETE:
10784 {
10785 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10786 0, osd_reqid_t(), mtime, 0);
10787 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10788 if (pool.info.require_rollback()) {
10789 e.mod_desc.try_rmobject(v.version);
10790 } else {
10791 e.mark_unrollbackable();
10792 }
10793 } // otherwise, just do what we used to do
10794 dout(10) << e << dendl;
10795 log_entries.push_back(e);
10796 oids.push_back(oid);
10797
10798 // If a context is found, mark the object as deleted to
10799 // guard against racing with a new creation. This can happen
10800 // if the object was lost and the primary saw EIO.
10801 obc = object_contexts.lookup(oid);
10802 if (obc)
10803 obc->obs.exists = false;
10804
10805 ++v.version;
10806 ++m;
10807 }
10808 break;
10809
10810 default:
10811 ceph_abort();
10812 }
10813 }
10814
10815 info.stats.stats_invalid = true;
10816
10817 submit_log_entries(
10818 log_entries,
10819 std::move(manager),
10820 boost::optional<std::function<void(void)> >(
10821 [this, oids, con, num_unfound, tid]() {
10822 if (perform_deletes_during_peering()) {
10823 for (auto oid : oids) {
10824 // clear old locations - merge_new_log_entries will have
10825 // handled rebuilding missing_loc for each of these
10826 // objects if we have the RECOVERY_DELETES flag
10827 missing_loc.recovered(oid);
10828 }
10829 }
10830
10831 if (is_recovery_unfound()) {
10832 queue_peering_event(
10833 CephPeeringEvtRef(
10834 std::make_shared<CephPeeringEvt>(
10835 get_osdmap()->get_epoch(),
10836 get_osdmap()->get_epoch(),
10837 DoRecovery())));
10838 } else if (is_backfill_unfound()) {
10839 queue_peering_event(
10840 CephPeeringEvtRef(
10841 std::make_shared<CephPeeringEvt>(
10842 get_osdmap()->get_epoch(),
10843 get_osdmap()->get_epoch(),
10844 RequestBackfill())));
10845 } else {
10846 queue_recovery();
10847 }
10848
10849 stringstream ss;
10850 ss << "pg has " << num_unfound
10851 << " objects unfound and apparently lost marking";
10852 string rs = ss.str();
10853 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10854 osd->clog->info() << rs;
10855 if (con) {
10856 MCommandReply *reply = new MCommandReply(0, rs);
10857 reply->set_tid(tid);
10858 con->send_message(reply);
10859 }
10860 }),
10861 OpRequestRef());
10862 }
10863
10864 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10865 {
10866 assert(repop_queue.empty());
10867 }
10868
10869 /*
10870 * pg status change notification
10871 */
10872
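/*
 * Abort every in-flight RepGather. With requeue set, the associated
 * client ops (plus any dup ops waiting in waiting_for_ondisk) are
 * collected in order and requeued; otherwise they are dropped.
 */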
10873 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10874 {
10875 list<OpRequestRef> rq;
10876
10877 // apply all repops
10878 while (!repop_queue.empty()) {
10879 RepGather *repop = repop_queue.front();
10880 repop_queue.pop_front();
10881 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10882 repop->rep_aborted = true;
10883 repop->on_applied.clear();
10884 repop->on_committed.clear();
10885 repop->on_success.clear();
10886
10887 if (requeue) {
10888 if (repop->op) {
10889 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10890 rq.push_back(repop->op);
10891 repop->op = OpRequestRef();
10892 }
10893
10894 // also requeue any dups, interleaved into position
10895 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10896 waiting_for_ondisk.find(repop->v);
10897 if (p != waiting_for_ondisk.end()) {
10898 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10899 for (list<pair<OpRequestRef, version_t> >::iterator i =
10900 p->second.begin();
10901 i != p->second.end();
10902 ++i) {
10903 rq.push_back(i->first);
10904 }
10905 waiting_for_ondisk.erase(p);
10906 }
10907 }
10908
10909 remove_repop(repop);
10910 }
10911
10912 assert(repop_queue.empty());
10913
10914 if (requeue) {
10915 requeue_ops(rq);
10916 if (!waiting_for_ondisk.empty()) {
10917 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10918 waiting_for_ondisk.begin();
10919 i != waiting_for_ondisk.end();
10920 ++i) {
10921 for (list<pair<OpRequestRef, version_t> >::iterator j =
10922 i->second.begin();
10923 j != i->second.end();
10924 ++j) {
10925 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10926 << i->first << dendl;
10927 }
10928 }
10929 assert(waiting_for_ondisk.empty());
10930 }
10931 }
10932
10933 waiting_for_ondisk.clear();
10934 }
10935
10936 void PrimaryLogPG::on_flushed()
10937 {
10938 assert(flushes_in_progress > 0);
10939 flushes_in_progress--;
10940 if (flushes_in_progress == 0) {
10941 requeue_ops(waiting_for_flush);
10942 }
10943 if (!is_peered() || !is_primary()) {
10944 pair<hobject_t, ObjectContextRef> i;
10945 while (object_contexts.get_next(i.first, &i)) {
10946 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10947 }
10948 assert(object_contexts.empty());
10949 }
10950 pgbackend->on_flushed();
10951 }
10952
10953 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10954 {
10955 dout(10) << "on_removal" << dendl;
10956
10957 // adjust info to backfill
10958 info.set_last_backfill(hobject_t());
10959 pg_log.reset_backfill();
10960 dirty_info = true;
10961
10962
10963 // clear log
10964 PGLogEntryHandler rollbacker{this, t};
10965 pg_log.roll_forward(&rollbacker);
10966
10967 write_if_dirty(*t);
10968
10969 if (!deleting)
10970 on_shutdown();
10971 }
10972
10973 void PrimaryLogPG::clear_async_reads()
10974 {
10975 dout(10) << __func__ << dendl;
10976 for(auto& i : in_progress_async_reads) {
10977 dout(10) << "clear ctx: "
10978 << "OpRequestRef " << i.first
10979 << " OpContext " << i.second
10980 << dendl;
10981 close_op_ctx(i.second);
10982 }
10983 }
10984
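/*
 * Final teardown prior to PG removal: dequeue from OSD work queues,
 * cancel scrub and copy/flush/proxy ops, abort repops and pending log
 * updates, drop backoffs and recovery reservations, and clear primary
 * and recovery state.
 */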
10985 void PrimaryLogPG::on_shutdown()
10986 {
10987 dout(10) << "on_shutdown" << dendl;
10988
10989 // remove from queues
10990 osd->pg_stat_queue_dequeue(this);
10991 osd->peering_wq.dequeue(this);
10992
10993 // handles queue races
10994 deleting = true;
10995
10996 if (recovery_queued) {
10997 recovery_queued = false;
10998 osd->clear_queued_recovery(this);
10999 }
11000
11001 clear_scrub_reserved();
11002 scrub_clear_state();
11003
11004 unreg_next_scrub();
11005 cancel_copy_ops(false);
11006 cancel_flush_ops(false);
11007 cancel_proxy_ops(false);
11008 apply_and_flush_repops(false);
11009 cancel_log_updates();
11010	  // we must remove PGRefs, so do this prior to release_backoffs() calls
11011 clear_backoffs();
11012 // clean up snap trim references
11013 snap_trimmer_machine.process_event(Reset());
11014
11015 pgbackend->on_change();
11016
11017 context_registry_on_change();
11018 object_contexts.clear();
11019
11020 clear_async_reads();
11021
11022 osd->remote_reserver.cancel_reservation(info.pgid);
11023 osd->local_reserver.cancel_reservation(info.pgid);
11024
11025 clear_primary_state();
11026 cancel_recovery();
11027 }
11028
11029 void PrimaryLogPG::on_activate()
11030 {
11031 // all clean?
11032 if (needs_recovery()) {
11033 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11034 queue_peering_event(
11035 CephPeeringEvtRef(
11036 std::make_shared<CephPeeringEvt>(
11037 get_osdmap()->get_epoch(),
11038 get_osdmap()->get_epoch(),
11039 DoRecovery())));
11040 } else if (needs_backfill()) {
11041 dout(10) << "activate queueing backfill" << dendl;
11042 queue_peering_event(
11043 CephPeeringEvtRef(
11044 std::make_shared<CephPeeringEvt>(
11045 get_osdmap()->get_epoch(),
11046 get_osdmap()->get_epoch(),
11047 RequestBackfill())));
11048 } else {
11049 dout(10) << "activate all replicas clean, no recovery" << dendl;
11050 eio_errors_to_process = false;
11051 queue_peering_event(
11052 CephPeeringEvtRef(
11053 std::make_shared<CephPeeringEvt>(
11054 get_osdmap()->get_epoch(),
11055 get_osdmap()->get_epoch(),
11056 AllReplicasRecovered())));
11057 }
11058
11059 publish_stats_to_osd();
11060
11061 if (!backfill_targets.empty()) {
11062 last_backfill_started = earliest_backfill();
11063 new_backfill = true;
11064 assert(!last_backfill_started.is_max());
11065 dout(5) << "on activate: bft=" << backfill_targets
11066 << " from " << last_backfill_started << dendl;
11067 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11068 i != backfill_targets.end();
11069 ++i) {
11070 dout(5) << "target shard " << *i
11071 << " from " << peer_info[*i].last_backfill
11072 << dendl;
11073 }
11074 }
11075
11076 hit_set_setup();
11077 agent_setup();
11078 }
11079
11080 void PrimaryLogPG::_on_new_interval()
11081 {
11082 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11083 if (!pg_log.get_missing().may_include_deletes &&
11084 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11085 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11086 }
11087 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11088 }
11089
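/*
 * Interval change: requeue waiting ops (or drop them if we are no
 * longer primary), cancel in-flight copy/flush/proxy and recovery
 * work, and reset per-interval state such as cached object contexts.
 */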
11090 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11091 {
11092 dout(10) << "on_change" << dendl;
11093
11094 if (hit_set && hit_set->insert_count() == 0) {
11095 dout(20) << " discarding empty hit_set" << dendl;
11096 hit_set_clear();
11097 }
11098
11099 if (recovery_queued) {
11100 recovery_queued = false;
11101 osd->clear_queued_recovery(this);
11102 }
11103
11104	  // requeue everything in the reverse of the order in which it
11105	  // should be reexamined.
11106 requeue_ops(waiting_for_peered);
11107 requeue_ops(waiting_for_flush);
11108 requeue_ops(waiting_for_active);
11109
11110 clear_scrub_reserved();
11111
11112 cancel_copy_ops(is_primary());
11113 cancel_flush_ops(is_primary());
11114 cancel_proxy_ops(is_primary());
11115
11116 // requeue object waiters
11117 for (auto& p : waiting_for_unreadable_object) {
11118 release_backoffs(p.first);
11119 }
11120 if (is_primary()) {
11121 requeue_object_waiters(waiting_for_unreadable_object);
11122 } else {
11123 waiting_for_unreadable_object.clear();
11124 }
11125 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11126 p != waiting_for_degraded_object.end();
11127 waiting_for_degraded_object.erase(p++)) {
11128 release_backoffs(p->first);
11129 if (is_primary())
11130 requeue_ops(p->second);
11131 else
11132 p->second.clear();
11133 finish_degraded_object(p->first);
11134 }
11135
11136 // requeues waiting_for_scrub
11137 scrub_clear_state();
11138
11139 for (auto p = waiting_for_blocked_object.begin();
11140 p != waiting_for_blocked_object.end();
11141 waiting_for_blocked_object.erase(p++)) {
11142 if (is_primary())
11143 requeue_ops(p->second);
11144 else
11145 p->second.clear();
11146 }
11147 for (auto i = callbacks_for_degraded_object.begin();
11148 i != callbacks_for_degraded_object.end();
11149 ) {
11150 finish_degraded_object((i++)->first);
11151 }
11152 assert(callbacks_for_degraded_object.empty());
11153
11154 if (is_primary()) {
11155 requeue_ops(waiting_for_cache_not_full);
11156 } else {
11157 waiting_for_cache_not_full.clear();
11158 }
11159 objects_blocked_on_cache_full.clear();
11160
11161 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11162 in_progress_async_reads.begin();
11163 i != in_progress_async_reads.end();
11164 in_progress_async_reads.erase(i++)) {
11165 close_op_ctx(i->second);
11166 if (is_primary())
11167 requeue_op(i->first);
11168 }
11169
11170 // this will requeue ops we were working on but didn't finish, and
11171 // any dups
11172 apply_and_flush_repops(is_primary());
11173 cancel_log_updates();
11174
11175 // do this *after* apply_and_flush_repops so that we catch any newly
11176 // registered watches.
11177 context_registry_on_change();
11178
11179 pgbackend->on_change_cleanup(t);
11180 scrubber.cleanup_store(t);
11181 pgbackend->on_change();
11182
11183 // clear snap_trimmer state
11184 snap_trimmer_machine.process_event(Reset());
11185
11186 debug_op_order.clear();
11187 unstable_stats.clear();
11188
11189 // we don't want to cache object_contexts through the interval change
11190 // NOTE: we actually assert that all currently live references are dead
11191 // by the time the flush for the next interval completes.
11192 object_contexts.clear();
11193
11194 // should have been cleared above by finishing all of the degraded objects
11195 assert(objects_blocked_on_degraded_snap.empty());
11196 }
11197
11198 void PrimaryLogPG::on_role_change()
11199 {
11200 dout(10) << "on_role_change" << dendl;
11201 if (get_role() != 0 && hit_set) {
11202 dout(10) << " clearing hit set" << dendl;
11203 hit_set_clear();
11204 }
11205 }
11206
11207 void PrimaryLogPG::on_pool_change()
11208 {
11209 dout(10) << __func__ << dendl;
11210 // requeue cache full waiters just in case the cache_mode is
11211 // changing away from writeback mode. note that if we are not
11212 // active the normal requeuing machinery is sufficient (and properly
11213 // ordered).
11214 if (is_active() &&
11215 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11216 !waiting_for_cache_not_full.empty()) {
11217 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11218 << dendl;
11219 requeue_ops(waiting_for_cache_not_full);
11220 objects_blocked_on_cache_full.clear();
11221 }
11222 hit_set_setup();
11223 agent_setup();
11224 }
11225
11226 // clear state. called on recovery completion AND cancellation.
11227 void PrimaryLogPG::_clear_recovery_state()
11228 {
11229 missing_loc.clear();
11230 #ifdef DEBUG_RECOVERY_OIDS
11231 recovering_oids.clear();
11232 #endif
11233 last_backfill_started = hobject_t();
11234 set<hobject_t>::iterator i = backfills_in_flight.begin();
11235 while (i != backfills_in_flight.end()) {
11236 assert(recovering.count(*i));
11237 backfills_in_flight.erase(i++);
11238 }
11239
11240 list<OpRequestRef> blocked_ops;
11241 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11242 i != recovering.end();
11243 recovering.erase(i++)) {
11244 if (i->second) {
11245 i->second->drop_recovery_read(&blocked_ops);
11246 requeue_ops(blocked_ops);
11247 }
11248 }
11249 assert(backfills_in_flight.empty());
11250 pending_backfill_updates.clear();
11251 assert(recovering.empty());
11252 pgbackend->clear_recovery_state();
11253 }
11254
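/*
 * Abort an in-flight pull of soid: release its recovery read lock,
 * requeue any ops blocked behind it (degraded/unreadable waiters),
 * and reset last_requested so recover_primary will start over.
 */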
11255 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11256 {
11257 dout(20) << __func__ << ": " << soid << dendl;
11258 assert(recovering.count(soid));
11259 ObjectContextRef obc = recovering[soid];
11260 if (obc) {
11261 list<OpRequestRef> blocked_ops;
11262 obc->drop_recovery_read(&blocked_ops);
11263 requeue_ops(blocked_ops);
11264 }
11265 recovering.erase(soid);
11266 finish_recovery_op(soid);
11267 release_backoffs(soid);
11268 if (waiting_for_degraded_object.count(soid)) {
11269 dout(20) << " kicking degraded waiters on " << soid << dendl;
11270 requeue_ops(waiting_for_degraded_object[soid]);
11271 waiting_for_degraded_object.erase(soid);
11272 }
11273 if (waiting_for_unreadable_object.count(soid)) {
11274 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11275 requeue_ops(waiting_for_unreadable_object[soid]);
11276 waiting_for_unreadable_object.erase(soid);
11277 }
11278 if (is_missing_object(soid))
11279 pg_log.set_last_requested(0); // get recover_primary to start over
11280 finish_degraded_object(soid);
11281 }
11282
11283 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11284 {
11285 /*
11286	   * check that any peers we are planning to pull (or are currently
11287	   * pulling) objects from are dealt with.
11288 */
11289 missing_loc.check_recovery_sources(osdmap);
11290 pgbackend->check_recovery_sources(osdmap);
11291
11292 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11293 i != peer_log_requested.end();
11294 ) {
11295 if (!osdmap->is_up(i->osd)) {
11296 dout(10) << "peer_log_requested removing " << *i << dendl;
11297 peer_log_requested.erase(i++);
11298 } else {
11299 ++i;
11300 }
11301 }
11302
11303 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11304 i != peer_missing_requested.end();
11305 ) {
11306 if (!osdmap->is_up(i->osd)) {
11307 dout(10) << "peer_missing_requested removing " << *i << dendl;
11308 peer_missing_requested.erase(i++);
11309 } else {
11310 ++i;
11311 }
11312 }
11313 }
11314
11315 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11316 {
11317 set<pg_shard_t> now_down;
11318 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11319 p != missing_loc_sources.end();
11320 ) {
11321 if (osdmap->is_up(p->osd)) {
11322 ++p;
11323 continue;
11324 }
11325 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11326 now_down.insert(*p);
11327 missing_loc_sources.erase(p++);
11328 }
11329
11330 if (now_down.empty()) {
11331 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11332 } else {
11333 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11334 << missing_loc_sources << dendl;
11335
11336 // filter missing_loc
11337 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11338 while (p != missing_loc.end()) {
11339 set<pg_shard_t>::iterator q = p->second.begin();
11340 while (q != p->second.end())
11341 if (now_down.count(*q)) {
11342 p->second.erase(q++);
11343 } else {
11344 ++q;
11345 }
11346 if (p->second.empty())
11347 missing_loc.erase(p++);
11348 else
11349 ++p;
11350 }
11351 }
11352 }
11353
11354
11355 bool PrimaryLogPG::start_recovery_ops(
11356 uint64_t max,
11357 ThreadPool::TPHandle &handle,
11358 uint64_t *ops_started)
11359 {
11360 uint64_t& started = *ops_started;
11361 started = 0;
11362 bool work_in_progress = false;
11363 assert(is_primary());
11364
11365 if (!state_test(PG_STATE_RECOVERING) &&
11366 !state_test(PG_STATE_BACKFILLING)) {
11367 /* TODO: I think this case is broken and will make do_recovery()
11368 * unhappy since we're returning false */
11369 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11370 return false;
11371 }
11372
11373 const auto &missing = pg_log.get_missing();
11374
11375 unsigned int num_missing = missing.num_missing();
11376 uint64_t num_unfound = get_num_unfound();
11377
11378 if (num_missing == 0) {
11379 info.last_complete = info.last_update;
11380 }
11381
11382 if (num_missing == num_unfound) {
11383 // All of the missing objects we have are unfound.
11384 // Recover the replicas.
11385 started = recover_replicas(max, handle);
11386 }
11387 if (!started) {
11388 // We still have missing objects that we should grab from replicas.
11389 started += recover_primary(max, handle);
11390 }
11391 if (!started && num_unfound != get_num_unfound()) {
11392	    // second chance to recover replicas
11393 started = recover_replicas(max, handle);
11394 }
11395
11396 if (started)
11397 work_in_progress = true;
11398
11399 bool deferred_backfill = false;
11400 if (recovering.empty() &&
11401 state_test(PG_STATE_BACKFILLING) &&
11402 !backfill_targets.empty() && started < max &&
11403 missing.num_missing() == 0 &&
11404 waiting_on_backfill.empty()) {
11405 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11406 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11407 deferred_backfill = true;
11408 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11409 !is_degraded()) {
11410 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11411 deferred_backfill = true;
11412 } else if (!backfill_reserved) {
11413 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11414 if (!backfill_reserving) {
11415 dout(10) << "queueing RequestBackfill" << dendl;
11416 backfill_reserving = true;
11417 queue_peering_event(
11418 CephPeeringEvtRef(
11419 std::make_shared<CephPeeringEvt>(
11420 get_osdmap()->get_epoch(),
11421 get_osdmap()->get_epoch(),
11422 RequestBackfill())));
11423 }
11424 deferred_backfill = true;
11425 } else {
11426 started += recover_backfill(max - started, handle, &work_in_progress);
11427 }
11428 }
11429
11430 dout(10) << " started " << started << dendl;
11431 osd->logger->inc(l_osd_rop, started);
11432
11433 if (!recovering.empty() ||
11434 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11435 return work_in_progress;
11436
11437 assert(recovering.empty());
11438 assert(recovery_ops_active == 0);
11439
11440 dout(10) << __func__ << " needs_recovery: "
11441 << missing_loc.get_needs_recovery()
11442 << dendl;
11443 dout(10) << __func__ << " missing_loc: "
11444 << missing_loc.get_missing_locs()
11445 << dendl;
11446 int unfound = get_num_unfound();
11447 if (unfound) {
11448 dout(10) << " still have " << unfound << " unfound" << dendl;
11449 return work_in_progress;
11450 }
11451
11452 if (missing.num_missing() > 0) {
11453 // this shouldn't happen!
11454 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11455	                       << missing.num_missing() << " missing objects: " << missing.get_items();
11456 return work_in_progress;
11457 }
11458
11459 if (needs_recovery()) {
11460 // this shouldn't happen!
11461 // We already checked num_missing() so we must have missing replicas
11462 osd->clog->error() << info.pgid
11463 << " Unexpected Error: recovery ending with missing replicas";
11464 return work_in_progress;
11465 }
11466
11467 if (state_test(PG_STATE_RECOVERING)) {
11468 state_clear(PG_STATE_RECOVERING);
11469 state_clear(PG_STATE_FORCED_RECOVERY);
11470 if (needs_backfill()) {
11471 dout(10) << "recovery done, queuing backfill" << dendl;
11472 queue_peering_event(
11473 CephPeeringEvtRef(
11474 std::make_shared<CephPeeringEvt>(
11475 get_osdmap()->get_epoch(),
11476 get_osdmap()->get_epoch(),
11477 RequestBackfill())));
11478 } else {
11479 dout(10) << "recovery done, no backfill" << dendl;
11480 eio_errors_to_process = false;
11481 state_clear(PG_STATE_FORCED_BACKFILL);
11482 queue_peering_event(
11483 CephPeeringEvtRef(
11484 std::make_shared<CephPeeringEvt>(
11485 get_osdmap()->get_epoch(),
11486 get_osdmap()->get_epoch(),
11487 AllReplicasRecovered())));
11488 }
11489 } else { // backfilling
11490 state_clear(PG_STATE_BACKFILLING);
11491 state_clear(PG_STATE_FORCED_BACKFILL);
11492 state_clear(PG_STATE_FORCED_RECOVERY);
11493 dout(10) << "recovery done, backfill done" << dendl;
11494 eio_errors_to_process = false;
11495 queue_peering_event(
11496 CephPeeringEvtRef(
11497 std::make_shared<CephPeeringEvt>(
11498 get_osdmap()->get_epoch(),
11499 get_osdmap()->get_epoch(),
11500 Backfilled())));
11501 }
11502
11503 return false;
11504 }
11505
11506 /**
11507	 * start up to max recovery ops on objects the primary is missing.
11508	 * return the number of ops started.
11509 */
11510 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11511 {
11512 assert(is_primary());
11513
11514 const auto &missing = pg_log.get_missing();
11515
11516 dout(10) << "recover_primary recovering " << recovering.size()
11517 << " in pg" << dendl;
11518 dout(10) << "recover_primary " << missing << dendl;
11519 dout(25) << "recover_primary " << missing.get_items() << dendl;
11520
11521 // look at log!
11522 pg_log_entry_t *latest = 0;
11523 unsigned started = 0;
11524 int skipped = 0;
11525
11526 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11527 map<version_t, hobject_t>::const_iterator p =
11528 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11529 while (p != missing.get_rmissing().end()) {
11530 handle.reset_tp_timeout();
11531 hobject_t soid;
11532 version_t v = p->first;
11533
11534 if (pg_log.get_log().objects.count(p->second)) {
11535 latest = pg_log.get_log().objects.find(p->second)->second;
11536 assert(latest->is_update() || latest->is_delete());
11537 soid = latest->soid;
11538 } else {
11539 latest = 0;
11540 soid = p->second;
11541 }
11542 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11543 ++p;
11544
11545 hobject_t head = soid.get_head();
11546
11547 eversion_t need = item.need;
11548
11549 dout(10) << "recover_primary "
11550 << soid << " " << item.need
11551 << (missing.is_missing(soid) ? " (missing)":"")
11552 << (missing.is_missing(head) ? " (missing head)":"")
11553 << (recovering.count(soid) ? " (recovering)":"")
11554 << (recovering.count(head) ? " (recovering head)":"")
11555 << dendl;
11556
11557 if (latest) {
11558 switch (latest->op) {
11559 case pg_log_entry_t::CLONE:
11560 /*
11561 * Handling for this special case removed for now, until we
11562 * can correctly construct an accurate SnapSet from the old
11563 * one.
11564 */
11565 break;
11566
11567 case pg_log_entry_t::LOST_REVERT:
11568 {
11569 if (item.have == latest->reverting_to) {
11570 ObjectContextRef obc = get_object_context(soid, true);
11571
11572 if (obc->obs.oi.version == latest->version) {
11573 // I'm already reverting
11574 dout(10) << " already reverting " << soid << dendl;
11575 } else {
11576 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11577 obc->ondisk_write_lock();
11578 obc->obs.oi.version = latest->version;
11579
11580 ObjectStore::Transaction t;
11581 bufferlist b2;
11582 obc->obs.oi.encode(
11583 b2,
11584 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11585 assert(!pool.info.require_rollback());
11586 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11587
11588 recover_got(soid, latest->version);
11589 missing_loc.add_location(soid, pg_whoami);
11590
11591 ++active_pushes;
11592
11593 osd->store->queue_transaction(osr.get(), std::move(t),
11594 new C_OSD_AppliedRecoveredObject(this, obc),
11595 new C_OSD_CommittedPushedObject(
11596 this,
11597 get_osdmap()->get_epoch(),
11598 info.last_complete),
11599 new C_OSD_OndiskWriteUnlock(obc));
11600 continue;
11601 }
11602 } else {
11603 /*
11604 * Pull the old version of the object. Update missing_loc here to have the location
11605 * of the version we want.
11606 *
11607 * This doesn't use the usual missing_loc paths, but that's okay:
11608 * - if we have it locally, we hit the case above, and go from there.
11609 * - if we don't, we always pass through this case during recovery and set up the location
11610 * properly.
11611 * - this way we don't need to mangle the missing code to be general about needing an old
11612 * version...
11613 */
11614 eversion_t alternate_need = latest->reverting_to;
11615 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11616
11617 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11618 p != peer_missing.end();
11619 ++p)
11620 if (p->second.is_missing(soid, need) &&
11621 p->second.get_items().at(soid).have == alternate_need) {
11622 missing_loc.add_location(soid, p->first);
11623 }
11624 dout(10) << " will pull " << alternate_need << " or " << need
11625 << " from one of " << missing_loc.get_locations(soid)
11626 << dendl;
11627 }
11628 }
11629 break;
11630 }
11631 }
11632
11633 if (!recovering.count(soid)) {
11634 if (recovering.count(head)) {
11635 ++skipped;
11636 } else {
11637 int r = recover_missing(
11638 soid, need, get_recovery_op_priority(), h);
11639 switch (r) {
11640 case PULL_YES:
11641 ++started;
11642 break;
11643 case PULL_OTHER:
11644 ++started;
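	  // fall through: a pull was started, but soid itself is still
	  // counted as skipped as well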
11645 case PULL_NONE:
11646 ++skipped;
11647 break;
11648 default:
11649 ceph_abort();
11650 }
11651 if (started >= max)
11652 break;
11653 }
11654 }
11655
11656 // only advance last_requested if we haven't skipped anything
11657 if (!skipped)
11658 pg_log.set_last_requested(v);
11659 }
11660
11661 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11662 return started;
11663 }
11664
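/*
 * Note that the primary's copy of soid is bad: add it to the local
 * missing set, drop the primary as a location, and re-add any replica
 * that still holds the needed version. Returns true if no usable copy
 * was found (the object is now unfound).
 */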
11665 bool PrimaryLogPG::primary_error(
11666 const hobject_t& soid, eversion_t v)
11667 {
11668 pg_log.missing_add(soid, v, eversion_t());
11669 pg_log.set_last_requested(0);
11670 missing_loc.remove_location(soid, pg_whoami);
11671 bool uhoh = true;
11672 assert(!actingbackfill.empty());
11673 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11674 i != actingbackfill.end();
11675 ++i) {
11676 if (*i == get_primary()) continue;
11677 pg_shard_t peer = *i;
11678 if (!peer_missing[peer].is_missing(soid, v)) {
11679 missing_loc.add_location(soid, peer);
11680 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11681 << ", there should be a copy on shard " << peer << dendl;
11682 uhoh = false;
11683 }
11684 }
11685 if (uhoh)
11686 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11687 else
11688 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11689 << ", will try copies on " << missing_loc.get_locations(soid);
11690 return uhoh;
11691 }
11692
11693 int PrimaryLogPG::prep_object_replica_deletes(
11694 const hobject_t& soid, eversion_t v,
11695 PGBackend::RecoveryHandle *h)
11696 {
11697 assert(is_primary());
11698 dout(10) << __func__ << ": on " << soid << dendl;
11699
11700 start_recovery_op(soid);
11701 assert(!recovering.count(soid));
11702 recovering.insert(make_pair(soid, ObjectContextRef()));
11703
11704 pgbackend->recover_delete_object(soid, v, h);
11705 return 1;
11706 }
11707
11708 int PrimaryLogPG::prep_object_replica_pushes(
11709 const hobject_t& soid, eversion_t v,
11710 PGBackend::RecoveryHandle *h)
11711 {
11712 assert(is_primary());
11713 dout(10) << __func__ << ": on " << soid << dendl;
11714
11715 // NOTE: we know we will get a valid oloc off of disk here.
11716 ObjectContextRef obc = get_object_context(soid, false);
11717 if (!obc) {
11718 primary_error(soid, v);
11719 return 0;
11720 }
11721
11722 if (!obc->get_recovery_read()) {
11723 dout(20) << "recovery delayed on " << soid
11724 << "; could not get rw_manager lock" << dendl;
11725 return 0;
11726 } else {
11727 dout(20) << "recovery got recovery read lock on " << soid
11728 << dendl;
11729 }
11730
11731 start_recovery_op(soid);
11732 assert(!recovering.count(soid));
11733 recovering.insert(make_pair(soid, obc));
11734
11735	  /* We need this in case there is an in-progress write on the object. In fact,
11736 * the only possible write is an update to the xattr due to a lost_revert --
11737 * a client write would be blocked since the object is degraded.
11738 * In almost all cases, therefore, this lock should be uncontended.
11739 */
11740 obc->ondisk_read_lock();
11741 int r = pgbackend->recover_object(
11742 soid,
11743 v,
11744 ObjectContextRef(),
11745 obc, // has snapset context
11746 h);
11747 obc->ondisk_read_unlock();
11748 if (r < 0) {
11749 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11750 primary_failed(soid);
11751 primary_error(soid, v);
11752 return 0;
11753 }
11754 return 1;
11755 }
11756
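/*
 * Start up to max pushes (or deletes) for objects that replicas are
 * missing, oldest version first; returns the number of ops started.
 */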
11757 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11758 {
11759 dout(10) << __func__ << "(" << max << ")" << dendl;
11760 uint64_t started = 0;
11761
11762 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11763
11764 // this is FAR from an optimal recovery order. pretty lame, really.
11765 assert(!actingbackfill.empty());
11766 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11767 i != actingbackfill.end();
11768 ++i) {
11769 if (*i == get_primary()) continue;
11770 pg_shard_t peer = *i;
11771 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11772 assert(pm != peer_missing.end());
11773 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11774 assert(pi != peer_info.end());
11775 size_t m_sz = pm->second.num_missing();
11776
11777 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11778 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11779
11780 // oldest first!
11781 const pg_missing_t &m(pm->second);
11782 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11783 p != m.get_rmissing().end() && started < max;
11784 ++p) {
11785 handle.reset_tp_timeout();
11786 const hobject_t soid(p->second);
11787
11788 if (missing_loc.is_unfound(soid)) {
11789 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11790 continue;
11791 }
11792
11793 if (soid > pi->second.last_backfill) {
11794 if (!recovering.count(soid)) {
11795 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11796 derr << __func__ << ": object added to missing set for backfill, but "
11797 << "is not in recovering, error!" << dendl;
11798 ceph_abort();
11799 }
11800 continue;
11801 }
11802
11803 if (recovering.count(soid)) {
11804 dout(10) << __func__ << ": already recovering " << soid << dendl;
11805 continue;
11806 }
11807
11808 if (missing_loc.is_deleted(soid)) {
11809 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11810 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11811 started += prep_object_replica_deletes(soid, r->second.need, h);
11812 continue;
11813 }
11814
11815 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11816 dout(10) << __func__ << ": " << soid.get_head()
11817 << " still missing on primary" << dendl;
11818 continue;
11819 }
11820
11821 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11822 dout(10) << __func__ << ": " << soid.get_snapdir()
11823 << " still missing on primary" << dendl;
11824 continue;
11825 }
11826
11827 if (pg_log.get_missing().is_missing(soid)) {
11828 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11829 continue;
11830 }
11831
11832 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11833 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11834 started += prep_object_replica_pushes(soid, r->second.need,
11835 h);
11836 }
11837 }
11838
11839 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11840 return started;
11841 }
11842
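// Return the lowest BackfillInterval begin across all backfill
// targets, i.e. the earliest object some peer still needs examined.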
11843 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11844 {
11845 hobject_t e = hobject_t::get_max();
11846 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11847 i != backfill_targets.end();
11848 ++i) {
11849 pg_shard_t peer = *i;
11850 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11851 peer_backfill_info.find(peer);
11852 assert(iter != peer_backfill_info.end());
11853 if (iter->second.begin < e)
11854 e = iter->second.begin;
11855 }
11856 return e;
11857 }
11858
11859 bool PrimaryLogPG::all_peer_done() const
11860 {
11861 // Primary hasn't got any more objects
11862 assert(backfill_info.empty());
11863
11864 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11865 i != backfill_targets.end();
11866 ++i) {
11867 pg_shard_t bt = *i;
11868 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11869 peer_backfill_info.find(bt);
11870 assert(piter != peer_backfill_info.end());
11871 const BackfillInterval& pbi = piter->second;
11872 // See if peer has more to process
11873 if (!pbi.extends_to_end() || !pbi.empty())
11874 return false;
11875 }
11876 return true;
11877 }
11878
11879 /**
11880 * recover_backfill
11881 *
11882 * Invariants:
11883 *
11884 * backfilled: fully pushed to replica or present in replica's missing set (both
11885 * our copy and theirs).
11886 *
11887 * All objects on a backfill_target in
11888 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11889 * objects have been actually deleted and all logically-valid objects are replicated.
11890 * There may be PG objects in this interval yet to be backfilled.
11891 *
11892 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11893 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11894 *
11895 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11896 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11897 * interval remain on the backfill target.
11898 *
11899 * For a backfill target, all objects <= peer_info[target].last_backfill
11900 * have been backfilled to target
11901 *
11902 * There *MAY* be missing/outdated objects between last_backfill_started and
11903 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11904 * io created objects since the last scan. For this reason, we call
11905 * update_range() again before continuing backfill.
11906 */
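/*
 * A rough sketch of the position bookkeeping (illustrative, not a
 * definitive restatement of the code below):
 *
 *   hobject_t backfill_pos =
 *     std::min(backfill_info.begin, earliest_peer_backfill());
 *
 * Everything strictly below backfill_pos has been handled on every
 * target; last_backfill is only advanced past a pending update once
 * the in-flight pushes below that point have completed.
 */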
11907 uint64_t PrimaryLogPG::recover_backfill(
11908 uint64_t max,
11909 ThreadPool::TPHandle &handle, bool *work_started)
11910 {
11911 dout(10) << "recover_backfill (" << max << ")"
11912 << " bft=" << backfill_targets
11913 << " last_backfill_started " << last_backfill_started
11914 << (new_backfill ? " new_backfill":"")
11915 << dendl;
11916 assert(!backfill_targets.empty());
11917
11918 // Initialize from prior backfill state
11919 if (new_backfill) {
11920 // on_activate() was called prior to getting here
11921 assert(last_backfill_started == earliest_backfill());
11922 new_backfill = false;
11923
11924 // initialize BackfillIntervals
11925 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11926 i != backfill_targets.end();
11927 ++i) {
11928 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11929 }
11930 backfill_info.reset(last_backfill_started);
11931
11932 backfills_in_flight.clear();
11933 pending_backfill_updates.clear();
11934 }
11935
11936 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11937 i != backfill_targets.end();
11938 ++i) {
11939 dout(10) << "peer osd." << *i
11940 << " info " << peer_info[*i]
11941 << " interval " << peer_backfill_info[*i].begin
11942 << "-" << peer_backfill_info[*i].end
11943 << " " << peer_backfill_info[*i].objects.size() << " objects"
11944 << dendl;
11945 }
11946
11947 // update our local interval to cope with recent changes
11948 backfill_info.begin = last_backfill_started;
11949 update_range(&backfill_info, handle);
11950
11951 unsigned ops = 0;
11952 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11953 set<hobject_t> add_to_stat;
11954
11955 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11956 i != backfill_targets.end();
11957 ++i) {
11958 peer_backfill_info[*i].trim_to(
11959 std::max(peer_info[*i].last_backfill, last_backfill_started));
11960 }
11961 backfill_info.trim_to(last_backfill_started);
11962
11963 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11964 while (ops < max) {
11965 if (backfill_info.begin <= earliest_peer_backfill() &&
11966 !backfill_info.extends_to_end() && backfill_info.empty()) {
11967 hobject_t next = backfill_info.end;
11968 backfill_info.reset(next);
11969 backfill_info.end = hobject_t::get_max();
11970 update_range(&backfill_info, handle);
11971 backfill_info.trim();
11972 }
11973
11974 dout(20) << " my backfill interval " << backfill_info << dendl;
11975
11976 bool sent_scan = false;
11977 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11978 i != backfill_targets.end();
11979 ++i) {
11980 pg_shard_t bt = *i;
11981 BackfillInterval& pbi = peer_backfill_info[bt];
11982
11983 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11984 if (pbi.begin <= backfill_info.begin &&
11985 !pbi.extends_to_end() && pbi.empty()) {
11986 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11987 epoch_t e = get_osdmap()->get_epoch();
11988 MOSDPGScan *m = new MOSDPGScan(
11989 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11990 spg_t(info.pgid.pgid, bt.shard),
11991 pbi.end, hobject_t());
11992 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11993 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11994 waiting_on_backfill.insert(bt);
11995 sent_scan = true;
11996 }
11997 }
11998
11999 // Count simultaneous scans as a single op and let those complete
12000 if (sent_scan) {
12001 ops++;
12002 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12003 break;
12004 }
12005
12006 if (backfill_info.empty() && all_peer_done()) {
12007 dout(10) << " reached end for both local and all peers" << dendl;
12008 break;
12009 }
12010
12011	    // Get the object within the set of peers to operate on and
12012	    // the set of targets to which that object applies.
12013 hobject_t check = earliest_peer_backfill();
12014
12015 if (check < backfill_info.begin) {
12016
12017 set<pg_shard_t> check_targets;
12018 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12019 i != backfill_targets.end();
12020 ++i) {
12021 pg_shard_t bt = *i;
12022 BackfillInterval& pbi = peer_backfill_info[bt];
12023 if (pbi.begin == check)
12024 check_targets.insert(bt);
12025 }
12026 assert(!check_targets.empty());
12027
12028 dout(20) << " BACKFILL removing " << check
12029 << " from peers " << check_targets << dendl;
12030 for (set<pg_shard_t>::iterator i = check_targets.begin();
12031 i != check_targets.end();
12032 ++i) {
12033 pg_shard_t bt = *i;
12034 BackfillInterval& pbi = peer_backfill_info[bt];
12035 assert(pbi.begin == check);
12036
12037 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12038 pbi.pop_front();
12039 }
12040
12041 /* This requires a bit of explanation. We compare head against
12042 * last_backfill to determine whether to send an operation
12043 * to the replica. A single write operation can touch up to three
12044 * objects: head, the snapdir, and a new clone which sorts closer to
12045 * head than any existing clone. If last_backfill points at a clone,
12046 * the transaction won't be sent and all 3 must lie on the right side
12047 * of the line (i.e., we'll backfill them later). If last_backfill
12048 * points at snapdir, it sorts greater than head, so we send the
12049 * transaction which is correct because all three must lie to the left
12050 * of the line.
12051 *
12052 * If it points at head, we have a bit of an issue. If head actually
12053 * exists, no problem, because any transaction which touches snapdir
12054 * must end up creating it (and deleting head), so sending the
12055 * operation won't pose a problem -- we'll end up having to scan it,
12056 * but it'll end up being the right version so we won't bother to
12057 * rebackfill it. However, if head doesn't exist, any write on head
12058 * will remove snapdir. For a replicated pool, this isn't a problem,
12059 * ENOENT on remove isn't an issue and it's in backfill future anyway.
12060 * It only poses a problem for EC pools, because we never just delete
12061 * an object, we rename it into a rollback object. That operation
12062 * will end up crashing the osd with ENOENT. Tolerating the failure
12063 * wouldn't work either, even if snapdir exists, we'd be creating a
12064 * rollback object past the last_backfill line which wouldn't get
12065 * cleaned up (no rollback objects past the last_backfill line is an
12066 * existing important invariant). Thus, let's avoid the whole issue
12067 * by just not updating last_backfill_started here if head doesn't
12068 * exist and snapdir does. We aren't using up a recovery count here,
12069 * so we're going to recover snapdir immediately anyway. We'll only
12070 * fail "backward" if we fail to get the rw lock and that just means
12071 * we'll re-process this section of the hash space again.
12072 *
12073 * I'm choosing this hack here because the really "correct" answer is
12074 * going to be to unify snapdir and head into a single object (a
12075 * snapdir is really just a confusing way to talk about head existing
12076 * as a whiteout), but doing that is going to be a somewhat larger
12077 * undertaking.
12078 *
12079 * @see http://tracker.ceph.com/issues/17668
12080 */
12081 if (!(check.is_head() &&
12082 backfill_info.begin.is_snapdir() &&
12083 check == backfill_info.begin.get_head()))
12084 last_backfill_started = check;
12085
12086	      // Don't increment ops here: deletions are cheap and, unlike
12087	      // real recovery_ops, are not replied to, and we can't increment
12088	      // ops without requeueing ourselves
12089	      // for recovery.
12090 } else {
12091 eversion_t& obj_v = backfill_info.objects.begin()->second;
12092
12093 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12094 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12095 i != backfill_targets.end();
12096 ++i) {
12097 pg_shard_t bt = *i;
12098 BackfillInterval& pbi = peer_backfill_info[bt];
12099 // Find all check peers that have the wrong version
12100 if (check == backfill_info.begin && check == pbi.begin) {
12101 if (pbi.objects.begin()->second != obj_v) {
12102 need_ver_targs.push_back(bt);
12103 } else {
12104 keep_ver_targs.push_back(bt);
12105 }
12106 } else {
12107 pg_info_t& pinfo = peer_info[bt];
12108
12109	          // Only include peers whose backfill line we've caught up to;
12110	          // otherwise, they only appear to be missing this object
12111	          // because their pbi.begin > backfill_info.begin.
12112 if (backfill_info.begin > pinfo.last_backfill)
12113 missing_targs.push_back(bt);
12114 else
12115 skip_targs.push_back(bt);
12116 }
12117 }
12118
12119 if (!keep_ver_targs.empty()) {
12120 // These peers have version obj_v
12121 dout(20) << " BACKFILL keeping " << check
12122 << " with ver " << obj_v
12123 << " on peers " << keep_ver_targs << dendl;
12124 //assert(!waiting_for_degraded_object.count(check));
12125 }
12126 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12127 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12128 assert(obc);
12129 if (obc->get_recovery_read()) {
12130 if (!need_ver_targs.empty()) {
12131 dout(20) << " BACKFILL replacing " << check
12132 << " with ver " << obj_v
12133 << " to peers " << need_ver_targs << dendl;
12134 }
12135 if (!missing_targs.empty()) {
12136 dout(20) << " BACKFILL pushing " << backfill_info.begin
12137 << " with ver " << obj_v
12138 << " to peers " << missing_targs << dendl;
12139 }
12140 vector<pg_shard_t> all_push = need_ver_targs;
12141 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12142
12143 handle.reset_tp_timeout();
12144 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12145 if (r < 0) {
12146 *work_started = true;
12147 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12148 break;
12149 }
12150 ops++;
12151 } else {
12152 *work_started = true;
12153 dout(20) << "backfill blocking on " << backfill_info.begin
12154 << "; could not get rw_manager lock" << dendl;
12155 break;
12156 }
12157 }
12158 dout(20) << "need_ver_targs=" << need_ver_targs
12159 << " keep_ver_targs=" << keep_ver_targs << dendl;
12160 dout(20) << "backfill_targets=" << backfill_targets
12161 << " missing_targs=" << missing_targs
12162 << " skip_targs=" << skip_targs << dendl;
12163
12164 last_backfill_started = backfill_info.begin;
12165 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12166 backfill_info.pop_front();
12167 vector<pg_shard_t> check_targets = need_ver_targs;
12168 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12169 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12170 i != check_targets.end();
12171 ++i) {
12172 pg_shard_t bt = *i;
12173 BackfillInterval& pbi = peer_backfill_info[bt];
12174 pbi.pop_front();
12175 }
12176 }
12177 }
12178
12179 hobject_t backfill_pos =
12180 std::min(backfill_info.begin, earliest_peer_backfill());
12181
12182 for (set<hobject_t>::iterator i = add_to_stat.begin();
12183 i != add_to_stat.end();
12184 ++i) {
12185 ObjectContextRef obc = get_object_context(*i, false);
12186 assert(obc);
12187 pg_stat_t stat;
12188 add_object_context_to_pg_stat(obc, &stat);
12189 pending_backfill_updates[*i] = stat;
12190 }
12191 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12192 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12193 for (unsigned i = 0; i < to_remove.size(); ++i) {
12194 handle.reset_tp_timeout();
12195 const hobject_t& oid = to_remove[i].get<0>();
12196 eversion_t v = to_remove[i].get<1>();
12197 pg_shard_t peer = to_remove[i].get<2>();
12198 MOSDPGBackfillRemove *m;
12199 auto it = reqs.find(peer);
12200 if (it != reqs.end()) {
12201 m = it->second;
12202 } else {
12203 m = reqs[peer] = new MOSDPGBackfillRemove(
12204 spg_t(info.pgid.pgid, peer.shard),
12205 get_osdmap()->get_epoch());
12206 }
12207 m->ls.push_back(make_pair(oid, v));
12208
12209 if (oid <= last_backfill_started)
12210 pending_backfill_updates[oid]; // add empty stat!
12211 }
12212 for (auto p : reqs) {
12213 osd->send_message_osd_cluster(p.first.osd, p.second,
12214 get_osdmap()->get_epoch());
12215 }
12216 } else {
12217 // for jewel targets
12218 for (unsigned i = 0; i < to_remove.size(); ++i) {
12219 handle.reset_tp_timeout();
12220
12221 // ordered before any subsequent updates
12222 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12223 to_remove[i].get<2>());
12224
12225 if (to_remove[i].get<0>() <= last_backfill_started)
12226 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12227 }
12228 }
12229
12230 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12231
12232 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12233 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12234 i != backfills_in_flight.end();
12235 ++i) {
12236 dout(20) << *i << " is still in flight" << dendl;
12237 }
12238
12239 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12240 backfill_pos : *(backfills_in_flight.begin());
12241 hobject_t new_last_backfill = earliest_backfill();
12242 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12243 for (map<hobject_t, pg_stat_t>::iterator i =
12244 pending_backfill_updates.begin();
12245 i != pending_backfill_updates.end() &&
12246 i->first < next_backfill_to_complete;
12247 pending_backfill_updates.erase(i++)) {
12248 dout(20) << " pending_backfill_update " << i->first << dendl;
12249 assert(i->first > new_last_backfill);
12250 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12251 j != backfill_targets.end();
12252 ++j) {
12253 pg_shard_t bt = *j;
12254 pg_info_t& pinfo = peer_info[bt];
12255	      // Add stats to all peers that were missing the object
12256 if (i->first > pinfo.last_backfill)
12257 pinfo.stats.add(i->second);
12258 }
12259 new_last_backfill = i->first;
12260 }
12261 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12262
12263 assert(!pending_backfill_updates.empty() ||
12264 new_last_backfill == last_backfill_started);
12265 if (pending_backfill_updates.empty() &&
12266 backfill_pos.is_max()) {
12267 assert(backfills_in_flight.empty());
12268 new_last_backfill = backfill_pos;
12269 last_backfill_started = backfill_pos;
12270 }
12271 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12272
12273 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12274 // all the backfill targets. Otherwise, we will move last_backfill up on
12275	  // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12276 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12277 i != backfill_targets.end();
12278 ++i) {
12279 pg_shard_t bt = *i;
12280 pg_info_t& pinfo = peer_info[bt];
12281
12282 if (new_last_backfill > pinfo.last_backfill) {
12283 pinfo.set_last_backfill(new_last_backfill);
12284 epoch_t e = get_osdmap()->get_epoch();
12285 MOSDPGBackfill *m = NULL;
12286 if (pinfo.last_backfill.is_max()) {
12287 m = new MOSDPGBackfill(
12288 MOSDPGBackfill::OP_BACKFILL_FINISH,
12289 e,
12290 last_peering_reset,
12291 spg_t(info.pgid.pgid, bt.shard));
12292 // Use default priority here, must match sub_op priority
12293 /* pinfo.stats might be wrong if we did log-based recovery on the
12294 * backfilled portion in addition to continuing backfill.
12295 */
12296 pinfo.stats = info.stats;
12297 start_recovery_op(hobject_t::get_max());
12298 } else {
12299 m = new MOSDPGBackfill(
12300 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12301 e,
12302 last_peering_reset,
12303 spg_t(info.pgid.pgid, bt.shard));
12304 // Use default priority here, must match sub_op priority
12305 }
12306 m->last_backfill = pinfo.last_backfill;
12307 m->stats = pinfo.stats;
12308 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12309 dout(10) << " peer " << bt
12310 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12311 << " / " << info.stats.stats.sum.num_objects << dendl;
12312 }
12313 }
12314
12315 if (ops)
12316 *work_started = true;
12317 return ops;
12318 }
12319
12320 int PrimaryLogPG::prep_backfill_object_push(
12321 hobject_t oid, eversion_t v,
12322 ObjectContextRef obc,
12323 vector<pg_shard_t> peers,
12324 PGBackend::RecoveryHandle *h)
12325 {
12326 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12327 assert(!peers.empty());
12328
12329 backfills_in_flight.insert(oid);
12330 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12331 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12332 assert(bpm != peer_missing.end());
12333 bpm->second.add(oid, eversion_t(), eversion_t(), false);
12334 }
12335
12336 assert(!recovering.count(oid));
12337
12338 start_recovery_op(oid);
12339 recovering.insert(make_pair(oid, obc));
12340
12341 // We need to take the read_lock here in order to flush in-progress writes
12342 obc->ondisk_read_lock();
12343 int r = pgbackend->recover_object(
12344 oid,
12345 v,
12346 ObjectContextRef(),
12347 obc,
12348 h);
12349 obc->ondisk_read_unlock();
12350 if (r < 0) {
12351 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12352 primary_failed(oid);
12353 primary_error(oid, v);
12354 backfills_in_flight.erase(oid);
12355 missing_loc.add_missing(oid, v, eversion_t());
12356 }
12357 return r;
12358 }
12359
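/*
 * Bring a BackfillInterval up to date: rescan the local store if bi
 * predates the log tail, then replay pg log and projected log entries
 * after bi->version over bi->objects, ending at projected_last_update.
 */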
12360 void PrimaryLogPG::update_range(
12361 BackfillInterval *bi,
12362 ThreadPool::TPHandle &handle)
12363 {
12364 int local_min = cct->_conf->osd_backfill_scan_min;
12365 int local_max = cct->_conf->osd_backfill_scan_max;
12366
12367 if (bi->version < info.log_tail) {
12368 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12369 << dendl;
12370 if (last_update_applied >= info.log_tail) {
12371 bi->version = last_update_applied;
12372 } else {
12373 osr->flush();
12374 bi->version = info.last_update;
12375 }
12376 scan_range(local_min, local_max, bi, handle);
12377 }
12378
12379 if (bi->version >= projected_last_update) {
12380 dout(10) << __func__<< ": bi is current " << dendl;
12381 assert(bi->version == projected_last_update);
12382 } else if (bi->version >= info.log_tail) {
12383 if (pg_log.get_log().empty() && projected_log.empty()) {
12384 /* Because we don't move log_tail on split, the log might be
12385 * empty even if log_tail != last_update. However, the only
12386 * way to get here with an empty log is if log_tail is actually
12387 * eversion_t(), because otherwise the entry which changed
12388 * last_update since the last scan would have to be present.
12389 */
12390 assert(bi->version == eversion_t());
12391 return;
12392 }
12393
12394 dout(10) << __func__<< ": bi is old, (" << bi->version
12395 << ") can be updated with log to projected_last_update "
12396 << projected_last_update << dendl;
12397
12398 auto func = [&](const pg_log_entry_t &e) {
12399 dout(10) << __func__ << ": updating from version " << e.version
12400 << dendl;
12401 const hobject_t &soid = e.soid;
12402 if (soid >= bi->begin &&
12403 soid < bi->end) {
12404 if (e.is_update()) {
12405 dout(10) << __func__ << ": " << e.soid << " updated to version "
12406 << e.version << dendl;
12407 bi->objects.erase(e.soid);
12408 bi->objects.insert(
12409 make_pair(
12410 e.soid,
12411 e.version));
12412 } else if (e.is_delete()) {
12413 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12414 bi->objects.erase(e.soid);
12415 }
12416 }
12417 };
12418 dout(10) << "scanning pg log first" << dendl;
12419 pg_log.get_log().scan_log_after(bi->version, func);
12420 dout(10) << "scanning projected log" << dendl;
12421 projected_log.scan_log_after(bi->version, func);
12422 bi->version = projected_last_update;
12423 } else {
12424 assert(0 == "scan_range should have raised bi->version past log_tail");
12425 }
12426 }
12427
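/*
 * List between min and max objects starting at bi->begin and record
 * their versions in bi->objects, setting bi->end past the last item
 * scanned; versions come from cached object contexts when available,
 * otherwise from the OI_ATTR xattr on disk.
 */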
12428 void PrimaryLogPG::scan_range(
12429 int min, int max, BackfillInterval *bi,
12430 ThreadPool::TPHandle &handle)
12431 {
12432 assert(is_locked());
12433 dout(10) << "scan_range from " << bi->begin << dendl;
12434 bi->clear_objects();
12435
12436 vector<hobject_t> ls;
12437 ls.reserve(max);
12438 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12439 assert(r >= 0);
12440 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12441 dout(20) << ls << dendl;
12442
12443 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12444 handle.reset_tp_timeout();
12445 ObjectContextRef obc;
12446 if (is_primary())
12447 obc = object_contexts.lookup(*p);
12448 if (obc) {
12449 bi->objects[*p] = obc->obs.oi.version;
12450 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12451 } else {
12452 bufferlist bl;
12453 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12454
12455 /* If the object does not exist here, it must have been removed
12456	       * between the objects_list_partial call and here. This can happen
12457 * for the first item in the range, which is usually last_backfill.
12458 */
12459 if (r == -ENOENT)
12460 continue;
12461
12462 assert(r >= 0);
12463 object_info_t oi(bl);
12464 bi->objects[*p] = oi.version;
12465 dout(20) << " " << *p << " " << oi.version << dendl;
12466 }
12467 }
12468 }
12469
12470
12471 /** check_local
12472 *
12473 * verifies that stray objects have been deleted
12474 */
12475 void PrimaryLogPG::check_local()
12476 {
12477 dout(10) << __func__ << dendl;
12478
12479 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12480
12481 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12482 return;
12483
12484 // just scan the log.
12485 set<hobject_t> did;
12486 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12487 p != pg_log.get_log().log.rend();
12488 ++p) {
12489 if (did.count(p->soid))
12490 continue;
12491 did.insert(p->soid);
12492
12493 if (p->is_delete() && !is_missing_object(p->soid)) {
12494 dout(10) << " checking " << p->soid
12495 << " at " << p->version << dendl;
12496 struct stat st;
12497 int r = osd->store->stat(
12498 ch,
12499 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12500 &st);
12501 if (r != -ENOENT) {
12502 derr << __func__ << " " << p->soid << " exists, but should have been "
12503 << "deleted" << dendl;
12504 assert(0 == "erroneously present object");
12505 }
12506 } else {
12507 // ignore old(+missing) objects
12508 }
12509 }
12510 }
12511
12512
12513
12514 // ===========================
12515 // hit sets
12516
12517 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12518 {
12519 ostringstream ss;
12520 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12521 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12522 info.pgid.ps(), info.pgid.pool(),
12523 cct->_conf->osd_hit_set_namespace);
12524 dout(20) << __func__ << " " << hoid << dendl;
12525 return hoid;
12526 }
12527
12528 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12529 utime_t end,
12530 bool using_gmt)
12531 {
12532 ostringstream ss;
12533 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12534 if (using_gmt) {
12535 start.gmtime(ss) << "_";
12536 end.gmtime(ss);
12537 } else {
12538 start.localtime(ss) << "_";
12539 end.localtime(ss);
12540 }
12541 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12542 info.pgid.ps(), info.pgid.pool(),
12543 cct->_conf->osd_hit_set_namespace);
12544 dout(20) << __func__ << " " << hoid << dendl;
12545 return hoid;
12546 }
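// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the archive-object naming scheme:
// "hit_set_<pgid>_archive_<start>_<end>" in the reserved hit set
// namespace. std::put_time stands in for utime_t's stream output, so
// the exact timestamp format differs; only the name shape is the point.
//
//   #include <ctime>
//   #include <iomanip>
//   #include <iostream>
//   #include <sstream>
//   #include <string>
//
//   static std::string archive_name(const std::string &pgid,
//                                   std::time_t start, std::time_t end)
//   {
//     std::ostringstream ss;
//     ss << "hit_set_" << pgid << "_archive_"
//        << std::put_time(std::gmtime(&start), "%F %T") << "_"
//        << std::put_time(std::gmtime(&end), "%F %T");
//     return ss.str();
//   }
//
//   int main() {
//     std::cout << archive_name("1.0", 0, 3600) << "\n";
//   }
// ---------------------------------------------------------------------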
12547
12548 void PrimaryLogPG::hit_set_clear()
12549 {
12550 dout(20) << __func__ << dendl;
12551 hit_set.reset();
12552 hit_set_start_stamp = utime_t();
12553 }
12554
12555 void PrimaryLogPG::hit_set_setup()
12556 {
12557 if (!is_active() ||
12558 !is_primary()) {
12559 hit_set_clear();
12560 return;
12561 }
12562
12563 if (is_active() && is_primary() &&
12564 (!pool.info.hit_set_count ||
12565 !pool.info.hit_set_period ||
12566 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12567 hit_set_clear();
12568
12569 // only primary is allowed to remove all the hit set objects
12570 hit_set_remove_all();
12571 return;
12572 }
12573
12574 // FIXME: discard any previous data for now
12575 hit_set_create();
12576
12577 // include any writes we know about from the pg log. this doesn't
12578 // capture reads, but it is better than nothing!
12579 hit_set_apply_log();
12580 }
12581
12582 void PrimaryLogPG::hit_set_remove_all()
12583 {
12584 // If any archives are degraded or blocked by scrub we skip this entirely
12585 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12586 p != info.hit_set.history.end();
12587 ++p) {
12588 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12589
12590 // Once we hit a degraded or blocked object just bail out
12591 if (is_degraded_or_backfilling_object(aoid))
12592 return;
12593 if (scrubber.write_blocked_by_scrub(aoid))
12594 return;
12595 }
12596
12597 if (!info.hit_set.history.empty()) {
12598 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12599 assert(p != info.hit_set.history.rend());
12600 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12601 assert(!is_degraded_or_backfilling_object(oid));
12602 ObjectContextRef obc = get_object_context(oid, false);
12603 assert(obc);
12604
12605 OpContextUPtr ctx = simple_opc_create(obc);
12606 ctx->at_version = get_next_version();
12607 ctx->updated_hset_history = info.hit_set;
12608 utime_t now = ceph_clock_now();
12609 ctx->mtime = now;
12610 hit_set_trim(ctx, 0);
12611 simple_opc_submit(std::move(ctx));
12612 }
12613
12614 info.hit_set = pg_hit_set_history_t();
12615 if (agent_state) {
12616 agent_state->discard_hit_sets();
12617 }
12618 }
12619
12620 void PrimaryLogPG::hit_set_create()
12621 {
12622 utime_t now = ceph_clock_now();
12623 // make a copy of the params to modify
12624 HitSet::Params params(pool.info.hit_set_params);
12625
12626 dout(20) << __func__ << " " << params << dendl;
12627 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12628 BloomHitSet::Params *p =
12629 static_cast<BloomHitSet::Params*>(params.impl.get());
12630
12631 // convert false positive rate so it holds up across the full period
12632 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12633 if (p->get_fpp() <= 0.0)
12634 p->set_fpp(.01); // fpp cannot be zero!
12635
12636 // if we don't have a specified size, estimate the target size based
12637 // on the previous bin!
12638 if (p->target_size == 0 && hit_set) {
12639 utime_t dur = now - hit_set_start_stamp;
12640 unsigned unique = hit_set->approx_unique_insert_count();
12641 dout(20) << __func__ << " previous set had approx " << unique
12642 << " unique items over " << dur << " seconds" << dendl;
12643 p->target_size = (double)unique * (double)pool.info.hit_set_period
12644 / (double)dur;
12645 }
12646 if (p->target_size <
12647 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12648 p->target_size = cct->_conf->osd_hit_set_min_size;
12649
12650 if (p->target_size
12651 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12652 p->target_size = cct->_conf->osd_hit_set_max_size;
12653
12654 p->seed = now.sec();
12655
12656 dout(10) << __func__ << " target_size " << p->target_size
12657 << " fpp " << p->get_fpp() << dendl;
12658 }
12659 hit_set.reset(new HitSet(params));
12660 hit_set_start_stamp = now;
12661 }
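// ---------------------------------------------------------------------
// [Editor's note] The bloom sizing logic above, reduced to a hedged
// standalone helper: estimate the next filter's target size from how
// many unique inserts the previous interval saw, scaled to the full
// hit_set_period, then clamp to the configured min/max.
// `estimate_target_size` is a hypothetical name.
//
//   #include <algorithm>
//   #include <cstdint>
//
//   static uint64_t estimate_target_size(uint64_t prev_unique,
//                                        uint64_t period_sec,
//                                        uint64_t prev_duration_sec,
//                                        uint64_t min_size,
//                                        uint64_t max_size)
//   {
//     uint64_t est = prev_duration_sec
//       ? prev_unique * period_sec / prev_duration_sec
//       : min_size;                       // no history: start at the floor
//     return std::min(std::max(est, min_size), max_size);
//   }
// ---------------------------------------------------------------------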
12662
12663 /**
12664 * apply log entries to set
12665 *
12666 * this would only happen after peering, to at least capture writes
12667 * during an interval that was potentially lost.
12668 */
12669 bool PrimaryLogPG::hit_set_apply_log()
12670 {
12671 if (!hit_set)
12672 return false;
12673
12674 eversion_t to = info.last_update;
12675 eversion_t from = info.hit_set.current_last_update;
12676 if (to <= from) {
12677 dout(20) << __func__ << " no update" << dendl;
12678 return false;
12679 }
12680
12681 dout(20) << __func__ << " " << from << " .. " << to << dendl;
12682 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12683 while (p != pg_log.get_log().log.rend() && p->version > to)
12684 ++p;
12685 while (p != pg_log.get_log().log.rend() && p->version > from) {
12686 hit_set->insert(p->soid);
12687 ++p;
12688 }
12689
12690 return true;
12691 }
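// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the reverse log scan above:
// collect the objects written in the half-open version range (from, to],
// walking newest-to-oldest. `Entry` and `touched` are hypothetical;
// real entries are pg_log_entry_t and versions are eversion_t.
//
//   #include <list>
//   #include <set>
//   #include <string>
//   #include <utility>
//
//   using Entry = std::pair<unsigned, std::string>;  // (version, oid)
//
//   static std::set<std::string> touched(const std::list<Entry> &log,
//                                        unsigned from, unsigned to)
//   {
//     std::set<std::string> out;
//     auto p = log.rbegin();
//     while (p != log.rend() && p->first > to)
//       ++p;                              // skip entries newer than 'to'
//     while (p != log.rend() && p->first > from) {
//       out.insert(p->second);            // a write inside (from, to]
//       ++p;
//     }
//     return out;
//   }
// ---------------------------------------------------------------------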
12692
12693 void PrimaryLogPG::hit_set_persist()
12694 {
12695 dout(10) << __func__ << dendl;
12696 bufferlist bl;
12697 unsigned max = pool.info.hit_set_count;
12698
12699 utime_t now = ceph_clock_now();
12700 hobject_t oid;
12701
12702 // If any archives are degraded or blocked by scrub we skip this persist request
12703 // account for the additional entry being added below
12704 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12705 p != info.hit_set.history.end();
12706 ++p) {
12707 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12708
12709 // Once we hit a degraded or blocked object just skip further trim
12710 if (is_degraded_or_backfilling_object(aoid))
12711 return;
12712 if (scrubber.write_blocked_by_scrub(aoid))
12713 return;
12714 }
12715
12716 // If backfill is in progress and we could possibly overlap with the
12717 // hit_set_* objects, back off. Since these all have
12718 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12719 // look just at that. This is necessary because our transactions
12720 // may include a modify of the new hit_set *and* a delete of the
12721 // old one, and this may span the backfill boundary.
12722 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12723 p != backfill_targets.end();
12724 ++p) {
12725 assert(peer_info.count(*p));
12726 const pg_info_t& pi = peer_info[*p];
12727 if (pi.last_backfill == hobject_t() ||
12728 pi.last_backfill.get_hash() == info.pgid.ps()) {
12729 dout(10) << __func__ << " backfill target osd." << *p
12730 << " last_backfill has not progressed past pgid ps"
12731 << dendl;
12732 return;
12733 }
12734 }
12735
12736
12737 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12738 new_hset.begin = hit_set_start_stamp;
12739 new_hset.end = now;
12740 oid = get_hit_set_archive_object(
12741 new_hset.begin,
12742 new_hset.end,
12743 new_hset.using_gmt);
12744
12745 // If the current object is blocked by a scrub we skip this persist request
12746 if (scrubber.write_blocked_by_scrub(oid))
12747 return;
12748
12749 hit_set->seal();
12750 ::encode(*hit_set, bl);
12751 dout(20) << __func__ << " archive " << oid << dendl;
12752
12753 if (agent_state) {
12754 agent_state->add_hit_set(new_hset.begin, hit_set);
12755 uint32_t size = agent_state->hit_set_map.size();
12756 if (size >= pool.info.hit_set_count) {
12757 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12758 }
12759 hit_set_in_memory_trim(size);
12760 }
12761
12762 ObjectContextRef obc = get_object_context(oid, true);
12763 OpContextUPtr ctx = simple_opc_create(obc);
12764
12765 ctx->at_version = get_next_version();
12766 ctx->updated_hset_history = info.hit_set;
12767 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12768
12769 updated_hit_set_hist.current_last_update = info.last_update;
12770 new_hset.version = ctx->at_version;
12771
12772 updated_hit_set_hist.history.push_back(new_hset);
12773 hit_set_create();
12774
12775 // fabricate an object_info_t and SnapSet
12776 obc->obs.oi.version = ctx->at_version;
12777 obc->obs.oi.mtime = now;
12778 obc->obs.oi.size = bl.length();
12779 obc->obs.exists = true;
12780 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12781
12782 ctx->new_obs = obc->obs;
12783
12784 obc->ssc->snapset.head_exists = true;
12785 ctx->new_snapset = obc->ssc->snapset;
12786
12787 ctx->delta_stats.num_objects++;
12788 ctx->delta_stats.num_objects_hit_set_archive++;
12789 ctx->delta_stats.num_bytes += bl.length();
12790 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12791
12792 bufferlist bss;
12793 ::encode(ctx->new_snapset, bss);
12794 bufferlist boi(sizeof(ctx->new_obs.oi));
12795 ::encode(ctx->new_obs.oi, boi,
12796 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12797
12798 ctx->op_t->create(oid);
12799 if (bl.length()) {
12800 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12801 }
12802 map <string, bufferlist> attrs;
12803 attrs[OI_ATTR].claim(boi);
12804 attrs[SS_ATTR].claim(bss);
12805 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12806 ctx->log.push_back(
12807 pg_log_entry_t(
12808 pg_log_entry_t::MODIFY,
12809 oid,
12810 ctx->at_version,
12811 eversion_t(),
12812 0,
12813 osd_reqid_t(),
12814 ctx->mtime,
12815 0)
12816 );
12817
12818 hit_set_trim(ctx, max);
12819
12820 simple_opc_submit(std::move(ctx));
12821 }
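// ---------------------------------------------------------------------
// [Editor's note] The backfill-boundary guard above, as a hedged
// standalone predicate: every hit_set_* object hashes to the PG's ps()
// and that hash sorts first, so persisting is safe only once every
// backfill target has progressed past it. `Peer` models just the two
// facts the loop checks (backfill started; hash position reached).
//
//   #include <cstdint>
//   #include <vector>
//
//   struct Peer { bool started; uint32_t last_backfill_hash; };
//
//   static bool safe_to_write_hit_sets(const std::vector<Peer> &peers,
//                                      uint32_t pg_ps)
//   {
//     for (const Peer &p : peers) {
//       if (!p.started || p.last_backfill_hash == pg_ps)
//         return false;   // a target could still receive these objects
//     }
//     return true;
//   }
// ---------------------------------------------------------------------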
12822
12823 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12824 {
12825 assert(ctx->updated_hset_history);
12826 pg_hit_set_history_t &updated_hit_set_hist =
12827 *(ctx->updated_hset_history);
12828 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12829 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12830 assert(p != updated_hit_set_hist.history.end());
12831 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12832
12833 assert(!is_degraded_or_backfilling_object(oid));
12834
12835 dout(20) << __func__ << " removing " << oid << dendl;
12836 ++ctx->at_version.version;
12837 ctx->log.push_back(
12838 pg_log_entry_t(pg_log_entry_t::DELETE,
12839 oid,
12840 ctx->at_version,
12841 p->version,
12842 0,
12843 osd_reqid_t(),
12844 ctx->mtime,
12845 0));
12846
12847 ctx->op_t->remove(oid);
12848 updated_hit_set_hist.history.pop_front();
12849
12850 ObjectContextRef obc = get_object_context(oid, false);
12851 assert(obc);
12852 --ctx->delta_stats.num_objects;
12853 --ctx->delta_stats.num_objects_hit_set_archive;
12854 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12855 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12856 }
12857 }
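// ---------------------------------------------------------------------
// [Editor's note] hit_set_trim() above, minus the log entries and stat
// accounting, as a hedged sketch: drop the oldest history entries until
// at most 'max' remain, reporting what was removed. `trim_history` is a
// hypothetical helper.
//
//   #include <cstddef>
//   #include <list>
//   #include <string>
//
//   static std::list<std::string> trim_history(std::list<std::string> &hist,
//                                              std::size_t max)
//   {
//     std::list<std::string> removed;
//     while (hist.size() > max) {
//       removed.push_back(hist.front());  // oldest entries go first
//       hist.pop_front();
//     }
//     return removed;
//   }
// ---------------------------------------------------------------------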
12858
12859 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12860 {
12861 while (agent_state->hit_set_map.size() > max_in_memory) {
12862 agent_state->remove_oldest_hit_set();
12863 }
12864 }
12865
12866
12867 // =======================================
12868 // cache agent
12869
12870 void PrimaryLogPG::agent_setup()
12871 {
12872 assert(is_locked());
12873 if (!is_active() ||
12874 !is_primary() ||
12875 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12876 pool.info.tier_of < 0 ||
12877 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12878 agent_clear();
12879 return;
12880 }
12881 if (!agent_state) {
12882 agent_state.reset(new TierAgentState);
12883
12884 // choose random starting position
12885 agent_state->position = hobject_t();
12886 agent_state->position.pool = info.pgid.pool();
12887 agent_state->position.set_hash(pool.info.get_random_pg_position(
12888 info.pgid.pgid,
12889 rand()));
12890 agent_state->start = agent_state->position;
12891
12892 dout(10) << __func__ << " allocated new state, position "
12893 << agent_state->position << dendl;
12894 } else {
12895 dout(10) << __func__ << " keeping existing state" << dendl;
12896 }
12897
12898 if (info.stats.stats_invalid) {
12899 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12900 }
12901
12902 agent_choose_mode();
12903 }
12904
12905 void PrimaryLogPG::agent_clear()
12906 {
12907 agent_stop();
12908 agent_state.reset(NULL);
12909 }
12910
12911 // Return false if no objects were operated on since the start of the object hash space
12912 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12913 {
12914 lock();
12915 if (!agent_state) {
12916 dout(10) << __func__ << " no agent state, stopping" << dendl;
12917 unlock();
12918 return true;
12919 }
12920
12921 assert(!deleting);
12922
12923 if (agent_state->is_idle()) {
12924 dout(10) << __func__ << " idle, stopping" << dendl;
12925 unlock();
12926 return true;
12927 }
12928
12929 osd->logger->inc(l_osd_agent_wake);
12930
12931 dout(10) << __func__
12932 << " max " << start_max
12933 << ", flush " << agent_state->get_flush_mode_name()
12934 << ", evict " << agent_state->get_evict_mode_name()
12935 << ", pos " << agent_state->position
12936 << dendl;
12937 assert(is_primary());
12938 assert(is_active());
12939
12940 agent_load_hit_sets();
12941
12942 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12943 assert(base_pool);
12944
12945 int ls_min = 1;
12946 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12947
12948 // list some objects. this conveniently lists clones (oldest to
12949 // newest) before heads... the same order we want to flush in.
12950 //
12951 // NOTE: do not flush the Sequencer. we will assume that the
12952 // listing we get back is imprecise.
12953 vector<hobject_t> ls;
12954 hobject_t next;
12955 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12956 &ls, &next);
12957 assert(r >= 0);
12958 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12959 int started = 0;
12960 for (vector<hobject_t>::iterator p = ls.begin();
12961 p != ls.end();
12962 ++p) {
12963 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12964 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12965 osd->logger->inc(l_osd_agent_skip);
12966 continue;
12967 }
12968 if (is_degraded_or_backfilling_object(*p)) {
12969 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12970 osd->logger->inc(l_osd_agent_skip);
12971 continue;
12972 }
12973 if (is_missing_object(p->get_head())) {
12974 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12975 osd->logger->inc(l_osd_agent_skip);
12976 continue;
12977 }
12978 ObjectContextRef obc = get_object_context(*p, false, NULL);
12979 if (!obc) {
12980 // we didn't flush; we may miss something here.
12981 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12982 osd->logger->inc(l_osd_agent_skip);
12983 continue;
12984 }
12985 if (!obc->obs.exists) {
12986 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12987 osd->logger->inc(l_osd_agent_skip);
12988 continue;
12989 }
12990 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12991 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12992 osd->logger->inc(l_osd_agent_skip);
12993 continue;
12994 }
12995 if (obc->is_blocked()) {
12996 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12997 osd->logger->inc(l_osd_agent_skip);
12998 continue;
12999 }
13000 if (obc->is_request_pending()) {
13001 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13002 osd->logger->inc(l_osd_agent_skip);
13003 continue;
13004 }
13005
13006 // be careful flushing omap to an EC pool.
13007 if (!base_pool->supports_omap() &&
13008 obc->obs.oi.is_omap()) {
13009 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
13010 osd->logger->inc(l_osd_agent_skip);
13011 continue;
13012 }
13013
13014 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
13015 agent_maybe_evict(obc, false))
13016 ++started;
13017 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
13018 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
13019 ++started;
13020 --agent_flush_quota;
13021 }
13022 if (started >= start_max) {
13023 // If finishing early, set "next" to the next object
13024 if (++p != ls.end())
13025 next = *p;
13026 break;
13027 }
13028 }
13029
13030 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
13031 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13032 agent_state->hist_age = 0;
13033 agent_state->temp_hist.decay();
13034 }
13035
13036 // Total objects operated on so far
13037 int total_started = agent_state->started + started;
13038 bool need_delay = false;
13039
13040 dout(20) << __func__ << " start pos " << agent_state->position
13041 << " next start pos " << next
13042 << " started " << total_started << dendl;
13043
13044 // See if we've made a full pass over the object hash space.
13045 // This might check at most ls_max objects a second time to notice that
13046 // we've checked every object at least once.
13047 if (agent_state->position < agent_state->start &&
13048 next >= agent_state->start) {
13049 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13050 if (total_started == 0)
13051 need_delay = true;
13052 else
13053 total_started = 0;
13054 agent_state->start = next;
13055 }
13056 agent_state->started = total_started;
13057
13058 // See if we are starting from the beginning
13059 if (next.is_max())
13060 agent_state->position = hobject_t();
13061 else
13062 agent_state->position = next;
13063
13064 // Discard old in memory HitSets
13065 hit_set_in_memory_trim(pool.info.hit_set_count);
13066
13067 if (need_delay) {
13068 assert(agent_state->delaying == false);
13069 agent_delay();
13070 unlock();
13071 return false;
13072 }
13073 agent_choose_mode();
13074 unlock();
13075 return true;
13076 }
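// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the wrap-around test agent_work()
// applies above: the agent sweeps a circular hash space, and a pass is
// complete when the cursor, having wrapped below its recorded start,
// advances back to or past it. If a whole pass started no work, the
// agent backs off instead of spinning. Function names are hypothetical.
//
//   #include <cstdint>
//
//   static bool completed_full_pass(uint32_t start, uint32_t position,
//                                   uint32_t next)
//   {
//     return position < start && next >= start;
//   }
//
//   static bool should_delay(uint32_t start, uint32_t position,
//                            uint32_t next, int total_started)
//   {
//     return completed_full_pass(start, position, next) &&
//            total_started == 0;          // full lap, nothing to do
//   }
// ---------------------------------------------------------------------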
13077
13078 void PrimaryLogPG::agent_load_hit_sets()
13079 {
13080 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13081 return;
13082 }
13083
13084 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13085 dout(10) << __func__ << dendl;
13086 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13087 p != info.hit_set.history.end(); ++p) {
13088 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13089 dout(10) << __func__ << " loading " << p->begin << "-"
13090 << p->end << dendl;
13091 if (!pool.info.is_replicated()) {
13092 // FIXME: EC not supported here yet
13093 derr << __func__ << " on non-replicated pool" << dendl;
13094 break;
13095 }
13096
13097 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13098 if (is_unreadable_object(oid)) {
13099 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13100 break;
13101 }
13102
13103 ObjectContextRef obc = get_object_context(oid, false);
13104 if (!obc) {
13105 derr << __func__ << ": could not load hitset " << oid << dendl;
13106 break;
13107 }
13108
13109 bufferlist bl;
13110 {
13111 obc->ondisk_read_lock();
13112 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13113 assert(r >= 0);
13114 obc->ondisk_read_unlock();
13115 }
13116 HitSetRef hs(new HitSet);
13117 bufferlist::iterator pbl = bl.begin();
13118 ::decode(*hs, pbl);
13119 agent_state->add_hit_set(p->begin.sec(), hs);
13120 }
13121 }
13122 }
13123 }
13124
13125 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13126 {
13127 if (!obc->obs.oi.is_dirty()) {
13128 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13129 osd->logger->inc(l_osd_agent_skip);
13130 return false;
13131 }
13132 if (obc->obs.oi.is_cache_pinned()) {
13133 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13134 osd->logger->inc(l_osd_agent_skip);
13135 return false;
13136 }
13137
13138 utime_t now = ceph_clock_now();
13139 utime_t ob_local_mtime;
13140 if (obc->obs.oi.local_mtime != utime_t()) {
13141 ob_local_mtime = obc->obs.oi.local_mtime;
13142 } else {
13143 ob_local_mtime = obc->obs.oi.mtime;
13144 }
13145 bool evict_mode_full =
13146 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13147 if (!evict_mode_full &&
13148 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13149 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13150 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13151 osd->logger->inc(l_osd_agent_skip);
13152 return false;
13153 }
13154
13155 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13156 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13157 osd->logger->inc(l_osd_agent_skip);
13158 return false;
13159 }
13160
13161 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13162
13163 // FIXME: flush anything dirty, regardless of what distribution of
13164 // ages we expect.
13165
13166 hobject_t oid = obc->obs.oi.soid;
13167 osd->agent_start_op(oid);
13168 // no need to capture a pg ref, can't outlive fop or ctx
13169 std::function<void()> on_flush = [this, oid]() {
13170 osd->agent_finish_op(oid);
13171 };
13172
13173 int result = start_flush(
13174 OpRequestRef(), obc, false, NULL,
13175 on_flush);
13176 if (result != -EINPROGRESS) {
13177 on_flush();
13178 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13179 << " with " << result << dendl;
13180 osd->logger->inc(l_osd_agent_skip);
13181 return false;
13182 }
13183
13184 osd->logger->inc(l_osd_agent_flush);
13185 return true;
13186 }
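// ---------------------------------------------------------------------
// [Editor's note] The flush age gate above, reduced to plain time_t
// arithmetic as a hedged sketch: a head object is "too young" to flush
// until cache_min_flush_age seconds have passed since its last local
// write; snapshots are immutable and never delayed, and full-evict mode
// overrides the gate. `too_young_to_flush` is a hypothetical name.
//
//   #include <ctime>
//
//   static bool too_young_to_flush(std::time_t now, std::time_t local_mtime,
//                                  long min_flush_age_sec,
//                                  bool is_snap, bool evict_mode_full)
//   {
//     if (evict_mode_full || is_snap)
//       return false;                     // gate does not apply
//     return local_mtime + min_flush_age_sec > now;
//   }
// ---------------------------------------------------------------------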
13187
13188 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13189 {
13190 const hobject_t& soid = obc->obs.oi.soid;
13191 if (!after_flush && obc->obs.oi.is_dirty()) {
13192 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13193 return false;
13194 }
13195 if (!obc->obs.oi.watchers.empty()) {
13196 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13197 return false;
13198 }
13199 if (obc->is_blocked()) {
13200 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13201 return false;
13202 }
13203 if (obc->obs.oi.is_cache_pinned()) {
13204 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13205 return false;
13206 }
13207
13208 if (soid.snap == CEPH_NOSNAP) {
13209 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13210 if (result < 0) {
13211 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13212 return false;
13213 }
13214 }
13215
13216 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13217 // is this object older than cache_min_evict_age?
13218 utime_t now = ceph_clock_now();
13219 utime_t ob_local_mtime;
13220 if (obc->obs.oi.local_mtime != utime_t()) {
13221 ob_local_mtime = obc->obs.oi.local_mtime;
13222 } else {
13223 ob_local_mtime = obc->obs.oi.mtime;
13224 }
13225 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13226 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13227 osd->logger->inc(l_osd_agent_skip);
13228 return false;
13229 }
13230 // is this object old and/or cold enough?
13231 int temp = 0;
13232 uint64_t temp_upper = 0, temp_lower = 0;
13233 if (hit_set)
13234 agent_estimate_temp(soid, &temp);
13235 agent_state->temp_hist.add(temp);
13236 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13237
13238 dout(20) << __func__
13239 << " temp " << temp
13240 << " pos " << temp_lower << "-" << temp_upper
13241 << ", evict_effort " << agent_state->evict_effort
13242 << dendl;
13243 dout(30) << "agent_state:\n";
13244 Formatter *f = Formatter::create("");
13245 f->open_object_section("agent_state");
13246 agent_state->dump(f);
13247 f->close_section();
13248 f->flush(*_dout);
13249 delete f;
13250 *_dout << dendl;
13251
13252 if (1000000 - temp_upper >= agent_state->evict_effort)
13253 return false;
13254 }
13255
13256 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13257 OpContextUPtr ctx = simple_opc_create(obc);
13258
13259 if (!ctx->lock_manager.get_lock_type(
13260 ObjectContext::RWState::RWWRITE,
13261 obc->obs.oi.soid,
13262 obc,
13263 OpRequestRef())) {
13264 close_op_ctx(ctx.release());
13265 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13266 return false;
13267 }
13268
13269 osd->agent_start_evict_op();
13270 ctx->register_on_finish(
13271 [this]() {
13272 osd->agent_finish_evict_op();
13273 });
13274
13275 ctx->at_version = get_next_version();
13276 assert(ctx->new_obs.exists);
13277 int r = _delete_oid(ctx.get(), true, false);
13278 if (obc->obs.oi.is_omap())
13279 ctx->delta_stats.num_objects_omap--;
13280 ctx->delta_stats.num_evict++;
13281 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13282 if (obc->obs.oi.is_dirty())
13283 --ctx->delta_stats.num_objects_dirty;
13284 assert(r == 0);
13285 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13286 simple_opc_submit(std::move(ctx));
13287 osd->logger->inc(l_osd_tier_evict);
13288 osd->logger->inc(l_osd_agent_evict);
13289 return true;
13290 }
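// ---------------------------------------------------------------------
// [Editor's note] The temperature test above, in isolation, as a hedged
// sketch: temp_upper is the object's position in the temperature
// histogram in parts-per-million (higher = hotter) and evict_effort is
// the agent's aggressiveness, also in ppm. Only objects in the coldest
// evict_effort-sized slice qualify for eviction.
//
//   #include <cstdint>
//
//   static bool cold_enough_to_evict(uint64_t temp_upper_ppm,
//                                    uint64_t evict_effort_ppm)
//   {
//     return 1000000 - temp_upper_ppm < evict_effort_ppm;
//   }
// ---------------------------------------------------------------------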
13291
13292 void PrimaryLogPG::agent_stop()
13293 {
13294 dout(20) << __func__ << dendl;
13295 if (agent_state && !agent_state->is_idle()) {
13296 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13297 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13298 osd->agent_disable_pg(this, agent_state->evict_effort);
13299 }
13300 }
13301
13302 void PrimaryLogPG::agent_delay()
13303 {
13304 dout(20) << __func__ << dendl;
13305 if (agent_state && !agent_state->is_idle()) {
13306 assert(agent_state->delaying == false);
13307 agent_state->delaying = true;
13308 osd->agent_disable_pg(this, agent_state->evict_effort);
13309 }
13310 }
13311
13312 void PrimaryLogPG::agent_choose_mode_restart()
13313 {
13314 dout(20) << __func__ << dendl;
13315 lock();
13316 if (agent_state && agent_state->delaying) {
13317 agent_state->delaying = false;
13318 agent_choose_mode(true);
13319 }
13320 unlock();
13321 }
13322
13323 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13324 {
13325 bool requeued = false;
13326 // Let delay play out
13327 if (agent_state->delaying) {
13328 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13329 return requeued;
13330 }
13331
13332 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13333 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13334 unsigned evict_effort = 0;
13335
13336 if (info.stats.stats_invalid) {
13337 // idle; stats can't be trusted until we scrub.
13338 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13339 goto skip_calc;
13340 }
13341
13342 {
13343 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13344 assert(divisor > 0);
13345
13346 // adjust (effective) user objects down based on the number
13347 // of HitSet objects, which should not count toward our total since
13348 // they cannot be flushed.
13349 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13350
13351 // also exclude omap objects if ec backing pool
13352 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13353 assert(base_pool);
13354 if (!base_pool->supports_omap())
13355 unflushable += info.stats.stats.sum.num_objects_omap;
13356
13357 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13358 if (num_user_objects > unflushable)
13359 num_user_objects -= unflushable;
13360 else
13361 num_user_objects = 0;
13362
13363 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13364 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13365 num_user_bytes -= unflushable_bytes;
13366 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13367 num_user_bytes += num_overhead_bytes;
13368
13369 // also reduce the num_dirty by num_objects_omap
13370 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13371 if (!base_pool->supports_omap()) {
13372 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13373 num_dirty -= info.stats.stats.sum.num_objects_omap;
13374 else
13375 num_dirty = 0;
13376 }
13377
13378 dout(10) << __func__
13379 << " flush_mode: "
13380 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13381 << " evict_mode: "
13382 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13383 << " num_objects: " << info.stats.stats.sum.num_objects
13384 << " num_bytes: " << info.stats.stats.sum.num_bytes
13385 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13386 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13387 << " num_dirty: " << num_dirty
13388 << " num_user_objects: " << num_user_objects
13389 << " num_user_bytes: " << num_user_bytes
13390 << " num_overhead_bytes: " << num_overhead_bytes
13391 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13392 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13393 << dendl;
13394
13395 // get dirty, full ratios
13396 uint64_t dirty_micro = 0;
13397 uint64_t full_micro = 0;
13398 if (pool.info.target_max_bytes && num_user_objects > 0) {
13399 uint64_t avg_size = num_user_bytes / num_user_objects;
13400 dirty_micro =
13401 num_dirty * avg_size * 1000000 /
13402 MAX(pool.info.target_max_bytes / divisor, 1);
13403 full_micro =
13404 num_user_objects * avg_size * 1000000 /
13405 MAX(pool.info.target_max_bytes / divisor, 1);
13406 }
13407 if (pool.info.target_max_objects > 0) {
13408 uint64_t dirty_objects_micro =
13409 num_dirty * 1000000 /
13410 MAX(pool.info.target_max_objects / divisor, 1);
13411 if (dirty_objects_micro > dirty_micro)
13412 dirty_micro = dirty_objects_micro;
13413 uint64_t full_objects_micro =
13414 num_user_objects * 1000000 /
13415 MAX(pool.info.target_max_objects / divisor, 1);
13416 if (full_objects_micro > full_micro)
13417 full_micro = full_objects_micro;
13418 }
13419 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13420 << " full " << ((float)full_micro / 1000000.0)
13421 << dendl;
13422
13423 // flush mode
13424 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13425 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13426 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13427 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13428 flush_target += flush_slop;
13429 flush_high_target += flush_slop;
13430 } else {
13431 flush_target -= MIN(flush_target, flush_slop);
13432 flush_high_target -= MIN(flush_high_target, flush_slop);
13433 }
13434
13435 if (dirty_micro > flush_high_target) {
13436 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13437 } else if (dirty_micro > flush_target) {
13438 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13439 }
13440
13441 // evict mode
13442 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13443 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13444 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13445 evict_target += evict_slop;
13446 else
13447 evict_target -= MIN(evict_target, evict_slop);
13448
13449 if (full_micro > 1000000) {
13450 // evict anything clean
13451 evict_mode = TierAgentState::EVICT_MODE_FULL;
13452 evict_effort = 1000000;
13453 } else if (full_micro > evict_target) {
13454 // set effort in [0..1] range based on where we are between evict_target and completely full
13455 evict_mode = TierAgentState::EVICT_MODE_SOME;
13456 uint64_t over = full_micro - evict_target;
13457 uint64_t span = 1000000 - evict_target;
13458 evict_effort = MAX(over * 1000000 / span,
13459 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13460
13461 // quantize effort to avoid too much reordering in the agent_queue.
13462 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13463 assert(inc > 0);
13464 uint64_t was = evict_effort;
13465 evict_effort -= evict_effort % inc;
13466 if (evict_effort < inc)
13467 evict_effort = inc;
13468 assert(evict_effort >= inc && evict_effort <= 1000000);
13469 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13470 }
13471 }
13472
13473 skip_calc:
13474 bool old_idle = agent_state->is_idle();
13475 if (flush_mode != agent_state->flush_mode) {
13476 dout(5) << __func__ << " flush_mode "
13477 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13478 << " -> "
13479 << TierAgentState::get_flush_mode_name(flush_mode)
13480 << dendl;
13481 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13482 osd->agent_inc_high_count();
13483 info.stats.stats.sum.num_flush_mode_high = 1;
13484 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13485 info.stats.stats.sum.num_flush_mode_low = 1;
13486 }
13487 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13488 osd->agent_dec_high_count();
13489 info.stats.stats.sum.num_flush_mode_high = 0;
13490 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13491 info.stats.stats.sum.num_flush_mode_low = 0;
13492 }
13493 agent_state->flush_mode = flush_mode;
13494 }
13495 if (evict_mode != agent_state->evict_mode) {
13496 dout(5) << __func__ << " evict_mode "
13497 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13498 << " -> "
13499 << TierAgentState::get_evict_mode_name(evict_mode)
13500 << dendl;
13501 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13502 is_active()) {
13503 if (op)
13504 requeue_op(op);
13505 requeue_ops(waiting_for_flush);
13506 requeue_ops(waiting_for_active);
13507 requeue_ops(waiting_for_scrub);
13508 requeue_ops(waiting_for_cache_not_full);
13509 objects_blocked_on_cache_full.clear();
13510 requeued = true;
13511 }
13512 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13513 info.stats.stats.sum.num_evict_mode_some = 1;
13514 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13515 info.stats.stats.sum.num_evict_mode_full = 1;
13516 }
13517 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13518 info.stats.stats.sum.num_evict_mode_some = 0;
13519 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13520 info.stats.stats.sum.num_evict_mode_full = 0;
13521 }
13522 agent_state->evict_mode = evict_mode;
13523 }
13524 uint64_t old_effort = agent_state->evict_effort;
13525 if (evict_effort != agent_state->evict_effort) {
13526 dout(5) << __func__ << " evict_effort "
13527 << ((float)agent_state->evict_effort / 1000000.0)
13528 << " -> "
13529 << ((float)evict_effort / 1000000.0)
13530 << dendl;
13531 agent_state->evict_effort = evict_effort;
13532 }
13533
13534 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13535 // (including flush). This is probably fine (they should be
13536 // correlated) but it is not precisely correct.
13537 if (agent_state->is_idle()) {
13538 if (!restart && !old_idle) {
13539 osd->agent_disable_pg(this, old_effort);
13540 }
13541 } else {
13542 if (restart || old_idle) {
13543 osd->agent_enable_pg(this, agent_state->evict_effort);
13544 } else if (old_effort != agent_state->evict_effort) {
13545 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13546 }
13547 }
13548 return requeued;
13549 }
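// ---------------------------------------------------------------------
// [Editor's note] Two pieces of agent_choose_mode() above, as hedged
// standalone helpers. First, the slop hysteresis: an idle mode's
// threshold is raised before it can engage and lowered once engaged, so
// the agent doesn't flap around the target ratio. Second, the effort
// quantization that keeps small recalculations from reordering the
// shared agent queue. Both names are hypothetical.
//
//   #include <algorithm>
//   #include <cstdint>
//
//   static uint64_t effective_target(uint64_t target_ppm, double slop,
//                                    bool currently_idle)
//   {
//     uint64_t slop_ppm = (uint64_t)(target_ppm * slop);
//     return currently_idle
//       ? target_ppm + slop_ppm                          // harder to engage
//       : target_ppm - std::min(target_ppm, slop_ppm);   // harder to stop
//   }
//
//   static uint64_t quantize_effort(uint64_t effort_ppm, uint64_t inc_ppm)
//   {
//     effort_ppm -= effort_ppm % inc_ppm;   // snap down to a multiple
//     return std::max(effort_ppm, inc_ppm); // but never below one step
//   }
// ---------------------------------------------------------------------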
13550
13551 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13552 {
13553 assert(hit_set);
13554 assert(temp);
13555 *temp = 0;
13556 if (hit_set->contains(oid))
13557 *temp = 1000000;
13558 unsigned i = 0;
13559 int last_n = pool.info.hit_set_search_last_n;
13560 for (map<time_t,HitSetRef>::reverse_iterator p =
13561 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13562 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13563 if (p->second->contains(oid)) {
13564 *temp += pool.info.get_grade(i);
13565 --last_n;
13566 }
13567 }
13568 }
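// ---------------------------------------------------------------------
// [Editor's note] agent_estimate_temp() above as a hedged sketch over
// plain containers: full heat for a hit in the current set, then a
// per-age grade for hits in archived sets, newest first, stopping after
// search_last_n hits. The grade table is hypothetical; the pool
// supplies the real one via get_grade().
//
//   #include <ctime>
//   #include <map>
//   #include <set>
//   #include <string>
//   #include <vector>
//
//   static int estimate_temp(
//     const std::string &oid,
//     const std::set<std::string> &current,
//     const std::map<std::time_t, std::set<std::string>> &archived,
//     const std::vector<int> &grades,
//     int search_last_n)
//   {
//     int temp = current.count(oid) ? 1000000 : 0;
//     unsigned i = 0;
//     for (auto p = archived.rbegin();
//          search_last_n > 0 && p != archived.rend() && i < grades.size();
//          ++p, ++i) {
//       if (p->second.count(oid)) {
//         temp += grades[i];              // older sets contribute less
//         --search_last_n;
//       }
//     }
//     return temp;
//   }
// ---------------------------------------------------------------------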
13569
13570 // Dup op detection
13571
13572 bool PrimaryLogPG::already_complete(eversion_t v)
13573 {
13574 dout(20) << __func__ << ": " << v << dendl;
13575 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13576 !i.end();
13577 ++i) {
13578 dout(20) << __func__ << ": " << **i << dendl;
13579 // skip copy from temp object ops
13580 if ((*i)->v == eversion_t()) {
13581 dout(20) << __func__ << ": " << **i
13582 << " version is empty" << dendl;
13583 continue;
13584 }
13585 if ((*i)->v > v) {
13586 dout(20) << __func__ << ": " << **i
13587 << " (*i)->v past v" << dendl;
13588 break;
13589 }
13590 if (!(*i)->all_committed) {
13591 dout(20) << __func__ << ": " << **i
13592 << " not committed, returning false"
13593 << dendl;
13594 return false;
13595 }
13596 }
13597 dout(20) << __func__ << ": returning true" << dendl;
13598 return true;
13599 }
13600
13601 bool PrimaryLogPG::already_ack(eversion_t v)
13602 {
13603 dout(20) << __func__ << ": " << v << dendl;
13604 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13605 !i.end();
13606 ++i) {
13607 // skip copy from temp object ops
13608 if ((*i)->v == eversion_t()) {
13609 dout(20) << __func__ << ": " << **i
13610 << " version is empty" << dendl;
13611 continue;
13612 }
13613 if ((*i)->v > v) {
13614 dout(20) << __func__ << ": " << **i
13615 << " (*i)->v past v" << dendl;
13616 break;
13617 }
13618 if (!(*i)->all_applied) {
13619 dout(20) << __func__ << ": " << **i
13620 << " not applied, returning false"
13621 << dendl;
13622 return false;
13623 }
13624 }
13625 dout(20) << __func__ << ": returning true" << dendl;
13626 return true;
13627 }
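// ---------------------------------------------------------------------
// [Editor's note] A hedged sketch of the scan shared by
// already_complete()/already_ack() above: the repop queue is ordered by
// version, so a version v is settled iff every queued op at or below v
// is done, and the scan can stop at the first op past v. `Repop` is a
// stand-in; v == 0 models the versionless temp-object copies the real
// loops skip.
//
//   #include <vector>
//
//   struct Repop { unsigned v; bool done; };
//
//   static bool settled(const std::vector<Repop> &queue, unsigned v)
//   {
//     for (const Repop &r : queue) {
//       if (r.v == 0)
//         continue;                       // skip versionless entries
//       if (r.v > v)
//         break;                          // everything <= v checked out
//       if (!r.done)
//         return false;
//     }
//     return true;
//   }
// ---------------------------------------------------------------------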
13628
13629
13630 // ==========================================================================================
13631 // SCRUB
13632
13633
13634 bool PrimaryLogPG::_range_available_for_scrub(
13635 const hobject_t &begin, const hobject_t &end)
13636 {
13637 pair<hobject_t, ObjectContextRef> next;
13638 next.second = object_contexts.lookup(begin);
13639 next.first = begin;
13640 bool more = true;
13641 while (more && next.first < end) {
13642 if (next.second && next.second->is_blocked()) {
13643 next.second->requeue_scrub_on_unblock = true;
13644 dout(10) << __func__ << ": scrub delayed, "
13645 << next.first << " is blocked"
13646 << dendl;
13647 return false;
13648 }
13649 more = object_contexts.get_next(next.first, &next);
13650 }
13651 return true;
13652 }
13653
13654 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13655 const vector<snapid_t>::reverse_iterator &curclone) {
13656 return snapset && curclone != snapset.get().clones.rend();
13657 }
13658
13659 void PrimaryLogPG::log_missing(unsigned missing,
13660 const boost::optional<hobject_t> &head,
13661 LogChannelRef clog,
13662 const spg_t &pgid,
13663 const char *func,
13664 const char *mode,
13665 bool allow_incomplete_clones)
13666 {
13667 assert(head);
13668 if (allow_incomplete_clones) {
13669 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13670 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13671 } else {
13672 clog->info() << mode << " " << pgid << " " << head.get()
13673 << " " << missing << " missing clone(s)";
13674 }
13675 }
13676
13677 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13678 const boost::optional<SnapSet> &snapset,
13679 LogChannelRef clog,
13680 const spg_t &pgid,
13681 const char *mode,
13682 bool allow_incomplete_clones,
13683 boost::optional<snapid_t> target,
13684 vector<snapid_t>::reverse_iterator *curclone,
13685 inconsistent_snapset_wrapper &e)
13686 {
13687 assert(head);
13688 assert(snapset);
13689 unsigned missing = 0;
13690
13691 // NOTE: clones are in descending order, hence the **curclone > target test here
13692 hobject_t next_clone(head.get());
13693 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13694 ++missing;
13695 // it is okay to be missing one or more clones in a cache tier.
13696 // skip higher-numbered clones in the list.
13697 if (!allow_incomplete_clones) {
13698 next_clone.snap = **curclone;
13699 clog->error() << mode << " " << pgid << " " << head.get()
13700 << " expected clone " << next_clone << " " << missing
13701 << " missing";
13702 ++scrubber.shallow_errors;
13703 e.set_clone_missing(next_clone.snap);
13704 }
13705 // Clones are descending
13706 ++(*curclone);
13707 }
13708 return missing;
13709 }
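// ---------------------------------------------------------------------
// [Editor's note] The descending clone walk of process_clones_to()
// above, as a hedged sketch: catching up to a target snap id means
// stepping past (and counting as missing) every expected clone id
// strictly greater than it; a negative target consumes all remaining
// clones, like boost::none does in the real code.
//
//   #include <vector>
//
//   static unsigned clones_missing_to(
//     std::vector<int>::const_reverse_iterator &cur,   // descending walk
//     std::vector<int>::const_reverse_iterator end,
//     int target)
//   {
//     unsigned missing = 0;
//     while (cur != end && (target < 0 || *cur > target)) {
//       ++missing;                        // expected clone not on disk
//       ++cur;
//     }
//     return missing;
//   }
// ---------------------------------------------------------------------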
13710
13711 /*
13712 * Validate consistency of the object info and snap sets.
13713 *
13714 * We are essentially comparing two lists. The main loop is over
13715 * scrubmap.objects, but each object is compared against multiple
13716 * snapset.clones; between clone lists we expect a head or snapdir.
13717 *
13718 * Example
13719 *
13720 * objects expected
13721 * ======= =======
13722 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13723 * obj2 head head/snapdir, head ok
13724 * [SnapSet clones 6 4 2 1]
13725 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13726 * obj2 snap 6 obj2 snap 6, match
13727 * obj2 snap 4 obj2 snap 4, match
13728 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13729 * [Snapset clones 3 1]
13730 * obj3 snap 3 obj3 snap 3 match
13731 * obj3 snap 1 obj3 snap 1 match
13732 * obj4 snapdir head/snapdir, snapdir ok
13733 * [Snapset clones 4]
13734 * EOL obj4 snap 4, (expected)
13735 */
13736 void PrimaryLogPG::scrub_snapshot_metadata(
13737 ScrubMap &scrubmap,
13738 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13739 {
13740 dout(10) << __func__ << dendl;
13741
13742 coll_t c(info.pgid);
13743 bool repair = state_test(PG_STATE_REPAIR);
13744 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13745 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13746 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13747
13748 /// snapsets to repair
13749 map<hobject_t,SnapSet> snapset_to_repair;
13750
13751 // traverse in reverse order.
13752 boost::optional<hobject_t> head;
13753 boost::optional<SnapSet> snapset; // If initialized, so will be head (above)
13754 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13755 unsigned missing = 0;
13756 inconsistent_snapset_wrapper soid_error, head_error;
13757
13758 bufferlist last_data;
13759
13760 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13761 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13762 const hobject_t& soid = p->first;
13763 soid_error = inconsistent_snapset_wrapper{soid};
13764 object_stat_sum_t stat;
13765 boost::optional<object_info_t> oi;
13766
13767 if (!soid.is_snapdir())
13768 stat.num_objects++;
13769
13770 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13771 stat.num_objects_hit_set_archive++;
13772
13773 if (soid.is_snap()) {
13774 // it's a clone
13775 stat.num_object_clones++;
13776 }
13777
13778 // basic checks.
13779 if (p->second.attrs.count(OI_ATTR) == 0) {
13780 oi = boost::none;
13781 osd->clog->error() << mode << " " << info.pgid << " " << soid
13782 << " no '" << OI_ATTR << "' attr";
13783 ++scrubber.shallow_errors;
13784 soid_error.set_oi_attr_missing();
13785 } else {
13786 bufferlist bv;
13787 bv.push_back(p->second.attrs[OI_ATTR]);
13788 try {
13789 oi = object_info_t(); // Initialize optional<> before decode into it
13790 oi.get().decode(bv);
13791 } catch (buffer::error& e) {
13792 oi = boost::none;
13793 osd->clog->error() << mode << " " << info.pgid << " " << soid
13794 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13795 ++scrubber.shallow_errors;
13796 soid_error.set_oi_attr_corrupted();
13797 soid_error.set_oi_attr_missing(); // Not available too
13798 }
13799 }
13800
13801 if (oi) {
13802 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13803 osd->clog->error() << mode << " " << info.pgid << " " << soid
13804 << " on disk size (" << p->second.size
13805 << ") does not match object info size ("
13806 << oi->size << ") adjusted for ondisk to ("
13807 << pgbackend->be_get_ondisk_size(oi->size)
13808 << ")";
13809 soid_error.set_size_mismatch();
13810 ++scrubber.shallow_errors;
13811 }
13812
13813 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13814
13815 // A clone's num_bytes will be added later, once we have the snapset
13816 if (!soid.is_snap()) {
13817 stat.num_bytes += oi->size;
13818 }
13819 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13820 stat.num_bytes_hit_set_archive += oi->size;
13821
13822 if (!soid.is_snapdir()) {
13823 if (oi->is_dirty())
13824 ++stat.num_objects_dirty;
13825 if (oi->is_whiteout())
13826 ++stat.num_whiteouts;
13827 if (oi->is_omap())
13828 ++stat.num_objects_omap;
13829 if (oi->is_cache_pinned())
13830 ++stat.num_objects_pinned;
13831 }
13832 } else {
13833 // pessimistic assumption that this object might contain a
13834 // legacy SnapSet
13835 stat.num_legacy_snapsets++;
13836 }
13837
13838 // Check for any problems while processing clones
13839 if (doing_clones(snapset, curclone)) {
13840 boost::optional<snapid_t> target;
13841 // Expecting an object with snap for current head
13842 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13843
13844 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13845 << soid << " while processing " << head.get() << dendl;
13846
13847 target = all_clones;
13848 } else {
13849 assert(soid.is_snap());
13850 target = soid.snap;
13851 }
13852
13853 // Log any clones we were expecting to be there up to target
13854 // This will set missing, but will be a no-op if soid.snap == **curclone.
13855 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13856 pool.info.allow_incomplete_clones(), target, &curclone,
13857 head_error);
13858 }
13859 bool expected;
13860 // Check doing_clones() again in case we ran process_clones_to()
13861 if (doing_clones(snapset, curclone)) {
13862 // A head/snapdir would have processed all clones above
13863 // or all greater than *curclone.
13864 assert(soid.is_snap() && *curclone <= soid.snap);
13865
13866 // After the processing above, soid.snap should match the expected *curclone
13867 expected = (*curclone == soid.snap);
13868 } else {
13869 // If we aren't doing clones any longer, then we're expecting a head/snapdir
13870 expected = soid.has_snapset();
13871 }
13872 if (!expected) {
13873 // If we couldn't read the head's snapset, just ignore clones
13874 if (head && !snapset) {
13875 osd->clog->error() << mode << " " << info.pgid << " " << soid
13876 << " clone ignored due to missing snapset";
13877 } else {
13878 osd->clog->error() << mode << " " << info.pgid << " " << soid
13879 << " is an unexpected clone";
13880 }
13881 ++scrubber.shallow_errors;
13882 soid_error.set_headless();
13883 scrubber.store->add_snap_error(pool.id, soid_error);
13884 if (head && soid.get_head() == head->get_head())
13885 head_error.set_clone(soid.snap);
13886 continue;
13887 }
13888
13889 // new snapset?
13890 if (soid.has_snapset()) {
13891
13892 if (missing) {
13893 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13894 pool.info.allow_incomplete_clones());
13895 }
13896
13897 // Save previous head error information
13898 if (head && head_error.errors)
13899 scrubber.store->add_snap_error(pool.id, head_error);
13900 // Set this as a new head object
13901 head = soid;
13902 missing = 0;
13903 head_error = soid_error;
13904
13905 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13906
13907 if (p->second.attrs.count(SS_ATTR) == 0) {
13908 osd->clog->error() << mode << " " << info.pgid << " " << soid
13909 << " no '" << SS_ATTR << "' attr";
13910 ++scrubber.shallow_errors;
13911 snapset = boost::none;
13912 head_error.set_ss_attr_missing();
13913 } else {
13914 bufferlist bl;
13915 bl.push_back(p->second.attrs[SS_ATTR]);
13916 bufferlist::iterator blp = bl.begin();
13917 try {
13918 snapset = SnapSet(); // Initialize optional<> before decoding into it
13919 ::decode(snapset.get(), blp);
13920 } catch (buffer::error& e) {
13921 snapset = boost::none;
13922 osd->clog->error() << mode << " " << info.pgid << " " << soid
13923 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13924 ++scrubber.shallow_errors;
13925 head_error.set_ss_attr_corrupted();
13926 }
13927 }
13928
13929 if (snapset) {
13930 // what will be next?
13931 curclone = snapset->clones.rbegin();
13932
13933 if (!snapset->clones.empty()) {
13934 dout(20) << " snapset " << snapset.get() << dendl;
13935 if (snapset->seq == 0) {
13936 osd->clog->error() << mode << " " << info.pgid << " " << soid
13937 << " snaps.seq not set";
13938 ++scrubber.shallow_errors;
13939 head_error.set_snapset_mismatch();
13940 }
13941 }
13942
13943 if (soid.is_head() && !snapset->head_exists) {
13944 osd->clog->error() << mode << " " << info.pgid << " " << soid
13945 << " snapset.head_exists=false, but head exists";
13946 ++scrubber.shallow_errors;
13947 head_error.set_head_mismatch();
13948 // Fix head_exists locally so is_legacy() returns correctly
13949 snapset->head_exists = true;
13950 }
13951 if (soid.is_snapdir() && snapset->head_exists) {
13952 osd->clog->error() << mode << " " << info.pgid << " " << soid
13953 << " snapset.head_exists=true, but snapdir exists";
13954 ++scrubber.shallow_errors;
13955 head_error.set_head_mismatch();
13956 // For symmetry fix this too, but probably doesn't matter
13957 snapset->head_exists = false;
13958 }
13959
13960 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13961 if (soid.is_snapdir()) {
13962 dout(10) << " will move snapset to head from " << soid << dendl;
13963 snapset_to_repair[soid.get_head()] = *snapset;
13964 } else if (snapset->is_legacy()) {
13965 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13966 << dendl;
13967 snapset_to_repair[soid.get_head()] = *snapset;
13968 }
13969 } else {
13970 stat.num_legacy_snapsets++;
13971 }
13972 } else {
13973 // pessimistic assumption that this object might contain a
13974 // legacy SnapSet
13975 stat.num_legacy_snapsets++;
13976 }
13977 } else {
13978 assert(soid.is_snap());
13979 assert(head);
13980 assert(snapset);
13981 assert(soid.snap == *curclone);
13982
13983 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13984
13985 if (snapset->clone_size.count(soid.snap) == 0) {
13986 osd->clog->error() << mode << " " << info.pgid << " " << soid
13987 << " is missing in clone_size";
13988 ++scrubber.shallow_errors;
13989 soid_error.set_size_mismatch();
13990 } else {
13991 if (oi && oi->size != snapset->clone_size[soid.snap]) {
13992 osd->clog->error() << mode << " " << info.pgid << " " << soid
13993 << " size " << oi->size << " != clone_size "
13994 << snapset->clone_size[*curclone];
13995 ++scrubber.shallow_errors;
13996 soid_error.set_size_mismatch();
13997 }
13998
13999 if (snapset->clone_overlap.count(soid.snap) == 0) {
14000 osd->clog->error() << mode << " " << info.pgid << " " << soid
14001 << " is missing in clone_overlap";
14002 ++scrubber.shallow_errors;
14003 soid_error.set_size_mismatch();
14004 } else {
14005 // This checking is based on get_clone_bytes(). The first 2 asserts
14006 // can't happen because we know we have a clone_size and
14007 // a clone_overlap. Now we check that the interval_set won't
14008 // cause the last assert.
14009 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14010 const interval_set<uint64_t> &overlap =
14011 snapset->clone_overlap.find(soid.snap)->second;
14012 bool bad_interval_set = false;
14013 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14014 i != overlap.end(); ++i) {
14015 if (size < i.get_len()) {
14016 bad_interval_set = true;
14017 break;
14018 }
14019 size -= i.get_len();
14020 }
14021
14022 if (bad_interval_set) {
14023 osd->clog->error() << mode << " " << info.pgid << " " << soid
14024 << " bad interval_set in clone_overlap";
14025 ++scrubber.shallow_errors;
14026 soid_error.set_size_mismatch();
14027 } else {
14028 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14029 }
14030 }
14031 }
14032
14033 // migrate legacy_snaps to snapset?
14034 auto p = snapset_to_repair.find(soid.get_head());
14035 if (p != snapset_to_repair.end()) {
14036 if (!oi || oi->legacy_snaps.empty()) {
14037 osd->clog->error() << mode << " " << info.pgid << " " << soid
14038 << " has no oi or legacy_snaps; cannot convert "
14039 << *snapset;
14040 ++scrubber.shallow_errors;
14041 } else {
14042 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
14043 << " to snapset " << p->second << dendl;
14044 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
14045 }
14046 }
14047
14048 // what's next?
14049 ++curclone;
14050 if (soid_error.errors)
14051 scrubber.store->add_snap_error(pool.id, soid_error);
14052 }
14053
14054 scrub_cstat.add(stat);
14055 }
14056
14057 if (doing_clones(snapset, curclone)) {
14058 dout(10) << __func__ << " " << mode << " " << info.pgid
14059 << " No more objects while processing " << head.get() << dendl;
14060
14061 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14062 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14063 head_error);
14064 }
14065 // There could be missing clones found by the test above, or even
14066 // from before we dropped out of the loop, for the last head.
14067 if (missing) {
14068 log_missing(missing, head, osd->clog, info.pgid, __func__,
14069 mode, pool.info.allow_incomplete_clones());
14070 }
14071 if (head && head_error.errors)
14072 scrubber.store->add_snap_error(pool.id, head_error);
14073
14074 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
14075 missing_digest.begin();
14076 p != missing_digest.end();
14077 ++p) {
14078 if (p->first.is_snapdir())
14079 continue;
14080 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14081 ObjectContextRef obc = get_object_context(p->first, false);
14082 if (!obc) {
14083 osd->clog->error() << info.pgid << " " << mode
14084 << " cannot get object context for object "
14085 << p->first;
14086 continue;
14087 } else if (obc->obs.oi.soid != p->first) {
14088 osd->clog->error() << info.pgid << " " << mode
14089 << " object " << p->first
14090 << " has a valid oi attr with a mismatched name, "
14091 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14092 continue;
14093 }
14094 OpContextUPtr ctx = simple_opc_create(obc);
14095 ctx->at_version = get_next_version();
14096 ctx->mtime = utime_t(); // do not update mtime
14097 ctx->new_obs.oi.set_data_digest(p->second.first);
14098 ctx->new_obs.oi.set_omap_digest(p->second.second);
14099 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14100
14101 ctx->register_on_success(
14102 [this]() {
14103 dout(20) << "updating scrub digest" << dendl;
14104 if (--scrubber.num_digest_updates_pending == 0) {
14105 requeue_scrub();
14106 }
14107 });
14108
14109 simple_opc_submit(std::move(ctx));
14110 ++scrubber.num_digest_updates_pending;
14111 }
14112 for (auto& p : snapset_to_repair) {
14113 // cache pools may not have the clones, which means we won't know
14114 // what snaps they have. fake out the clone_snaps entries anyway (with
14115 // blank snap lists).
14116 p.second.head_exists = true;
14117 if (pool.info.allow_incomplete_clones()) {
14118 for (auto s : p.second.clones) {
14119 if (p.second.clone_snaps.count(s) == 0) {
14120 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14121 << s << dendl;
14122 p.second.clone_snaps[s];
14123 }
14124 }
14125 }
14126 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14127 p.second.is_legacy()) {
14128 // this happens if we encounter other errors above, like a missing
14129 // or extra clone.
14130 dout(10) << __func__ << " not writing snapset to " << p.first
14131 << " snapset " << p.second << " clones " << p.second.clones
14132 << "; didn't convert fully" << dendl;
14133 scrub_cstat.sum.num_legacy_snapsets++;
14134 continue;
14135 }
14136 dout(10) << __func__ << " writing snapset to " << p.first
14137 << " " << p.second << dendl;
14138 ObjectContextRef obc = get_object_context(p.first, true);
14139 if (!obc) {
14140 osd->clog->error() << info.pgid << " " << mode
14141 << " cannot get object context for object "
14142 << p.first;
14143 continue;
14144 } else if (obc->obs.oi.soid != p.first) {
14145 osd->clog->error() << info.pgid << " " << mode
14146 << " object " << p.first
14147 << " has a valid oi attr with a mismatched name, "
14148 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14149 continue;
14150 }
14151 ObjectContextRef snapset_obc;
14152 if (!obc->obs.exists) {
14153 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14154 if (!snapset_obc) {
14155 osd->clog->error() << info.pgid << " " << mode
14156 << " cannot get object context for "
14157 << p.first.get_snapdir();
14158 continue;
14159 }
14160 }
14161 OpContextUPtr ctx = simple_opc_create(obc);
14162 PGTransaction *t = ctx->op_t.get();
14163 ctx->snapset_obc = snapset_obc;
14164 ctx->at_version = get_next_version();
14165 ctx->mtime = utime_t(); // do not update mtime
14166 ctx->new_snapset = p.second;
14167 if (!ctx->new_obs.exists) {
14168 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14169 ctx->new_obs.exists = true;
14170 ctx->new_snapset.head_exists = true;
14171 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14172 ++ctx->delta_stats.num_whiteouts;
14173 ++ctx->delta_stats.num_objects;
14174 t->create(p.first);
14175 if (p.first < scrubber.start) {
14176 dout(20) << __func__ << " kludging around update outside of scrub range"
14177 << dendl;
14178 } else {
14179 scrub_cstat.add(ctx->delta_stats);
14180 }
14181 }
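// A "whiteout" head created above exists only to carry the SnapSet for its
// clones: FLAG_WHITEOUT effectively hides the object from client reads,
// while t->create() gives the snapset attr somewhere to live.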
14182 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14183 assert(!ctx->new_snapset.is_legacy());
14184 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14185 ctx->register_on_success(
14186 [this]() {
14187 dout(20) << "updating snapset" << dendl;
14188 if (--scrubber.num_digest_updates_pending == 0) {
14189 requeue_scrub();
14190 }
14191 });
14192
14193 simple_opc_submit(std::move(ctx));
14194 ++scrubber.num_digest_updates_pending;
14195 }
14196
14197 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14198 }
14199
14200 void PrimaryLogPG::_scrub_clear_state()
14201 {
14202 scrub_cstat = object_stat_collection_t();
14203 }
14204
14205 void PrimaryLogPG::_scrub_finish()
14206 {
14207 bool repair = state_test(PG_STATE_REPAIR);
14208 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14209 const char *mode = (repair ? "repair" : (deep_scrub ? "deep-scrub" : "scrub"));
14210
14211 if (info.stats.stats_invalid) {
14212 info.stats.stats = scrub_cstat;
14213 info.stats.stats_invalid = false;
14214
14215 if (agent_state)
14216 agent_choose_mode();
14217 }
14218
14219 dout(10) << mode << " got "
14220 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14221 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14222 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14223 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14224 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14225 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14226 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14227 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14228 << dendl;
14229
14230 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14231 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14232 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14233 !info.stats.dirty_stats_invalid) ||
14234 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14235 !info.stats.omap_stats_invalid) ||
14236 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14237 !info.stats.pin_stats_invalid) ||
14238 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14239 !info.stats.hitset_stats_invalid) ||
14240 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14241 !info.stats.hitset_bytes_stats_invalid) ||
14242 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14243 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14244 osd->clog->error() << info.pgid << " " << mode
14245 << " stat mismatch, got "
14246 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14247 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14248 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14249 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14250 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14251 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14252 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14253 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14254 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14255 ++scrubber.shallow_errors;
14256
14257 if (repair) {
14258 ++scrubber.fixed;
14259 info.stats.stats = scrub_cstat;
14260 info.stats.dirty_stats_invalid = false;
14261 info.stats.omap_stats_invalid = false;
14262 info.stats.hitset_stats_invalid = false;
14263 info.stats.hitset_bytes_stats_invalid = false;
14264 publish_stats_to_osd();
14265 share_pg_info();
14266 }
14267 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14268 info.stats.stats.sum.num_legacy_snapsets) {
14269 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14270 << " from " << info.stats.stats.sum.num_legacy_snapsets
14271 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14272 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14273 publish_stats_to_osd();
14274 share_pg_info();
14275 }
14276 // Clear object context cache to get repair information
14277 if (repair)
14278 object_contexts.clear();
14279 }
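// The mismatch test in _scrub_finish() above only treats a per-category
// delta as an error when the corresponding *_stats_invalid flag is clear;
// categories whose recorded stats are already known to be stale are
// skipped.  A sketch of that guard, with hypothetical names:
//
//   bool stat_mismatch(int64_t scrubbed, int64_t recorded, bool invalid) {
//     return scrubbed != recorded && !invalid;  // stale stats never count
//   }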
14280
14281 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14282 {
14283 return osd->check_osdmap_full(missing_on);
14284 }
14285
14286 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14287 {
14288 // Only supports replicated pools
14289 assert(!pool.info.require_rollback());
14290 assert(is_primary());
14291
14292 dout(10) << __func__ << " " << soid
14293 << " peers osd.{" << actingbackfill << "}" << dendl;
14294
14295 if (!is_clean()) {
14296 block_for_clean(soid, op);
14297 return -EAGAIN;
14298 }
14299
14300 assert(!pg_log.get_missing().is_missing(soid));
14301 bufferlist bv;
14302 object_info_t oi;
14303 eversion_t v;
14304 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14305 if (r < 0) {
14306 // Getting the attr failed; leave v unset and try to repair without a version
14307 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14308 << soid << " error=" << r << dendl;
14309 } else try {
14310 bufferlist::iterator bliter = bv.begin();
14311 ::decode(oi, bliter);
14312 v = oi.version;
14313 } catch (...) {
14314 // Leave v as default constructed. This will fail when sent to older OSDs, but
14315 // not much worse than failing here.
14316 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14317 }
14318
14319 missing_loc.add_missing(soid, v, eversion_t());
14320 if (primary_error(soid, v)) {
14321 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14322 // XXX: If we knew that no down osd could possibly have this
14323 // object, it would be nice if we could return EIO here.
14324 // If a "never fail" flag were available, rbd could use it
14325 // to NOT return EIO until the object is marked lost.
14326
14327 // Drop through to save this op in case an osd comes up with the object.
14328 }
14329
14330 // Restart the op after object becomes readable again
14331 waiting_for_unreadable_object[soid].push_back(op);
14332 op->mark_delayed("waiting for missing object");
14333
14334 if (!eio_errors_to_process) {
14335 eio_errors_to_process = true;
14336 assert(is_clean());
14337 queue_peering_event(
14338 CephPeeringEvtRef(
14339 std::make_shared<CephPeeringEvt>(
14340 get_osdmap()->get_epoch(),
14341 get_osdmap()->get_epoch(),
14342 DoRecovery())));
14343 } else {
14344 // A prior error must have already cleared the clean state and queued
14345 // recovery, or a map change has triggered re-peering.
14346 // Deliberately not inlining the recovery here via maybe_kick_recovery(soid).
14347 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
14348 }
14349
14350 return -EAGAIN;
14351 }
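// rep_repair_primary_object() always returns -EAGAIN: the op is parked on
// waiting_for_unreadable_object and replayed once the recovery queued via
// DoRecovery above (or already in flight) pulls a good copy from a peer.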
14352
14353 /*---SnapTrimmer Logging---*/
14354 #undef dout_prefix
14355 #define dout_prefix *_dout << pg->gen_prefix()
14356
14357 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14358 {
14359 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14360 }
14361
14362 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14363 {
14364 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14365 }
14366
14367 /*---SnapTrimmer states---*/
14368 #undef dout_prefix
14369 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14370 << "SnapTrimmer state<" << get_state_name() << ">: ")
14371
14372 /* NotTrimming */
14373 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14374 : my_base(ctx),
14375 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14376 {
14377 context< SnapTrimmer >().log_enter(state_name);
14378 }
14379
14380 void PrimaryLogPG::NotTrimming::exit()
14381 {
14382 context< SnapTrimmer >().log_exit(state_name, enter_time);
14383 }
14384
14385 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14386 {
14387 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14388 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14389
14390 if (!(pg->is_primary() && pg->is_active())) {
14391 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14392 return discard_event();
14393 }
14394 if (!pg->is_clean() ||
14395 pg->snap_trimq.empty()) {
14396 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14397 return discard_event();
14398 }
14399 if (pg->scrubber.active) {
14400 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14401 return transit< WaitScrub >();
14402 } else {
14403 return transit< Trimming >();
14404 }
14405 }
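// The reactions above follow the usual boost::statechart idioms:
// discard_event() consumes the event in the current state, while
// transit<T>() exits this state and enters T.  A minimal self-contained
// sketch of the same pattern (Machine/Idle/Busy/Go are hypothetical
// illustration types, not Ceph ones):
//
//   namespace sc = boost::statechart;
//   struct Go : sc::event<Go> {};
//   struct Idle;  // initial state, defined below
//   struct Machine : sc::state_machine<Machine, Idle> {};
//   struct Busy : sc::simple_state<Busy, Machine> {};
//   struct Idle : sc::simple_state<Idle, Machine> {
//     typedef sc::custom_reaction<Go> reactions;
//     sc::result react(const Go&) { return transit<Busy>(); }
//   };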
14406
14407 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14408 {
14409 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14410 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14411
14412 pending = nullptr;
14413 if (!context< SnapTrimmer >().can_trim()) {
14414 post_event(KickTrim());
14415 return transit< NotTrimming >();
14416 }
14417
14418 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14419 ldout(pg->cct, 10) << "NotTrimming: trimming "
14420 << pg->snap_trimq.range_start()
14421 << dendl;
14422 return transit< AwaitAsyncWork >();
14423 }
14424
14425 /* AwaitAsyncWork */
14426 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14427 : my_base(ctx),
14428 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14429 {
14430 auto *pg = context< SnapTrimmer >().pg;
14431 context< SnapTrimmer >().log_enter(state_name);
14432 pg->osd->queue_for_snap_trim(pg);
14433 pg->state_set(PG_STATE_SNAPTRIM);
14434 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14435 pg->publish_stats_to_osd();
14436 }
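// Entering AwaitAsyncWork queues the PG on the OSD's snap-trim work queue;
// the DoSnapWork event handled below is expected to be delivered when that
// queue gets around to this PG.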
14437
14438 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14439 {
14440 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14441 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14442 auto &in_flight = context<Trimming>().in_flight;
14443 assert(in_flight.empty());
14444
14445 assert(pg->is_primary() && pg->is_active());
14446 if (!context< SnapTrimmer >().can_trim()) {
14447 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14448 post_event(KickTrim());
14449 return transit< NotTrimming >();
14450 }
14451
14452 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14453
14454 vector<hobject_t> to_trim;
14455 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14456 to_trim.reserve(max);
14457 int r = pg->snap_mapper.get_next_objects_to_trim(
14458 snap_to_trim,
14459 max,
14460 &to_trim);
14461 if (r != 0 && r != -ENOENT) {
14462 lderr(pg->cct) << "get_next_objects_to_trim returned "
14463 << cpp_strerror(r) << dendl;
14464 assert(0 == "get_next_objects_to_trim returned an invalid code");
14465 } else if (r == -ENOENT) {
14466 // Done!
14467 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14468
14469 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14470 << " to purged_snaps"
14471 << dendl;
14472 pg->info.purged_snaps.insert(snap_to_trim);
14473 pg->snap_trimq.erase(snap_to_trim);
14474 ldout(pg->cct, 10) << "purged_snaps now "
14475 << pg->info.purged_snaps << ", snap_trimq now "
14476 << pg->snap_trimq << dendl;
14477
14478 ObjectStore::Transaction t;
14479 pg->dirty_big_info = true;
14480 pg->write_if_dirty(t);
14481 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14482 assert(tr == 0);
14483
14484 pg->share_pg_info();
14485 post_event(KickTrim());
14486 return transit< NotTrimming >();
14487 }
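// -ENOENT from the snap mapper means no objects remain for this snap: it
// moves from snap_trimq to purged_snaps, the updated pg info is persisted
// (dirty_big_info) and shared with peers, and KickTrim starts the cycle
// over for the next snap in the queue.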
14488 assert(!to_trim.empty());
14489
14490 for (auto &&object: to_trim) {
14491 // Get next
14492 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14493 OpContextUPtr ctx;
14494 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14495 if (error) {
14496 if (error == -ENOLCK) {
14497 ldout(pg->cct, 10) << "could not get write lock on obj "
14498 << object << dendl;
14499 } else {
14500 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14501 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14502 }
14503 if (!in_flight.empty()) {
14504 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14505 return transit< WaitRepops >();
14506 }
14507 if (error == -ENOLCK) {
14508 ldout(pg->cct, 10) << "waiting for it to clear"
14509 << dendl;
14510 return transit< WaitRWLock >();
14511 } else {
14512 return transit< NotTrimming >();
14513 }
14514 }
14515
14516 in_flight.insert(object);
14517 ctx->register_on_success(
14518 [pg, object, &in_flight]() {
14519 assert(in_flight.find(object) != in_flight.end());
14520 in_flight.erase(object);
14521 if (in_flight.empty()) {
14522 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14523 pg->snap_trimmer_machine.process_event(Reset());
14524 } else {
14525 pg->snap_trimmer_machine.process_event(RepopsComplete());
14526 }
14527 }
14528 });
14529
14530 pg->simple_opc_submit(std::move(ctx));
14531 }
14532
14533 return transit< WaitRepops >();
14534 }
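// Error ladder in the trim loop above: -ENOLCK with nothing in flight
// waits for the object's rwlock to clear (WaitRWLock); any error with
// repops already in flight lets those drain first (WaitRepops); other
// errors fall back to NotTrimming, and PG_STATE_SNAPTRIM_ERROR makes the
// final completion callback post Reset() instead of RepopsComplete().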
14535
14536 void PrimaryLogPG::setattr_maybe_cache(
14537 ObjectContextRef obc,
14538 OpContext *op,
14539 PGTransaction *t,
14540 const string &key,
14541 bufferlist &val)
14542 {
14543 t->setattr(obc->obs.oi.soid, key, val);
14544 }
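// Despite the _maybe_cache suffix, the attr setters here currently write
// straight through to the PGTransaction; only the getters further below
// consult obc->attr_cache, which is maintained elsewhere for
// erasure-coded (require_rollback) pools.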
14545
14546 void PrimaryLogPG::setattrs_maybe_cache(
14547 ObjectContextRef obc,
14548 OpContext *op,
14549 PGTransaction *t,
14550 map<string, bufferlist> &attrs)
14551 {
14552 t->setattrs(obc->obs.oi.soid, attrs);
14553 }
14554
14555 void PrimaryLogPG::rmattr_maybe_cache(
14556 ObjectContextRef obc,
14557 OpContext *op,
14558 PGTransaction *t,
14559 const string &key)
14560 {
14561 t->rmattr(obc->obs.oi.soid, key);
14562 }
14563
14564 int PrimaryLogPG::getattr_maybe_cache(
14565 ObjectContextRef obc,
14566 const string &key,
14567 bufferlist *val)
14568 {
14569 if (pool.info.require_rollback()) {
14570 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14571 if (i != obc->attr_cache.end()) {
14572 if (val)
14573 *val = i->second;
14574 return 0;
14575 } else {
14576 return -ENODATA;
14577 }
14578 }
14579 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14580 }
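// For erasure-coded pools (require_rollback() == true) a single attr is
// answered purely from obc->attr_cache, with a miss reported as -ENODATA;
// replicated pools read through the PG backend instead.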
14581
14582 int PrimaryLogPG::getattrs_maybe_cache(
14583 ObjectContextRef obc,
14584 map<string, bufferlist> *out)
14585 {
14586 int r = 0;
14587 assert(out);
14588 if (pool.info.require_rollback()) {
14589 *out = obc->attr_cache;
14590 } else {
14591 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14592 }
14593 map<string, bufferlist> tmp;
14594 for (map<string, bufferlist>::iterator i = out->begin();
14595 i != out->end();
14596 ++i) {
14597 if (i->first.size() > 1 && i->first[0] == '_')
14598 tmp[i->first.substr(1)].claim(i->second);
14599 }
14600 tmp.swap(*out);
14601 return r;
14602 }
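// The '_' filter above surfaces a stored xattr key "_foo" to the caller
// as "foo" and drops non-prefixed (internal) keys entirely.  Equivalent
// standalone filtering over a plain std::map, for illustration only:
//
//   std::map<std::string, std::string> raw = {{"_foo", "a"}, {"snapset", "b"}};
//   std::map<std::string, std::string> user;
//   for (const auto& kv : raw)
//     if (kv.first.size() > 1 && kv.first[0] == '_')
//       user[kv.first.substr(1)] = kv.second;  // user == {{"foo", "a"}}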
14603
14604 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14605 return osd->check_failsafe_full(ss);
14606 }
14607
14608 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14609 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14610
14611 #ifdef PG_DEBUG_REFS
14612 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14613 void put_with_id(PrimaryLogPG *pg, uint64_t id) { pg->put_with_id(id); }
14614 #endif
14615
14616 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14617 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }