1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include "boost/tuple/tuple.hpp"
19 #include "boost/intrusive_ptr.hpp"
21 #include "PrimaryLogPG.h"
23 #include "OpRequest.h"
24 #include "ScrubStore.h"
26 #include "objclass/objclass.h"
28 #include "common/errno.h"
29 #include "common/scrub_types.h"
30 #include "common/perf_counters.h"
32 #include "messages/MOSDOp.h"
33 #include "messages/MOSDBackoff.h"
34 #include "messages/MOSDSubOp.h"
35 #include "messages/MOSDSubOpReply.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDRepScrub.h"
39 #include "messages/MOSDPGBackfill.h"
40 #include "messages/MOSDPGBackfillRemove.h"
41 #include "messages/MOSDPGUpdateLogMissing.h"
42 #include "messages/MOSDPGUpdateLogMissingReply.h"
43 #include "messages/MCommandReply.h"
44 #include "messages/MOSDScrubReserve.h"
45 #include "mds/inode_backtrace.h" // Ugh
46 #include "common/EventTrace.h"
48 #include "common/config.h"
49 #include "include/compat.h"
50 #include "mon/MonClient.h"
51 #include "osdc/Objecter.h"
52 #include "json_spirit/json_spirit_value.h"
53 #include "json_spirit/json_spirit_reader.h"
54 #include "include/assert.h" // json_spirit clobbers it
55 #include "include/rados/rados_types.hpp"
58 #include "tracing/osd.h"
60 #define tracepoint(...)
63 #define dout_context cct
64 #define dout_subsys ceph_subsys_osd
65 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
67 #define dout_prefix _prefix(_dout, this)
// Logging prefix helper referenced by the dout_prefix macro above: emits
// the PG's gen_prefix() ahead of every debug line.
// NOTE(review): the template header ("template <typename T>", orig line 68)
// and the closing brace are not visible in this chunk of the file.
69 static ostream
& _prefix(std::ostream
*_dout
, T
*pg
) {
70 return *_dout
<< pg
->gen_prefix();
79 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG
, replicatedpg
, osd
);
// PGLSFilter lifecycle: the base constructor leaves cct null until a
// concrete filter (e.g. PGLSParentFilter below) fills it in.
// NOTE(review): the constructor and destructor bodies (orig lines 82-88)
// are not visible in this chunk.
81 PGLSFilter::PGLSFilter() : cct(nullptr)
85 PGLSFilter::~PGLSFilter()
89 struct PrimaryLogPG::C_OSD_OnApplied
: Context
{
97 : pg(pg
), epoch(epoch
), v(v
) {}
98 void finish(int) override
{
100 if (!pg
->pg_has_reset_since(epoch
))
107 * The CopyCallback class defines an interface for completions to the
108 * copy_start code. Users of the copy infrastructure must implement
109 * one and give an instance of the class to start_copy.
111 * The implementer is responsible for making sure that the CopyCallback
112 * can associate itself with the correct copy operation.
114 class PrimaryLogPG::CopyCallback
: public GenContext
<CopyCallbackResults
> {
118 * results.get<0>() is the return code: 0 for success; -ECANCELED if
119 * the operation was cancelled by the local OSD; -errno for other issues.
120 * results.get<1>() is a pointer to a CopyResults object, which you are
121 * responsible for deleting.
123 void finish(CopyCallbackResults results_
) override
= 0;
126 /// Provide the final size of the copied object to the CopyCallback
127 ~CopyCallback() override
{}
130 template <typename T
>
131 class PrimaryLogPG::BlessedGenContext
: public GenContext
<T
> {
133 unique_ptr
<GenContext
<T
>> c
;
136 BlessedGenContext(PrimaryLogPG
*pg
, GenContext
<T
> *c
, epoch_t e
)
137 : pg(pg
), c(c
), e(e
) {}
138 void finish(T t
) override
{
140 if (pg
->pg_has_reset_since(e
))
143 c
.release()->complete(t
);
148 GenContext
<ThreadPool::TPHandle
&> *PrimaryLogPG::bless_gencontext(
149 GenContext
<ThreadPool::TPHandle
&> *c
) {
150 return new BlessedGenContext
<ThreadPool::TPHandle
&>(
151 this, c
, get_osdmap()->get_epoch());
// Wrapper used by bless_context(): runs the wrapped Context only if the
// PG has not been reset since epoch e. Ownership of the callback is held
// in a unique_ptr and release()d at completion time.
154 class PrimaryLogPG::BlessedContext
: public Context
{
156 unique_ptr
<Context
> c
;
159 BlessedContext(PrimaryLogPG
*pg
, Context
*c
, epoch_t e
)
160 : pg(pg
), c(c
), e(e
) {}
161 void finish(int r
) override
{
// NOTE(review): the lines between the reset check and the completion
// call (orig 162, 164-165 — presumably the PG lock scope and early
// return) are not visible in this chunk.
163 if (pg
->pg_has_reset_since(e
))
166 c
.release()->complete(r
);
172 Context
*PrimaryLogPG::bless_context(Context
*c
) {
173 return new BlessedContext(this, c
, get_osdmap()->get_epoch());
// Context that invokes the PG's object_context_destructor_callback for
// obc when completed.
// NOTE(review): the member declarations and the constructor initializer
// list (orig lines 177-181) are not visible in this chunk.
176 class PrimaryLogPG::C_PG_ObjectContext
: public Context
{
180 C_PG_ObjectContext(PrimaryLogPG
*p
, ObjectContext
*o
) :
182 void finish(int r
) override
{
183 pg
->object_context_destructor_callback(obc
);
// Context that drops the ondisk write locks on up to three object
// contexts once a transaction has been applied (registered via
// register_on_applied_sync in on_local_recover below).
187 class PrimaryLogPG::C_OSD_OndiskWriteUnlock
: public Context
{
188 ObjectContextRef obc
, obc2
, obc3
;
// obc is mandatory; obc2/obc3 default to empty refs.
// NOTE(review): the first constructor parameter line (orig 191) is not
// visible in this chunk.
190 C_OSD_OndiskWriteUnlock(
192 ObjectContextRef o2
= ObjectContextRef(),
193 ObjectContextRef o3
= ObjectContextRef()) : obc(o
), obc2(o2
), obc3(o3
) {}
194 void finish(int r
) override
{
195 obc
->ondisk_write_unlock();
// NOTE(review): the guards for the optional refs (orig lines 196 and
// 198, presumably "if (obc2)" / "if (obc3)") are not visible here —
// confirm before assuming the unlocks below run unconditionally.
197 obc2
->ondisk_write_unlock();
199 obc3
->ondisk_write_unlock();
// Completion for OpContext::start_async_reads(): records the read result
// on the op context and resumes its processing via finish_read().
// NOTE(review): the "pg" member declaration and the first constructor
// parameter line (orig 204, 206-207) are not visible in this chunk.
203 struct OnReadComplete
: public Context
{
205 PrimaryLogPG::OpContext
*opcontext
;
208 PrimaryLogPG::OpContext
*ctx
) : pg(pg
), opcontext(ctx
) {}
209 void finish(int r
) override
{
// stash the aggregate read return code, then let the op context resume
211 opcontext
->async_read_result
= r
;
212 opcontext
->finish_read(pg
);
214 ~OnReadComplete() override
{}
// Context that notifies the PG that a recovered object's transaction has
// been applied locally (primary side; see on_local_recover below).
217 class PrimaryLogPG::C_OSD_AppliedRecoveredObject
: public Context
{
219 ObjectContextRef obc
;
// NOTE(review): the "pg" member and the constructor initializer list
// (orig 218, 221-222) are not fully visible in this chunk.
221 C_OSD_AppliedRecoveredObject(PrimaryLogPG
*p
, ObjectContextRef o
) :
223 void finish(int r
) override
{
224 pg
->_applied_recovered_object(obc
);
// Context that tells the PG a pushed object has committed to disk,
// carrying the epoch and last_complete version captured at construction.
// NOTE(review): the "pg"/"epoch" member declarations (orig 229-230) are
// not visible in this chunk.
228 class PrimaryLogPG::C_OSD_CommittedPushedObject
: public Context
{
231 eversion_t last_complete
;
233 C_OSD_CommittedPushedObject(
234 PrimaryLogPG
*p
, epoch_t epoch
, eversion_t lc
) :
235 pg(p
), epoch(epoch
), last_complete(lc
) {
237 void finish(int r
) override
{
238 pg
->_committed_pushed_object(epoch
, last_complete
);
// Replica-side counterpart of C_OSD_AppliedRecoveredObject: just pings
// the PG (no obc needed) when the recovered object's transaction applies.
242 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica
: public Context
{
245 explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG
*p
) :
247 void finish(int r
) override
{
248 pg
->_applied_recovered_object_replica();
253 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG
*pg
)
256 list
<pair
<boost::tuple
<uint64_t, uint64_t, unsigned>,
257 pair
<bufferlist
*, Context
*> > > in
;
258 in
.swap(pending_async_reads
);
259 pg
->pgbackend
->objects_read_async(
262 new OnReadComplete(pg
, this), pg
->get_pool().fast_read
);
// Called by OnReadComplete when one async read finishes. Once all reads
// are in, pops this context off the PG's in-flight queue — it must be at
// the front, since reads complete in submission order — and completes
// the op with the stored result.
264 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG
*pg
)
266 assert(inflightreads
> 0);
// NOTE(review): the decrement of inflightreads (orig 267) is not visible
// in this chunk.
268 if (async_reads_complete()) {
269 assert(pg
->in_progress_async_reads
.size());
270 assert(pg
->in_progress_async_reads
.front().second
== this);
271 pg
->in_progress_async_reads
.pop_front();
272 pg
->complete_read_ctx(async_read_result
, this);
276 class CopyFromCallback
: public PrimaryLogPG::CopyCallback
{
278 PrimaryLogPG::CopyResults
*results
;
280 PrimaryLogPG::OpContext
*ctx
;
281 explicit CopyFromCallback(PrimaryLogPG::OpContext
*ctx_
)
285 ~CopyFromCallback() override
{}
287 void finish(PrimaryLogPG::CopyCallbackResults results_
) override
{
288 results
= results_
.get
<1>();
289 int r
= results_
.get
<0>();
292 // for finish_copyfrom
293 ctx
->user_at_version
= results
->user_version
;
296 ctx
->pg
->execute_ctx(ctx
);
300 if (r
!= -ECANCELED
) { // on cancel just toss it out; client resends
302 ctx
->pg
->osd
->reply_op_error(ctx
->op
, r
);
303 } else if (results
->should_requeue
) {
305 ctx
->pg
->requeue_op(ctx
->op
);
307 ctx
->pg
->close_op_ctx(ctx
);
311 bool is_temp_obj_used() {
312 return results
->started_temp_obj
;
314 uint64_t get_data_size() {
315 return results
->object_size
;
322 // ======================
323 // PGBackend::Listener
325 void PrimaryLogPG::on_local_recover(
326 const hobject_t
&hoid
,
327 const ObjectRecoveryInfo
&_recovery_info
,
328 ObjectContextRef obc
,
329 ObjectStore::Transaction
*t
332 dout(10) << __func__
<< ": " << hoid
<< dendl
;
334 ObjectRecoveryInfo
recovery_info(_recovery_info
);
335 clear_object_snap_mapping(t
, hoid
);
336 if (recovery_info
.soid
.is_snap()) {
337 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
339 dout(20) << " snapset " << recovery_info
.ss
340 << " legacy_snaps " << recovery_info
.oi
.legacy_snaps
<< dendl
;
341 if (recovery_info
.ss
.is_legacy() ||
342 recovery_info
.ss
.seq
== 0 /* jewel osd doesn't populate this */) {
343 assert(recovery_info
.oi
.legacy_snaps
.size());
344 snaps
.insert(recovery_info
.oi
.legacy_snaps
.begin(),
345 recovery_info
.oi
.legacy_snaps
.end());
347 auto p
= recovery_info
.ss
.clone_snaps
.find(hoid
.snap
);
348 assert(p
!= recovery_info
.ss
.clone_snaps
.end()); // hmm, should we warn?
349 snaps
.insert(p
->second
.begin(), p
->second
.end());
351 dout(20) << " snaps " << snaps
<< dendl
;
357 if (pg_log
.get_missing().is_missing(recovery_info
.soid
) &&
358 pg_log
.get_missing().get_items().find(recovery_info
.soid
)->second
.need
> recovery_info
.version
) {
359 assert(is_primary());
360 const pg_log_entry_t
*latest
= pg_log
.get_log().objects
.find(recovery_info
.soid
)->second
;
361 if (latest
->op
== pg_log_entry_t::LOST_REVERT
&&
362 latest
->reverting_to
== recovery_info
.version
) {
363 dout(10) << " got old revert version " << recovery_info
.version
364 << " for " << *latest
<< dendl
;
365 recovery_info
.version
= latest
->version
;
366 // update the attr to the revert event version
367 recovery_info
.oi
.prior_version
= recovery_info
.oi
.version
;
368 recovery_info
.oi
.version
= latest
->version
;
370 ::encode(recovery_info
.oi
, bl
,
371 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
372 assert(!pool
.info
.require_rollback());
373 t
->setattr(coll
, ghobject_t(recovery_info
.soid
), OI_ATTR
, bl
);
375 obc
->attr_cache
[OI_ATTR
] = bl
;
379 // keep track of active pushes for scrub
382 if (recovery_info
.version
> pg_log
.get_can_rollback_to()) {
383 /* This can only happen during a repair, and even then, it would
384 * be one heck of a race. If we are repairing the object, the
385 * write in question must be fully committed, so it's not valid
386 * to roll it back anyway (and we'll be rolled forward shortly
388 PGLogEntryHandler h
{this, t
};
389 pg_log
.roll_forward_to(recovery_info
.version
, &h
);
391 recover_got(recovery_info
.soid
, recovery_info
.version
);
395 obc
->obs
.exists
= true;
396 obc
->ondisk_write_lock();
398 bool got
= obc
->get_recovery_read();
401 assert(recovering
.count(obc
->obs
.oi
.soid
));
402 recovering
[obc
->obs
.oi
.soid
] = obc
;
403 obc
->obs
.oi
= recovery_info
.oi
; // may have been updated above
406 t
->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc
));
407 t
->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc
));
409 publish_stats_to_osd();
410 assert(missing_loc
.needs_recovery(hoid
));
411 missing_loc
.add_location(hoid
, pg_whoami
);
412 release_backoffs(hoid
);
413 if (!is_unreadable_object(hoid
)) {
414 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(hoid
);
415 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
416 dout(20) << " kicking unreadable waiters on " << hoid
<< dendl
;
417 requeue_ops(unreadable_object_entry
->second
);
418 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
422 t
->register_on_applied(
423 new C_OSD_AppliedRecoveredObjectReplica(this));
427 t
->register_on_commit(
428 new C_OSD_CommittedPushedObject(
430 get_osdmap()->get_epoch(),
431 info
.last_complete
));
438 void PrimaryLogPG::on_global_recover(
439 const hobject_t
&soid
,
440 const object_stat_sum_t
&stat_diff
)
442 info
.stats
.stats
.sum
.add(stat_diff
);
443 missing_loc
.recovered(soid
);
444 publish_stats_to_osd();
445 dout(10) << "pushed " << soid
<< " to all replicas" << dendl
;
446 map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.find(soid
);
447 assert(i
!= recovering
.end());
449 // recover missing won't have had an obc, but it gets filled in
450 // during on_local_recover
452 list
<OpRequestRef
> requeue_list
;
453 i
->second
->drop_recovery_read(&requeue_list
);
454 requeue_ops(requeue_list
);
456 backfills_in_flight
.erase(soid
);
459 finish_recovery_op(soid
);
460 release_backoffs(soid
);
461 auto degraded_object_entry
= waiting_for_degraded_object
.find(soid
);
462 if (degraded_object_entry
!= waiting_for_degraded_object
.end()) {
463 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
464 requeue_ops(degraded_object_entry
->second
);
465 waiting_for_degraded_object
.erase(degraded_object_entry
);
467 auto unreadable_object_entry
= waiting_for_unreadable_object
.find(soid
);
468 if (unreadable_object_entry
!= waiting_for_unreadable_object
.end()) {
469 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
470 requeue_ops(unreadable_object_entry
->second
);
471 waiting_for_unreadable_object
.erase(unreadable_object_entry
);
473 finish_degraded_object(soid
);
// PGBackend::Listener hook: a replica has recovered soid. Publish
// updated stats and record the recovered version in that peer's missing
// set.
// NOTE(review): the pg_shard_t peer parameter (orig line 477) is not
// visible in this chunk.
476 void PrimaryLogPG::on_peer_recover(
478 const hobject_t
&soid
,
479 const ObjectRecoveryInfo
&recovery_info
)
481 publish_stats_to_osd();
483 peer_missing
[peer
].got(soid
, recovery_info
.version
);
// PGBackend::Listener hook: a push to the peer is starting, so mark soid
// as not-held by that peer (revise_have with a null version) until the
// push completes.
// NOTE(review): the pg_shard_t peer parameter (orig line 487) is not
// visible in this chunk.
486 void PrimaryLogPG::begin_peer_recover(
488 const hobject_t soid
)
490 peer_missing
[peer
].revise_have(soid
, eversion_t());
493 void PrimaryLogPG::schedule_recovery_work(
494 GenContext
<ThreadPool::TPHandle
&> *c
)
496 osd
->recovery_gen_wq
.queue(c
);
499 void PrimaryLogPG::send_message_osd_cluster(
500 int peer
, Message
*m
, epoch_t from_epoch
)
502 osd
->send_message_osd_cluster(peer
, m
, from_epoch
);
505 void PrimaryLogPG::send_message_osd_cluster(
506 Message
*m
, Connection
*con
)
508 osd
->send_message_osd_cluster(m
, con
);
511 void PrimaryLogPG::send_message_osd_cluster(
512 Message
*m
, const ConnectionRef
& con
)
514 osd
->send_message_osd_cluster(m
, con
);
// Record a primary-side error on oid: log loudly (level 0), mark the
// object errored via primary_error(), drop it from backfills_in_flight,
// and re-register it as missing so recovery will retry it.
// NOTE(review): the eversion_t v parameter (orig line 519) is not
// visible in this chunk.
517 void PrimaryLogPG::on_primary_error(
518 const hobject_t
&oid
,
521 dout(0) << __func__
<< ": oid " << oid
<< " version " << v
<< dendl
;
523 primary_error(oid
, v
);
524 backfills_in_flight
.erase(oid
);
525 missing_loc
.add_missing(oid
, v
, eversion_t());
528 ConnectionRef
PrimaryLogPG::get_con_osd_cluster(
529 int peer
, epoch_t from_epoch
)
531 return osd
->get_con_osd_cluster(peer
, from_epoch
);
// Accessor for the OSD-wide perf counters.
// NOTE(review): the function body (orig lines 535-537, presumably
// "return osd->logger;") is not visible in this chunk — confirm against
// the full source.
534 PerfCounters
*PrimaryLogPG::get_logger()
540 // ====================
543 bool PrimaryLogPG::is_missing_object(const hobject_t
& soid
) const
545 return pg_log
.get_missing().get_items().count(soid
);
548 void PrimaryLogPG::maybe_kick_recovery(
549 const hobject_t
&soid
)
552 if (!missing_loc
.needs_recovery(soid
, &v
))
555 map
<hobject_t
, ObjectContextRef
>::const_iterator p
= recovering
.find(soid
);
556 if (p
!= recovering
.end()) {
557 dout(7) << "object " << soid
<< " v " << v
<< ", already recovering." << dendl
;
558 } else if (missing_loc
.is_unfound(soid
)) {
559 dout(7) << "object " << soid
<< " v " << v
<< ", is unfound." << dendl
;
561 dout(7) << "object " << soid
<< " v " << v
<< ", recovering." << dendl
;
562 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
563 if (is_missing_object(soid
)) {
564 recover_missing(soid
, v
, cct
->_conf
->osd_client_op_priority
, h
);
566 prep_object_replica_pushes(soid
, v
, h
);
568 pgbackend
->run_recovery_op(h
, cct
->_conf
->osd_client_op_priority
);
572 void PrimaryLogPG::wait_for_unreadable_object(
573 const hobject_t
& soid
, OpRequestRef op
)
575 assert(is_unreadable_object(soid
));
576 maybe_kick_recovery(soid
);
577 waiting_for_unreadable_object
[soid
].push_back(op
);
578 op
->mark_delayed("waiting for missing object");
581 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t
& soid
)
583 /* The conditions below may clear (on_local_recover, before we queue
584 * the transaction) before we actually requeue the degraded waiters
585 * in on_global_recover after the transaction completes.
587 if (waiting_for_degraded_object
.count(soid
))
589 if (pg_log
.get_missing().get_items().count(soid
))
591 assert(!actingbackfill
.empty());
592 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
593 i
!= actingbackfill
.end();
595 if (*i
== get_primary()) continue;
596 pg_shard_t peer
= *i
;
597 auto peer_missing_entry
= peer_missing
.find(peer
);
598 if (peer_missing_entry
!= peer_missing
.end() &&
599 peer_missing_entry
->second
.get_items().count(soid
))
602 // Object is degraded if after last_backfill AND
603 // we are backfilling it
604 if (is_backfill_targets(peer
) &&
605 peer_info
[peer
].last_backfill
<= soid
&&
606 last_backfill_started
>= soid
&&
607 backfills_in_flight
.count(soid
))
613 void PrimaryLogPG::wait_for_degraded_object(const hobject_t
& soid
, OpRequestRef op
)
615 assert(is_degraded_or_backfilling_object(soid
));
617 maybe_kick_recovery(soid
);
618 waiting_for_degraded_object
[soid
].push_back(op
);
619 op
->mark_delayed("waiting for degraded object");
622 void PrimaryLogPG::block_write_on_full_cache(
623 const hobject_t
& _oid
, OpRequestRef op
)
625 const hobject_t oid
= _oid
.get_head();
626 dout(20) << __func__
<< ": blocking object " << oid
627 << " on full cache" << dendl
;
628 objects_blocked_on_cache_full
.insert(oid
);
629 waiting_for_cache_not_full
.push_back(op
);
630 op
->mark_delayed("waiting for cache not full");
633 void PrimaryLogPG::block_for_clean(
634 const hobject_t
& oid
, OpRequestRef op
)
636 dout(20) << __func__
<< ": blocking object " << oid
637 << " on primary repair" << dendl
;
638 waiting_for_clean_to_primary_repair
.push_back(op
);
639 op
->mark_delayed("waiting for clean to repair");
642 void PrimaryLogPG::block_write_on_snap_rollback(
643 const hobject_t
& oid
, ObjectContextRef obc
, OpRequestRef op
)
645 dout(20) << __func__
<< ": blocking object " << oid
.get_head()
646 << " on snap promotion " << obc
->obs
.oi
.soid
<< dendl
;
647 // otherwise, we'd have blocked in do_op
648 assert(oid
.is_head());
649 assert(objects_blocked_on_snap_promotion
.count(oid
) == 0);
650 objects_blocked_on_snap_promotion
[oid
] = obc
;
651 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
654 void PrimaryLogPG::block_write_on_degraded_snap(
655 const hobject_t
& snap
, OpRequestRef op
)
657 dout(20) << __func__
<< ": blocking object " << snap
.get_head()
658 << " on degraded snap " << snap
<< dendl
;
659 // otherwise, we'd have blocked in do_op
660 assert(objects_blocked_on_degraded_snap
.count(snap
.get_head()) == 0);
661 objects_blocked_on_degraded_snap
[snap
.get_head()] = snap
.snap
;
662 wait_for_degraded_object(snap
, op
);
665 bool PrimaryLogPG::maybe_await_blocked_snapset(
666 const hobject_t
&hoid
,
669 ObjectContextRef obc
;
670 obc
= object_contexts
.lookup(hoid
.get_head());
672 if (obc
->is_blocked()) {
673 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
679 obc
= object_contexts
.lookup(hoid
.get_snapdir());
681 if (obc
->is_blocked()) {
682 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
691 void PrimaryLogPG::wait_for_blocked_object(const hobject_t
& soid
, OpRequestRef op
)
693 dout(10) << __func__
<< " " << soid
<< " " << op
<< dendl
;
694 waiting_for_blocked_object
[soid
].push_back(op
);
695 op
->mark_delayed("waiting for blocked object");
698 void PrimaryLogPG::maybe_force_recovery()
700 // no force if not in degraded/recovery/backfill stats
701 if (!is_degraded() &&
702 !state_test(PG_STATE_RECOVERING
|
703 PG_STATE_RECOVERY_WAIT
|
705 PG_STATE_BACKFILL_WAIT
|
706 PG_STATE_BACKFILL_TOOFULL
))
709 if (pg_log
.get_log().approx_size() <
710 cct
->_conf
->osd_max_pg_log_entries
*
711 cct
->_conf
->osd_force_recovery_pg_log_entries_factor
)
714 // find the oldest missing object
715 version_t min_version
= 0;
717 if (!pg_log
.get_missing().get_items().empty()) {
718 min_version
= pg_log
.get_missing().get_rmissing().begin()->first
;
719 soid
= pg_log
.get_missing().get_rmissing().begin()->second
;
721 assert(!actingbackfill
.empty());
722 for (set
<pg_shard_t
>::iterator it
= actingbackfill
.begin();
723 it
!= actingbackfill
.end();
725 if (*it
== get_primary()) continue;
726 pg_shard_t peer
= *it
;
727 if (peer_missing
.count(peer
) &&
728 !peer_missing
[peer
].get_items().empty() &&
729 min_version
> peer_missing
[peer
].get_rmissing().begin()->first
) {
730 min_version
= peer_missing
[peer
].get_rmissing().begin()->first
;
731 soid
= peer_missing
[peer
].get_rmissing().begin()->second
;
736 if (soid
!= hobject_t())
737 maybe_kick_recovery(soid
);
740 class PGLSPlainFilter
: public PGLSFilter
{
743 int init(bufferlist::iterator
¶ms
) override
746 ::decode(xattr
, params
);
747 ::decode(val
, params
);
748 } catch (buffer::error
&e
) {
754 ~PGLSPlainFilter() override
{}
755 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
756 bufferlist
& outdata
) override
;
759 class PGLSParentFilter
: public PGLSFilter
{
760 inodeno_t parent_ino
;
763 PGLSParentFilter(CephContext
* cct
) : cct(cct
) {
766 int init(bufferlist::iterator
¶ms
) override
769 ::decode(parent_ino
, params
);
770 } catch (buffer::error
&e
) {
773 generic_dout(0) << "parent_ino=" << parent_ino
<< dendl
;
777 ~PGLSParentFilter() override
{}
778 bool filter(const hobject_t
&obj
, bufferlist
& xattr_data
,
779 bufferlist
& outdata
) override
;
782 bool PGLSParentFilter::filter(const hobject_t
&obj
,
783 bufferlist
& xattr_data
, bufferlist
& outdata
)
785 bufferlist::iterator iter
= xattr_data
.begin();
786 inode_backtrace_t bt
;
788 generic_dout(0) << "PGLSParentFilter::filter" << dendl
;
792 vector
<inode_backpointer_t
>::iterator vi
;
793 for (vi
= bt
.ancestors
.begin(); vi
!= bt
.ancestors
.end(); ++vi
) {
794 generic_dout(0) << "vi->dirino=" << vi
->dirino
<< " parent_ino=" << parent_ino
<< dendl
;
795 if (vi
->dirino
== parent_ino
) {
796 ::encode(*vi
, outdata
);
804 bool PGLSPlainFilter::filter(const hobject_t
&obj
,
805 bufferlist
& xattr_data
, bufferlist
& outdata
)
807 if (val
.size() != xattr_data
.length())
810 if (memcmp(val
.c_str(), xattr_data
.c_str(), val
.size()))
816 bool PrimaryLogPG::pgls_filter(PGLSFilter
*filter
, hobject_t
& sobj
, bufferlist
& outdata
)
820 // If filter has expressed an interest in an xattr, load it.
821 if (!filter
->get_xattr().empty()) {
822 int ret
= pgbackend
->objects_get_attr(
826 dout(0) << "getattr (sobj=" << sobj
<< ", attr=" << filter
->get_xattr() << ") returned " << ret
<< dendl
;
828 if (ret
!= -ENODATA
|| filter
->reject_empty_xattr()) {
834 return filter
->filter(sobj
, bl
, outdata
);
837 int PrimaryLogPG::get_pgls_filter(bufferlist::iterator
& iter
, PGLSFilter
**pfilter
)
843 ::decode(type
, iter
);
845 catch (buffer::error
& e
) {
849 if (type
.compare("parent") == 0) {
850 filter
= new PGLSParentFilter(cct
);
851 } else if (type
.compare("plain") == 0) {
852 filter
= new PGLSPlainFilter();
854 std::size_t dot
= type
.find(".");
855 if (dot
== std::string::npos
|| dot
== 0 || dot
== type
.size() - 1) {
859 const std::string class_name
= type
.substr(0, dot
);
860 const std::string filter_name
= type
.substr(dot
+ 1);
861 ClassHandler::ClassData
*cls
= NULL
;
862 int r
= osd
->class_handler
->open_class(class_name
, &cls
);
864 derr
<< "Error opening class '" << class_name
<< "': "
865 << cpp_strerror(r
) << dendl
;
866 if (r
!= -EPERM
) // propogate permission error
873 ClassHandler::ClassFilter
*class_filter
= cls
->get_filter(filter_name
);
874 if (class_filter
== NULL
) {
875 derr
<< "Error finding filter '" << filter_name
<< "' in class "
876 << class_name
<< dendl
;
879 filter
= class_filter
->fn();
881 // Object classes are obliged to return us something, but let's
882 // give an error rather than asserting out.
883 derr
<< "Buggy class " << class_name
<< " failed to construct "
884 "filter " << filter_name
<< dendl
;
890 int r
= filter
->init(iter
);
892 derr
<< "Error initializing filter " << type
<< ": "
893 << cpp_strerror(r
) << dendl
;
897 // Successfully constructed and initialized, return it.
904 // ==========================================================
906 int PrimaryLogPG::do_command(
914 const pg_missing_t
&missing
= pg_log
.get_missing();
918 cmd_getval(cct
, cmdmap
, "format", format
);
919 boost::scoped_ptr
<Formatter
> f(Formatter::create(format
, "json-pretty", "json"));
922 cmd_getval(cct
, cmdmap
, "cmd", command
);
923 if (command
== "query") {
924 f
->open_object_section("pg");
925 f
->dump_string("state", pg_state_string(get_state()));
926 f
->dump_stream("snap_trimq") << snap_trimq
;
927 f
->dump_unsigned("epoch", get_osdmap()->get_epoch());
928 f
->open_array_section("up");
929 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
)
930 f
->dump_unsigned("osd", *p
);
932 f
->open_array_section("acting");
933 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
)
934 f
->dump_unsigned("osd", *p
);
936 if (!backfill_targets
.empty()) {
937 f
->open_array_section("backfill_targets");
938 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
939 p
!= backfill_targets
.end();
941 f
->dump_stream("shard") << *p
;
944 if (!actingbackfill
.empty()) {
945 f
->open_array_section("actingbackfill");
946 for (set
<pg_shard_t
>::iterator p
= actingbackfill
.begin();
947 p
!= actingbackfill
.end();
949 f
->dump_stream("shard") << *p
;
952 f
->open_object_section("info");
953 _update_calc_stats();
957 f
->open_array_section("peer_info");
958 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
959 p
!= peer_info
.end();
961 f
->open_object_section("info");
962 f
->dump_stream("peer") << p
->first
;
963 p
->second
.dump(f
.get());
968 f
->open_array_section("recovery_state");
969 handle_query_state(f
.get());
972 f
->open_object_section("agent_state");
974 agent_state
->dump(f
.get());
981 else if (command
== "mark_unfound_lost") {
983 cmd_getval(cct
, cmdmap
, "mulcmd", mulcmd
);
985 if (mulcmd
== "revert") {
986 if (pool
.info
.ec_pool()) {
987 ss
<< "mode must be 'delete' for ec pool";
990 mode
= pg_log_entry_t::LOST_REVERT
;
991 } else if (mulcmd
== "delete") {
992 mode
= pg_log_entry_t::LOST_DELETE
;
994 ss
<< "mode must be 'revert' or 'delete'; mark not yet implemented";
997 assert(mode
== pg_log_entry_t::LOST_REVERT
||
998 mode
== pg_log_entry_t::LOST_DELETE
);
1000 if (!is_primary()) {
1001 ss
<< "not primary";
1005 uint64_t unfound
= missing_loc
.num_unfound();
1007 ss
<< "pg has no unfound objects";
1008 return 0; // make command idempotent
1011 if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1012 ss
<< "pg has " << unfound
1013 << " unfound objects but we haven't probed all sources, not marking lost";
1017 mark_all_unfound_lost(mode
, con
, tid
);
1020 else if (command
== "list_missing") {
1023 if (cmd_getval(cct
, cmdmap
, "offset", offset_json
)) {
1024 json_spirit::Value v
;
1026 if (!json_spirit::read(offset_json
, v
))
1027 throw std::runtime_error("bad json");
1029 } catch (std::runtime_error
& e
) {
1030 ss
<< "error parsing offset: " << e
.what();
1034 f
->open_object_section("missing");
1036 f
->open_object_section("offset");
1037 offset
.dump(f
.get());
1040 f
->dump_int("num_missing", missing
.num_missing());
1041 f
->dump_int("num_unfound", get_num_unfound());
1042 const map
<hobject_t
, pg_missing_item
> &needs_recovery_map
=
1043 missing_loc
.get_needs_recovery();
1044 map
<hobject_t
, pg_missing_item
>::const_iterator p
=
1045 needs_recovery_map
.upper_bound(offset
);
1047 f
->open_array_section("objects");
1049 for (; p
!= needs_recovery_map
.end() && num
< cct
->_conf
->osd_command_max_records
; ++p
) {
1050 if (missing_loc
.is_unfound(p
->first
)) {
1051 f
->open_object_section("object");
1053 f
->open_object_section("oid");
1054 p
->first
.dump(f
.get());
1057 p
->second
.dump(f
.get()); // have, need keys
1059 f
->open_array_section("locations");
1060 for (set
<pg_shard_t
>::iterator r
=
1061 missing_loc
.get_locations(p
->first
).begin();
1062 r
!= missing_loc
.get_locations(p
->first
).end();
1064 f
->dump_stream("shard") << *r
;
1073 f
->dump_bool("more", p
!= needs_recovery_map
.end());
1079 ss
<< "unknown pg command " << prefix
;
1083 // ==========================================================
1085 void PrimaryLogPG::do_pg_op(OpRequestRef op
)
1087 // NOTE: this is non-const because we modify the OSDOp.outdata in
1089 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1090 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1091 dout(10) << "do_pg_op " << *m
<< dendl
;
1096 string cname
, mname
;
1097 PGLSFilter
*filter
= NULL
;
1098 bufferlist filter_out
;
1100 snapid_t snapid
= m
->get_snapid();
1102 vector
<OSDOp
> ops
= m
->ops
;
1104 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
1106 bufferlist::iterator bp
= p
->indata
.begin();
1108 case CEPH_OSD_OP_PGNLS_FILTER
:
1110 ::decode(cname
, bp
);
1111 ::decode(mname
, bp
);
1113 catch (const buffer::error
& e
) {
1114 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1122 result
= get_pgls_filter(bp
, &filter
);
1130 case CEPH_OSD_OP_PGNLS
:
1131 if (snapid
!= CEPH_NOSNAP
) {
1135 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1136 dout(10) << " pgnls pg=" << m
->get_pg()
1137 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1138 << " != " << info
.pgid
<< dendl
;
1141 unsigned list_size
= MIN(cct
->_conf
->osd_max_pgls
, p
->op
.pgls
.count
);
1143 dout(10) << " pgnls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1144 // read into a buffer
1145 vector
<hobject_t
> sentries
;
1146 pg_nls_response_t response
;
1148 ::decode(response
.handle
, bp
);
1150 catch (const buffer::error
& e
) {
1151 dout(0) << "unable to decode PGNLS handle in " << *m
<< dendl
;
1157 hobject_t lower_bound
= response
.handle
;
1158 hobject_t pg_start
= info
.pgid
.pgid
.get_hobj_start();
1159 hobject_t pg_end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1160 dout(10) << " pgnls lower_bound " << lower_bound
1161 << " pg_end " << pg_end
<< dendl
;
1162 if (((!lower_bound
.is_max() && lower_bound
>= pg_end
) ||
1163 (lower_bound
!= hobject_t() && lower_bound
< pg_start
))) {
1164 // this should only happen with a buggy client.
1165 dout(10) << "outside of PG bounds " << pg_start
<< " .. "
1171 hobject_t current
= lower_bound
;
1173 int r
= pgbackend
->objects_list_partial(
1184 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1185 pg_log
.get_missing().get_items().lower_bound(current
);
1186 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1187 hobject_t _max
= hobject_t::get_max();
1189 const hobject_t
&mcand
=
1190 missing_iter
== pg_log
.get_missing().get_items().end() ?
1192 missing_iter
->first
;
1193 const hobject_t
&lcand
=
1194 ls_iter
== sentries
.end() ?
1198 hobject_t candidate
;
1199 if (mcand
== lcand
) {
1201 if (!mcand
.is_max()) {
1205 } else if (mcand
< lcand
) {
1207 assert(!mcand
.is_max());
1211 assert(!lcand
.is_max());
1215 dout(10) << " pgnls candidate 0x" << std::hex
<< candidate
.get_hash()
1216 << " vs lower bound 0x" << lower_bound
.get_hash() << dendl
;
1218 if (candidate
>= next
) {
1222 if (response
.entries
.size() == list_size
) {
1227 // skip snapdir objects
1228 if (candidate
.snap
== CEPH_SNAPDIR
)
1231 if (candidate
.snap
!= CEPH_NOSNAP
)
1234 // skip internal namespace
1235 if (candidate
.get_namespace() == cct
->_conf
->osd_hit_set_namespace
)
1238 // skip wrong namespace
1239 if (m
->get_hobj().nspace
!= librados::all_nspaces
&&
1240 candidate
.get_namespace() != m
->get_hobj().nspace
)
1243 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1246 dout(20) << "pgnls item 0x" << std::hex
1247 << candidate
.get_hash()
1248 << ", rev 0x" << hobject_t::_reverse_bits(candidate
.get_hash())
1250 << candidate
.oid
.name
<< dendl
;
1252 librados::ListObjectImpl item
;
1253 item
.nspace
= candidate
.get_namespace();
1254 item
.oid
= candidate
.oid
.name
;
1255 item
.locator
= candidate
.get_key();
1256 response
.entries
.push_back(item
);
1259 if (next
.is_max() &&
1260 missing_iter
== pg_log
.get_missing().get_items().end() &&
1261 ls_iter
== sentries
.end()) {
1264 // Set response.handle to the start of the next PG according
1265 // to the object sort order.
1266 response
.handle
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1268 response
.handle
= next
;
1270 dout(10) << "pgnls handle=" << response
.handle
<< dendl
;
1271 ::encode(response
, osd_op
.outdata
);
1273 ::encode(filter_out
, osd_op
.outdata
);
1274 dout(10) << " pgnls result=" << result
<< " outdata.length()="
1275 << osd_op
.outdata
.length() << dendl
;
1279 case CEPH_OSD_OP_PGLS_FILTER
:
1281 ::decode(cname
, bp
);
1282 ::decode(mname
, bp
);
1284 catch (const buffer::error
& e
) {
1285 dout(0) << "unable to decode PGLS_FILTER description in " << *m
<< dendl
;
1293 result
= get_pgls_filter(bp
, &filter
);
1301 case CEPH_OSD_OP_PGLS
:
1302 if (snapid
!= CEPH_NOSNAP
) {
1306 if (get_osdmap()->raw_pg_to_pg(m
->get_pg()) != info
.pgid
.pgid
) {
1307 dout(10) << " pgls pg=" << m
->get_pg()
1308 << " " << get_osdmap()->raw_pg_to_pg(m
->get_pg())
1309 << " != " << info
.pgid
<< dendl
;
1312 unsigned list_size
= MIN(cct
->_conf
->osd_max_pgls
, p
->op
.pgls
.count
);
1314 dout(10) << " pgls pg=" << m
->get_pg() << " count " << list_size
<< dendl
;
1315 // read into a buffer
1316 vector
<hobject_t
> sentries
;
1317 pg_ls_response_t response
;
1319 ::decode(response
.handle
, bp
);
1321 catch (const buffer::error
& e
) {
1322 dout(0) << "unable to decode PGLS handle in " << *m
<< dendl
;
1328 hobject_t current
= response
.handle
;
1330 int r
= pgbackend
->objects_list_partial(
1341 assert(snapid
== CEPH_NOSNAP
|| pg_log
.get_missing().get_items().empty());
1343 map
<hobject_t
, pg_missing_item
>::const_iterator missing_iter
=
1344 pg_log
.get_missing().get_items().lower_bound(current
);
1345 vector
<hobject_t
>::iterator ls_iter
= sentries
.begin();
1346 hobject_t _max
= hobject_t::get_max();
1348 const hobject_t
&mcand
=
1349 missing_iter
== pg_log
.get_missing().get_items().end() ?
1351 missing_iter
->first
;
1352 const hobject_t
&lcand
=
1353 ls_iter
== sentries
.end() ?
1357 hobject_t candidate
;
1358 if (mcand
== lcand
) {
1360 if (!mcand
.is_max()) {
1364 } else if (mcand
< lcand
) {
1366 assert(!mcand
.is_max());
1370 assert(!lcand
.is_max());
1374 if (candidate
>= next
) {
1378 if (response
.entries
.size() == list_size
) {
1383 // skip snapdir objects
1384 if (candidate
.snap
== CEPH_SNAPDIR
)
1387 if (candidate
.snap
!= CEPH_NOSNAP
)
1390 // skip wrong namespace
1391 if (candidate
.get_namespace() != m
->get_hobj().nspace
)
1394 if (filter
&& !pgls_filter(filter
, candidate
, filter_out
))
1397 response
.entries
.push_back(make_pair(candidate
.oid
,
1398 candidate
.get_key()));
1400 if (next
.is_max() &&
1401 missing_iter
== pg_log
.get_missing().get_items().end() &&
1402 ls_iter
== sentries
.end()) {
1405 response
.handle
= next
;
1406 ::encode(response
, osd_op
.outdata
);
1408 ::encode(filter_out
, osd_op
.outdata
);
1409 dout(10) << " pgls result=" << result
<< " outdata.length()="
1410 << osd_op
.outdata
.length() << dendl
;
1414 case CEPH_OSD_OP_PG_HITSET_LS
:
1416 list
< pair
<utime_t
,utime_t
> > ls
;
1417 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1418 p
!= info
.hit_set
.history
.end();
1420 ls
.push_back(make_pair(p
->begin
, p
->end
));
1422 ls
.push_back(make_pair(hit_set_start_stamp
, utime_t()));
1423 ::encode(ls
, osd_op
.outdata
);
1427 case CEPH_OSD_OP_PG_HITSET_GET
:
1429 utime_t
stamp(osd_op
.op
.hit_set_get
.stamp
);
1430 if (hit_set_start_stamp
&& stamp
>= hit_set_start_stamp
) {
1431 // read the current in-memory HitSet, not the version we've
1437 ::encode(*hit_set
, osd_op
.outdata
);
1438 result
= osd_op
.outdata
.length();
1440 // read an archived HitSet.
1442 for (list
<pg_hit_set_info_t
>::const_iterator p
= info
.hit_set
.history
.begin();
1443 p
!= info
.hit_set
.history
.end();
1445 if (stamp
>= p
->begin
&& stamp
<= p
->end
) {
1446 oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
1450 if (oid
== hobject_t()) {
1454 if (!pool
.info
.is_replicated()) {
1455 // FIXME: EC not supported yet
1456 result
= -EOPNOTSUPP
;
1459 if (is_unreadable_object(oid
)) {
1460 wait_for_unreadable_object(oid
, op
);
1464 result
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, osd_op
.outdata
);
1469 case CEPH_OSD_OP_SCRUBLS
:
1470 result
= do_scrub_ls(m
, &osd_op
);
1483 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(),
1484 CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
,
1486 reply
->claim_op_out_data(ops
);
1487 reply
->set_result(result
);
1488 reply
->set_reply_versions(info
.last_update
, info
.last_user_version
);
1489 osd
->send_message_osd_client(reply
, m
->get_connection());
1493 int PrimaryLogPG::do_scrub_ls(MOSDOp
*m
, OSDOp
*osd_op
)
1495 if (m
->get_pg() != info
.pgid
.pgid
) {
1496 dout(10) << " scrubls pg=" << m
->get_pg() << " != " << info
.pgid
<< dendl
;
1497 return -EINVAL
; // hmm?
1499 auto bp
= osd_op
->indata
.begin();
1503 } catch (buffer::error
&) {
1504 dout(10) << " corrupted scrub_ls_arg_t" << dendl
;
1508 scrub_ls_result_t result
= {.interval
= info
.history
.same_interval_since
};
1509 if (arg
.interval
!= 0 && arg
.interval
!= info
.history
.same_interval_since
) {
1511 } else if (!scrubber
.store
) {
1513 } else if (arg
.get_snapsets
) {
1514 result
.vals
= scrubber
.store
->get_snap_errors(osd
->store
,
1519 result
.vals
= scrubber
.store
->get_object_errors(osd
->store
,
1524 ::encode(result
, osd_op
->outdata
);
1528 void PrimaryLogPG::calc_trim_to()
1530 size_t target
= cct
->_conf
->osd_min_pg_log_entries
;
1531 if (is_degraded() ||
1532 state_test(PG_STATE_RECOVERING
|
1533 PG_STATE_RECOVERY_WAIT
|
1535 PG_STATE_BACKFILL_WAIT
|
1536 PG_STATE_BACKFILL_TOOFULL
)) {
1537 target
= cct
->_conf
->osd_max_pg_log_entries
;
1540 eversion_t limit
= MIN(
1541 min_last_complete_ondisk
,
1542 pg_log
.get_can_rollback_to());
1543 if (limit
!= eversion_t() &&
1544 limit
!= pg_trim_to
&&
1545 pg_log
.get_log().approx_size() > target
) {
1546 size_t num_to_trim
= pg_log
.get_log().approx_size() - target
;
1547 if (num_to_trim
< cct
->_conf
->osd_pg_log_trim_min
) {
1550 list
<pg_log_entry_t
>::const_iterator it
= pg_log
.get_log().log
.begin();
1551 eversion_t new_trim_to
;
1552 for (size_t i
= 0; i
< num_to_trim
; ++i
) {
1553 new_trim_to
= it
->version
;
1555 if (new_trim_to
> limit
) {
1556 new_trim_to
= limit
;
1557 dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl
;
1561 dout(10) << "calc_trim_to " << pg_trim_to
<< " -> " << new_trim_to
<< dendl
;
1562 pg_trim_to
= new_trim_to
;
1563 assert(pg_trim_to
<= pg_log
.get_head());
1564 assert(pg_trim_to
<= min_last_complete_ondisk
);
1568 PrimaryLogPG::PrimaryLogPG(OSDService
*o
, OSDMapRef curmap
,
1569 const PGPool
&_pool
, spg_t p
) :
1570 PG(o
, curmap
, _pool
, p
),
1572 PGBackend::build_pg_backend(
1573 _pool
.info
, curmap
, this, coll_t(p
), ch
, o
->store
, cct
)),
1574 object_contexts(o
->cct
, o
->cct
->_conf
->osd_pg_object_context_cache_count
),
1575 snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
1576 new_backfill(false),
1578 snap_trimmer_machine(this)
1580 missing_loc
.set_backend_predicates(
1581 pgbackend
->get_is_readable_predicate(),
1582 pgbackend
->get_is_recoverable_predicate());
1583 snap_trimmer_machine
.initiate();
1586 void PrimaryLogPG::get_src_oloc(const object_t
& oid
, const object_locator_t
& oloc
, object_locator_t
& src_oloc
)
1589 if (oloc
.key
.empty())
1590 src_oloc
.key
= oid
.name
;
1593 void PrimaryLogPG::handle_backoff(OpRequestRef
& op
)
1595 const MOSDBackoff
*m
= static_cast<const MOSDBackoff
*>(op
->get_req());
1596 SessionRef session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1599 session
->put(); // get_priv takes a ref, and so does the SessionRef
1600 hobject_t begin
= info
.pgid
.pgid
.get_hobj_start();
1601 hobject_t end
= info
.pgid
.pgid
.get_hobj_end(pool
.info
.get_pg_num());
1602 if (begin
< m
->begin
) {
1608 dout(10) << __func__
<< " backoff ack id " << m
->id
1609 << " [" << begin
<< "," << end
<< ")" << dendl
;
1610 session
->ack_backoff(cct
, m
->pgid
, m
->id
, begin
, end
);
1613 void PrimaryLogPG::do_request(
1615 ThreadPool::TPHandle
&handle
)
1617 if (op
->osd_trace
) {
1618 op
->pg_trace
.init("pg op", &trace_endpoint
, &op
->osd_trace
);
1619 op
->pg_trace
.event("do request");
1621 // make sure we have a new enough map
1622 auto p
= waiting_for_map
.find(op
->get_source());
1623 if (p
!= waiting_for_map
.end()) {
1624 // preserve ordering
1625 dout(20) << __func__
<< " waiting_for_map "
1626 << p
->first
<< " not empty, queueing" << dendl
;
1627 p
->second
.push_back(op
);
1628 op
->mark_delayed("waiting_for_map not empty");
1631 if (!have_same_or_newer_map(op
->min_epoch
)) {
1632 dout(20) << __func__
<< " min " << op
->min_epoch
1633 << ", queue on waiting_for_map " << op
->get_source() << dendl
;
1634 waiting_for_map
[op
->get_source()].push_back(op
);
1635 op
->mark_delayed("op must wait for map");
1639 if (can_discard_request(op
)) {
1644 const Message
*m
= op
->get_req();
1645 if (m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
)) {
1646 SessionRef session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1649 session
->put(); // get_priv takes a ref, and so does the SessionRef
1651 if (op
->get_req()->get_type() == CEPH_MSG_OSD_OP
) {
1652 if (session
->check_backoff(cct
, info
.pgid
,
1653 info
.pgid
.pgid
.get_hobj_start(), m
)) {
1660 (!is_active() && is_peered());
1661 if (g_conf
->osd_backoff_on_peering
&& !backoff
) {
1667 add_pg_backoff(session
);
1671 // pg backoff acks at pg-level
1672 if (op
->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF
) {
1673 const MOSDBackoff
*ba
= static_cast<const MOSDBackoff
*>(m
);
1674 if (ba
->begin
!= ba
->end
) {
1681 if (flushes_in_progress
> 0) {
1682 dout(20) << flushes_in_progress
1683 << " flushes_in_progress pending "
1684 << "waiting for active on " << op
<< dendl
;
1685 waiting_for_peered
.push_back(op
);
1686 op
->mark_delayed("waiting for peered");
1691 // Delay unless PGBackend says it's ok
1692 if (pgbackend
->can_handle_while_inactive(op
)) {
1693 bool handled
= pgbackend
->handle_message(op
);
1697 waiting_for_peered
.push_back(op
);
1698 op
->mark_delayed("waiting for peered");
1703 assert(is_peered() && flushes_in_progress
== 0);
1704 if (pgbackend
->handle_message(op
))
1707 switch (op
->get_req()->get_type()) {
1708 case CEPH_MSG_OSD_OP
:
1709 case CEPH_MSG_OSD_BACKOFF
:
1711 dout(20) << " peered, not active, waiting for active on " << op
<< dendl
;
1712 waiting_for_active
.push_back(op
);
1713 op
->mark_delayed("waiting for active");
1716 switch (op
->get_req()->get_type()) {
1717 case CEPH_MSG_OSD_OP
:
1718 // verify client features
1719 if ((pool
.info
.has_tiers() || pool
.info
.is_tier()) &&
1720 !op
->has_feature(CEPH_FEATURE_OSD_CACHEPOOL
)) {
1721 osd
->reply_op_error(op
, -EOPNOTSUPP
);
1726 case CEPH_MSG_OSD_BACKOFF
:
1727 // object-level backoff acks handled in osdop context
1737 case MSG_OSD_SUBOPREPLY
:
1738 do_sub_op_reply(op
);
1741 case MSG_OSD_PG_SCAN
:
1742 do_scan(op
, handle
);
1745 case MSG_OSD_PG_BACKFILL
:
1749 case MSG_OSD_PG_BACKFILL_REMOVE
:
1750 do_backfill_remove(op
);
1753 case MSG_OSD_SCRUB_RESERVE
:
1755 const MOSDScrubReserve
*m
=
1756 static_cast<const MOSDScrubReserve
*>(op
->get_req());
1758 case MOSDScrubReserve::REQUEST
:
1759 handle_scrub_reserve_request(op
);
1761 case MOSDScrubReserve::GRANT
:
1762 handle_scrub_reserve_grant(op
, m
->from
);
1764 case MOSDScrubReserve::REJECT
:
1765 handle_scrub_reserve_reject(op
, m
->from
);
1767 case MOSDScrubReserve::RELEASE
:
1768 handle_scrub_reserve_release(op
);
1774 case MSG_OSD_REP_SCRUB
:
1775 replica_scrub(op
, handle
);
1778 case MSG_OSD_REP_SCRUBMAP
:
1779 do_replica_scrub_map(op
);
1782 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
1783 do_update_log_missing(op
);
1786 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
1787 do_update_log_missing_reply(op
);
1791 assert(0 == "bad message type in do_request");
1795 hobject_t
PrimaryLogPG::earliest_backfill() const
1797 hobject_t e
= hobject_t::get_max();
1798 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
1799 i
!= backfill_targets
.end();
1802 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(bt
);
1803 assert(iter
!= peer_info
.end());
1804 if (iter
->second
.last_backfill
< e
)
1805 e
= iter
->second
.last_backfill
;
1810 /** do_op - do an op
1811 * pg lock will be held (if multithreaded)
1812 * osd_lock NOT held.
1814 void PrimaryLogPG::do_op(OpRequestRef
& op
)
1817 // NOTE: take a non-const pointer here; we must be careful not to
1818 // change anything that will break other reads on m (operator<<).
1819 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
1820 assert(m
->get_type() == CEPH_MSG_OSD_OP
);
1821 if (m
->finish_decode()) {
1822 op
->reset_desc(); // for TrackedOp
1826 dout(20) << __func__
<< ": op " << *m
<< dendl
;
1828 hobject_t head
= m
->get_hobj();
1829 head
.snap
= CEPH_NOSNAP
;
1831 if (!info
.pgid
.pgid
.contains(
1832 info
.pgid
.pgid
.get_split_bits(pool
.info
.get_pg_num()), head
)) {
1833 derr
<< __func__
<< " " << info
.pgid
.pgid
<< " does not contain "
1834 << head
<< " pg_num " << pool
.info
.get_pg_num() << " hash "
1835 << std::hex
<< head
.get_hash() << std::dec
<< dendl
;
1836 osd
->clog
->warn() << info
.pgid
.pgid
<< " does not contain " << head
1838 assert(!cct
->_conf
->osd_debug_misdirected_ops
);
1843 m
->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF
);
1846 session
= static_cast<Session
*>(m
->get_connection()->get_priv());
1847 if (!session
.get()) {
1848 dout(10) << __func__
<< " no session" << dendl
;
1851 session
->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1853 if (session
->check_backoff(cct
, info
.pgid
, head
, m
)) {
1858 if (m
->has_flag(CEPH_OSD_FLAG_PARALLELEXEC
)) {
1860 dout(20) << __func__
<< ": PARALLELEXEC not implemented " << *m
<< dendl
;
1861 osd
->reply_op_error(op
, -EINVAL
);
1865 if (op
->rmw_flags
== 0) {
1866 int r
= osd
->osd
->init_op_flags(op
);
1868 osd
->reply_op_error(op
, r
);
1873 if ((m
->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS
|
1874 CEPH_OSD_FLAG_LOCALIZE_READS
)) &&
1876 !(op
->may_write() || op
->may_cache())) {
1877 // balanced reads; any replica will do
1878 if (!(is_primary() || is_replica())) {
1879 osd
->handle_misdirected_op(this, op
);
1883 // normal case; must be primary
1884 if (!is_primary()) {
1885 osd
->handle_misdirected_op(this, op
);
1890 if (!op_has_sufficient_caps(op
)) {
1891 osd
->reply_op_error(op
, -EPERM
);
1895 if (op
->includes_pg_op()) {
1896 return do_pg_op(op
);
1899 // object name too long?
1900 if (m
->get_oid().name
.size() > cct
->_conf
->osd_max_object_name_len
) {
1901 dout(4) << "do_op name is longer than "
1902 << cct
->_conf
->osd_max_object_name_len
1903 << " bytes" << dendl
;
1904 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1907 if (m
->get_hobj().get_key().size() > cct
->_conf
->osd_max_object_name_len
) {
1908 dout(4) << "do_op locator is longer than "
1909 << cct
->_conf
->osd_max_object_name_len
1910 << " bytes" << dendl
;
1911 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1914 if (m
->get_hobj().nspace
.size() > cct
->_conf
->osd_max_object_namespace_len
) {
1915 dout(4) << "do_op namespace is longer than "
1916 << cct
->_conf
->osd_max_object_namespace_len
1917 << " bytes" << dendl
;
1918 osd
->reply_op_error(op
, -ENAMETOOLONG
);
1922 if (int r
= osd
->store
->validate_hobject_key(head
)) {
1923 dout(4) << "do_op object " << head
<< " invalid for backing store: "
1925 osd
->reply_op_error(op
, r
);
1930 if (get_osdmap()->is_blacklisted(m
->get_source_addr())) {
1931 dout(10) << "do_op " << m
->get_source_addr() << " is blacklisted" << dendl
;
1932 osd
->reply_op_error(op
, -EBLACKLISTED
);
1936 // order this op as a write?
1937 bool write_ordered
= op
->rwordered();
1939 // discard due to cluster full transition? (we discard any op that
1940 // originates before the cluster or pool is marked full; the client
1941 // will resend after the full flag is removed or if they expect the
1942 // op to succeed despite being full). The except is FULL_FORCE and
1943 // FULL_TRY ops, which there is no reason to discard because they
1944 // bypass all full checks anyway. If this op isn't write or
1945 // read-ordered, we skip.
1946 // FIXME: we exclude mds writes for now.
1947 if (write_ordered
&& !(m
->get_source().is_mds() ||
1948 m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
) ||
1949 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) &&
1950 info
.history
.last_epoch_marked_full
> m
->get_map_epoch()) {
1951 dout(10) << __func__
<< " discarding op sent before full " << m
<< " "
1955 // mds should have stopped writing before this point.
1956 // We can't allow OSD to become non-startable even if mds
1957 // could be writing as part of file removals.
1959 if (write_ordered
&& osd
->check_failsafe_full(ss
)) {
1960 dout(10) << __func__
<< " fail-safe full check failed, dropping request"
1965 int64_t poolid
= get_pgid().pool();
1966 if (op
->may_write()) {
1968 const pg_pool_t
*pi
= get_osdmap()->get_pg_pool(poolid
);
1974 if (m
->get_snapid() != CEPH_NOSNAP
) {
1975 dout(20) << __func__
<< ": write to clone not valid " << *m
<< dendl
;
1976 osd
->reply_op_error(op
, -EINVAL
);
1981 if (cct
->_conf
->osd_max_write_size
&&
1982 m
->get_data_len() > cct
->_conf
->osd_max_write_size
<< 20) {
1983 // journal can't hold commit!
1984 derr
<< "do_op msg data len " << m
->get_data_len()
1985 << " > osd_max_write_size " << (cct
->_conf
->osd_max_write_size
<< 20)
1986 << " on " << *m
<< dendl
;
1987 osd
->reply_op_error(op
, -OSD_WRITETOOBIG
);
1992 dout(10) << "do_op " << *m
1993 << (op
->may_write() ? " may_write" : "")
1994 << (op
->may_read() ? " may_read" : "")
1995 << (op
->may_cache() ? " may_cache" : "")
1996 << " -> " << (write_ordered
? "write-ordered" : "read-ordered")
1997 << " flags " << ceph_osd_flag_string(m
->get_flags())
2001 if (is_unreadable_object(head
)) {
2002 if (!is_primary()) {
2003 osd
->reply_op_error(op
, -EAGAIN
);
2007 (g_conf
->osd_backoff_on_degraded
||
2008 (g_conf
->osd_backoff_on_unfound
&& missing_loc
.is_unfound(head
)))) {
2009 add_backoff(session
, head
, head
);
2010 maybe_kick_recovery(head
);
2012 wait_for_unreadable_object(head
, op
);
2018 if (write_ordered
&& is_degraded_or_backfilling_object(head
)) {
2019 if (can_backoff
&& g_conf
->osd_backoff_on_degraded
) {
2020 add_backoff(session
, head
, head
);
2022 wait_for_degraded_object(head
, op
);
2027 if (write_ordered
&&
2028 scrubber
.write_blocked_by_scrub(head
)) {
2029 dout(20) << __func__
<< ": waiting for scrub" << dendl
;
2030 waiting_for_scrub
.push_back(op
);
2031 op
->mark_delayed("waiting for scrub");
2036 map
<hobject_t
, snapid_t
>::iterator blocked_iter
=
2037 objects_blocked_on_degraded_snap
.find(head
);
2038 if (write_ordered
&& blocked_iter
!= objects_blocked_on_degraded_snap
.end()) {
2039 hobject_t
to_wait_on(head
);
2040 to_wait_on
.snap
= blocked_iter
->second
;
2041 wait_for_degraded_object(to_wait_on
, op
);
2044 map
<hobject_t
, ObjectContextRef
>::iterator blocked_snap_promote_iter
=
2045 objects_blocked_on_snap_promotion
.find(head
);
2046 if (write_ordered
&&
2047 blocked_snap_promote_iter
!= objects_blocked_on_snap_promotion
.end()) {
2048 wait_for_blocked_object(
2049 blocked_snap_promote_iter
->second
->obs
.oi
.soid
,
2053 if (write_ordered
&& objects_blocked_on_cache_full
.count(head
)) {
2054 block_write_on_full_cache(head
, op
);
2059 hobject_t snapdir
= head
.get_snapdir();
2061 if (is_unreadable_object(snapdir
)) {
2062 wait_for_unreadable_object(snapdir
, op
);
2067 if (write_ordered
&& is_degraded_or_backfilling_object(snapdir
)) {
2068 wait_for_degraded_object(snapdir
, op
);
2073 if (op
->may_write() || op
->may_cache()) {
2074 // warning: we will get back *a* request for this reqid, but not
2075 // necessarily the most recent. this happens with flush and
2076 // promote ops, but we can't possible have both in our log where
2077 // the original request is still not stable on disk, so for our
2078 // purposes here it doesn't matter which one we get.
2080 version_t user_version
;
2081 int return_code
= 0;
2082 bool got
= check_in_progress_op(
2083 m
->get_reqid(), &version
, &user_version
, &return_code
);
2085 dout(3) << __func__
<< " dup " << m
->get_reqid()
2086 << " version " << version
<< dendl
;
2087 if (already_complete(version
)) {
2088 osd
->reply_op_error(op
, return_code
, version
, user_version
);
2090 dout(10) << " waiting for " << version
<< " to commit" << dendl
;
2091 // always queue ondisk waiters, so that we can requeue if needed
2092 waiting_for_ondisk
[version
].push_back(make_pair(op
, user_version
));
2093 op
->mark_delayed("waiting for ondisk");
2099 ObjectContextRef obc
;
2100 bool can_create
= op
->may_write() || op
->may_cache();
2101 hobject_t missing_oid
;
2102 const hobject_t
& oid
= m
->get_hobj();
2104 // io blocked on obc?
2105 if (!m
->has_flag(CEPH_OSD_FLAG_FLUSH
) &&
2106 maybe_await_blocked_snapset(oid
, op
)) {
2110 int r
= find_object_context(
2111 oid
, &obc
, can_create
,
2112 m
->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE
),
2116 // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2117 // we have to wait for the object.
2119 // missing the specific snap we need; requeue and wait.
2120 assert(!op
->may_write()); // only happens on a read/cache
2121 wait_for_unreadable_object(missing_oid
, op
);
2124 } else if (r
== 0) {
2125 if (is_unreadable_object(obc
->obs
.oi
.soid
)) {
2126 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2127 << " is unreadable, waiting" << dendl
;
2128 wait_for_unreadable_object(obc
->obs
.oi
.soid
, op
);
2132 // degraded object? (the check above was for head; this could be a clone)
2133 if (write_ordered
&&
2134 obc
->obs
.oi
.soid
.snap
!= CEPH_NOSNAP
&&
2135 is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
2136 dout(10) << __func__
<< ": clone " << obc
->obs
.oi
.soid
2137 << " is degraded, waiting" << dendl
;
2138 wait_for_degraded_object(obc
->obs
.oi
.soid
, op
);
2143 bool in_hit_set
= false;
2146 if (obc
->obs
.oi
.soid
!= hobject_t() && hit_set
->contains(obc
->obs
.oi
.soid
))
2149 if (missing_oid
!= hobject_t() && hit_set
->contains(missing_oid
))
2152 if (!op
->hitset_inserted
) {
2153 hit_set
->insert(oid
);
2154 op
->hitset_inserted
= true;
2155 if (hit_set
->is_full() ||
2156 hit_set_start_stamp
+ pool
.info
.hit_set_period
<= m
->get_recv_stamp()) {
2163 if (agent_choose_mode(false, op
))
2167 if (obc
.get() && obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2168 if (maybe_handle_manifest(op
,
2174 if (maybe_handle_cache(op
,
2183 if (r
&& (r
!= -ENOENT
|| !obc
)) {
2184 // copy the reqids for copy get on ENOENT
2186 (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
)) {
2187 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2190 dout(20) << __func__
<< ": find_object_context got error " << r
<< dendl
;
2191 if (op
->may_write() &&
2192 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2193 record_write_error(op
, oid
, nullptr, r
);
2195 osd
->reply_op_error(op
, r
);
2200 // make sure locator is consistent
2201 object_locator_t
oloc(obc
->obs
.oi
.soid
);
2202 if (m
->get_object_locator() != oloc
) {
2203 dout(10) << " provided locator " << m
->get_object_locator()
2204 << " != object's " << obc
->obs
.oi
.soid
<< dendl
;
2205 osd
->clog
->warn() << "bad locator " << m
->get_object_locator()
2206 << " on object " << oloc
2210 // io blocked on obc?
2211 if (obc
->is_blocked() &&
2212 !m
->has_flag(CEPH_OSD_FLAG_FLUSH
)) {
2213 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
2217 dout(25) << __func__
<< " oi " << obc
->obs
.oi
<< dendl
;
2219 for (vector
<OSDOp
>::iterator p
= m
->ops
.begin(); p
!= m
->ops
.end(); ++p
) {
2222 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2223 if (osd_op
.op
.op
== CEPH_OSD_OP_LIST_SNAPS
&&
2224 m
->get_snapid() != CEPH_SNAPDIR
) {
2225 dout(10) << "LIST_SNAPS with incorrect context" << dendl
;
2226 osd
->reply_op_error(op
, -EINVAL
);
2231 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), m
->ops
, obc
, this);
2233 if (!obc
->obs
.exists
)
2234 ctx
->snapset_obc
= get_object_context(obc
->obs
.oi
.soid
.get_snapdir(), false);
2236 /* Due to obc caching, we might have a cached non-existent snapset_obc
2237 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2238 * do_op pipeline make decisions based on whether snapset_obc is
2241 if (ctx
->snapset_obc
&& !ctx
->snapset_obc
->obs
.exists
)
2242 ctx
->snapset_obc
= ObjectContextRef();
2244 if (m
->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS
)) {
2245 dout(20) << __func__
<< ": skipping rw locks" << dendl
;
2246 } else if (m
->get_flags() & CEPH_OSD_FLAG_FLUSH
) {
2247 dout(20) << __func__
<< ": part of flush, will ignore write lock" << dendl
;
2249 // verify there is in fact a flush in progress
2250 // FIXME: we could make this a stronger test.
2251 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(obc
->obs
.oi
.soid
);
2252 if (p
== flush_ops
.end()) {
2253 dout(10) << __func__
<< " no flush in progress, aborting" << dendl
;
2254 reply_ctx(ctx
, -EINVAL
);
2257 } else if (!get_rw_locks(write_ordered
, ctx
)) {
2258 dout(20) << __func__
<< " waiting for rw locks " << dendl
;
2259 op
->mark_delayed("waiting for rw locks");
2263 dout(20) << __func__
<< " obc " << *obc
<< dendl
;
2266 dout(20) << __func__
<< " returned an error: " << r
<< dendl
;
2268 if (op
->may_write() &&
2269 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
2270 record_write_error(op
, oid
, nullptr, r
);
2272 osd
->reply_op_error(op
, r
);
2277 if (m
->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2278 ctx
->ignore_cache
= true;
2281 if ((op
->may_read()) && (obc
->obs
.oi
.is_lost())) {
2282 // This object is lost. Reading from it returns an error.
2283 dout(20) << __func__
<< ": object " << obc
->obs
.oi
.soid
2284 << " is lost" << dendl
;
2285 reply_ctx(ctx
, -ENFILE
);
2288 if (!op
->may_write() &&
2290 (!obc
->obs
.exists
||
2291 ((m
->get_snapid() != CEPH_SNAPDIR
) &&
2292 obc
->obs
.oi
.is_whiteout()))) {
2293 // copy the reqids for copy get on ENOENT
2294 if (m
->ops
[0].op
.op
== CEPH_OSD_OP_COPY_GET
) {
2295 fill_in_copy_get_noent(op
, oid
, m
->ops
[0]);
2299 reply_ctx(ctx
, -ENOENT
);
2306 utime_t prepare_latency
= ceph_clock_now();
2307 prepare_latency
-= op
->get_dequeued_time();
2308 osd
->logger
->tinc(l_osd_op_prepare_lat
, prepare_latency
);
2309 if (op
->may_read() && op
->may_write()) {
2310 osd
->logger
->tinc(l_osd_op_rw_prepare_lat
, prepare_latency
);
2311 } else if (op
->may_read()) {
2312 osd
->logger
->tinc(l_osd_op_r_prepare_lat
, prepare_latency
);
2313 } else if (op
->may_write() || op
->may_cache()) {
2314 osd
->logger
->tinc(l_osd_op_w_prepare_lat
, prepare_latency
);
2317 // force recovery of the oldest missing object if too many logs
2318 maybe_force_recovery();
2320 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_manifest_detail(
2323 ObjectContextRef obc
)
2325 if (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2326 CEPH_OSD_FLAG_IGNORE_REDIRECT
) {
2327 dout(20) << __func__
<< ": ignoring redirect due to flag" << dendl
;
2328 return cache_result_t::NOOP
;
2332 dout(10) << __func__
<< " " << obc
->obs
.oi
<< " "
2333 << (obc
->obs
.exists
? "exists" : "DNE")
2336 // if it is write-ordered and blocked, stop now
2337 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2338 // we're already doing something with this object
2339 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2340 return cache_result_t::NOOP
;
2343 vector
<OSDOp
> ops
= static_cast<const MOSDOp
*>(op
->get_req())->ops
;
2344 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
) {
2346 ceph_osd_op
& op
= osd_op
.op
;
2347 if (op
.op
== CEPH_OSD_OP_SET_REDIRECT
) {
2348 return cache_result_t::NOOP
;
2352 switch (obc
->obs
.oi
.manifest
.type
) {
2353 case object_manifest_t::TYPE_REDIRECT
:
2354 if (op
->may_write() || write_ordered
) {
2355 do_proxy_write(op
, obc
->obs
.oi
.soid
, obc
);
2357 do_proxy_read(op
, obc
);
2359 return cache_result_t::HANDLED_PROXY
;
2360 case object_manifest_t::TYPE_CHUNKED
:
2362 assert(0 == "unrecognized manifest type");
2365 return cache_result_t::NOOP
;
2368 void PrimaryLogPG::record_write_error(OpRequestRef op
, const hobject_t
&soid
,
2369 MOSDOpReply
*orig_reply
, int r
)
2371 dout(20) << __func__
<< " r=" << r
<< dendl
;
2372 assert(op
->may_write());
2373 const osd_reqid_t
&reqid
= static_cast<const MOSDOp
*>(op
->get_req())->get_reqid();
2374 ObjectContextRef obc
;
2375 mempool::osd_pglog::list
<pg_log_entry_t
> entries
;
2376 entries
.push_back(pg_log_entry_t(pg_log_entry_t::ERROR
, soid
,
2377 get_next_version(), eversion_t(), 0,
2378 reqid
, utime_t(), r
));
2383 boost::intrusive_ptr
<MOSDOpReply
> orig_reply
;
2388 MOSDOpReply
*orig_reply
,
2391 orig_reply(orig_reply
, false /* take over ref */), r(r
)
2394 ldpp_dout(pg
, 20) << "finished " << __func__
<< " r=" << r
<< dendl
;
2395 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2396 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
2397 MOSDOpReply
*reply
= orig_reply
.detach();
2398 if (reply
== nullptr) {
2399 reply
= new MOSDOpReply(m
, r
, pg
->get_osdmap()->get_epoch(),
2402 ldpp_dout(pg
, 10) << " sending commit on " << *m
<< " " << reply
<< dendl
;
2403 pg
->osd
->send_message_osd_client(reply
, m
->get_connection());
2407 ObcLockManager lock_manager
;
2410 std::move(lock_manager
),
2411 boost::optional
<std::function
<void(void)> >(
2412 OnComplete(this, op
, orig_reply
, r
)),
2417 PrimaryLogPG::cache_result_t
PrimaryLogPG::maybe_handle_cache_detail(
2420 ObjectContextRef obc
,
2421 int r
, hobject_t missing_oid
,
2424 ObjectContextRef
*promote_obc
)
2428 op
->get_req()->get_type() == CEPH_MSG_OSD_OP
&&
2429 (static_cast<const MOSDOp
*>(op
->get_req())->get_flags() &
2430 CEPH_OSD_FLAG_IGNORE_CACHE
)) {
2431 dout(20) << __func__
<< ": ignoring cache due to flag" << dendl
;
2432 return cache_result_t::NOOP
;
2434 // return quickly if caching is not enabled
2435 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)
2436 return cache_result_t::NOOP
;
2438 must_promote
= must_promote
|| op
->need_promote();
2441 dout(25) << __func__
<< " " << obc
->obs
.oi
<< " "
2442 << (obc
->obs
.exists
? "exists" : "DNE")
2443 << " missing_oid " << missing_oid
2444 << " must_promote " << (int)must_promote
2445 << " in_hit_set " << (int)in_hit_set
2448 dout(25) << __func__
<< " (no obc)"
2449 << " missing_oid " << missing_oid
2450 << " must_promote " << (int)must_promote
2451 << " in_hit_set " << (int)in_hit_set
2454 // if it is write-ordered and blocked, stop now
2455 if (obc
.get() && obc
->is_blocked() && write_ordered
) {
2456 // we're already doing something with this object
2457 dout(20) << __func__
<< " blocked on " << obc
->obs
.oi
.soid
<< dendl
;
2458 return cache_result_t::NOOP
;
2461 if (r
== -ENOENT
&& missing_oid
== hobject_t()) {
2462 // we know this object is logically absent (e.g., an undefined clone)
2463 return cache_result_t::NOOP
;
2466 if (obc
.get() && obc
->obs
.exists
) {
2467 osd
->logger
->inc(l_osd_op_cache_hit
);
2468 return cache_result_t::NOOP
;
2471 if (missing_oid
== hobject_t() && obc
.get()) {
2472 missing_oid
= obc
->obs
.oi
.soid
;
2475 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2476 const object_locator_t oloc
= m
->get_object_locator();
2478 if (op
->need_skip_handle_cache()) {
2479 return cache_result_t::NOOP
;
2482 // older versions do not proxy the feature bits.
2483 bool can_proxy_write
= get_osdmap()->get_up_osd_features() &
2484 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES
;
2485 OpRequestRef promote_op
;
2487 switch (pool
.info
.cache_mode
) {
2488 case pg_pool_t::CACHEMODE_WRITEBACK
:
2490 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2491 if (!op
->may_write() && !op
->may_cache() &&
2492 !write_ordered
&& !must_promote
) {
2493 dout(20) << __func__
<< " cache pool full, proxying read" << dendl
;
2495 return cache_result_t::HANDLED_PROXY
;
2497 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2498 block_write_on_full_cache(missing_oid
, op
);
2499 return cache_result_t::BLOCKED_FULL
;
2502 if (must_promote
|| (!hit_set
&& !op
->need_skip_promote())) {
2503 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2504 return cache_result_t::BLOCKED_PROMOTE
;
2507 if (op
->may_write() || op
->may_cache()) {
2508 if (can_proxy_write
) {
2509 do_proxy_write(op
, missing_oid
);
2511 // promote if can't proxy the write
2512 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2513 return cache_result_t::BLOCKED_PROMOTE
;
2517 if (!op
->need_skip_promote() &&
2518 maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2519 pool
.info
.min_write_recency_for_promote
,
2522 return cache_result_t::BLOCKED_PROMOTE
;
2524 return cache_result_t::HANDLED_PROXY
;
2528 // Avoid duplicate promotion
2529 if (obc
.get() && obc
->is_blocked()) {
2532 return cache_result_t::BLOCKED_PROMOTE
;
2536 if (!op
->need_skip_promote()) {
2537 (void)maybe_promote(obc
, missing_oid
, oloc
, in_hit_set
,
2538 pool
.info
.min_read_recency_for_promote
,
2539 promote_op
, promote_obc
);
2542 return cache_result_t::HANDLED_PROXY
;
2544 assert(0 == "unreachable");
2545 return cache_result_t::NOOP
;
2547 case pg_pool_t::CACHEMODE_FORWARD
:
2548 // FIXME: this mode allows requests to be reordered.
2549 do_cache_redirect(op
);
2550 return cache_result_t::HANDLED_REDIRECT
;
2552 case pg_pool_t::CACHEMODE_READONLY
:
2553 // TODO: clean this case up
2554 if (!obc
.get() && r
== -ENOENT
) {
2555 // we don't have the object and op's a read
2556 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2557 return cache_result_t::BLOCKED_PROMOTE
;
2559 if (!r
) { // it must be a write
2560 do_cache_redirect(op
);
2561 return cache_result_t::HANDLED_REDIRECT
;
2563 // crap, there was a failure of some kind
2564 return cache_result_t::NOOP
;
2566 case pg_pool_t::CACHEMODE_READFORWARD
:
2567 // Do writeback to the cache tier for writes
2568 if (op
->may_write() || write_ordered
|| must_promote
) {
2570 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2571 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2572 block_write_on_full_cache(missing_oid
, op
);
2573 return cache_result_t::BLOCKED_FULL
;
2575 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2576 return cache_result_t::BLOCKED_PROMOTE
;
2579 // If it is a read, we can read, we need to forward it
2580 do_cache_redirect(op
);
2581 return cache_result_t::HANDLED_REDIRECT
;
2583 case pg_pool_t::CACHEMODE_PROXY
:
2584 if (!must_promote
) {
2585 if (op
->may_write() || op
->may_cache() || write_ordered
) {
2586 if (can_proxy_write
) {
2587 do_proxy_write(op
, missing_oid
);
2588 return cache_result_t::HANDLED_PROXY
;
2592 return cache_result_t::HANDLED_PROXY
;
2595 // ugh, we're forced to promote.
2597 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2598 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2599 block_write_on_full_cache(missing_oid
, op
);
2600 return cache_result_t::BLOCKED_FULL
;
2602 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2603 return cache_result_t::BLOCKED_PROMOTE
;
2605 case pg_pool_t::CACHEMODE_READPROXY
:
2606 // Do writeback to the cache tier for writes
2607 if (op
->may_write() || write_ordered
|| must_promote
) {
2609 agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
2610 dout(20) << __func__
<< " cache pool full, waiting" << dendl
;
2611 block_write_on_full_cache(missing_oid
, op
);
2612 return cache_result_t::BLOCKED_FULL
;
2614 promote_object(obc
, missing_oid
, oloc
, op
, promote_obc
);
2615 return cache_result_t::BLOCKED_PROMOTE
;
2618 // If it is a read, we can read, we need to proxy it
2620 return cache_result_t::HANDLED_PROXY
;
2623 assert(0 == "unrecognized cache_mode");
2625 return cache_result_t::NOOP
;
// Decide whether an object should be promoted into the cache tier.
// Counts how many recent hit sets contain the object (starting from
// in_hit_set) and promotes via promote_object() once the count reaches
// `recency`; a throttled promote returns early. Returns false when not
// promoting.
// NOTE(review): this region of the file is corrupted — logical lines are
// split and some original lines (e.g. the in_hit_set/recency parameter
// lines) are missing. Code left byte-identical; restore from the
// canonical source before building.
2628 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc
,
2629 const hobject_t
& missing_oid
,
2630 const object_locator_t
& oloc
,
2633 OpRequestRef promote_op
,
2634 ObjectContextRef
*promote_obc
)
2636 dout(20) << __func__
<< " missing_oid " << missing_oid
2637 << " in_hit_set " << in_hit_set
<< dendl
;
2643 // Check if in the current hit set
// seed the recency count with the current-hit-set membership flag
2653 unsigned count
= (int)in_hit_set
;
2655 // Check if in other hit sets
// use the object's canonical oid when we have an obc, else the missing oid
2656 const hobject_t
& oid
= obc
.get() ? obc
->obs
.oi
.soid
: missing_oid
;
// walk archived hit sets newest-first, stopping at the first miss
2657 for (map
<time_t,HitSetRef
>::reverse_iterator itor
=
2658 agent_state
->hit_set_map
.rbegin();
2659 itor
!= agent_state
->hit_set_map
.rend();
2661 if (!itor
->second
->contains(oid
)) {
2665 if (count
>= recency
) {
2670 if (count
>= recency
) {
2673 return false; // not promoting
// promote_throttle() limits promotion rate; presumably returns true to skip
2678 if (osd
->promote_throttle()) {
2679 dout(10) << __func__
<< " promote throttled" << dendl
;
2682 promote_object(obc
, missing_oid
, oloc
, promote_op
, promote_obc
);
// Reply to a client op with -ENOENT plus a redirect to the base pool
// (pool.info.tier_of), preserving only the ACK/ONDISK flags from the
// original request. Used by FORWARD/READONLY cache modes.
// NOTE(review): lines here are split/missing due to extraction damage
// (e.g. original line 2695); code left byte-identical.
2686 void PrimaryLogPG::do_cache_redirect(OpRequestRef op
)
2688 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2689 int flags
= m
->get_flags() & (CEPH_OSD_FLAG_ACK
|CEPH_OSD_FLAG_ONDISK
);
// -ENOENT reply carrying the redirect; 'false' presumably suppresses op decoding
2690 MOSDOpReply
*reply
= new MOSDOpReply(m
, -ENOENT
,
2691 get_osdmap()->get_epoch(), flags
, false);
2692 request_redirect_t
redir(m
->get_object_locator(), pool
.info
.tier_of
);
2693 reply
->set_redirect(redir
);
2694 dout(10) << "sending redirect to pool " << pool
.info
.tier_of
<< " for op "
2696 m
->get_connection()->send_message(reply
);
// Completion context for a proxied read issued via the Objecter.
// On finish: bails if the proxy op was canceled, and only under the PG
// lock path calls finish_proxy_read() when no peering reset intervened,
// then records tier read latency.
// NOTE(review): member declarations (pg/oid/tid/start) are missing from
// this corrupted extraction; code left byte-identical.
2700 struct C_ProxyRead
: public Context
{
2703 epoch_t last_peering_reset
;
2705 PrimaryLogPG::ProxyReadOpRef prdop
;
2707 C_ProxyRead(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2708 const PrimaryLogPG::ProxyReadOpRef
& prd
)
2709 : pg(p
), oid(o
), last_peering_reset(lpr
),
2710 tid(0), prdop(prd
), start(ceph_clock_now())
2712 void finish(int r
) override
{
// canceled is checked twice — presumably once before and once after
// taking the PG lock; confirm against the canonical source
2713 if (prdop
->canceled
)
2716 if (prdop
->canceled
) {
2720 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2721 pg
->finish_proxy_read(oid
, tid
, r
);
2722 pg
->osd
->logger
->tinc(l_osd_tier_r_lat
, ceph_clock_now() - start
);
// Proxy a read to the base tier without promoting. Resolves the target
// (following a manifest redirect if present), copies the client's ops,
// optionally rewrites fadvise flags for cache-friendly reads, then issues
// an Objecter read whose completion is routed to finish_proxy_read().
// Bookkeeping: proxyread_ops[tid] and in_progress_proxy_ops[soid].
// NOTE(review): corrupted extraction — switch header/breaks and some
// lines are missing; code left byte-identical.
2728 void PrimaryLogPG::do_proxy_read(OpRequestRef op
, ObjectContextRef obc
)
2730 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2731 // stash the result in the request's OSDOp vector
2732 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2733 object_locator_t oloc
;
2735 /* extensible tier */
// a manifest-bearing head redirects the proxy target to redirect_target
2736 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2737 switch (obc
->obs
.oi
.manifest
.type
) {
2738 case object_manifest_t::TYPE_REDIRECT
:
2739 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
2740 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
2742 case object_manifest_t::TYPE_CHUNKED
:
2744 assert(0 == "unrecognized manifest type");
// non-manifest path: proxy the request's own object into the base tier pool
2748 soid
= m
->get_hobj();
2749 oloc
= object_locator_t(m
->get_object_locator());
2750 oloc
.pool
= pool
.info
.tier_of
;
2752 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
2754 // pass through some original flags that make sense.
2755 //  - leave out redirection and balancing flags since we are
2756 //    already proxying through the primary
2757 //  - leave off read/write/exec flags that are derived from the op
2758 flags
|= m
->get_flags() & (CEPH_OSD_FLAG_RWORDERED
|
2759 CEPH_OSD_FLAG_ORDERSNAP
|
2760 CEPH_OSD_FLAG_ENFORCE_SNAPC
|
2761 CEPH_OSD_FLAG_MAP_SNAP_CLONE
);
2763 dout(10) << __func__
<< " Start proxy read for " << *m
<< dendl
;
2765 ProxyReadOpRef
prdop(std::make_shared
<ProxyReadOp
>(op
, soid
, m
->ops
));
2767 ObjectOperation obj_op
;
2768 obj_op
.dup(prdop
->ops
);
// in writeback mode (and not evict-full), mark reads SEQUENTIAL and strip
// DONTNEED/NOCACHE so the base tier keeps data warm for a later promote
2770 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_WRITEBACK
&&
2771 (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)) {
2772 for (unsigned i
= 0; i
< obj_op
.ops
.size(); i
++) {
2773 ceph_osd_op op
= obj_op
.ops
[i
].op
;
2775 case CEPH_OSD_OP_READ
:
2776 case CEPH_OSD_OP_SYNC_READ
:
2777 case CEPH_OSD_OP_SPARSE_READ
:
2778 case CEPH_OSD_OP_CHECKSUM
:
2779 op
.flags
= (op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL
) &
2780 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
| CEPH_OSD_OP_FLAG_FADVISE_NOCACHE
);
2785 C_ProxyRead
*fin
= new C_ProxyRead(this, soid
, get_last_peering_reset(),
// completion runs on the objecter finisher, then back into this PG
2787 ceph_tid_t tid
= osd
->objecter
->read(
2788 soid
.oid
, oloc
, obj_op
,
2789 m
->get_snapid(), NULL
,
2790 flags
, new C_OnFinisher(fin
, &osd
->objecter_finisher
),
2791 &prdop
->user_version
,
2792 &prdop
->data_offset
,
2795 prdop
->objecter_tid
= tid
;
2796 proxyread_ops
[tid
] = prdop
;
2797 in_progress_proxy_ops
[soid
].push_back(op
);
// Completion path for a proxied read: validates the tid/oid against the
// recorded ProxyReadOp, removes the bookkeeping entries, bumps the
// tier-proxy-read counter, then builds an OpContext carrying the proxied
// results and completes the read back to the client.
// NOTE(review): corrupted extraction — several lines (returns, std::find
// arguments) are missing; code left byte-identical.
2800 void PrimaryLogPG::finish_proxy_read(hobject_t oid
, ceph_tid_t tid
, int r
)
2802 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2803 << " " << cpp_strerror(r
) << dendl
;
// stale-completion guards: op may have been canceled/replaced
2805 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.find(tid
);
2806 if (p
== proxyread_ops
.end()) {
2807 dout(10) << __func__
<< " no proxyread_op found" << dendl
;
2810 ProxyReadOpRef prdop
= p
->second
;
2811 if (tid
!= prdop
->objecter_tid
) {
2812 dout(10) << __func__
<< " tid " << tid
<< " != prdop " << prdop
2813 << " tid " << prdop
->objecter_tid
<< dendl
;
2816 if (oid
!= prdop
->soid
) {
2817 dout(10) << __func__
<< " oid " << oid
<< " != prdop " << prdop
2818 << " soid " << prdop
->soid
<< dendl
;
2821 proxyread_ops
.erase(tid
);
// drop this request from the per-object in-progress list
2823 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(oid
);
2824 if (q
== in_progress_proxy_ops
.end()) {
2825 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
2828 assert(q
->second
.size());
2829 list
<OpRequestRef
>::iterator it
= std::find(q
->second
.begin(),
2832 assert(it
!= q
->second
.end());
2833 OpRequestRef op
= *it
;
2834 q
->second
.erase(it
);
2835 if (q
->second
.size() == 0) {
2836 in_progress_proxy_ops
.erase(oid
);
2839 osd
->logger
->inc(l_osd_tier_proxy_read
);
// hand the proxied results back to the client via a fresh OpContext;
// ignore_log_op_stats: stats were accounted by the base tier, presumably
2841 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
2842 OpContext
*ctx
= new OpContext(op
, m
->get_reqid(), prdop
->ops
, this);
2843 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, false);
2844 ctx
->user_at_version
= prdop
->user_version
;
2845 ctx
->data_off
= prdop
->data_offset
;
2846 ctx
->ignore_log_op_stats
= true;
2847 complete_read_ctx(r
, ctx
);
// Requeue every client op that was waiting behind an in-flight proxy op
// on `soid`, then drop the per-object entry.
// NOTE(review): corrupted extraction — the line that actually requeues
// the ops (original line 2858) is missing from this chunk; code left
// byte-identical. Restore from the canonical source.
2850 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t
& soid
)
2852 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= in_progress_proxy_ops
.find(soid
);
2853 if (p
== in_progress_proxy_ops
.end())
2856 list
<OpRequestRef
>& ls
= p
->second
;
2857 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
2859 in_progress_proxy_ops
.erase(p
);
2862 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop
)
2864 dout(10) << __func__
<< " " << prdop
->soid
<< dendl
;
2865 prdop
->canceled
= true;
2867 // cancel objecter op, if we can
2868 if (prdop
->objecter_tid
) {
2869 osd
->objecter
->op_cancel(prdop
->objecter_tid
, -ECANCELED
);
2870 for (uint32_t i
= 0; i
< prdop
->ops
.size(); i
++) {
2871 prdop
->ops
[i
].outdata
.clear();
2873 proxyread_ops
.erase(prdop
->objecter_tid
);
2874 prdop
->objecter_tid
= 0;
// Cancel every outstanding proxy read and write, and (per `requeue`)
// requeue or drop the client ops that were tracked per object, finally
// clearing in_progress_proxy_ops.
// NOTE(review): corrupted extraction — the requeue call (original line
// 2901) is missing from this chunk; code left byte-identical.
2878 void PrimaryLogPG::cancel_proxy_ops(bool requeue
)
2880 dout(10) << __func__
<< dendl
;
2882 // cancel proxy reads
// post-increment inside the call keeps the iterator valid while
// cancel_proxy_read() erases the current entry
2883 map
<ceph_tid_t
, ProxyReadOpRef
>::iterator p
= proxyread_ops
.begin();
2884 while (p
!= proxyread_ops
.end()) {
2885 cancel_proxy_read((p
++)->second
);
2888 // cancel proxy writes
2889 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator q
= proxywrite_ops
.begin();
2890 while (q
!= proxywrite_ops
.end()) {
2891 cancel_proxy_write((q
++)->second
);
2895 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
=
2896 in_progress_proxy_ops
.begin();
2897 while (p
!= in_progress_proxy_ops
.end()) {
2898 list
<OpRequestRef
>& ls
= p
->second
;
2899 dout(10) << __func__
<< " " << p
->first
<< " requeuing " << ls
.size()
2900 << " requests" << dendl
;
2902 in_progress_proxy_ops
.erase(p
++);
2905 in_progress_proxy_ops
.clear();
// Completion context for a proxied write's commit. Ignores canceled ops
// and calls finish_proxy_write() only if no peering reset happened since
// the write was issued.
// NOTE(review): member declarations and part of the ctor initializer are
// missing from this corrupted extraction; code left byte-identical.
2909 struct C_ProxyWrite_Commit
: public Context
{
2912 epoch_t last_peering_reset
;
2914 PrimaryLogPG::ProxyWriteOpRef pwop
;
2915 C_ProxyWrite_Commit(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
2916 const PrimaryLogPG::ProxyWriteOpRef
& pw
)
2917 : pg(p
), oid(o
), last_peering_reset(lpr
),
2920 void finish(int r
) override
{
2924 if (pwop
->canceled
) {
2928 if (last_peering_reset
== pg
->get_last_peering_reset()) {
2929 pg
->finish_proxy_write(oid
, tid
, r
);
// Proxy a write to the base tier without promoting. Resolves the target
// (manifest redirect aware), builds a ProxyWriteOp with its own OpContext,
// duplicates the client's ops, and issues an Objecter mutate whose commit
// is handled by C_ProxyWrite_Commit/finish_proxy_write().
// Bookkeeping: proxywrite_ops[tid] and in_progress_proxy_ops[soid].
// NOTE(review): corrupted extraction — break/closing lines are missing;
// code left byte-identical.
2935 void PrimaryLogPG::do_proxy_write(OpRequestRef op
, const hobject_t
& missing_oid
, ObjectContextRef obc
)
2937 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2938 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
2939 object_locator_t oloc
;
// snap context comes from the client request
2940 SnapContext
snapc(m
->get_snap_seq(), m
->get_snaps());
2942 /* extensible tier */
2943 if (obc
&& obc
->obs
.exists
&& obc
->obs
.oi
.has_manifest()) {
2944 switch (obc
->obs
.oi
.manifest
.type
) {
2945 case object_manifest_t::TYPE_REDIRECT
:
2946 oloc
= object_locator_t(obc
->obs
.oi
.manifest
.redirect_target
);
2947 soid
= obc
->obs
.oi
.manifest
.redirect_target
;
2949 case object_manifest_t::TYPE_CHUNKED
:
2951 assert(0 == "unrecognized manifest type");
2955 soid
= m
->get_hobj();
2956 oloc
= object_locator_t(m
->get_object_locator());
2957 oloc
.pool
= pool
.info
.tier_of
;
2960 unsigned flags
= CEPH_OSD_FLAG_IGNORE_CACHE
| CEPH_OSD_FLAG_IGNORE_OVERLAY
;
// a proxied non-write (e.g. cache op) still needs rw ordering downstream
2961 if (!(op
->may_write() || op
->may_cache())) {
2962 flags
|= CEPH_OSD_FLAG_RWORDERED
;
2964 dout(10) << __func__
<< " Start proxy write for " << *m
<< dendl
;
2966 ProxyWriteOpRef
pwop(std::make_shared
<ProxyWriteOp
>(op
, soid
, m
->ops
, m
->get_reqid()));
2967 pwop
->ctx
= new OpContext(op
, m
->get_reqid(), pwop
->ops
, this);
2968 pwop
->mtime
= m
->get_mtime();
2970 ObjectOperation obj_op
;
2971 obj_op
.dup(pwop
->ops
);
2973 C_ProxyWrite_Commit
*fin
= new C_ProxyWrite_Commit(
2974 this, soid
, get_last_peering_reset(), pwop
);
// forward the client reqid so the base tier can do dup-op detection
2975 ceph_tid_t tid
= osd
->objecter
->mutate(
2976 soid
.oid
, oloc
, obj_op
, snapc
,
2977 ceph::real_clock::from_ceph_timespec(pwop
->mtime
),
2978 flags
, new C_OnFinisher(fin
, &osd
->objecter_finisher
),
2979 &pwop
->user_version
, pwop
->reqid
);
2981 pwop
->objecter_tid
= tid
;
2982 proxywrite_ops
[tid
] = pwop
;
2983 in_progress_proxy_ops
[soid
].push_back(op
);
// Commit path for a proxied write: validates tid/oid, removes bookkeeping,
// bumps the tier-proxy-write counter, and (if not already sent) sends the
// client an ONDISK reply carrying the base tier's user_version.
// NOTE(review): corrupted extraction — several lines are missing; code
// left byte-identical.
2986 void PrimaryLogPG::finish_proxy_write(hobject_t oid
, ceph_tid_t tid
, int r
)
2988 dout(10) << __func__
<< " " << oid
<< " tid " << tid
2989 << " " << cpp_strerror(r
) << dendl
;
2991 map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator p
= proxywrite_ops
.find(tid
);
2992 if (p
== proxywrite_ops
.end()) {
2993 dout(10) << __func__
<< " no proxywrite_op found" << dendl
;
2996 ProxyWriteOpRef pwop
= p
->second
;
// unlike the read path these are hard asserts, not soft mismatch logs
2997 assert(tid
== pwop
->objecter_tid
);
2998 assert(oid
== pwop
->soid
);
3000 proxywrite_ops
.erase(tid
);
3002 map
<hobject_t
, list
<OpRequestRef
> >::iterator q
= in_progress_proxy_ops
.find(oid
);
3003 if (q
== in_progress_proxy_ops
.end()) {
3004 dout(10) << __func__
<< " no in_progress_proxy_ops found" << dendl
;
3009 list
<OpRequestRef
>& in_progress_op
= q
->second
;
3010 assert(in_progress_op
.size());
3011 list
<OpRequestRef
>::iterator it
= std::find(in_progress_op
.begin(),
3012 in_progress_op
.end(),
3014 assert(it
!= in_progress_op
.end());
3015 in_progress_op
.erase(it
);
3016 if (in_progress_op
.size() == 0) {
3017 in_progress_proxy_ops
.erase(oid
);
3020 osd
->logger
->inc(l_osd_tier_proxy_write
);
3022 const MOSDOp
*m
= static_cast<const MOSDOp
*>(pwop
->op
->get_req());
// send the commit reply to the client exactly once
3025 if (!pwop
->sent_reply
) {
3027 MOSDOpReply
*reply
= pwop
->ctx
->reply
;
3029 pwop
->ctx
->reply
= NULL
;
3031 reply
= new MOSDOpReply(m
, r
, get_osdmap()->get_epoch(), 0, true);
3032 reply
->set_reply_versions(eversion_t(), pwop
->user_version
);
3034 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3035 dout(10) << " sending commit on " << pwop
<< " " << reply
<< dendl
;
3036 osd
->send_message_osd_client(reply
, m
->get_connection());
3037 pwop
->sent_reply
= true;
3038 pwop
->ctx
->op
->mark_commit_sent();
// Cancel an in-flight proxied write: mark the ProxyWriteOp canceled so a
// late commit is ignored, try to cancel the Objecter op, and drop the tid
// bookkeeping.
// NOTE(review): corrupted extraction — original lines 3053-3054 (likely
// the ctx cleanup) are missing; code left byte-identical. Restore from
// the canonical source.
3045 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop
)
3047 dout(10) << __func__
<< " " << pwop
->soid
<< dendl
;
3048 pwop
->canceled
= true;
3050 // cancel objecter op, if we can
3051 if (pwop
->objecter_tid
) {
3052 osd
->objecter
->op_cancel(pwop
->objecter_tid
, -ECANCELED
);
3055 proxywrite_ops
.erase(pwop
->objecter_tid
);
3056 pwop
->objecter_tid
= 0;
// Copy-completion callback for a promotion: unpacks the copy result and
// hands it to finish_promote(), then records promote latency.
// NOTE(review): member declarations and ctor initializer lines are
// missing from this corrupted extraction; code left byte-identical.
3060 class PromoteCallback
: public PrimaryLogPG::CopyCallback
{
3061 ObjectContextRef obc
;
3065 PromoteCallback(ObjectContextRef obc_
, PrimaryLogPG
*pg_
)
3068 start(ceph_clock_now()) {}
3070 void finish(PrimaryLogPG::CopyCallbackResults results
) override
{
// results is a tuple of (return code, CopyResults*)
3071 PrimaryLogPG::CopyResults
*results_data
= results
.get
<1>();
3072 int r
= results
.get
<0>();
3073 pg
->finish_promote(r
, results_data
, obc
);
3074 pg
->osd
->logger
->tinc(l_osd_tier_promote_lat
, ceph_clock_now() - start
);
// Kick off promotion of an object from the base tier into this cache
// tier: defers if a scrub blocks writes on the object, creates an obc if
// needed, then starts an async copy (start_copy) with a PromoteCallback;
// the triggering op waits on the now-blocked object.
// NOTE(review): corrupted extraction — the `op` parameter line and
// several body lines are missing; code left byte-identical.
3078 void PrimaryLogPG::promote_object(ObjectContextRef obc
,
3079 const hobject_t
& missing_oid
,
3080 const object_locator_t
& oloc
,
3082 ObjectContextRef
*promote_obc
)
3084 hobject_t hoid
= obc
? obc
->obs
.oi
.soid
: missing_oid
;
3085 assert(hoid
!= hobject_t());
// a scrub in progress on this object forces the op to wait (or be dropped
// when there is no op)
3086 if (scrubber
.write_blocked_by_scrub(hoid
)) {
3087 dout(10) << __func__
<< " " << hoid
3088 << " blocked by scrub" << dendl
;
3090 waiting_for_scrub
.push_back(op
);
3091 op
->mark_delayed("waiting for scrub");
3092 dout(10) << __func__
<< " " << hoid
3093 << " placing op in waiting_for_scrub" << dendl
;
3095 dout(10) << __func__
<< " " << hoid
3096 << " no op, dropping on the floor" << dendl
;
3100 if (!obc
) { // we need to create an ObjectContext
3101 assert(missing_oid
!= hobject_t());
3102 obc
= get_object_context(missing_oid
, true);
3108 * Before promote complete, if there are proxy-reads for the object,
3109 * for this case we don't use DONTNEED.
3111 unsigned src_fadvise_flags
= LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
;
3112 map
<hobject_t
, list
<OpRequestRef
>>::iterator q
= in_progress_proxy_ops
.find(obc
->obs
.oi
.soid
);
3113 if (q
== in_progress_proxy_ops
.end()) {
3114 src_fadvise_flags
|= LIBRADOS_OP_FLAG_FADVISE_DONTNEED
;
3117 PromoteCallback
*cb
= new PromoteCallback(obc
, this);
// copy source is the base tier pool for this cache tier
3118 object_locator_t my_oloc
= oloc
;
3119 my_oloc
.pool
= pool
.info
.tier_of
;
3121 unsigned flags
= CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
3122 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
3123 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
|
3124 CEPH_OSD_COPY_FROM_FLAG_RWORDERED
;
// head objects (CEPH_NOSNAP) get the mirror-snapset behavior
3125 start_copy(cb
, obc
, obc
->obs
.oi
.soid
, my_oloc
, 0, flags
,
3126 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
,
3127 src_fadvise_flags
, 0);
3129 assert(obc
->is_blocked());
3132 wait_for_blocked_object(obc
->obs
.oi
.soid
, op
);
3133 info
.stats
.stats
.sum
.num_promote
++;
// Main execution path for a client op on this PG. Idempotent by design
// (may be re-run, e.g. by finish_copyfrom): resets the obs/op_t, resolves
// the snap context for writes, prepares the transaction, then either
// completes a read, records a log-only error, or issues replica writes
// via new_repop()/issue_repop().
// NOTE(review): this block is heavily corrupted — many original lines
// (guards, lambda bodies at register_on_commit/success/finish, closing
// braces) are missing. Code left byte-identical; annotate only.
3136 void PrimaryLogPG::execute_ctx(OpContext
*ctx
)
3139 dout(10) << __func__
<< " " << ctx
<< dendl
;
3140 ctx
->reset_obs(ctx
->obc
);
3141 ctx
->update_log_only
= false; // reset in case finish_copyfrom() is re-running execute_ctx
3142 OpRequestRef op
= ctx
->op
;
3143 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3144 ObjectContextRef obc
= ctx
->obc
;
3145 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
3147 // this method must be idempotent since we may call it several times
3148 // before we finally apply the resulting transaction.
3149 ctx
->op_t
.reset(new PGTransaction
);
// -------- snap context selection for writes --------
3151 if (op
->may_write() || op
->may_cache()) {
3153 if (!(m
->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC
)) &&
3154 pool
.info
.is_pool_snaps_mode()) {
3156 ctx
->snapc
= pool
.snapc
;
3158 // client specified snapc
3159 ctx
->snapc
.seq
= m
->get_snap_seq();
3160 ctx
->snapc
.snaps
= m
->get_snaps();
3161 filter_snapc(ctx
->snapc
.snaps
);
// ORDERSNAP: reject writes whose snapc is older than the object's snapset
3163 if ((m
->has_flag(CEPH_OSD_FLAG_ORDERSNAP
)) &&
3164 ctx
->snapc
.seq
< obc
->ssc
->snapset
.seq
) {
3165 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx
->snapc
.seq
3166 << " < snapset seq " << obc
->ssc
->snapset
.seq
3167 << " on " << obc
->obs
.oi
.soid
<< dendl
;
3168 reply_ctx(ctx
, -EOLDSNAPC
);
3173 ctx
->at_version
= get_next_version();
3174 ctx
->mtime
= m
->get_mtime();
3176 dout(10) << __func__
<< " " << soid
<< " " << ctx
->ops
3177 << " ov " << obc
->obs
.oi
.version
<< " av " << ctx
->at_version
3178 << " snapc " << ctx
->snapc
3179 << " snapset " << obc
->ssc
->snapset
3182 dout(10) << __func__
<< " " << soid
<< " " << ctx
->ops
3183 << " ov " << obc
->obs
.oi
.version
3187 if (!ctx
->user_at_version
)
3188 ctx
->user_at_version
= obc
->obs
.oi
.user_version
;
3189 dout(30) << __func__
<< " user_at_version " << ctx
->user_at_version
<< dendl
;
// -------- prepare the transaction (read lock held across prepare) --------
3191 if (op
->may_read()) {
3192 dout(10) << " taking ondisk_read_lock" << dendl
;
3193 obc
->ondisk_read_lock();
3198 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3200 tracepoint(osd
, prepare_tx_enter
, reqid
.name
._type
,
3201 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3204 int result
= prepare_transaction(ctx
);
3208 osd_reqid_t reqid
= ctx
->op
->get_reqid();
3210 tracepoint(osd
, prepare_tx_exit
, reqid
.name
._type
,
3211 reqid
.name
._num
, reqid
.tid
, reqid
.inc
);
3214 if (op
->may_read()) {
3215 dout(10) << " dropping ondisk_read_lock" << dendl
;
3216 obc
->ondisk_read_unlock();
// -EINPROGRESS / -EAGAIN: op is parked or retried elsewhere
3219 if (result
== -EINPROGRESS
) {
3224 if (result
== -EAGAIN
) {
3225 // clean up after the ctx
3230 bool successful_write
= !ctx
->op_t
->empty() && op
->may_write() && result
>= 0;
3231 // prepare the reply
3232 ctx
->reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0,
3235 // Write operations aren't allowed to return a data payload because
3236 // we can't do so reliably. If the client has to resend the request
3237 // and it has already been applied, we will return 0 with no
3238 // payload.  Non-deterministic behavior is no good.  However, it is
3239 // possible to construct an operation that does a read, does a guard
3240 // check (e.g., CMPXATTR), and then a write.  Then we either succeed
3241 // with the write, or return a CMPXATTR and the read value.
3242 if (successful_write
) {
3243 // write.  normalize the result code.
3244 dout(20) << " zeroing write result code " << result
<< dendl
;
3247 ctx
->reply
->set_result(result
);
// -------- read-or-error fast path (no transaction to submit) --------
3250 if ((ctx
->op_t
->empty() || result
< 0) && !ctx
->update_log_only
) {
3251 // finish side-effects
3253 do_osd_op_effects(ctx
, m
->get_connection());
3255 if (ctx
->pending_async_reads
.empty()) {
3256 complete_read_ctx(result
, ctx
);
3258 in_progress_async_reads
.push_back(make_pair(op
, ctx
));
3259 ctx
->start_async_reads(this);
3265 ctx
->reply
->set_reply_versions(ctx
->at_version
, ctx
->user_at_version
);
3267 assert(op
->may_write() || op
->may_cache());
3272 // verify that we are doing this in order?
3273 if (cct
->_conf
->osd_debug_op_order
&& m
->get_source().is_client() &&
3274 !pool
.info
.is_tier() && !pool
.info
.has_tiers()) {
3275 map
<client_t
,ceph_tid_t
>& cm
= debug_op_order
[obc
->obs
.oi
.soid
];
3276 ceph_tid_t t
= m
->get_tid();
3277 client_t n
= m
->get_source().num();
3278 map
<client_t
,ceph_tid_t
>::iterator p
= cm
.find(n
);
3279 if (p
== cm
.end()) {
3280 dout(20) << " op order client." << n
<< " tid " << t
<< " (first)" << dendl
;
3283 dout(20) << " op order client." << n
<< " tid " << t
<< " last was " << p
->second
<< dendl
;
3284 if (p
->second
> t
) {
3285 derr
<< "bad op order, already applied " << p
->second
<< " > this " << t
<< dendl
;
3286 assert(0 == "out of order op");
// -------- log-only update path (e.g. error to record, no data change) ---
3292 if (ctx
->update_log_only
) {
3294 do_osd_op_effects(ctx
, m
->get_connection());
3296 dout(20) << __func__
<< " update_log_only -- result=" << result
<< dendl
;
3297 // save just what we need from ctx
3298 MOSDOpReply
*reply
= ctx
->reply
;
3299 ctx
->reply
= nullptr;
3300 reply
->claim_op_out_data(ctx
->ops
);
3301 reply
->get_header().data_off
= ctx
->data_off
;
3304 if (result
== -ENOENT
) {
3305 reply
->set_enoent_reply_versions(info
.last_update
,
3306 info
.last_user_version
);
3308 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3309 // append to pg log for dup detection - don't save buffers for now
3310 record_write_error(op
, soid
, reply
, result
);
3314 // no need to capture PG ref, repop cancel will handle that
3315 // Can capture the ctx by pointer, it's owned by the repop
3316 ctx
->register_on_commit(
3322 if (m
&& !ctx
->sent_reply
) {
3323 MOSDOpReply
*reply
= ctx
->reply
;
3325 ctx
->reply
= nullptr;
3327 reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, true);
3328 reply
->set_reply_versions(ctx
->at_version
,
3329 ctx
->user_at_version
);
3331 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
3332 dout(10) << " sending reply on " << *m
<< " " << reply
<< dendl
;
3333 osd
->send_message_osd_client(reply
, m
->get_connection());
3334 ctx
->sent_reply
= true;
3335 ctx
->op
->mark_commit_sent();
3338 ctx
->register_on_success(
3342 ctx
->op
? ctx
->op
->get_req()->get_connection() :
3345 ctx
->register_on_finish(
// -------- normal write path: replicate via repop --------
3350 // issue replica writes
3351 ceph_tid_t rep_tid
= osd
->get_tid();
3353 RepGather
*repop
= new_repop(ctx
, obc
, rep_tid
);
3355 issue_repop(repop
, ctx
);
// Reply to the ctx's op with error `r` and tear down the ctx.
// NOTE(review): corrupted extraction — the ctx-cleanup line(s) between
// these two are missing; code left byte-identical.
3360 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
)
3363 osd
->reply_op_error(ctx
->op
, r
);
// Overload: reply with error `r` plus explicit reply versions (v, uv).
// NOTE(review): corrupted extraction — the ctx-cleanup line(s) between
// these two are missing; code left byte-identical.
3367 void PrimaryLogPG::reply_ctx(OpContext
*ctx
, int r
, eversion_t v
, version_t uv
)
3370 osd
->reply_op_error(ctx
->op
, r
, v
, uv
);
// Account perf counters for a completed client op: overall op count,
// bytes in/out, end-to-end latency (since receive) and processing latency
// (since dequeue), broken down into rw / read-only / write classes.
// NOTE(review): corrupted extraction — a few dout continuation lines are
// missing; code left byte-identical.
3374 void PrimaryLogPG::log_op_stats(OpContext
*ctx
)
3376 OpRequestRef op
= ctx
->op
;
3377 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
3379 utime_t now
= ceph_clock_now();
// latency = now - receive stamp; process_latency = now - dequeue time
3380 utime_t latency
= now
;
3381 latency
-= ctx
->op
->get_req()->get_recv_stamp();
3382 utime_t process_latency
= now
;
3383 process_latency
-= ctx
->op
->get_dequeued_time();
3385 uint64_t inb
= ctx
->bytes_written
;
3386 uint64_t outb
= ctx
->bytes_read
;
3388 osd
->logger
->inc(l_osd_op
);
3390 osd
->logger
->inc(l_osd_op_outb
, outb
);
3391 osd
->logger
->inc(l_osd_op_inb
, inb
);
3392 osd
->logger
->tinc(l_osd_op_lat
, latency
);
3393 osd
->logger
->tinc(l_osd_op_process_lat
, process_latency
);
// classify: read+write, pure read, or write/cache
3395 if (op
->may_read() && op
->may_write()) {
3396 osd
->logger
->inc(l_osd_op_rw
);
3397 osd
->logger
->inc(l_osd_op_rw_inb
, inb
);
3398 osd
->logger
->inc(l_osd_op_rw_outb
, outb
);
3399 osd
->logger
->tinc(l_osd_op_rw_lat
, latency
);
3400 osd
->logger
->hinc(l_osd_op_rw_lat_inb_hist
, latency
.to_nsec(), inb
);
3401 osd
->logger
->hinc(l_osd_op_rw_lat_outb_hist
, latency
.to_nsec(), outb
);
3402 osd
->logger
->tinc(l_osd_op_rw_process_lat
, process_latency
);
3403 } else if (op
->may_read()) {
3404 osd
->logger
->inc(l_osd_op_r
);
3405 osd
->logger
->inc(l_osd_op_r_outb
, outb
);
3406 osd
->logger
->tinc(l_osd_op_r_lat
, latency
);
3407 osd
->logger
->hinc(l_osd_op_r_lat_outb_hist
, latency
.to_nsec(), outb
);
3408 osd
->logger
->tinc(l_osd_op_r_process_lat
, process_latency
);
3409 } else if (op
->may_write() || op
->may_cache()) {
3410 osd
->logger
->inc(l_osd_op_w
);
3411 osd
->logger
->inc(l_osd_op_w_inb
, inb
);
3412 osd
->logger
->tinc(l_osd_op_w_lat
, latency
);
3413 osd
->logger
->hinc(l_osd_op_w_lat_inb_hist
, latency
.to_nsec(), inb
);
3414 osd
->logger
->tinc(l_osd_op_w_process_lat
, process_latency
);
3418 dout(15) << "log_op_stats " << *m
3421 << " lat " << latency
<< dendl
;
// Handle a legacy MOSDSubOp: queue until peered if needed, then dispatch
// on the first OSDOp's opcode (scrub reserve/unreserve/map, delete).
// NOTE(review): corrupted extraction — the is_peered() guard and the
// `first` assignment lines are missing; code left byte-identical.
3424 void PrimaryLogPG::do_sub_op(OpRequestRef op
)
3426 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
3427 assert(have_same_or_newer_map(m
->map_epoch
));
3428 assert(m
->get_type() == MSG_OSD_SUBOP
);
3429 dout(15) << "do_sub_op " << *op
->get_req() << dendl
;
// not peered yet: park the op until the PG becomes active
3432 waiting_for_peered
.push_back(op
);
3433 op
->mark_delayed("waiting for active");
3437 const OSDOp
*first
= NULL
;
3438 if (m
->ops
.size() >= 1) {
3443 switch (first
->op
.op
) {
3444 case CEPH_OSD_OP_DELETE
:
3447 case CEPH_OSD_OP_SCRUB_RESERVE
:
3448 handle_scrub_reserve_request(op
);
3450 case CEPH_OSD_OP_SCRUB_UNRESERVE
:
3451 handle_scrub_reserve_release(op
);
3453 case CEPH_OSD_OP_SCRUB_MAP
:
3454 sub_op_scrub_map(op
);
// Handle a legacy MOSDSubOpReply: for SCRUB_RESERVE replies, decode the
// grant/reject flag from the payload and route to the matching handler.
// NOTE(review): corrupted extraction — the `bool reserved;` declaration
// and if/else lines around the handlers are missing; code left
// byte-identical.
3460 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op
)
3462 const MOSDSubOpReply
*r
= static_cast<const MOSDSubOpReply
*>(op
->get_req());
3463 assert(r
->get_type() == MSG_OSD_SUBOPREPLY
);
3464 if (r
->ops
.size() >= 1) {
3465 const OSDOp
& first
= r
->ops
[0];
3466 switch (first
.op
.op
) {
3467 case CEPH_OSD_OP_SCRUB_RESERVE
:
3469 pg_shard_t from
= r
->from
;
// const_cast: bufferlist iterators need a non-const list to walk
3470 bufferlist::iterator p
= const_cast<bufferlist
&>(r
->get_data()).begin();
3472 ::decode(reserved
, p
);
3474 handle_scrub_reserve_grant(op
, from
);
3476 handle_scrub_reserve_reject(op
, from
);
// Handle MOSDPGScan during backfill. GET_DIGEST: abort backfill if the
// target would be too full, otherwise scan a range of objects and reply
// with an OP_SCAN_DIGEST carrying the interval. SCAN_DIGEST: record the
// peer's interval and, once all backfill targets have answered, finish
// the recovery op.
// NOTE(review): corrupted extraction — scan_range() call line, loop
// closers and several guards are missing; code left byte-identical.
3484 void PrimaryLogPG::do_scan(
3486 ThreadPool::TPHandle
&handle
)
3488 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
3489 assert(m
->get_type() == MSG_OSD_PG_SCAN
);
3490 dout(10) << "do_scan " << *m
<< dendl
;
3495 case MOSDPGScan::OP_SCAN_GET_DIGEST
:
// refuse to continue backfill if this OSD is (nearly) full
3498 if (osd
->check_backfill_full(ss
)) {
3499 dout(1) << __func__
<< ": Canceling backfill, " << ss
.str() << dendl
;
3500 queue_peering_event(
3502 std::make_shared
<CephPeeringEvt
>(
3503 get_osdmap()->get_epoch(),
3504 get_osdmap()->get_epoch(),
3505 BackfillTooFull())));
3509 BackfillInterval bi
;
3510 bi
.begin
= m
->begin
;
3511 // No need to flush, there won't be any in progress writes occuring
// scan bounds come from osd_backfill_scan_min/max config
3514 cct
->_conf
->osd_backfill_scan_min
,
3515 cct
->_conf
->osd_backfill_scan_max
,
3518 MOSDPGScan
*reply
= new MOSDPGScan(
3519 MOSDPGScan::OP_SCAN_DIGEST
,
3521 get_osdmap()->get_epoch(), m
->query_epoch
,
3522 spg_t(info
.pgid
.pgid
, get_primary().shard
), bi
.begin
, bi
.end
);
3523 ::encode(bi
.objects
, reply
->get_data());
3524 osd
->send_message_osd_cluster(reply
, m
->get_connection());
3528 case MOSDPGScan::OP_SCAN_DIGEST
:
3530 pg_shard_t from
= m
->from
;
3532 // Check that from is in backfill_targets vector
3533 assert(is_backfill_targets(from
));
3535 BackfillInterval
& bi
= peer_backfill_info
[from
];
3536 bi
.begin
= m
->begin
;
3538 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3540 // take care to preserve ordering!
3542 ::decode_noclear(bi
.objects
, p
);
3544 if (waiting_on_backfill
.erase(from
)) {
3545 if (waiting_on_backfill
.empty()) {
3546 assert(peer_backfill_info
.size() == backfill_targets
.size());
3547 finish_recovery_op(hobject_t::get_max());
3550 // we canceled backfill for a while due to a too full, and this
3551 // is an extra response from a non-too-full peer
// Handle MOSDPGBackfill on a backfill target / primary. FINISH: ack back
// to the primary and queue a peering event. PROGRESS: persist the updated
// last_backfill and stats. FINISH_ACK (primary): complete the recovery
// op.
// NOTE(review): corrupted extraction — the peering-event payload for
// FINISH and the write_info/transaction lines are missing; code left
// byte-identical.
3558 void PrimaryLogPG::do_backfill(OpRequestRef op
)
3560 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
3561 assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
3562 dout(10) << "do_backfill " << *m
<< dendl
;
3567 case MOSDPGBackfill::OP_BACKFILL_FINISH
:
// osd_kill_backfill_at is a test-injection config knob
3569 assert(cct
->_conf
->osd_kill_backfill_at
!= 1);
3571 MOSDPGBackfill
*reply
= new MOSDPGBackfill(
3572 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
,
3573 get_osdmap()->get_epoch(),
3575 spg_t(info
.pgid
.pgid
, get_primary().shard
));
3576 reply
->set_priority(get_recovery_op_priority());
3577 osd
->send_message_osd_cluster(reply
, m
->get_connection());
3578 queue_peering_event(
3580 std::make_shared
<CephPeeringEvt
>(
3581 get_osdmap()->get_epoch(),
3582 get_osdmap()->get_epoch(),
3587 case MOSDPGBackfill::OP_BACKFILL_PROGRESS
:
3589 assert(cct
->_conf
->osd_kill_backfill_at
!= 2);
3591 info
.set_last_backfill(m
->last_backfill
);
3592 info
.stats
= m
->stats
;
3594 ObjectStore::Transaction t
;
3597 int tr
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3602 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK
:
3604 assert(is_primary());
3605 assert(cct
->_conf
->osd_kill_backfill_at
!= 3);
3606 finish_recovery_op(hobject_t::get_max());
// Handle MOSDPGBackfillRemove: delete the listed (snap-mapped) objects
// in one ObjectStore transaction.
// NOTE(review): corrupted extraction — the guard/assert lines before the
// transaction and the result check after queue_transaction are missing;
// code left byte-identical.
3612 void PrimaryLogPG::do_backfill_remove(OpRequestRef op
)
3614 const MOSDPGBackfillRemove
*m
= static_cast<const MOSDPGBackfillRemove
*>(
3616 assert(m
->get_type() == MSG_OSD_PG_BACKFILL_REMOVE
);
3617 dout(7) << __func__
<< " " << m
->ls
<< dendl
;
3621 ObjectStore::Transaction t
;
// p.first is the hobject; remove_snap_mapped_object also clears snap maps
3622 for (auto& p
: m
->ls
) {
3623 remove_snap_mapped_object(t
, p
.first
);
3625 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3629 int PrimaryLogPG::trim_object(
3630 bool first
, const hobject_t
&coid
, PrimaryLogPG::OpContextUPtr
*ctxp
)
3635 ObjectContextRef obc
= get_object_context(coid
, false, NULL
);
3636 if (!obc
|| !obc
->ssc
|| !obc
->ssc
->exists
) {
3637 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
3638 << " repair needed " << (obc
? "(no obc->ssc or !exists)" : "(no obc)");
3643 coid
.oid
, coid
.get_key(),
3644 obc
->ssc
->snapset
.head_exists
? CEPH_NOSNAP
:CEPH_SNAPDIR
, coid
.get_hash(),
3645 info
.pgid
.pool(), coid
.get_namespace());
3646 ObjectContextRef snapset_obc
= get_object_context(snapoid
, false);
3648 osd
->clog
->error() << __func__
<< ": Can not trim " << coid
3649 << " repair needed, no snapset obc for " << snapoid
;
3653 SnapSet
& snapset
= obc
->ssc
->snapset
;
3655 bool legacy
= snapset
.is_legacy() ||
3656 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
3658 object_info_t
&coi
= obc
->obs
.oi
;
3659 set
<snapid_t
> old_snaps
;
3661 old_snaps
.insert(coi
.legacy_snaps
.begin(), coi
.legacy_snaps
.end());
3663 auto p
= snapset
.clone_snaps
.find(coid
.snap
);
3664 if (p
== snapset
.clone_snaps
.end()) {
3665 osd
->clog
->error() << __func__
<< " No clone_snaps in snapset " << snapset
3666 << " for " << coid
<< "\n";
3669 old_snaps
.insert(snapset
.clone_snaps
[coid
.snap
].begin(),
3670 snapset
.clone_snaps
[coid
.snap
].end());
3672 if (old_snaps
.empty()) {
3673 osd
->clog
->error() << __func__
<< " No object info snaps for " << coid
;
3677 dout(10) << coid
<< " old_snaps " << old_snaps
3678 << " old snapset " << snapset
<< dendl
;
3679 if (snapset
.seq
== 0) {
3680 osd
->clog
->error() << __func__
<< " No snapset.seq for " << coid
;
3684 set
<snapid_t
> new_snaps
;
3685 for (set
<snapid_t
>::iterator i
= old_snaps
.begin();
3686 i
!= old_snaps
.end();
3688 if (!pool
.info
.is_removed_snap(*i
))
3689 new_snaps
.insert(*i
);
3692 vector
<snapid_t
>::iterator p
= snapset
.clones
.end();
3694 if (new_snaps
.empty()) {
3695 p
= std::find(snapset
.clones
.begin(), snapset
.clones
.end(), coid
.snap
);
3696 if (p
== snapset
.clones
.end()) {
3697 osd
->clog
->error() << __func__
<< " Snap " << coid
.snap
<< " not in clones";
3702 OpContextUPtr ctx
= simple_opc_create(obc
);
3703 ctx
->snapset_obc
= snapset_obc
;
3705 if (!ctx
->lock_manager
.get_snaptrimmer_write(
3709 close_op_ctx(ctx
.release());
3710 dout(10) << __func__
<< ": Unable to get a wlock on " << coid
<< dendl
;
3714 if (!ctx
->lock_manager
.get_snaptrimmer_write(
3718 close_op_ctx(ctx
.release());
3719 dout(10) << __func__
<< ": Unable to get a wlock on " << snapoid
<< dendl
;
3723 ctx
->at_version
= get_next_version();
3725 PGTransaction
*t
= ctx
->op_t
.get();
3727 if (new_snaps
.empty()) {
3729 dout(10) << coid
<< " snaps " << old_snaps
<< " -> "
3730 << new_snaps
<< " ... deleting" << dendl
;
3733 assert(p
!= snapset
.clones
.end());
3735 snapid_t last
= coid
.snap
;
3736 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(last
);
3738 if (p
!= snapset
.clones
.begin()) {
3739 // not the oldest... merge overlap into next older clone
3740 vector
<snapid_t
>::iterator n
= p
- 1;
3741 hobject_t prev_coid
= coid
;
3742 prev_coid
.snap
= *n
;
3743 bool adjust_prev_bytes
= is_present_clone(prev_coid
);
3745 if (adjust_prev_bytes
)
3746 ctx
->delta_stats
.num_bytes
-= snapset
.get_clone_bytes(*n
);
3748 snapset
.clone_overlap
[*n
].intersection_of(
3749 snapset
.clone_overlap
[*p
]);
3751 if (adjust_prev_bytes
)
3752 ctx
->delta_stats
.num_bytes
+= snapset
.get_clone_bytes(*n
);
3754 ctx
->delta_stats
.num_objects
--;
3756 ctx
->delta_stats
.num_objects_dirty
--;
3758 ctx
->delta_stats
.num_objects_omap
--;
3759 if (coi
.is_whiteout()) {
3760 dout(20) << __func__
<< " trimming whiteout on " << coid
<< dendl
;
3761 ctx
->delta_stats
.num_whiteouts
--;
3763 ctx
->delta_stats
.num_object_clones
--;
3764 if (coi
.is_cache_pinned())
3765 ctx
->delta_stats
.num_objects_pinned
--;
3766 obc
->obs
.exists
= false;
3768 snapset
.clones
.erase(p
);
3769 snapset
.clone_overlap
.erase(last
);
3770 snapset
.clone_size
.erase(last
);
3771 snapset
.clone_snaps
.erase(last
);
3775 pg_log_entry_t::DELETE
,
3778 ctx
->obs
->oi
.version
,
3790 coi
= object_info_t(coid
);
3792 ctx
->at_version
.version
++;
3794 // save adjusted snaps for this object
3795 dout(10) << coid
<< " snaps " << old_snaps
<< " -> " << new_snaps
<< dendl
;
3797 coi
.legacy_snaps
= vector
<snapid_t
>(new_snaps
.rbegin(), new_snaps
.rend());
3799 snapset
.clone_snaps
[coid
.snap
] = vector
<snapid_t
>(new_snaps
.rbegin(),
3801 // we still do a 'modify' event on this object just to trigger a
3802 // snapmapper.update ... :(
3805 coi
.prior_version
= coi
.version
;
3806 coi
.version
= ctx
->at_version
;
3808 ::encode(coi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
3809 t
->setattr(coid
, OI_ATTR
, bl
);
3813 pg_log_entry_t::MODIFY
,
3822 ctx
->at_version
.version
++;
3830 // save head snapset
3831 dout(10) << coid
<< " new snapset " << snapset
<< " on "
3832 << snapset_obc
->obs
.oi
<< dendl
;
3833 if (snapset
.clones
.empty() &&
3834 (!snapset
.head_exists
||
3835 (snapset_obc
->obs
.oi
.is_whiteout() &&
3836 !(snapset_obc
->obs
.oi
.is_dirty() && pool
.info
.is_tier()) &&
3837 !snapset_obc
->obs
.oi
.is_cache_pinned()))) {
3838 // NOTE: this arguably constitutes minor interference with the
3839 // tiering agent if this is a cache tier since a snap trim event
3840 // is effectively evicting a whiteout we might otherwise want to
3842 dout(10) << coid
<< " removing " << snapoid
<< dendl
;
3845 pg_log_entry_t::DELETE
,
3848 ctx
->snapset_obc
->obs
.oi
.version
,
3854 if (snapoid
.is_head()) {
3855 derr
<< "removing snap head" << dendl
;
3856 object_info_t
& oi
= ctx
->snapset_obc
->obs
.oi
;
3857 ctx
->delta_stats
.num_objects
--;
3858 if (oi
.is_dirty()) {
3859 ctx
->delta_stats
.num_objects_dirty
--;
3862 ctx
->delta_stats
.num_objects_omap
--;
3863 if (oi
.is_whiteout()) {
3864 dout(20) << __func__
<< " trimming whiteout on " << oi
.soid
<< dendl
;
3865 ctx
->delta_stats
.num_whiteouts
--;
3867 if (oi
.is_cache_pinned()) {
3868 ctx
->delta_stats
.num_objects_pinned
--;
3871 ctx
->snapset_obc
->obs
.exists
= false;
3872 ctx
->snapset_obc
->obs
.oi
= object_info_t(snapoid
);
3875 dout(10) << coid
<< " filtering snapset on " << snapoid
<< dendl
;
3876 snapset
.filter(pool
.info
);
3877 dout(10) << coid
<< " writing updated snapset on " << snapoid
3878 << ", snapset is " << snapset
<< dendl
;
3881 pg_log_entry_t::MODIFY
,
3884 ctx
->snapset_obc
->obs
.oi
.version
,
3891 ctx
->snapset_obc
->obs
.oi
.prior_version
=
3892 ctx
->snapset_obc
->obs
.oi
.version
;
3893 ctx
->snapset_obc
->obs
.oi
.version
= ctx
->at_version
;
3895 map
<string
, bufferlist
> attrs
;
3897 ::encode(snapset
, bl
);
3898 attrs
[SS_ATTR
].claim(bl
);
3901 ::encode(ctx
->snapset_obc
->obs
.oi
, bl
,
3902 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
3903 attrs
[OI_ATTR
].claim(bl
);
3904 t
->setattrs(snapoid
, attrs
);
3907 *ctxp
= std::move(ctx
);
3911 void PrimaryLogPG::kick_snap_trim()
3913 assert(is_active());
3914 assert(is_primary());
3915 if (is_clean() && !snap_trimq
.empty()) {
3916 dout(10) << __func__
<< ": clean and snaps to trim, kicking" << dendl
;
3917 snap_trimmer_machine
.process_event(KickTrim());
3921 void PrimaryLogPG::snap_trimmer_scrub_complete()
3923 if (is_primary() && is_active() && is_clean()) {
3924 assert(!snap_trimq
.empty());
3925 snap_trimmer_machine
.process_event(ScrubComplete());
3929 void PrimaryLogPG::snap_trimmer(epoch_t queued
)
3931 if (deleting
|| pg_has_reset_since(queued
)) {
3935 assert(is_primary());
3937 dout(10) << "snap_trimmer posting" << dendl
;
3938 snap_trimmer_machine
.process_event(DoSnapWork());
3939 dout(10) << "snap_trimmer complete" << dendl
;
3943 int PrimaryLogPG::do_xattr_cmp_u64(int op
, __u64 v1
, bufferlist
& xattr
)
3947 string
v2s(xattr
.c_str(), xattr
.length());
3949 v2
= strtoull(v2s
.c_str(), NULL
, 10);
3953 dout(20) << "do_xattr_cmp_u64 '" << v1
<< "' vs '" << v2
<< "' op " << op
<< dendl
;
3956 case CEPH_OSD_CMPXATTR_OP_EQ
:
3958 case CEPH_OSD_CMPXATTR_OP_NE
:
3960 case CEPH_OSD_CMPXATTR_OP_GT
:
3962 case CEPH_OSD_CMPXATTR_OP_GTE
:
3964 case CEPH_OSD_CMPXATTR_OP_LT
:
3966 case CEPH_OSD_CMPXATTR_OP_LTE
:
3973 int PrimaryLogPG::do_xattr_cmp_str(int op
, string
& v1s
, bufferlist
& xattr
)
3975 string
v2s(xattr
.c_str(), xattr
.length());
3977 dout(20) << "do_xattr_cmp_str '" << v1s
<< "' vs '" << v2s
<< "' op " << op
<< dendl
;
3980 case CEPH_OSD_CMPXATTR_OP_EQ
:
3981 return (v1s
.compare(v2s
) == 0);
3982 case CEPH_OSD_CMPXATTR_OP_NE
:
3983 return (v1s
.compare(v2s
) != 0);
3984 case CEPH_OSD_CMPXATTR_OP_GT
:
3985 return (v1s
.compare(v2s
) > 0);
3986 case CEPH_OSD_CMPXATTR_OP_GTE
:
3987 return (v1s
.compare(v2s
) >= 0);
3988 case CEPH_OSD_CMPXATTR_OP_LT
:
3989 return (v1s
.compare(v2s
) < 0);
3990 case CEPH_OSD_CMPXATTR_OP_LTE
:
3991 return (v1s
.compare(v2s
) <= 0);
3997 int PrimaryLogPG::do_extent_cmp(OpContext
*ctx
, OSDOp
& osd_op
)
3999 ceph_osd_op
& op
= osd_op
.op
;
4000 vector
<OSDOp
> read_ops(1);
4001 OSDOp
& read_op
= read_ops
[0];
4004 read_op
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
4005 read_op
.op
.extent
.offset
= op
.extent
.offset
;
4006 read_op
.op
.extent
.length
= op
.extent
.length
;
4007 read_op
.op
.extent
.truncate_seq
= op
.extent
.truncate_seq
;
4008 read_op
.op
.extent
.truncate_size
= op
.extent
.truncate_size
;
4010 result
= do_osd_ops(ctx
, read_ops
);
4012 derr
<< "do_extent_cmp do_osd_ops failed " << result
<< dendl
;
4016 if (read_op
.outdata
.length() != osd_op
.indata
.length())
4019 for (uint64_t p
= 0; p
< osd_op
.indata
.length(); p
++) {
4020 if (read_op
.outdata
[p
] != osd_op
.indata
[p
]) {
4021 return (-MAX_ERRNO
- p
);
4028 int PrimaryLogPG::do_writesame(OpContext
*ctx
, OSDOp
& osd_op
)
4030 ceph_osd_op
& op
= osd_op
.op
;
4031 vector
<OSDOp
> write_ops(1);
4032 OSDOp
& write_op
= write_ops
[0];
4033 uint64_t write_length
= op
.writesame
.length
;
4039 if (!op
.writesame
.data_length
|| write_length
% op
.writesame
.data_length
)
4042 if (op
.writesame
.data_length
!= osd_op
.indata
.length()) {
4043 derr
<< "invalid length ws data length " << op
.writesame
.data_length
<< " actual len " << osd_op
.indata
.length() << dendl
;
4047 while (write_length
) {
4048 write_op
.indata
.append(osd_op
.indata
);
4049 write_length
-= op
.writesame
.data_length
;
4052 write_op
.op
.op
= CEPH_OSD_OP_WRITE
;
4053 write_op
.op
.extent
.offset
= op
.writesame
.offset
;
4054 write_op
.op
.extent
.length
= op
.writesame
.length
;
4055 result
= do_osd_ops(ctx
, write_ops
);
4057 derr
<< "do_writesame do_osd_ops failed " << result
<< dendl
;
4062 // ========================================================================
4063 // low level osd ops
4065 int PrimaryLogPG::do_tmap2omap(OpContext
*ctx
, unsigned flags
)
4067 dout(20) << " convert tmap to omap for " << ctx
->new_obs
.oi
.soid
<< dendl
;
4068 bufferlist header
, vals
;
4069 int r
= _get_tmap(ctx
, &header
, &vals
);
4071 if (r
== -ENODATA
&& (flags
& CEPH_OSD_TMAP2OMAP_NULLOK
))
4076 vector
<OSDOp
> ops(3);
4078 ops
[0].op
.op
= CEPH_OSD_OP_TRUNCATE
;
4079 ops
[0].op
.extent
.offset
= 0;
4080 ops
[0].op
.extent
.length
= 0;
4082 ops
[1].op
.op
= CEPH_OSD_OP_OMAPSETHEADER
;
4083 ops
[1].indata
.claim(header
);
4085 ops
[2].op
.op
= CEPH_OSD_OP_OMAPSETVALS
;
4086 ops
[2].indata
.claim(vals
);
4088 return do_osd_ops(ctx
, ops
);
4091 int PrimaryLogPG::do_tmapup_slow(OpContext
*ctx
, bufferlist::iterator
& bp
, OSDOp
& osd_op
,
4096 map
<string
, bufferlist
> m
;
4098 bufferlist::iterator p
= bl
.begin();
4099 ::decode(header
, p
);
4111 case CEPH_OSD_TMAP_SET
: // insert key
4119 case CEPH_OSD_TMAP_RM
: // remove key
4121 if (!m
.count(key
)) {
4126 case CEPH_OSD_TMAP_RMSLOPPY
: // remove key
4130 case CEPH_OSD_TMAP_HDR
: // update header
4132 ::decode(header
, bp
);
4142 ::encode(header
, obl
);
4146 vector
<OSDOp
> nops(1);
4147 OSDOp
& newop
= nops
[0];
4148 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4149 newop
.op
.extent
.offset
= 0;
4150 newop
.op
.extent
.length
= obl
.length();
4152 do_osd_ops(ctx
, nops
);
4153 osd_op
.outdata
.claim(newop
.outdata
);
4157 int PrimaryLogPG::do_tmapup(OpContext
*ctx
, bufferlist::iterator
& bp
, OSDOp
& osd_op
)
4159 bufferlist::iterator orig_bp
= bp
;
4162 dout(10) << "tmapup is a no-op" << dendl
;
4164 // read the whole object
4165 vector
<OSDOp
> nops(1);
4166 OSDOp
& newop
= nops
[0];
4167 newop
.op
.op
= CEPH_OSD_OP_READ
;
4168 newop
.op
.extent
.offset
= 0;
4169 newop
.op
.extent
.length
= 0;
4170 result
= do_osd_ops(ctx
, nops
);
4172 dout(10) << "tmapup read " << newop
.outdata
.length() << dendl
;
4174 dout(30) << " starting is \n";
4175 newop
.outdata
.hexdump(*_dout
);
4178 bufferlist::iterator ip
= newop
.outdata
.begin();
4181 dout(30) << "the update command is: \n";
4182 osd_op
.indata
.hexdump(*_dout
);
4188 if (newop
.outdata
.length()) {
4189 ::decode(header
, ip
);
4190 ::decode(nkeys
, ip
);
4192 dout(10) << "tmapup header " << header
.length() << dendl
;
4194 if (!bp
.end() && *bp
== CEPH_OSD_TMAP_HDR
) {
4196 ::decode(header
, bp
);
4197 dout(10) << "tmapup new header " << header
.length() << dendl
;
4200 ::encode(header
, obl
);
4202 dout(20) << "tmapup initial nkeys " << nkeys
<< dendl
;
4205 bufferlist newkeydata
;
4206 string nextkey
, last_in_key
;
4208 bool have_next
= false;
4211 ::decode(nextkey
, ip
);
4212 ::decode(nextval
, ip
);
4214 while (!bp
.end() && !result
) {
4221 catch (buffer::error
& e
) {
4224 if (key
< last_in_key
) {
4225 dout(5) << "tmapup warning: key '" << key
<< "' < previous key '" << last_in_key
4226 << "', falling back to an inefficient (unsorted) update" << dendl
;
4228 return do_tmapup_slow(ctx
, bp
, osd_op
, newop
.outdata
);
4232 dout(10) << "tmapup op " << (int)op
<< " key " << key
<< dendl
;
4234 // skip existing intervening keys
4235 bool key_exists
= false;
4236 while (have_next
&& !key_exists
) {
4237 dout(20) << " (have_next=" << have_next
<< " nextkey=" << nextkey
<< ")" << dendl
;
4240 if (nextkey
< key
) {
4242 ::encode(nextkey
, newkeydata
);
4243 ::encode(nextval
, newkeydata
);
4244 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4246 // don't copy; discard old value. and stop.
4247 dout(20) << " drop " << nextkey
<< " " << nextval
.length() << dendl
;
4252 ::decode(nextkey
, ip
);
4253 ::decode(nextval
, ip
);
4259 if (op
== CEPH_OSD_TMAP_SET
) {
4264 catch (buffer::error
& e
) {
4267 ::encode(key
, newkeydata
);
4268 ::encode(val
, newkeydata
);
4269 dout(20) << " set " << key
<< " " << val
.length() << dendl
;
4271 } else if (op
== CEPH_OSD_TMAP_CREATE
) {
4279 catch (buffer::error
& e
) {
4282 ::encode(key
, newkeydata
);
4283 ::encode(val
, newkeydata
);
4284 dout(20) << " create " << key
<< " " << val
.length() << dendl
;
4286 } else if (op
== CEPH_OSD_TMAP_RM
) {
4291 } else if (op
== CEPH_OSD_TMAP_RMSLOPPY
) {
4294 dout(10) << " invalid tmap op " << (int)op
<< dendl
;
4301 ::encode(nextkey
, newkeydata
);
4302 ::encode(nextval
, newkeydata
);
4303 dout(20) << " keep " << nextkey
<< " " << nextval
.length() << dendl
;
4307 rest
.substr_of(newop
.outdata
, ip
.get_off(), newop
.outdata
.length() - ip
.get_off());
4308 dout(20) << " keep trailing " << rest
.length()
4309 << " at " << newkeydata
.length() << dendl
;
4310 newkeydata
.claim_append(rest
);
4313 // encode final key count + key data
4314 dout(20) << "tmapup final nkeys " << nkeys
<< dendl
;
4315 ::encode(nkeys
, obl
);
4316 obl
.claim_append(newkeydata
);
4319 dout(30) << " final is \n";
4320 obl
.hexdump(*_dout
);
4324 bufferlist::iterator tp
= obl
.begin();
4327 map
<string
,bufferlist
> d
;
4330 dout(0) << " **** debug sanity check, looks ok ****" << dendl
;
4335 dout(20) << "tmapput write " << obl
.length() << dendl
;
4336 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
4337 newop
.op
.extent
.offset
= 0;
4338 newop
.op
.extent
.length
= obl
.length();
4340 do_osd_ops(ctx
, nops
);
4341 osd_op
.outdata
.claim(newop
.outdata
);
4347 static int check_offset_and_length(uint64_t offset
, uint64_t length
, uint64_t max
)
4349 if (offset
>= max
||
4351 offset
+ length
> max
)
4357 struct FillInVerifyExtent
: public Context
{
4360 bufferlist
*outdatap
;
4361 boost::optional
<uint32_t> maybe_crc
;
4366 FillInVerifyExtent(ceph_le64
*r
, int32_t *rv
, bufferlist
*blp
,
4367 boost::optional
<uint32_t> mc
, uint64_t size
,
4368 OSDService
*osd
, hobject_t soid
, __le32 flags
) :
4369 r(r
), rval(rv
), outdatap(blp
), maybe_crc(mc
),
4370 size(size
), osd(osd
), soid(soid
), flags(flags
) {}
4371 void finish(int len
) override
{
4376 // whole object? can we verify the checksum?
4377 if (maybe_crc
&& *r
== size
) {
4378 uint32_t crc
= outdatap
->crc32c(-1);
4379 if (maybe_crc
!= crc
) {
4380 osd
->clog
->error() << std::hex
<< " full-object read crc 0x" << crc
4381 << " != expected 0x" << *maybe_crc
4382 << std::dec
<< " on " << soid
;
4383 if (!(flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
4392 struct ToSparseReadResult
: public Context
{
4393 bufferlist
& data_bl
;
4394 uint64_t data_offset
;
4396 ToSparseReadResult(bufferlist
& bl
, uint64_t offset
, ceph_le64
& len
):
4397 data_bl(bl
), data_offset(offset
),len(len
) {}
4398 void finish(int r
) override
{
4402 map
<uint64_t, uint64_t> extents
= {{data_offset
, r
}};
4403 ::encode(extents
, outdata
);
4404 ::encode_destructively(data_bl
, outdata
);
4405 data_bl
.swap(outdata
);
4409 template<typename V
>
4410 static string
list_keys(const map
<string
, V
>& m
) {
4412 for (typename map
<string
, V
>::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
4416 s
.append(itr
->first
);
4421 template<typename T
>
4422 static string
list_entries(const T
& m
) {
4424 for (typename
T::const_iterator itr
= m
.begin(); itr
!= m
.end(); ++itr
) {
4433 void PrimaryLogPG::maybe_create_new_object(
4435 bool ignore_transaction
)
4437 ObjectState
& obs
= ctx
->new_obs
;
4439 ctx
->delta_stats
.num_objects
++;
4441 assert(!obs
.oi
.is_whiteout());
4442 obs
.oi
.new_object();
4443 if (!ignore_transaction
)
4444 ctx
->op_t
->create(obs
.oi
.soid
);
4445 } else if (obs
.oi
.is_whiteout()) {
4446 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
4447 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
4448 --ctx
->delta_stats
.num_whiteouts
;
4452 struct C_ChecksumRead
: public Context
{
4453 PrimaryLogPG
*primary_log_pg
;
4455 Checksummer::CSumType csum_type
;
4456 bufferlist init_value_bl
;
4457 ceph_le64 read_length
;
4459 Context
*fill_extent_ctx
;
4461 C_ChecksumRead(PrimaryLogPG
*primary_log_pg
, OSDOp
&osd_op
,
4462 Checksummer::CSumType csum_type
, bufferlist
&&init_value_bl
,
4463 boost::optional
<uint32_t> maybe_crc
, uint64_t size
,
4464 OSDService
*osd
, hobject_t soid
, __le32 flags
)
4465 : primary_log_pg(primary_log_pg
), osd_op(osd_op
),
4466 csum_type(csum_type
), init_value_bl(std::move(init_value_bl
)),
4467 fill_extent_ctx(new FillInVerifyExtent(&read_length
, &osd_op
.rval
,
4468 &read_bl
, maybe_crc
, size
,
4469 osd
, soid
, flags
)) {
4472 void finish(int r
) override
{
4473 fill_extent_ctx
->complete(r
);
4475 if (osd_op
.rval
>= 0) {
4476 bufferlist::iterator init_value_bl_it
= init_value_bl
.begin();
4477 osd_op
.rval
= primary_log_pg
->finish_checksum(osd_op
, csum_type
,
4484 int PrimaryLogPG::do_checksum(OpContext
*ctx
, OSDOp
& osd_op
,
4485 bufferlist::iterator
*bl_it
, bool *async_read
)
4487 dout(20) << __func__
<< dendl
;
4489 auto& op
= osd_op
.op
;
4490 if (op
.checksum
.chunk_size
> 0) {
4491 if (op
.checksum
.length
== 0) {
4492 dout(10) << __func__
<< ": length required when chunk size provided"
4496 if (op
.checksum
.length
% op
.checksum
.chunk_size
!= 0) {
4497 dout(10) << __func__
<< ": length not aligned to chunk size" << dendl
;
4502 auto& oi
= ctx
->new_obs
.oi
;
4503 if (op
.checksum
.offset
== 0 && op
.checksum
.length
== 0) {
4504 // zeroed offset+length implies checksum whole object
4505 op
.checksum
.length
= oi
.size
;
4506 } else if (op
.checksum
.offset
+ op
.checksum
.length
> oi
.size
) {
4510 Checksummer::CSumType csum_type
;
4511 switch (op
.checksum
.type
) {
4512 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32
:
4513 csum_type
= Checksummer::CSUM_XXHASH32
;
4515 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64
:
4516 csum_type
= Checksummer::CSUM_XXHASH64
;
4518 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C
:
4519 csum_type
= Checksummer::CSUM_CRC32C
;
4522 dout(10) << __func__
<< ": unknown crc type ("
4523 << static_cast<uint32_t>(op
.checksum
.type
) << ")" << dendl
;
4527 size_t csum_init_value_size
= Checksummer::get_csum_init_value_size(csum_type
);
4528 if (bl_it
->get_remaining() < csum_init_value_size
) {
4529 dout(10) << __func__
<< ": init value not provided" << dendl
;
4533 bufferlist init_value_bl
;
4534 init_value_bl
.substr_of(bl_it
->get_bl(), bl_it
->get_off(),
4535 csum_init_value_size
);
4536 bl_it
->advance(csum_init_value_size
);
4538 if (pool
.info
.require_rollback() && op
.checksum
.length
> 0) {
4539 // If there is a data digest and it is possible we are reading
4540 // entire object, pass the digest.
4541 boost::optional
<uint32_t> maybe_crc
;
4542 if (oi
.is_data_digest() && op
.checksum
.offset
== 0 &&
4543 op
.checksum
.length
>= oi
.size
) {
4544 maybe_crc
= oi
.data_digest
;
4548 auto& soid
= oi
.soid
;
4549 auto checksum_ctx
= new C_ChecksumRead(this, osd_op
, csum_type
,
4550 std::move(init_value_bl
), maybe_crc
,
4551 oi
.size
, osd
, soid
, op
.flags
);
4552 ctx
->pending_async_reads
.push_back({
4553 {op
.checksum
.offset
, op
.checksum
.length
, op
.flags
},
4554 {&checksum_ctx
->read_bl
, checksum_ctx
}});
4556 dout(10) << __func__
<< ": async_read noted for " << soid
<< dendl
;
4562 *async_read
= false;
4563 std::vector
<OSDOp
> read_ops(1);
4564 auto& read_op
= read_ops
[0];
4565 if (op
.checksum
.length
> 0) {
4566 read_op
.op
.op
= CEPH_OSD_OP_READ
;
4567 read_op
.op
.flags
= op
.flags
;
4568 read_op
.op
.extent
.offset
= op
.checksum
.offset
;
4569 read_op
.op
.extent
.length
= op
.checksum
.length
;
4570 read_op
.op
.extent
.truncate_size
= 0;
4571 read_op
.op
.extent
.truncate_seq
= 0;
4573 int r
= do_osd_ops(ctx
, read_ops
);
4575 derr
<< __func__
<< ": do_osd_ops failed: " << cpp_strerror(r
) << dendl
;
4580 bufferlist::iterator init_value_bl_it
= init_value_bl
.begin();
4581 return finish_checksum(osd_op
, csum_type
, &init_value_bl_it
,
4585 int PrimaryLogPG::finish_checksum(OSDOp
& osd_op
,
4586 Checksummer::CSumType csum_type
,
4587 bufferlist::iterator
*init_value_bl_it
,
4588 const bufferlist
&read_bl
) {
4589 dout(20) << __func__
<< dendl
;
4591 auto& op
= osd_op
.op
;
4593 if (op
.checksum
.length
> 0 && read_bl
.length() != op
.checksum
.length
) {
4594 derr
<< __func__
<< ": bytes read " << read_bl
.length() << " != "
4595 << op
.checksum
.length
<< dendl
;
4599 size_t csum_chunk_size
= (op
.checksum
.chunk_size
!= 0 ?
4600 op
.checksum
.chunk_size
: read_bl
.length());
4601 uint32_t csum_count
= (csum_chunk_size
> 0 ?
4602 read_bl
.length() / csum_chunk_size
: 0);
4605 bufferptr csum_data
;
4606 if (csum_count
> 0) {
4607 size_t csum_value_size
= Checksummer::get_csum_value_size(csum_type
);
4608 csum_data
= buffer::create(csum_value_size
* csum_count
);
4610 csum
.append(csum_data
);
4612 switch (csum_type
) {
4613 case Checksummer::CSUM_XXHASH32
:
4615 Checksummer::xxhash32::init_value_t init_value
;
4616 ::decode(init_value
, *init_value_bl_it
);
4617 Checksummer::calculate
<Checksummer::xxhash32
>(
4618 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4622 case Checksummer::CSUM_XXHASH64
:
4624 Checksummer::xxhash64::init_value_t init_value
;
4625 ::decode(init_value
, *init_value_bl_it
);
4626 Checksummer::calculate
<Checksummer::xxhash64
>(
4627 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4631 case Checksummer::CSUM_CRC32C
:
4633 Checksummer::crc32c::init_value_t init_value
;
4634 ::decode(init_value
, *init_value_bl_it
);
4635 Checksummer::calculate
<Checksummer::crc32c
>(
4636 init_value
, csum_chunk_size
, 0, read_bl
.length(), read_bl
,
4645 ::encode(csum_count
, osd_op
.outdata
);
4646 osd_op
.outdata
.claim_append(csum
);
4650 int PrimaryLogPG::do_osd_ops(OpContext
*ctx
, vector
<OSDOp
>& ops
)
4653 SnapSetContext
*ssc
= ctx
->obc
->ssc
;
4654 ObjectState
& obs
= ctx
->new_obs
;
4655 object_info_t
& oi
= obs
.oi
;
4656 const hobject_t
& soid
= oi
.soid
;
4658 bool first_read
= true;
4660 PGTransaction
* t
= ctx
->op_t
.get();
4662 dout(10) << "do_osd_op " << soid
<< " " << ops
<< dendl
;
4664 for (vector
<OSDOp
>::iterator p
= ops
.begin(); p
!= ops
.end(); ++p
, ctx
->current_osd_subop_num
++) {
4666 ceph_osd_op
& op
= osd_op
.op
;
4668 // TODO: check endianness (__le32 vs uint32_t, etc.)
4669 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
4670 // but the code in this function seems to treat them as native-endian. What should the
4672 tracepoint(osd
, do_osd_op_pre
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
);
4674 dout(10) << "do_osd_op " << osd_op
<< dendl
;
4676 bufferlist::iterator bp
= osd_op
.indata
.begin();
4678 // user-visible modifcation?
4680 // non user-visible modifications
4681 case CEPH_OSD_OP_WATCH
:
4682 case CEPH_OSD_OP_CACHE_EVICT
:
4683 case CEPH_OSD_OP_CACHE_FLUSH
:
4684 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
4685 case CEPH_OSD_OP_UNDIRTY
:
4686 case CEPH_OSD_OP_COPY_FROM
: // we handle user_version update explicitly
4687 case CEPH_OSD_OP_CACHE_PIN
:
4688 case CEPH_OSD_OP_CACHE_UNPIN
:
4689 case CEPH_OSD_OP_SET_REDIRECT
:
4692 if (op
.op
& CEPH_OSD_OP_MODE_WR
)
4693 ctx
->user_modify
= true;
4696 // munge -1 truncate to 0 truncate
4697 if (ceph_osd_op_uses_extent(op
.op
) &&
4698 op
.extent
.truncate_seq
== 1 &&
4699 op
.extent
.truncate_size
== (-1ULL)) {
4700 op
.extent
.truncate_size
= 0;
4701 op
.extent
.truncate_seq
= 0;
4704 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
4705 if (op
.op
== CEPH_OSD_OP_ZERO
&&
4707 op
.extent
.offset
< cct
->_conf
->osd_max_object_size
&&
4708 op
.extent
.length
>= 1 &&
4709 op
.extent
.length
<= cct
->_conf
->osd_max_object_size
&&
4710 op
.extent
.offset
+ op
.extent
.length
>= oi
.size
) {
4711 if (op
.extent
.offset
>= oi
.size
) {
4715 dout(10) << " munging ZERO " << op
.extent
.offset
<< "~" << op
.extent
.length
4716 << " -> TRUNCATE " << op
.extent
.offset
<< " (old size is " << oi
.size
<< ")" << dendl
;
4717 op
.op
= CEPH_OSD_OP_TRUNCATE
;
4724 case CEPH_OSD_OP_CMPEXT
:
4726 tracepoint(osd
, do_osd_op_pre_extent_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
4727 result
= do_extent_cmp(ctx
, osd_op
);
4730 case CEPH_OSD_OP_SYNC_READ
:
4731 if (pool
.info
.require_rollback()) {
4732 result
= -EOPNOTSUPP
;
4736 case CEPH_OSD_OP_READ
:
4739 __u32 seq
= oi
.truncate_seq
;
4740 uint64_t size
= oi
.size
;
4741 tracepoint(osd
, do_osd_op_pre_read
, soid
.oid
.name
.c_str(), soid
.snap
.val
, size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
4742 bool trimmed_read
= false;
4743 // are we beyond truncate_size?
4744 if ( (seq
< op
.extent
.truncate_seq
) &&
4745 (op
.extent
.offset
+ op
.extent
.length
> op
.extent
.truncate_size
) )
4746 size
= op
.extent
.truncate_size
;
4748 if (op
.extent
.length
== 0) //length is zero mean read the whole object
4749 op
.extent
.length
= size
;
4751 if (op
.extent
.offset
>= size
) {
4752 op
.extent
.length
= 0;
4753 trimmed_read
= true;
4754 } else if (op
.extent
.offset
+ op
.extent
.length
> size
) {
4755 op
.extent
.length
= size
- op
.extent
.offset
;
4756 trimmed_read
= true;
4759 // read into a buffer
4761 if (trimmed_read
&& op
.extent
.length
== 0) {
4762 // read size was trimmed to zero and it is expected to do nothing
4763 // a read operation of 0 bytes does *not* do nothing, this is why
4764 // the trimmed_read boolean is needed
4765 } else if (pool
.info
.require_rollback()) {
4767 boost::optional
<uint32_t> maybe_crc
;
4768 // If there is a data digest and it is possible we are reading
4769 // entire object, pass the digest. FillInVerifyExtent will
4770 // will check the oi.size again.
4771 if (oi
.is_data_digest() && op
.extent
.offset
== 0 &&
4772 op
.extent
.length
>= oi
.size
)
4773 maybe_crc
= oi
.data_digest
;
4774 ctx
->pending_async_reads
.push_back(
4776 boost::make_tuple(op
.extent
.offset
, op
.extent
.length
, op
.flags
),
4777 make_pair(&osd_op
.outdata
,
4778 new FillInVerifyExtent(&op
.extent
.length
, &osd_op
.rval
,
4779 &osd_op
.outdata
, maybe_crc
, oi
.size
, osd
,
4781 dout(10) << " async_read noted for " << soid
<< dendl
;
4783 int r
= pgbackend
->objects_read_sync(
4784 soid
, op
.extent
.offset
, op
.extent
.length
, op
.flags
, &osd_op
.outdata
);
4786 r
= rep_repair_primary_object(soid
, ctx
->op
);
4789 op
.extent
.length
= r
;
4792 op
.extent
.length
= 0;
4794 dout(10) << " read got " << r
<< " / " << op
.extent
.length
4795 << " bytes from obj " << soid
<< dendl
;
4797 // whole object? can we verify the checksum?
4798 if (op
.extent
.length
== oi
.size
&& oi
.is_data_digest()) {
4799 uint32_t crc
= osd_op
.outdata
.crc32c(-1);
4800 if (oi
.data_digest
!= crc
) {
4801 osd
->clog
->error() << info
.pgid
<< std::hex
4802 << " full-object read crc 0x" << crc
4803 << " != expected 0x" << oi
.data_digest
4804 << std::dec
<< " on " << soid
;
4805 // FIXME fall back to replica or something?
4812 ctx
->data_off
= op
.extent
.offset
;
4814 // XXX the op.extent.length is the requested length for async read
4815 // On error this length is changed to 0 after the error comes back.
4816 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(op
.extent
.length
, 10);
4817 ctx
->delta_stats
.num_rd
++;
4819 // Skip checking the result and just proceed to the next operation
4826 case CEPH_OSD_OP_CHECKSUM
:
4829 tracepoint(osd
, do_osd_op_pre_checksum
, soid
.oid
.name
.c_str(),
4830 soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.checksum
.type
,
4831 op
.checksum
.offset
, op
.checksum
.length
,
4832 op
.checksum
.chunk_size
);
4835 result
= do_checksum(ctx
, osd_op
, &bp
, &async_read
);
4836 if (result
== 0 && async_read
) {
4843 case CEPH_OSD_OP_MAPEXT
:
4844 tracepoint(osd
, do_osd_op_pre_mapext
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
4845 if (pool
.info
.require_rollback()) {
4846 result
= -EOPNOTSUPP
;
4851 // read into a buffer
4853 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
4855 op
.extent
.offset
, op
.extent
.length
, bl
);
4856 osd_op
.outdata
.claim(bl
);
4860 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(bl
.length(), 10);
4861 ctx
->delta_stats
.num_rd
++;
4862 dout(10) << " map_extents done on object " << soid
<< dendl
;
4867 case CEPH_OSD_OP_SPARSE_READ
:
4868 tracepoint(osd
, do_osd_op_pre_sparse_read
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
4869 if (op
.extent
.truncate_seq
) {
4870 dout(0) << "sparse_read does not support truncation sequence " << dendl
;
4875 if (pool
.info
.ec_pool()) {
4876 // translate sparse read to a normal one if not supported
4877 uint64_t offset
= op
.extent
.offset
;
4878 uint64_t length
= op
.extent
.length
;
4879 if (offset
> oi
.size
) {
4881 } else if (offset
+ length
> oi
.size
) {
4882 length
= oi
.size
- offset
;
4885 ctx
->pending_async_reads
.push_back(
4887 boost::make_tuple(offset
, length
, op
.flags
),
4890 new ToSparseReadResult(
4891 osd_op
.outdata
, offset
,
4892 op
.extent
.length
/* updated by the callback */))));
4893 dout(10) << " async_read (was sparse_read) noted for " << soid
<< dendl
;
4895 dout(10) << " sparse read ended up empty for " << soid
<< dendl
;
4896 map
<uint64_t, uint64_t> extents
;
4897 ::encode(extents
, osd_op
.outdata
);
4900 // read into a buffer
4901 map
<uint64_t, uint64_t> m
;
4902 uint32_t total_read
= 0;
4903 int r
= osd
->store
->fiemap(ch
, ghobject_t(soid
, ghobject_t::NO_GEN
,
4905 op
.extent
.offset
, op
.extent
.length
, m
);
4910 map
<uint64_t, uint64_t>::iterator miter
;
4912 uint64_t last
= op
.extent
.offset
;
4913 for (miter
= m
.begin(); miter
!= m
.end(); ++miter
) {
4915 if (cct
->_conf
->osd_verify_sparse_read_holes
&&
4916 last
< miter
->first
) {
4918 uint64_t len
= miter
->first
- last
;
4919 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
4921 r
= rep_repair_primary_object(soid
, ctx
->op
);
4924 osd
->clog
->error() << coll
<< " " << soid
4925 << " sparse-read failed to read: "
4927 } else if (!t
.is_zero()) {
4928 osd
->clog
->error() << coll
<< " " << soid
<< " sparse-read found data in hole "
4929 << last
<< "~" << len
;
4934 r
= pgbackend
->objects_read_sync(soid
, miter
->first
, miter
->second
, op
.flags
, &tmpbl
);
4940 if (r
< (int)miter
->second
) /* this is usually happen when we get extent that exceeds the actual file size */
4943 dout(10) << "sparse-read " << miter
->first
<< "@" << miter
->second
<< dendl
;
4944 data_bl
.claim_append(tmpbl
);
4945 last
= miter
->first
+ r
;
4953 // verify trailing hole?
4954 if (cct
->_conf
->osd_verify_sparse_read_holes
) {
4955 uint64_t end
= MIN(op
.extent
.offset
+ op
.extent
.length
, oi
.size
);
4958 uint64_t len
= end
- last
;
4959 r
= pgbackend
->objects_read_sync(soid
, last
, len
, op
.flags
, &t
);
4961 osd
->clog
->error() << coll
<< " " << soid
4962 << " sparse-read failed to read: "
4964 } else if (!t
.is_zero()) {
4965 osd
->clog
->error() << coll
<< " " << soid
<< " sparse-read found data in hole "
4966 << last
<< "~" << len
;
4971 // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
4972 // Maybe at first, there is no much whole objects. With continued use, more and more whole object exist.
4973 // So from this point, for spare-read add checksum make sense.
4974 if (total_read
== oi
.size
&& oi
.is_data_digest()) {
4975 uint32_t crc
= data_bl
.crc32c(-1);
4976 if (oi
.data_digest
!= crc
) {
4977 osd
->clog
->error() << info
.pgid
<< std::hex
4978 << " full-object read crc 0x" << crc
4979 << " != expected 0x" << oi
.data_digest
4980 << std::dec
<< " on " << soid
;
4981 // FIXME fall back to replica or something?
4987 op
.extent
.length
= total_read
;
4989 ::encode(m
, osd_op
.outdata
); // re-encode since it might be modified
4990 ::encode_destructively(data_bl
, osd_op
.outdata
);
4992 dout(10) << " sparse_read got " << total_read
<< " bytes from object " << soid
<< dendl
;
4994 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(op
.extent
.length
, 10);
4995 ctx
->delta_stats
.num_rd
++;
4998 case CEPH_OSD_OP_CALL
:
5000 string cname
, mname
;
5003 bp
.copy(op
.cls
.class_len
, cname
);
5004 bp
.copy(op
.cls
.method_len
, mname
);
5005 bp
.copy(op
.cls
.indata_len
, indata
);
5006 } catch (buffer::error
& e
) {
5007 dout(10) << "call unable to decode class + method + indata" << dendl
;
5008 dout(30) << "in dump: ";
5009 osd_op
.indata
.hexdump(*_dout
);
5012 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", "???");
5015 tracepoint(osd
, do_osd_op_pre_call
, soid
.oid
.name
.c_str(), soid
.snap
.val
, cname
.c_str(), mname
.c_str());
5017 ClassHandler::ClassData
*cls
;
5018 result
= osd
->class_handler
->open_class(cname
, &cls
);
5019 assert(result
== 0); // init_op_flags() already verified this works.
5021 ClassHandler::ClassMethod
*method
= cls
->get_method(mname
.c_str());
5023 dout(10) << "call method " << cname
<< "." << mname
<< " does not exist" << dendl
;
5024 result
= -EOPNOTSUPP
;
5028 int flags
= method
->get_flags();
5029 if (flags
& CLS_METHOD_WR
)
5030 ctx
->user_modify
= true;
5033 dout(10) << "call method " << cname
<< "." << mname
<< dendl
;
5034 int prev_rd
= ctx
->num_read
;
5035 int prev_wr
= ctx
->num_write
;
5036 result
= method
->exec((cls_method_context_t
)&ctx
, indata
, outdata
);
5038 if (ctx
->num_read
> prev_rd
&& !(flags
& CLS_METHOD_RD
)) {
5039 derr
<< "method " << cname
<< "." << mname
<< " tried to read object but is not marked RD" << dendl
;
5043 if (ctx
->num_write
> prev_wr
&& !(flags
& CLS_METHOD_WR
)) {
5044 derr
<< "method " << cname
<< "." << mname
<< " tried to update object but is not marked WR" << dendl
;
5049 dout(10) << "method called response length=" << outdata
.length() << dendl
;
5050 op
.extent
.length
= outdata
.length();
5051 osd_op
.outdata
.claim_append(outdata
);
5052 dout(30) << "out dump: ";
5053 osd_op
.outdata
.hexdump(*_dout
);
5058 case CEPH_OSD_OP_STAT
:
5059 // note: stat does not require RD
5061 tracepoint(osd
, do_osd_op_pre_stat
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5063 if (obs
.exists
&& !oi
.is_whiteout()) {
5064 ::encode(oi
.size
, osd_op
.outdata
);
5065 ::encode(oi
.mtime
, osd_op
.outdata
);
5066 dout(10) << "stat oi has " << oi
.size
<< " " << oi
.mtime
<< dendl
;
5069 dout(10) << "stat oi object does not exist" << dendl
;
5072 ctx
->delta_stats
.num_rd
++;
5076 case CEPH_OSD_OP_ISDIRTY
:
5079 tracepoint(osd
, do_osd_op_pre_isdirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5080 bool is_dirty
= obs
.oi
.is_dirty();
5081 ::encode(is_dirty
, osd_op
.outdata
);
5082 ctx
->delta_stats
.num_rd
++;
5087 case CEPH_OSD_OP_UNDIRTY
:
5090 tracepoint(osd
, do_osd_op_pre_undirty
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5091 if (oi
.is_dirty()) {
5092 ctx
->undirty
= true; // see make_writeable()
5094 ctx
->delta_stats
.num_wr
++;
5100 case CEPH_OSD_OP_CACHE_TRY_FLUSH
:
5103 tracepoint(osd
, do_osd_op_pre_try_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5104 if (ctx
->lock_type
!= ObjectContext::RWState::RWNONE
) {
5105 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl
;
5109 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5117 if (oi
.is_cache_pinned()) {
5118 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl
;
5122 if (oi
.is_dirty()) {
5123 result
= start_flush(ctx
->op
, ctx
->obc
, false, NULL
, boost::none
);
5124 if (result
== -EINPROGRESS
)
5132 case CEPH_OSD_OP_CACHE_FLUSH
:
5135 tracepoint(osd
, do_osd_op_pre_cache_flush
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5136 if (ctx
->lock_type
== ObjectContext::RWState::RWNONE
) {
5137 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl
;
5141 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5149 if (oi
.is_cache_pinned()) {
5150 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl
;
5155 if (oi
.is_dirty()) {
5156 result
= start_flush(ctx
->op
, ctx
->obc
, true, &missing
, boost::none
);
5157 if (result
== -EINPROGRESS
)
5162 // Check special return value which has set missing_return
5163 if (result
== -ENOENT
) {
5164 dout(10) << __func__
<< " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl
;
5165 assert(!missing
.is_min());
5166 wait_for_unreadable_object(missing
, ctx
->op
);
5167 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5173 case CEPH_OSD_OP_CACHE_EVICT
:
5176 tracepoint(osd
, do_osd_op_pre_cache_evict
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5177 if (pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
) {
5185 if (oi
.is_cache_pinned()) {
5186 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl
;
5190 if (oi
.is_dirty()) {
5194 if (!oi
.watchers
.empty()) {
5198 if (soid
.snap
== CEPH_NOSNAP
) {
5199 result
= _verify_no_head_clones(soid
, ssc
->snapset
);
5203 result
= _delete_oid(ctx
, true, false);
5205 // mark that this is a cache eviction to avoid triggering normal
5206 // make_writeable() clone or snapdir object creation in finish_ctx()
5207 ctx
->cache_evict
= true;
5209 osd
->logger
->inc(l_osd_tier_evict
);
5213 case CEPH_OSD_OP_GETXATTR
:
5217 bp
.copy(op
.xattr
.name_len
, aname
);
5218 tracepoint(osd
, do_osd_op_pre_getxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
5219 string name
= "_" + aname
;
5220 int r
= getattr_maybe_cache(
5225 op
.xattr
.value_len
= osd_op
.outdata
.length();
5227 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
5231 ctx
->delta_stats
.num_rd
++;
5235 case CEPH_OSD_OP_GETXATTRS
:
5238 tracepoint(osd
, do_osd_op_pre_getxattrs
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5239 map
<string
, bufferlist
> out
;
5240 result
= getattrs_maybe_cache(
5247 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(bl
.length(), 10);
5248 ctx
->delta_stats
.num_rd
++;
5249 osd_op
.outdata
.claim_append(bl
);
5253 case CEPH_OSD_OP_CMPXATTR
:
5257 bp
.copy(op
.xattr
.name_len
, aname
);
5258 tracepoint(osd
, do_osd_op_pre_cmpxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
5259 string name
= "_" + aname
;
5260 name
[op
.xattr
.name_len
+ 1] = 0;
5263 result
= getattr_maybe_cache(
5267 if (result
< 0 && result
!= -EEXIST
&& result
!= -ENODATA
)
5270 ctx
->delta_stats
.num_rd
++;
5271 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(xattr
.length(), 10);
5273 switch (op
.xattr
.cmp_mode
) {
5274 case CEPH_OSD_CMPXATTR_MODE_STRING
:
5277 bp
.copy(op
.xattr
.value_len
, val
);
5278 val
[op
.xattr
.value_len
] = 0;
5279 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << val
5280 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
5281 result
= do_xattr_cmp_str(op
.xattr
.cmp_op
, val
, xattr
);
5285 case CEPH_OSD_CMPXATTR_MODE_U64
:
5289 ::decode(u64val
, bp
);
5291 catch (buffer::error
& e
) {
5295 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name
<< " val=" << u64val
5296 << " op=" << (int)op
.xattr
.cmp_op
<< " mode=" << (int)op
.xattr
.cmp_mode
<< dendl
;
5297 result
= do_xattr_cmp_u64(op
.xattr
.cmp_op
, u64val
, xattr
);
5302 dout(10) << "bad cmp mode " << (int)op
.xattr
.cmp_mode
<< dendl
;
5307 dout(10) << "comparison returned false" << dendl
;
5308 result
= -ECANCELED
;
5312 dout(10) << "comparison returned " << result
<< " " << cpp_strerror(-result
) << dendl
;
5316 dout(10) << "comparison returned true" << dendl
;
5320 case CEPH_OSD_OP_ASSERT_VER
:
5323 uint64_t ver
= op
.assert_ver
.ver
;
5324 tracepoint(osd
, do_osd_op_pre_assert_ver
, soid
.oid
.name
.c_str(), soid
.snap
.val
, ver
);
5327 else if (ver
< oi
.user_version
)
5329 else if (ver
> oi
.user_version
)
5330 result
= -EOVERFLOW
;
5334 case CEPH_OSD_OP_LIST_WATCHERS
:
5337 tracepoint(osd
, do_osd_op_pre_list_watchers
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5338 obj_list_watch_response_t resp
;
5340 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::const_iterator oi_iter
;
5341 for (oi_iter
= oi
.watchers
.begin(); oi_iter
!= oi
.watchers
.end();
5343 dout(20) << "key cookie=" << oi_iter
->first
.first
5344 << " entity=" << oi_iter
->first
.second
<< " "
5345 << oi_iter
->second
<< dendl
;
5346 assert(oi_iter
->first
.first
== oi_iter
->second
.cookie
);
5347 assert(oi_iter
->first
.second
.is_client());
5349 watch_item_t
wi(oi_iter
->first
.second
, oi_iter
->second
.cookie
,
5350 oi_iter
->second
.timeout_seconds
, oi_iter
->second
.addr
);
5351 resp
.entries
.push_back(wi
);
5354 resp
.encode(osd_op
.outdata
, ctx
->get_features());
5357 ctx
->delta_stats
.num_rd
++;
5361 case CEPH_OSD_OP_LIST_SNAPS
:
5364 tracepoint(osd
, do_osd_op_pre_list_snaps
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5365 obj_list_snap_response_t resp
;
5368 ssc
= ctx
->obc
->ssc
= get_snapset_context(soid
, false);
5372 int clonecount
= ssc
->snapset
.clones
.size();
5373 if (ssc
->snapset
.head_exists
)
5375 resp
.clones
.reserve(clonecount
);
5376 for (auto clone_iter
= ssc
->snapset
.clones
.begin();
5377 clone_iter
!= ssc
->snapset
.clones
.end(); ++clone_iter
) {
5379 ci
.cloneid
= *clone_iter
;
5381 hobject_t clone_oid
= soid
;
5382 clone_oid
.snap
= *clone_iter
;
5384 if (!ssc
->snapset
.is_legacy()) {
5385 auto p
= ssc
->snapset
.clone_snaps
.find(*clone_iter
);
5386 if (p
== ssc
->snapset
.clone_snaps
.end()) {
5387 osd
->clog
->error() << "osd." << osd
->whoami
5388 << ": inconsistent clone_snaps found for oid "
5389 << soid
<< " clone " << *clone_iter
5390 << " snapset " << ssc
->snapset
;
5394 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
5395 ci
.snaps
.push_back(*q
);
5398 /* No need to take a lock here. We are only inspecting state cached on
5399 * in the ObjectContext, so we aren't performing an actual read unless
5400 * the clone obc is not already loaded (in which case, it cannot have
5401 * an in progress write). We also do not risk exposing uncommitted
5402 * state since we do have a read lock on the head object or snapdir,
5403 * which we would have to write lock in order to make user visible
5404 * modifications to the snapshot state (snap trim related mutations
5405 * are not user visible).
5407 if (is_missing_object(clone_oid
)) {
5408 dout(20) << "LIST_SNAPS " << clone_oid
<< " missing" << dendl
;
5409 wait_for_unreadable_object(clone_oid
, ctx
->op
);
5414 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
5416 if (maybe_handle_cache(
5417 ctx
->op
, true, clone_obc
, -ENOENT
, clone_oid
, true)) {
5418 // promoting the clone
5421 osd
->clog
->error() << "osd." << osd
->whoami
5422 << ": missing clone " << clone_oid
5425 // should not happen
5430 for (vector
<snapid_t
>::reverse_iterator p
=
5431 clone_obc
->obs
.oi
.legacy_snaps
.rbegin();
5432 p
!= clone_obc
->obs
.oi
.legacy_snaps
.rend();
5434 ci
.snaps
.push_back(*p
);
5438 dout(20) << " clone " << *clone_iter
<< " snaps " << ci
.snaps
<< dendl
;
5440 map
<snapid_t
, interval_set
<uint64_t> >::const_iterator coi
;
5441 coi
= ssc
->snapset
.clone_overlap
.find(ci
.cloneid
);
5442 if (coi
== ssc
->snapset
.clone_overlap
.end()) {
5443 osd
->clog
->error() << "osd." << osd
->whoami
5444 << ": inconsistent clone_overlap found for oid "
5445 << soid
<< " clone " << *clone_iter
;
5449 const interval_set
<uint64_t> &o
= coi
->second
;
5450 ci
.overlap
.reserve(o
.num_intervals());
5451 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
5452 r
!= o
.end(); ++r
) {
5453 ci
.overlap
.push_back(pair
<uint64_t,uint64_t>(r
.get_start(),
5457 map
<snapid_t
, uint64_t>::const_iterator si
;
5458 si
= ssc
->snapset
.clone_size
.find(ci
.cloneid
);
5459 if (si
== ssc
->snapset
.clone_size
.end()) {
5460 osd
->clog
->error() << "osd." << osd
->whoami
5461 << ": inconsistent clone_size found for oid "
5462 << soid
<< " clone " << *clone_iter
;
5466 ci
.size
= si
->second
;
5468 resp
.clones
.push_back(ci
);
5473 if (ssc
->snapset
.head_exists
&&
5474 !ctx
->obc
->obs
.oi
.is_whiteout()) {
5477 ci
.cloneid
= CEPH_NOSNAP
;
5479 //Size for HEAD is oi.size
5482 resp
.clones
.push_back(ci
);
5484 resp
.seq
= ssc
->snapset
.seq
;
5486 resp
.encode(osd_op
.outdata
);
5489 ctx
->delta_stats
.num_rd
++;
5493 case CEPH_OSD_OP_NOTIFY
:
5500 uint32_t ver
; // obsolete
5502 ::decode(timeout
, bp
);
5504 } catch (const buffer::error
&e
) {
5507 tracepoint(osd
, do_osd_op_pre_notify
, soid
.oid
.name
.c_str(), soid
.snap
.val
, timeout
);
5509 timeout
= cct
->_conf
->osd_default_notify_timeout
;
5512 n
.timeout
= timeout
;
5513 n
.notify_id
= osd
->get_next_id(get_osdmap()->get_epoch());
5514 n
.cookie
= op
.watch
.cookie
;
5516 ctx
->notifies
.push_back(n
);
5518 // return our unique notify id to the client
5519 ::encode(n
.notify_id
, osd_op
.outdata
);
5523 case CEPH_OSD_OP_NOTIFY_ACK
:
5527 uint64_t notify_id
= 0;
5528 uint64_t watch_cookie
= 0;
5529 ::decode(notify_id
, bp
);
5530 ::decode(watch_cookie
, bp
);
5531 bufferlist reply_bl
;
5533 ::decode(reply_bl
, bp
);
5535 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, notify_id
, watch_cookie
, "Y");
5536 OpContext::NotifyAck
ack(notify_id
, watch_cookie
, reply_bl
);
5537 ctx
->notify_acks
.push_back(ack
);
5538 } catch (const buffer::error
&e
) {
5539 tracepoint(osd
, do_osd_op_pre_notify_ack
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.watch
.cookie
, 0, "N");
5540 OpContext::NotifyAck
ack(
5541 // op.watch.cookie is actually the notify_id for historical reasons
5544 ctx
->notify_acks
.push_back(ack
);
5549 case CEPH_OSD_OP_SETALLOCHINT
:
5552 tracepoint(osd
, do_osd_op_pre_setallochint
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.alloc_hint
.expected_object_size
, op
.alloc_hint
.expected_write_size
);
5553 maybe_create_new_object(ctx
);
5554 oi
.expected_object_size
= op
.alloc_hint
.expected_object_size
;
5555 oi
.expected_write_size
= op
.alloc_hint
.expected_write_size
;
5556 oi
.alloc_hint_flags
= op
.alloc_hint
.flags
;
5557 t
->set_alloc_hint(soid
, op
.alloc_hint
.expected_object_size
,
5558 op
.alloc_hint
.expected_write_size
,
5559 op
.alloc_hint
.flags
);
5560 ctx
->delta_stats
.num_wr
++;
5568 // -- object data --
5570 case CEPH_OSD_OP_WRITE
:
5573 __u32 seq
= oi
.truncate_seq
;
5574 tracepoint(osd
, do_osd_op_pre_write
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
5575 if (op
.extent
.length
!= osd_op
.indata
.length()) {
5580 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
5581 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
5583 if (pool
.info
.requires_aligned_append() &&
5584 (op
.extent
.offset
% pool
.info
.required_alignment() != 0)) {
5585 result
= -EOPNOTSUPP
;
5590 if (pool
.info
.requires_aligned_append() && op
.extent
.offset
) {
5591 result
= -EOPNOTSUPP
;
5594 } else if (op
.extent
.offset
!= oi
.size
&&
5595 pool
.info
.requires_aligned_append()) {
5596 result
= -EOPNOTSUPP
;
5600 if (seq
&& (seq
> op
.extent
.truncate_seq
) &&
5601 (op
.extent
.offset
+ op
.extent
.length
> oi
.size
)) {
5602 // old write, arrived after trimtrunc
5603 op
.extent
.length
= (op
.extent
.offset
> oi
.size
? 0 : oi
.size
- op
.extent
.offset
);
5604 dout(10) << " old truncate_seq " << op
.extent
.truncate_seq
<< " < current " << seq
5605 << ", adjusting write length to " << op
.extent
.length
<< dendl
;
5607 t
.substr_of(osd_op
.indata
, 0, op
.extent
.length
);
5608 osd_op
.indata
.swap(t
);
5610 if (op
.extent
.truncate_seq
> seq
) {
5611 // write arrives before trimtrunc
5612 if (obs
.exists
&& !oi
.is_whiteout()) {
5613 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
5614 << ", truncating to " << op
.extent
.truncate_size
<< dendl
;
5615 t
->truncate(soid
, op
.extent
.truncate_size
);
5616 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5617 oi
.truncate_size
= op
.extent
.truncate_size
;
5618 if (op
.extent
.truncate_size
!= oi
.size
) {
5619 ctx
->delta_stats
.num_bytes
-= oi
.size
;
5620 ctx
->delta_stats
.num_bytes
+= op
.extent
.truncate_size
;
5621 oi
.size
= op
.extent
.truncate_size
;
5624 dout(10) << " truncate_seq " << op
.extent
.truncate_seq
<< " > current " << seq
5625 << ", but object is new" << dendl
;
5626 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5627 oi
.truncate_size
= op
.extent
.truncate_size
;
5630 result
= check_offset_and_length(op
.extent
.offset
, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5634 maybe_create_new_object(ctx
);
5636 if (op
.extent
.length
== 0) {
5637 if (op
.extent
.offset
> oi
.size
) {
5639 soid
, op
.extent
.offset
);
5645 soid
, op
.extent
.offset
, op
.extent
.length
, osd_op
.indata
, op
.flags
);
5648 if (op
.extent
.offset
== 0 && op
.extent
.length
>= oi
.size
)
5649 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
5650 else if (op
.extent
.offset
== oi
.size
&& obs
.oi
.is_data_digest())
5651 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(obs
.oi
.data_digest
));
5653 obs
.oi
.clear_data_digest();
5654 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
5655 op
.extent
.offset
, op
.extent
.length
);
5660 case CEPH_OSD_OP_WRITEFULL
:
5662 { // write full object
5663 tracepoint(osd
, do_osd_op_pre_writefull
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, 0, op
.extent
.length
);
5665 if (op
.extent
.length
!= osd_op
.indata
.length()) {
5669 result
= check_offset_and_length(0, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5673 if (pool
.info
.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED
))
5674 op
.flags
= op
.flags
| CEPH_OSD_OP_FLAG_FADVISE_DONTNEED
;
5676 maybe_create_new_object(ctx
);
5677 if (pool
.info
.require_rollback()) {
5678 t
->truncate(soid
, 0);
5679 } else if (obs
.exists
&& op
.extent
.length
< oi
.size
) {
5680 t
->truncate(soid
, op
.extent
.length
);
5682 if (op
.extent
.length
) {
5683 t
->write(soid
, 0, op
.extent
.length
, osd_op
.indata
, op
.flags
);
5685 obs
.oi
.set_data_digest(osd_op
.indata
.crc32c(-1));
5687 write_update_size_and_usage(ctx
->delta_stats
, oi
, ctx
->modified_ranges
,
5688 0, op
.extent
.length
, true);
5692 case CEPH_OSD_OP_WRITESAME
:
5694 tracepoint(osd
, do_osd_op_pre_writesame
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, op
.writesame
.offset
, op
.writesame
.length
, op
.writesame
.data_length
);
5695 result
= do_writesame(ctx
, osd_op
);
5698 case CEPH_OSD_OP_ROLLBACK
:
5700 tracepoint(osd
, do_osd_op_pre_rollback
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5701 result
= _rollback_to(ctx
, op
);
5704 case CEPH_OSD_OP_ZERO
:
5705 tracepoint(osd
, do_osd_op_pre_zero
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.extent
.offset
, op
.extent
.length
);
5706 if (pool
.info
.requires_aligned_append()) {
5707 result
= -EOPNOTSUPP
;
5712 result
= check_offset_and_length(op
.extent
.offset
, op
.extent
.length
, cct
->_conf
->osd_max_object_size
);
5715 assert(op
.extent
.length
);
5716 if (obs
.exists
&& !oi
.is_whiteout()) {
5717 t
->zero(soid
, op
.extent
.offset
, op
.extent
.length
);
5718 interval_set
<uint64_t> ch
;
5719 ch
.insert(op
.extent
.offset
, op
.extent
.length
);
5720 ctx
->modified_ranges
.union_of(ch
);
5721 ctx
->delta_stats
.num_wr
++;
5722 oi
.clear_data_digest();
5728 case CEPH_OSD_OP_CREATE
:
5731 tracepoint(osd
, do_osd_op_pre_create
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5732 int flags
= le32_to_cpu(op
.flags
);
5733 if (obs
.exists
&& !oi
.is_whiteout() &&
5734 (flags
& CEPH_OSD_OP_FLAG_EXCL
)) {
5735 result
= -EEXIST
; /* this is an exclusive create */
5737 if (osd_op
.indata
.length()) {
5738 bufferlist::iterator p
= osd_op
.indata
.begin();
5741 ::decode(category
, p
);
5743 catch (buffer::error
& e
) {
5747 // category is no longer implemented.
5750 maybe_create_new_object(ctx
);
5757 case CEPH_OSD_OP_TRIMTRUNC
:
5758 op
.extent
.offset
= op
.extent
.truncate_size
;
5761 case CEPH_OSD_OP_TRUNCATE
:
5762 tracepoint(osd
, do_osd_op_pre_truncate
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
5763 if (pool
.info
.requires_aligned_append()) {
5764 result
= -EOPNOTSUPP
;
5770 if (!obs
.exists
|| oi
.is_whiteout()) {
5771 dout(10) << " object dne, truncate is a no-op" << dendl
;
5775 if (op
.extent
.offset
> cct
->_conf
->osd_max_object_size
) {
5780 if (op
.extent
.truncate_seq
) {
5781 assert(op
.extent
.offset
== op
.extent
.truncate_size
);
5782 if (op
.extent
.truncate_seq
<= oi
.truncate_seq
) {
5783 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " <= current " << oi
.truncate_seq
5784 << ", no-op" << dendl
;
5787 dout(10) << " truncate seq " << op
.extent
.truncate_seq
<< " > current " << oi
.truncate_seq
5788 << ", truncating" << dendl
;
5789 oi
.truncate_seq
= op
.extent
.truncate_seq
;
5790 oi
.truncate_size
= op
.extent
.truncate_size
;
5793 maybe_create_new_object(ctx
);
5794 t
->truncate(soid
, op
.extent
.offset
);
5795 if (oi
.size
> op
.extent
.offset
) {
5796 interval_set
<uint64_t> trim
;
5797 trim
.insert(op
.extent
.offset
, oi
.size
-op
.extent
.offset
);
5798 ctx
->modified_ranges
.union_of(trim
);
5800 if (op
.extent
.offset
!= oi
.size
) {
5801 ctx
->delta_stats
.num_bytes
-= oi
.size
;
5802 ctx
->delta_stats
.num_bytes
+= op
.extent
.offset
;
5803 oi
.size
= op
.extent
.offset
;
5805 ctx
->delta_stats
.num_wr
++;
5806 // do no set exists, or we will break above DELETE -> TRUNCATE munging.
5808 oi
.clear_data_digest();
5812 case CEPH_OSD_OP_DELETE
:
5814 tracepoint(osd
, do_osd_op_pre_delete
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5816 result
= _delete_oid(ctx
, false, ctx
->ignore_cache
);
5820 case CEPH_OSD_OP_WATCH
:
5823 tracepoint(osd
, do_osd_op_pre_watch
, soid
.oid
.name
.c_str(), soid
.snap
.val
,
5824 op
.watch
.cookie
, op
.watch
.op
);
5829 uint64_t cookie
= op
.watch
.cookie
;
5830 entity_name_t entity
= ctx
->reqid
.name
;
5831 ObjectContextRef obc
= ctx
->obc
;
5833 dout(10) << "watch " << ceph_osd_watch_op_name(op
.watch
.op
)
5834 << ": ctx->obc=" << (void *)obc
.get() << " cookie=" << cookie
5835 << " oi.version=" << oi
.version
.version
<< " ctx->at_version=" << ctx
->at_version
<< dendl
;
5836 dout(10) << "watch: oi.user_version=" << oi
.user_version
<< dendl
;
5837 dout(10) << "watch: peer_addr="
5838 << ctx
->op
->get_req()->get_connection()->get_peer_addr() << dendl
;
5840 uint32_t timeout
= cct
->_conf
->osd_client_watch_timeout
;
5841 if (op
.watch
.timeout
!= 0) {
5842 timeout
= op
.watch
.timeout
;
5845 watch_info_t
w(cookie
, timeout
,
5846 ctx
->op
->get_req()->get_connection()->get_peer_addr());
5847 if (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
||
5848 op
.watch
.op
== CEPH_OSD_WATCH_OP_LEGACY_WATCH
) {
5849 if (oi
.watchers
.count(make_pair(cookie
, entity
))) {
5850 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
5852 dout(10) << " registered new watch " << w
<< " by " << entity
<< dendl
;
5853 oi
.watchers
[make_pair(cookie
, entity
)] = w
;
5854 t
->nop(soid
); // make sure update the object_info on disk!
5856 bool will_ping
= (op
.watch
.op
== CEPH_OSD_WATCH_OP_WATCH
);
5857 ctx
->watch_connects
.push_back(make_pair(w
, will_ping
));
5858 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_RECONNECT
) {
5859 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
5863 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
5864 ctx
->watch_connects
.push_back(make_pair(w
, true));
5865 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_PING
) {
5866 /* Note: WATCH with PING doesn't cause may_write() to return true,
5867 * so if there is nothing else in the transaction, this is going
5868 * to run do_osd_op_effects, but not write out a log entry */
5869 if (!oi
.watchers
.count(make_pair(cookie
, entity
))) {
5873 map
<pair
<uint64_t,entity_name_t
>,WatchRef
>::iterator p
=
5874 obc
->watchers
.find(make_pair(cookie
, entity
));
5875 if (p
== obc
->watchers
.end() ||
5876 !p
->second
->is_connected()) {
5877 // client needs to reconnect
5878 result
= -ETIMEDOUT
;
5881 dout(10) << " found existing watch " << w
<< " by " << entity
<< dendl
;
5882 p
->second
->got_ping(ceph_clock_now());
5884 } else if (op
.watch
.op
== CEPH_OSD_WATCH_OP_UNWATCH
) {
5885 map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator oi_iter
=
5886 oi
.watchers
.find(make_pair(cookie
, entity
));
5887 if (oi_iter
!= oi
.watchers
.end()) {
5888 dout(10) << " removed watch " << oi_iter
->second
<< " by "
5890 oi
.watchers
.erase(oi_iter
);
5891 t
->nop(soid
); // update oi on disk
5892 ctx
->watch_disconnects
.push_back(
5893 watch_disconnect_t(cookie
, entity
, false));
5895 dout(10) << " can't remove: no watch by " << entity
<< dendl
;
5901 case CEPH_OSD_OP_CACHE_PIN
:
5902 tracepoint(osd
, do_osd_op_pre_cache_pin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5903 if ((!pool
.info
.is_tier() ||
5904 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
5906 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
5911 if (!obs
.exists
|| oi
.is_whiteout()) {
5916 if (!oi
.is_cache_pinned()) {
5917 oi
.set_flag(object_info_t::FLAG_CACHE_PIN
);
5919 ctx
->delta_stats
.num_objects_pinned
++;
5920 ctx
->delta_stats
.num_wr
++;
5926 case CEPH_OSD_OP_CACHE_UNPIN
:
5927 tracepoint(osd
, do_osd_op_pre_cache_unpin
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
5928 if ((!pool
.info
.is_tier() ||
5929 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
)) {
5931 dout(10) << " pin object is only allowed on the cache tier " << dendl
;
5936 if (!obs
.exists
|| oi
.is_whiteout()) {
5941 if (oi
.is_cache_pinned()) {
5942 oi
.clear_flag(object_info_t::FLAG_CACHE_PIN
);
5944 ctx
->delta_stats
.num_objects_pinned
--;
5945 ctx
->delta_stats
.num_wr
++;
5951 case CEPH_OSD_OP_SET_REDIRECT
:
5954 if (pool
.info
.is_tier()) {
5962 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
5963 result
= -EOPNOTSUPP
;
5967 object_t target_name
;
5968 object_locator_t target_oloc
;
5969 snapid_t target_snapid
= (uint64_t)op
.copy_from
.snapid
;
5970 version_t target_version
= op
.copy_from
.src_version
;
5972 ::decode(target_name
, bp
);
5973 ::decode(target_oloc
, bp
);
5975 catch (buffer::error
& e
) {
5980 get_osdmap()->object_locator_to_pg(target_name
, target_oloc
, raw_pg
);
5981 hobject_t
target(target_name
, target_oloc
.key
, target_snapid
,
5982 raw_pg
.ps(), raw_pg
.pool(),
5983 target_oloc
.nspace
);
5984 if (target
== soid
) {
5985 dout(20) << " set-redirect self is invalid" << dendl
;
5989 oi
.set_flag(object_info_t::FLAG_MANIFEST
);
5990 oi
.manifest
.redirect_target
= target
;
5991 oi
.manifest
.type
= object_manifest_t::TYPE_REDIRECT
;
5992 t
->truncate(soid
, 0);
5993 if (oi
.is_omap() && pool
.info
.supports_omap()) {
5994 t
->omap_clear(soid
);
5995 obs
.oi
.clear_omap_digest();
5996 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
5998 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6001 oi
.user_version
= target_version
;
6002 ctx
->user_at_version
= target_version
;
6004 map
<string
,bufferlist
> rmattrs
;
6005 result
= getattrs_maybe_cache(ctx
->obc
,
6011 map
<string
, bufferlist
>::iterator iter
;
6012 for (iter
= rmattrs
.begin(); iter
!= rmattrs
.end(); ++iter
) {
6013 const string
& name
= iter
->first
;
6014 t
->rmattr(soid
, name
);
6016 dout(10) << "set-redirect oid:" << oi
.soid
<< " user_version: " << oi
.user_version
<< dendl
;
6021 // -- object attrs --
6023 case CEPH_OSD_OP_SETXATTR
:
6026 if (cct
->_conf
->osd_max_attr_size
> 0 &&
6027 op
.xattr
.value_len
> cct
->_conf
->osd_max_attr_size
) {
6028 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6032 unsigned max_name_len
= MIN(osd
->store
->get_max_attr_name_length(),
6033 cct
->_conf
->osd_max_attr_name_len
);
6034 if (op
.xattr
.name_len
> max_name_len
) {
6035 result
= -ENAMETOOLONG
;
6038 maybe_create_new_object(ctx
);
6040 bp
.copy(op
.xattr
.name_len
, aname
);
6041 tracepoint(osd
, do_osd_op_pre_setxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6042 string name
= "_" + aname
;
6044 bp
.copy(op
.xattr
.value_len
, bl
);
6045 t
->setattr(soid
, name
, bl
);
6046 ctx
->delta_stats
.num_wr
++;
6050 case CEPH_OSD_OP_RMXATTR
:
6054 bp
.copy(op
.xattr
.name_len
, aname
);
6055 tracepoint(osd
, do_osd_op_pre_rmxattr
, soid
.oid
.name
.c_str(), soid
.snap
.val
, aname
.c_str());
6056 if (!obs
.exists
|| oi
.is_whiteout()) {
6060 string name
= "_" + aname
;
6061 t
->rmattr(soid
, name
);
6062 ctx
->delta_stats
.num_wr
++;
6067 // -- fancy writers --
6068 case CEPH_OSD_OP_APPEND
:
6070 tracepoint(osd
, do_osd_op_pre_append
, soid
.oid
.name
.c_str(), soid
.snap
.val
, oi
.size
, oi
.truncate_seq
, op
.extent
.offset
, op
.extent
.length
, op
.extent
.truncate_size
, op
.extent
.truncate_seq
);
6071 // just do it inline; this works because we are happy to execute
6072 // fancy op on replicas as well.
6073 vector
<OSDOp
> nops(1);
6074 OSDOp
& newop
= nops
[0];
6075 newop
.op
.op
= CEPH_OSD_OP_WRITE
;
6076 newop
.op
.extent
.offset
= oi
.size
;
6077 newop
.op
.extent
.length
= op
.extent
.length
;
6078 newop
.op
.extent
.truncate_seq
= oi
.truncate_seq
;
6079 newop
.indata
= osd_op
.indata
;
6080 result
= do_osd_ops(ctx
, nops
);
6081 osd_op
.outdata
.claim(newop
.outdata
);
6085 case CEPH_OSD_OP_STARTSYNC
:
6086 tracepoint(osd
, do_osd_op_pre_startsync
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6091 // -- trivial map --
6092 case CEPH_OSD_OP_TMAPGET
:
6093 tracepoint(osd
, do_osd_op_pre_tmapget
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6094 if (pool
.info
.require_rollback()) {
6095 result
= -EOPNOTSUPP
;
6099 vector
<OSDOp
> nops(1);
6100 OSDOp
& newop
= nops
[0];
6101 newop
.op
.op
= CEPH_OSD_OP_SYNC_READ
;
6102 newop
.op
.extent
.offset
= 0;
6103 newop
.op
.extent
.length
= 0;
6104 do_osd_ops(ctx
, nops
);
6105 osd_op
.outdata
.claim(newop
.outdata
);
6109 case CEPH_OSD_OP_TMAPPUT
:
6110 tracepoint(osd
, do_osd_op_pre_tmapput
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6111 if (pool
.info
.require_rollback()) {
6112 result
= -EOPNOTSUPP
;
6116 //_dout_lock.Lock();
6117 //osd_op.data.hexdump(*_dout);
6118 //_dout_lock.Unlock();
6120 // verify sort order
6121 bool unsorted
= false;
6124 ::decode(header
, bp
);
6131 dout(10) << "tmapput key " << key
<< dendl
;
6134 if (key
< last_key
) {
6135 dout(10) << "TMAPPUT is unordered; resorting" << dendl
;
6144 vector
<OSDOp
> nops(1);
6145 OSDOp
& newop
= nops
[0];
6146 newop
.op
.op
= CEPH_OSD_OP_WRITEFULL
;
6147 newop
.op
.extent
.offset
= 0;
6148 newop
.op
.extent
.length
= osd_op
.indata
.length();
6149 newop
.indata
= osd_op
.indata
;
6152 bp
= osd_op
.indata
.begin();
6154 map
<string
, bufferlist
> m
;
6155 ::decode(header
, bp
);
6159 ::encode(header
, newbl
);
6161 newop
.indata
= newbl
;
6163 result
= do_osd_ops(ctx
, nops
);
6164 assert(result
== 0);
6168 case CEPH_OSD_OP_TMAPUP
:
6169 tracepoint(osd
, do_osd_op_pre_tmapup
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6170 if (pool
.info
.require_rollback()) {
6171 result
= -EOPNOTSUPP
;
6175 result
= do_tmapup(ctx
, bp
, osd_op
);
6178 case CEPH_OSD_OP_TMAP2OMAP
:
6180 tracepoint(osd
, do_osd_op_pre_tmap2omap
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6181 result
= do_tmap2omap(ctx
, op
.tmap2omap
.flags
);
6185 case CEPH_OSD_OP_OMAPGETKEYS
:
6189 uint64_t max_return
;
6191 ::decode(start_after
, bp
);
6192 ::decode(max_return
, bp
);
6194 catch (buffer::error
& e
) {
6196 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0);
6199 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
6200 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
6202 tracepoint(osd
, do_osd_op_pre_omapgetkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
);
6206 bool truncated
= false;
6208 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
6209 coll
, ghobject_t(soid
)
6212 iter
->upper_bound(start_after
);
6213 for (num
= 0; iter
->valid(); ++num
, iter
->next(false)) {
6214 if (num
>= max_return
||
6215 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
6219 ::encode(iter
->key(), bl
);
6221 } // else return empty out_set
6222 ::encode(num
, osd_op
.outdata
);
6223 osd_op
.outdata
.claim_append(bl
);
6224 ::encode(truncated
, osd_op
.outdata
);
6225 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6226 ctx
->delta_stats
.num_rd
++;
6230 case CEPH_OSD_OP_OMAPGETVALS
:
6234 uint64_t max_return
;
6235 string filter_prefix
;
6237 ::decode(start_after
, bp
);
6238 ::decode(max_return
, bp
);
6239 ::decode(filter_prefix
, bp
);
6241 catch (buffer::error
& e
) {
6243 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???", 0, "???");
6246 if (max_return
> cct
->_conf
->osd_max_omap_entries_per_request
) {
6247 max_return
= cct
->_conf
->osd_max_omap_entries_per_request
;
6249 tracepoint(osd
, do_osd_op_pre_omapgetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
, start_after
.c_str(), max_return
, filter_prefix
.c_str());
6252 bool truncated
= false;
6255 ObjectMap::ObjectMapIterator iter
= osd
->store
->get_omap_iterator(
6256 coll
, ghobject_t(soid
)
6262 iter
->upper_bound(start_after
);
6263 if (filter_prefix
> start_after
) iter
->lower_bound(filter_prefix
);
6266 iter
->key().substr(0, filter_prefix
.size()) == filter_prefix
;
6267 ++num
, iter
->next(false)) {
6268 dout(20) << "Found key " << iter
->key() << dendl
;
6269 if (num
>= max_return
||
6270 bl
.length() >= cct
->_conf
->osd_max_omap_bytes_per_request
) {
6274 ::encode(iter
->key(), bl
);
6275 ::encode(iter
->value(), bl
);
6277 } // else return empty out_set
6278 ::encode(num
, osd_op
.outdata
);
6279 osd_op
.outdata
.claim_append(bl
);
6280 ::encode(truncated
, osd_op
.outdata
);
6281 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6282 ctx
->delta_stats
.num_rd
++;
6286 case CEPH_OSD_OP_OMAPGETHEADER
:
6287 tracepoint(osd
, do_osd_op_pre_omapgetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6288 if (!oi
.is_omap()) {
6289 // return empty header
6294 osd
->store
->omap_get_header(ch
, ghobject_t(soid
), &osd_op
.outdata
);
6295 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6296 ctx
->delta_stats
.num_rd
++;
6300 case CEPH_OSD_OP_OMAPGETVALSBYKEYS
:
6303 set
<string
> keys_to_get
;
6305 ::decode(keys_to_get
, bp
);
6307 catch (buffer::error
& e
) {
6309 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6312 tracepoint(osd
, do_osd_op_pre_omapgetvalsbykeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_entries(keys_to_get
).c_str());
6313 map
<string
, bufferlist
> out
;
6315 osd
->store
->omap_get_values(ch
, ghobject_t(soid
), keys_to_get
, &out
);
6316 } // else return empty omap entries
6317 ::encode(out
, osd_op
.outdata
);
6318 ctx
->delta_stats
.num_rd_kb
+= SHIFT_ROUND_UP(osd_op
.outdata
.length(), 10);
6319 ctx
->delta_stats
.num_rd
++;
6323 case CEPH_OSD_OP_OMAP_CMP
:
6326 if (!obs
.exists
|| oi
.is_whiteout()) {
6328 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6331 map
<string
, pair
<bufferlist
, int> > assertions
;
6333 ::decode(assertions
, bp
);
6335 catch (buffer::error
& e
) {
6337 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, "???");
6340 tracepoint(osd
, do_osd_op_pre_omap_cmp
, soid
.oid
.name
.c_str(), soid
.snap
.val
, list_keys(assertions
).c_str());
6342 map
<string
, bufferlist
> out
;
6346 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
6347 i
!= assertions
.end();
6349 to_get
.insert(i
->first
);
6350 int r
= osd
->store
->omap_get_values(ch
, ghobject_t(soid
),
6356 } // else leave out empty
6358 //Should set num_rd_kb based on encode length of map
6359 ctx
->delta_stats
.num_rd
++;
6363 for (map
<string
, pair
<bufferlist
, int> >::iterator i
= assertions
.begin();
6364 i
!= assertions
.end();
6366 auto out_entry
= out
.find(i
->first
);
6367 bufferlist
&bl
= (out_entry
!= out
.end()) ?
6368 out_entry
->second
: empty
;
6369 switch (i
->second
.second
) {
6370 case CEPH_OSD_CMPXATTR_OP_EQ
:
6371 if (!(bl
== i
->second
.first
)) {
6375 case CEPH_OSD_CMPXATTR_OP_LT
:
6376 if (!(bl
< i
->second
.first
)) {
6380 case CEPH_OSD_CMPXATTR_OP_GT
:
6381 if (!(bl
> i
->second
.first
)) {
6399 case CEPH_OSD_OP_OMAPSETVALS
:
6400 if (!pool
.info
.supports_omap()) {
6401 result
= -EOPNOTSUPP
;
6402 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6407 maybe_create_new_object(ctx
);
6408 bufferlist to_set_bl
;
6410 decode_str_str_map_to_bl(bp
, &to_set_bl
);
6412 catch (buffer::error
& e
) {
6414 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6417 tracepoint(osd
, do_osd_op_pre_omapsetvals
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6418 if (cct
->_conf
->subsys
.should_gather(dout_subsys
, 20)) {
6419 dout(20) << "setting vals: " << dendl
;
6420 map
<string
,bufferlist
> to_set
;
6421 bufferlist::iterator pt
= to_set_bl
.begin();
6422 ::decode(to_set
, pt
);
6423 for (map
<string
, bufferlist
>::iterator i
= to_set
.begin();
6426 dout(20) << "\t" << i
->first
<< dendl
;
6429 t
->omap_setkeys(soid
, to_set_bl
);
6430 ctx
->delta_stats
.num_wr
++;
6432 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6433 obs
.oi
.clear_omap_digest();
6436 case CEPH_OSD_OP_OMAPSETHEADER
:
6437 tracepoint(osd
, do_osd_op_pre_omapsetheader
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6438 if (!pool
.info
.supports_omap()) {
6439 result
= -EOPNOTSUPP
;
6444 maybe_create_new_object(ctx
);
6445 t
->omap_setheader(soid
, osd_op
.indata
);
6446 ctx
->delta_stats
.num_wr
++;
6448 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6449 obs
.oi
.clear_omap_digest();
6452 case CEPH_OSD_OP_OMAPCLEAR
:
6453 tracepoint(osd
, do_osd_op_pre_omapclear
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6454 if (!pool
.info
.supports_omap()) {
6455 result
= -EOPNOTSUPP
;
6460 if (!obs
.exists
|| oi
.is_whiteout()) {
6465 t
->omap_clear(soid
);
6466 ctx
->delta_stats
.num_wr
++;
6467 obs
.oi
.clear_omap_digest();
6468 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6473 case CEPH_OSD_OP_OMAPRMKEYS
:
6474 if (!pool
.info
.supports_omap()) {
6475 result
= -EOPNOTSUPP
;
6476 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6481 if (!obs
.exists
|| oi
.is_whiteout()) {
6483 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6486 bufferlist to_rm_bl
;
6488 decode_str_set_to_bl(bp
, &to_rm_bl
);
6490 catch (buffer::error
& e
) {
6492 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6495 tracepoint(osd
, do_osd_op_pre_omaprmkeys
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6496 t
->omap_rmkeys(soid
, to_rm_bl
);
6497 ctx
->delta_stats
.num_wr
++;
6499 obs
.oi
.clear_omap_digest();
6502 case CEPH_OSD_OP_COPY_GET
:
6504 tracepoint(osd
, do_osd_op_pre_copy_get
, soid
.oid
.name
.c_str(), soid
.snap
.val
);
6505 result
= fill_in_copy_get(ctx
, bp
, osd_op
, ctx
->obc
);
6508 case CEPH_OSD_OP_COPY_FROM
:
6512 object_locator_t src_oloc
;
6513 snapid_t src_snapid
= (uint64_t)op
.copy_from
.snapid
;
6514 version_t src_version
= op
.copy_from
.src_version
;
6516 ::decode(src_name
, bp
);
6517 ::decode(src_oloc
, bp
);
6519 catch (buffer::error
& e
) {
6522 do_osd_op_pre_copy_from
,
6523 soid
.oid
.name
.c_str(),
6535 do_osd_op_pre_copy_from
,
6536 soid
.oid
.name
.c_str(),
6538 src_name
.name
.c_str(),
6540 src_oloc
.key
.c_str(),
6541 src_oloc
.nspace
.c_str(),
6545 if (!ctx
->copy_cb
) {
6548 get_osdmap()->object_locator_to_pg(src_name
, src_oloc
, raw_pg
);
6549 hobject_t
src(src_name
, src_oloc
.key
, src_snapid
,
6550 raw_pg
.ps(), raw_pg
.pool(),
6553 dout(20) << " copy from self is invalid" << dendl
;
6557 CopyFromCallback
*cb
= new CopyFromCallback(ctx
);
6559 start_copy(cb
, ctx
->obc
, src
, src_oloc
, src_version
,
6562 op
.copy_from
.src_fadvise_flags
,
6564 result
= -EINPROGRESS
;
6567 assert(ctx
->copy_cb
->get_result() >= 0);
6568 finish_copyfrom(ctx
);
6575 tracepoint(osd
, do_osd_op_pre_unknown
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
));
6576 dout(1) << "unrecognized osd op " << op
.op
6577 << " " << ceph_osd_op_name(op
.op
)
6579 result
= -EOPNOTSUPP
;
6583 osd_op
.rval
= result
;
6584 tracepoint(osd
, do_osd_op_post
, soid
.oid
.name
.c_str(), soid
.snap
.val
, op
.op
, ceph_osd_op_name(op
.op
), op
.flags
, result
);
6585 if (result
< 0 && (op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
))
6594 int PrimaryLogPG::_get_tmap(OpContext
*ctx
, bufferlist
*header
, bufferlist
*vals
)
6596 if (ctx
->new_obs
.oi
.size
== 0) {
6597 dout(20) << "unable to get tmap for zero sized " << ctx
->new_obs
.oi
.soid
<< dendl
;
6600 vector
<OSDOp
> nops(1);
6601 OSDOp
&newop
= nops
[0];
6602 newop
.op
.op
= CEPH_OSD_OP_TMAPGET
;
6603 do_osd_ops(ctx
, nops
);
6605 bufferlist::iterator i
= newop
.outdata
.begin();
6606 ::decode(*header
, i
);
6607 (*vals
).substr_of(newop
.outdata
, i
.get_off(), i
.get_remaining());
6609 dout(20) << "unsuccessful at decoding tmap for " << ctx
->new_obs
.oi
.soid
6613 dout(20) << "successful at decoding tmap for " << ctx
->new_obs
.oi
.soid
6618 int PrimaryLogPG::_verify_no_head_clones(const hobject_t
& soid
,
6621 // verify that all clones have been evicted
6622 dout(20) << __func__
<< " verifying clones are absent "
6624 for (vector
<snapid_t
>::const_iterator p
= ss
.clones
.begin();
6625 p
!= ss
.clones
.end();
6627 hobject_t clone_oid
= soid
;
6628 clone_oid
.snap
= *p
;
6629 if (is_missing_object(clone_oid
))
6631 ObjectContextRef clone_obc
= get_object_context(clone_oid
, false);
6632 if (clone_obc
&& clone_obc
->obs
.exists
) {
6633 dout(10) << __func__
<< " cannot evict head before clone "
6634 << clone_oid
<< dendl
;
6637 if (copy_ops
.count(clone_oid
)) {
6638 dout(10) << __func__
<< " cannot evict head, pending promote on clone "
6639 << clone_oid
<< dendl
;
6646 inline int PrimaryLogPG::_delete_oid(
6648 bool no_whiteout
, // no whiteouts, no matter what.
6649 bool try_no_whiteout
) // try not to whiteout
6651 SnapSet
& snapset
= ctx
->new_snapset
;
6652 ObjectState
& obs
= ctx
->new_obs
;
6653 object_info_t
& oi
= obs
.oi
;
6654 const hobject_t
& soid
= oi
.soid
;
6655 PGTransaction
* t
= ctx
->op_t
.get();
6657 // cache: cache: set whiteout on delete?
6658 bool whiteout
= false;
6659 if (pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_NONE
6661 && !try_no_whiteout
) {
6665 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
6667 // in luminous or later, we can't delete the head if there are
6668 // clones. we trust the caller passing no_whiteout has already
6669 // verified they don't exist.
6670 if (!snapset
.clones
.empty() ||
6671 (!ctx
->snapc
.snaps
.empty() && ctx
->snapc
.snaps
[0] > snapset
.seq
)) {
6673 dout(20) << __func__
<< " has or will have clones but no_whiteout=1"
6676 dout(20) << __func__
<< " has or will have clones; will whiteout"
6684 dout(20) << __func__
<< " " << soid
<< " whiteout=" << (int)whiteout
6685 << " no_whiteout=" << (int)no_whiteout
6686 << " try_no_whiteout=" << (int)try_no_whiteout
6688 if (!obs
.exists
|| (obs
.oi
.is_whiteout() && whiteout
))
6694 interval_set
<uint64_t> ch
;
6695 ch
.insert(0, oi
.size
);
6696 ctx
->modified_ranges
.union_of(ch
);
6699 ctx
->delta_stats
.num_wr
++;
6700 if (soid
.is_snap()) {
6701 assert(ctx
->obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
6702 ctx
->delta_stats
.num_bytes
-= ctx
->obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
6704 ctx
->delta_stats
.num_bytes
-= oi
.size
;
6709 // disconnect all watchers
6710 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
6711 oi
.watchers
.begin();
6712 p
!= oi
.watchers
.end();
6714 dout(20) << __func__
<< " will disconnect watcher " << p
->first
<< dendl
;
6715 ctx
->watch_disconnects
.push_back(
6716 watch_disconnect_t(p
->first
.first
, p
->first
.second
, true));
6718 oi
.watchers
.clear();
6721 dout(20) << __func__
<< " setting whiteout on " << soid
<< dendl
;
6722 oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
6723 ctx
->delta_stats
.num_whiteouts
++;
6725 osd
->logger
->inc(l_osd_tier_whiteout
);
6730 ctx
->delta_stats
.num_objects
--;
6732 ctx
->delta_stats
.num_object_clones
--;
6733 if (oi
.is_whiteout()) {
6734 dout(20) << __func__
<< " deleting whiteout on " << soid
<< dendl
;
6735 ctx
->delta_stats
.num_whiteouts
--;
6736 oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
6738 if (oi
.is_cache_pinned()) {
6739 ctx
->delta_stats
.num_objects_pinned
--;
6741 if ((legacy
|| snapset
.is_legacy()) && soid
.is_head()) {
6742 snapset
.head_exists
= false;
6748 int PrimaryLogPG::_rollback_to(OpContext
*ctx
, ceph_osd_op
& op
)
6750 SnapSet
& snapset
= ctx
->new_snapset
;
6751 ObjectState
& obs
= ctx
->new_obs
;
6752 object_info_t
& oi
= obs
.oi
;
6753 const hobject_t
& soid
= oi
.soid
;
6754 PGTransaction
* t
= ctx
->op_t
.get();
6755 snapid_t snapid
= (uint64_t)op
.snap
.snapid
;
6756 hobject_t missing_oid
;
6758 dout(10) << "_rollback_to " << soid
<< " snapid " << snapid
<< dendl
;
6760 ObjectContextRef rollback_to
;
6761 int ret
= find_object_context(
6762 hobject_t(soid
.oid
, soid
.get_key(), snapid
, soid
.get_hash(), info
.pgid
.pool(),
6763 soid
.get_namespace()),
6764 &rollback_to
, false, false, &missing_oid
);
6765 if (ret
== -EAGAIN
) {
6766 /* clone must be missing */
6767 assert(is_missing_object(missing_oid
));
6768 dout(20) << "_rollback_to attempted to roll back to a missing object "
6769 << missing_oid
<< " (requested snapid: ) " << snapid
<< dendl
;
6770 block_write_on_degraded_snap(missing_oid
, ctx
->op
);
6774 ObjectContextRef promote_obc
;
6775 cache_result_t tier_mode_result
;
6776 if (obs
.exists
&& obs
.oi
.has_manifest()) {
6778 maybe_handle_manifest_detail(
6784 maybe_handle_cache_detail(
6794 switch (tier_mode_result
) {
6795 case cache_result_t::NOOP
:
6797 case cache_result_t::BLOCKED_PROMOTE
:
6798 assert(promote_obc
);
6799 block_write_on_snap_rollback(soid
, promote_obc
, ctx
->op
);
6801 case cache_result_t::BLOCKED_FULL
:
6802 block_write_on_full_cache(soid
, ctx
->op
);
6805 assert(0 == "must promote was set, other values are not valid");
6810 if (ret
== -ENOENT
|| (rollback_to
&& rollback_to
->obs
.oi
.is_whiteout())) {
6811 // there's no snapshot here, or there's no object.
6812 // if there's no snapshot, we delete the object; otherwise, do nothing.
6813 dout(20) << "_rollback_to deleting head on " << soid
.oid
6814 << " because got ENOENT|whiteout on find_object_context" << dendl
;
6815 if (ctx
->obc
->obs
.oi
.watchers
.size()) {
6816 // Cannot delete an object with watchers
6819 _delete_oid(ctx
, false, false);
6823 // ummm....huh? It *can't* return anything else at time of writing.
6824 assert(0 == "unexpected error code in _rollback_to");
6825 } else { //we got our context, let's use it to do the rollback!
6826 hobject_t
& rollback_to_sobject
= rollback_to
->obs
.oi
.soid
;
6827 if (is_degraded_or_backfilling_object(rollback_to_sobject
)) {
6828 dout(20) << "_rollback_to attempted to roll back to a degraded object "
6829 << rollback_to_sobject
<< " (requested snapid: ) " << snapid
<< dendl
;
6830 block_write_on_degraded_snap(rollback_to_sobject
, ctx
->op
);
6832 } else if (rollback_to
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
) {
6833 // rolling back to the head; we just need to clone it.
6836 /* 1) Delete current head
6837 * 2) Clone correct snapshot into head
6838 * 3) Calculate clone_overlaps by following overlaps
6839 * forward from rollback snapshot */
6840 dout(10) << "_rollback_to deleting " << soid
.oid
6841 << " and rolling back to old snap" << dendl
;
6846 t
->clone(soid
, rollback_to_sobject
);
6847 snapset
.head_exists
= true;
6848 t
->add_obc(rollback_to
);
6850 map
<snapid_t
, interval_set
<uint64_t> >::iterator iter
=
6851 snapset
.clone_overlap
.lower_bound(snapid
);
6852 interval_set
<uint64_t> overlaps
= iter
->second
;
6853 assert(iter
!= snapset
.clone_overlap
.end());
6855 iter
!= snapset
.clone_overlap
.end();
6857 overlaps
.intersection_of(iter
->second
);
6859 if (obs
.oi
.size
> 0) {
6860 interval_set
<uint64_t> modified
;
6861 modified
.insert(0, obs
.oi
.size
);
6862 overlaps
.intersection_of(modified
);
6863 modified
.subtract(overlaps
);
6864 ctx
->modified_ranges
.union_of(modified
);
6867 // Adjust the cached objectcontext
6868 maybe_create_new_object(ctx
, true);
6869 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
6870 ctx
->delta_stats
.num_bytes
+= rollback_to
->obs
.oi
.size
;
6871 obs
.oi
.size
= rollback_to
->obs
.oi
.size
;
6872 if (rollback_to
->obs
.oi
.is_data_digest())
6873 obs
.oi
.set_data_digest(rollback_to
->obs
.oi
.data_digest
);
6875 obs
.oi
.clear_data_digest();
6876 if (rollback_to
->obs
.oi
.is_omap_digest())
6877 obs
.oi
.set_omap_digest(rollback_to
->obs
.oi
.omap_digest
);
6879 obs
.oi
.clear_omap_digest();
6881 if (rollback_to
->obs
.oi
.is_omap()) {
6882 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
6883 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
6885 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
6886 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
6889 snapset
.head_exists
= true;
6895 void PrimaryLogPG::_make_clone(
6898 ObjectContextRef obc
,
6899 const hobject_t
& head
, const hobject_t
& coid
,
6903 ::encode(*poi
, bv
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
6905 t
->clone(coid
, head
);
6906 setattr_maybe_cache(obc
, ctx
, t
, OI_ATTR
, bv
);
6907 rmattr_maybe_cache(obc
, ctx
, t
, SS_ATTR
);
6910 void PrimaryLogPG::make_writeable(OpContext
*ctx
)
6912 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
6913 SnapContext
& snapc
= ctx
->snapc
;
6916 assert(soid
.snap
== CEPH_NOSNAP
);
6917 dout(20) << "make_writeable " << soid
<< " snapset=" << ctx
->new_snapset
6918 << " snapc=" << snapc
<< dendl
;
6920 bool was_dirty
= ctx
->obc
->obs
.oi
.is_dirty();
6921 if (ctx
->new_obs
.exists
) {
6922 // we will mark the object dirty
6923 if (ctx
->undirty
&& was_dirty
) {
6924 dout(20) << " clearing DIRTY flag" << dendl
;
6925 assert(ctx
->new_obs
.oi
.is_dirty());
6926 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
6927 --ctx
->delta_stats
.num_objects_dirty
;
6928 osd
->logger
->inc(l_osd_tier_clean
);
6929 } else if (!was_dirty
&& !ctx
->undirty
) {
6930 dout(20) << " setting DIRTY flag" << dendl
;
6931 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_DIRTY
);
6932 ++ctx
->delta_stats
.num_objects_dirty
;
6933 osd
->logger
->inc(l_osd_tier_dirty
);
6937 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl
;
6938 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
6939 --ctx
->delta_stats
.num_objects_dirty
;
6943 if ((ctx
->new_obs
.exists
&&
6944 ctx
->new_obs
.oi
.is_omap()) &&
6945 (!ctx
->obc
->obs
.exists
||
6946 !ctx
->obc
->obs
.oi
.is_omap())) {
6947 ++ctx
->delta_stats
.num_objects_omap
;
6949 if ((!ctx
->new_obs
.exists
||
6950 !ctx
->new_obs
.oi
.is_omap()) &&
6951 (ctx
->obc
->obs
.exists
&&
6952 ctx
->obc
->obs
.oi
.is_omap())) {
6953 --ctx
->delta_stats
.num_objects_omap
;
6957 if (ctx
->new_snapset
.seq
> snapc
.seq
) {
6958 snapc
.seq
= ctx
->new_snapset
.seq
;
6959 snapc
.snaps
= ctx
->new_snapset
.snaps
;
6960 filter_snapc(snapc
.snaps
);
6961 dout(10) << " using newer snapc " << snapc
<< dendl
;
6964 if ((ctx
->obs
->exists
&& !ctx
->obs
->oi
.is_whiteout()) && // head exist(ed)
6965 snapc
.snaps
.size() && // there are snaps
6966 !ctx
->cache_evict
&&
6967 snapc
.snaps
[0] > ctx
->new_snapset
.seq
) { // existing object is old
6969 hobject_t coid
= soid
;
6970 coid
.snap
= snapc
.seq
;
6973 for (l
=1; l
<snapc
.snaps
.size() && snapc
.snaps
[l
] > ctx
->new_snapset
.seq
; l
++) ;
6975 vector
<snapid_t
> snaps(l
);
6976 for (unsigned i
=0; i
<l
; i
++)
6977 snaps
[i
] = snapc
.snaps
[i
];
6980 object_info_t
static_snap_oi(coid
);
6981 object_info_t
*snap_oi
;
6983 ctx
->clone_obc
= object_contexts
.lookup_or_create(static_snap_oi
.soid
);
6984 ctx
->clone_obc
->destructor_callback
= new C_PG_ObjectContext(this, ctx
->clone_obc
.get());
6985 ctx
->clone_obc
->obs
.oi
= static_snap_oi
;
6986 ctx
->clone_obc
->obs
.exists
= true;
6987 ctx
->clone_obc
->ssc
= ctx
->obc
->ssc
;
6988 ctx
->clone_obc
->ssc
->ref
++;
6989 if (pool
.info
.require_rollback())
6990 ctx
->clone_obc
->attr_cache
= ctx
->obc
->attr_cache
;
6991 snap_oi
= &ctx
->clone_obc
->obs
.oi
;
6992 bool got
= ctx
->lock_manager
.get_write_greedy(
6997 dout(20) << " got greedy write on clone_obc " << *ctx
->clone_obc
<< dendl
;
6999 snap_oi
= &static_snap_oi
;
7001 snap_oi
->version
= ctx
->at_version
;
7002 snap_oi
->prior_version
= ctx
->obs
->oi
.version
;
7003 snap_oi
->copy_user_bits(ctx
->obs
->oi
);
7005 bool legacy
= ctx
->new_snapset
.is_legacy() ||
7006 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
7008 snap_oi
->legacy_snaps
= snaps
;
7011 _make_clone(ctx
, ctx
->op_t
.get(), ctx
->clone_obc
, soid
, coid
, snap_oi
);
7013 ctx
->delta_stats
.num_objects
++;
7014 if (snap_oi
->is_dirty()) {
7015 ctx
->delta_stats
.num_objects_dirty
++;
7016 osd
->logger
->inc(l_osd_tier_dirty
);
7018 if (snap_oi
->is_omap())
7019 ctx
->delta_stats
.num_objects_omap
++;
7020 if (snap_oi
->is_cache_pinned())
7021 ctx
->delta_stats
.num_objects_pinned
++;
7022 ctx
->delta_stats
.num_object_clones
++;
7023 ctx
->new_snapset
.clones
.push_back(coid
.snap
);
7024 ctx
->new_snapset
.clone_size
[coid
.snap
] = ctx
->obs
->oi
.size
;
7026 ctx
->new_snapset
.clone_snaps
[coid
.snap
] = snaps
;
7029 // clone_overlap should contain an entry for each clone
7030 // (an empty interval_set if there is no overlap)
7031 ctx
->new_snapset
.clone_overlap
[coid
.snap
];
7032 if (ctx
->obs
->oi
.size
)
7033 ctx
->new_snapset
.clone_overlap
[coid
.snap
].insert(0, ctx
->obs
->oi
.size
);
7036 dout(10) << " cloning v " << ctx
->obs
->oi
.version
7037 << " to " << coid
<< " v " << ctx
->at_version
7038 << " snaps=" << snaps
7039 << " snapset=" << ctx
->new_snapset
<< dendl
;
7040 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::CLONE
, coid
, ctx
->at_version
,
7041 ctx
->obs
->oi
.version
,
7042 ctx
->obs
->oi
.user_version
,
7043 osd_reqid_t(), ctx
->new_obs
.oi
.mtime
, 0));
7044 ::encode(snaps
, ctx
->log
.back().snaps
);
7046 ctx
->at_version
.version
++;
7049 // update most recent clone_overlap and usage stats
7050 if (ctx
->new_snapset
.clones
.size() > 0) {
7051 /* we need to check whether the most recent clone exists, if it's been evicted,
7052 * it's not included in the stats */
7053 hobject_t last_clone_oid
= soid
;
7054 last_clone_oid
.snap
= ctx
->new_snapset
.clone_overlap
.rbegin()->first
;
7055 if (is_present_clone(last_clone_oid
)) {
7056 interval_set
<uint64_t> &newest_overlap
= ctx
->new_snapset
.clone_overlap
.rbegin()->second
;
7057 ctx
->modified_ranges
.intersection_of(newest_overlap
);
7058 // modified_ranges is still in use by the clone
7059 add_interval_usage(ctx
->modified_ranges
, ctx
->delta_stats
);
7060 newest_overlap
.subtract(ctx
->modified_ranges
);
7064 // update snapset with latest snap context
7065 ctx
->new_snapset
.seq
= snapc
.seq
;
7066 ctx
->new_snapset
.snaps
= snapc
.snaps
;
7067 if (get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
) {
7068 // pessimistic assumption that this is a net-new legacy SnapSet
7069 ctx
->delta_stats
.num_legacy_snapsets
++;
7070 ctx
->new_snapset
.head_exists
= ctx
->new_obs
.exists
;
7071 } else if (ctx
->new_snapset
.is_legacy()) {
7072 ctx
->new_snapset
.head_exists
= ctx
->new_obs
.exists
;
7074 dout(20) << "make_writeable " << soid
7075 << " done, snapset=" << ctx
->new_snapset
<< dendl
;
7079 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t
& delta_stats
, object_info_t
& oi
,
7080 interval_set
<uint64_t>& modified
, uint64_t offset
,
7081 uint64_t length
, bool write_full
)
7083 interval_set
<uint64_t> ch
;
7086 ch
.insert(0, oi
.size
);
7088 ch
.insert(offset
, length
);
7089 modified
.union_of(ch
);
7090 if (write_full
|| offset
+ length
> oi
.size
) {
7091 uint64_t new_size
= offset
+ length
;
7092 delta_stats
.num_bytes
-= oi
.size
;
7093 delta_stats
.num_bytes
+= new_size
;
7096 delta_stats
.num_wr
++;
7097 delta_stats
.num_wr_kb
+= SHIFT_ROUND_UP(length
, 10);
7100 void PrimaryLogPG::add_interval_usage(interval_set
<uint64_t>& s
, object_stat_sum_t
& delta_stats
)
7102 for (interval_set
<uint64_t>::const_iterator p
= s
.begin(); p
!= s
.end(); ++p
) {
7103 delta_stats
.num_bytes
+= p
.get_len();
7107 void PrimaryLogPG::complete_disconnect_watches(
7108 ObjectContextRef obc
,
7109 const list
<watch_disconnect_t
> &to_disconnect
)
7111 for (list
<watch_disconnect_t
>::const_iterator i
=
7112 to_disconnect
.begin();
7113 i
!= to_disconnect
.end();
7115 pair
<uint64_t, entity_name_t
> watcher(i
->cookie
, i
->name
);
7116 auto watchers_entry
= obc
->watchers
.find(watcher
);
7117 if (watchers_entry
!= obc
->watchers
.end()) {
7118 WatchRef watch
= watchers_entry
->second
;
7119 dout(10) << "do_osd_op_effects disconnect watcher " << watcher
<< dendl
;
7120 obc
->watchers
.erase(watcher
);
7121 watch
->remove(i
->send_disconnect
);
7123 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7124 << watcher
<< dendl
;
7129 void PrimaryLogPG::do_osd_op_effects(OpContext
*ctx
, const ConnectionRef
& conn
)
7131 entity_name_t entity
= ctx
->reqid
.name
;
7132 dout(15) << "do_osd_op_effects " << entity
<< " con " << conn
.get() << dendl
;
7134 // disconnects first
7135 complete_disconnect_watches(ctx
->obc
, ctx
->watch_disconnects
);
7139 boost::intrusive_ptr
<Session
> session((Session
*)conn
->get_priv());
7142 session
->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7144 for (list
<pair
<watch_info_t
,bool> >::iterator i
= ctx
->watch_connects
.begin();
7145 i
!= ctx
->watch_connects
.end();
7147 pair
<uint64_t, entity_name_t
> watcher(i
->first
.cookie
, entity
);
7148 dout(15) << "do_osd_op_effects applying watch connect on session "
7149 << session
.get() << " watcher " << watcher
<< dendl
;
7151 if (ctx
->obc
->watchers
.count(watcher
)) {
7152 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7154 watch
= ctx
->obc
->watchers
[watcher
];
7156 dout(15) << "do_osd_op_effects new watcher " << watcher
7158 watch
= Watch::makeWatchRef(
7159 this, osd
, ctx
->obc
, i
->first
.timeout_seconds
,
7160 i
->first
.cookie
, entity
, conn
->get_peer_addr());
7161 ctx
->obc
->watchers
.insert(
7166 watch
->connect(conn
, i
->second
);
7169 for (list
<notify_info_t
>::iterator p
= ctx
->notifies
.begin();
7170 p
!= ctx
->notifies
.end();
7172 dout(10) << "do_osd_op_effects, notify " << *p
<< dendl
;
7173 ConnectionRef
conn(ctx
->op
->get_req()->get_connection());
7175 Notify::makeNotifyRef(
7177 ctx
->reqid
.name
.num(),
7182 ctx
->obc
->obs
.oi
.user_version
,
7184 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
7185 ctx
->obc
->watchers
.begin();
7186 i
!= ctx
->obc
->watchers
.end();
7188 dout(10) << "starting notify on watch " << i
->first
<< dendl
;
7189 i
->second
->start_notify(notif
);
7194 for (list
<OpContext::NotifyAck
>::iterator p
= ctx
->notify_acks
.begin();
7195 p
!= ctx
->notify_acks
.end();
7197 if (p
->watch_cookie
)
7198 dout(10) << "notify_ack " << make_pair(p
->watch_cookie
.get(), p
->notify_id
) << dendl
;
7200 dout(10) << "notify_ack " << make_pair("NULL", p
->notify_id
) << dendl
;
7201 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator i
=
7202 ctx
->obc
->watchers
.begin();
7203 i
!= ctx
->obc
->watchers
.end();
7205 if (i
->first
.second
!= entity
) continue;
7206 if (p
->watch_cookie
&&
7207 p
->watch_cookie
.get() != i
->first
.first
) continue;
7208 dout(10) << "acking notify on watch " << i
->first
<< dendl
;
7209 i
->second
->notify_ack(p
->notify_id
, p
->reply_bl
);
7214 hobject_t
PrimaryLogPG::generate_temp_object(const hobject_t
& target
)
7217 ss
<< "temp_" << info
.pgid
<< "_" << get_role()
7218 << "_" << osd
->monc
->get_global_id() << "_" << (++temp_seq
);
7219 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
7220 dout(20) << __func__
<< " " << hoid
<< dendl
;
7224 hobject_t
PrimaryLogPG::get_temp_recovery_object(
7225 const hobject_t
& target
,
7229 ss
<< "temp_recovering_" << info
.pgid
// (note this includes the shardid)
7231 << "_" << info
.history
.same_interval_since
7232 << "_" << target
.snap
;
7233 // pgid + version + interval + snapid is unique, and short
7234 hobject_t hoid
= target
.make_temp_hobject(ss
.str());
7235 dout(20) << __func__
<< " " << hoid
<< dendl
;
7239 int PrimaryLogPG::prepare_transaction(OpContext
*ctx
)
7241 assert(!ctx
->ops
.empty());
7243 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
7245 // valid snap context?
7246 if (!ctx
->snapc
.is_valid()) {
7247 dout(10) << " invalid snapc " << ctx
->snapc
<< dendl
;
7251 // prepare the actual mutation
7252 int result
= do_osd_ops(ctx
, ctx
->ops
);
7254 if (ctx
->op
->may_write() &&
7255 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
7256 // need to save the error code in the pg log, to detect dup ops,
7257 // but do nothing else
7258 ctx
->update_log_only
= true;
7263 // read-op? write-op noop? done?
7264 if (ctx
->op_t
->empty() && !ctx
->modify
) {
7265 unstable_stats
.add(ctx
->delta_stats
);
7266 if (ctx
->op
->may_write() &&
7267 get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
7268 ctx
->update_log_only
= true;
7274 if ((ctx
->delta_stats
.num_bytes
> 0 ||
7275 ctx
->delta_stats
.num_objects
> 0) && // FIXME: keys?
7276 (pool
.info
.has_flag(pg_pool_t::FLAG_FULL
) ||
7277 get_osdmap()->test_flag(CEPH_OSDMAP_FULL
))) {
7278 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
7279 if (ctx
->reqid
.name
.is_mds() || // FIXME: ignore MDS for now
7280 m
->has_flag(CEPH_OSD_FLAG_FULL_FORCE
)) {
7281 dout(20) << __func__
<< " full, but proceeding due to FULL_FORCE or MDS"
7283 } else if (m
->has_flag(CEPH_OSD_FLAG_FULL_TRY
)) {
7284 // they tried, they failed.
7285 dout(20) << __func__
<< " full, replying to FULL_TRY op" << dendl
;
7286 return pool
.info
.has_flag(pg_pool_t::FLAG_FULL
) ? -EDQUOT
: -ENOSPC
;
7289 dout(20) << __func__
<< " full, dropping request (bad client)" << dendl
;
7294 // clone, if necessary
7295 if (soid
.snap
== CEPH_NOSNAP
)
7296 make_writeable(ctx
);
7299 ctx
->new_obs
.exists
? pg_log_entry_t::MODIFY
:
7300 pg_log_entry_t::DELETE
);
7305 void PrimaryLogPG::finish_ctx(OpContext
*ctx
, int log_op_type
, bool maintain_ssc
)
7307 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
7308 dout(20) << __func__
<< " " << soid
<< " " << ctx
7309 << " op " << pg_log_entry_t::get_op_name(log_op_type
)
7311 utime_t now
= ceph_clock_now();
7316 if (soid
.snap
== CEPH_NOSNAP
&& maintain_ssc
) {
7317 ::encode(ctx
->new_snapset
, bss
);
7318 assert(ctx
->new_obs
.exists
== ctx
->new_snapset
.head_exists
||
7319 !ctx
->new_snapset
.is_legacy());
7321 if (ctx
->new_obs
.exists
) {
7322 if (!ctx
->obs
->exists
) {
7323 if (ctx
->snapset_obc
&& ctx
->snapset_obc
->obs
.exists
) {
7324 hobject_t snapoid
= soid
.get_snapdir();
7325 dout(10) << " removing unneeded snapdir " << snapoid
<< dendl
;
7326 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::DELETE
, snapoid
,
7328 ctx
->snapset_obc
->obs
.oi
.version
,
7329 0, osd_reqid_t(), ctx
->mtime
, 0));
7330 ctx
->op_t
->remove(snapoid
);
7332 ctx
->at_version
.version
++;
7334 ctx
->snapset_obc
->obs
.exists
= false;
7337 } else if (!ctx
->new_snapset
.clones
.empty() &&
7338 !ctx
->cache_evict
&&
7339 !ctx
->new_snapset
.head_exists
&&
7340 (!ctx
->snapset_obc
|| !ctx
->snapset_obc
->obs
.exists
)) {
7341 // save snapset on _snap
7342 hobject_t
snapoid(soid
.oid
, soid
.get_key(), CEPH_SNAPDIR
, soid
.get_hash(),
7343 info
.pgid
.pool(), soid
.get_namespace());
7344 dout(10) << " final snapset " << ctx
->new_snapset
7345 << " in " << snapoid
<< dendl
;
7346 assert(get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
);
7347 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, snapoid
,
7350 0, osd_reqid_t(), ctx
->mtime
, 0));
7352 if (!ctx
->snapset_obc
)
7353 ctx
->snapset_obc
= get_object_context(snapoid
, true);
7355 if (ctx
->lock_type
== ObjectContext::RWState::RWWRITE
) {
7356 got
= ctx
->lock_manager
.get_write_greedy(
7361 assert(ctx
->lock_type
== ObjectContext::RWState::RWEXCL
);
7362 got
= ctx
->lock_manager
.get_lock_type(
7363 ObjectContext::RWState::RWEXCL
,
7369 dout(20) << " got greedy write on snapset_obc " << *ctx
->snapset_obc
<< dendl
;
7370 ctx
->snapset_obc
->obs
.exists
= true;
7371 ctx
->snapset_obc
->obs
.oi
.version
= ctx
->at_version
;
7372 ctx
->snapset_obc
->obs
.oi
.last_reqid
= ctx
->reqid
;
7373 ctx
->snapset_obc
->obs
.oi
.mtime
= ctx
->mtime
;
7374 ctx
->snapset_obc
->obs
.oi
.local_mtime
= now
;
7376 map
<string
, bufferlist
> attrs
;
7377 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
7378 ::encode(ctx
->snapset_obc
->obs
.oi
, bv
,
7379 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
7380 ctx
->op_t
->create(snapoid
);
7381 attrs
[OI_ATTR
].claim(bv
);
7382 attrs
[SS_ATTR
].claim(bss
);
7383 setattrs_maybe_cache(ctx
->snapset_obc
, ctx
, ctx
->op_t
.get(), attrs
);
7384 ctx
->at_version
.version
++;
7388 // finish and log the op.
7389 if (ctx
->user_modify
) {
7390 // update the user_version for any modify ops, except for the watch op
7391 ctx
->user_at_version
= MAX(info
.last_user_version
, ctx
->new_obs
.oi
.user_version
) + 1;
7392 /* In order for new clients and old clients to interoperate properly
7393 * when exchanging versions, we need to lower bound the user_version
7394 * (which our new clients pay proper attention to)
7395 * by the at_version (which is all the old clients can ever see). */
7396 if (ctx
->at_version
.version
> ctx
->user_at_version
)
7397 ctx
->user_at_version
= ctx
->at_version
.version
;
7398 ctx
->new_obs
.oi
.user_version
= ctx
->user_at_version
;
7400 ctx
->bytes_written
= ctx
->op_t
->get_bytes_written();
7402 if (ctx
->new_obs
.exists
) {
7403 // on the head object
7404 ctx
->new_obs
.oi
.version
= ctx
->at_version
;
7405 ctx
->new_obs
.oi
.prior_version
= ctx
->obs
->oi
.version
;
7406 ctx
->new_obs
.oi
.last_reqid
= ctx
->reqid
;
7407 if (ctx
->mtime
!= utime_t()) {
7408 ctx
->new_obs
.oi
.mtime
= ctx
->mtime
;
7409 dout(10) << " set mtime to " << ctx
->new_obs
.oi
.mtime
<< dendl
;
7410 ctx
->new_obs
.oi
.local_mtime
= now
;
7412 dout(10) << " mtime unchanged at " << ctx
->new_obs
.oi
.mtime
<< dendl
;
7415 map
<string
, bufferlist
> attrs
;
7416 bufferlist
bv(sizeof(ctx
->new_obs
.oi
));
7417 ::encode(ctx
->new_obs
.oi
, bv
,
7418 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
7419 attrs
[OI_ATTR
].claim(bv
);
7421 if (soid
.snap
== CEPH_NOSNAP
) {
7422 dout(10) << " final snapset " << ctx
->new_snapset
7423 << " in " << soid
<< dendl
;
7424 attrs
[SS_ATTR
].claim(bss
);
7426 dout(10) << " no snapset (this is a clone)" << dendl
;
7428 ctx
->op_t
->setattrs(soid
, attrs
);
7430 ctx
->new_obs
.oi
= object_info_t(ctx
->obc
->obs
.oi
.soid
);
7433 bool legacy_snapset
= ctx
->new_snapset
.is_legacy() ||
7434 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
7437 ctx
->log
.push_back(pg_log_entry_t(log_op_type
, soid
, ctx
->at_version
,
7438 ctx
->obs
->oi
.version
,
7439 ctx
->user_at_version
, ctx
->reqid
,
7441 if (soid
.snap
< CEPH_NOSNAP
) {
7442 switch (log_op_type
) {
7443 case pg_log_entry_t::MODIFY
:
7444 case pg_log_entry_t::PROMOTE
:
7445 case pg_log_entry_t::CLEAN
:
7446 if (legacy_snapset
) {
7447 dout(20) << __func__
<< " encoding legacy_snaps "
7448 << ctx
->new_obs
.oi
.legacy_snaps
7450 ::encode(ctx
->new_obs
.oi
.legacy_snaps
, ctx
->log
.back().snaps
);
7452 dout(20) << __func__
<< " encoding snaps from " << ctx
->new_snapset
7454 ::encode(ctx
->new_snapset
.clone_snaps
[soid
.snap
], ctx
->log
.back().snaps
);
7462 if (!ctx
->extra_reqids
.empty()) {
7463 dout(20) << __func__
<< " extra_reqids " << ctx
->extra_reqids
<< dendl
;
7464 ctx
->log
.back().extra_reqids
.swap(ctx
->extra_reqids
);
7467 // apply new object state.
7468 ctx
->obc
->obs
= ctx
->new_obs
;
7470 if (soid
.is_head() && !ctx
->obc
->obs
.exists
&&
7471 (!maintain_ssc
|| ctx
->cache_evict
)) {
7472 ctx
->obc
->ssc
->exists
= false;
7473 ctx
->obc
->ssc
->snapset
= SnapSet();
7475 ctx
->obc
->ssc
->exists
= true;
7476 ctx
->obc
->ssc
->snapset
= ctx
->new_snapset
;
7480 void PrimaryLogPG::apply_stats(
7481 const hobject_t
&soid
,
7482 const object_stat_sum_t
&delta_stats
) {
7484 info
.stats
.stats
.add(delta_stats
);
7486 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
7487 i
!= backfill_targets
.end();
7490 pg_info_t
& pinfo
= peer_info
[bt
];
7491 if (soid
<= pinfo
.last_backfill
)
7492 pinfo
.stats
.stats
.add(delta_stats
);
7493 else if (soid
<= last_backfill_started
)
7494 pending_backfill_updates
[soid
].stats
.add(delta_stats
);
7497 if (is_primary() && scrubber
.active
) {
7498 if (soid
< scrubber
.start
) {
7499 dout(20) << __func__
<< " " << soid
<< " < [" << scrubber
.start
7500 << "," << scrubber
.end
<< ")" << dendl
;
7501 scrub_cstat
.add(delta_stats
);
7503 dout(20) << __func__
<< " " << soid
<< " >= [" << scrubber
.start
7504 << "," << scrubber
.end
<< ")" << dendl
;
7509 void PrimaryLogPG::complete_read_ctx(int result
, OpContext
*ctx
)
7511 const MOSDOp
*m
= static_cast<const MOSDOp
*>(ctx
->op
->get_req());
7512 assert(ctx
->async_reads_complete());
7514 for (vector
<OSDOp
>::iterator p
= ctx
->ops
.begin();
7515 p
!= ctx
->ops
.end() && result
>= 0; ++p
) {
7516 if (p
->rval
< 0 && !(p
->op
.flags
& CEPH_OSD_OP_FLAG_FAILOK
)) {
7520 ctx
->bytes_read
+= p
->outdata
.length();
7522 ctx
->reply
->claim_op_out_data(ctx
->ops
);
7523 ctx
->reply
->get_header().data_off
= ctx
->data_off
;
7525 MOSDOpReply
*reply
= ctx
->reply
;
7526 ctx
->reply
= nullptr;
7529 if (!ctx
->ignore_log_op_stats
) {
7531 publish_stats_to_osd();
7534 // on read, return the current object version
7536 reply
->set_reply_versions(eversion_t(), ctx
->obs
->oi
.user_version
);
7538 reply
->set_reply_versions(eversion_t(), ctx
->user_at_version
);
7540 } else if (result
== -ENOENT
) {
7541 // on ENOENT, set a floor for what the next user version will be.
7542 reply
->set_enoent_reply_versions(info
.last_update
, info
.last_user_version
);
7545 reply
->set_result(result
);
7546 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
7547 osd
->send_message_osd_client(reply
, m
->get_connection());
7551 // ========================================================================
7554 struct C_Copyfrom
: public Context
{
7557 epoch_t last_peering_reset
;
7559 PrimaryLogPG::CopyOpRef cop
;
7560 C_Copyfrom(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
,
7561 const PrimaryLogPG::CopyOpRef
& c
)
7562 : pg(p
), oid(o
), last_peering_reset(lpr
),
7565 void finish(int r
) override
{
7566 if (r
== -ECANCELED
)
7569 if (last_peering_reset
== pg
->get_last_peering_reset()) {
7570 pg
->process_copy_chunk(oid
, tid
, r
);
7576 struct C_CopyFrom_AsyncReadCb
: public Context
{
7578 object_copy_data_t reply_obj
;
7581 C_CopyFrom_AsyncReadCb(OSDOp
*osd_op
, uint64_t features
) :
7582 osd_op(osd_op
), features(features
), len(0) {}
7583 void finish(int r
) override
{
7585 assert(len
<= reply_obj
.data
.length());
7587 bl
.substr_of(reply_obj
.data
, 0, len
);
7588 reply_obj
.data
.swap(bl
);
7589 ::encode(reply_obj
, osd_op
->outdata
, features
);
7593 int PrimaryLogPG::fill_in_copy_get(
7595 bufferlist::iterator
& bp
,
7597 ObjectContextRef
&obc
)
7599 object_info_t
& oi
= obc
->obs
.oi
;
7600 hobject_t
& soid
= oi
.soid
;
7602 object_copy_cursor_t cursor
;
7605 ::decode(cursor
, bp
);
7606 ::decode(out_max
, bp
);
7608 catch (buffer::error
& e
) {
7613 const MOSDOp
*op
= reinterpret_cast<const MOSDOp
*>(ctx
->op
->get_req());
7614 uint64_t features
= op
->get_features();
7616 bool async_read_started
= false;
7617 object_copy_data_t _reply_obj
;
7618 C_CopyFrom_AsyncReadCb
*cb
= NULL
;
7619 if (pool
.info
.require_rollback()) {
7620 cb
= new C_CopyFrom_AsyncReadCb(&osd_op
, features
);
7622 object_copy_data_t
&reply_obj
= cb
? cb
->reply_obj
: _reply_obj
;
7624 reply_obj
.size
= oi
.size
;
7625 reply_obj
.mtime
= oi
.mtime
;
7627 if (soid
.snap
< CEPH_NOSNAP
) {
7628 if (obc
->ssc
->snapset
.is_legacy()) {
7629 reply_obj
.snaps
= oi
.legacy_snaps
;
7631 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
7632 assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end()); // warn?
7633 reply_obj
.snaps
= p
->second
;
7636 reply_obj
.snap_seq
= obc
->ssc
->snapset
.seq
;
7638 if (oi
.is_data_digest()) {
7639 reply_obj
.flags
|= object_copy_data_t::FLAG_DATA_DIGEST
;
7640 reply_obj
.data_digest
= oi
.data_digest
;
7642 if (oi
.is_omap_digest()) {
7643 reply_obj
.flags
|= object_copy_data_t::FLAG_OMAP_DIGEST
;
7644 reply_obj
.omap_digest
= oi
.omap_digest
;
7646 reply_obj
.truncate_seq
= oi
.truncate_seq
;
7647 reply_obj
.truncate_size
= oi
.truncate_size
;
7650 map
<string
,bufferlist
>& out_attrs
= reply_obj
.attrs
;
7651 if (!cursor
.attr_complete
) {
7652 result
= getattrs_maybe_cache(
7662 cursor
.attr_complete
= true;
7663 dout(20) << " got attrs" << dendl
;
7666 int64_t left
= out_max
- osd_op
.outdata
.length();
7669 bufferlist
& bl
= reply_obj
.data
;
7670 if (left
> 0 && !cursor
.data_complete
) {
7671 if (cursor
.data_offset
< oi
.size
) {
7672 uint64_t max_read
= MIN(oi
.size
- cursor
.data_offset
, (uint64_t)left
);
7674 async_read_started
= true;
7675 ctx
->pending_async_reads
.push_back(
7677 boost::make_tuple(cursor
.data_offset
, max_read
, osd_op
.op
.flags
),
7678 make_pair(&bl
, cb
)));
7682 result
= pgbackend
->objects_read_sync(
7683 oi
.soid
, cursor
.data_offset
, left
, osd_op
.op
.flags
, &bl
);
7687 assert(result
<= left
);
7689 cursor
.data_offset
+= result
;
7691 if (cursor
.data_offset
== oi
.size
) {
7692 cursor
.data_complete
= true;
7693 dout(20) << " got data" << dendl
;
7695 assert(cursor
.data_offset
<= oi
.size
);
7699 uint32_t omap_keys
= 0;
7700 if (!pool
.info
.supports_omap() || !oi
.is_omap()) {
7701 cursor
.omap_complete
= true;
7703 if (left
> 0 && !cursor
.omap_complete
) {
7704 assert(cursor
.data_complete
);
7705 if (cursor
.omap_offset
.empty()) {
7706 osd
->store
->omap_get_header(ch
, ghobject_t(oi
.soid
),
7707 &reply_obj
.omap_header
);
7709 bufferlist omap_data
;
7710 ObjectMap::ObjectMapIterator iter
=
7711 osd
->store
->get_omap_iterator(coll
, ghobject_t(oi
.soid
));
7713 iter
->upper_bound(cursor
.omap_offset
);
7714 for (; iter
->valid(); iter
->next(false)) {
7716 ::encode(iter
->key(), omap_data
);
7717 ::encode(iter
->value(), omap_data
);
7718 left
-= iter
->key().length() + 4 + iter
->value().length() + 4;
7723 ::encode(omap_keys
, reply_obj
.omap_data
);
7724 reply_obj
.omap_data
.claim_append(omap_data
);
7726 if (iter
->valid()) {
7727 cursor
.omap_offset
= iter
->key();
7729 cursor
.omap_complete
= true;
7730 dout(20) << " got omap" << dendl
;
7735 if (cursor
.is_complete()) {
7736 // include reqids only in the final step. this is a bit fragile
7738 pg_log
.get_log().get_object_reqids(ctx
->obc
->obs
.oi
.soid
, 10, &reply_obj
.reqids
);
7739 dout(20) << " got reqids" << dendl
;
7742 dout(20) << " cursor.is_complete=" << cursor
.is_complete()
7743 << " " << out_attrs
.size() << " attrs"
7744 << " " << bl
.length() << " bytes"
7745 << " " << reply_obj
.omap_header
.length() << " omap header bytes"
7746 << " " << reply_obj
.omap_data
.length() << " omap data bytes in "
7747 << omap_keys
<< " keys"
7748 << " " << reply_obj
.reqids
.size() << " reqids"
7750 reply_obj
.cursor
= cursor
;
7751 if (!async_read_started
) {
7752 ::encode(reply_obj
, osd_op
.outdata
, features
);
7754 if (cb
&& !async_read_started
) {
7761 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef
& op
, hobject_t oid
,
7764 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7765 // be careful not to modify anything else that will upset a racing
7767 MOSDOp
*m
= static_cast<MOSDOp
*>(op
->get_nonconst_req());
7768 uint64_t features
= m
->get_features();
7769 object_copy_data_t reply_obj
;
7771 pg_log
.get_log().get_object_reqids(oid
, 10, &reply_obj
.reqids
);
7772 dout(20) << __func__
<< " got reqids " << reply_obj
.reqids
<< dendl
;
7773 ::encode(reply_obj
, osd_op
.outdata
, features
);
7774 osd_op
.rval
= -ENOENT
;
7775 MOSDOpReply
*reply
= new MOSDOpReply(m
, 0, get_osdmap()->get_epoch(), 0, false);
7776 reply
->claim_op_out_data(m
->ops
);
7777 reply
->set_result(-ENOENT
);
7778 reply
->add_flags(CEPH_OSD_FLAG_ACK
| CEPH_OSD_FLAG_ONDISK
);
7779 osd
->send_message_osd_client(reply
, m
->get_connection());
7782 void PrimaryLogPG::start_copy(CopyCallback
*cb
, ObjectContextRef obc
,
7783 hobject_t src
, object_locator_t oloc
,
7784 version_t version
, unsigned flags
,
7785 bool mirror_snapset
,
7786 unsigned src_obj_fadvise_flags
,
7787 unsigned dest_obj_fadvise_flags
)
7789 const hobject_t
& dest
= obc
->obs
.oi
.soid
;
7790 dout(10) << __func__
<< " " << dest
7791 << " from " << src
<< " " << oloc
<< " v" << version
7792 << " flags " << flags
7793 << (mirror_snapset
? " mirror_snapset" : "")
7796 assert(!mirror_snapset
|| (src
.snap
== CEPH_NOSNAP
||
7797 src
.snap
== CEPH_SNAPDIR
));
7799 // cancel a previous in-progress copy?
7800 if (copy_ops
.count(dest
)) {
7801 // FIXME: if the src etc match, we could avoid restarting from the
7803 CopyOpRef cop
= copy_ops
[dest
];
7804 cancel_copy(cop
, false);
7807 CopyOpRef
cop(std::make_shared
<CopyOp
>(cb
, obc
, src
, oloc
, version
, flags
,
7808 mirror_snapset
, src_obj_fadvise_flags
,
7809 dest_obj_fadvise_flags
));
7810 copy_ops
[dest
] = cop
;
7813 _copy_some(obc
, cop
);
7816 void PrimaryLogPG::_copy_some(ObjectContextRef obc
, CopyOpRef cop
)
7818 dout(10) << __func__
<< " " << obc
<< " " << cop
<< dendl
;
7821 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_FLUSH
)
7822 flags
|= CEPH_OSD_FLAG_FLUSH
;
7823 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
)
7824 flags
|= CEPH_OSD_FLAG_IGNORE_CACHE
;
7825 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
)
7826 flags
|= CEPH_OSD_FLAG_IGNORE_OVERLAY
;
7827 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
)
7828 flags
|= CEPH_OSD_FLAG_MAP_SNAP_CLONE
;
7829 if (cop
->flags
& CEPH_OSD_COPY_FROM_FLAG_RWORDERED
)
7830 flags
|= CEPH_OSD_FLAG_RWORDERED
;
7832 C_GatherBuilder
gather(cct
);
7834 if (cop
->cursor
.is_initial() && cop
->mirror_snapset
) {
7836 assert(cop
->src
.snap
== CEPH_NOSNAP
);
7838 op
.list_snaps(&cop
->results
.snapset
, NULL
);
7839 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
7841 flags
, gather
.new_sub(), NULL
);
7842 cop
->objecter_tid2
= tid
;
7846 if (cop
->results
.user_version
) {
7847 op
.assert_version(cop
->results
.user_version
);
7849 // we should learn the version after the first chunk, if we didn't know
7851 assert(cop
->cursor
.is_initial());
7853 op
.copy_get(&cop
->cursor
, get_copy_chunk_size(),
7854 &cop
->results
.object_size
, &cop
->results
.mtime
,
7855 &cop
->attrs
, &cop
->data
, &cop
->omap_header
, &cop
->omap_data
,
7856 &cop
->results
.snaps
, &cop
->results
.snap_seq
,
7857 &cop
->results
.flags
,
7858 &cop
->results
.source_data_digest
,
7859 &cop
->results
.source_omap_digest
,
7860 &cop
->results
.reqids
,
7861 &cop
->results
.truncate_seq
,
7862 &cop
->results
.truncate_size
,
7864 op
.set_last_op_flags(cop
->src_obj_fadvise_flags
);
7866 C_Copyfrom
*fin
= new C_Copyfrom(this, obc
->obs
.oi
.soid
,
7867 get_last_peering_reset(), cop
);
7868 gather
.set_finisher(new C_OnFinisher(fin
,
7869 &osd
->objecter_finisher
));
7871 ceph_tid_t tid
= osd
->objecter
->read(cop
->src
.oid
, cop
->oloc
, op
,
7872 cop
->src
.snap
, NULL
,
7875 // discover the object version if we don't know it yet
7876 cop
->results
.user_version
? NULL
: &cop
->results
.user_version
);
7878 cop
->objecter_tid
= tid
;
7882 void PrimaryLogPG::process_copy_chunk(hobject_t oid
, ceph_tid_t tid
, int r
)
7884 dout(10) << __func__
<< " " << oid
<< " tid " << tid
7885 << " " << cpp_strerror(r
) << dendl
;
7886 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.find(oid
);
7887 if (p
== copy_ops
.end()) {
7888 dout(10) << __func__
<< " no copy_op found" << dendl
;
7891 CopyOpRef cop
= p
->second
;
7892 if (tid
!= cop
->objecter_tid
) {
7893 dout(10) << __func__
<< " tid " << tid
<< " != cop " << cop
7894 << " tid " << cop
->objecter_tid
<< dendl
;
7898 if (cop
->omap_data
.length() || cop
->omap_header
.length())
7899 cop
->results
.has_omap
= true;
7901 if (r
>= 0 && !pool
.info
.supports_omap() &&
7902 (cop
->omap_data
.length() || cop
->omap_header
.length())) {
7905 cop
->objecter_tid
= 0;
7906 cop
->objecter_tid2
= 0; // assume this ordered before us (if it happened)
7907 ObjectContextRef
& cobc
= cop
->obc
;
7912 assert(cop
->rval
>= 0);
7914 if (oid
.snap
< CEPH_NOSNAP
&& !cop
->results
.snaps
.empty()) {
7915 // verify snap hasn't been deleted
7916 vector
<snapid_t
>::iterator p
= cop
->results
.snaps
.begin();
7917 while (p
!= cop
->results
.snaps
.end()) {
7918 if (pool
.info
.is_removed_snap(*p
)) {
7919 dout(10) << __func__
<< " clone snap " << *p
<< " has been deleted"
7921 for (vector
<snapid_t
>::iterator q
= p
+ 1;
7922 q
!= cop
->results
.snaps
.end();
7925 cop
->results
.snaps
.resize(cop
->results
.snaps
.size() - 1);
7930 if (cop
->results
.snaps
.empty()) {
7931 dout(10) << __func__
<< " no more snaps for " << oid
<< dendl
;
7937 assert(cop
->rval
>= 0);
7939 if (!cop
->temp_cursor
.data_complete
) {
7940 cop
->results
.data_digest
= cop
->data
.crc32c(cop
->results
.data_digest
);
7942 if (pool
.info
.supports_omap() && !cop
->temp_cursor
.omap_complete
) {
7943 if (cop
->omap_header
.length()) {
7944 cop
->results
.omap_digest
=
7945 cop
->omap_header
.crc32c(cop
->results
.omap_digest
);
7947 if (cop
->omap_data
.length()) {
7949 keys
.substr_of(cop
->omap_data
, 4, cop
->omap_data
.length() - 4);
7950 cop
->results
.omap_digest
= keys
.crc32c(cop
->results
.omap_digest
);
7954 if (!cop
->temp_cursor
.attr_complete
) {
7955 for (map
<string
,bufferlist
>::iterator p
= cop
->attrs
.begin();
7956 p
!= cop
->attrs
.end();
7958 cop
->results
.attrs
[string("_") + p
->first
] = p
->second
;
7963 if (!cop
->cursor
.is_complete()) {
7964 // write out what we have so far
7965 if (cop
->temp_cursor
.is_initial()) {
7966 assert(!cop
->results
.started_temp_obj
);
7967 cop
->results
.started_temp_obj
= true;
7968 cop
->results
.temp_oid
= generate_temp_object(oid
);
7969 dout(20) << __func__
<< " using temp " << cop
->results
.temp_oid
<< dendl
;
7971 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
7972 OpContextUPtr ctx
= simple_opc_create(tempobc
);
7973 if (cop
->temp_cursor
.is_initial()) {
7974 ctx
->new_temp_oid
= cop
->results
.temp_oid
;
7976 _write_copy_chunk(cop
, ctx
->op_t
.get());
7977 simple_opc_submit(std::move(ctx
));
7978 dout(10) << __func__
<< " fetching more" << dendl
;
7979 _copy_some(cobc
, cop
);
7984 if (cop
->results
.is_data_digest() || cop
->results
.is_omap_digest()) {
7985 dout(20) << __func__
<< std::hex
7986 << " got digest: rx data 0x" << cop
->results
.data_digest
7987 << " omap 0x" << cop
->results
.omap_digest
7988 << ", source: data 0x" << cop
->results
.source_data_digest
7989 << " omap 0x" << cop
->results
.source_omap_digest
7991 << " flags " << cop
->results
.flags
7994 if (cop
->results
.is_data_digest() &&
7995 cop
->results
.data_digest
!= cop
->results
.source_data_digest
) {
7996 derr
<< __func__
<< std::hex
<< " data digest 0x" << cop
->results
.data_digest
7997 << " != source 0x" << cop
->results
.source_data_digest
<< std::dec
7999 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
8000 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
8001 << " data digest 0x" << cop
->results
.data_digest
8002 << " != source 0x" << cop
->results
.source_data_digest
8007 if (cop
->results
.is_omap_digest() &&
8008 cop
->results
.omap_digest
!= cop
->results
.source_omap_digest
) {
8009 derr
<< __func__
<< std::hex
8010 << " omap digest 0x" << cop
->results
.omap_digest
8011 << " != source 0x" << cop
->results
.source_omap_digest
8012 << std::dec
<< dendl
;
8013 osd
->clog
->error() << info
.pgid
<< " copy from " << cop
->src
8014 << " to " << cop
->obc
->obs
.oi
.soid
<< std::hex
8015 << " omap digest 0x" << cop
->results
.omap_digest
8016 << " != source 0x" << cop
->results
.source_omap_digest
8021 if (cct
->_conf
->osd_debug_inject_copyfrom_error
) {
8022 derr
<< __func__
<< " injecting copyfrom failure" << dendl
;
8027 cop
->results
.fill_in_final_tx
= std::function
<void(PGTransaction
*)>(
8028 [this, &cop
/* avoid ref cycle */](PGTransaction
*t
) {
8029 ObjectState
& obs
= cop
->obc
->obs
;
8030 if (cop
->temp_cursor
.is_initial()) {
8031 dout(20) << "fill_in_final_tx: writing "
8032 << "directly to final object" << dendl
;
8033 // write directly to final object
8034 cop
->results
.temp_oid
= obs
.oi
.soid
;
8035 _write_copy_chunk(cop
, t
);
8037 // finish writing to temp object, then move into place
8038 dout(20) << "fill_in_final_tx: writing to temp object" << dendl
;
8039 _write_copy_chunk(cop
, t
);
8040 t
->rename(obs
.oi
.soid
, cop
->results
.temp_oid
);
8042 t
->setattrs(obs
.oi
.soid
, cop
->results
.attrs
);
8045 dout(20) << __func__
<< " success; committing" << dendl
;
8048 dout(20) << __func__
<< " complete r = " << cpp_strerror(r
) << dendl
;
8049 CopyCallbackResults
results(r
, &cop
->results
);
8050 cop
->cb
->complete(results
);
8052 copy_ops
.erase(cobc
->obs
.oi
.soid
);
8055 if (r
< 0 && cop
->results
.started_temp_obj
) {
8056 dout(10) << __func__
<< " deleting partial temp object "
8057 << cop
->results
.temp_oid
<< dendl
;
8058 ObjectContextRef tempobc
= get_object_context(cop
->results
.temp_oid
, true);
8059 OpContextUPtr ctx
= simple_opc_create(tempobc
);
8060 ctx
->op_t
->remove(cop
->results
.temp_oid
);
8061 ctx
->discard_temp_oid
= cop
->results
.temp_oid
;
8062 simple_opc_submit(std::move(ctx
));
8065 // cancel and requeue proxy ops on this object
8067 for (map
<ceph_tid_t
, ProxyReadOpRef
>::iterator it
= proxyread_ops
.begin();
8068 it
!= proxyread_ops
.end();) {
8069 if (it
->second
->soid
== cobc
->obs
.oi
.soid
) {
8070 cancel_proxy_read((it
++)->second
);
8075 for (map
<ceph_tid_t
, ProxyWriteOpRef
>::iterator it
= proxywrite_ops
.begin();
8076 it
!= proxywrite_ops
.end();) {
8077 if (it
->second
->soid
== cobc
->obs
.oi
.soid
) {
8078 cancel_proxy_write((it
++)->second
);
8083 kick_proxy_ops_blocked(cobc
->obs
.oi
.soid
);
8086 kick_object_context_blocked(cobc
);
8089 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop
, PGTransaction
*t
)
8091 dout(20) << __func__
<< " " << cop
8092 << " " << cop
->attrs
.size() << " attrs"
8093 << " " << cop
->data
.length() << " bytes"
8094 << " " << cop
->omap_header
.length() << " omap header bytes"
8095 << " " << cop
->omap_data
.length() << " omap data bytes"
8097 if (!cop
->temp_cursor
.attr_complete
) {
8098 t
->create(cop
->results
.temp_oid
);
8100 if (!cop
->temp_cursor
.data_complete
) {
8101 assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
8102 cop
->cursor
.data_offset
);
8103 if (pool
.info
.requires_aligned_append() &&
8104 !cop
->cursor
.data_complete
) {
8106 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8107 * to pick it up on the next pass.
8109 assert(cop
->temp_cursor
.data_offset
%
8110 pool
.info
.required_alignment() == 0);
8111 if (cop
->data
.length() % pool
.info
.required_alignment() != 0) {
8113 cop
->data
.length() % pool
.info
.required_alignment();
8115 bl
.substr_of(cop
->data
, 0, cop
->data
.length() - to_trim
);
8117 cop
->cursor
.data_offset
-= to_trim
;
8118 assert(cop
->data
.length() + cop
->temp_cursor
.data_offset
==
8119 cop
->cursor
.data_offset
);
8122 if (cop
->data
.length()) {
8124 cop
->results
.temp_oid
,
8125 cop
->temp_cursor
.data_offset
,
8128 cop
->dest_obj_fadvise_flags
);
8132 if (pool
.info
.supports_omap()) {
8133 if (!cop
->temp_cursor
.omap_complete
) {
8134 if (cop
->omap_header
.length()) {
8136 cop
->results
.temp_oid
,
8138 cop
->omap_header
.clear();
8140 if (cop
->omap_data
.length()) {
8141 map
<string
,bufferlist
> omap
;
8142 bufferlist::iterator p
= cop
->omap_data
.begin();
8144 t
->omap_setkeys(cop
->results
.temp_oid
, omap
);
8145 cop
->omap_data
.clear();
8149 assert(cop
->omap_header
.length() == 0);
8150 assert(cop
->omap_data
.length() == 0);
8152 cop
->temp_cursor
= cop
->cursor
;
8155 void PrimaryLogPG::finish_copyfrom(OpContext
*ctx
)
8157 dout(20) << "finish_copyfrom on " << ctx
->obs
->oi
.soid
<< dendl
;
8158 ObjectState
& obs
= ctx
->new_obs
;
8159 CopyFromCallback
*cb
= static_cast<CopyFromCallback
*>(ctx
->copy_cb
);
8162 dout(20) << __func__
<< ": exists, removing" << dendl
;
8163 ctx
->op_t
->remove(obs
.oi
.soid
);
8165 ctx
->delta_stats
.num_objects
++;
8168 if (cb
->is_temp_obj_used()) {
8169 ctx
->discard_temp_oid
= cb
->results
->temp_oid
;
8171 cb
->results
->fill_in_final_tx(ctx
->op_t
.get());
8173 // CopyFromCallback fills this in for us
8174 obs
.oi
.user_version
= ctx
->user_at_version
;
8176 obs
.oi
.set_data_digest(cb
->results
->data_digest
);
8177 obs
.oi
.set_omap_digest(cb
->results
->omap_digest
);
8179 obs
.oi
.truncate_seq
= cb
->results
->truncate_seq
;
8180 obs
.oi
.truncate_size
= cb
->results
->truncate_size
;
8182 ctx
->extra_reqids
= cb
->results
->reqids
;
8184 // cache: clear whiteout?
8185 if (obs
.oi
.is_whiteout()) {
8186 dout(10) << __func__
<< " clearing whiteout on " << obs
.oi
.soid
<< dendl
;
8187 obs
.oi
.clear_flag(object_info_t::FLAG_WHITEOUT
);
8188 --ctx
->delta_stats
.num_whiteouts
;
8191 if (cb
->results
->has_omap
) {
8192 dout(10) << __func__
<< " setting omap flag on " << obs
.oi
.soid
<< dendl
;
8193 obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8195 dout(10) << __func__
<< " clearing omap flag on " << obs
.oi
.soid
<< dendl
;
8196 obs
.oi
.clear_flag(object_info_t::FLAG_OMAP
);
8199 interval_set
<uint64_t> ch
;
8200 if (obs
.oi
.size
> 0)
8201 ch
.insert(0, obs
.oi
.size
);
8202 ctx
->modified_ranges
.union_of(ch
);
8204 if (cb
->get_data_size() != obs
.oi
.size
) {
8205 ctx
->delta_stats
.num_bytes
-= obs
.oi
.size
;
8206 obs
.oi
.size
= cb
->get_data_size();
8207 ctx
->delta_stats
.num_bytes
+= obs
.oi
.size
;
8209 ctx
->delta_stats
.num_wr
++;
8210 ctx
->delta_stats
.num_wr_kb
+= SHIFT_ROUND_UP(obs
.oi
.size
, 10);
8212 osd
->logger
->inc(l_osd_copyfrom
);
8215 void PrimaryLogPG::finish_promote(int r
, CopyResults
*results
,
8216 ObjectContextRef obc
)
8218 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
8219 dout(10) << __func__
<< " " << soid
<< " r=" << r
8220 << " uv" << results
->user_version
<< dendl
;
8222 if (r
== -ECANCELED
) {
8226 if (r
!= -ENOENT
&& soid
.is_snap()) {
8227 if (results
->snaps
.empty()) {
8228 // we must have read "snap" content from the head object in
8229 // the base pool. use snap_seq to construct what snaps should
8230 // be for this clone (what is was before we evicted the clean
8231 // clone from this pool, and what it will be when we flush and
8232 // the clone eventually happens in the base pool).
8233 SnapSet
& snapset
= obc
->ssc
->snapset
;
8234 vector
<snapid_t
>::iterator p
= snapset
.snaps
.begin();
8235 while (p
!= snapset
.snaps
.end() && *p
> soid
.snap
)
8237 while (p
!= snapset
.snaps
.end() && *p
> results
->snap_seq
) {
8238 results
->snaps
.push_back(*p
);
8243 dout(20) << __func__
<< " snaps " << results
->snaps
<< dendl
;
8244 filter_snapc(results
->snaps
);
8246 dout(20) << __func__
<< " filtered snaps " << results
->snaps
<< dendl
;
8247 if (results
->snaps
.empty()) {
8248 dout(20) << __func__
8249 << " snaps are empty, clone is invalid,"
8250 << " setting r to ENOENT" << dendl
;
8255 if (r
< 0 && results
->started_temp_obj
) {
8256 dout(10) << __func__
<< " abort; will clean up partial work" << dendl
;
8257 ObjectContextRef tempobc
= get_object_context(results
->temp_oid
, false);
8259 OpContextUPtr ctx
= simple_opc_create(tempobc
);
8260 ctx
->op_t
->remove(results
->temp_oid
);
8261 simple_opc_submit(std::move(ctx
));
8262 results
->started_temp_obj
= false;
8265 if (r
== -ENOENT
&& soid
.is_snap()) {
8266 dout(10) << __func__
8267 << ": enoent while trying to promote clone, " << soid
8268 << " must have been trimmed, removing from snapset"
8270 hobject_t
head(soid
.get_head());
8271 ObjectContextRef obc
= get_object_context(head
, false);
8274 OpContextUPtr tctx
= simple_opc_create(obc
);
8275 tctx
->at_version
= get_next_version();
8276 filter_snapc(tctx
->new_snapset
.snaps
);
8277 vector
<snapid_t
> new_clones
;
8278 map
<snapid_t
, vector
<snapid_t
>> new_clone_snaps
;
8279 for (vector
<snapid_t
>::iterator i
= tctx
->new_snapset
.clones
.begin();
8280 i
!= tctx
->new_snapset
.clones
.end();
8282 if (*i
!= soid
.snap
) {
8283 new_clones
.push_back(*i
);
8284 auto p
= tctx
->new_snapset
.clone_snaps
.find(*i
);
8285 if (p
!= tctx
->new_snapset
.clone_snaps
.end()) {
8286 new_clone_snaps
[*i
] = p
->second
;
8290 tctx
->new_snapset
.clones
.swap(new_clones
);
8291 tctx
->new_snapset
.clone_overlap
.erase(soid
.snap
);
8292 tctx
->new_snapset
.clone_size
.erase(soid
.snap
);
8293 tctx
->new_snapset
.clone_snaps
.swap(new_clone_snaps
);
8295 // take RWWRITE lock for duration of our local write. ignore starvation.
8296 if (!tctx
->lock_manager
.take_write_lock(
8299 assert(0 == "problem!");
8301 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
8303 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
8305 simple_opc_submit(std::move(tctx
));
8309 bool whiteout
= false;
8311 assert(soid
.snap
== CEPH_NOSNAP
); // snap case is above
8312 dout(10) << __func__
<< " whiteout " << soid
<< dendl
;
8316 if (r
< 0 && !whiteout
) {
8317 derr
<< __func__
<< " unexpected promote error " << cpp_strerror(r
) << dendl
;
8318 // pass error to everyone blocked on this object
8319 // FIXME: this is pretty sloppy, but at this point we got
8320 // something unexpected and don't have many other options.
8321 map
<hobject_t
,list
<OpRequestRef
>>::iterator blocked_iter
=
8322 waiting_for_blocked_object
.find(soid
);
8323 if (blocked_iter
!= waiting_for_blocked_object
.end()) {
8324 while (!blocked_iter
->second
.empty()) {
8325 osd
->reply_op_error(blocked_iter
->second
.front(), r
);
8326 blocked_iter
->second
.pop_front();
8328 waiting_for_blocked_object
.erase(blocked_iter
);
8333 osd
->promote_finish(results
->object_size
);
8335 OpContextUPtr tctx
= simple_opc_create(obc
);
8336 tctx
->at_version
= get_next_version();
8338 ++tctx
->delta_stats
.num_objects
;
8339 if (soid
.snap
< CEPH_NOSNAP
)
8340 ++tctx
->delta_stats
.num_object_clones
;
8341 tctx
->new_obs
.exists
= true;
8343 tctx
->extra_reqids
= results
->reqids
;
8345 bool legacy_snapset
= tctx
->new_snapset
.is_legacy() ||
8346 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
8349 // create a whiteout
8350 tctx
->op_t
->create(soid
);
8351 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
8352 ++tctx
->delta_stats
.num_whiteouts
;
8353 dout(20) << __func__
<< " creating whiteout on " << soid
<< dendl
;
8354 osd
->logger
->inc(l_osd_tier_whiteout
);
8356 if (results
->has_omap
) {
8357 dout(10) << __func__
<< " setting omap flag on " << soid
<< dendl
;
8358 tctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_OMAP
);
8359 ++tctx
->delta_stats
.num_objects_omap
;
8362 results
->fill_in_final_tx(tctx
->op_t
.get());
8363 if (results
->started_temp_obj
) {
8364 tctx
->discard_temp_oid
= results
->temp_oid
;
8366 tctx
->new_obs
.oi
.size
= results
->object_size
;
8367 tctx
->new_obs
.oi
.user_version
= results
->user_version
;
8368 // Don't care src object whether have data or omap digest
8369 if (results
->object_size
)
8370 tctx
->new_obs
.oi
.set_data_digest(results
->data_digest
);
8371 if (results
->has_omap
)
8372 tctx
->new_obs
.oi
.set_omap_digest(results
->omap_digest
);
8373 tctx
->new_obs
.oi
.truncate_seq
= results
->truncate_seq
;
8374 tctx
->new_obs
.oi
.truncate_size
= results
->truncate_size
;
8376 if (soid
.snap
!= CEPH_NOSNAP
) {
8377 if (legacy_snapset
) {
8378 tctx
->new_obs
.oi
.legacy_snaps
= results
->snaps
;
8379 assert(!tctx
->new_obs
.oi
.legacy_snaps
.empty());
8381 // it's already in the snapset
8382 assert(obc
->ssc
->snapset
.clone_snaps
.count(soid
.snap
));
8384 assert(obc
->ssc
->snapset
.clone_size
.count(soid
.snap
));
8385 assert(obc
->ssc
->snapset
.clone_size
[soid
.snap
] ==
8386 results
->object_size
);
8387 assert(obc
->ssc
->snapset
.clone_overlap
.count(soid
.snap
));
8389 tctx
->delta_stats
.num_bytes
+= obc
->ssc
->snapset
.get_clone_bytes(soid
.snap
);
8391 tctx
->delta_stats
.num_bytes
+= results
->object_size
;
8395 if (results
->mirror_snapset
) {
8396 assert(tctx
->new_obs
.oi
.soid
.snap
== CEPH_NOSNAP
);
8397 tctx
->new_snapset
.from_snap_set(
8399 get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
);
8401 tctx
->new_snapset
.head_exists
= true;
8402 dout(20) << __func__
<< " new_snapset " << tctx
->new_snapset
<< dendl
;
8404 // take RWWRITE lock for duration of our local write. ignore starvation.
8405 if (!tctx
->lock_manager
.take_write_lock(
8408 assert(0 == "problem!");
8410 dout(20) << __func__
<< " took lock on obc, " << obc
->rwstate
<< dendl
;
8412 finish_ctx(tctx
.get(), pg_log_entry_t::PROMOTE
);
8414 simple_opc_submit(std::move(tctx
));
8416 osd
->logger
->inc(l_osd_tier_promote
);
8419 agent_state
->is_idle())
8420 agent_choose_mode();
8423 void PrimaryLogPG::cancel_copy(CopyOpRef cop
, bool requeue
)
8425 dout(10) << __func__
<< " " << cop
->obc
->obs
.oi
.soid
8426 << " from " << cop
->src
<< " " << cop
->oloc
8427 << " v" << cop
->results
.user_version
<< dendl
;
8429 // cancel objecter op, if we can
8430 if (cop
->objecter_tid
) {
8431 osd
->objecter
->op_cancel(cop
->objecter_tid
, -ECANCELED
);
8432 cop
->objecter_tid
= 0;
8433 if (cop
->objecter_tid2
) {
8434 osd
->objecter
->op_cancel(cop
->objecter_tid2
, -ECANCELED
);
8435 cop
->objecter_tid2
= 0;
8439 copy_ops
.erase(cop
->obc
->obs
.oi
.soid
);
8440 cop
->obc
->stop_block();
8442 kick_object_context_blocked(cop
->obc
);
8443 cop
->results
.should_requeue
= requeue
;
8444 CopyCallbackResults
result(-ECANCELED
, &cop
->results
);
8445 cop
->cb
->complete(result
);
8447 // There may still be an objecter callback referencing this copy op.
8448 // That callback will not need the obc since it's been canceled, and
8449 // we need the obc reference to go away prior to flush.
8450 cop
->obc
= ObjectContextRef();
8453 void PrimaryLogPG::cancel_copy_ops(bool requeue
)
8455 dout(10) << __func__
<< dendl
;
8456 map
<hobject_t
,CopyOpRef
>::iterator p
= copy_ops
.begin();
8457 while (p
!= copy_ops
.end()) {
8458 // requeue this op? can I queue up all of them?
8459 cancel_copy((p
++)->second
, requeue
);
8464 // ========================================================================
8467 // Flush a dirty object in the cache tier by writing it back to the
8468 // base tier. The sequence looks like:
8470 // * send a copy-from operation to the base tier to copy the current
8471 // version of the object
8472 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8473 // * on completion, we check if the object has been modified. if so,
8474 // just reply with -EAGAIN.
8475 // * try to take a write lock so we can clear the dirty flag. if this
8476 // fails, wait and retry
8477 // * start a repop that clears the bit.
8479 // If we have to wait, we will retry by coming back through the
8480 // start_flush method. We check if a flush is already in progress
8481 // and, if so, try to finish it by rechecking the version and trying
8482 // to clear the dirty bit.
8484 // In order for the cache-flush (a write op) to not block the copy-get
8485 // from reading the object, the client *must* set the SKIPRWLOCKS
8488 // NOTE: normally writes are strictly ordered for the client, but
8489 // flushes are special in that they can be reordered with respect to
8490 // other writes. In particular, we can't have a flush request block
8491 // an update to the cache pool object!
8493 struct C_Flush
: public Context
{
8496 epoch_t last_peering_reset
;
8499 C_Flush(PrimaryLogPG
*p
, hobject_t o
, epoch_t lpr
)
8500 : pg(p
), oid(o
), last_peering_reset(lpr
),
8501 tid(0), start(ceph_clock_now())
8503 void finish(int r
) override
{
8504 if (r
== -ECANCELED
)
8507 if (last_peering_reset
== pg
->get_last_peering_reset()) {
8508 pg
->finish_flush(oid
, tid
, r
);
8509 pg
->osd
->logger
->tinc(l_osd_tier_flush_lat
, ceph_clock_now() - start
);
8515 int PrimaryLogPG::start_flush(
8516 OpRequestRef op
, ObjectContextRef obc
,
8517 bool blocking
, hobject_t
*pmissing
,
8518 boost::optional
<std::function
<void()>> &&on_flush
)
8520 const object_info_t
& oi
= obc
->obs
.oi
;
8521 const hobject_t
& soid
= oi
.soid
;
8522 dout(10) << __func__
<< " " << soid
8523 << " v" << oi
.version
8524 << " uv" << oi
.user_version
8525 << " " << (blocking
? "blocking" : "non-blocking/best-effort")
8528 // get a filtered snapset, need to remove removed snaps
8529 SnapSet snapset
= obc
->ssc
->snapset
.get_filtered(pool
.info
);
8531 // verify there are no (older) check for dirty clones
8533 dout(20) << " snapset " << snapset
<< dendl
;
8534 vector
<snapid_t
>::reverse_iterator p
= snapset
.clones
.rbegin();
8535 while (p
!= snapset
.clones
.rend() && *p
>= soid
.snap
)
8537 if (p
!= snapset
.clones
.rend()) {
8538 hobject_t next
= soid
;
8540 assert(next
.snap
< soid
.snap
);
8541 if (pg_log
.get_missing().is_missing(next
)) {
8542 dout(10) << __func__
<< " missing clone is " << next
<< dendl
;
8547 ObjectContextRef older_obc
= get_object_context(next
, false);
8549 dout(20) << __func__
<< " next oldest clone is " << older_obc
->obs
.oi
8551 if (older_obc
->obs
.oi
.is_dirty()) {
8552 dout(10) << __func__
<< " next oldest clone is dirty: "
8553 << older_obc
->obs
.oi
<< dendl
;
8557 dout(20) << __func__
<< " next oldest clone " << next
8558 << " is not present; implicitly clean" << dendl
;
8561 dout(20) << __func__
<< " no older clones" << dendl
;
8568 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(soid
);
8569 if (p
!= flush_ops
.end()) {
8570 FlushOpRef fop
= p
->second
;
8571 if (fop
->op
== op
) {
8572 // we couldn't take the write lock on a cache-try-flush before;
8573 // now we are trying again for the lock.
8574 return try_flush_mark_clean(fop
);
8576 if (fop
->flushed_version
== obc
->obs
.oi
.user_version
&&
8577 (fop
->blocking
|| !blocking
)) {
8578 // nonblocking can join anything
8579 // blocking can only join a blocking flush
8580 dout(20) << __func__
<< " piggybacking on existing flush " << dendl
;
8582 fop
->dup_ops
.push_back(op
);
8583 return -EAGAIN
; // clean up this ctx; op will retry later
8586 // cancel current flush since it will fail anyway, or because we
8587 // are blocking and the existing flush is nonblocking.
8588 dout(20) << __func__
<< " canceling previous flush; it will fail" << dendl
;
8590 osd
->reply_op_error(fop
->op
, -EBUSY
);
8591 while (!fop
->dup_ops
.empty()) {
8592 osd
->reply_op_error(fop
->dup_ops
.front(), -EBUSY
);
8593 fop
->dup_ops
.pop_front();
8595 cancel_flush(fop
, false);
8599 * In general, we need to send a delete and a copyfrom.
8600 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8601 * where 4 is marked as clean. To flush 10, we have to:
8602 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8603 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8605 * There is a complicating case. Supposed there had been a clone 7
8606 * for snaps [7, 6] which has been trimmed since they no longer exist.
8607 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8608 * the delete, the snap will be promoted to 5, and the head will become
8609 * a snapdir. When the copy-from goes through, we'll end up with
8610 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8612 * Another complication is the case where there is an interval change
8613 * after doing the delete and the flush but before marking the object
8614 * clean. We'll happily delete head and then recreate it at the same
8615 * sequence number, which works out ok.
8618 SnapContext snapc
, dsnapc
;
8619 if (snapset
.seq
!= 0) {
8620 if (soid
.snap
== CEPH_NOSNAP
) {
8621 snapc
.seq
= snapset
.seq
;
8622 snapc
.snaps
= snapset
.snaps
;
8624 snapid_t min_included_snap
;
8625 if (snapset
.is_legacy()) {
8626 min_included_snap
= oi
.legacy_snaps
.back();
8628 auto p
= snapset
.clone_snaps
.find(soid
.snap
);
8629 assert(p
!= snapset
.clone_snaps
.end());
8630 min_included_snap
= p
->second
.back();
8632 snapc
= snapset
.get_ssc_as_of(min_included_snap
- 1);
8635 snapid_t prev_snapc
= 0;
8636 for (vector
<snapid_t
>::reverse_iterator citer
= snapset
.clones
.rbegin();
8637 citer
!= snapset
.clones
.rend();
8639 if (*citer
< soid
.snap
) {
8640 prev_snapc
= *citer
;
8645 dsnapc
= snapset
.get_ssc_as_of(prev_snapc
);
8648 object_locator_t
base_oloc(soid
);
8649 base_oloc
.pool
= pool
.info
.tier_of
;
8651 if (dsnapc
.seq
< snapc
.seq
) {
8654 osd
->objecter
->mutate(
8659 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
8660 (CEPH_OSD_FLAG_IGNORE_OVERLAY
|
8661 CEPH_OSD_FLAG_ENFORCE_SNAPC
),
8662 NULL
/* no callback, we'll rely on the ordering w.r.t the next op */);
8665 FlushOpRef
fop(std::make_shared
<FlushOp
>());
8667 fop
->flushed_version
= oi
.user_version
;
8668 fop
->blocking
= blocking
;
8669 fop
->on_flush
= std::move(on_flush
);
8673 if (oi
.is_whiteout()) {
8674 fop
->removal
= true;
8677 object_locator_t
oloc(soid
);
8678 o
.copy_from(soid
.oid
.name
, soid
.snap
, oloc
, oi
.user_version
,
8679 CEPH_OSD_COPY_FROM_FLAG_FLUSH
|
8680 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY
|
8681 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE
|
8682 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE
,
8683 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL
|LIBRADOS_OP_FLAG_FADVISE_NOCACHE
);
8685 //mean the base tier don't cache data after this
8686 if (agent_state
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
)
8687 o
.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED
);
8689 C_Flush
*fin
= new C_Flush(this, soid
, get_last_peering_reset());
8691 ceph_tid_t tid
= osd
->objecter
->mutate(
8692 soid
.oid
, base_oloc
, o
, snapc
,
8693 ceph::real_clock::from_ceph_timespec(oi
.mtime
),
8694 CEPH_OSD_FLAG_IGNORE_OVERLAY
| CEPH_OSD_FLAG_ENFORCE_SNAPC
,
8695 new C_OnFinisher(fin
,
8696 &osd
->objecter_finisher
));
8697 /* we're under the pg lock and fin->finish() is grabbing that */
8699 fop
->objecter_tid
= tid
;
8701 flush_ops
[soid
] = fop
;
8702 info
.stats
.stats
.sum
.num_flush
++;
8703 info
.stats
.stats
.sum
.num_flush_kb
+= SHIFT_ROUND_UP(oi
.size
, 10);
8704 return -EINPROGRESS
;
8707 void PrimaryLogPG::finish_flush(hobject_t oid
, ceph_tid_t tid
, int r
)
8709 dout(10) << __func__
<< " " << oid
<< " tid " << tid
8710 << " " << cpp_strerror(r
) << dendl
;
8711 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.find(oid
);
8712 if (p
== flush_ops
.end()) {
8713 dout(10) << __func__
<< " no flush_op found" << dendl
;
8716 FlushOpRef fop
= p
->second
;
8717 if (tid
!= fop
->objecter_tid
) {
8718 dout(10) << __func__
<< " tid " << tid
<< " != fop " << fop
8719 << " tid " << fop
->objecter_tid
<< dendl
;
8722 ObjectContextRef obc
= fop
->obc
;
8723 fop
->objecter_tid
= 0;
8725 if (r
< 0 && !(r
== -ENOENT
&& fop
->removal
)) {
8727 osd
->reply_op_error(fop
->op
, -EBUSY
);
8728 if (fop
->blocking
) {
8730 kick_object_context_blocked(obc
);
8733 if (!fop
->dup_ops
.empty()) {
8734 dout(20) << __func__
<< " requeueing dups" << dendl
;
8735 requeue_ops(fop
->dup_ops
);
8737 if (fop
->on_flush
) {
8738 (*(fop
->on_flush
))();
8739 fop
->on_flush
= boost::none
;
8741 flush_ops
.erase(oid
);
8745 r
= try_flush_mark_clean(fop
);
8746 if (r
== -EBUSY
&& fop
->op
) {
8747 osd
->reply_op_error(fop
->op
, r
);
8751 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop
)
8753 ObjectContextRef obc
= fop
->obc
;
8754 const hobject_t
& oid
= obc
->obs
.oi
.soid
;
8756 if (fop
->blocking
) {
8758 kick_object_context_blocked(obc
);
8761 if (fop
->flushed_version
!= obc
->obs
.oi
.user_version
||
8763 if (obc
->obs
.exists
)
8764 dout(10) << __func__
<< " flushed_version " << fop
->flushed_version
8765 << " != current " << obc
->obs
.oi
.user_version
8768 dout(10) << __func__
<< " object no longer exists" << dendl
;
8770 if (!fop
->dup_ops
.empty()) {
8771 dout(20) << __func__
<< " requeueing dups" << dendl
;
8772 requeue_ops(fop
->dup_ops
);
8774 if (fop
->on_flush
) {
8775 (*(fop
->on_flush
))();
8776 fop
->on_flush
= boost::none
;
8778 flush_ops
.erase(oid
);
8780 osd
->logger
->inc(l_osd_tier_flush_fail
);
8782 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
8786 if (!fop
->blocking
&&
8787 scrubber
.write_blocked_by_scrub(oid
)) {
8789 dout(10) << __func__
<< " blocked by scrub" << dendl
;
8790 requeue_op(fop
->op
);
8791 requeue_ops(fop
->dup_ops
);
8792 return -EAGAIN
; // will retry
8794 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
8795 cancel_flush(fop
, false);
8800 // successfully flushed, can we evict this object?
8801 if (!fop
->op
&& agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
8802 agent_maybe_evict(obc
, true)) {
8803 osd
->logger
->inc(l_osd_tier_clean
);
8804 if (fop
->on_flush
) {
8805 (*(fop
->on_flush
))();
8806 fop
->on_flush
= boost::none
;
8808 flush_ops
.erase(oid
);
8812 dout(10) << __func__
<< " clearing DIRTY flag for " << oid
<< dendl
;
8813 OpContextUPtr ctx
= simple_opc_create(fop
->obc
);
8815 // successfully flushed; can we clear the dirty bit?
8816 // try to take the lock manually, since we don't
8818 if (ctx
->lock_manager
.get_lock_type(
8819 ObjectContext::RWState::RWWRITE
,
8823 dout(20) << __func__
<< " took write lock" << dendl
;
8824 } else if (fop
->op
) {
8825 dout(10) << __func__
<< " waiting on write lock" << dendl
;
8826 close_op_ctx(ctx
.release());
8827 requeue_op(fop
->op
);
8828 requeue_ops(fop
->dup_ops
);
8829 return -EAGAIN
; // will retry
8831 dout(10) << __func__
<< " failed write lock, no op; failing" << dendl
;
8832 close_op_ctx(ctx
.release());
8833 osd
->logger
->inc(l_osd_tier_try_flush_fail
);
8834 cancel_flush(fop
, false);
8838 if (fop
->on_flush
) {
8839 ctx
->register_on_finish(*(fop
->on_flush
));
8840 fop
->on_flush
= boost::none
;
8843 ctx
->at_version
= get_next_version();
8845 ctx
->new_obs
= obc
->obs
;
8846 ctx
->new_obs
.oi
.clear_flag(object_info_t::FLAG_DIRTY
);
8847 --ctx
->delta_stats
.num_objects_dirty
;
8849 finish_ctx(ctx
.get(), pg_log_entry_t::CLEAN
);
8851 osd
->logger
->inc(l_osd_tier_clean
);
8853 if (!fop
->dup_ops
.empty() || fop
->op
) {
8854 dout(20) << __func__
<< " requeueing for " << ctx
->at_version
<< dendl
;
8855 list
<OpRequestRef
> ls
;
8857 ls
.push_back(fop
->op
);
8858 ls
.splice(ls
.end(), fop
->dup_ops
);
8862 simple_opc_submit(std::move(ctx
));
8864 flush_ops
.erase(oid
);
8867 osd
->logger
->inc(l_osd_tier_flush
);
8869 osd
->logger
->inc(l_osd_tier_try_flush
);
8871 return -EINPROGRESS
;
8874 void PrimaryLogPG::cancel_flush(FlushOpRef fop
, bool requeue
)
8876 dout(10) << __func__
<< " " << fop
->obc
->obs
.oi
.soid
<< " tid "
8877 << fop
->objecter_tid
<< dendl
;
8878 if (fop
->objecter_tid
) {
8879 osd
->objecter
->op_cancel(fop
->objecter_tid
, -ECANCELED
);
8880 fop
->objecter_tid
= 0;
8882 if (fop
->blocking
) {
8883 fop
->obc
->stop_block();
8884 kick_object_context_blocked(fop
->obc
);
8888 requeue_op(fop
->op
);
8889 requeue_ops(fop
->dup_ops
);
8891 if (fop
->on_flush
) {
8892 (*(fop
->on_flush
))();
8893 fop
->on_flush
= boost::none
;
8895 flush_ops
.erase(fop
->obc
->obs
.oi
.soid
);
8898 void PrimaryLogPG::cancel_flush_ops(bool requeue
)
8900 dout(10) << __func__
<< dendl
;
8901 map
<hobject_t
,FlushOpRef
>::iterator p
= flush_ops
.begin();
8902 while (p
!= flush_ops
.end()) {
8903 cancel_flush((p
++)->second
, requeue
);
8907 bool PrimaryLogPG::is_present_clone(hobject_t coid
)
8909 if (!pool
.info
.allow_incomplete_clones())
8911 if (is_missing_object(coid
))
8913 ObjectContextRef obc
= get_object_context(coid
, false);
8914 return obc
&& obc
->obs
.exists
;
8917 // ========================================================================
8920 class C_OSD_RepopApplied
: public Context
{
8922 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
8924 C_OSD_RepopApplied(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
8925 : pg(pg
), repop(repop
) {}
8926 void finish(int) override
{
8927 pg
->repop_all_applied(repop
.get());
8932 void PrimaryLogPG::repop_all_applied(RepGather
*repop
)
8934 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all applied "
8936 assert(!repop
->applies_with_commit
);
8937 repop
->all_applied
= true;
8938 if (!repop
->rep_aborted
) {
8943 class C_OSD_RepopCommit
: public Context
{
8945 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> repop
;
8947 C_OSD_RepopCommit(PrimaryLogPG
*pg
, PrimaryLogPG::RepGather
*repop
)
8948 : pg(pg
), repop(repop
) {}
8949 void finish(int) override
{
8950 pg
->repop_all_committed(repop
.get());
8954 void PrimaryLogPG::repop_all_committed(RepGather
*repop
)
8956 dout(10) << __func__
<< ": repop tid " << repop
->rep_tid
<< " all committed "
8958 repop
->all_committed
= true;
8959 if (repop
->applies_with_commit
) {
8960 assert(!repop
->all_applied
);
8961 repop
->all_applied
= true;
8964 if (!repop
->rep_aborted
) {
8965 if (repop
->v
!= eversion_t()) {
8966 last_update_ondisk
= repop
->v
;
8967 last_complete_ondisk
= repop
->pg_local_last_complete
;
8973 void PrimaryLogPG::op_applied(const eversion_t
&applied_version
)
8975 dout(10) << "op_applied version " << applied_version
<< dendl
;
8976 if (applied_version
== eversion_t())
8978 assert(applied_version
> last_update_applied
);
8979 assert(applied_version
<= info
.last_update
);
8980 last_update_applied
= applied_version
;
8982 if (scrubber
.active
) {
8983 if (last_update_applied
== scrubber
.subset_last_update
) {
8984 if (ops_blocked_by_scrub()) {
8985 requeue_scrub(true);
8987 requeue_scrub(false);
8992 assert(scrubber
.start
== scrubber
.end
);
8995 if (scrubber
.active_rep_scrub
) {
8996 if (last_update_applied
== static_cast<const MOSDRepScrub
*>(
8997 scrubber
.active_rep_scrub
->get_req())->scrub_to
) {
9000 PGQueueable(scrubber
.active_rep_scrub
, get_osdmap()->get_epoch()));
9001 scrubber
.active_rep_scrub
= OpRequestRef();
9007 void PrimaryLogPG::eval_repop(RepGather
*repop
)
9009 const MOSDOp
*m
= NULL
;
9011 m
= static_cast<const MOSDOp
*>(repop
->op
->get_req());
9014 dout(10) << "eval_repop " << *repop
9015 << (repop
->rep_done
? " DONE" : "")
9018 dout(10) << "eval_repop " << *repop
<< " (no op)"
9019 << (repop
->rep_done
? " DONE" : "")
9022 if (repop
->rep_done
)
9026 if (repop
->all_committed
) {
9027 dout(10) << " commit: " << *repop
<< dendl
;
9028 for (auto p
= repop
->on_committed
.begin();
9029 p
!= repop
->on_committed
.end();
9030 repop
->on_committed
.erase(p
++)) {
9033 // send dup commits, in order
9034 if (waiting_for_ondisk
.count(repop
->v
)) {
9035 assert(waiting_for_ondisk
.begin()->first
== repop
->v
);
9036 for (list
<pair
<OpRequestRef
, version_t
> >::iterator i
=
9037 waiting_for_ondisk
[repop
->v
].begin();
9038 i
!= waiting_for_ondisk
[repop
->v
].end();
9040 osd
->reply_op_error(i
->first
, repop
->r
, repop
->v
,
9043 waiting_for_ondisk
.erase(repop
->v
);
9048 if (repop
->all_applied
) {
9049 if (repop
->applies_with_commit
) {
9050 assert(repop
->on_applied
.empty());
9052 dout(10) << " applied: " << *repop
<< " " << dendl
;
9053 for (auto p
= repop
->on_applied
.begin();
9054 p
!= repop
->on_applied
.end();
9055 repop
->on_applied
.erase(p
++)) {
9061 if (repop
->all_applied
&& repop
->all_committed
) {
9062 repop
->rep_done
= true;
9064 publish_stats_to_osd();
9065 calc_min_last_complete_ondisk();
9067 dout(10) << " removing " << *repop
<< dendl
;
9068 assert(!repop_queue
.empty());
9069 dout(20) << " q front is " << *repop_queue
.front() << dendl
;
9070 if (repop_queue
.front() != repop
) {
9071 if (!repop
->applies_with_commit
) {
9072 dout(0) << " removing " << *repop
<< dendl
;
9073 dout(0) << " q front is " << *repop_queue
.front() << dendl
;
9074 assert(repop_queue
.front() == repop
);
9077 RepGather
*to_remove
= nullptr;
9078 while (!repop_queue
.empty() &&
9079 (to_remove
= repop_queue
.front())->rep_done
) {
9080 repop_queue
.pop_front();
9081 for (auto p
= to_remove
->on_success
.begin();
9082 p
!= to_remove
->on_success
.end();
9083 to_remove
->on_success
.erase(p
++)) {
9086 remove_repop(to_remove
);
9092 void PrimaryLogPG::issue_repop(RepGather
*repop
, OpContext
*ctx
)
9095 const hobject_t
& soid
= ctx
->obs
->oi
.soid
;
9096 dout(7) << "issue_repop rep_tid " << repop
->rep_tid
9100 repop
->v
= ctx
->at_version
;
9101 if (ctx
->at_version
> eversion_t()) {
9102 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
9103 i
!= actingbackfill
.end();
9105 if (*i
== get_primary()) continue;
9106 pg_info_t
&pinfo
= peer_info
[*i
];
9107 // keep peer_info up to date
9108 if (pinfo
.last_complete
== pinfo
.last_update
)
9109 pinfo
.last_complete
= ctx
->at_version
;
9110 pinfo
.last_update
= ctx
->at_version
;
9114 ctx
->obc
->ondisk_write_lock();
9116 bool unlock_snapset_obc
= false;
9117 ctx
->op_t
->add_obc(ctx
->obc
);
9118 if (ctx
->clone_obc
) {
9119 ctx
->clone_obc
->ondisk_write_lock();
9120 ctx
->op_t
->add_obc(ctx
->clone_obc
);
9122 if (ctx
->snapset_obc
&& ctx
->snapset_obc
->obs
.oi
.soid
!=
9123 ctx
->obc
->obs
.oi
.soid
) {
9124 ctx
->snapset_obc
->ondisk_write_lock();
9125 unlock_snapset_obc
= true;
9126 ctx
->op_t
->add_obc(ctx
->snapset_obc
);
9129 Context
*on_all_commit
= new C_OSD_RepopCommit(this, repop
);
9130 Context
*on_all_applied
= new C_OSD_RepopApplied(this, repop
);
9131 Context
*onapplied_sync
= new C_OSD_OndiskWriteUnlock(
9134 unlock_snapset_obc
? ctx
->snapset_obc
: ObjectContextRef());
9135 if (!(ctx
->log
.empty())) {
9136 assert(ctx
->at_version
>= projected_last_update
);
9137 projected_last_update
= ctx
->at_version
;
9139 for (auto &&entry
: ctx
->log
) {
9140 projected_log
.add(entry
);
9142 pgbackend
->submit_transaction(
9146 std::move(ctx
->op_t
),
9148 min_last_complete_ondisk
,
9150 ctx
->updated_hset_history
,
9159 PrimaryLogPG::RepGather
*PrimaryLogPG::new_repop(
9160 OpContext
*ctx
, ObjectContextRef obc
,
9164 dout(10) << "new_repop rep_tid " << rep_tid
<< " on " << *ctx
->op
->get_req() << dendl
;
9166 dout(10) << "new_repop rep_tid " << rep_tid
<< " (no op)" << dendl
;
9168 RepGather
*repop
= new RepGather(
9169 ctx
, rep_tid
, info
.last_complete
, false);
9171 repop
->start
= ceph_clock_now();
9173 repop_queue
.push_back(&repop
->queue_item
);
9176 osd
->logger
->inc(l_osd_op_wip
);
9178 dout(10) << __func__
<< ": " << *repop
<< dendl
;
9182 boost::intrusive_ptr
<PrimaryLogPG::RepGather
> PrimaryLogPG::new_repop(
9185 ObcLockManager
&&manager
,
9187 boost::optional
<std::function
<void(void)> > &&on_complete
)
9189 RepGather
*repop
= new RepGather(
9192 std::move(on_complete
),
9199 repop
->start
= ceph_clock_now();
9201 repop_queue
.push_back(&repop
->queue_item
);
9203 osd
->logger
->inc(l_osd_op_wip
);
9205 dout(10) << __func__
<< ": " << *repop
<< dendl
;
9206 return boost::intrusive_ptr
<RepGather
>(repop
);
9209 void PrimaryLogPG::remove_repop(RepGather
*repop
)
9211 dout(20) << __func__
<< " " << *repop
<< dendl
;
9213 for (auto p
= repop
->on_finish
.begin();
9214 p
!= repop
->on_finish
.end();
9215 repop
->on_finish
.erase(p
++)) {
9219 release_object_locks(
9220 repop
->lock_manager
);
9223 osd
->logger
->dec(l_osd_op_wip
);
9226 PrimaryLogPG::OpContextUPtr
PrimaryLogPG::simple_opc_create(ObjectContextRef obc
)
9228 dout(20) << __func__
<< " " << obc
->obs
.oi
.soid
<< dendl
;
9230 ceph_tid_t rep_tid
= osd
->get_tid();
9231 osd_reqid_t
reqid(osd
->get_cluster_msgr_name(), 0, rep_tid
);
9232 OpContextUPtr
ctx(new OpContext(OpRequestRef(), reqid
, ops
, obc
, this));
9233 ctx
->op_t
.reset(new PGTransaction());
9234 ctx
->mtime
= ceph_clock_now();
9238 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx
)
9240 RepGather
*repop
= new_repop(ctx
.get(), ctx
->obc
, ctx
->reqid
.tid
);
9241 dout(20) << __func__
<< " " << repop
<< dendl
;
9242 issue_repop(repop
, ctx
.get());
9249 void PrimaryLogPG::submit_log_entries(
9250 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
9251 ObcLockManager
&&manager
,
9252 boost::optional
<std::function
<void(void)> > &&_on_complete
,
9256 dout(10) << __func__
<< " " << entries
<< dendl
;
9257 assert(is_primary());
9260 if (!entries
.empty()) {
9261 assert(entries
.rbegin()->version
>= projected_last_update
);
9262 version
= projected_last_update
= entries
.rbegin()->version
;
9265 boost::intrusive_ptr
<RepGather
> repop
;
9266 boost::optional
<std::function
<void(void)> > on_complete
;
9267 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9273 std::move(_on_complete
));
9275 on_complete
= std::move(_on_complete
);
9278 pgbackend
->call_write_ordered(
9279 [this, entries
, repop
, on_complete
]() {
9280 ObjectStore::Transaction t
;
9281 eversion_t old_last_update
= info
.last_update
;
9282 merge_new_log_entries(entries
, t
);
9285 set
<pg_shard_t
> waiting_on
;
9286 for (set
<pg_shard_t
>::const_iterator i
= actingbackfill
.begin();
9287 i
!= actingbackfill
.end();
9289 pg_shard_t
peer(*i
);
9290 if (peer
== pg_whoami
) continue;
9291 assert(peer_missing
.count(peer
));
9292 assert(peer_info
.count(peer
));
9293 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9295 MOSDPGUpdateLogMissing
*m
= new MOSDPGUpdateLogMissing(
9297 spg_t(info
.pgid
.pgid
, i
->shard
),
9299 get_osdmap()->get_epoch(),
9302 osd
->send_message_osd_cluster(
9303 peer
.osd
, m
, get_osdmap()->get_epoch());
9304 waiting_on
.insert(peer
);
9306 MOSDPGLog
*m
= new MOSDPGLog(
9307 peer
.shard
, pg_whoami
.shard
,
9308 info
.last_update
.epoch
,
9310 m
->log
.log
= entries
;
9311 m
->log
.tail
= old_last_update
;
9312 m
->log
.head
= info
.last_update
;
9313 osd
->send_message_osd_cluster(
9314 peer
.osd
, m
, get_osdmap()->get_epoch());
9317 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
9318 ceph_tid_t rep_tid
= repop
->rep_tid
;
9319 waiting_on
.insert(pg_whoami
);
9320 log_entry_update_waiting_on
.insert(
9323 LogUpdateCtx
{std::move(repop
), std::move(waiting_on
)}
9325 struct OnComplete
: public Context
{
9333 : pg(pg
), rep_tid(rep_tid
), epoch(epoch
) {}
9334 void finish(int) override
{
9336 if (!pg
->pg_has_reset_since(epoch
)) {
9337 auto it
= pg
->log_entry_update_waiting_on
.find(rep_tid
);
9338 assert(it
!= pg
->log_entry_update_waiting_on
.end());
9339 auto it2
= it
->second
.waiting_on
.find(pg
->pg_whoami
);
9340 assert(it2
!= it
->second
.waiting_on
.end());
9341 it
->second
.waiting_on
.erase(it2
);
9342 if (it
->second
.waiting_on
.empty()) {
9343 pg
->repop_all_committed(it
->second
.repop
.get());
9344 pg
->log_entry_update_waiting_on
.erase(it
);
9350 t
.register_on_commit(
9351 new OnComplete
{this, rep_tid
, get_osdmap()->get_epoch()});
9354 struct OnComplete
: public Context
{
9356 std::function
<void(void)> on_complete
;
9360 const std::function
<void(void)> &on_complete
,
9363 on_complete(std::move(on_complete
)),
9365 void finish(int) override
{
9367 if (!pg
->pg_has_reset_since(epoch
))
9372 t
.register_on_complete(
9374 this, *on_complete
, get_osdmap()->get_epoch()
9378 t
.register_on_applied(
9379 new C_OSD_OnApplied
{this, get_osdmap()->get_epoch(), info
.last_update
});
9380 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
9385 void PrimaryLogPG::cancel_log_updates()
9387 // get rid of all the LogUpdateCtx so their references to repops are
9389 log_entry_update_waiting_on
.clear();
9392 // -------------------------------------------------------
9394 void PrimaryLogPG::get_watchers(list
<obj_watch_item_t
> &pg_watchers
)
9396 pair
<hobject_t
, ObjectContextRef
> i
;
9397 while (object_contexts
.get_next(i
.first
, &i
)) {
9398 ObjectContextRef
obc(i
.second
);
9399 get_obc_watchers(obc
, pg_watchers
);
9403 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc
, list
<obj_watch_item_t
> &pg_watchers
)
9405 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
9406 obc
->watchers
.begin();
9407 j
!= obc
->watchers
.end();
9409 obj_watch_item_t owi
;
9411 owi
.obj
= obc
->obs
.oi
.soid
;
9412 owi
.wi
.addr
= j
->second
->get_peer_addr();
9413 owi
.wi
.name
= j
->second
->get_entity();
9414 owi
.wi
.cookie
= j
->second
->get_cookie();
9415 owi
.wi
.timeout_seconds
= j
->second
->get_timeout();
9417 dout(30) << "watch: Found oid=" << owi
.obj
<< " addr=" << owi
.wi
.addr
9418 << " name=" << owi
.wi
.name
<< " cookie=" << owi
.wi
.cookie
<< dendl
;
9420 pg_watchers
.push_back(owi
);
9424 void PrimaryLogPG::check_blacklisted_watchers()
9426 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl
;
9427 pair
<hobject_t
, ObjectContextRef
> i
;
9428 while (object_contexts
.get_next(i
.first
, &i
))
9429 check_blacklisted_obc_watchers(i
.second
);
9432 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc
)
9434 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc
->obs
.oi
.soid
<< dendl
;
9435 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator k
=
9436 obc
->watchers
.begin();
9437 k
!= obc
->watchers
.end();
9439 //Advance iterator now so handle_watch_timeout() can erase element
9440 map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
= k
++;
9441 dout(30) << "watch: Found " << j
->second
->get_entity() << " cookie " << j
->second
->get_cookie() << dendl
;
9442 entity_addr_t ea
= j
->second
->get_peer_addr();
9443 dout(30) << "watch: Check entity_addr_t " << ea
<< dendl
;
9444 if (get_osdmap()->is_blacklisted(ea
)) {
9445 dout(10) << "watch: Found blacklisted watcher for " << ea
<< dendl
;
9446 assert(j
->second
->get_pg() == this);
9447 j
->second
->unregister_cb();
9448 handle_watch_timeout(j
->second
);
9453 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc
)
9455 assert(is_active());
9456 assert((recovering
.count(obc
->obs
.oi
.soid
) ||
9457 !is_missing_object(obc
->obs
.oi
.soid
)) ||
9458 (pg_log
.get_log().objects
.count(obc
->obs
.oi
.soid
) && // or this is a revert... see recover_primary()
9459 pg_log
.get_log().objects
.find(obc
->obs
.oi
.soid
)->second
->op
==
9460 pg_log_entry_t::LOST_REVERT
&&
9461 pg_log
.get_log().objects
.find(obc
->obs
.oi
.soid
)->second
->reverting_to
==
9462 obc
->obs
.oi
.version
));
9464 dout(10) << "populate_obc_watchers " << obc
->obs
.oi
.soid
<< dendl
;
9465 assert(obc
->watchers
.empty());
9466 // populate unconnected_watchers
9467 for (map
<pair
<uint64_t, entity_name_t
>, watch_info_t
>::iterator p
=
9468 obc
->obs
.oi
.watchers
.begin();
9469 p
!= obc
->obs
.oi
.watchers
.end();
9471 utime_t expire
= info
.stats
.last_became_active
;
9472 expire
+= p
->second
.timeout_seconds
;
9473 dout(10) << " unconnected watcher " << p
->first
<< " will expire " << expire
<< dendl
;
9475 Watch::makeWatchRef(
9476 this, osd
, obc
, p
->second
.timeout_seconds
, p
->first
.first
,
9477 p
->first
.second
, p
->second
.addr
));
9478 watch
->disconnect();
9479 obc
->watchers
.insert(
9481 make_pair(p
->first
.first
, p
->first
.second
),
9484 // Look for watchers from blacklisted clients and drop
9485 check_blacklisted_obc_watchers(obc
);
9488 void PrimaryLogPG::handle_watch_timeout(WatchRef watch
)
9490 ObjectContextRef obc
= watch
->get_obc(); // handle_watch_timeout owns this ref
9491 dout(10) << "handle_watch_timeout obc " << obc
<< dendl
;
9494 dout(10) << "handle_watch_timeout not active, no-op" << dendl
;
9497 if (is_degraded_or_backfilling_object(obc
->obs
.oi
.soid
)) {
9498 callbacks_for_degraded_object
[obc
->obs
.oi
.soid
].push_back(
9499 watch
->get_delayed_cb()
9501 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9507 if (scrubber
.write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
9508 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9511 scrubber
.add_callback(
9512 watch
->get_delayed_cb() // This callback!
9517 OpContextUPtr ctx
= simple_opc_create(obc
);
9518 ctx
->at_version
= get_next_version();
9520 object_info_t
& oi
= ctx
->new_obs
.oi
;
9521 oi
.watchers
.erase(make_pair(watch
->get_cookie(),
9522 watch
->get_entity()));
9524 list
<watch_disconnect_t
> watch_disconnects
= {
9525 watch_disconnect_t(watch
->get_cookie(), watch
->get_entity(), true)
9527 ctx
->register_on_success(
9528 [this, obc
, watch_disconnects
]() {
9529 complete_disconnect_watches(obc
, watch_disconnects
);
9533 PGTransaction
*t
= ctx
->op_t
.get();
9534 ctx
->log
.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY
, obc
->obs
.oi
.soid
,
9538 osd_reqid_t(), ctx
->mtime
, 0));
9540 oi
.prior_version
= obc
->obs
.oi
.version
;
9541 oi
.version
= ctx
->at_version
;
9543 ::encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
9544 t
->setattr(obc
->obs
.oi
.soid
, OI_ATTR
, bl
);
9546 // apply new object state.
9547 ctx
->obc
->obs
= ctx
->new_obs
;
9549 // no ctx->delta_stats
9550 simple_opc_submit(std::move(ctx
));
9553 ObjectContextRef
PrimaryLogPG::create_object_context(const object_info_t
& oi
,
9554 SnapSetContext
*ssc
)
9556 ObjectContextRef
obc(object_contexts
.lookup_or_create(oi
.soid
));
9557 assert(obc
->destructor_callback
== NULL
);
9558 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
9560 obc
->obs
.exists
= false;
9563 register_snapset_context(ssc
);
9564 dout(10) << "create_object_context " << (void*)obc
.get() << " " << oi
.soid
<< " " << dendl
;
9566 populate_obc_watchers(obc
);
9570 ObjectContextRef
PrimaryLogPG::get_object_context(
9571 const hobject_t
& soid
,
9573 const map
<string
, bufferlist
> *attrs
)
9576 attrs
|| !pg_log
.get_missing().is_missing(soid
) ||
9577 // or this is a revert... see recover_primary()
9578 (pg_log
.get_log().objects
.count(soid
) &&
9579 pg_log
.get_log().objects
.find(soid
)->second
->op
==
9580 pg_log_entry_t::LOST_REVERT
));
9581 ObjectContextRef obc
= object_contexts
.lookup(soid
);
9582 osd
->logger
->inc(l_osd_object_ctx_cache_total
);
9584 osd
->logger
->inc(l_osd_object_ctx_cache_hit
);
9585 dout(10) << __func__
<< ": found obc in cache: " << obc
9588 dout(10) << __func__
<< ": obc NOT found in cache: " << soid
<< dendl
;
9592 assert(attrs
->count(OI_ATTR
));
9593 bv
= attrs
->find(OI_ATTR
)->second
;
9595 int r
= pgbackend
->objects_get_attr(soid
, OI_ATTR
, &bv
);
9598 dout(10) << __func__
<< ": no obc for soid "
9599 << soid
<< " and !can_create"
9601 return ObjectContextRef(); // -ENOENT!
9604 dout(10) << __func__
<< ": no obc for soid "
9605 << soid
<< " but can_create"
9608 object_info_t
oi(soid
);
9609 SnapSetContext
*ssc
= get_snapset_context(
9610 soid
, true, 0, false);
9612 obc
= create_object_context(oi
, ssc
);
9613 dout(10) << __func__
<< ": " << obc
<< " " << soid
9614 << " " << obc
->rwstate
9615 << " oi: " << obc
->obs
.oi
9616 << " ssc: " << obc
->ssc
9617 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
9624 bufferlist::iterator bliter
= bv
.begin();
9625 ::decode(oi
, bliter
);
9627 dout(0) << __func__
<< ": obc corrupt: " << soid
<< dendl
;
9628 return ObjectContextRef(); // -ENOENT!
9631 assert(oi
.soid
.pool
== (int64_t)info
.pgid
.pool());
9633 obc
= object_contexts
.lookup_or_create(oi
.soid
);
9634 obc
->destructor_callback
= new C_PG_ObjectContext(this, obc
.get());
9636 obc
->obs
.exists
= true;
9638 obc
->ssc
= get_snapset_context(
9640 soid
.has_snapset() ? attrs
: 0);
9643 populate_obc_watchers(obc
);
9645 if (pool
.info
.require_rollback()) {
9647 obc
->attr_cache
= *attrs
;
9649 int r
= pgbackend
->objects_get_attrs(
9656 dout(10) << __func__
<< ": creating obc from disk: " << obc
9660 // XXX: Caller doesn't expect this
9661 if (obc
->ssc
== NULL
) {
9662 derr
<< __func__
<< ": obc->ssc not available, not returning context" << dendl
;
9663 return ObjectContextRef(); // -ENOENT!
9666 dout(10) << __func__
<< ": " << obc
<< " " << soid
9667 << " " << obc
->rwstate
9668 << " oi: " << obc
->obs
.oi
9669 << " exists: " << (int)obc
->obs
.exists
9670 << " ssc: " << obc
->ssc
9671 << " snapset: " << obc
->ssc
->snapset
<< dendl
;
9675 void PrimaryLogPG::context_registry_on_change()
9677 pair
<hobject_t
, ObjectContextRef
> i
;
9678 while (object_contexts
.get_next(i
.first
, &i
)) {
9679 ObjectContextRef
obc(i
.second
);
9681 for (map
<pair
<uint64_t, entity_name_t
>, WatchRef
>::iterator j
=
9682 obc
->watchers
.begin();
9683 j
!= obc
->watchers
.end();
9684 obc
->watchers
.erase(j
++)) {
9685 j
->second
->discard();
9693 * If we return an error, and set *pmissing, then promoting that
9696 * If we return -EAGAIN, we will always set *pmissing to the missing
9697 * object to wait for.
9699 * If we return an error but do not set *pmissing, then we know the
9700 * object does not exist.
9702 int PrimaryLogPG::find_object_context(const hobject_t
& oid
,
9703 ObjectContextRef
*pobc
,
9705 bool map_snapid_to_clone
,
9706 hobject_t
*pmissing
)
9709 assert(oid
.pool
== static_cast<int64_t>(info
.pgid
.pool()));
9711 if (oid
.snap
== CEPH_NOSNAP
) {
9712 ObjectContextRef obc
= get_object_context(oid
, can_create
);
9718 dout(10) << "find_object_context " << oid
9720 << " oi=" << obc
->obs
.oi
9727 hobject_t head
= oid
.get_head();
9729 // want the snapdir?
9730 if (oid
.snap
== CEPH_SNAPDIR
) {
9731 // return head or snapdir, whichever exists.
9732 ObjectContextRef headobc
= get_object_context(head
, can_create
);
9733 ObjectContextRef obc
= headobc
;
9734 if (!obc
|| !obc
->obs
.exists
)
9735 obc
= get_object_context(oid
, can_create
);
9736 if (!obc
|| !obc
->obs
.exists
) {
9737 // if we have neither, we would want to promote the head.
9741 *pobc
= headobc
; // may be null
9744 dout(10) << "find_object_context " << oid
9746 << " oi=" << obc
->obs
.oi
9750 // always populate ssc for SNAPDIR...
9752 obc
->ssc
= get_snapset_context(
9758 if (!map_snapid_to_clone
&& pool
.info
.is_removed_snap(oid
.snap
)) {
9759 dout(10) << __func__
<< " snap " << oid
.snap
<< " is removed" << dendl
;
9763 SnapSetContext
*ssc
= get_snapset_context(oid
, can_create
);
9764 if (!ssc
|| !(ssc
->exists
|| can_create
)) {
9765 dout(20) << __func__
<< " " << oid
<< " no snapset" << dendl
;
9767 *pmissing
= head
; // start by getting the head
9769 put_snapset_context(ssc
);
9773 if (map_snapid_to_clone
) {
9774 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9775 << " snapset " << ssc
->snapset
9776 << " map_snapid_to_clone=true" << dendl
;
9777 if (oid
.snap
> ssc
->snapset
.seq
) {
9778 // already must be readable
9779 ObjectContextRef obc
= get_object_context(head
, false);
9780 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9781 << " snapset " << ssc
->snapset
9782 << " maps to head" << dendl
;
9784 put_snapset_context(ssc
);
9785 return (obc
&& obc
->obs
.exists
) ? 0 : -ENOENT
;
9787 vector
<snapid_t
>::const_iterator citer
= std::find(
9788 ssc
->snapset
.clones
.begin(),
9789 ssc
->snapset
.clones
.end(),
9791 if (citer
== ssc
->snapset
.clones
.end()) {
9792 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9793 << " snapset " << ssc
->snapset
9794 << " maps to nothing" << dendl
;
9795 put_snapset_context(ssc
);
9799 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9800 << " snapset " << ssc
->snapset
9801 << " maps to " << oid
<< dendl
;
9803 if (pg_log
.get_missing().is_missing(oid
)) {
9804 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9805 << " snapset " << ssc
->snapset
9806 << " " << oid
<< " is missing" << dendl
;
9809 put_snapset_context(ssc
);
9813 ObjectContextRef obc
= get_object_context(oid
, false);
9814 if (!obc
|| !obc
->obs
.exists
) {
9815 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9816 << " snapset " << ssc
->snapset
9817 << " " << oid
<< " is not present" << dendl
;
9820 put_snapset_context(ssc
);
9823 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9824 << " snapset " << ssc
->snapset
9825 << " " << oid
<< " HIT" << dendl
;
9827 put_snapset_context(ssc
);
9830 ceph_abort(); //unreachable
9833 dout(10) << "find_object_context " << oid
<< " @" << oid
.snap
9834 << " snapset " << ssc
->snapset
<< dendl
;
9837 if (oid
.snap
> ssc
->snapset
.seq
) {
9838 if (ssc
->snapset
.head_exists
) {
9839 ObjectContextRef obc
= get_object_context(head
, false);
9840 dout(10) << "find_object_context " << head
9841 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
9842 << " -- HIT " << obc
->obs
9847 assert(ssc
== obc
->ssc
);
9848 put_snapset_context(ssc
);
9853 dout(10) << "find_object_context " << head
9854 << " want " << oid
.snap
<< " > snapset seq " << ssc
->snapset
.seq
9855 << " but head dne -- DNE"
9857 put_snapset_context(ssc
);
9861 // which clone would it be?
9863 while (k
< ssc
->snapset
.clones
.size() &&
9864 ssc
->snapset
.clones
[k
] < oid
.snap
)
9866 if (k
== ssc
->snapset
.clones
.size()) {
9867 dout(10) << "find_object_context no clones with last >= oid.snap "
9868 << oid
.snap
<< " -- DNE" << dendl
;
9869 put_snapset_context(ssc
);
9872 hobject_t
soid(oid
.oid
, oid
.get_key(), ssc
->snapset
.clones
[k
], oid
.get_hash(),
9873 info
.pgid
.pool(), oid
.get_namespace());
9875 if (pg_log
.get_missing().is_missing(soid
)) {
9876 dout(20) << "find_object_context " << soid
<< " missing, try again later"
9880 put_snapset_context(ssc
);
9884 ObjectContextRef obc
= get_object_context(soid
, false);
9885 if (!obc
|| !obc
->obs
.exists
) {
9886 dout(20) << __func__
<< " missing clone " << soid
<< dendl
;
9889 put_snapset_context(ssc
);
9896 assert(obc
->ssc
== ssc
);
9897 put_snapset_context(ssc
);
9902 dout(20) << "find_object_context " << soid
9903 << " snapset " << obc
->ssc
->snapset
9904 << " legacy_snaps " << obc
->obs
.oi
.legacy_snaps
9906 snapid_t first
, last
;
9907 if (obc
->ssc
->snapset
.is_legacy()) {
9908 first
= obc
->obs
.oi
.legacy_snaps
.back();
9909 last
= obc
->obs
.oi
.legacy_snaps
.front();
9911 auto p
= obc
->ssc
->snapset
.clone_snaps
.find(soid
.snap
);
9912 assert(p
!= obc
->ssc
->snapset
.clone_snaps
.end());
9913 first
= p
->second
.back();
9914 last
= p
->second
.front();
9916 if (first
<= oid
.snap
) {
9917 dout(20) << "find_object_context " << soid
<< " [" << first
<< "," << last
9918 << "] contains " << oid
.snap
<< " -- HIT " << obc
->obs
<< dendl
;
9922 dout(20) << "find_object_context " << soid
<< " [" << first
<< "," << last
9923 << "] does not contain " << oid
.snap
<< " -- DNE" << dendl
;
9928 void PrimaryLogPG::object_context_destructor_callback(ObjectContext
*obc
)
9931 put_snapset_context(obc
->ssc
);
9934 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc
, pg_stat_t
*pgstat
)
9936 object_info_t
& oi
= obc
->obs
.oi
;
9938 dout(10) << "add_object_context_to_pg_stat " << oi
.soid
<< dendl
;
9939 object_stat_sum_t stat
;
9941 stat
.num_bytes
+= oi
.size
;
9943 if (oi
.soid
.snap
!= CEPH_SNAPDIR
)
9946 stat
.num_objects_dirty
++;
9947 if (oi
.is_whiteout())
9948 stat
.num_whiteouts
++;
9950 stat
.num_objects_omap
++;
9951 if (oi
.is_cache_pinned())
9952 stat
.num_objects_pinned
++;
9954 if (oi
.soid
.snap
&& oi
.soid
.snap
!= CEPH_NOSNAP
&& oi
.soid
.snap
!= CEPH_SNAPDIR
) {
9955 stat
.num_object_clones
++;
9958 obc
->ssc
= get_snapset_context(oi
.soid
, false);
9961 // subtract off clone overlap
9962 if (obc
->ssc
->snapset
.clone_overlap
.count(oi
.soid
.snap
)) {
9963 interval_set
<uint64_t>& o
= obc
->ssc
->snapset
.clone_overlap
[oi
.soid
.snap
];
9964 for (interval_set
<uint64_t>::const_iterator r
= o
.begin();
9967 stat
.num_bytes
-= r
.get_len();
9973 pgstat
->stats
.sum
.add(stat
);
9976 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc
)
9978 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
9979 if (obc
->is_blocked()) {
9980 dout(10) << __func__
<< " " << soid
<< " still blocked" << dendl
;
9984 map
<hobject_t
, list
<OpRequestRef
>>::iterator p
= waiting_for_blocked_object
.find(soid
);
9985 if (p
!= waiting_for_blocked_object
.end()) {
9986 list
<OpRequestRef
>& ls
= p
->second
;
9987 dout(10) << __func__
<< " " << soid
<< " requeuing " << ls
.size() << " requests" << dendl
;
9989 waiting_for_blocked_object
.erase(p
);
9992 map
<hobject_t
, ObjectContextRef
>::iterator i
=
9993 objects_blocked_on_snap_promotion
.find(obc
->obs
.oi
.soid
.get_head());
9994 if (i
!= objects_blocked_on_snap_promotion
.end()) {
9995 assert(i
->second
== obc
);
9996 objects_blocked_on_snap_promotion
.erase(i
);
9999 if (obc
->requeue_scrub_on_unblock
) {
10000 obc
->requeue_scrub_on_unblock
= false;
10005 SnapSetContext
*PrimaryLogPG::get_snapset_context(
10006 const hobject_t
& oid
,
10008 const map
<string
, bufferlist
> *attrs
,
10011 Mutex::Locker
l(snapset_contexts_lock
);
10012 SnapSetContext
*ssc
;
10013 map
<hobject_t
, SnapSetContext
*>::iterator p
= snapset_contexts
.find(
10014 oid
.get_snapdir());
10015 if (p
!= snapset_contexts
.end()) {
10016 if (can_create
|| p
->second
->exists
) {
10025 if (!(oid
.is_head() && !oid_existed
))
10026 r
= pgbackend
->objects_get_attr(oid
.get_head(), SS_ATTR
, &bv
);
10029 if (!(oid
.is_snapdir() && !oid_existed
))
10030 r
= pgbackend
->objects_get_attr(oid
.get_snapdir(), SS_ATTR
, &bv
);
10031 if (r
< 0 && !can_create
)
10035 assert(attrs
->count(SS_ATTR
));
10036 bv
= attrs
->find(SS_ATTR
)->second
;
10038 ssc
= new SnapSetContext(oid
.get_snapdir());
10039 _register_snapset_context(ssc
);
10041 bufferlist::iterator bvp
= bv
.begin();
10043 ssc
->snapset
.decode(bvp
);
10044 } catch (buffer::error
& e
) {
10045 dout(0) << __func__
<< " Can't decode snapset: " << e
<< dendl
;
10048 ssc
->exists
= true;
10050 ssc
->exists
= false;
10058 void PrimaryLogPG::put_snapset_context(SnapSetContext
*ssc
)
10060 Mutex::Locker
l(snapset_contexts_lock
);
10062 if (ssc
->ref
== 0) {
10063 if (ssc
->registered
)
10064 snapset_contexts
.erase(ssc
->oid
);
10069 /** pull - request object from a peer
10074 * NONE - didn't pull anything
10075 * YES - pulled what the caller wanted
10076 * OTHER - needed to pull something else first (_head or _snapdir)
10078 enum { PULL_NONE
, PULL_OTHER
, PULL_YES
};
10080 int PrimaryLogPG::recover_missing(
10081 const hobject_t
&soid
, eversion_t v
,
10083 PGBackend::RecoveryHandle
*h
)
10085 if (missing_loc
.is_unfound(soid
)) {
10086 dout(7) << "pull " << soid
10088 << " but it is unfound" << dendl
;
10092 // is this a snapped object? if so, consult the snapset.. we may not need the entire object!
10093 ObjectContextRef obc
;
10094 ObjectContextRef head_obc
;
10095 if (soid
.snap
&& soid
.snap
< CEPH_NOSNAP
) {
10096 // do we have the head and/or snapdir?
10097 hobject_t head
= soid
.get_head();
10098 if (pg_log
.get_missing().is_missing(head
)) {
10099 if (recovering
.count(head
)) {
10100 dout(10) << " missing but already recovering head " << head
<< dendl
;
10103 int r
= recover_missing(
10104 head
, pg_log
.get_missing().get_items().find(head
)->second
.need
, priority
,
10106 if (r
!= PULL_NONE
)
10111 head
= soid
.get_snapdir();
10112 if (pg_log
.get_missing().is_missing(head
)) {
10113 if (recovering
.count(head
)) {
10114 dout(10) << " missing but already recovering snapdir " << head
<< dendl
;
10117 int r
= recover_missing(
10118 head
, pg_log
.get_missing().get_items().find(head
)->second
.need
, priority
,
10120 if (r
!= PULL_NONE
)
10126 // we must have one or the other
10127 head_obc
= get_object_context(
10132 head_obc
= get_object_context(
10133 soid
.get_snapdir(),
10138 start_recovery_op(soid
);
10139 assert(!recovering
.count(soid
));
10140 recovering
.insert(make_pair(soid
, obc
));
10141 int r
= pgbackend
->recover_object(
10147 // This is only a pull which shouldn't return an error
10152 void PrimaryLogPG::send_remove_op(
10153 const hobject_t
& oid
, eversion_t v
, pg_shard_t peer
)
10155 ceph_tid_t tid
= osd
->get_tid();
10156 osd_reqid_t
rid(osd
->get_cluster_msgr_name(), 0, tid
);
10158 dout(10) << "send_remove_op " << oid
<< " from osd." << peer
10159 << " tid " << tid
<< dendl
;
10161 MOSDSubOp
*subop
= new MOSDSubOp(
10162 rid
, pg_whoami
, spg_t(info
.pgid
.pgid
, peer
.shard
),
10163 oid
, CEPH_OSD_FLAG_ACK
,
10164 get_osdmap()->get_epoch(), tid
, v
);
10165 subop
->ops
= vector
<OSDOp
>(1);
10166 subop
->ops
[0].op
.op
= CEPH_OSD_OP_DELETE
;
10168 osd
->send_message_osd_cluster(peer
.osd
, subop
, get_osdmap()->get_epoch());
10172 void PrimaryLogPG::finish_degraded_object(const hobject_t
& oid
)
10174 dout(10) << "finish_degraded_object " << oid
<< dendl
;
10175 ObjectContextRef
obc(object_contexts
.lookup(oid
));
10176 if (callbacks_for_degraded_object
.count(oid
)) {
10177 list
<Context
*> contexts
;
10178 contexts
.swap(callbacks_for_degraded_object
[oid
]);
10179 callbacks_for_degraded_object
.erase(oid
);
10180 for (list
<Context
*>::iterator i
= contexts
.begin();
10181 i
!= contexts
.end();
10186 map
<hobject_t
, snapid_t
>::iterator i
= objects_blocked_on_degraded_snap
.find(
10188 if (i
!= objects_blocked_on_degraded_snap
.end() &&
10189 i
->second
== oid
.snap
)
10190 objects_blocked_on_degraded_snap
.erase(i
);
10193 void PrimaryLogPG::_committed_pushed_object(
10194 epoch_t epoch
, eversion_t last_complete
)
10197 if (!pg_has_reset_since(epoch
)) {
10198 dout(10) << "_committed_pushed_object last_complete " << last_complete
<< " now ondisk" << dendl
;
10199 last_complete_ondisk
= last_complete
;
10201 if (last_complete_ondisk
== info
.last_update
) {
10202 if (!is_primary()) {
10203 // Either we are a replica or backfill target.
10204 // we are fully up to date. tell the primary!
10205 osd
->send_message_osd_cluster(
10208 get_osdmap()->get_epoch(),
10209 spg_t(info
.pgid
.pgid
, get_primary().shard
),
10210 last_complete_ondisk
),
10211 get_osdmap()->get_epoch());
10213 calc_min_last_complete_ondisk();
10218 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl
;
10224 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc
)
10227 dout(10) << "_applied_recovered_object " << *obc
<< dendl
;
10229 assert(active_pushes
>= 1);
10232 // requeue an active chunky scrub waiting on recovery ops
10233 if (!deleting
&& active_pushes
== 0
10234 && scrubber
.is_chunky_scrub_active()) {
10235 if (ops_blocked_by_scrub()) {
10236 requeue_scrub(true);
10238 requeue_scrub(false);
10245 void PrimaryLogPG::_applied_recovered_object_replica()
10248 dout(10) << "_applied_recovered_object_replica" << dendl
;
10250 assert(active_pushes
>= 1);
10253 // requeue an active chunky scrub waiting on recovery ops
10254 if (!deleting
&& active_pushes
== 0 &&
10255 scrubber
.active_rep_scrub
&& static_cast<const MOSDRepScrub
*>(
10256 scrubber
.active_rep_scrub
->get_req())->chunky
) {
10259 PGQueueable(scrubber
.active_rep_scrub
, get_osdmap()->get_epoch()));
10260 scrubber
.active_rep_scrub
= OpRequestRef();
10266 void PrimaryLogPG::recover_got(hobject_t oid
, eversion_t v
)
10268 dout(10) << "got missing " << oid
<< " v " << v
<< dendl
;
10269 pg_log
.recover_got(oid
, v
, info
);
10270 if (pg_log
.get_log().complete_to
!= pg_log
.get_log().log
.end()) {
10271 dout(10) << "last_complete now " << info
.last_complete
10272 << " log.complete_to " << pg_log
.get_log().complete_to
->version
10275 dout(10) << "last_complete now " << info
.last_complete
10276 << " log.complete_to at end" << dendl
;
10277 //below is not true in the repair case.
10278 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10279 assert(info
.last_complete
== info
.last_update
);
10283 void PrimaryLogPG::primary_failed(const hobject_t
&soid
)
10285 list
<pg_shard_t
> fl
= { pg_whoami
};
10286 failed_push(fl
, soid
);
10289 void PrimaryLogPG::failed_push(const list
<pg_shard_t
> &from
, const hobject_t
&soid
)
10291 dout(20) << __func__
<< ": " << soid
<< dendl
;
10292 assert(recovering
.count(soid
));
10293 auto obc
= recovering
[soid
];
10295 list
<OpRequestRef
> blocked_ops
;
10296 obc
->drop_recovery_read(&blocked_ops
);
10297 requeue_ops(blocked_ops
);
10299 recovering
.erase(soid
);
10300 for (auto&& i
: from
)
10301 missing_loc
.remove_location(soid
, i
);
10302 dout(0) << __func__
<< " " << soid
<< " from shard " << from
10303 << ", reps on " << missing_loc
.get_locations(soid
)
10304 << " unfound? " << missing_loc
.is_unfound(soid
) << dendl
;
10305 finish_recovery_op(soid
); // close out this attempt,
10308 void PrimaryLogPG::sub_op_remove(OpRequestRef op
)
10310 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
10311 assert(m
->get_type() == MSG_OSD_SUBOP
);
10312 dout(7) << "sub_op_remove " << m
->poid
<< dendl
;
10314 op
->mark_started();
10316 ObjectStore::Transaction t
;
10317 remove_snap_mapped_object(t
, m
->poid
);
10318 int r
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
10322 eversion_t
PrimaryLogPG::pick_newest_available(const hobject_t
& oid
)
10325 pg_missing_item pmi
;
10326 bool is_missing
= pg_log
.get_missing().is_missing(oid
, &pmi
);
10327 assert(is_missing
);
10329 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " on osd." << osd
->whoami
<< " (local)" << dendl
;
10331 assert(!actingbackfill
.empty());
10332 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
10333 i
!= actingbackfill
.end();
10335 if (*i
== get_primary()) continue;
10336 pg_shard_t peer
= *i
;
10337 if (!peer_missing
[peer
].is_missing(oid
)) {
10340 eversion_t h
= peer_missing
[peer
].get_items().at(oid
).have
;
10341 dout(10) << "pick_newest_available " << oid
<< " " << h
<< " on osd." << peer
<< dendl
;
10346 dout(10) << "pick_newest_available " << oid
<< " " << v
<< " (newest)" << dendl
;
10350 void PrimaryLogPG::do_update_log_missing(OpRequestRef
&op
)
10352 const MOSDPGUpdateLogMissing
*m
= static_cast<const MOSDPGUpdateLogMissing
*>(
10354 assert(m
->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING
);
10355 ObjectStore::Transaction t
;
10356 append_log_entries_update_missing(m
->entries
, t
);
10358 Context
*complete
= new FunctionContext(
10360 const MOSDPGUpdateLogMissing
*msg
= static_cast<const MOSDPGUpdateLogMissing
*>(
10363 if (!pg_has_reset_since(msg
->get_epoch())) {
10364 MOSDPGUpdateLogMissingReply
*reply
=
10365 new MOSDPGUpdateLogMissingReply(
10366 spg_t(info
.pgid
.pgid
, primary_shard().shard
),
10371 reply
->set_priority(CEPH_MSG_PRIO_HIGH
);
10372 msg
->get_connection()->send_message(reply
);
10377 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_KRAKEN
) {
10378 t
.register_on_commit(complete
);
10380 /* Hack to work around the fact that ReplicatedBackend sends
10381 * ack+commit if commit happens first
10383 * This behavior is no longer necessary, but we preserve it so old
10384 * primaries can keep their repops in order */
10385 if (pool
.info
.ec_pool()) {
10386 t
.register_on_complete(complete
);
10388 t
.register_on_commit(complete
);
10391 t
.register_on_applied(
10392 new C_OSD_OnApplied
{this, get_osdmap()->get_epoch(), info
.last_update
});
10393 int tr
= osd
->store
->queue_transaction(
10400 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef
&op
)
10402 const MOSDPGUpdateLogMissingReply
*m
=
10403 static_cast<const MOSDPGUpdateLogMissingReply
*>(
10405 dout(20) << __func__
<< " got reply from "
10406 << m
->get_from() << dendl
;
10408 auto it
= log_entry_update_waiting_on
.find(m
->get_tid());
10409 if (it
!= log_entry_update_waiting_on
.end()) {
10410 if (it
->second
.waiting_on
.count(m
->get_from())) {
10411 it
->second
.waiting_on
.erase(m
->get_from());
10414 << info
.pgid
<< " got reply "
10415 << *m
<< " from shard we are not waiting for "
10419 if (it
->second
.waiting_on
.empty()) {
10420 repop_all_committed(it
->second
.repop
.get());
10421 log_entry_update_waiting_on
.erase(it
);
10425 << info
.pgid
<< " got reply "
10426 << *m
<< " on unknown tid " << m
->get_tid();
10430 /* Mark all unfound objects as lost.
10432 void PrimaryLogPG::mark_all_unfound_lost(
10437 dout(3) << __func__
<< " " << pg_log_entry_t::get_op_name(what
) << dendl
;
10438 list
<hobject_t
> oids
;
10440 dout(30) << __func__
<< ": log before:\n";
10441 pg_log
.get_log().print(*_dout
);
10444 mempool::osd_pglog::list
<pg_log_entry_t
> log_entries
;
10446 utime_t mtime
= ceph_clock_now();
10447 map
<hobject_t
, pg_missing_item
>::const_iterator m
=
10448 missing_loc
.get_needs_recovery().begin();
10449 map
<hobject_t
, pg_missing_item
>::const_iterator mend
=
10450 missing_loc
.get_needs_recovery().end();
10452 ObcLockManager manager
;
10453 eversion_t v
= get_next_version();
10454 v
.epoch
= get_osdmap()->get_epoch();
10455 uint64_t num_unfound
= missing_loc
.num_unfound();
10456 while (m
!= mend
) {
10457 const hobject_t
&oid(m
->first
);
10458 if (!missing_loc
.is_unfound(oid
)) {
10459 // We only care about unfound objects
10464 ObjectContextRef obc
;
10468 case pg_log_entry_t::LOST_MARK
:
10469 assert(0 == "actually, not implemented yet!");
10472 case pg_log_entry_t::LOST_REVERT
:
10473 prev
= pick_newest_available(oid
);
10474 if (prev
> eversion_t()) {
10477 pg_log_entry_t::LOST_REVERT
, oid
, v
,
10478 m
->second
.need
, 0, osd_reqid_t(), mtime
, 0);
10479 e
.reverting_to
= prev
;
10480 e
.mark_unrollbackable();
10481 log_entries
.push_back(e
);
10482 dout(10) << e
<< dendl
;
10484 // we are now missing the new version; recovery code will sort it out.
10490 case pg_log_entry_t::LOST_DELETE
:
10492 pg_log_entry_t
e(pg_log_entry_t::LOST_DELETE
, oid
, v
, m
->second
.need
,
10493 0, osd_reqid_t(), mtime
, 0);
10494 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_JEWEL
) {
10495 if (pool
.info
.require_rollback()) {
10496 e
.mod_desc
.try_rmobject(v
.version
);
10498 e
.mark_unrollbackable();
10500 } // otherwise, just do what we used to do
10501 dout(10) << e
<< dendl
;
10502 log_entries
.push_back(e
);
10503 oids
.push_back(oid
);
10515 info
.stats
.stats_invalid
= true;
10517 submit_log_entries(
10519 std::move(manager
),
10520 boost::optional
<std::function
<void(void)> >(
10521 [this, oids
, con
, num_unfound
, tid
]() {
10522 for (auto oid
: oids
)
10523 missing_loc
.recovered(oid
);
10524 for (auto& p
: waiting_for_unreadable_object
) {
10525 release_backoffs(p
.first
);
10527 requeue_object_waiters(waiting_for_unreadable_object
);
10531 ss
<< "pg has " << num_unfound
10532 << " objects unfound and apparently lost marking";
10533 string rs
= ss
.str();
10534 dout(0) << "do_command r=" << 0 << " " << rs
<< dendl
;
10535 osd
->clog
->info() << rs
;
10537 MCommandReply
*reply
= new MCommandReply(0, rs
);
10538 reply
->set_tid(tid
);
10539 con
->send_message(reply
);
10545 void PrimaryLogPG::_split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
10547 assert(repop_queue
.empty());
10551 * pg status change notification
10554 void PrimaryLogPG::apply_and_flush_repops(bool requeue
)
10556 list
<OpRequestRef
> rq
;
10558 // apply all repops
10559 while (!repop_queue
.empty()) {
10560 RepGather
*repop
= repop_queue
.front();
10561 repop_queue
.pop_front();
10562 dout(10) << " canceling repop tid " << repop
->rep_tid
<< dendl
;
10563 repop
->rep_aborted
= true;
10564 repop
->on_applied
.clear();
10565 repop
->on_committed
.clear();
10566 repop
->on_success
.clear();
10570 dout(10) << " requeuing " << *repop
->op
->get_req() << dendl
;
10571 rq
.push_back(repop
->op
);
10572 repop
->op
= OpRequestRef();
10575 // also requeue any dups, interleaved into position
10576 map
<eversion_t
, list
<pair
<OpRequestRef
, version_t
> > >::iterator p
=
10577 waiting_for_ondisk
.find(repop
->v
);
10578 if (p
!= waiting_for_ondisk
.end()) {
10579 dout(10) << " also requeuing ondisk waiters " << p
->second
<< dendl
;
10580 for (list
<pair
<OpRequestRef
, version_t
> >::iterator i
=
10582 i
!= p
->second
.end();
10584 rq
.push_back(i
->first
);
10586 waiting_for_ondisk
.erase(p
);
10590 remove_repop(repop
);
10593 assert(repop_queue
.empty());
10597 if (!waiting_for_ondisk
.empty()) {
10598 for (map
<eversion_t
, list
<pair
<OpRequestRef
, version_t
> > >::iterator i
=
10599 waiting_for_ondisk
.begin();
10600 i
!= waiting_for_ondisk
.end();
10602 for (list
<pair
<OpRequestRef
, version_t
> >::iterator j
=
10604 j
!= i
->second
.end();
10606 derr
<< __func__
<< ": op " << *(j
->first
->get_req()) << " waiting on "
10607 << i
->first
<< dendl
;
10610 assert(waiting_for_ondisk
.empty());
10614 waiting_for_ondisk
.clear();
10617 void PrimaryLogPG::on_flushed()
10619 assert(flushes_in_progress
> 0);
10620 flushes_in_progress
--;
10621 if (flushes_in_progress
== 0) {
10622 requeue_ops(waiting_for_peered
);
10624 if (!is_peered() || !is_primary()) {
10625 pair
<hobject_t
, ObjectContextRef
> i
;
10626 while (object_contexts
.get_next(i
.first
, &i
)) {
10627 derr
<< "on_flushed: object " << i
.first
<< " obc still alive" << dendl
;
10629 assert(object_contexts
.empty());
10631 pgbackend
->on_flushed();
10634 void PrimaryLogPG::on_removal(ObjectStore::Transaction
*t
)
10636 dout(10) << "on_removal" << dendl
;
10638 // adjust info to backfill
10639 info
.set_last_backfill(hobject_t());
10640 pg_log
.reset_backfill();
10645 PGLogEntryHandler rollbacker
{this, t
};
10646 pg_log
.roll_forward(&rollbacker
);
10648 write_if_dirty(*t
);
10654 void PrimaryLogPG::on_shutdown()
10656 dout(10) << "on_shutdown" << dendl
;
10658 // remove from queues
10659 osd
->pg_stat_queue_dequeue(this);
10660 osd
->peering_wq
.dequeue(this);
10662 // handles queue races
10665 if (recovery_queued
) {
10666 recovery_queued
= false;
10667 osd
->clear_queued_recovery(this);
10670 clear_scrub_reserved();
10671 scrub_clear_state();
10673 unreg_next_scrub();
10674 cancel_copy_ops(false);
10675 cancel_flush_ops(false);
10676 cancel_proxy_ops(false);
10677 apply_and_flush_repops(false);
10678 cancel_log_updates();
10679 // we must remove PGRefs, so do this this prior to release_backoffs() callers
10681 // clean up snap trim references
10682 snap_trimmer_machine
.process_event(Reset());
10684 pgbackend
->on_change();
10686 context_registry_on_change();
10687 object_contexts
.clear();
10689 osd
->remote_reserver
.cancel_reservation(info
.pgid
);
10690 osd
->local_reserver
.cancel_reservation(info
.pgid
);
10692 clear_primary_state();
10696 void PrimaryLogPG::on_activate()
10699 if (needs_recovery()) {
10700 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl
;
10701 queue_peering_event(
10703 std::make_shared
<CephPeeringEvt
>(
10704 get_osdmap()->get_epoch(),
10705 get_osdmap()->get_epoch(),
10707 } else if (needs_backfill()) {
10708 dout(10) << "activate queueing backfill" << dendl
;
10709 queue_peering_event(
10711 std::make_shared
<CephPeeringEvt
>(
10712 get_osdmap()->get_epoch(),
10713 get_osdmap()->get_epoch(),
10714 RequestBackfill())));
10716 dout(10) << "activate all replicas clean, no recovery" << dendl
;
10717 eio_errors_to_process
= false;
10718 queue_peering_event(
10720 std::make_shared
<CephPeeringEvt
>(
10721 get_osdmap()->get_epoch(),
10722 get_osdmap()->get_epoch(),
10723 AllReplicasRecovered())));
10726 publish_stats_to_osd();
10728 if (!backfill_targets
.empty()) {
10729 last_backfill_started
= earliest_backfill();
10730 new_backfill
= true;
10731 assert(!last_backfill_started
.is_max());
10732 dout(5) << "on activate: bft=" << backfill_targets
10733 << " from " << last_backfill_started
<< dendl
;
10734 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
10735 i
!= backfill_targets
.end();
10737 dout(5) << "target shard " << *i
10738 << " from " << peer_info
[*i
].last_backfill
10747 void PrimaryLogPG::_on_new_interval()
10751 void PrimaryLogPG::on_change(ObjectStore::Transaction
*t
)
10753 dout(10) << "on_change" << dendl
;
10755 if (hit_set
&& hit_set
->insert_count() == 0) {
10756 dout(20) << " discarding empty hit_set" << dendl
;
10760 if (recovery_queued
) {
10761 recovery_queued
= false;
10762 osd
->clear_queued_recovery(this);
10765 // requeue everything in the reverse order they should be
10767 requeue_ops(waiting_for_peered
);
10768 requeue_ops(waiting_for_active
);
10770 clear_scrub_reserved();
10772 cancel_copy_ops(is_primary());
10773 cancel_flush_ops(is_primary());
10774 cancel_proxy_ops(is_primary());
10776 // requeue object waiters
10777 for (auto& p
: waiting_for_unreadable_object
) {
10778 release_backoffs(p
.first
);
10780 if (is_primary()) {
10781 requeue_object_waiters(waiting_for_unreadable_object
);
10783 waiting_for_unreadable_object
.clear();
10785 for (map
<hobject_t
,list
<OpRequestRef
>>::iterator p
= waiting_for_degraded_object
.begin();
10786 p
!= waiting_for_degraded_object
.end();
10787 waiting_for_degraded_object
.erase(p
++)) {
10788 release_backoffs(p
->first
);
10790 requeue_ops(p
->second
);
10793 finish_degraded_object(p
->first
);
10796 // requeues waiting_for_scrub
10797 scrub_clear_state();
10799 for (auto p
= waiting_for_blocked_object
.begin();
10800 p
!= waiting_for_blocked_object
.end();
10801 waiting_for_blocked_object
.erase(p
++)) {
10803 requeue_ops(p
->second
);
10807 for (auto i
= callbacks_for_degraded_object
.begin();
10808 i
!= callbacks_for_degraded_object
.end();
10810 finish_degraded_object((i
++)->first
);
10812 assert(callbacks_for_degraded_object
.empty());
10814 if (is_primary()) {
10815 requeue_ops(waiting_for_cache_not_full
);
10817 waiting_for_cache_not_full
.clear();
10819 objects_blocked_on_cache_full
.clear();
10821 for (list
<pair
<OpRequestRef
, OpContext
*> >::iterator i
=
10822 in_progress_async_reads
.begin();
10823 i
!= in_progress_async_reads
.end();
10824 in_progress_async_reads
.erase(i
++)) {
10825 close_op_ctx(i
->second
);
10827 requeue_op(i
->first
);
10830 // this will requeue ops we were working on but didn't finish, and
10832 apply_and_flush_repops(is_primary());
10833 cancel_log_updates();
10835 // do this *after* apply_and_flush_repops so that we catch any newly
10836 // registered watches.
10837 context_registry_on_change();
10839 pgbackend
->on_change_cleanup(t
);
10840 scrubber
.cleanup_store(t
);
10841 pgbackend
->on_change();
10843 // clear snap_trimmer state
10844 snap_trimmer_machine
.process_event(Reset());
10846 debug_op_order
.clear();
10847 unstable_stats
.clear();
10849 // we don't want to cache object_contexts through the interval change
10850 // NOTE: we actually assert that all currently live references are dead
10851 // by the time the flush for the next interval completes.
10852 object_contexts
.clear();
10854 // should have been cleared above by finishing all of the degraded objects
10855 assert(objects_blocked_on_degraded_snap
.empty());
10858 void PrimaryLogPG::on_role_change()
10860 dout(10) << "on_role_change" << dendl
;
10861 if (get_role() != 0 && hit_set
) {
10862 dout(10) << " clearing hit set" << dendl
;
10867 void PrimaryLogPG::on_pool_change()
10869 dout(10) << __func__
<< dendl
;
10870 // requeue cache full waiters just in case the cache_mode is
10871 // changing away from writeback mode. note that if we are not
10872 // active the normal requeuing machinery is sufficient (and properly
10875 pool
.info
.cache_mode
!= pg_pool_t::CACHEMODE_WRITEBACK
&&
10876 !waiting_for_cache_not_full
.empty()) {
10877 dout(10) << __func__
<< " requeuing full waiters (not in writeback) "
10879 requeue_ops(waiting_for_cache_not_full
);
10880 objects_blocked_on_cache_full
.clear();
10886 // clear state. called on recovery completion AND cancellation.
10887 void PrimaryLogPG::_clear_recovery_state()
10889 missing_loc
.clear();
10890 #ifdef DEBUG_RECOVERY_OIDS
10891 recovering_oids
.clear();
10893 last_backfill_started
= hobject_t();
10894 set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
10895 while (i
!= backfills_in_flight
.end()) {
10896 assert(recovering
.count(*i
));
10897 backfills_in_flight
.erase(i
++);
10900 list
<OpRequestRef
> blocked_ops
;
10901 for (map
<hobject_t
, ObjectContextRef
>::iterator i
= recovering
.begin();
10902 i
!= recovering
.end();
10903 recovering
.erase(i
++)) {
10905 i
->second
->drop_recovery_read(&blocked_ops
);
10906 requeue_ops(blocked_ops
);
10909 assert(backfills_in_flight
.empty());
10910 pending_backfill_updates
.clear();
10911 assert(recovering
.empty());
10912 pgbackend
->clear_recovery_state();
10915 void PrimaryLogPG::cancel_pull(const hobject_t
&soid
)
10917 dout(20) << __func__
<< ": " << soid
<< dendl
;
10918 assert(recovering
.count(soid
));
10919 ObjectContextRef obc
= recovering
[soid
];
10921 list
<OpRequestRef
> blocked_ops
;
10922 obc
->drop_recovery_read(&blocked_ops
);
10923 requeue_ops(blocked_ops
);
10925 recovering
.erase(soid
);
10926 finish_recovery_op(soid
);
10927 release_backoffs(soid
);
10928 if (waiting_for_degraded_object
.count(soid
)) {
10929 dout(20) << " kicking degraded waiters on " << soid
<< dendl
;
10930 requeue_ops(waiting_for_degraded_object
[soid
]);
10931 waiting_for_degraded_object
.erase(soid
);
10933 if (waiting_for_unreadable_object
.count(soid
)) {
10934 dout(20) << " kicking unreadable waiters on " << soid
<< dendl
;
10935 requeue_ops(waiting_for_unreadable_object
[soid
]);
10936 waiting_for_unreadable_object
.erase(soid
);
10938 if (is_missing_object(soid
))
10939 pg_log
.set_last_requested(0); // get recover_primary to start over
10940 finish_degraded_object(soid
);
10943 void PrimaryLogPG::check_recovery_sources(const OSDMapRef
& osdmap
)
10946 * check that any peers we are planning to (or currently) pulling
10947 * objects from are dealt with.
10949 missing_loc
.check_recovery_sources(osdmap
);
10950 pgbackend
->check_recovery_sources(osdmap
);
10952 for (set
<pg_shard_t
>::iterator i
= peer_log_requested
.begin();
10953 i
!= peer_log_requested
.end();
10955 if (!osdmap
->is_up(i
->osd
)) {
10956 dout(10) << "peer_log_requested removing " << *i
<< dendl
;
10957 peer_log_requested
.erase(i
++);
10963 for (set
<pg_shard_t
>::iterator i
= peer_missing_requested
.begin();
10964 i
!= peer_missing_requested
.end();
10966 if (!osdmap
->is_up(i
->osd
)) {
10967 dout(10) << "peer_missing_requested removing " << *i
<< dendl
;
10968 peer_missing_requested
.erase(i
++);
10975 void PG::MissingLoc::check_recovery_sources(const OSDMapRef
& osdmap
)
10977 set
<pg_shard_t
> now_down
;
10978 for (set
<pg_shard_t
>::iterator p
= missing_loc_sources
.begin();
10979 p
!= missing_loc_sources
.end();
10981 if (osdmap
->is_up(p
->osd
)) {
10985 ldout(pg
->cct
, 10) << "check_recovery_sources source osd." << *p
<< " now down" << dendl
;
10986 now_down
.insert(*p
);
10987 missing_loc_sources
.erase(p
++);
10990 if (now_down
.empty()) {
10991 ldout(pg
->cct
, 10) << "check_recovery_sources no source osds (" << missing_loc_sources
<< ") went down" << dendl
;
10993 ldout(pg
->cct
, 10) << "check_recovery_sources sources osds " << now_down
<< " now down, remaining sources are "
10994 << missing_loc_sources
<< dendl
;
10996 // filter missing_loc
10997 map
<hobject_t
, set
<pg_shard_t
>>::iterator p
= missing_loc
.begin();
10998 while (p
!= missing_loc
.end()) {
10999 set
<pg_shard_t
>::iterator q
= p
->second
.begin();
11000 while (q
!= p
->second
.end())
11001 if (now_down
.count(*q
)) {
11002 p
->second
.erase(q
++);
11006 if (p
->second
.empty())
11007 missing_loc
.erase(p
++);
11015 bool PrimaryLogPG::start_recovery_ops(
11017 ThreadPool::TPHandle
&handle
,
11018 uint64_t *ops_started
)
11020 uint64_t& started
= *ops_started
;
11022 bool work_in_progress
= false;
11023 assert(is_primary());
11025 if (!state_test(PG_STATE_RECOVERING
) &&
11026 !state_test(PG_STATE_BACKFILL
)) {
11027 /* TODO: I think this case is broken and will make do_recovery()
11028 * unhappy since we're returning false */
11029 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl
;
11033 const pg_missing_t
&missing
= pg_log
.get_missing();
11035 unsigned int num_missing
= missing
.num_missing();
11036 uint64_t num_unfound
= get_num_unfound();
11038 if (num_missing
== 0) {
11039 info
.last_complete
= info
.last_update
;
11042 if (num_missing
== num_unfound
) {
11043 // All of the missing objects we have are unfound.
11044 // Recover the replicas.
11045 started
= recover_replicas(max
, handle
);
11048 // We still have missing objects that we should grab from replicas.
11049 started
+= recover_primary(max
, handle
);
11051 if (!started
&& num_unfound
!= get_num_unfound()) {
11052 // second chance to recovery replicas
11053 started
= recover_replicas(max
, handle
);
11057 work_in_progress
= true;
11059 bool deferred_backfill
= false;
11060 if (recovering
.empty() &&
11061 state_test(PG_STATE_BACKFILL
) &&
11062 !backfill_targets
.empty() && started
< max
&&
11063 missing
.num_missing() == 0 &&
11064 waiting_on_backfill
.empty()) {
11065 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
)) {
11066 dout(10) << "deferring backfill due to NOBACKFILL" << dendl
;
11067 deferred_backfill
= true;
11068 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) &&
11070 dout(10) << "deferring backfill due to NOREBALANCE" << dendl
;
11071 deferred_backfill
= true;
11072 } else if (!backfill_reserved
) {
11073 dout(10) << "deferring backfill due to !backfill_reserved" << dendl
;
11074 if (!backfill_reserving
) {
11075 dout(10) << "queueing RequestBackfill" << dendl
;
11076 backfill_reserving
= true;
11077 queue_peering_event(
11079 std::make_shared
<CephPeeringEvt
>(
11080 get_osdmap()->get_epoch(),
11081 get_osdmap()->get_epoch(),
11082 RequestBackfill())));
11084 deferred_backfill
= true;
11086 started
+= recover_backfill(max
- started
, handle
, &work_in_progress
);
11090 dout(10) << " started " << started
<< dendl
;
11091 osd
->logger
->inc(l_osd_rop
, started
);
11093 if (!recovering
.empty() ||
11094 work_in_progress
|| recovery_ops_active
> 0 || deferred_backfill
)
11095 return work_in_progress
;
11097 assert(recovering
.empty());
11098 assert(recovery_ops_active
== 0);
11100 dout(10) << __func__
<< " needs_recovery: "
11101 << missing_loc
.get_needs_recovery()
11103 dout(10) << __func__
<< " missing_loc: "
11104 << missing_loc
.get_missing_locs()
11106 int unfound
= get_num_unfound();
11108 dout(10) << " still have " << unfound
<< " unfound" << dendl
;
11109 return work_in_progress
;
11112 if (missing
.num_missing() > 0) {
11113 // this shouldn't happen!
11114 osd
->clog
->error() << info
.pgid
<< " recovery ending with " << missing
.num_missing()
11115 << ": " << missing
.get_items();
11116 return work_in_progress
;
11119 if (needs_recovery()) {
11120 // this shouldn't happen!
11121 // We already checked num_missing() so we must have missing replicas
11122 osd
->clog
->error() << info
.pgid
<< " recovery ending with missing replicas";
11123 return work_in_progress
;
11126 if (state_test(PG_STATE_RECOVERING
)) {
11127 state_clear(PG_STATE_RECOVERING
);
11128 if (needs_backfill()) {
11129 dout(10) << "recovery done, queuing backfill" << dendl
;
11130 queue_peering_event(
11132 std::make_shared
<CephPeeringEvt
>(
11133 get_osdmap()->get_epoch(),
11134 get_osdmap()->get_epoch(),
11135 RequestBackfill())));
11137 dout(10) << "recovery done, no backfill" << dendl
;
11138 eio_errors_to_process
= false;
11139 queue_peering_event(
11141 std::make_shared
<CephPeeringEvt
>(
11142 get_osdmap()->get_epoch(),
11143 get_osdmap()->get_epoch(),
11144 AllReplicasRecovered())));
11146 } else { // backfilling
11147 state_clear(PG_STATE_BACKFILL
);
11148 dout(10) << "recovery done, backfill done" << dendl
;
11149 eio_errors_to_process
= false;
11150 queue_peering_event(
11152 std::make_shared
<CephPeeringEvt
>(
11153 get_osdmap()->get_epoch(),
11154 get_osdmap()->get_epoch(),
11162 * do one recovery op.
11163 * return true if done, false if nothing left to do.
11165 uint64_t PrimaryLogPG::recover_primary(uint64_t max
, ThreadPool::TPHandle
&handle
)
11167 assert(is_primary());
11169 const pg_missing_t
&missing
= pg_log
.get_missing();
11171 dout(10) << "recover_primary recovering " << recovering
.size()
11172 << " in pg" << dendl
;
11173 dout(10) << "recover_primary " << missing
<< dendl
;
11174 dout(25) << "recover_primary " << missing
.get_items() << dendl
;
11177 pg_log_entry_t
*latest
= 0;
11178 unsigned started
= 0;
11181 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11182 map
<version_t
, hobject_t
>::const_iterator p
=
11183 missing
.get_rmissing().lower_bound(pg_log
.get_log().last_requested
);
11184 while (p
!= missing
.get_rmissing().end()) {
11185 handle
.reset_tp_timeout();
11187 version_t v
= p
->first
;
11189 if (pg_log
.get_log().objects
.count(p
->second
)) {
11190 latest
= pg_log
.get_log().objects
.find(p
->second
)->second
;
11191 assert(latest
->is_update());
11192 soid
= latest
->soid
;
11197 const pg_missing_item
& item
= missing
.get_items().find(p
->second
)->second
;
11200 hobject_t head
= soid
.get_head();
11202 eversion_t need
= item
.need
;
11204 dout(10) << "recover_primary "
11205 << soid
<< " " << item
.need
11206 << (missing
.is_missing(soid
) ? " (missing)":"")
11207 << (missing
.is_missing(head
) ? " (missing head)":"")
11208 << (recovering
.count(soid
) ? " (recovering)":"")
11209 << (recovering
.count(head
) ? " (recovering head)":"")
11213 switch (latest
->op
) {
11214 case pg_log_entry_t::CLONE
:
11216 * Handling for this special case removed for now, until we
11217 * can correctly construct an accurate SnapSet from the old
11222 case pg_log_entry_t::LOST_REVERT
:
11224 if (item
.have
== latest
->reverting_to
) {
11225 ObjectContextRef obc
= get_object_context(soid
, true);
11227 if (obc
->obs
.oi
.version
== latest
->version
) {
11228 // I'm already reverting
11229 dout(10) << " already reverting " << soid
<< dendl
;
11231 dout(10) << " reverting " << soid
<< " to " << latest
->prior_version
<< dendl
;
11232 obc
->ondisk_write_lock();
11233 obc
->obs
.oi
.version
= latest
->version
;
11235 ObjectStore::Transaction t
;
11237 obc
->obs
.oi
.encode(
11239 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
11240 assert(!pool
.info
.require_rollback());
11241 t
.setattr(coll
, ghobject_t(soid
), OI_ATTR
, b2
);
11243 recover_got(soid
, latest
->version
);
11244 missing_loc
.add_location(soid
, pg_whoami
);
11248 osd
->store
->queue_transaction(osr
.get(), std::move(t
),
11249 new C_OSD_AppliedRecoveredObject(this, obc
),
11250 new C_OSD_CommittedPushedObject(
11252 get_osdmap()->get_epoch(),
11253 info
.last_complete
),
11254 new C_OSD_OndiskWriteUnlock(obc
));
11259 * Pull the old version of the object. Update missing_loc here to have the location
11260 * of the version we want.
11262 * This doesn't use the usual missing_loc paths, but that's okay:
11263 * - if we have it locally, we hit the case above, and go from there.
11264 * - if we don't, we always pass through this case during recovery and set up the location
11266 * - this way we don't need to mangle the missing code to be general about needing an old
11269 eversion_t alternate_need
= latest
->reverting_to
;
11270 dout(10) << " need to pull prior_version " << alternate_need
<< " for revert " << item
<< dendl
;
11272 for (map
<pg_shard_t
, pg_missing_t
>::iterator p
= peer_missing
.begin();
11273 p
!= peer_missing
.end();
11275 if (p
->second
.is_missing(soid
, need
) &&
11276 p
->second
.get_items().at(soid
).have
== alternate_need
) {
11277 missing_loc
.add_location(soid
, p
->first
);
11279 dout(10) << " will pull " << alternate_need
<< " or " << need
11280 << " from one of " << missing_loc
.get_locations(soid
)
11288 if (!recovering
.count(soid
)) {
11289 if (recovering
.count(head
)) {
11292 int r
= recover_missing(
11293 soid
, need
, get_recovery_op_priority(), h
);
11306 if (started
>= max
)
11311 // only advance last_requested if we haven't skipped anything
11313 pg_log
.set_last_requested(v
);
11316 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11320 bool PrimaryLogPG::primary_error(
11321 const hobject_t
& soid
, eversion_t v
)
11323 pg_log
.missing_add(soid
, v
, eversion_t());
11324 pg_log
.set_last_requested(0);
11325 missing_loc
.remove_location(soid
, pg_whoami
);
11327 assert(!actingbackfill
.empty());
11328 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
11329 i
!= actingbackfill
.end();
11331 if (*i
== get_primary()) continue;
11332 pg_shard_t peer
= *i
;
11333 if (!peer_missing
[peer
].is_missing(soid
, v
)) {
11334 missing_loc
.add_location(soid
, peer
);
11335 dout(10) << info
.pgid
<< " unexpectedly missing " << soid
<< " v" << v
11336 << ", there should be a copy on shard " << peer
<< dendl
;
11341 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
<< ", unfound";
11343 osd
->clog
->error() << info
.pgid
<< " missing primary copy of " << soid
11344 << ", will try copies on " << missing_loc
.get_locations(soid
);
11348 int PrimaryLogPG::prep_object_replica_pushes(
11349 const hobject_t
& soid
, eversion_t v
,
11350 PGBackend::RecoveryHandle
*h
)
11352 assert(is_primary());
11353 dout(10) << __func__
<< ": on " << soid
<< dendl
;
11355 // NOTE: we know we will get a valid oloc off of disk here.
11356 ObjectContextRef obc
= get_object_context(soid
, false);
11358 primary_error(soid
, v
);
11362 if (!obc
->get_recovery_read()) {
11363 dout(20) << "recovery delayed on " << soid
11364 << "; could not get rw_manager lock" << dendl
;
11367 dout(20) << "recovery got recovery read lock on " << soid
11371 start_recovery_op(soid
);
11372 assert(!recovering
.count(soid
));
11373 recovering
.insert(make_pair(soid
, obc
));
11375 /* We need this in case there is an in progress write on the object. In fact,
11376 * the only possible write is an update to the xattr due to a lost_revert --
11377 * a client write would be blocked since the object is degraded.
11378 * In almost all cases, therefore, this lock should be uncontended.
11380 obc
->ondisk_read_lock();
11381 int r
= pgbackend
->recover_object(
11384 ObjectContextRef(),
11385 obc
, // has snapset context
11387 obc
->ondisk_read_unlock();
11389 dout(0) << __func__
<< " Error " << r
<< " on oid " << soid
<< dendl
;
11390 primary_failed(soid
);
11391 primary_error(soid
, v
);
11397 uint64_t PrimaryLogPG::recover_replicas(uint64_t max
, ThreadPool::TPHandle
&handle
)
11399 dout(10) << __func__
<< "(" << max
<< ")" << dendl
;
11400 uint64_t started
= 0;
11402 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11404 // this is FAR from an optimal recovery order. pretty lame, really.
11405 assert(!actingbackfill
.empty());
11406 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
11407 i
!= actingbackfill
.end();
11409 if (*i
== get_primary()) continue;
11410 pg_shard_t peer
= *i
;
11411 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
11412 assert(pm
!= peer_missing
.end());
11413 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
11414 assert(pi
!= peer_info
.end());
11415 size_t m_sz
= pm
->second
.num_missing();
11417 dout(10) << " peer osd." << peer
<< " missing " << m_sz
<< " objects." << dendl
;
11418 dout(20) << " peer osd." << peer
<< " missing " << pm
->second
.get_items() << dendl
;
11421 const pg_missing_t
&m(pm
->second
);
11422 for (map
<version_t
, hobject_t
>::const_iterator p
= m
.get_rmissing().begin();
11423 p
!= m
.get_rmissing().end() && started
< max
;
11425 handle
.reset_tp_timeout();
11426 const hobject_t
soid(p
->second
);
11428 if (missing_loc
.is_unfound(soid
)) {
11429 dout(10) << __func__
<< ": " << soid
<< " still unfound" << dendl
;
11433 if (soid
> pi
->second
.last_backfill
) {
11434 if (!recovering
.count(soid
)) {
11435 derr
<< __func__
<< ": object " << soid
<< " last_backfill " << pi
->second
.last_backfill
<< dendl
;
11436 derr
<< __func__
<< ": object added to missing set for backfill, but "
11437 << "is not in recovering, error!" << dendl
;
11443 if (recovering
.count(soid
)) {
11444 dout(10) << __func__
<< ": already recovering " << soid
<< dendl
;
11448 if (soid
.is_snap() && pg_log
.get_missing().is_missing(soid
.get_head())) {
11449 dout(10) << __func__
<< ": " << soid
.get_head()
11450 << " still missing on primary" << dendl
;
11454 if (soid
.is_snap() && pg_log
.get_missing().is_missing(soid
.get_snapdir())) {
11455 dout(10) << __func__
<< ": " << soid
.get_snapdir()
11456 << " still missing on primary" << dendl
;
11460 if (pg_log
.get_missing().is_missing(soid
)) {
11461 dout(10) << __func__
<< ": " << soid
<< " still missing on primary" << dendl
;
11465 dout(10) << __func__
<< ": recover_object_replicas(" << soid
<< ")" << dendl
;
11466 map
<hobject_t
,pg_missing_item
>::const_iterator r
= m
.get_items().find(soid
);
11467 started
+= prep_object_replica_pushes(soid
, r
->second
.need
,
11472 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11476 hobject_t
PrimaryLogPG::earliest_peer_backfill() const
11478 hobject_t e
= hobject_t::get_max();
11479 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
11480 i
!= backfill_targets
.end();
11482 pg_shard_t peer
= *i
;
11483 map
<pg_shard_t
, BackfillInterval
>::const_iterator iter
=
11484 peer_backfill_info
.find(peer
);
11485 assert(iter
!= peer_backfill_info
.end());
11486 if (iter
->second
.begin
< e
)
11487 e
= iter
->second
.begin
;
11492 bool PrimaryLogPG::all_peer_done() const
11494 // Primary hasn't got any more objects
11495 assert(backfill_info
.empty());
11497 for (set
<pg_shard_t
>::const_iterator i
= backfill_targets
.begin();
11498 i
!= backfill_targets
.end();
11500 pg_shard_t bt
= *i
;
11501 map
<pg_shard_t
, BackfillInterval
>::const_iterator piter
=
11502 peer_backfill_info
.find(bt
);
11503 assert(piter
!= peer_backfill_info
.end());
11504 const BackfillInterval
& pbi
= piter
->second
;
11505 // See if peer has more to process
11506 if (!pbi
.extends_to_end() || !pbi
.empty())
11517 * backfilled: fully pushed to replica or present in replica's missing set (both
11518 * our copy and theirs).
11520 * All objects on a backfill_target in
11521 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11522 * objects have been actually deleted and all logically-valid objects are replicated.
11523 * There may be PG objects in this interval yet to be backfilled.
11525 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11526 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11528 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11529 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11530 * interval remain on the backfill target.
11532 * For a backfill target, all objects <= peer_info[target].last_backfill
11533 * have been backfilled to target
11535 * There *MAY* be missing/outdated objects between last_backfill_started and
11536 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11537 * io created objects since the last scan. For this reason, we call
11538 * update_range() again before continuing backfill.
11540 uint64_t PrimaryLogPG::recover_backfill(
11542 ThreadPool::TPHandle
&handle
, bool *work_started
)
11544 dout(10) << "recover_backfill (" << max
<< ")"
11545 << " bft=" << backfill_targets
11546 << " last_backfill_started " << last_backfill_started
11547 << (new_backfill
? " new_backfill":"")
11549 assert(!backfill_targets
.empty());
11551 // Initialize from prior backfill state
11552 if (new_backfill
) {
11553 // on_activate() was called prior to getting here
11554 assert(last_backfill_started
== earliest_backfill());
11555 new_backfill
= false;
11557 // initialize BackfillIntervals
11558 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11559 i
!= backfill_targets
.end();
11561 peer_backfill_info
[*i
].reset(peer_info
[*i
].last_backfill
);
11563 backfill_info
.reset(last_backfill_started
);
11565 backfills_in_flight
.clear();
11566 pending_backfill_updates
.clear();
11569 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11570 i
!= backfill_targets
.end();
11572 dout(10) << "peer osd." << *i
11573 << " info " << peer_info
[*i
]
11574 << " interval " << peer_backfill_info
[*i
].begin
11575 << "-" << peer_backfill_info
[*i
].end
11576 << " " << peer_backfill_info
[*i
].objects
.size() << " objects"
11580 // update our local interval to cope with recent changes
11581 backfill_info
.begin
= last_backfill_started
;
11582 update_range(&backfill_info
, handle
);
11585 vector
<boost::tuple
<hobject_t
, eversion_t
, pg_shard_t
> > to_remove
;
11586 set
<hobject_t
> add_to_stat
;
11588 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11589 i
!= backfill_targets
.end();
11591 peer_backfill_info
[*i
].trim_to(
11592 std::max(peer_info
[*i
].last_backfill
, last_backfill_started
));
11594 backfill_info
.trim_to(last_backfill_started
);
11596 PGBackend::RecoveryHandle
*h
= pgbackend
->open_recovery_op();
11597 while (ops
< max
) {
11598 if (backfill_info
.begin
<= earliest_peer_backfill() &&
11599 !backfill_info
.extends_to_end() && backfill_info
.empty()) {
11600 hobject_t next
= backfill_info
.end
;
11601 backfill_info
.reset(next
);
11602 backfill_info
.end
= hobject_t::get_max();
11603 update_range(&backfill_info
, handle
);
11604 backfill_info
.trim();
11607 dout(20) << " my backfill interval " << backfill_info
<< dendl
;
11609 bool sent_scan
= false;
11610 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11611 i
!= backfill_targets
.end();
11613 pg_shard_t bt
= *i
;
11614 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11616 dout(20) << " peer shard " << bt
<< " backfill " << pbi
<< dendl
;
11617 if (pbi
.begin
<= backfill_info
.begin
&&
11618 !pbi
.extends_to_end() && pbi
.empty()) {
11619 dout(10) << " scanning peer osd." << bt
<< " from " << pbi
.end
<< dendl
;
11620 epoch_t e
= get_osdmap()->get_epoch();
11621 MOSDPGScan
*m
= new MOSDPGScan(
11622 MOSDPGScan::OP_SCAN_GET_DIGEST
, pg_whoami
, e
, last_peering_reset
,
11623 spg_t(info
.pgid
.pgid
, bt
.shard
),
11624 pbi
.end
, hobject_t());
11625 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap()->get_epoch());
11626 assert(waiting_on_backfill
.find(bt
) == waiting_on_backfill
.end());
11627 waiting_on_backfill
.insert(bt
);
11632 // Count simultaneous scans as a single op and let those complete
11635 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11639 if (backfill_info
.empty() && all_peer_done()) {
11640 dout(10) << " reached end for both local and all peers" << dendl
;
11644 // Get object within set of peers to operate on and
11645 // the set of targets for which that object applies.
11646 hobject_t check
= earliest_peer_backfill();
11648 if (check
< backfill_info
.begin
) {
11650 set
<pg_shard_t
> check_targets
;
11651 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11652 i
!= backfill_targets
.end();
11654 pg_shard_t bt
= *i
;
11655 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11656 if (pbi
.begin
== check
)
11657 check_targets
.insert(bt
);
11659 assert(!check_targets
.empty());
11661 dout(20) << " BACKFILL removing " << check
11662 << " from peers " << check_targets
<< dendl
;
11663 for (set
<pg_shard_t
>::iterator i
= check_targets
.begin();
11664 i
!= check_targets
.end();
11666 pg_shard_t bt
= *i
;
11667 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11668 assert(pbi
.begin
== check
);
11670 to_remove
.push_back(boost::make_tuple(check
, pbi
.objects
.begin()->second
, bt
));
11674 /* This requires a bit of explanation. We compare head against
11675 * last_backfill to determine whether to send an operation
11676 * to the replica. A single write operation can touch up to three
11677 * objects: head, the snapdir, and a new clone which sorts closer to
11678 * head than any existing clone. If last_backfill points at a clone,
11679 * the transaction won't be sent and all 3 must lie on the right side
11680 * of the line (i.e., we'll backfill them later). If last_backfill
11681 * points at snapdir, it sorts greater than head, so we send the
11682 * transaction which is correct because all three must lie to the left
11685 * If it points at head, we have a bit of an issue. If head actually
11686 * exists, no problem, because any transaction which touches snapdir
11687 * must end up creating it (and deleting head), so sending the
11688 * operation won't pose a problem -- we'll end up having to scan it,
11689 * but it'll end up being the right version so we won't bother to
11690 * rebackfill it. However, if head doesn't exist, any write on head
11691 * will remove snapdir. For a replicated pool, this isn't a problem,
11692 * ENOENT on remove isn't an issue and it's in backfill future anyway.
11693 * It only poses a problem for EC pools, because we never just delete
11694 * an object, we rename it into a rollback object. That operation
11695 * will end up crashing the osd with ENOENT. Tolerating the failure
11696 * wouldn't work either, even if snapdir exists, we'd be creating a
11697 * rollback object past the last_backfill line which wouldn't get
11698 * cleaned up (no rollback objects past the last_backfill line is an
11699 * existing important invariant). Thus, let's avoid the whole issue
11700 * by just not updating last_backfill_started here if head doesn't
11701 * exist and snapdir does. We aren't using up a recovery count here,
11702 * so we're going to recover snapdir immediately anyway. We'll only
11703 * fail "backward" if we fail to get the rw lock and that just means
11704 * we'll re-process this section of the hash space again.
11706 * I'm choosing this hack here because the really "correct" answer is
11707 * going to be to unify snapdir and head into a single object (a
11708 * snapdir is really just a confusing way to talk about head existing
11709 * as a whiteout), but doing that is going to be a somewhat larger
11712 * @see http://tracker.ceph.com/issues/17668
11714 if (!(check
.is_head() &&
11715 backfill_info
.begin
.is_snapdir() &&
11716 check
== backfill_info
.begin
.get_head()))
11717 last_backfill_started
= check
;
11719 // Don't increment ops here because deletions
11720 // are cheap and not replied to unlike real recovery_ops,
11721 // and we can't increment ops without requeueing ourself
11724 eversion_t
& obj_v
= backfill_info
.objects
.begin()->second
;
11726 vector
<pg_shard_t
> need_ver_targs
, missing_targs
, keep_ver_targs
, skip_targs
;
11727 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11728 i
!= backfill_targets
.end();
11730 pg_shard_t bt
= *i
;
11731 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11732 // Find all check peers that have the wrong version
11733 if (check
== backfill_info
.begin
&& check
== pbi
.begin
) {
11734 if (pbi
.objects
.begin()->second
!= obj_v
) {
11735 need_ver_targs
.push_back(bt
);
11737 keep_ver_targs
.push_back(bt
);
11740 pg_info_t
& pinfo
= peer_info
[bt
];
11742 // Only include peers that we've caught up to their backfill line
11743 // otherwise, they only appear to be missing this object
11744 // because their pbi.begin > backfill_info.begin.
11745 if (backfill_info
.begin
> pinfo
.last_backfill
)
11746 missing_targs
.push_back(bt
);
11748 skip_targs
.push_back(bt
);
11752 if (!keep_ver_targs
.empty()) {
11753 // These peers have version obj_v
11754 dout(20) << " BACKFILL keeping " << check
11755 << " with ver " << obj_v
11756 << " on peers " << keep_ver_targs
<< dendl
;
11757 //assert(!waiting_for_degraded_object.count(check));
11759 if (!need_ver_targs
.empty() || !missing_targs
.empty()) {
11760 ObjectContextRef obc
= get_object_context(backfill_info
.begin
, false);
11762 if (obc
->get_recovery_read()) {
11763 if (!need_ver_targs
.empty()) {
11764 dout(20) << " BACKFILL replacing " << check
11765 << " with ver " << obj_v
11766 << " to peers " << need_ver_targs
<< dendl
;
11768 if (!missing_targs
.empty()) {
11769 dout(20) << " BACKFILL pushing " << backfill_info
.begin
11770 << " with ver " << obj_v
11771 << " to peers " << missing_targs
<< dendl
;
11773 vector
<pg_shard_t
> all_push
= need_ver_targs
;
11774 all_push
.insert(all_push
.end(), missing_targs
.begin(), missing_targs
.end());
11776 handle
.reset_tp_timeout();
11777 int r
= prep_backfill_object_push(backfill_info
.begin
, obj_v
, obc
, all_push
, h
);
11779 *work_started
= true;
11780 dout(0) << __func__
<< " Error " << r
<< " trying to backfill " << backfill_info
.begin
<< dendl
;
11785 *work_started
= true;
11786 dout(20) << "backfill blocking on " << backfill_info
.begin
11787 << "; could not get rw_manager lock" << dendl
;
11791 dout(20) << "need_ver_targs=" << need_ver_targs
11792 << " keep_ver_targs=" << keep_ver_targs
<< dendl
;
11793 dout(20) << "backfill_targets=" << backfill_targets
11794 << " missing_targs=" << missing_targs
11795 << " skip_targs=" << skip_targs
<< dendl
;
11797 last_backfill_started
= backfill_info
.begin
;
11798 add_to_stat
.insert(backfill_info
.begin
); // XXX: Only one for all pushes?
11799 backfill_info
.pop_front();
11800 vector
<pg_shard_t
> check_targets
= need_ver_targs
;
11801 check_targets
.insert(check_targets
.end(), keep_ver_targs
.begin(), keep_ver_targs
.end());
11802 for (vector
<pg_shard_t
>::iterator i
= check_targets
.begin();
11803 i
!= check_targets
.end();
11805 pg_shard_t bt
= *i
;
11806 BackfillInterval
& pbi
= peer_backfill_info
[bt
];
11812 hobject_t backfill_pos
=
11813 std::min(backfill_info
.begin
, earliest_peer_backfill());
11815 for (set
<hobject_t
>::iterator i
= add_to_stat
.begin();
11816 i
!= add_to_stat
.end();
11818 ObjectContextRef obc
= get_object_context(*i
, false);
11821 add_object_context_to_pg_stat(obc
, &stat
);
11822 pending_backfill_updates
[*i
] = stat
;
11824 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS
)) {
11825 map
<pg_shard_t
,MOSDPGBackfillRemove
*> reqs
;
11826 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
11827 handle
.reset_tp_timeout();
11828 const hobject_t
& oid
= to_remove
[i
].get
<0>();
11829 eversion_t v
= to_remove
[i
].get
<1>();
11830 pg_shard_t peer
= to_remove
[i
].get
<2>();
11831 MOSDPGBackfillRemove
*m
;
11832 auto it
= reqs
.find(peer
);
11833 if (it
!= reqs
.end()) {
11836 m
= reqs
[peer
] = new MOSDPGBackfillRemove(
11837 spg_t(info
.pgid
.pgid
, peer
.shard
),
11838 get_osdmap()->get_epoch());
11840 m
->ls
.push_back(make_pair(oid
, v
));
11842 if (oid
<= last_backfill_started
)
11843 pending_backfill_updates
[oid
]; // add empty stat!
11845 for (auto p
: reqs
) {
11846 osd
->send_message_osd_cluster(p
.first
.osd
, p
.second
,
11847 get_osdmap()->get_epoch());
11850 // for jewel targets
11851 for (unsigned i
= 0; i
< to_remove
.size(); ++i
) {
11852 handle
.reset_tp_timeout();
11854 // ordered before any subsequent updates
11855 send_remove_op(to_remove
[i
].get
<0>(), to_remove
[i
].get
<1>(),
11856 to_remove
[i
].get
<2>());
11858 if (to_remove
[i
].get
<0>() <= last_backfill_started
)
11859 pending_backfill_updates
[to_remove
[i
].get
<0>()]; // add empty stat!
11863 pgbackend
->run_recovery_op(h
, get_recovery_op_priority());
11865 dout(5) << "backfill_pos is " << backfill_pos
<< dendl
;
11866 for (set
<hobject_t
>::iterator i
= backfills_in_flight
.begin();
11867 i
!= backfills_in_flight
.end();
11869 dout(20) << *i
<< " is still in flight" << dendl
;
11872 hobject_t next_backfill_to_complete
= backfills_in_flight
.empty() ?
11873 backfill_pos
: *(backfills_in_flight
.begin());
11874 hobject_t new_last_backfill
= earliest_backfill();
11875 dout(10) << "starting new_last_backfill at " << new_last_backfill
<< dendl
;
11876 for (map
<hobject_t
, pg_stat_t
>::iterator i
=
11877 pending_backfill_updates
.begin();
11878 i
!= pending_backfill_updates
.end() &&
11879 i
->first
< next_backfill_to_complete
;
11880 pending_backfill_updates
.erase(i
++)) {
11881 dout(20) << " pending_backfill_update " << i
->first
<< dendl
;
11882 assert(i
->first
> new_last_backfill
);
11883 for (set
<pg_shard_t
>::iterator j
= backfill_targets
.begin();
11884 j
!= backfill_targets
.end();
11886 pg_shard_t bt
= *j
;
11887 pg_info_t
& pinfo
= peer_info
[bt
];
11888 //Add stats to all peers that were missing object
11889 if (i
->first
> pinfo
.last_backfill
)
11890 pinfo
.stats
.add(i
->second
);
11892 new_last_backfill
= i
->first
;
11894 dout(10) << "possible new_last_backfill at " << new_last_backfill
<< dendl
;
11896 assert(!pending_backfill_updates
.empty() ||
11897 new_last_backfill
== last_backfill_started
);
11898 if (pending_backfill_updates
.empty() &&
11899 backfill_pos
.is_max()) {
11900 assert(backfills_in_flight
.empty());
11901 new_last_backfill
= backfill_pos
;
11902 last_backfill_started
= backfill_pos
;
11904 dout(10) << "final new_last_backfill at " << new_last_backfill
<< dendl
;
11906 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
11907 // all the backfill targets. Otherwise, we will move last_backfill up on
11908 // those targets need it and send OP_BACKFILL_PROGRESS to them.
11909 for (set
<pg_shard_t
>::iterator i
= backfill_targets
.begin();
11910 i
!= backfill_targets
.end();
11912 pg_shard_t bt
= *i
;
11913 pg_info_t
& pinfo
= peer_info
[bt
];
11915 if (new_last_backfill
> pinfo
.last_backfill
) {
11916 pinfo
.set_last_backfill(new_last_backfill
);
11917 epoch_t e
= get_osdmap()->get_epoch();
11918 MOSDPGBackfill
*m
= NULL
;
11919 if (pinfo
.last_backfill
.is_max()) {
11920 m
= new MOSDPGBackfill(
11921 MOSDPGBackfill::OP_BACKFILL_FINISH
,
11923 last_peering_reset
,
11924 spg_t(info
.pgid
.pgid
, bt
.shard
));
11925 // Use default priority here, must match sub_op priority
11926 /* pinfo.stats might be wrong if we did log-based recovery on the
11927 * backfilled portion in addition to continuing backfill.
11929 pinfo
.stats
= info
.stats
;
11930 start_recovery_op(hobject_t::get_max());
11932 m
= new MOSDPGBackfill(
11933 MOSDPGBackfill::OP_BACKFILL_PROGRESS
,
11935 last_peering_reset
,
11936 spg_t(info
.pgid
.pgid
, bt
.shard
));
11937 // Use default priority here, must match sub_op priority
11939 m
->last_backfill
= pinfo
.last_backfill
;
11940 m
->stats
= pinfo
.stats
;
11941 osd
->send_message_osd_cluster(bt
.osd
, m
, get_osdmap()->get_epoch());
11942 dout(10) << " peer " << bt
11943 << " num_objects now " << pinfo
.stats
.stats
.sum
.num_objects
11944 << " / " << info
.stats
.stats
.sum
.num_objects
<< dendl
;
11949 *work_started
= true;
// Queue a backfill push of object `oid` at version `v` to every shard in
// `peers`: register the object as in-flight/recovering, then hand the actual
// read-and-send work to the PGBackend via recovery handle `h`.
// NOTE(review): this extraction is missing several original lines
// (e.g. 11958, 11967-11968, 11977-11981, 11983, 11989-11992) — braces, the
// full recover_object() argument list, and the guard around the error path
// are not visible here; consult the complete file before editing.
11953 int PrimaryLogPG::prep_backfill_object_push(
11954 hobject_t oid
, eversion_t v
,
11955 ObjectContextRef obc
,
11956 vector
<pg_shard_t
> peers
,
11957 PGBackend::RecoveryHandle
*h
)
11959 dout(10) << __func__
<< " " << oid
<< " v " << v
<< " to peers " << peers
<< dendl
;
11960 assert(!peers
.empty());
// Track the object so concurrent ops know a backfill push is in flight.
11962 backfills_in_flight
.insert(oid
);
// Mark the object missing on every target shard so the backend will push it.
11963 for (unsigned int i
= 0 ; i
< peers
.size(); ++i
) {
11964 map
<pg_shard_t
, pg_missing_t
>::iterator bpm
= peer_missing
.find(peers
[i
]);
11965 assert(bpm
!= peer_missing
.end());
11966 bpm
->second
.add(oid
, eversion_t(), eversion_t());
// Must not already be in a recovery op for this object.
11969 assert(!recovering
.count(oid
));
11971 start_recovery_op(oid
);
11972 recovering
.insert(make_pair(oid
, obc
));
11974 // We need to take the read_lock here in order to flush in-progress writes
11975 obc
->ondisk_read_lock();
11976 int r
= pgbackend
->recover_object(
11979 ObjectContextRef(),
11982 obc
->ondisk_read_unlock();
// Error path (the `if (r < 0)` guard is elided by the extraction — confirm):
// log the failure, record it against the primary, drop the in-flight marker,
// and note the object as missing so it can be found elsewhere.
11984 dout(0) << __func__
<< " Error " << r
<< " on oid " << oid
<< dendl
;
11985 primary_failed(oid
);
11986 primary_error(oid
, v
);
11987 backfills_in_flight
.erase(oid
);
11988 missing_loc
.add_missing(oid
, v
, eversion_t());
11993 void PrimaryLogPG::update_range(
11994 BackfillInterval
*bi
,
11995 ThreadPool::TPHandle
&handle
)
11997 int local_min
= cct
->_conf
->osd_backfill_scan_min
;
11998 int local_max
= cct
->_conf
->osd_backfill_scan_max
;
12000 if (bi
->version
< info
.log_tail
) {
12001 dout(10) << __func__
<< ": bi is old, rescanning local backfill_info"
12003 if (last_update_applied
>= info
.log_tail
) {
12004 bi
->version
= last_update_applied
;
12007 bi
->version
= info
.last_update
;
12009 scan_range(local_min
, local_max
, bi
, handle
);
12012 if (bi
->version
>= projected_last_update
) {
12013 dout(10) << __func__
<< ": bi is current " << dendl
;
12014 assert(bi
->version
== projected_last_update
);
12015 } else if (bi
->version
>= info
.log_tail
) {
12016 if (pg_log
.get_log().empty() && projected_log
.empty()) {
12017 /* Because we don't move log_tail on split, the log might be
12018 * empty even if log_tail != last_update. However, the only
12019 * way to get here with an empty log is if log_tail is actually
12020 * eversion_t(), because otherwise the entry which changed
12021 * last_update since the last scan would have to be present.
12023 assert(bi
->version
== eversion_t());
12027 dout(10) << __func__
<< ": bi is old, (" << bi
->version
12028 << ") can be updated with log to projected_last_update "
12029 << projected_last_update
<< dendl
;
12031 auto func
= [&](const pg_log_entry_t
&e
) {
12032 dout(10) << __func__
<< ": updating from version " << e
.version
12034 const hobject_t
&soid
= e
.soid
;
12035 if (soid
>= bi
->begin
&&
12037 if (e
.is_update()) {
12038 dout(10) << __func__
<< ": " << e
.soid
<< " updated to version "
12039 << e
.version
<< dendl
;
12040 bi
->objects
.erase(e
.soid
);
12041 bi
->objects
.insert(
12045 } else if (e
.is_delete()) {
12046 dout(10) << __func__
<< ": " << e
.soid
<< " removed" << dendl
;
12047 bi
->objects
.erase(e
.soid
);
12051 dout(10) << "scanning pg log first" << dendl
;
12052 pg_log
.get_log().scan_log_after(bi
->version
, func
);
12053 dout(10) << "scanning projected log" << dendl
;
12054 projected_log
.scan_log_after(bi
->version
, func
);
12055 bi
->version
= projected_last_update
;
12057 assert(0 == "scan_range should have raised bi->version past log_tail");
12061 void PrimaryLogPG::scan_range(
12062 int min
, int max
, BackfillInterval
*bi
,
12063 ThreadPool::TPHandle
&handle
)
12065 assert(is_locked());
12066 dout(10) << "scan_range from " << bi
->begin
<< dendl
;
12067 bi
->clear_objects();
12069 vector
<hobject_t
> ls
;
12071 int r
= pgbackend
->objects_list_partial(bi
->begin
, min
, max
, &ls
, &bi
->end
);
12073 dout(10) << " got " << ls
.size() << " items, next " << bi
->end
<< dendl
;
12074 dout(20) << ls
<< dendl
;
12076 for (vector
<hobject_t
>::iterator p
= ls
.begin(); p
!= ls
.end(); ++p
) {
12077 handle
.reset_tp_timeout();
12078 ObjectContextRef obc
;
12080 obc
= object_contexts
.lookup(*p
);
12082 bi
->objects
[*p
] = obc
->obs
.oi
.version
;
12083 dout(20) << " " << *p
<< " " << obc
->obs
.oi
.version
<< dendl
;
12086 int r
= pgbackend
->objects_get_attr(*p
, OI_ATTR
, &bl
);
12088 /* If the object does not exist here, it must have been removed
12089 * between the collection_list_partial and here. This can happen
12090 * for the first item in the range, which is usually last_backfill.
12096 object_info_t
oi(bl
);
12097 bi
->objects
[*p
] = oi
.version
;
12098 dout(20) << " " << *p
<< " " << oi
.version
<< dendl
;
12106 * verifies that stray objects have been deleted
// Debug-only sanity check (gated by osd_debug_verify_stray_on_activate):
// walk the PG log newest-to-oldest and verify that every object whose most
// recent entry is a delete really is absent from the local object store.
// NOTE(review): braces, the early `return`, and the stat() output argument
// line are elided by this extraction (original lines 12109, 12115-12116,
// 12121, 12123, 12125, 12133, 12138-12146 missing).
12108 void PrimaryLogPG::check_local()
12110 dout(10) << __func__
<< dendl
;
12112 assert(info
.last_update
>= pg_log
.get_tail()); // otherwise we need some help!
12114 if (!cct
->_conf
->osd_debug_verify_stray_on_activate
)
12117 // just scan the log.
// `did` records soids already examined so only the newest entry per object
// is considered (we iterate the log in reverse).
12118 set
<hobject_t
> did
;
12119 for (list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
12120 p
!= pg_log
.get_log().log
.rend();
12122 if (did
.count(p
->soid
))
12124 did
.insert(p
->soid
);
12126 if (p
->is_delete()) {
12127 dout(10) << " checking " << p
->soid
12128 << " at " << p
->version
<< dendl
;
// stat() the object in the local store; a deleted object must be ENOENT.
12130 int r
= osd
->store
->stat(
12132 ghobject_t(p
->soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
),
12134 if (r
!= -ENOENT
) {
12135 derr
<< __func__
<< " " << p
->soid
<< " exists, but should have been "
12136 << "deleted" << dendl
;
12137 assert(0 == "erroneously present object");
12140 // ignore old(+missing) objects
12147 // ===========================
// Build the hobject_t name for the *current* (in-progress) hit-set object:
// "hit_set_<pgid>_current_<stamp>" placed in the pool's configured hit-set
// namespace (osd_hit_set_namespace) with this PG's ps()/pool().
// NOTE(review): the `ostringstream ss;` declaration and the `return hoid;`
// lines (original 12151-12152, 12158-12159) are elided by this extraction.
12150 hobject_t
PrimaryLogPG::get_hit_set_current_object(utime_t stamp
)
12153 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_current_" << stamp
;
12154 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
12155 info
.pgid
.ps(), info
.pgid
.pool(),
12156 cct
->_conf
->osd_hit_set_namespace
);
12157 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Build the hobject_t name for an *archived* hit-set covering an interval:
// "hit_set_<pgid>_archive_<start>_<end>", formatted in GMT or local time.
// Callers elsewhere in this file pass (p->begin, p->end, p->using_gmt), so
// the elided signature tail (original 12162-12165) carries the interval end
// and the gmt flag — confirm against the full file.
// NOTE(review): the `if (using_gmt)` branch heads and the lines appending
// the interval *end* time (original 12167, 12169-12170, 12172-12173) are
// elided; only the `start` formatting survives in this extraction.
12161 hobject_t
PrimaryLogPG::get_hit_set_archive_object(utime_t start
,
12166 ss
<< "hit_set_" << info
.pgid
.pgid
<< "_archive_";
12168 start
.gmtime(ss
) << "_";
12171 start
.localtime(ss
) << "_";
12174 hobject_t
hoid(sobject_t(ss
.str(), CEPH_NOSNAP
), "",
12175 info
.pgid
.ps(), info
.pgid
.pool(),
12176 cct
->_conf
->osd_hit_set_namespace
);
12177 dout(20) << __func__
<< " " << hoid
<< dendl
;
// Reset in-memory hit-set tracking state; the start stamp is cleared so a
// fresh set begins timing from scratch.
// NOTE(review): original line 12184 is elided here — presumably it releases
// the in-memory `hit_set` itself; confirm against the full file.
12181 void PrimaryLogPG::hit_set_clear()
12183 dout(20) << __func__
<< dendl
;
12185 hit_set_start_stamp
= utime_t();
// (Re)initialize hit-set tracking based on the pool's settings. If tracking
// is disabled (no count, no period, or type NONE) the primary removes all
// existing hit-set objects and bails; otherwise a fresh in-memory set is
// built and seeded from the PG log.
// NOTE(review): the body of the first `if (!is_active() || ...)` guard, the
// early returns, and the call that creates the new in-memory set (original
// lines 12191-12195, 12200-12201, 12204-12209) are elided by this
// extraction — confirm against the full file.
12188 void PrimaryLogPG::hit_set_setup()
12190 if (!is_active() ||
12196 if (is_active() && is_primary() &&
12197 (!pool
.info
.hit_set_count
||
12198 !pool
.info
.hit_set_period
||
12199 pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_NONE
)) {
12202 // only primary is allowed to remove all the hit set objects
12203 hit_set_remove_all();
12207 // FIXME: discard any previous data for now
12210 // include any writes we know about from the pg log. this doesn't
12211 // capture reads, but it is better than nothing!
12212 hit_set_apply_log();
12215 void PrimaryLogPG::hit_set_remove_all()
12217 // If any archives are degraded we skip this
12218 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12219 p
!= info
.hit_set
.history
.end();
12221 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12223 // Once we hit a degraded object just skip
12224 if (is_degraded_or_backfilling_object(aoid
))
12226 if (scrubber
.write_blocked_by_scrub(aoid
))
12230 if (!info
.hit_set
.history
.empty()) {
12231 list
<pg_hit_set_info_t
>::reverse_iterator p
= info
.hit_set
.history
.rbegin();
12232 assert(p
!= info
.hit_set
.history
.rend());
12233 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12234 assert(!is_degraded_or_backfilling_object(oid
));
12235 ObjectContextRef obc
= get_object_context(oid
, false);
12238 OpContextUPtr ctx
= simple_opc_create(obc
);
12239 ctx
->at_version
= get_next_version();
12240 ctx
->updated_hset_history
= info
.hit_set
;
12241 utime_t now
= ceph_clock_now();
12243 hit_set_trim(ctx
, 0);
12244 simple_opc_submit(std::move(ctx
));
12247 info
.hit_set
= pg_hit_set_history_t();
12249 agent_state
->discard_hit_sets();
12253 void PrimaryLogPG::hit_set_create()
12255 utime_t now
= ceph_clock_now();
12256 // make a copy of the params to modify
12257 HitSet::Params
params(pool
.info
.hit_set_params
);
12259 dout(20) << __func__
<< " " << params
<< dendl
;
12260 if (pool
.info
.hit_set_params
.get_type() == HitSet::TYPE_BLOOM
) {
12261 BloomHitSet::Params
*p
=
12262 static_cast<BloomHitSet::Params
*>(params
.impl
.get());
12264 // convert false positive rate so it holds up across the full period
12265 p
->set_fpp(p
->get_fpp() / pool
.info
.hit_set_count
);
12266 if (p
->get_fpp() <= 0.0)
12267 p
->set_fpp(.01); // fpp cannot be zero!
12269 // if we don't have specified size, estimate target size based on the
12271 if (p
->target_size
== 0 && hit_set
) {
12272 utime_t dur
= now
- hit_set_start_stamp
;
12273 unsigned unique
= hit_set
->approx_unique_insert_count();
12274 dout(20) << __func__
<< " previous set had approx " << unique
12275 << " unique items over " << dur
<< " seconds" << dendl
;
12276 p
->target_size
= (double)unique
* (double)pool
.info
.hit_set_period
12279 if (p
->target_size
<
12280 static_cast<uint64_t>(cct
->_conf
->osd_hit_set_min_size
))
12281 p
->target_size
= cct
->_conf
->osd_hit_set_min_size
;
12284 > static_cast<uint64_t>(cct
->_conf
->osd_hit_set_max_size
))
12285 p
->target_size
= cct
->_conf
->osd_hit_set_max_size
;
12287 p
->seed
= now
.sec();
12289 dout(10) << __func__
<< " target_size " << p
->target_size
12290 << " fpp " << p
->get_fpp() << dendl
;
12292 hit_set
.reset(new HitSet(params
));
12293 hit_set_start_stamp
= now
;
12297 * apply log entries to set
12299 * this would only happen after peering, to at least capture writes
12300 * during an interval that was potentially lost.
// Seed the in-memory hit_set from the PG log: replay entries with versions
// in (from, to], where `from` is the last update already captured by the
// current hit set and `to` is the PG's last_update. Iterating the log in
// reverse, the first loop skips entries newer than `to`, the second inserts
// each entry's soid until we pass `from`.
// NOTE(review): the guard that returns early when there is nothing to apply
// (around original line 12310), loop increments, and the final return
// (original 12303-12306, 12309, 12311-12313, 12317, 12320-12324) are elided
// by this extraction.
12302 bool PrimaryLogPG::hit_set_apply_log()
12307 eversion_t to
= info
.last_update
;
12308 eversion_t from
= info
.hit_set
.current_last_update
;
12310 dout(20) << __func__
<< " no update" << dendl
;
12314 dout(20) << __func__
<< " " << to
<< " .. " << info
.last_update
<< dendl
;
12315 list
<pg_log_entry_t
>::const_reverse_iterator p
= pg_log
.get_log().log
.rbegin();
12316 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> to
)
12318 while (p
!= pg_log
.get_log().log
.rend() && p
->version
> from
) {
12319 hit_set
->insert(p
->soid
);
12326 void PrimaryLogPG::hit_set_persist()
12328 dout(10) << __func__
<< dendl
;
12330 unsigned max
= pool
.info
.hit_set_count
;
12332 utime_t now
= ceph_clock_now();
12335 // If any archives are degraded we skip this persist request
12336 // account for the additional entry being added below
12337 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12338 p
!= info
.hit_set
.history
.end();
12340 hobject_t aoid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12342 // Once we hit a degraded object just skip further trim
12343 if (is_degraded_or_backfilling_object(aoid
))
12345 if (scrubber
.write_blocked_by_scrub(aoid
))
12349 // If backfill is in progress and we could possibly overlap with the
12350 // hit_set_* objects, back off. Since these all have
12351 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12352 // look just at that. This is necessary because our transactions
12353 // may include a modify of the new hit_set *and* a delete of the
12354 // old one, and this may span the backfill boundary.
12355 for (set
<pg_shard_t
>::iterator p
= backfill_targets
.begin();
12356 p
!= backfill_targets
.end();
12358 assert(peer_info
.count(*p
));
12359 const pg_info_t
& pi
= peer_info
[*p
];
12360 if (pi
.last_backfill
== hobject_t() ||
12361 pi
.last_backfill
.get_hash() == info
.pgid
.ps()) {
12362 dout(10) << __func__
<< " backfill target osd." << *p
12363 << " last_backfill has not progressed past pgid ps"
12370 pg_hit_set_info_t new_hset
= pg_hit_set_info_t(pool
.info
.use_gmt_hitset
);
12371 new_hset
.begin
= hit_set_start_stamp
;
12372 new_hset
.end
= now
;
12373 oid
= get_hit_set_archive_object(
12376 new_hset
.using_gmt
);
12378 // If the current object is degraded we skip this persist request
12379 if (scrubber
.write_blocked_by_scrub(oid
))
12383 ::encode(*hit_set
, bl
);
12384 dout(20) << __func__
<< " archive " << oid
<< dendl
;
12387 agent_state
->add_hit_set(new_hset
.begin
, hit_set
);
12388 uint32_t size
= agent_state
->hit_set_map
.size();
12389 if (size
>= pool
.info
.hit_set_count
) {
12390 size
= pool
.info
.hit_set_count
> 0 ? pool
.info
.hit_set_count
- 1: 0;
12392 hit_set_in_memory_trim(size
);
12395 ObjectContextRef obc
= get_object_context(oid
, true);
12396 OpContextUPtr ctx
= simple_opc_create(obc
);
12398 ctx
->at_version
= get_next_version();
12399 ctx
->updated_hset_history
= info
.hit_set
;
12400 pg_hit_set_history_t
&updated_hit_set_hist
= *(ctx
->updated_hset_history
);
12402 updated_hit_set_hist
.current_last_update
= info
.last_update
;
12403 new_hset
.version
= ctx
->at_version
;
12405 updated_hit_set_hist
.history
.push_back(new_hset
);
12408 // fabricate an object_info_t and SnapSet
12409 obc
->obs
.oi
.version
= ctx
->at_version
;
12410 obc
->obs
.oi
.mtime
= now
;
12411 obc
->obs
.oi
.size
= bl
.length();
12412 obc
->obs
.exists
= true;
12413 obc
->obs
.oi
.set_data_digest(bl
.crc32c(-1));
12415 ctx
->new_obs
= obc
->obs
;
12417 obc
->ssc
->snapset
.head_exists
= true;
12418 ctx
->new_snapset
= obc
->ssc
->snapset
;
12420 ctx
->delta_stats
.num_objects
++;
12421 ctx
->delta_stats
.num_objects_hit_set_archive
++;
12422 ctx
->delta_stats
.num_bytes
+= bl
.length();
12423 ctx
->delta_stats
.num_bytes_hit_set_archive
+= bl
.length();
12426 ::encode(ctx
->new_snapset
, bss
);
12427 bufferlist
boi(sizeof(ctx
->new_obs
.oi
));
12428 ::encode(ctx
->new_obs
.oi
, boi
,
12429 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
12431 ctx
->op_t
->create(oid
);
12433 ctx
->op_t
->write(oid
, 0, bl
.length(), bl
, 0);
12435 map
<string
, bufferlist
> attrs
;
12436 attrs
[OI_ATTR
].claim(boi
);
12437 attrs
[SS_ATTR
].claim(bss
);
12438 setattrs_maybe_cache(ctx
->obc
, ctx
.get(), ctx
->op_t
.get(), attrs
);
12439 ctx
->log
.push_back(
12441 pg_log_entry_t::MODIFY
,
12451 hit_set_trim(ctx
, max
);
12453 simple_opc_submit(std::move(ctx
));
12456 void PrimaryLogPG::hit_set_trim(OpContextUPtr
&ctx
, unsigned max
)
12458 assert(ctx
->updated_hset_history
);
12459 pg_hit_set_history_t
&updated_hit_set_hist
=
12460 *(ctx
->updated_hset_history
);
12461 for (unsigned num
= updated_hit_set_hist
.history
.size(); num
> max
; --num
) {
12462 list
<pg_hit_set_info_t
>::iterator p
= updated_hit_set_hist
.history
.begin();
12463 assert(p
!= updated_hit_set_hist
.history
.end());
12464 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12466 assert(!is_degraded_or_backfilling_object(oid
));
12468 dout(20) << __func__
<< " removing " << oid
<< dendl
;
12469 ++ctx
->at_version
.version
;
12470 ctx
->log
.push_back(
12471 pg_log_entry_t(pg_log_entry_t::DELETE
,
12480 ctx
->op_t
->remove(oid
);
12481 updated_hit_set_hist
.history
.pop_front();
12483 ObjectContextRef obc
= get_object_context(oid
, false);
12485 --ctx
->delta_stats
.num_objects
;
12486 --ctx
->delta_stats
.num_objects_hit_set_archive
;
12487 ctx
->delta_stats
.num_bytes
-= obc
->obs
.oi
.size
;
12488 ctx
->delta_stats
.num_bytes_hit_set_archive
-= obc
->obs
.oi
.size
;
12492 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory
)
12494 while (agent_state
->hit_set_map
.size() > max_in_memory
) {
12495 agent_state
->remove_oldest_hit_set();
12500 // =======================================
12503 void PrimaryLogPG::agent_setup()
12505 assert(is_locked());
12506 if (!is_active() ||
12508 pool
.info
.cache_mode
== pg_pool_t::CACHEMODE_NONE
||
12509 pool
.info
.tier_of
< 0 ||
12510 !get_osdmap()->have_pg_pool(pool
.info
.tier_of
)) {
12514 if (!agent_state
) {
12515 agent_state
.reset(new TierAgentState
);
12517 // choose random starting position
12518 agent_state
->position
= hobject_t();
12519 agent_state
->position
.pool
= info
.pgid
.pool();
12520 agent_state
->position
.set_hash(pool
.info
.get_random_pg_position(
12523 agent_state
->start
= agent_state
->position
;
12525 dout(10) << __func__
<< " allocated new state, position "
12526 << agent_state
->position
<< dendl
;
12528 dout(10) << __func__
<< " keeping existing state" << dendl
;
12531 if (info
.stats
.stats_invalid
) {
12532 osd
->clog
->warn() << "pg " << info
.pgid
<< " has invalid (post-split) stats; must scrub before tier agent can activate";
12535 agent_choose_mode();
// Tear down the cache-tier agent state for this PG.
// NOTE(review): original line 12540 is elided by this extraction — it likely
// stops the agent before the state is released; confirm against the full
// file before editing.
12538 void PrimaryLogPG::agent_clear()
12541 agent_state
.reset(NULL
);
12544 // Return false if no objects operated on since start of object hash space
12545 bool PrimaryLogPG::agent_work(int start_max
, int agent_flush_quota
)
12548 if (!agent_state
) {
12549 dout(10) << __func__
<< " no agent state, stopping" << dendl
;
12556 if (agent_state
->is_idle()) {
12557 dout(10) << __func__
<< " idle, stopping" << dendl
;
12562 osd
->logger
->inc(l_osd_agent_wake
);
12564 dout(10) << __func__
12565 << " max " << start_max
12566 << ", flush " << agent_state
->get_flush_mode_name()
12567 << ", evict " << agent_state
->get_evict_mode_name()
12568 << ", pos " << agent_state
->position
12570 assert(is_primary());
12571 assert(is_active());
12573 agent_load_hit_sets();
12575 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
12579 int ls_max
= cct
->_conf
->osd_pool_default_cache_max_evict_check_size
;
12581 // list some objects. this conveniently lists clones (oldest to
12582 // newest) before heads... the same order we want to flush in.
12584 // NOTE: do not flush the Sequencer. we will assume that the
12585 // listing we get back is imprecise.
12586 vector
<hobject_t
> ls
;
12588 int r
= pgbackend
->objects_list_partial(agent_state
->position
, ls_min
, ls_max
,
12591 dout(20) << __func__
<< " got " << ls
.size() << " objects" << dendl
;
12593 for (vector
<hobject_t
>::iterator p
= ls
.begin();
12596 if (p
->nspace
== cct
->_conf
->osd_hit_set_namespace
) {
12597 dout(20) << __func__
<< " skip (hit set) " << *p
<< dendl
;
12598 osd
->logger
->inc(l_osd_agent_skip
);
12601 if (is_degraded_or_backfilling_object(*p
)) {
12602 dout(20) << __func__
<< " skip (degraded) " << *p
<< dendl
;
12603 osd
->logger
->inc(l_osd_agent_skip
);
12606 if (is_missing_object(p
->get_head())) {
12607 dout(20) << __func__
<< " skip (missing head) " << *p
<< dendl
;
12608 osd
->logger
->inc(l_osd_agent_skip
);
12611 ObjectContextRef obc
= get_object_context(*p
, false, NULL
);
12613 // we didn't flush; we may miss something here.
12614 dout(20) << __func__
<< " skip (no obc) " << *p
<< dendl
;
12615 osd
->logger
->inc(l_osd_agent_skip
);
12618 if (!obc
->obs
.exists
) {
12619 dout(20) << __func__
<< " skip (dne) " << obc
->obs
.oi
.soid
<< dendl
;
12620 osd
->logger
->inc(l_osd_agent_skip
);
12623 if (scrubber
.write_blocked_by_scrub(obc
->obs
.oi
.soid
)) {
12624 dout(20) << __func__
<< " skip (scrubbing) " << obc
->obs
.oi
<< dendl
;
12625 osd
->logger
->inc(l_osd_agent_skip
);
12628 if (obc
->is_blocked()) {
12629 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
12630 osd
->logger
->inc(l_osd_agent_skip
);
12633 if (obc
->is_request_pending()) {
12634 dout(20) << __func__
<< " skip (request pending) " << obc
->obs
.oi
<< dendl
;
12635 osd
->logger
->inc(l_osd_agent_skip
);
12639 // be careful flushing omap to an EC pool.
12640 if (!base_pool
->supports_omap() &&
12641 obc
->obs
.oi
.is_omap()) {
12642 dout(20) << __func__
<< " skip (omap to EC) " << obc
->obs
.oi
<< dendl
;
12643 osd
->logger
->inc(l_osd_agent_skip
);
12647 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_IDLE
&&
12648 agent_maybe_evict(obc
, false))
12650 else if (agent_state
->flush_mode
!= TierAgentState::FLUSH_MODE_IDLE
&&
12651 agent_flush_quota
> 0 && agent_maybe_flush(obc
)) {
12653 --agent_flush_quota
;
12655 if (started
>= start_max
) {
12656 // If finishing early, set "next" to the next object
12657 if (++p
!= ls
.end())
12663 if (++agent_state
->hist_age
> cct
->_conf
->osd_agent_hist_halflife
) {
12664 dout(20) << __func__
<< " resetting atime and temp histograms" << dendl
;
12665 agent_state
->hist_age
= 0;
12666 agent_state
->temp_hist
.decay();
12669 // Total objects operated on so far
12670 int total_started
= agent_state
->started
+ started
;
12671 bool need_delay
= false;
12673 dout(20) << __func__
<< " start pos " << agent_state
->position
12674 << " next start pos " << next
12675 << " started " << total_started
<< dendl
;
12677 // See if we've made a full pass over the object hash space
12678 // This might check at most ls_max objects a second time to notice that
12679 // we've checked every objects at least once.
12680 if (agent_state
->position
< agent_state
->start
&&
12681 next
>= agent_state
->start
) {
12682 dout(20) << __func__
<< " wrap around " << agent_state
->start
<< dendl
;
12683 if (total_started
== 0)
12687 agent_state
->start
= next
;
12689 agent_state
->started
= total_started
;
12691 // See if we are starting from beginning
12693 agent_state
->position
= hobject_t();
12695 agent_state
->position
= next
;
12697 // Discard old in memory HitSets
12698 hit_set_in_memory_trim(pool
.info
.hit_set_count
);
12701 assert(agent_state
->delaying
== false);
12706 agent_choose_mode();
12711 void PrimaryLogPG::agent_load_hit_sets()
12713 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
) {
12717 if (agent_state
->hit_set_map
.size() < info
.hit_set
.history
.size()) {
12718 dout(10) << __func__
<< dendl
;
12719 for (list
<pg_hit_set_info_t
>::iterator p
= info
.hit_set
.history
.begin();
12720 p
!= info
.hit_set
.history
.end(); ++p
) {
12721 if (agent_state
->hit_set_map
.count(p
->begin
.sec()) == 0) {
12722 dout(10) << __func__
<< " loading " << p
->begin
<< "-"
12723 << p
->end
<< dendl
;
12724 if (!pool
.info
.is_replicated()) {
12725 // FIXME: EC not supported here yet
12726 derr
<< __func__
<< " on non-replicated pool" << dendl
;
12730 hobject_t oid
= get_hit_set_archive_object(p
->begin
, p
->end
, p
->using_gmt
);
12731 if (is_unreadable_object(oid
)) {
12732 dout(10) << __func__
<< " unreadable " << oid
<< ", waiting" << dendl
;
12736 ObjectContextRef obc
= get_object_context(oid
, false);
12738 derr
<< __func__
<< ": could not load hitset " << oid
<< dendl
;
12744 obc
->ondisk_read_lock();
12745 int r
= osd
->store
->read(ch
, ghobject_t(oid
), 0, 0, bl
);
12747 obc
->ondisk_read_unlock();
12749 HitSetRef
hs(new HitSet
);
12750 bufferlist::iterator pbl
= bl
.begin();
12751 ::decode(*hs
, pbl
);
12752 agent_state
->add_hit_set(p
->begin
.sec(), hs
);
12758 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef
& obc
)
12760 if (!obc
->obs
.oi
.is_dirty()) {
12761 dout(20) << __func__
<< " skip (clean) " << obc
->obs
.oi
<< dendl
;
12762 osd
->logger
->inc(l_osd_agent_skip
);
12765 if (obc
->obs
.oi
.is_cache_pinned()) {
12766 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
12767 osd
->logger
->inc(l_osd_agent_skip
);
12771 utime_t now
= ceph_clock_now();
12772 utime_t ob_local_mtime
;
12773 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
12774 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
12776 ob_local_mtime
= obc
->obs
.oi
.mtime
;
12778 bool evict_mode_full
=
12779 (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
);
12780 if (!evict_mode_full
&&
12781 obc
->obs
.oi
.soid
.snap
== CEPH_NOSNAP
&& // snaps immutable; don't delay
12782 (ob_local_mtime
+ utime_t(pool
.info
.cache_min_flush_age
, 0) > now
)) {
12783 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
12784 osd
->logger
->inc(l_osd_agent_skip
);
12788 if (osd
->agent_is_active_oid(obc
->obs
.oi
.soid
)) {
12789 dout(20) << __func__
<< " skip (flushing) " << obc
->obs
.oi
<< dendl
;
12790 osd
->logger
->inc(l_osd_agent_skip
);
12794 dout(10) << __func__
<< " flushing " << obc
->obs
.oi
<< dendl
;
12796 // FIXME: flush anything dirty, regardless of what distribution of
12799 hobject_t oid
= obc
->obs
.oi
.soid
;
12800 osd
->agent_start_op(oid
);
12801 // no need to capture a pg ref, can't outlive fop or ctx
12802 std::function
<void()> on_flush
= [this, oid
]() {
12803 osd
->agent_finish_op(oid
);
12806 int result
= start_flush(
12807 OpRequestRef(), obc
, false, NULL
,
12809 if (result
!= -EINPROGRESS
) {
12811 dout(10) << __func__
<< " start_flush() failed " << obc
->obs
.oi
12812 << " with " << result
<< dendl
;
12813 osd
->logger
->inc(l_osd_agent_skip
);
12817 osd
->logger
->inc(l_osd_agent_flush
);
12821 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef
& obc
, bool after_flush
)
12823 const hobject_t
& soid
= obc
->obs
.oi
.soid
;
12824 if (!after_flush
&& obc
->obs
.oi
.is_dirty()) {
12825 dout(20) << __func__
<< " skip (dirty) " << obc
->obs
.oi
<< dendl
;
12828 if (!obc
->obs
.oi
.watchers
.empty()) {
12829 dout(20) << __func__
<< " skip (watchers) " << obc
->obs
.oi
<< dendl
;
12832 if (obc
->is_blocked()) {
12833 dout(20) << __func__
<< " skip (blocked) " << obc
->obs
.oi
<< dendl
;
12836 if (obc
->obs
.oi
.is_cache_pinned()) {
12837 dout(20) << __func__
<< " skip (cache_pinned) " << obc
->obs
.oi
<< dendl
;
12841 if (soid
.snap
== CEPH_NOSNAP
) {
12842 int result
= _verify_no_head_clones(soid
, obc
->ssc
->snapset
);
12844 dout(20) << __func__
<< " skip (clones) " << obc
->obs
.oi
<< dendl
;
12849 if (agent_state
->evict_mode
!= TierAgentState::EVICT_MODE_FULL
) {
12850 // is this object old than cache_min_evict_age?
12851 utime_t now
= ceph_clock_now();
12852 utime_t ob_local_mtime
;
12853 if (obc
->obs
.oi
.local_mtime
!= utime_t()) {
12854 ob_local_mtime
= obc
->obs
.oi
.local_mtime
;
12856 ob_local_mtime
= obc
->obs
.oi
.mtime
;
12858 if (ob_local_mtime
+ utime_t(pool
.info
.cache_min_evict_age
, 0) > now
) {
12859 dout(20) << __func__
<< " skip (too young) " << obc
->obs
.oi
<< dendl
;
12860 osd
->logger
->inc(l_osd_agent_skip
);
12863 // is this object old and/or cold enough?
12865 uint64_t temp_upper
= 0, temp_lower
= 0;
12867 agent_estimate_temp(soid
, &temp
);
12868 agent_state
->temp_hist
.add(temp
);
12869 agent_state
->temp_hist
.get_position_micro(temp
, &temp_lower
, &temp_upper
);
12871 dout(20) << __func__
12872 << " temp " << temp
12873 << " pos " << temp_lower
<< "-" << temp_upper
12874 << ", evict_effort " << agent_state
->evict_effort
12876 dout(30) << "agent_state:\n";
12877 Formatter
*f
= Formatter::create("");
12878 f
->open_object_section("agent_state");
12879 agent_state
->dump(f
);
12880 f
->close_section();
12885 if (1000000 - temp_upper
>= agent_state
->evict_effort
)
12889 dout(10) << __func__
<< " evicting " << obc
->obs
.oi
<< dendl
;
12890 OpContextUPtr ctx
= simple_opc_create(obc
);
12892 if (!ctx
->lock_manager
.get_lock_type(
12893 ObjectContext::RWState::RWWRITE
,
12897 close_op_ctx(ctx
.release());
12898 dout(20) << __func__
<< " skip (cannot get lock) " << obc
->obs
.oi
<< dendl
;
12902 osd
->agent_start_evict_op();
12903 ctx
->register_on_finish(
12905 osd
->agent_finish_evict_op();
12908 ctx
->at_version
= get_next_version();
12909 assert(ctx
->new_obs
.exists
);
12910 int r
= _delete_oid(ctx
.get(), true, false);
12911 if (obc
->obs
.oi
.is_omap())
12912 ctx
->delta_stats
.num_objects_omap
--;
12913 ctx
->delta_stats
.num_evict
++;
12914 ctx
->delta_stats
.num_evict_kb
+= SHIFT_ROUND_UP(obc
->obs
.oi
.size
, 10);
12915 if (obc
->obs
.oi
.is_dirty())
12916 --ctx
->delta_stats
.num_objects_dirty
;
12918 finish_ctx(ctx
.get(), pg_log_entry_t::DELETE
, false);
12919 simple_opc_submit(std::move(ctx
));
12920 osd
->logger
->inc(l_osd_tier_evict
);
12921 osd
->logger
->inc(l_osd_agent_evict
);
12925 void PrimaryLogPG::agent_stop()
12927 dout(20) << __func__
<< dendl
;
12928 if (agent_state
&& !agent_state
->is_idle()) {
12929 agent_state
->evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
12930 agent_state
->flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
12931 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
12935 void PrimaryLogPG::agent_delay()
12937 dout(20) << __func__
<< dendl
;
12938 if (agent_state
&& !agent_state
->is_idle()) {
12939 assert(agent_state
->delaying
== false);
12940 agent_state
->delaying
= true;
12941 osd
->agent_disable_pg(this, agent_state
->evict_effort
);
12945 void PrimaryLogPG::agent_choose_mode_restart()
12947 dout(20) << __func__
<< dendl
;
12949 if (agent_state
&& agent_state
->delaying
) {
12950 agent_state
->delaying
= false;
12951 agent_choose_mode(true);
12956 bool PrimaryLogPG::agent_choose_mode(bool restart
, OpRequestRef op
)
12958 bool requeued
= false;
12959 // Let delay play out
12960 if (agent_state
->delaying
) {
12961 dout(20) << __func__
<< this << " delaying, ignored" << dendl
;
12965 TierAgentState::flush_mode_t flush_mode
= TierAgentState::FLUSH_MODE_IDLE
;
12966 TierAgentState::evict_mode_t evict_mode
= TierAgentState::EVICT_MODE_IDLE
;
12967 unsigned evict_effort
= 0;
12969 if (info
.stats
.stats_invalid
) {
12970 // idle; stats can't be trusted until we scrub.
12971 dout(20) << __func__
<< " stats invalid (post-split), idle" << dendl
;
12976 uint64_t divisor
= pool
.info
.get_pg_num_divisor(info
.pgid
.pgid
);
12977 assert(divisor
> 0);
12979 // adjust (effective) user objects down based on the number
12980 // of HitSet objects, which should not count toward our total since
12981 // they cannot be flushed.
12982 uint64_t unflushable
= info
.stats
.stats
.sum
.num_objects_hit_set_archive
;
12984 // also exclude omap objects if ec backing pool
12985 const pg_pool_t
*base_pool
= get_osdmap()->get_pg_pool(pool
.info
.tier_of
);
12987 if (!base_pool
->supports_omap())
12988 unflushable
+= info
.stats
.stats
.sum
.num_objects_omap
;
12990 uint64_t num_user_objects
= info
.stats
.stats
.sum
.num_objects
;
12991 if (num_user_objects
> unflushable
)
12992 num_user_objects
-= unflushable
;
12994 num_user_objects
= 0;
12996 uint64_t num_user_bytes
= info
.stats
.stats
.sum
.num_bytes
;
12997 uint64_t unflushable_bytes
= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
;
12998 num_user_bytes
-= unflushable_bytes
;
12999 uint64_t num_overhead_bytes
= osd
->store
->estimate_objects_overhead(num_user_objects
);
13000 num_user_bytes
+= num_overhead_bytes
;
13002 // also reduce the num_dirty by num_objects_omap
13003 int64_t num_dirty
= info
.stats
.stats
.sum
.num_objects_dirty
;
13004 if (!base_pool
->supports_omap()) {
13005 if (num_dirty
> info
.stats
.stats
.sum
.num_objects_omap
)
13006 num_dirty
-= info
.stats
.stats
.sum
.num_objects_omap
;
13011 dout(10) << __func__
13013 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
13015 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
13016 << " num_objects: " << info
.stats
.stats
.sum
.num_objects
13017 << " num_bytes: " << info
.stats
.stats
.sum
.num_bytes
13018 << " num_objects_dirty: " << info
.stats
.stats
.sum
.num_objects_dirty
13019 << " num_objects_omap: " << info
.stats
.stats
.sum
.num_objects_omap
13020 << " num_dirty: " << num_dirty
13021 << " num_user_objects: " << num_user_objects
13022 << " num_user_bytes: " << num_user_bytes
13023 << " num_overhead_bytes: " << num_overhead_bytes
13024 << " pool.info.target_max_bytes: " << pool
.info
.target_max_bytes
13025 << " pool.info.target_max_objects: " << pool
.info
.target_max_objects
13028 // get dirty, full ratios
13029 uint64_t dirty_micro
= 0;
13030 uint64_t full_micro
= 0;
13031 if (pool
.info
.target_max_bytes
&& num_user_objects
> 0) {
13032 uint64_t avg_size
= num_user_bytes
/ num_user_objects
;
13034 num_dirty
* avg_size
* 1000000 /
13035 MAX(pool
.info
.target_max_bytes
/ divisor
, 1);
13037 num_user_objects
* avg_size
* 1000000 /
13038 MAX(pool
.info
.target_max_bytes
/ divisor
, 1);
13040 if (pool
.info
.target_max_objects
> 0) {
13041 uint64_t dirty_objects_micro
=
13042 num_dirty
* 1000000 /
13043 MAX(pool
.info
.target_max_objects
/ divisor
, 1);
13044 if (dirty_objects_micro
> dirty_micro
)
13045 dirty_micro
= dirty_objects_micro
;
13046 uint64_t full_objects_micro
=
13047 num_user_objects
* 1000000 /
13048 MAX(pool
.info
.target_max_objects
/ divisor
, 1);
13049 if (full_objects_micro
> full_micro
)
13050 full_micro
= full_objects_micro
;
13052 dout(20) << __func__
<< " dirty " << ((float)dirty_micro
/ 1000000.0)
13053 << " full " << ((float)full_micro
/ 1000000.0)
13057 uint64_t flush_target
= pool
.info
.cache_target_dirty_ratio_micro
;
13058 uint64_t flush_high_target
= pool
.info
.cache_target_dirty_high_ratio_micro
;
13059 uint64_t flush_slop
= (float)flush_target
* cct
->_conf
->osd_agent_slop
;
13060 if (restart
|| agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_IDLE
) {
13061 flush_target
+= flush_slop
;
13062 flush_high_target
+= flush_slop
;
13064 flush_target
-= MIN(flush_target
, flush_slop
);
13065 flush_high_target
-= MIN(flush_high_target
, flush_slop
);
13068 if (dirty_micro
> flush_high_target
) {
13069 flush_mode
= TierAgentState::FLUSH_MODE_HIGH
;
13070 } else if (dirty_micro
> flush_target
) {
13071 flush_mode
= TierAgentState::FLUSH_MODE_LOW
;
13075 uint64_t evict_target
= pool
.info
.cache_target_full_ratio_micro
;
13076 uint64_t evict_slop
= (float)evict_target
* cct
->_conf
->osd_agent_slop
;
13077 if (restart
|| agent_state
->evict_mode
== TierAgentState::EVICT_MODE_IDLE
)
13078 evict_target
+= evict_slop
;
13080 evict_target
-= MIN(evict_target
, evict_slop
);
13082 if (full_micro
> 1000000) {
13083 // evict anything clean
13084 evict_mode
= TierAgentState::EVICT_MODE_FULL
;
13085 evict_effort
= 1000000;
13086 } else if (full_micro
> evict_target
) {
13087 // set effort in [0..1] range based on where we are between
13088 evict_mode
= TierAgentState::EVICT_MODE_SOME
;
13089 uint64_t over
= full_micro
- evict_target
;
13090 uint64_t span
= 1000000 - evict_target
;
13091 evict_effort
= MAX(over
* 1000000 / span
,
13092 (unsigned)(1000000.0 * cct
->_conf
->osd_agent_min_evict_effort
));
13094 // quantize effort to avoid too much reordering in the agent_queue.
13095 uint64_t inc
= cct
->_conf
->osd_agent_quantize_effort
* 1000000;
13097 uint64_t was
= evict_effort
;
13098 evict_effort
-= evict_effort
% inc
;
13099 if (evict_effort
< inc
)
13100 evict_effort
= inc
;
13101 assert(evict_effort
>= inc
&& evict_effort
<= 1000000);
13102 dout(30) << __func__
<< " evict_effort " << was
<< " quantized by " << inc
<< " to " << evict_effort
<< dendl
;
13107 bool old_idle
= agent_state
->is_idle();
13108 if (flush_mode
!= agent_state
->flush_mode
) {
13109 dout(5) << __func__
<< " flush_mode "
13110 << TierAgentState::get_flush_mode_name(agent_state
->flush_mode
)
13112 << TierAgentState::get_flush_mode_name(flush_mode
)
13114 if (flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
13115 osd
->agent_inc_high_count();
13116 info
.stats
.stats
.sum
.num_flush_mode_high
= 1;
13117 } else if (flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
13118 info
.stats
.stats
.sum
.num_flush_mode_low
= 1;
13120 if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_HIGH
) {
13121 osd
->agent_dec_high_count();
13122 info
.stats
.stats
.sum
.num_flush_mode_high
= 0;
13123 } else if (agent_state
->flush_mode
== TierAgentState::FLUSH_MODE_LOW
) {
13124 info
.stats
.stats
.sum
.num_flush_mode_low
= 0;
13126 agent_state
->flush_mode
= flush_mode
;
13128 if (evict_mode
!= agent_state
->evict_mode
) {
13129 dout(5) << __func__
<< " evict_mode "
13130 << TierAgentState::get_evict_mode_name(agent_state
->evict_mode
)
13132 << TierAgentState::get_evict_mode_name(evict_mode
)
13134 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
&&
13138 requeue_ops(waiting_for_active
);
13139 requeue_ops(waiting_for_scrub
);
13140 requeue_ops(waiting_for_cache_not_full
);
13141 objects_blocked_on_cache_full
.clear();
13144 if (evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
13145 info
.stats
.stats
.sum
.num_evict_mode_some
= 1;
13146 } else if (evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
13147 info
.stats
.stats
.sum
.num_evict_mode_full
= 1;
13149 if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_SOME
) {
13150 info
.stats
.stats
.sum
.num_evict_mode_some
= 0;
13151 } else if (agent_state
->evict_mode
== TierAgentState::EVICT_MODE_FULL
) {
13152 info
.stats
.stats
.sum
.num_evict_mode_full
= 0;
13154 agent_state
->evict_mode
= evict_mode
;
13156 uint64_t old_effort
= agent_state
->evict_effort
;
13157 if (evict_effort
!= agent_state
->evict_effort
) {
13158 dout(5) << __func__
<< " evict_effort "
13159 << ((float)agent_state
->evict_effort
/ 1000000.0)
13161 << ((float)evict_effort
/ 1000000.0)
13163 agent_state
->evict_effort
= evict_effort
;
13166 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13167 // (including flush). This is probably fine (they should be
13168 // correlated) but it is not precisely correct.
13169 if (agent_state
->is_idle()) {
13170 if (!restart
&& !old_idle
) {
13171 osd
->agent_disable_pg(this, old_effort
);
13174 if (restart
|| old_idle
) {
13175 osd
->agent_enable_pg(this, agent_state
->evict_effort
);
13176 } else if (old_effort
!= agent_state
->evict_effort
) {
13177 osd
->agent_adjust_pg(this, old_effort
, agent_state
->evict_effort
);
13183 void PrimaryLogPG::agent_estimate_temp(const hobject_t
& oid
, int *temp
)
13188 if (hit_set
->contains(oid
))
13191 int last_n
= pool
.info
.hit_set_search_last_n
;
13192 for (map
<time_t,HitSetRef
>::reverse_iterator p
=
13193 agent_state
->hit_set_map
.rbegin(); last_n
> 0 &&
13194 p
!= agent_state
->hit_set_map
.rend(); ++p
, ++i
) {
13195 if (p
->second
->contains(oid
)) {
13196 *temp
+= pool
.info
.get_grade(i
);
13202 // Dup op detection
13204 bool PrimaryLogPG::already_complete(eversion_t v
)
13206 dout(20) << __func__
<< ": " << v
<< dendl
;
13207 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
13210 dout(20) << __func__
<< ": " << **i
<< dendl
;
13211 // skip copy from temp object ops
13212 if ((*i
)->v
== eversion_t()) {
13213 dout(20) << __func__
<< ": " << **i
13214 << " version is empty" << dendl
;
13218 dout(20) << __func__
<< ": " << **i
13219 << " (*i)->v past v" << dendl
;
13222 if (!(*i
)->all_committed
) {
13223 dout(20) << __func__
<< ": " << **i
13224 << " not committed, returning false"
13229 dout(20) << __func__
<< ": returning true" << dendl
;
13233 bool PrimaryLogPG::already_ack(eversion_t v
)
13235 dout(20) << __func__
<< ": " << v
<< dendl
;
13236 for (xlist
<RepGather
*>::iterator i
= repop_queue
.begin();
13239 // skip copy from temp object ops
13240 if ((*i
)->v
== eversion_t()) {
13241 dout(20) << __func__
<< ": " << **i
13242 << " version is empty" << dendl
;
13246 dout(20) << __func__
<< ": " << **i
13247 << " (*i)->v past v" << dendl
;
13250 if (!(*i
)->all_applied
) {
13251 dout(20) << __func__
<< ": " << **i
13252 << " not applied, returning false"
13257 dout(20) << __func__
<< ": returning true" << dendl
;
13262 // ==========================================================================================
13266 bool PrimaryLogPG::_range_available_for_scrub(
13267 const hobject_t
&begin
, const hobject_t
&end
)
13269 pair
<hobject_t
, ObjectContextRef
> next
;
13270 next
.second
= object_contexts
.lookup(begin
);
13271 next
.first
= begin
;
13273 while (more
&& next
.first
< end
) {
13274 if (next
.second
&& next
.second
->is_blocked()) {
13275 next
.second
->requeue_scrub_on_unblock
= true;
13276 dout(10) << __func__
<< ": scrub delayed, "
13277 << next
.first
<< " is blocked"
13281 more
= object_contexts
.get_next(next
.first
, &next
);
13286 static bool doing_clones(const boost::optional
<SnapSet
> &snapset
,
13287 const vector
<snapid_t
>::reverse_iterator
&curclone
) {
13288 return snapset
&& curclone
!= snapset
.get().clones
.rend();
13291 void PrimaryLogPG::log_missing(unsigned missing
,
13292 const boost::optional
<hobject_t
> &head
,
13293 LogChannelRef clog
,
13297 bool allow_incomplete_clones
)
13300 if (allow_incomplete_clones
) {
13301 dout(20) << func
<< " " << mode
<< " " << pgid
<< " " << head
.get()
13302 << " skipped " << missing
<< " clone(s) in cache tier" << dendl
;
13304 clog
->info() << mode
<< " " << pgid
<< " " << head
.get()
13305 << " " << missing
<< " missing clone(s)";
13309 unsigned PrimaryLogPG::process_clones_to(const boost::optional
<hobject_t
> &head
,
13310 const boost::optional
<SnapSet
> &snapset
,
13311 LogChannelRef clog
,
13314 bool allow_incomplete_clones
,
13315 boost::optional
<snapid_t
> target
,
13316 vector
<snapid_t
>::reverse_iterator
*curclone
,
13317 inconsistent_snapset_wrapper
&e
)
13321 unsigned missing
= 0;
13323 // NOTE: clones are in descending order, thus **curclone > target test here
13324 hobject_t
next_clone(head
.get());
13325 while(doing_clones(snapset
, *curclone
) && (!target
|| **curclone
> *target
)) {
13327 // it is okay to be missing one or more clones in a cache tier.
13328 // skip higher-numbered clones in the list.
13329 if (!allow_incomplete_clones
) {
13330 next_clone
.snap
= **curclone
;
13331 clog
->error() << mode
<< " " << pgid
<< " " << head
.get()
13332 << " expected clone " << next_clone
;
13333 ++scrubber
.shallow_errors
;
13334 e
.set_clone_missing(next_clone
.snap
);
13336 // Clones are descending
13343 * Validate consistency of the object info and snap sets.
13345 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13346 * the comparison of the objects is against multiple snapset.clones. There are
13347 * multiple clone lists and in between lists we expect head or snapdir.
13353 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13354 * obj2 head head/snapdir, head ok
13355 * [SnapSet clones 6 4 2 1]
13356 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13357 * obj2 snap 6 obj2 snap 6, match
13358 * obj2 snap 4 obj2 snap 4, match
13359 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13360 * [Snapset clones 3 1]
13361 * obj3 snap 3 obj3 snap 3 match
13362 * obj3 snap 1 obj3 snap 1 match
13363 * obj4 snapdir head/snapdir, snapdir ok
13364 * [Snapset clones 4]
13365 * EOL obj4 snap 4, (expected)
13367 void PrimaryLogPG::scrub_snapshot_metadata(
13368 ScrubMap
&scrubmap
,
13369 const map
<hobject_t
, pair
<uint32_t, uint32_t>> &missing_digest
)
13371 dout(10) << __func__
<< dendl
;
13373 coll_t
c(info
.pgid
);
13374 bool repair
= state_test(PG_STATE_REPAIR
);
13375 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
13376 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
13377 boost::optional
<snapid_t
> all_clones
; // Unspecified snapid_t or boost::none
13379 /// snapsets to repair
13380 map
<hobject_t
,SnapSet
> snapset_to_repair
;
13382 // traverse in reverse order.
13383 boost::optional
<hobject_t
> head
;
13384 boost::optional
<SnapSet
> snapset
; // If initialized so will head (above)
13385 vector
<snapid_t
>::reverse_iterator curclone
; // Defined only if snapset initialized
13386 unsigned missing
= 0;
13387 inconsistent_snapset_wrapper soid_error
, head_error
;
13389 bufferlist last_data
;
13391 for (map
<hobject_t
,ScrubMap::object
>::reverse_iterator
13392 p
= scrubmap
.objects
.rbegin(); p
!= scrubmap
.objects
.rend(); ++p
) {
13393 const hobject_t
& soid
= p
->first
;
13394 soid_error
= inconsistent_snapset_wrapper
{soid
};
13395 object_stat_sum_t stat
;
13396 boost::optional
<object_info_t
> oi
;
13398 if (!soid
.is_snapdir())
13399 stat
.num_objects
++;
13401 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
13402 stat
.num_objects_hit_set_archive
++;
13404 if (soid
.is_snap()) {
13406 stat
.num_object_clones
++;
13410 if (p
->second
.attrs
.count(OI_ATTR
) == 0) {
13412 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13413 << " no '" << OI_ATTR
<< "' attr";
13414 ++scrubber
.shallow_errors
;
13415 soid_error
.set_oi_attr_missing();
13418 bv
.push_back(p
->second
.attrs
[OI_ATTR
]);
13420 oi
= object_info_t(); // Initialize optional<> before decode into it
13421 oi
.get().decode(bv
);
13422 } catch (buffer::error
& e
) {
13424 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13425 << " can't decode '" << OI_ATTR
<< "' attr " << e
.what();
13426 ++scrubber
.shallow_errors
;
13427 soid_error
.set_oi_attr_corrupted();
13428 soid_error
.set_oi_attr_missing(); // Not available too
13433 if (pgbackend
->be_get_ondisk_size(oi
->size
) != p
->second
.size
) {
13434 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13435 << " on disk size (" << p
->second
.size
13436 << ") does not match object info size ("
13437 << oi
->size
<< ") adjusted for ondisk to ("
13438 << pgbackend
->be_get_ondisk_size(oi
->size
)
13440 soid_error
.set_size_mismatch();
13441 ++scrubber
.shallow_errors
;
13444 dout(20) << mode
<< " " << soid
<< " " << oi
.get() << dendl
;
13446 // A clone num_bytes will be added later when we have snapset
13447 if (!soid
.is_snap()) {
13448 stat
.num_bytes
+= oi
->size
;
13450 if (soid
.nspace
== cct
->_conf
->osd_hit_set_namespace
)
13451 stat
.num_bytes_hit_set_archive
+= oi
->size
;
13453 if (!soid
.is_snapdir()) {
13454 if (oi
->is_dirty())
13455 ++stat
.num_objects_dirty
;
13456 if (oi
->is_whiteout())
13457 ++stat
.num_whiteouts
;
13459 ++stat
.num_objects_omap
;
13460 if (oi
->is_cache_pinned())
13461 ++stat
.num_objects_pinned
;
13464 // pessimistic assumption that this object might contain a
13466 stat
.num_legacy_snapsets
++;
13469 // Check for any problems while processing clones
13470 if (doing_clones(snapset
, curclone
)) {
13471 boost::optional
<snapid_t
> target
;
13472 // Expecting an object with snap for current head
13473 if (soid
.has_snapset() || soid
.get_head() != head
->get_head()) {
13475 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
<< " new object "
13476 << soid
<< " while processing " << head
.get() << dendl
;
13478 target
= all_clones
;
13480 assert(soid
.is_snap());
13481 target
= soid
.snap
;
13484 // Log any clones we were expecting to be there up to target
13485 // This will set missing, but will be a no-op if snap.soid == *curclone.
13486 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
13487 pool
.info
.allow_incomplete_clones(), target
, &curclone
,
13491 // Check doing_clones() again in case we ran process_clones_to()
13492 if (doing_clones(snapset
, curclone
)) {
13493 // A head/snapdir would have processed all clones above
13494 // or all greater than *curclone.
13495 assert(soid
.is_snap() && *curclone
<= soid
.snap
);
13497 // After processing above clone snap should match the expected curclone
13498 expected
= (*curclone
== soid
.snap
);
13500 // If we aren't doing clones any longer, then expecting head/snapdir
13501 expected
= soid
.has_snapset();
13504 // If we couldn't read the head's snapset, just ignore clones
13505 if (head
&& !snapset
) {
13506 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13507 << " clone ignored due to missing snapset";
13509 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13510 << " is an unexpected clone";
13512 ++scrubber
.shallow_errors
;
13513 soid_error
.set_headless();
13514 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
13515 if (head
&& soid
.get_head() == head
->get_head())
13516 head_error
.set_clone(soid
.snap
);
13521 if (soid
.has_snapset()) {
13524 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
, mode
,
13525 pool
.info
.allow_incomplete_clones());
13528 // Save previous head error information
13529 if (head
&& head_error
.errors
)
13530 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
13531 // Set this as a new head object
13534 head_error
= soid_error
;
13536 dout(20) << __func__
<< " " << mode
<< " new head " << head
<< dendl
;
13538 if (p
->second
.attrs
.count(SS_ATTR
) == 0) {
13539 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13540 << " no '" << SS_ATTR
<< "' attr";
13541 ++scrubber
.shallow_errors
;
13542 snapset
= boost::none
;
13543 head_error
.set_ss_attr_missing();
13546 bl
.push_back(p
->second
.attrs
[SS_ATTR
]);
13547 bufferlist::iterator blp
= bl
.begin();
13549 snapset
= SnapSet(); // Initialize optional<> before decoding into it
13550 ::decode(snapset
.get(), blp
);
13551 } catch (buffer::error
& e
) {
13552 snapset
= boost::none
;
13553 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13554 << " can't decode '" << SS_ATTR
<< "' attr " << e
.what();
13555 ++scrubber
.shallow_errors
;
13556 head_error
.set_ss_attr_corrupted();
13561 // what will be next?
13562 curclone
= snapset
->clones
.rbegin();
13564 if (!snapset
->clones
.empty()) {
13565 dout(20) << " snapset " << snapset
.get() << dendl
;
13566 if (snapset
->seq
== 0) {
13567 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13568 << " snaps.seq not set";
13569 ++scrubber
.shallow_errors
;
13570 head_error
.set_snapset_mismatch();
13574 if (soid
.is_head() && !snapset
->head_exists
) {
13575 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13576 << " snapset.head_exists=false, but head exists";
13577 ++scrubber
.shallow_errors
;
13578 head_error
.set_head_mismatch();
13580 if (soid
.is_snapdir() && snapset
->head_exists
) {
13581 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13582 << " snapset.head_exists=true, but snapdir exists";
13583 ++scrubber
.shallow_errors
;
13584 head_error
.set_head_mismatch();
13587 if (get_osdmap()->require_osd_release
>= CEPH_RELEASE_LUMINOUS
) {
13588 if (soid
.is_snapdir()) {
13589 dout(10) << " will move snapset to head from " << soid
<< dendl
;
13590 snapset_to_repair
[soid
.get_head()] = *snapset
;
13591 } else if (snapset
->is_legacy()) {
13592 dout(10) << " will convert legacy snapset on " << soid
<< " " << *snapset
13594 snapset_to_repair
[soid
.get_head()] = *snapset
;
13597 stat
.num_legacy_snapsets
++;
13600 // pessimistic assumption that this object might contain a
13602 stat
.num_legacy_snapsets
++;
13605 assert(soid
.is_snap());
13608 assert(soid
.snap
== *curclone
);
13610 dout(20) << __func__
<< " " << mode
<< " matched clone " << soid
<< dendl
;
13612 if (snapset
->clone_size
.count(soid
.snap
) == 0) {
13613 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13614 << " is missing in clone_size";
13615 ++scrubber
.shallow_errors
;
13616 soid_error
.set_size_mismatch();
13618 if (oi
&& oi
->size
!= snapset
->clone_size
[soid
.snap
]) {
13619 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13620 << " size " << oi
->size
<< " != clone_size "
13621 << snapset
->clone_size
[*curclone
];
13622 ++scrubber
.shallow_errors
;
13623 soid_error
.set_size_mismatch();
13626 if (snapset
->clone_overlap
.count(soid
.snap
) == 0) {
13627 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13628 << " is missing in clone_overlap";
13629 ++scrubber
.shallow_errors
;
13630 soid_error
.set_size_mismatch();
13632 // This checking is based on get_clone_bytes(). The first 2 asserts
13633 // can't happen because we know we have a clone_size and
13634 // a clone_overlap. Now we check that the interval_set won't
13635 // cause the last assert.
13636 uint64_t size
= snapset
->clone_size
.find(soid
.snap
)->second
;
13637 const interval_set
<uint64_t> &overlap
=
13638 snapset
->clone_overlap
.find(soid
.snap
)->second
;
13639 bool bad_interval_set
= false;
13640 for (interval_set
<uint64_t>::const_iterator i
= overlap
.begin();
13641 i
!= overlap
.end(); ++i
) {
13642 if (size
< i
.get_len()) {
13643 bad_interval_set
= true;
13646 size
-= i
.get_len();
13649 if (bad_interval_set
) {
13650 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13651 << " bad interval_set in clone_overlap";
13652 ++scrubber
.shallow_errors
;
13653 soid_error
.set_size_mismatch();
13655 stat
.num_bytes
+= snapset
->get_clone_bytes(soid
.snap
);
13660 // migrate legacy_snaps to snapset?
13661 auto p
= snapset_to_repair
.find(soid
.get_head());
13662 if (p
!= snapset_to_repair
.end()) {
13663 if (!oi
|| oi
->legacy_snaps
.empty()) {
13664 osd
->clog
->error() << mode
<< " " << info
.pgid
<< " " << soid
13665 << " has no oi or legacy_snaps; cannot convert "
13667 ++scrubber
.shallow_errors
;
13669 dout(20) << __func__
<< " copying legacy_snaps " << oi
->legacy_snaps
13670 << " to snapset " << p
->second
<< dendl
;
13671 p
->second
.clone_snaps
[soid
.snap
] = oi
->legacy_snaps
;
13677 if (soid_error
.errors
)
13678 scrubber
.store
->add_snap_error(pool
.id
, soid_error
);
13681 scrub_cstat
.add(stat
);
13684 if (doing_clones(snapset
, curclone
)) {
13685 dout(10) << __func__
<< " " << mode
<< " " << info
.pgid
13686 << " No more objects while processing " << head
.get() << dendl
;
13688 missing
+= process_clones_to(head
, snapset
, osd
->clog
, info
.pgid
, mode
,
13689 pool
.info
.allow_incomplete_clones(), all_clones
, &curclone
,
13692 // There could be missing found by the test above or even
13693 // before dropping out of the loop for the last head.
13695 log_missing(missing
, head
, osd
->clog
, info
.pgid
, __func__
,
13696 mode
, pool
.info
.allow_incomplete_clones());
13698 if (head
&& head_error
.errors
)
13699 scrubber
.store
->add_snap_error(pool
.id
, head_error
);
13701 for (map
<hobject_t
,pair
<uint32_t,uint32_t>>::const_iterator p
=
13702 missing_digest
.begin();
13703 p
!= missing_digest
.end();
13705 if (p
->first
.is_snapdir())
13707 dout(10) << __func__
<< " recording digests for " << p
->first
<< dendl
;
13708 ObjectContextRef obc
= get_object_context(p
->first
, false);
13710 osd
->clog
->error() << info
.pgid
<< " " << mode
13711 << " cannot get object context for "
13714 } else if (obc
->obs
.oi
.soid
!= p
->first
) {
13715 osd
->clog
->error() << info
.pgid
<< " " << mode
13716 << " object " << p
->first
13717 << " has a valid oi attr with a mismatched name, "
13718 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
13721 OpContextUPtr ctx
= simple_opc_create(obc
);
13722 ctx
->at_version
= get_next_version();
13723 ctx
->mtime
= utime_t(); // do not update mtime
13724 ctx
->new_obs
.oi
.set_data_digest(p
->second
.first
);
13725 ctx
->new_obs
.oi
.set_omap_digest(p
->second
.second
);
13726 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
13728 ctx
->register_on_success(
13730 dout(20) << "updating scrub digest" << dendl
;
13731 if (--scrubber
.num_digest_updates_pending
== 0) {
13736 simple_opc_submit(std::move(ctx
));
13737 ++scrubber
.num_digest_updates_pending
;
13739 for (auto& p
: snapset_to_repair
) {
13740 // cache pools may not have the clones, which means we won't know
13741 // what snaps they have. fake out the clone_snaps entries anyway (with
13742 // blank snap lists).
13743 p
.second
.head_exists
= true;
13744 if (pool
.info
.allow_incomplete_clones()) {
13745 for (auto s
: p
.second
.clones
) {
13746 if (p
.second
.clone_snaps
.count(s
) == 0) {
13747 dout(10) << __func__
<< " " << p
.first
<< " faking clone_snaps for "
13749 p
.second
.clone_snaps
[s
];
13753 if (p
.second
.clones
.size() != p
.second
.clone_snaps
.size() ||
13754 p
.second
.is_legacy()) {
13755 // this happens if we encounter other errors above, like a missing
13757 dout(10) << __func__
<< " not writing snapset to " << p
.first
13758 << " snapset " << p
.second
<< " clones " << p
.second
.clones
13759 << "; didn't convert fully" << dendl
;
13760 scrub_cstat
.sum
.num_legacy_snapsets
++;
13763 dout(10) << __func__
<< " writing snapset to " << p
.first
13764 << " " << p
.second
<< dendl
;
13765 ObjectContextRef obc
= get_object_context(p
.first
, true);
13767 osd
->clog
->error() << info
.pgid
<< " " << mode
13768 << " cannot get object context for "
13771 } else if (obc
->obs
.oi
.soid
!= p
.first
) {
13772 osd
->clog
->error() << info
.pgid
<< " " << mode
13773 << " object " << p
.first
13774 << " has a valid oi attr with a mismatched name, "
13775 << " obc->obs.oi.soid: " << obc
->obs
.oi
.soid
;
13778 ObjectContextRef snapset_obc
;
13779 if (!obc
->obs
.exists
) {
13780 snapset_obc
= get_object_context(p
.first
.get_snapdir(), false);
13781 if (!snapset_obc
) {
13782 osd
->clog
->error() << info
.pgid
<< " " << mode
13783 << " cannot get object context for "
13784 << p
.first
.get_snapdir();
13788 OpContextUPtr ctx
= simple_opc_create(obc
);
13789 PGTransaction
*t
= ctx
->op_t
.get();
13790 ctx
->snapset_obc
= snapset_obc
;
13791 ctx
->at_version
= get_next_version();
13792 ctx
->mtime
= utime_t(); // do not update mtime
13793 ctx
->new_snapset
= p
.second
;
13794 if (!ctx
->new_obs
.exists
) {
13795 dout(20) << __func__
<< " making " << p
.first
<< " a whiteout" << dendl
;
13796 ctx
->new_obs
.exists
= true;
13797 ctx
->new_snapset
.head_exists
= true;
13798 ctx
->new_obs
.oi
.set_flag(object_info_t::FLAG_WHITEOUT
);
13799 ++ctx
->delta_stats
.num_whiteouts
;
13800 ++ctx
->delta_stats
.num_objects
;
13801 t
->create(p
.first
);
13802 if (p
.first
< scrubber
.start
) {
13803 dout(20) << __func__
<< " kludging around update outside of scrub range"
13806 scrub_cstat
.add(ctx
->delta_stats
);
13809 dout(20) << __func__
<< " final snapset " << ctx
->new_snapset
<< dendl
;
13810 assert(!ctx
->new_snapset
.is_legacy());
13811 finish_ctx(ctx
.get(), pg_log_entry_t::MODIFY
);
13812 ctx
->register_on_success(
13814 dout(20) << "updating snapset" << dendl
;
13815 if (--scrubber
.num_digest_updates_pending
== 0) {
13820 simple_opc_submit(std::move(ctx
));
13821 ++scrubber
.num_digest_updates_pending
;
13824 dout(10) << __func__
<< " (" << mode
<< ") finish" << dendl
;
13827 void PrimaryLogPG::_scrub_clear_state()
13829 scrub_cstat
= object_stat_collection_t();
// _scrub_finish: reconcile the stats gathered during scrub (scrub_cstat)
// against the PG's published stats (info.stats), logging and repairing any
// mismatch, then drop cached object contexts.
// NOTE(review): this extraction embeds original line numbers and they are
// non-contiguous, so some source lines are missing from this region;
// verify against the upstream file before editing.
13832 void PrimaryLogPG::_scrub_finish()
// Determine which kind of scrub this was; `mode` labels all log output.
13834 bool repair
= state_test(PG_STATE_REPAIR
);
13835 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
13836 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
// If the published stats were flagged invalid, adopt the scrubbed totals
// outright and re-run cache-agent mode selection.
13838 if (info
.stats
.stats_invalid
) {
13839 info
.stats
.stats
= scrub_cstat
;
13840 info
.stats
.stats_invalid
= false;
13843 agent_choose_mode();
// Debug dump of observed (scrubbed) vs. published counters.
13846 dout(10) << mode
<< " got "
13847 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
13848 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
13849 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
13850 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
13851 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
13852 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
13853 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
13854 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes."
// Compare each counter; categories whose *_stats_invalid flag is set on
// the published stats are excluded from the comparison.
13857 if (scrub_cstat
.sum
.num_objects
!= info
.stats
.stats
.sum
.num_objects
||
13858 scrub_cstat
.sum
.num_object_clones
!= info
.stats
.stats
.sum
.num_object_clones
||
13859 (scrub_cstat
.sum
.num_objects_dirty
!= info
.stats
.stats
.sum
.num_objects_dirty
&&
13860 !info
.stats
.dirty_stats_invalid
) ||
13861 (scrub_cstat
.sum
.num_objects_omap
!= info
.stats
.stats
.sum
.num_objects_omap
&&
13862 !info
.stats
.omap_stats_invalid
) ||
13863 (scrub_cstat
.sum
.num_objects_pinned
!= info
.stats
.stats
.sum
.num_objects_pinned
&&
13864 !info
.stats
.pin_stats_invalid
) ||
13865 (scrub_cstat
.sum
.num_objects_hit_set_archive
!= info
.stats
.stats
.sum
.num_objects_hit_set_archive
&&
13866 !info
.stats
.hitset_stats_invalid
) ||
13867 (scrub_cstat
.sum
.num_bytes_hit_set_archive
!= info
.stats
.stats
.sum
.num_bytes_hit_set_archive
&&
13868 !info
.stats
.hitset_bytes_stats_invalid
) ||
13869 scrub_cstat
.sum
.num_whiteouts
!= info
.stats
.stats
.sum
.num_whiteouts
||
13870 scrub_cstat
.sum
.num_bytes
!= info
.stats
.stats
.sum
.num_bytes
) {
// Mismatch: report both sets of numbers to the cluster log.
13871 osd
->clog
->error() << info
.pgid
<< " " << mode
13872 << " stat mismatch, got "
13873 << scrub_cstat
.sum
.num_objects
<< "/" << info
.stats
.stats
.sum
.num_objects
<< " objects, "
13874 << scrub_cstat
.sum
.num_object_clones
<< "/" << info
.stats
.stats
.sum
.num_object_clones
<< " clones, "
13875 << scrub_cstat
.sum
.num_objects_dirty
<< "/" << info
.stats
.stats
.sum
.num_objects_dirty
<< " dirty, "
13876 << scrub_cstat
.sum
.num_objects_omap
<< "/" << info
.stats
.stats
.sum
.num_objects_omap
<< " omap, "
13877 << scrub_cstat
.sum
.num_objects_pinned
<< "/" << info
.stats
.stats
.sum
.num_objects_pinned
<< " pinned, "
13878 << scrub_cstat
.sum
.num_objects_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_objects_hit_set_archive
<< " hit_set_archive, "
13879 << scrub_cstat
.sum
.num_whiteouts
<< "/" << info
.stats
.stats
.sum
.num_whiteouts
<< " whiteouts, "
13880 << scrub_cstat
.sum
.num_bytes
<< "/" << info
.stats
.stats
.sum
.num_bytes
<< " bytes, "
13881 << scrub_cstat
.sum
.num_bytes_hit_set_archive
<< "/" << info
.stats
.stats
.sum
.num_bytes_hit_set_archive
<< " hit_set_archive bytes.";
// Count this as a shallow scrub error, then repair the published stats
// from the scrubbed totals and clear the invalid flags.
13882 ++scrubber
.shallow_errors
;
13886 info
.stats
.stats
= scrub_cstat
;
13887 info
.stats
.dirty_stats_invalid
= false;
13888 info
.stats
.omap_stats_invalid
= false;
13889 info
.stats
.hitset_stats_invalid
= false;
13890 info
.stats
.hitset_bytes_stats_invalid
= false;
13891 publish_stats_to_osd();
// Stats agree except for the legacy-snapset count: update it quietly
// (info-level, not an error).
13894 } else if (scrub_cstat
.sum
.num_legacy_snapsets
!=
13895 info
.stats
.stats
.sum
.num_legacy_snapsets
) {
13896 osd
->clog
->info() << info
.pgid
<< " " << mode
<< " updated num_legacy_snapsets"
13897 << " from " << info
.stats
.stats
.sum
.num_legacy_snapsets
13898 << " -> " << scrub_cstat
.sum
.num_legacy_snapsets
<< "\n";
13899 info
.stats
.stats
.sum
.num_legacy_snapsets
= scrub_cstat
.sum
.num_legacy_snapsets
;
13900 publish_stats_to_osd();
13903 // Clear object context cache to get repair information
13905 object_contexts
.clear();
13908 bool PrimaryLogPG::check_osdmap_full(const set
<pg_shard_t
> &missing_on
)
13910 return osd
->check_osdmap_full(missing_on
);
// rep_repair_primary_object: react to a local read error on the primary
// by marking the object missing (so recovery can re-source it from a
// replica), parking the op until the object is readable again, and
// queueing a peering event to kick recovery.
// NOTE(review): several lines are missing from this extraction (e.g. the
// declarations of bv/oi/v and the surrounding error-check branches);
// confirm against the upstream source before editing.
13913 int PrimaryLogPG::rep_repair_primary_object(const hobject_t
& soid
, OpRequestRef op
)
13915 // Only supports replicated pools
13916 assert(!pool
.info
.require_rollback());
13917 assert(is_primary());
13919 dout(10) << __func__
<< " " << soid
13920 << " peers osd.{" << actingbackfill
<< "}" << dendl
;
13923 block_for_clean(soid
, op
);
13927 assert(!pg_log
.get_missing().is_missing(soid
));
// Fetch the object_info attr to learn the version we need from a replica.
13931 int r
= get_pgbackend()->objects_get_attr(soid
, OI_ATTR
, &bv
);
13933 // Leave v and try to repair without a version, getting attr failed
13934 dout(0) << __func__
<< ": Need version of replica, objects_get_attr failed: "
13935 << soid
<< " error=" << r
<< dendl
;
13937 bufferlist::iterator bliter
= bv
.begin();
13938 ::decode(oi
, bliter
);
13941 // Leave v as default constructed. This will fail when sent to older OSDs, but
13942 // not much worse than failing here.
13943 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
// Record the object as missing so recovery will pull it from a peer.
13946 missing_loc
.add_missing(soid
, v
, eversion_t());
13947 if (primary_error(soid
, v
)) {
13948 dout(0) << __func__
<< " No other replicas available for " << soid
<< dendl
;
13949 // XXX: If we knew that there is no down osd which could include this
13950 // object, it would be nice if we could return EIO here.
13951 // If a "never fail" flag was available, that could be used
13952 // for rbd to NOT return EIO until object marked lost.
13954 // Drop through to save this op in case an osd comes up with the object.
13957 // Restart the op after object becomes readable again
13958 waiting_for_unreadable_object
[soid
].push_back(op
);
13959 op
->mark_delayed("waiting for missing object");
// Only queue one peering/recovery kick per batch of EIO errors.
13961 if (!eio_errors_to_process
) {
13962 eio_errors_to_process
= true;
13963 assert(is_clean());
13964 queue_peering_event(
13966 std::make_shared
<CephPeeringEvt
>(
13967 get_osdmap()->get_epoch(),
13968 get_osdmap()->get_epoch(),
13971 // A prior error must have already cleared clean state and queued recovery
13972 // or a map change has triggered re-peering.
13973 // Not inlining the recovery by calling maybe_kick_recovery(soid);
13974 dout(5) << __func__
<< ": Read error on " << soid
<< ", but already seen errors" << dendl
;
13980 /*---SnapTrimmer Logging---*/
13982 #define dout_prefix *_dout << pg->gen_prefix()
13984 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name
)
13986 ldout(pg
->cct
, 20) << "enter " << state_name
<< dendl
;
13989 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name
, utime_t enter_time
)
13991 ldout(pg
->cct
, 20) << "exit " << state_name
<< dendl
;
13994 /*---SnapTrimmer states---*/
13996 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
13997 << "SnapTrimmer state<" << get_state_name() << ">: ")
// NotTrimming: idle state of the snap-trim state machine; the constructor
// only registers the named state and traces entry.
// NOTE(review): the my_base(ctx) initializer line appears to be missing
// from this extraction — verify against the upstream source.
14000 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx
)
14002 NamedState(context
< SnapTrimmer
>().pg
, "NotTrimming")
14004 context
< SnapTrimmer
>().log_enter(state_name
);
14007 void PrimaryLogPG::NotTrimming::exit()
14009 context
< SnapTrimmer
>().log_exit(state_name
, enter_time
);
14012 boost::statechart::result
PrimaryLogPG::NotTrimming::react(const KickTrim
&)
14014 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
14015 ldout(pg
->cct
, 10) << "NotTrimming react KickTrim" << dendl
;
14017 if (!(pg
->is_primary() && pg
->is_active())) {
14018 ldout(pg
->cct
, 10) << "NotTrimming not primary or active" << dendl
;
14019 return discard_event();
14021 if (!pg
->is_clean() ||
14022 pg
->snap_trimq
.empty()) {
14023 ldout(pg
->cct
, 10) << "NotTrimming not clean or nothing to trim" << dendl
;
14024 return discard_event();
14026 if (pg
->scrubber
.active
) {
14027 ldout(pg
->cct
, 10) << " scrubbing, will requeue snap_trimmer after" << dendl
;
14028 return transit
< WaitScrub
>();
14030 return transit
< Trimming
>();
// WaitReservation -> reservation granted: re-check that trimming is still
// possible, pick the lowest snapid in the trim queue, and move on to
// AwaitAsyncWork.
// NOTE(review): the closing "<< dendl;" of the second ldout appears to be
// missing from this extraction.
14034 boost::statechart::result
PrimaryLogPG::WaitReservation::react(const SnapTrimReserved
&)
14036 PrimaryLogPG
*pg
= context
< SnapTrimmer
>().pg
;
14037 ldout(pg
->cct
, 10) << "WaitReservation react SnapTrimReserved" << dendl
;
// Conditions may have changed while we waited for the reservation.
14040 if (!context
< SnapTrimmer
>().can_trim()) {
14041 post_event(KickTrim());
14042 return transit
< NotTrimming
>();
// Trim the first (lowest) snap in the queue.
14045 context
<Trimming
>().snap_to_trim
= pg
->snap_trimq
.range_start();
14046 ldout(pg
->cct
, 10) << "NotTrimming: trimming "
14047 << pg
->snap_trimq
.range_start()
14049 return transit
< AwaitAsyncWork
>();
14052 /* AwaitAsyncWork */
// AwaitAsyncWork: queue the PG for snap-trim work and flag the trimming
// state (clearing any previous trim error) so it is visible in pg stats.
// NOTE(review): the my_base(ctx) initializer line appears to be missing
// from this extraction.
14053 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx
)
14055 NamedState(context
< SnapTrimmer
>().pg
, "Trimming/AwaitAsyncWork")
14057 auto *pg
= context
< SnapTrimmer
>().pg
;
14058 context
< SnapTrimmer
>().log_enter(state_name
);
// Hand the PG to the OSD's snap-trim work queue.
14059 context
< SnapTrimmer
>().pg
->osd
->queue_for_snap_trim(pg
);
14060 pg
->state_set(PG_STATE_SNAPTRIM
);
14061 pg
->state_clear(PG_STATE_SNAPTRIM_ERROR
);
14062 pg
->publish_stats_to_osd();
// DoSnapWork: the actual trim step. Collect a batch of objects mapped to
// snap_to_trim, start an async trim repop for each, and transition based
// on the outcome (WaitRepops / WaitRWLock / NotTrimming).
// NOTE(review): multiple lines are missing from this extraction (the
// arguments to get_next_objects_to_trim, some dendl terminators, and the
// close of the register_on_success lambda); verify against upstream.
14065 boost::statechart::result
PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork
&)
14067 PrimaryLogPGRef pg
= context
< SnapTrimmer
>().pg
;
14068 snapid_t snap_to_trim
= context
<Trimming
>().snap_to_trim
;
14069 auto &in_flight
= context
<Trimming
>().in_flight
;
14070 assert(in_flight
.empty());
14072 assert(pg
->is_primary() && pg
->is_active());
// Bail out if trimming became impossible since we were scheduled.
14073 if (!context
< SnapTrimmer
>().can_trim()) {
14074 ldout(pg
->cct
, 10) << "something changed, reverting to NotTrimming" << dendl
;
14075 post_event(KickTrim());
14076 return transit
< NotTrimming
>();
14079 ldout(pg
->cct
, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim
<< dendl
;
// Gather up to osd_pg_max_concurrent_snap_trims objects for this snap.
14081 vector
<hobject_t
> to_trim
;
14082 unsigned max
= pg
->cct
->_conf
->osd_pg_max_concurrent_snap_trims
;
14083 to_trim
.reserve(max
);
14084 int r
= pg
->snap_mapper
.get_next_objects_to_trim(
// Any error other than ENOENT from the snap mapper is a bug.
14088 if (r
!= 0 && r
!= -ENOENT
) {
14089 lderr(pg
->cct
) << "get_next_objects_to_trim returned "
14090 << cpp_strerror(r
) << dendl
;
14091 assert(0 == "get_next_objects_to_trim returned an invalid code");
// ENOENT: nothing maps to this snap any more — record it as purged,
// persist the updated PG info, and go back to NotTrimming.
14092 } else if (r
== -ENOENT
) {
14094 ldout(pg
->cct
, 10) << "got ENOENT" << dendl
;
14096 ldout(pg
->cct
, 10) << "adding snap " << snap_to_trim
14097 << " to purged_snaps"
14099 pg
->info
.purged_snaps
.insert(snap_to_trim
);
14100 pg
->snap_trimq
.erase(snap_to_trim
);
14101 ldout(pg
->cct
, 10) << "purged_snaps now "
14102 << pg
->info
.purged_snaps
<< ", snap_trimq now "
14103 << pg
->snap_trimq
<< dendl
;
14105 ObjectStore::Transaction t
;
14106 pg
->dirty_big_info
= true;
14107 pg
->write_if_dirty(t
);
14108 int tr
= pg
->osd
->store
->queue_transaction(pg
->osr
.get(), std::move(t
), NULL
);
14111 pg
->share_pg_info();
14112 post_event(KickTrim());
14113 return transit
< NotTrimming
>();
14115 assert(!to_trim
.empty());
// Start an async trim repop per object; -ENOLCK signals a write-lock
// conflict rather than a hard error.
14117 for (auto &&object
: to_trim
) {
14119 ldout(pg
->cct
, 10) << "AwaitAsyncWork react trimming " << object
<< dendl
;
14121 int error
= pg
->trim_object(in_flight
.empty(), object
, &ctx
);
14123 if (error
== -ENOLCK
) {
14124 ldout(pg
->cct
, 10) << "could not get write lock on obj "
14125 << object
<< dendl
;
14127 pg
->state_set(PG_STATE_SNAPTRIM_ERROR
);
14128 ldout(pg
->cct
, 10) << "Snaptrim error=" << error
<< dendl
;
// If some trims were already submitted, let them drain first.
14130 if (!in_flight
.empty()) {
14131 ldout(pg
->cct
, 10) << "letting the ones we already started finish" << dendl
;
14132 return transit
< WaitRepops
>();
14134 if (error
== -ENOLCK
) {
14135 ldout(pg
->cct
, 10) << "waiting for it to clear"
14137 return transit
< WaitRWLock
>();
14139 return transit
< NotTrimming
>();
14143 in_flight
.insert(object
);
// When the last in-flight trim completes, advance the state machine:
// Reset on a recorded trim error, otherwise RepopsComplete.
14144 ctx
->register_on_success(
14145 [pg
, object
, &in_flight
]() {
14146 assert(in_flight
.find(object
) != in_flight
.end());
14147 in_flight
.erase(object
);
14148 if (in_flight
.empty()) {
14149 if (pg
->state_test(PG_STATE_SNAPTRIM_ERROR
)) {
14150 pg
->snap_trimmer_machine
.process_event(Reset());
14152 pg
->snap_trimmer_machine
.process_event(RepopsComplete());
14157 pg
->simple_opc_submit(std::move(ctx
));
14160 return transit
< WaitRepops
>();
// setattr_maybe_cache: record a single xattr write for this object in the
// pending transaction.
// NOTE(review): the remaining parameter lines (and any attr-cache branch)
// are missing from this extraction — verify against upstream.
14163 void PrimaryLogPG::setattr_maybe_cache(
14164 ObjectContextRef obc
,
14170 t
->setattr(obc
->obs
.oi
.soid
, key
, val
);
// setattrs_maybe_cache: record a batch of xattr writes for this object in
// the pending transaction.
// NOTE(review): some parameter lines are missing from this extraction.
14173 void PrimaryLogPG::setattrs_maybe_cache(
14174 ObjectContextRef obc
,
14177 map
<string
, bufferlist
> &attrs
)
14179 t
->setattrs(obc
->obs
.oi
.soid
, attrs
);
// rmattr_maybe_cache: record removal of a single xattr for this object in
// the pending transaction.
// NOTE(review): the remaining parameter lines are missing from this
// extraction.
14182 void PrimaryLogPG::rmattr_maybe_cache(
14183 ObjectContextRef obc
,
14188 t
->rmattr(obc
->obs
.oi
.soid
, key
);
// getattr_maybe_cache: read one xattr. For erasure-coded (rollback) pools
// the per-object attr cache is consulted first; otherwise the read goes
// straight to the backend.
// NOTE(review): parameter lines and the cache-hit/miss return paths are
// missing from this extraction — verify against upstream.
14191 int PrimaryLogPG::getattr_maybe_cache(
14192 ObjectContextRef obc
,
14196 if (pool
.info
.require_rollback()) {
14197 map
<string
, bufferlist
>::iterator i
= obc
->attr_cache
.find(key
);
14198 if (i
!= obc
->attr_cache
.end()) {
// Fallback: fetch the attr from the backend store.
14206 return pgbackend
->objects_get_attr(obc
->obs
.oi
.soid
, key
, val
);
// getattrs_maybe_cache: read all xattrs of an object. Rollback (EC) pools
// are served from the in-memory attr cache; otherwise the backend is
// queried. When user_only is set, only attrs with a leading '_' are kept,
// renamed with the prefix stripped.
// NOTE(review): parameter lines and parts of the loop body/return are
// missing from this extraction — verify against upstream.
14209 int PrimaryLogPG::getattrs_maybe_cache(
14210 ObjectContextRef obc
,
14211 map
<string
, bufferlist
> *out
,
14215 if (pool
.info
.require_rollback()) {
14217 *out
= obc
->attr_cache
;
14219 r
= pgbackend
->objects_get_attrs(obc
->obs
.oi
.soid
, out
);
// Filter down to user-visible attrs ('_'-prefixed), stripping the prefix.
14221 if (out
&& user_only
) {
14222 map
<string
, bufferlist
> tmp
;
14223 for (map
<string
, bufferlist
>::iterator i
= out
->begin();
14226 if (i
->first
.size() > 1 && i
->first
[0] == '_')
14227 tmp
[i
->first
.substr(1, i
->first
.size())].claim(i
->second
);
14234 bool PrimaryLogPG::check_failsafe_full(ostream
&ss
) {
14235 return osd
->check_failsafe_full(ss
);
14238 void intrusive_ptr_add_ref(PrimaryLogPG
*pg
) { pg
->get("intptr"); }
14239 void intrusive_ptr_release(PrimaryLogPG
*pg
) { pg
->put("intptr"); }
// Debug-refs variants of the ref hooks: when PG_DEBUG_REFS is enabled,
// each reference carries an id so leaks can be attributed.
14241 #ifdef PG_DEBUG_REFS
14242 uint64_t get_with_id(PrimaryLogPG
*pg
) { return pg
->get_with_id(); }
14243 void put_with_id(PrimaryLogPG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }
// NOTE(review): the matching #endif is not visible in this extraction.
14246 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather
*repop
) { repop
->get(); }
14247 void intrusive_ptr_release(PrimaryLogPG::RepGather
*repop
) { repop
->put(); }