ceph/src/osd/PrimaryLogPG.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
   8  *
   9  * Author: Loic Dachary <loic@dachary.org>
  10  *
  11  * This is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License version 2.1, as published by the Free Software
  14  * Foundation.  See file COPYING.
  15  *
  16  */
  17
  18 #include "boost/tuple/tuple.hpp"
  19 #include "boost/intrusive_ptr.hpp"
  20 #include "PG.h"
  21 #include "PrimaryLogPG.h"
  22 #include "OSD.h"
  23 #include "OpRequest.h"
  24 #include "ScrubStore.h"
  25 #include "Session.h"
  26 #include "objclass/objclass.h"
  27
  28 #include "common/errno.h"
  29 #include "common/scrub_types.h"
  30 #include "common/perf_counters.h"
  31
  32 #include "messages/MOSDOp.h"
  33 #include "messages/MOSDBackoff.h"
  34 #include "messages/MOSDSubOp.h"
  35 #include "messages/MOSDSubOpReply.h"
  36 #include "messages/MOSDPGTrim.h"
  37 #include "messages/MOSDPGScan.h"
  38 #include "messages/MOSDRepScrub.h"
  39 #include "messages/MOSDPGBackfill.h"
  40 #include "messages/MOSDPGBackfillRemove.h"
  41 #include "messages/MOSDPGUpdateLogMissing.h"
  42 #include "messages/MOSDPGUpdateLogMissingReply.h"
  43 #include "messages/MCommandReply.h"
  44 #include "messages/MOSDScrubReserve.h"
  45 #include "mds/inode_backtrace.h" // Ugh
  46 #include "common/EventTrace.h"
  47
  48 #include "common/config.h"
  49 #include "include/compat.h"
  50 #include "mon/MonClient.h"
  51 #include "osdc/Objecter.h"
  52 #include "json_spirit/json_spirit_value.h"
  53 #include "json_spirit/json_spirit_reader.h"
  54 #include "include/assert.h"  // json_spirit clobbers it
  55 #include "include/rados/rados_types.hpp"
  56
  57 #ifdef WITH_LTTNG
  58 #include "tracing/osd.h"
  59 #else
  60 #define tracepoint(...)
  61 #endif
  62
  63 #define dout_context cct
  64 #define dout_subsys ceph_subsys_osd
  65 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
  66 #undef dout_prefix
  67 #define dout_prefix _prefix(_dout, this)
  68 template <typename T>
  69 static ostream& _prefix(std::ostream *_dout, T *pg) {
  70   return *_dout << pg->gen_prefix();
  71 }
  72
  73
  74 #include <sstream>
  75 #include <utility>
  76
  77 #include <errno.h>
  78
// Mempool object-factory definition for PrimaryLogPG.
// NOTE(review): the arguments appear to be (class, factory name, pool); the
// "replicatedpg" name presumably predates the rename to PrimaryLogPG -- confirm
// before changing it.
MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
  80
  81 PGLSFilter::PGLSFilter() : cct(nullptr)
  82 {
  83 }
  84
  85 PGLSFilter::~PGLSFilter()
  86 {
  87 }
  88
  89 struct PrimaryLogPG::C_OSD_OnApplied : Context {
  90   PrimaryLogPGRef pg;
  91   epoch_t epoch;
  92   eversion_t v;
  93   C_OSD_OnApplied(
  94     PrimaryLogPGRef pg,
  95     epoch_t epoch,
  96     eversion_t v)
  97     : pg(pg), epoch(epoch), v(v) {}
  98   void finish(int) override {
  99     pg->lock();
 100     if (!pg->pg_has_reset_since(epoch))
 101       pg->op_applied(v);
 102     pg->unlock();
 103   }
 104 };
 105
 106 /**
 107  * The CopyCallback class defines an interface for completions to the
 108  * copy_start code. Users of the copy infrastructure must implement
 109  * one and give an instance of the class to start_copy.
 110  *
 111  * The implementer is responsible for making sure that the CopyCallback
 112  * can associate itself with the correct copy operation.
 113  */
 114 class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
 115 protected:
 116   CopyCallback() {}
 117   /**
 118    * results.get<0>() is the return code: 0 for success; -ECANCELED if
 119    * the operation was cancelled by the local OSD; -errno for other issues.
 120    * results.get<1>() is a pointer to a CopyResults object, which you are
 121    * responsible for deleting.
 122    */
 123   void finish(CopyCallbackResults results_) override = 0;
 124
 125 public:
 126   /// Provide the final size of the copied object to the CopyCallback
 127   ~CopyCallback() override {}
 128 };
 129
 130 template <typename T>
 131 class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
 132   PrimaryLogPGRef pg;
 133   unique_ptr<GenContext<T>> c;
 134   epoch_t e;
 135 public:
 136   BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
 137     : pg(pg), c(c), e(e) {}
 138   void finish(T t) override {
 139     pg->lock();
 140     if (pg->pg_has_reset_since(e))
 141       c.reset();
 142     else
 143       c.release()->complete(t);
 144     pg->unlock();
 145   }
 146 };
 147
 148 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
 149   GenContext<ThreadPool::TPHandle&> *c) {
 150   return new BlessedGenContext<ThreadPool::TPHandle&>(
 151     this, c, get_osdmap()->get_epoch());
 152 }
 153
 154 class PrimaryLogPG::BlessedContext : public Context {
 155   PrimaryLogPGRef pg;
 156   unique_ptr<Context> c;
 157   epoch_t e;
 158 public:
 159   BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
 160     : pg(pg), c(c), e(e) {}
 161   void finish(int r) override {
 162     pg->lock();
 163     if (pg->pg_has_reset_since(e))
 164       c.reset();
 165     else
 166       c.release()->complete(r);
 167     pg->unlock();
 168   }
 169 };
 170
 171
 172 Context *PrimaryLogPG::bless_context(Context *c) {
 173   return new BlessedContext(this, c, get_osdmap()->get_epoch());
 174 }
 175
 176 class PrimaryLogPG::C_PG_ObjectContext : public Context {
 177   PrimaryLogPGRef pg;
 178   ObjectContext *obc;
 179   public:
 180   C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
 181     pg(p), obc(o) {}
 182   void finish(int r) override {
 183     pg->object_context_destructor_callback(obc);
 184   }
 185 };
 186
 187 class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
 188   ObjectContextRef obc, obc2, obc3;
 189   public:
 190   C_OSD_OndiskWriteUnlock(
 191     ObjectContextRef o,
 192     ObjectContextRef o2 = ObjectContextRef(),
 193     ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
 194   void finish(int r) override {
 195     obc->ondisk_write_unlock();
 196     if (obc2)
 197       obc2->ondisk_write_unlock();
 198     if (obc3)
 199       obc3->ondisk_write_unlock();
 200   }
 201 };
 202
 203 struct OnReadComplete : public Context {
 204   PrimaryLogPG *pg;
 205   PrimaryLogPG::OpContext *opcontext;
 206   OnReadComplete(
 207     PrimaryLogPG *pg,
 208     PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
 209   void finish(int r) override {
 210     opcontext->finish_read(pg);
 211   }
 212   ~OnReadComplete() override {}
 213 };
 214
 215 class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
 216   PrimaryLogPGRef pg;
 217   ObjectContextRef obc;
 218   public:
 219   C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
 220     pg(p), obc(o) {}
 221   void finish(int r) override {
 222     pg->_applied_recovered_object(obc);
 223   }
 224 };
 225
 226 class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
 227   PrimaryLogPGRef pg;
 228   epoch_t epoch;
 229   eversion_t last_complete;
 230   public:
 231   C_OSD_CommittedPushedObject(
 232     PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
 233     pg(p), epoch(epoch), last_complete(lc) {
 234   }
 235   void finish(int r) override {
 236     pg->_committed_pushed_object(epoch, last_complete);
 237   }
 238 };
 239
 240 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
 241   PrimaryLogPGRef pg;
 242   public:
 243   explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
 244     pg(p) {}
 245   void finish(int r) override {
 246     pg->_applied_recovered_object_replica();
 247   }
 248 };
 249
 250 // OpContext
 251 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
 252 {
 253   inflightreads = 1;
 254   list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
 255             pair<bufferlist*, Context*> > > in;
 256   in.swap(pending_async_reads);
 257   pg->pgbackend->objects_read_async(
 258     obc->obs.oi.soid,
 259     in,
 260     new OnReadComplete(pg, this), pg->get_pool().fast_read);
 261 }
 262 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
 263 {
 264   assert(inflightreads > 0);
 265   --inflightreads;
 266   if (async_reads_complete()) {
 267     assert(pg->in_progress_async_reads.size());
 268     assert(pg->in_progress_async_reads.front().second == this);
 269     pg->in_progress_async_reads.pop_front();
 270
 271     // Restart the op context now that all reads have been
 272     // completed. Read failures will be handled by the op finisher
 273     pg->execute_ctx(this);
 274   }
 275 }
 276
 277 class CopyFromCallback : public PrimaryLogPG::CopyCallback {
 278 public:
 279   PrimaryLogPG::CopyResults *results = nullptr;
 280   PrimaryLogPG::OpContext *ctx;
 281   OSDOp &osd_op;
 282
 283   CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
 284     : ctx(ctx), osd_op(osd_op) {
 285   }
 286   ~CopyFromCallback() override {}
 287
 288   void finish(PrimaryLogPG::CopyCallbackResults results_) override {
 289     results = results_.get<1>();
 290     int r = results_.get<0>();
 291
 292     // for finish_copyfrom
 293     ctx->user_at_version = results->user_version;
 294
 295     if (r >= 0) {
 296       ctx->pg->execute_ctx(ctx);
 297     } else {
 298       if (r != -ECANCELED) { // on cancel just toss it out; client resends
 299         if (ctx->op)
 300           ctx->pg->osd->reply_op_error(ctx->op, r);
 301       } else if (results->should_requeue) {
 302         if (ctx->op)
 303           ctx->pg->requeue_op(ctx->op);
 304       }
 305       ctx->pg->close_op_ctx(ctx);
 306     }
 307   }
 308
 309   bool is_temp_obj_used() {
 310     return results->started_temp_obj;
 311   }
 312   uint64_t get_data_size() {
 313     return results->object_size;
 314   }
 315 };
 316
 317 struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
 318   CopyFromCallback *copy_from_callback;
 319
 320   CopyFromFinisher(CopyFromCallback *copy_from_callback)
 321     : copy_from_callback(copy_from_callback) {
 322   }
 323
 324   int execute() override {
 325     // instance will be destructed after this method completes
 326     copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
 327     return 0;
 328   }
 329 };
 330
 331 // ======================
 332 // PGBackend::Listener
 333
/**
 * Listener hook: update PG state and transaction `t` for a locally
 * recovered object.
 *
 * In order, this:
 *  - resets the snap mapping for `hoid` and, for a non-deleted clone,
 *    re-adds it from the recovered snapset (or legacy snaps);
 *  - if the object is still missing at a newer version because of a
 *    LOST_REVERT log entry, rewrites the object info to the revert version;
 *  - bumps active_pushes, rolls the log forward if needed and records the
 *    recovery via recover_got();
 *  - on the primary, updates the obc and wakes ops waiting on the object;
 *  - registers applied/commit callbacks on `t` and persists dirty PG info.
 *
 * @param hoid           object that was recovered
 * @param _recovery_info recovery metadata; copied because it may be amended
 * @param obc            object context; dereferenced unconditionally on the
 *                       primary for non-deletes, null-checked elsewhere
 * @param is_delete      true when the recovery is a deletion
 * @param t              transaction the updates and callbacks are added to
 */
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  // Work on a private copy: version and oi may be rewritten below.
  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    // Recovered a clone: rebuild its entry in the snap mapper.
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  // We received an older version than the missing set says we need.
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      // obc is dereferenced without a null check on this path.
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      // Pairs with the ondisk_write_lock() taken above.
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    // Requeue ops that were waiting for the object to become readable.
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}
 448
 449 void PrimaryLogPG::on_global_recover(
 450   const hobject_t &soid,
 451   const object_stat_sum_t &stat_diff,
 452   bool is_delete)
 453 {
 454   info.stats.stats.sum.add(stat_diff);
 455   missing_loc.recovered(soid);
 456   publish_stats_to_osd();
 457   dout(10) << "pushed " << soid << " to all replicas" << dendl;
 458   map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
 459   assert(i != recovering.end());
 460
 461   if (!is_delete) {
 462     // recover missing won't have had an obc, but it gets filled in
 463     // during on_local_recover
 464     assert(i->second);
 465     list<OpRequestRef> requeue_list;
 466     i->second->drop_recovery_read(&requeue_list);
 467     requeue_ops(requeue_list);
 468   }
 469
 470   backfills_in_flight.erase(soid);
 471
 472   recovering.erase(i);
 473   finish_recovery_op(soid);
 474   release_backoffs(soid);
 475   auto degraded_object_entry = waiting_for_degraded_object.find(soid);
 476   if (degraded_object_entry != waiting_for_degraded_object.end()) {
 477     dout(20) << " kicking degraded waiters on " << soid << dendl;
 478     requeue_ops(degraded_object_entry->second);
 479     waiting_for_degraded_object.erase(degraded_object_entry);
 480   }
 481   auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
 482   if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
 483     dout(20) << " kicking unreadable waiters on " << soid << dendl;
 484     requeue_ops(unreadable_object_entry->second);
 485     waiting_for_unreadable_object.erase(unreadable_object_entry);
 486   }
 487   finish_degraded_object(soid);
 488 }
 489
 490 void PrimaryLogPG::on_peer_recover(
 491   pg_shard_t peer,
 492   const hobject_t &soid,
 493   const ObjectRecoveryInfo &recovery_info)
 494 {
 495   publish_stats_to_osd();
 496   // done!
 497   peer_missing[peer].got(soid, recovery_info.version);
 498 }
 499
 500 void PrimaryLogPG::begin_peer_recover(
 501   pg_shard_t peer,
 502   const hobject_t soid)
 503 {
 504   peer_missing[peer].revise_have(soid, eversion_t());
 505 }
 506
/// Listener hook: queue `c` on the OSD's recovery_gen_wq work queue.
void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}
 512
/// Listener hook: forward `m` for OSD `peer` to the OSD service's
/// send_message_osd_cluster(), passing `from_epoch` through unchanged.
void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

/// Listener hook: forward `m` on an existing connection (raw pointer form).
void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

/// Listener hook: forward `m` on an existing connection (ConnectionRef form).
void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}
 530
/**
 * Listener hook for an error on the primary's copy of `oid` at version `v`.
 * Logs at level 0, then calls primary_failed(), primary_error() and
 * backfill_add_missing() in that order.
 */
void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}
 540
 541 void PrimaryLogPG::backfill_add_missing(
 542   const hobject_t &oid,
 543   eversion_t v)
 544 {
 545   dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
 546   backfills_in_flight.erase(oid);
 547   missing_loc.add_missing(oid, v, eversion_t());
 548 }
 549
/// Listener hook: return the OSD service's cluster connection to `peer`,
/// passing `from_epoch` through unchanged.
ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}
 555
/// Listener hook: expose the OSD service's perf-counter logger.
PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}
 560
 561
 562 // ====================
 563 // missing objects
 564
 565 bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
 566 {
 567   return pg_log.get_missing().get_items().count(soid);
 568 }
 569
 570 void PrimaryLogPG::maybe_kick_recovery(
 571   const hobject_t &soid)
 572 {
 573   eversion_t v;
 574   if (!missing_loc.needs_recovery(soid, &v))
 575     return;
 576
 577   map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
 578   if (p != recovering.end()) {
 579     dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
 580   } else if (missing_loc.is_unfound(soid)) {
 581     dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
 582   } else {
 583     dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
 584     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
 585     if (is_missing_object(soid)) {
 586       recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
 587     } else if (missing_loc.is_deleted(soid)) {
 588       prep_object_replica_deletes(soid, v, h);
 589     } else {
 590       prep_object_replica_pushes(soid, v, h);
 591     }
 592     pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
 593   }
 594 }
 595
 596 void PrimaryLogPG::wait_for_unreadable_object(
 597   const hobject_t& soid, OpRequestRef op)
 598 {
 599   assert(is_unreadable_object(soid));
 600   maybe_kick_recovery(soid);
 601   waiting_for_unreadable_object[soid].push_back(op);
 602   op->mark_delayed("waiting for missing object");
 603 }
 604
 605 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
 606 {
 607   /* The conditions below may clear (on_local_recover, before we queue
 608    * the transaction) before we actually requeue the degraded waiters
 609    * in on_global_recover after the transaction completes.
 610    */
 611   if (waiting_for_degraded_object.count(soid))
 612     return true;
 613   if (pg_log.get_missing().get_items().count(soid))
 614     return true;
 615   assert(!actingbackfill.empty());
 616   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
 617        i != actingbackfill.end();
 618        ++i) {
 619     if (*i == get_primary()) continue;
 620     pg_shard_t peer = *i;
 621     auto peer_missing_entry = peer_missing.find(peer);
 622     if (peer_missing_entry != peer_missing.end() &&
 623         peer_missing_entry->second.get_items().count(soid))
 624       return true;
 625
 626     // Object is degraded if after last_backfill AND
 627     // we are backfilling it
 628     if (is_backfill_targets(peer) &&
 629         peer_info[peer].last_backfill <= soid &&
 630         last_backfill_started >= soid &&
 631         backfills_in_flight.count(soid))
 632       return true;
 633   }
 634   return false;
 635 }
 636
 637 void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
 638 {
 639   assert(is_degraded_or_backfilling_object(soid));
 640
 641   maybe_kick_recovery(soid);
 642   waiting_for_degraded_object[soid].push_back(op);
 643   op->mark_delayed("waiting for degraded object");
 644 }
 645
/// Park `op` on the cache-not-full wait list and record the head of `_oid`
/// as blocked on a full cache.
void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  // Blocking is tracked per head object, whatever snap was addressed.
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}
 656
/// Park `op` on the waiting_for_clean_to_primary_repair list; `oid` is only
/// used for the log message.
void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}
 665
 666 void PrimaryLogPG::block_write_on_snap_rollback(
 667   const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
 668 {
 669   dout(20) << __func__ << ": blocking object " << oid.get_head()
 670            << " on snap promotion " << obc->obs.oi.soid << dendl;
 671   // otherwise, we'd have blocked in do_op
 672   assert(oid.is_head());
 673   assert(objects_blocked_on_snap_promotion.count(oid) == 0);
 674   objects_blocked_on_snap_promotion[oid] = obc;
 675   wait_for_blocked_object(obc->obs.oi.soid, op);
 676 }
 677
 678 void PrimaryLogPG::block_write_on_degraded_snap(
 679   const hobject_t& snap, OpRequestRef op)
 680 {
 681   dout(20) << __func__ << ": blocking object " << snap.get_head()
 682            << " on degraded snap " << snap << dendl;
 683   // otherwise, we'd have blocked in do_op
 684   assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
 685   objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
 686   wait_for_degraded_object(snap, op);
 687 }
 688
 689 bool PrimaryLogPG::maybe_await_blocked_snapset(
 690   const hobject_t &hoid,
 691   OpRequestRef op)
 692 {
 693   ObjectContextRef obc;
 694   obc = object_contexts.lookup(hoid.get_head());
 695   if (obc) {
 696     if (obc->is_blocked()) {
 697       wait_for_blocked_object(obc->obs.oi.soid, op);
 698       return true;
 699     } else {
 700       return false;
 701     }
 702   }
 703   obc = object_contexts.lookup(hoid.get_snapdir());
 704   if (obc) {
 705     if (obc->is_blocked()) {
 706       wait_for_blocked_object(obc->obs.oi.soid, op);
 707       return true;
 708     } else {
 709       return false;
 710     }
 711   }
 712   return false;
 713 }
 714
 715 void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
 716 {
 717   dout(10) << __func__ << " " << soid << " " << op << dendl;
 718   waiting_for_blocked_object[soid].push_back(op);
 719   op->mark_delayed("waiting for blocked object");
 720 }
 721
 722 void PrimaryLogPG::maybe_force_recovery()
 723 {
 724   // no force if not in degraded/recovery/backfill states
 725   if (!is_degraded() &&
 726       !state_test(PG_STATE_RECOVERING |
 727                   PG_STATE_RECOVERY_WAIT |
 728                   PG_STATE_BACKFILLING |
 729                   PG_STATE_BACKFILL_WAIT |
 730                   PG_STATE_BACKFILL_TOOFULL))
 731     return;
 732
 733   if (pg_log.get_log().approx_size() <
 734       cct->_conf->osd_max_pg_log_entries *
 735         cct->_conf->osd_force_recovery_pg_log_entries_factor)
 736     return;
 737
 738   // find the oldest missing object
 739   version_t min_version = 0;
 740   hobject_t soid;
 741   if (!pg_log.get_missing().get_items().empty()) {
 742     min_version = pg_log.get_missing().get_rmissing().begin()->first;
 743     soid = pg_log.get_missing().get_rmissing().begin()->second;
 744   }
 745   assert(!actingbackfill.empty());
 746   for (set<pg_shard_t>::iterator it = actingbackfill.begin();
 747        it != actingbackfill.end();
 748        ++it) {
 749     if (*it == get_primary()) continue;
 750     pg_shard_t peer = *it;
 751     if (peer_missing.count(peer) &&
 752         !peer_missing[peer].get_items().empty() &&
 753         min_version > peer_missing[peer].get_rmissing().begin()->first) {
 754       min_version = peer_missing[peer].get_rmissing().begin()->first;
 755       soid = peer_missing[peer].get_rmissing().begin()->second;
 756     }
 757   }
 758
 759   // recover it
 760   if (soid != hobject_t())
 761     maybe_kick_recovery(soid);
 762 }
 763
 764 class PGLSPlainFilter : public PGLSFilter {
 765   string val;
 766 public:
 767   int init(bufferlist::iterator &params) override
 768   {
 769     try {
 770       ::decode(xattr, params);
 771       ::decode(val, params);
 772     } catch (buffer::error &e) {
 773       return -EINVAL;
 774     }
 775
 776     return 0;
 777   }
 778   ~PGLSPlainFilter() override {}
 779   bool filter(const hobject_t &obj, bufferlist& xattr_data,
 780                       bufferlist& outdata) override;
 781 };
 782
 783 class PGLSParentFilter : public PGLSFilter {
 784   inodeno_t parent_ino;
 785 public:
 786   CephContext* cct;
 787   PGLSParentFilter(CephContext* cct) : cct(cct) {
 788     xattr = "_parent";
 789   }
 790   int init(bufferlist::iterator &params) override
 791   {
 792     try {
 793       ::decode(parent_ino, params);
 794     } catch (buffer::error &e) {
 795       return -EINVAL;
 796     }
 797     generic_dout(0) << "parent_ino=" << parent_ino << dendl;
 798
 799     return 0;
 800   }
 801   ~PGLSParentFilter() override {}
 802   bool filter(const hobject_t &obj, bufferlist& xattr_data,
 803                       bufferlist& outdata) override;
 804 };
 805
 806 bool PGLSParentFilter::filter(const hobject_t &obj,
 807                               bufferlist& xattr_data, bufferlist& outdata)
 808 {
 809   bufferlist::iterator iter = xattr_data.begin();
 810   inode_backtrace_t bt;
 811
 812   generic_dout(0) << "PGLSParentFilter::filter" << dendl;
 813
 814   ::decode(bt, iter);
 815
 816   vector<inode_backpointer_t>::iterator vi;
 817   for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
 818     generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
 819     if (vi->dirino == parent_ino) {
 820       ::encode(*vi, outdata);
 821       return true;
 822     }
 823   }
 824
 825   return false;
 826 }
 827
 828 bool PGLSPlainFilter::filter(const hobject_t &obj,
 829                              bufferlist& xattr_data, bufferlist& outdata)
 830 {
 831   if (val.size() != xattr_data.length())
 832     return false;
 833
 834   if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
 835     return false;
 836
 837   return true;
 838 }
 839
 840 bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
 841 {
 842   bufferlist bl;
 843
 844   // If filter has expressed an interest in an xattr, load it.
 845   if (!filter->get_xattr().empty()) {
 846     int ret = pgbackend->objects_get_attr(
 847       sobj,
 848       filter->get_xattr(),
 849       &bl);
 850     dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
 851     if (ret < 0) {
 852       if (ret != -ENODATA || filter->reject_empty_xattr()) {
 853         return false;
 854       }
 855     }
 856   }
 857
 858   return filter->filter(sobj, bl, outdata);
 859 }
 860
 861 int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
 862 {
 863   string type;
 864   PGLSFilter *filter;
 865
 866   try {
 867     ::decode(type, iter);
 868   }
 869   catch (buffer::error& e) {
 870     return -EINVAL;
 871   }
 872
 873   if (type.compare("parent") == 0) {
 874     filter = new PGLSParentFilter(cct);
 875   } else if (type.compare("plain") == 0) {
 876     filter = new PGLSPlainFilter();
 877   } else {
 878     std::size_t dot = type.find(".");
 879     if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
 880       return -EINVAL;
 881     }
 882
 883     const std::string class_name = type.substr(0, dot);
 884     const std::string filter_name = type.substr(dot + 1);
 885     ClassHandler::ClassData *cls = NULL;
 886     int r = osd->class_handler->open_class(class_name, &cls);
 887     if (r != 0) {
 888       derr << "Error opening class '" << class_name << "': "
 889            << cpp_strerror(r) << dendl;
 890       if (r != -EPERM) // propogate permission error
 891         r = -EINVAL;
 892       return r;
 893     } else {
 894       assert(cls);
 895     }
 896
 897     ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
 898     if (class_filter == NULL) {
 899       derr << "Error finding filter '" << filter_name << "' in class "
 900            << class_name << dendl;
 901       return -EINVAL;
 902     }
 903     filter = class_filter->fn();
 904     if (!filter) {
 905       // Object classes are obliged to return us something, but let's
 906       // give an error rather than asserting out.
 907       derr << "Buggy class " << class_name << " failed to construct "
 908               "filter " << filter_name << dendl;
 909       return -EINVAL;
 910     }
 911   }
 912
 913   assert(filter);
 914   int r = filter->init(iter);
 915   if (r < 0) {
 916     derr << "Error initializing filter " << type << ": "
 917          << cpp_strerror(r) << dendl;
 918     delete filter;
 919     return -EINVAL;
 920   } else {
 921     // Successfully constructed and initialized, return it.
 922     *pfilter = filter;
 923     return 0;
 924   }
 925 }
 926
 927
 928 // ==========================================================
 929
 930 int PrimaryLogPG::do_command(
 931   cmdmap_t cmdmap,
 932   ostream& ss,
 933   bufferlist& idata,
 934   bufferlist& odata,
 935   ConnectionRef con,
 936   ceph_tid_t tid)
 937 {
 938   const auto &missing = pg_log.get_missing();
 939   string prefix;
 940   string format;
 941
 942   cmd_getval(cct, cmdmap, "format", format);
 943   boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));
 944
 945   string command;
 946   cmd_getval(cct, cmdmap, "cmd", command);
 947   if (command == "query") {
 948     f->open_object_section("pg");
 949     f->dump_string("state", pg_state_string(get_state()));
 950     f->dump_stream("snap_trimq") << snap_trimq;
 951     f->dump_unsigned("snap_trimq_len", snap_trimq.size());
 952     f->dump_unsigned("epoch", get_osdmap()->get_epoch());
 953     f->open_array_section("up");
 954     for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
 955       f->dump_unsigned("osd", *p);
 956     f->close_section();
 957     f->open_array_section("acting");
 958     for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
 959       f->dump_unsigned("osd", *p);
 960     f->close_section();
 961     if (!backfill_targets.empty()) {
 962       f->open_array_section("backfill_targets");
 963       for (set<pg_shard_t>::iterator p = backfill_targets.begin();
 964            p != backfill_targets.end();
 965            ++p)
 966         f->dump_stream("shard") << *p;
 967       f->close_section();
 968     }
 969     if (!actingbackfill.empty()) {
 970       f->open_array_section("actingbackfill");
 971       for (set<pg_shard_t>::iterator p = actingbackfill.begin();
 972            p != actingbackfill.end();
 973            ++p)
 974         f->dump_stream("shard") << *p;
 975       f->close_section();
 976     }
 977     f->open_object_section("info");
 978     _update_calc_stats();
 979     info.dump(f.get());
 980     f->close_section();
 981
 982     f->open_array_section("peer_info");
 983     for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
 984          p != peer_info.end();
 985          ++p) {
 986       f->open_object_section("info");
 987       f->dump_stream("peer") << p->first;
 988       p->second.dump(f.get());
 989       f->close_section();
 990     }
 991     f->close_section();
 992
 993     f->open_array_section("recovery_state");
 994     handle_query_state(f.get());
 995     f->close_section();
 996
 997     f->open_object_section("agent_state");
 998     if (agent_state)
 999       agent_state->dump(f.get());
1000     f->close_section();
1001
1002     f->close_section();
1003     f->flush(odata);
1004     return 0;
1005   }
1006   else if (command == "mark_unfound_lost") {
1007     string mulcmd;
1008     cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
1009     int mode = -1;
1010     if (mulcmd == "revert") {
1011       if (pool.info.ec_pool()) {
1012         ss << "mode must be 'delete' for ec pool";
1013         return -EINVAL;
1014       }
1015       mode = pg_log_entry_t::LOST_REVERT;
1016     } else if (mulcmd == "delete") {
1017       mode = pg_log_entry_t::LOST_DELETE;
1018     } else {
1019       ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
1020       return -EINVAL;
1021     }
1022     assert(mode == pg_log_entry_t::LOST_REVERT ||
1023            mode == pg_log_entry_t::LOST_DELETE);
1024
1025     if (!is_primary()) {
1026       ss << "not primary";
1027       return -EROFS;
1028     }
1029
1030     uint64_t unfound = missing_loc.num_unfound();
1031     if (!unfound) {
1032       ss << "pg has no unfound objects";
1033       return 0;  // make command idempotent
1034     }
1035
1036     if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1037       ss << "pg has " << unfound
1038          << " unfound objects but we haven't probed all sources, not marking lost";
1039       return -EINVAL;
1040     }
1041
1042     mark_all_unfound_lost(mode, con, tid);
1043     return -EAGAIN;
1044   }
1045   else if (command == "list_missing") {
1046     hobject_t offset;
1047     string offset_json;
1048     if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
1049       json_spirit::Value v;
1050       try {
1051         if (!json_spirit::read(offset_json, v))
1052           throw std::runtime_error("bad json");
1053         offset.decode(v);
1054       } catch (std::runtime_error& e) {
1055         ss << "error parsing offset: " << e.what();
1056         return -EINVAL;
1057       }
1058     }
1059     f->open_object_section("missing");
1060     {
1061       f->open_object_section("offset");
1062       offset.dump(f.get());
1063       f->close_section();
1064     }
1065     f->dump_int("num_missing", missing.num_missing());
1066     f->dump_int("num_unfound", get_num_unfound());
1067     const map<hobject_t, pg_missing_item> &needs_recovery_map =
1068       missing_loc.get_needs_recovery();
1069     map<hobject_t, pg_missing_item>::const_iterator p =
1070       needs_recovery_map.upper_bound(offset);
1071     {
1072       f->open_array_section("objects");
1073       int32_t num = 0;
1074       for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
1075         if (missing_loc.is_unfound(p->first)) {
1076           f->open_object_section("object");
1077           {
1078             f->open_object_section("oid");
1079             p->first.dump(f.get());
1080             f->close_section();
1081           }
1082           p->second.dump(f.get()); // have, need keys
1083           {
1084             f->open_array_section("locations");
1085             for (set<pg_shard_t>::iterator r =
1086                 missing_loc.get_locations(p->first).begin();
1087                 r != missing_loc.get_locations(p->first).end();
1088                 ++r)
1089               f->dump_stream("shard") << *r;
1090             f->close_section();
1091           }
1092           f->close_section();
1093           num++;
1094         }
1095       }
1096       f->close_section();
1097     }
1098     f->dump_bool("more", p != needs_recovery_map.end());
1099     f->close_section();
1100     f->flush(odata);
1101     return 0;
1102   }
1103
1104   ss << "unknown pg command " << prefix;
1105   return -EINVAL;
1106 }
1107
1108 // ==========================================================
1109
1110 void PrimaryLogPG::do_pg_op(OpRequestRef op)
1111 {
1112   // NOTE: this is non-const because we modify the OSDOp.outdata in
1113   // place
1114   MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
1115   assert(m->get_type() == CEPH_MSG_OSD_OP);
1116   dout(10) << "do_pg_op " << *m << dendl;
1117
1118   op->mark_started();
1119
1120   int result = 0;
1121   string cname, mname;
1122   PGLSFilter *filter = NULL;
1123   bufferlist filter_out;
1124
1125   snapid_t snapid = m->get_snapid();
1126
1127   vector<OSDOp> ops = m->ops;
1128
1129   for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
1130     OSDOp& osd_op = *p;
1131     bufferlist::iterator bp = p->indata.begin();
1132     switch (p->op.op) {
1133     case CEPH_OSD_OP_PGNLS_FILTER:
1134       try {
1135         ::decode(cname, bp);
1136         ::decode(mname, bp);
1137       }
1138       catch (const buffer::error& e) {
1139         dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1140         result = -EINVAL;
1141         break;
1142       }
1143       if (filter) {
1144         delete filter;
1145         filter = NULL;
1146       }
1147       result = get_pgls_filter(bp, &filter);
1148       if (result < 0)
1149         break;
1150
1151       assert(filter);
1152
1153       // fall through
1154
1155     case CEPH_OSD_OP_PGNLS:
1156       if (snapid != CEPH_NOSNAP) {
1157         result = -EINVAL;
1158         break;
1159       }
1160       if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1161         dout(10) << " pgnls pg=" << m->get_pg()
1162                  << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1163                  << " != " << info.pgid << dendl;
1164         result = 0; // hmm?
1165       } else {
1166         unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);
1167
1168         dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
1169         // read into a buffer
1170         vector<hobject_t> sentries;
1171         pg_nls_response_t response;
1172         try {
1173           ::decode(response.handle, bp);
1174         }
1175         catch (const buffer::error& e) {
1176           dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
1177           result = -EINVAL;
1178           break;
1179         }
1180
1181         hobject_t next;
1182         hobject_t lower_bound = response.handle;
1183         hobject_t pg_start = info.pgid.pgid.get_hobj_start();
1184         hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1185         dout(10) << " pgnls lower_bound " << lower_bound
1186                  << " pg_end " << pg_end << dendl;
1187         if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
1188              (lower_bound != hobject_t() && lower_bound < pg_start))) {
1189           // this should only happen with a buggy client.
1190           dout(10) << "outside of PG bounds " << pg_start << " .. "
1191                    << pg_end << dendl;
1192           result = -EINVAL;
1193           break;
1194         }
1195
1196         hobject_t current = lower_bound;
1197         osr->flush();
1198         int r = pgbackend->objects_list_partial(
1199           current,
1200           list_size,
1201           list_size,
1202           &sentries,
1203           &next);
1204         if (r != 0) {
1205           result = -EINVAL;
1206           break;
1207         }
1208
1209         map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1210           pg_log.get_missing().get_items().lower_bound(current);
1211         vector<hobject_t>::iterator ls_iter = sentries.begin();
1212         hobject_t _max = hobject_t::get_max();
1213         while (1) {
1214           const hobject_t &mcand =
1215             missing_iter == pg_log.get_missing().get_items().end() ?
1216             _max :
1217             missing_iter->first;
1218           const hobject_t &lcand =
1219             ls_iter == sentries.end() ?
1220             _max :
1221             *ls_iter;
1222
1223           hobject_t candidate;
1224           if (mcand == lcand) {
1225             candidate = mcand;
1226             if (!mcand.is_max()) {
1227               ++ls_iter;
1228               ++missing_iter;
1229             }
1230           } else if (mcand < lcand) {
1231             candidate = mcand;
1232             assert(!mcand.is_max());
1233             ++missing_iter;
1234           } else {
1235             candidate = lcand;
1236             assert(!lcand.is_max());
1237             ++ls_iter;
1238           }
1239
1240           dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
1241             << " vs lower bound 0x" << lower_bound.get_hash() << dendl;
1242
1243           if (candidate >= next) {
1244             break;
1245           }
1246
1247           if (response.entries.size() == list_size) {
1248             next = candidate;
1249             break;
1250           }
1251
1252           // skip snapdir objects
1253           if (candidate.snap == CEPH_SNAPDIR)
1254             continue;
1255
1256           if (candidate.snap != CEPH_NOSNAP)
1257             continue;
1258
1259           // skip internal namespace
1260           if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
1261             continue;
1262
1263           if (missing_loc.is_deleted(candidate))
1264             continue;
1265
1266           // skip wrong namespace
1267           if (m->get_hobj().nspace != librados::all_nspaces &&
1268                candidate.get_namespace() != m->get_hobj().nspace)
1269             continue;
1270
1271           if (filter && !pgls_filter(filter, candidate, filter_out))
1272             continue;
1273
1274           dout(20) << "pgnls item 0x" << std::hex
1275             << candidate.get_hash()
1276             << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
1277             << std::dec << " "
1278             << candidate.oid.name << dendl;
1279
1280           librados::ListObjectImpl item;
1281           item.nspace = candidate.get_namespace();
1282           item.oid = candidate.oid.name;
1283           item.locator = candidate.get_key();
1284           response.entries.push_back(item);
1285         }
1286
1287         if (next.is_max() &&
1288             missing_iter == pg_log.get_missing().get_items().end() &&
1289             ls_iter == sentries.end()) {
1290           result = 1;
1291
1292           // Set response.handle to the start of the next PG according
1293           // to the object sort order.
1294           response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1295         } else {
1296           response.handle = next;
1297         }
1298         dout(10) << "pgnls handle=" << response.handle << dendl;
1299         ::encode(response, osd_op.outdata);
1300         if (filter)
1301           ::encode(filter_out, osd_op.outdata);
1302         dout(10) << " pgnls result=" << result << " outdata.length()="
1303                  << osd_op.outdata.length() << dendl;
1304       }
1305       break;
1306
1307     case CEPH_OSD_OP_PGLS_FILTER:
1308       try {
1309         ::decode(cname, bp);
1310         ::decode(mname, bp);
1311       }
1312       catch (const buffer::error& e) {
1313         dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
1314         result = -EINVAL;
1315         break;
1316       }
1317       if (filter) {
1318         delete filter;
1319         filter = NULL;
1320       }
1321       result = get_pgls_filter(bp, &filter);
1322       if (result < 0)
1323         break;
1324
1325       assert(filter);
1326
1327       // fall through
1328
1329     case CEPH_OSD_OP_PGLS:
1330       if (snapid != CEPH_NOSNAP) {
1331         result = -EINVAL;
1332         break;
1333       }
1334       if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
1335         dout(10) << " pgls pg=" << m->get_pg()
1336                  << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
1337                  << " != " << info.pgid << dendl;
1338         result = 0; // hmm?
1339       } else {
1340         unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);
1341
1342         dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
1343         // read into a buffer
1344         vector<hobject_t> sentries;
1345         pg_ls_response_t response;
1346         try {
1347           ::decode(response.handle, bp);
1348         }
1349         catch (const buffer::error& e) {
1350           dout(0) << "unable to decode PGLS handle in " << *m << dendl;
1351           result = -EINVAL;
1352           break;
1353         }
1354
1355         hobject_t next;
1356         hobject_t current = response.handle;
1357         osr->flush();
1358         int r = pgbackend->objects_list_partial(
1359           current,
1360           list_size,
1361           list_size,
1362           &sentries,
1363           &next);
1364         if (r != 0) {
1365           result = -EINVAL;
1366           break;
1367         }
1368
1369         assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());
1370
1371         map<hobject_t, pg_missing_item>::const_iterator missing_iter =
1372           pg_log.get_missing().get_items().lower_bound(current);
1373         vector<hobject_t>::iterator ls_iter = sentries.begin();
1374         hobject_t _max = hobject_t::get_max();
1375         while (1) {
1376           const hobject_t &mcand =
1377             missing_iter == pg_log.get_missing().get_items().end() ?
1378             _max :
1379             missing_iter->first;
1380           const hobject_t &lcand =
1381             ls_iter == sentries.end() ?
1382             _max :
1383             *ls_iter;
1384
1385           hobject_t candidate;
1386           if (mcand == lcand) {
1387             candidate = mcand;
1388             if (!mcand.is_max()) {
1389               ++ls_iter;
1390               ++missing_iter;
1391             }
1392           } else if (mcand < lcand) {
1393             candidate = mcand;
1394             assert(!mcand.is_max());
1395             ++missing_iter;
1396           } else {
1397             candidate = lcand;
1398             assert(!lcand.is_max());
1399             ++ls_iter;
1400           }
1401
1402           if (candidate >= next) {
1403             break;
1404           }
1405
1406           if (response.entries.size() == list_size) {
1407             next = candidate;
1408             break;
1409           }
1410
1411           // skip snapdir objects
1412           if (candidate.snap == CEPH_SNAPDIR)
1413             continue;
1414
1415           if (candidate.snap != CEPH_NOSNAP)
1416             continue;
1417
1418           // skip wrong namespace
1419           if (candidate.get_namespace() != m->get_hobj().nspace)
1420             continue;
1421
1422           if (missing_loc.is_deleted(candidate))
1423             continue;
1424
1425           if (filter && !pgls_filter(filter, candidate, filter_out))
1426             continue;
1427
1428           response.entries.push_back(make_pair(candidate.oid,
1429                                                candidate.get_key()));
1430         }
1431         if (next.is_max() &&
1432             missing_iter == pg_log.get_missing().get_items().end() &&
1433             ls_iter == sentries.end()) {
1434           result = 1;
1435         }
1436         response.handle = next;
1437         ::encode(response, osd_op.outdata);
1438         if (filter)
1439           ::encode(filter_out, osd_op.outdata);
1440         dout(10) << " pgls result=" << result << " outdata.length()="
1441                  << osd_op.outdata.length() << dendl;
1442       }
1443       break;
1444
1445     case CEPH_OSD_OP_PG_HITSET_LS:
1446       {
1447         list< pair<utime_t,utime_t> > ls;
1448         for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1449              p != info.hit_set.history.end();
1450              ++p)
1451           ls.push_back(make_pair(p->begin, p->end));
1452         if (hit_set)
1453           ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
1454         ::encode(ls, osd_op.outdata);
1455       }
1456       break;
1457
1458     case CEPH_OSD_OP_PG_HITSET_GET:
1459       {
1460         utime_t stamp(osd_op.op.hit_set_get.stamp);
1461         if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
1462           // read the current in-memory HitSet, not the version we've
1463           // checkpointed.
1464           if (!hit_set) {
1465             result= -ENOENT;
1466             break;
1467           }
1468           ::encode(*hit_set, osd_op.outdata);
1469           result = osd_op.outdata.length();
1470         } else {
1471           // read an archived HitSet.
1472           hobject_t oid;
1473           for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
1474                p != info.hit_set.history.end();
1475                ++p) {
1476             if (stamp >= p->begin && stamp <= p->end) {
1477               oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
1478               break;
1479             }
1480           }
1481           if (oid == hobject_t()) {
1482             result = -ENOENT;
1483             break;
1484           }
1485           if (!pool.info.is_replicated()) {
1486             // FIXME: EC not supported yet
1487             result = -EOPNOTSUPP;
1488             break;
1489           }
1490           if (is_unreadable_object(oid)) {
1491             wait_for_unreadable_object(oid, op);
1492             delete filter;
1493             return;
1494           }
1495           result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
1496         }
1497       }
1498       break;
1499
1500    case CEPH_OSD_OP_SCRUBLS:
1501       result = do_scrub_ls(m, &osd_op);
1502       break;
1503
1504     default:
1505       result = -EINVAL;
1506       break;
1507     }
1508
1509     if (result < 0)
1510       break;
1511   }
1512
1513   // reply
1514   MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
1515                                        CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
1516                                        false);
1517   reply->claim_op_out_data(ops);
1518   reply->set_result(result);
1519   reply->set_reply_versions(info.last_update, info.last_user_version);
1520   osd->send_message_osd_client(reply, m->get_connection());
1521   delete filter;
1522 }
1523
1524 int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
1525 {
1526   if (m->get_pg() != info.pgid.pgid) {
1527     dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1528     return -EINVAL; // hmm?
1529   }
1530   auto bp = osd_op->indata.begin();
1531   scrub_ls_arg_t arg;
1532   try {
1533     arg.decode(bp);
1534   } catch (buffer::error&) {
1535     dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1536     return -EINVAL;
1537   }
1538   int r = 0;
1539   scrub_ls_result_t result = {.interval = info.history.same_interval_since};
1540   if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1541     r = -EAGAIN;
1542   } else if (!scrubber.store) {
1543     r = -ENOENT;
1544   } else if (arg.get_snapsets) {
1545     result.vals = scrubber.store->get_snap_errors(osd->store,
1546                                                   get_pgid().pool(),
1547                                                   arg.start_after,
1548                                                   arg.max_return);
1549   } else {
1550     result.vals = scrubber.store->get_object_errors(osd->store,
1551                                                     get_pgid().pool(),
1552                                                     arg.start_after,
1553                                                     arg.max_return);
1554   }
1555   ::encode(result, osd_op->outdata);
1556   return r;
1557 }
1558
1559 void PrimaryLogPG::calc_trim_to()
1560 {
1561   size_t target = cct->_conf->osd_min_pg_log_entries;
1562   if (is_degraded() ||
1563       state_test(PG_STATE_RECOVERING |
1564                  PG_STATE_RECOVERY_WAIT |
1565                  PG_STATE_BACKFILLING |
1566                  PG_STATE_BACKFILL_WAIT |
1567                  PG_STATE_BACKFILL_TOOFULL)) {
1568     target = cct->_conf->osd_max_pg_log_entries;
1569   }
1570
1571   eversion_t limit = MIN(
1572     min_last_complete_ondisk,
1573     pg_log.get_can_rollback_to());
1574   if (limit != eversion_t() &&
1575       limit != pg_trim_to &&
1576       pg_log.get_log().approx_size() > target) {
1577     size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
1578                              cct->_conf->osd_pg_log_trim_max);
1579     if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
1580         cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
1581       return;
1582     }
1583     list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
1584     eversion_t new_trim_to;
1585     for (size_t i = 0; i < num_to_trim; ++i) {
1586       new_trim_to = it->version;
1587       ++it;
1588       if (new_trim_to > limit) {
1589         new_trim_to = limit;
1590         dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
1591         break;
1592       }
1593     }
1594     dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
1595     pg_trim_to = new_trim_to;
1596     assert(pg_trim_to <= pg_log.get_head());
1597     assert(pg_trim_to <= min_last_complete_ondisk);
1598   }
1599 }
1600
1601 PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
1602                            const PGPool &_pool, spg_t p) :
1603   PG(o, curmap, _pool, p),
1604   pgbackend(
1605     PGBackend::build_pg_backend(
1606       _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
1607   object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
1608   snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
1609   new_backfill(false),
1610   temp_seq(0),
1611   snap_trimmer_machine(this)
1612 {
1613   missing_loc.set_backend_predicates(
1614     pgbackend->get_is_readable_predicate(),
1615     pgbackend->get_is_recoverable_predicate());
1616   snap_trimmer_machine.initiate();
1617 }
1618
1619 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1620 {
1621   src_oloc = oloc;
1622   if (oloc.key.empty())
1623     src_oloc.key = oid.name;
1624 }
1625
1626 void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1627 {
1628   const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
1629   SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
1630   if (!session)
1631     return;  // drop it.
1632   session->put();  // get_priv takes a ref, and so does the SessionRef
1633   hobject_t begin = info.pgid.pgid.get_hobj_start();
1634   hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1635   if (begin < m->begin) {
1636     begin = m->begin;
1637   }
1638   if (end > m->end) {
1639     end = m->end;
1640   }
1641   dout(10) << __func__ << " backoff ack id " << m->id
1642            << " [" << begin << "," << end << ")" << dendl;
1643   session->ack_backoff(cct, m->pgid, m->id, begin, end);
1644 }
1645
1646 void PrimaryLogPG::do_request(
1647   OpRequestRef& op,
1648   ThreadPool::TPHandle &handle)
1649 {
1650   if (op->osd_trace) {
1651     op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1652     op->pg_trace.event("do request");
1653   }
1654   // make sure we have a new enough map
1655   auto p = waiting_for_map.find(op->get_source());
1656   if (p != waiting_for_map.end()) {
1657     // preserve ordering
1658     dout(20) << __func__ << " waiting_for_map "
1659              << p->first << " not empty, queueing" << dendl;
1660     p->second.push_back(op);
1661     op->mark_delayed("waiting_for_map not empty");
1662     return;
1663   }
1664   if (!have_same_or_newer_map(op->min_epoch)) {
1665     dout(20) << __func__ << " min " << op->min_epoch
1666              << ", queue on waiting_for_map " << op->get_source() << dendl;
1667     waiting_for_map[op->get_source()].push_back(op);
1668     op->mark_delayed("op must wait for map");
1669     osd->request_osdmap_update(op->min_epoch);
1670     return;
1671   }
1672
1673   if (can_discard_request(op)) {
1674     return;
1675   }
1676
1677   // pg-wide backoffs
1678   const Message *m = op->get_req();
1679   if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1680     SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
1681     if (!session)
1682       return;  // drop it.
1683     session->put();  // get_priv takes a ref, and so does the SessionRef
1684
1685     if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
1686       if (session->check_backoff(cct, info.pgid,
1687                                  info.pgid.pgid.get_hobj_start(), m)) {
1688         return;
1689       }
1690
1691       bool backoff =
1692         is_down() ||
1693         is_incomplete() ||
1694         (!is_active() && is_peered());
1695       if (g_conf->osd_backoff_on_peering && !backoff) {
1696         if (is_peering()) {
1697           backoff = true;
1698         }
1699       }
1700       if (backoff) {
1701         add_pg_backoff(session);
1702         return;
1703       }
1704     }
1705     // pg backoff acks at pg-level
1706     if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
1707       const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1708       if (ba->begin != ba->end) {
1709         handle_backoff(op);
1710         return;
1711       }
1712     }
1713   }
1714
1715   if (!is_peered()) {
1716     // Delay unless PGBackend says it's ok
1717     if (pgbackend->can_handle_while_inactive(op)) {
1718       bool handled = pgbackend->handle_message(op);
1719       assert(handled);
1720       return;
1721     } else {
1722       waiting_for_peered.push_back(op);
1723       op->mark_delayed("waiting for peered");
1724       return;
1725     }
1726   }
1727
1728   if (flushes_in_progress > 0) {
1729     dout(20) << flushes_in_progress
1730              << " flushes_in_progress pending "
1731              << "waiting for flush on " << op << dendl;
1732     waiting_for_flush.push_back(op);
1733     op->mark_delayed("waiting for flush");
1734     return;
1735   }
1736
1737   assert(is_peered() && flushes_in_progress == 0);
1738   if (pgbackend->handle_message(op))
1739     return;
1740
1741   switch (op->get_req()->get_type()) {
1742   case CEPH_MSG_OSD_OP:
1743   case CEPH_MSG_OSD_BACKOFF:
1744     if (!is_active()) {
1745       dout(20) << " peered, not active, waiting for active on " << op << dendl;
1746       waiting_for_active.push_back(op);
1747       op->mark_delayed("waiting for active");
1748       return;
1749     }
1750     switch (op->get_req()->get_type()) {
1751     case CEPH_MSG_OSD_OP:
1752       // verify client features
1753       if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1754           !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1755         osd->reply_op_error(op, -EOPNOTSUPP);
1756         return;
1757       }
1758       do_op(op);
1759       break;
1760     case CEPH_MSG_OSD_BACKOFF:
1761       // object-level backoff acks handled in osdop context
1762       handle_backoff(op);
1763       break;
1764     }
1765     break;
1766
1767   case MSG_OSD_SUBOP:
1768     do_sub_op(op);
1769     break;
1770
1771   case MSG_OSD_SUBOPREPLY:
1772     do_sub_op_reply(op);
1773     break;
1774
1775   case MSG_OSD_PG_SCAN:
1776     do_scan(op, handle);
1777     break;
1778
1779   case MSG_OSD_PG_BACKFILL:
1780     do_backfill(op);
1781     break;
1782
1783   case MSG_OSD_PG_BACKFILL_REMOVE:
1784     do_backfill_remove(op);
1785     break;
1786
1787   case MSG_OSD_SCRUB_RESERVE:
1788     {
1789       const MOSDScrubReserve *m =
1790         static_cast<const MOSDScrubReserve*>(op->get_req());
1791       switch (m->type) {
1792       case MOSDScrubReserve::REQUEST:
1793         handle_scrub_reserve_request(op);
1794         break;
1795       case MOSDScrubReserve::GRANT:
1796         handle_scrub_reserve_grant(op, m->from);
1797         break;
1798       case MOSDScrubReserve::REJECT:
1799         handle_scrub_reserve_reject(op, m->from);
1800         break;
1801       case MOSDScrubReserve::RELEASE:
1802         handle_scrub_reserve_release(op);
1803         break;
1804       }
1805     }
1806     break;
1807
1808   case MSG_OSD_REP_SCRUB:
1809     replica_scrub(op, handle);
1810     break;
1811
1812   case MSG_OSD_REP_SCRUBMAP:
1813     do_replica_scrub_map(op);
1814     break;
1815
1816   case MSG_OSD_PG_UPDATE_LOG_MISSING:
1817     do_update_log_missing(op);
1818     break;
1819
1820   case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1821     do_update_log_missing_reply(op);
1822     break;
1823
1824   default:
1825     assert(0 == "bad message type in do_request");
1826   }
1827 }
1828
1829 hobject_t PrimaryLogPG::earliest_backfill() const
1830 {
1831   hobject_t e = hobject_t::get_max();
1832   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
1833        i != backfill_targets.end();
1834        ++i) {
1835     pg_shard_t bt = *i;
1836     map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
1837     assert(iter != peer_info.end());
1838     if (iter->second.last_backfill < e)
1839       e = iter->second.last_backfill;
1840   }
1841   return e;
1842 }
1843
1844 /** do_op - do an op
1845  * pg lock will be held (if multithreaded)
1846  * osd_lock NOT held.
      *
      * Handles one client CEPH_MSG_OSD_OP.  In order, the function:
      *  - finishes decoding the message and checks that the object maps to
      *    this PG,
      *  - applies backoff, flag, primary/replica, capability, name-length,
      *    blacklist and full-state checks,
      *  - defers the op while the head, snapdir or resolved clone is
      *    unreadable, degraded, blocked, or being scrubbed,
      *  - answers requests whose reqid is already known to
      *    check_in_progress_op(),
      *  - resolves the object context and lets the manifest and cache
      *    handlers take over the op,
      *  - builds an OpContext, takes rw locks and runs execute_ctx().
      *
      * Early returns either reply with an error, hand the op to another
      * handler, put it on a waiting list, or drop it without a reply.
      *
      * @param op the tracked request; its message must be an MOSDOp (asserted)
1847  */
1848 void PrimaryLogPG::do_op(OpRequestRef& op)
1849 {
1850   FUNCTRACE();
1851   // NOTE: take a non-const pointer here; we must be careful not to
1852   // change anything that will break other reads on m (operator<<).
1853   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1854   assert(m->get_type() == CEPH_MSG_OSD_OP);
1855   if (m->finish_decode()) {
1856     op->reset_desc();   // for TrackedOp
1857     m->clear_payload();
1858   }
1859
1860   dout(20) << __func__ << ": op " << *m << dendl;
1861
       // 'head' is the requested object with its snap forced to CEPH_NOSNAP;
       // most of the checks below are keyed on it rather than on the clone.
1862   hobject_t head = m->get_hobj();
1863   head.snap = CEPH_NOSNAP;
1864
       // The object does not belong to this PG: log it and drop the op
       // (or assert when osd_debug_misdirected_ops is set).
1865   if (!info.pgid.pgid.contains(
1866         info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1867     derr << __func__ << " " << info.pgid.pgid << " does not contain "
1868          << head << " pg_num " << pool.info.get_pg_num() << " hash "
1869          << std::hex << head.get_hash() << std::dec << dendl;
1870     osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1871                       << " op " << *m;
1872     assert(!cct->_conf->osd_debug_misdirected_ops);
1873     return;
1874   }
1875
       // Backoff is only used for connections that advertise
       // CEPH_FEATURE_RADOS_BACKOFF; 'session' stays empty otherwise.
1876   bool can_backoff =
1877     m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1878   SessionRef session;
1879   if (can_backoff) {
1880     session = static_cast<Session*>(m->get_connection()->get_priv());
1881     if (!session.get()) {
1882       dout(10) << __func__ << " no session" << dendl;
1883       return;
1884     }
1885     session->put();  // get_priv() takes a ref, and so does the intrusive_ptr
1886
1887     if (session->check_backoff(cct, info.pgid, head, m)) {
1888       return;
1889     }
1890   }
1891
1892   if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1893     // not implemented.
1894     dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1895     osd->reply_op_error(op, -EINVAL);
1896     return;
1897   }
1898
       // rmw_flags == 0 is treated as "op flags not initialized yet".
1899   if (op->rmw_flags == 0) {
1900     int r = osd->osd->init_op_flags(op);
1901     if (r) {
1902       osd->reply_op_error(op, r);
1903       return;
1904     }
1905   }
1906
1907   if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1908                          CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1909       op->may_read() &&
1910       !(op->may_write() || op->may_cache())) {
1911     // balanced reads; any replica will do
1912     if (!(is_primary() || is_replica())) {
1913       osd->handle_misdirected_op(this, op);
1914       return;
1915     }
1916   } else {
1917     // normal case; must be primary
1918     if (!is_primary()) {
1919       osd->handle_misdirected_op(this, op);
1920       return;
1921     }
1922   }
1923
1924   if (!op_has_sufficient_caps(op)) {
1925     osd->reply_op_error(op, -EPERM);
1926     return;
1927   }
1928
       // PG-level ops are handled entirely by do_pg_op().
1929   if (op->includes_pg_op()) {
1930     return do_pg_op(op);
1931   }
1932
1933   // object name too long?
1934   if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1935     dout(4) << "do_op name is longer than "
1936             << cct->_conf->osd_max_object_name_len
1937             << " bytes" << dendl;
1938     osd->reply_op_error(op, -ENAMETOOLONG);
1939     return;
1940   }
       // the locator key is checked against the same limit as the name
1941   if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1942     dout(4) << "do_op locator is longer than "
1943             << cct->_conf->osd_max_object_name_len
1944             << " bytes" << dendl;
1945     osd->reply_op_error(op, -ENAMETOOLONG);
1946     return;
1947   }
1948   if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1949     dout(4) << "do_op namespace is longer than "
1950             << cct->_conf->osd_max_object_namespace_len
1951             << " bytes" << dendl;
1952     osd->reply_op_error(op, -ENAMETOOLONG);
1953     return;
1954   }
1955
       // a non-zero result from the object store is returned to the client as-is
1956   if (int r = osd->store->validate_hobject_key(head)) {
1957     dout(4) << "do_op object " << head << " invalid for backing store: "
1958             << r << dendl;
1959     osd->reply_op_error(op, r);
1960     return;
1961   }
1962
1963   // blacklisted?
1964   if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1965     dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1966     osd->reply_op_error(op, -EBLACKLISTED);
1967     return;
1968   }
1969
1970   // order this op as a write?
1971   bool write_ordered = op->rwordered();
1972
1973   // discard due to cluster full transition?  (we discard any op that
1974   // originates before the cluster or pool is marked full; the client
1975   // will resend after the full flag is removed or if they expect the
1976   // op to succeed despite being full).  The exception is FULL_FORCE and
1977   // FULL_TRY ops, which there is no reason to discard because they
1978   // bypass all full checks anyway.  If this op isn't write or
1979   // read-ordered, we skip.
1980   // FIXME: we exclude mds writes for now.
1981   if (write_ordered && !(m->get_source().is_mds() ||
1982                          m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1983                          m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1984       info.history.last_epoch_marked_full > m->get_map_epoch()) {
         // note: the log line prints both the message pointer (m) and the
         // message itself (*m)
1985     dout(10) << __func__ << " discarding op sent before full " << m << " "
1986              << *m << dendl;
1987     return;
1988   }
1989   // mds should have stopped writing before this point.
1990   // We can't allow OSD to become non-startable even if mds
1991   // could be writing as part of file removals.
1992   ostringstream ss;
1993   if (write_ordered && osd->check_failsafe_full(ss)) {
1994     dout(10) << __func__ << " fail-safe full check failed, dropping request"
1995              << ss.str()
1996              << dendl;
1997     return;
1998   }
1999   int64_t poolid = get_pgid().pool();
2000   if (op->may_write()) {
2001
         // pool not present in the current osdmap: drop the op without a reply
2002     const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2003     if (!pi) {
2004       return;
2005     }
2006
2007     // invalid?
2008     if (m->get_snapid() != CEPH_NOSNAP) {
2009       dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2010       osd->reply_op_error(op, -EINVAL);
2011       return;
2012     }
2013
2014     // too big?
         // osd_max_write_size is shifted left by 20 before the comparison
         // (presumably configured in MiB -- TODO confirm); 0 disables the check
2015     if (cct->_conf->osd_max_write_size &&
2016         m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2017       // journal can't hold commit!
2018       derr << "do_op msg data len " << m->get_data_len()
2019            << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2020            << " on " << *m << dendl;
2021       osd->reply_op_error(op, -OSD_WRITETOOBIG);
2022       return;
2023     }
2024   }
2025
2026   dout(10) << "do_op " << *m
2027            << (op->may_write() ? " may_write" : "")
2028            << (op->may_read() ? " may_read" : "")
2029            << (op->may_cache() ? " may_cache" : "")
2030            << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2031            << " flags " << ceph_osd_flag_string(m->get_flags())
2032            << dendl;
2033
2034   // missing object?
2035   if (is_unreadable_object(head)) {
         // a replica cannot wait for recovery here; tell the client to retry
2036     if (!is_primary()) {
2037       osd->reply_op_error(op, -EAGAIN);
2038       return;
2039     }
         // either back the client off (and kick recovery) or queue the op
2040     if (can_backoff &&
2041         (g_conf->osd_backoff_on_degraded ||
2042          (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2043       add_backoff(session, head, head);
2044       maybe_kick_recovery(head);
2045     } else {
2046       wait_for_unreadable_object(head, op);
2047     }
2048     return;
2049   }
2050
2051   // degraded object?
2052   if (write_ordered && is_degraded_or_backfilling_object(head)) {
2053     if (can_backoff && g_conf->osd_backoff_on_degraded) {
2054       add_backoff(session, head, head);
2055       maybe_kick_recovery(head);
2056     } else {
2057       wait_for_degraded_object(head, op);
2058     }
2059     return;
2060   }
2061
       // write-ordered ops wait while a chunky scrub blocks writes to head
2062   if (write_ordered && scrubber.is_chunky_scrub_active() &&
2063       write_blocked_by_scrub(head)) {
2064     dout(20) << __func__ << ": waiting for scrub" << dendl;
2065     waiting_for_scrub.push_back(op);
2066     op->mark_delayed("waiting for scrub");
2067     return;
2068   }
2069
2070   // blocked on snap?
2071   map<hobject_t, snapid_t>::iterator blocked_iter =
2072     objects_blocked_on_degraded_snap.find(head);
2073   if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
         // wait on the specific degraded clone recorded for this head
2074     hobject_t to_wait_on(head);
2075     to_wait_on.snap = blocked_iter->second;
2076     wait_for_degraded_object(to_wait_on, op);
2077     return;
2078   }
2079   map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2080     objects_blocked_on_snap_promotion.find(head);
2081   if (write_ordered &&
2082       blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2083     wait_for_blocked_object(
2084       blocked_snap_promote_iter->second->obs.oi.soid,
2085       op);
2086     return;
2087   }
2088   if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2089     block_write_on_full_cache(head, op);
2090     return;
2091   }
2092
2093   // missing snapdir?
2094   hobject_t snapdir = head.get_snapdir();
2095
2096   if (is_unreadable_object(snapdir)) {
2097     wait_for_unreadable_object(snapdir, op);
2098     return;
2099   }
2100
2101   // degraded object?
2102   if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2103     wait_for_degraded_object(snapdir, op);
2104     return;
2105   }
2106
2107   // dup/resent?
2108   if (op->may_write() || op->may_cache()) {
2109     // warning: we will get back *a* request for this reqid, but not
2110     // necessarily the most recent.  this happens with flush and
2111     // promote ops, but we can't possibly have both in our log where
2112     // the original request is still not stable on disk, so for our
2113     // purposes here it doesn't matter which one we get.
2114     eversion_t version;
2115     version_t user_version;
2116     int return_code = 0;
2117     bool got = check_in_progress_op(
2118       m->get_reqid(), &version, &user_version, &return_code);
2119     if (got) {
2120       dout(3) << __func__ << " dup " << m->get_reqid()
2121               << " version " << version << dendl;
           // reply immediately with the recorded result if that version is
           // already complete, otherwise wait for it to commit
2122       if (already_complete(version)) {
2123         osd->reply_op_error(op, return_code, version, user_version);
2124       } else {
2125         dout(10) << " waiting for " << version << " to commit" << dendl;
2126         // always queue ondisk waiters, so that we can requeue if needed
2127         waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2128         op->mark_delayed("waiting for ondisk");
2129       }
2130       return;
2131     }
2132   }
2133
2134   ObjectContextRef obc;
2135   bool can_create = op->may_write() || op->may_cache();
2136   hobject_t missing_oid;
2137   const hobject_t& oid = m->get_hobj();
2138
2139   // io blocked on obc?
2140   if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2141       maybe_await_blocked_snapset(oid, op)) {
2142     return;
2143   }
2144
2145   int r = find_object_context(
2146     oid, &obc, can_create,
2147     m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2148     &missing_oid);
2149
2150   if (r == -EAGAIN) {
2151     // If we're not the primary of this PG, we just return -EAGAIN. Otherwise,
2152     // we have to wait for the object.
         // (a non-primary falls through with r == -EAGAIN; the cache handler
         // or the error path further down produces the reply)
2153     if (is_primary()) {
2154       // missing the specific snap we need; requeue and wait.
2155       assert(!op->may_write()); // only happens on a read/cache
2156       wait_for_unreadable_object(missing_oid, op);
2157       return;
2158     }
2159   } else if (r == 0) {
2160     if (is_unreadable_object(obc->obs.oi.soid)) {
2161       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2162                << " is unreadable, waiting" << dendl;
2163       wait_for_unreadable_object(obc->obs.oi.soid, op);
2164       return;
2165     }
2166
2167     // degraded object?  (the check above was for head; this could be a clone)
2168     if (write_ordered &&
2169         obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2170         is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2171       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2172                << " is degraded, waiting" << dendl;
2173       wait_for_degraded_object(obc->obs.oi.soid, op);
2174       return;
2175     }
2176   }
2177
       // Hit-set bookkeeping: in_hit_set is computed before this access is
       // inserted; hitset_inserted ensures an op is only inserted once.  The
       // hit set is persisted when full or when its period has elapsed.
2178   bool in_hit_set = false;
2179   if (hit_set) {
2180     if (obc.get()) {
2181       if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2182         in_hit_set = true;
2183     } else {
2184       if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2185         in_hit_set = true;
2186     }
2187     if (!op->hitset_inserted) {
2188       hit_set->insert(oid);
2189       op->hitset_inserted = true;
2190       if (hit_set->is_full() ||
2191           hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2192         hit_set_persist();
2193       }
2194     }
2195   }
2196
       // a true return from agent_choose_mode() ends processing of this op here
2197   if (agent_state) {
2198     if (agent_choose_mode(false, op))
2199       return;
2200   }
2201
       // existing manifest objects get a chance to be handled first
2202   if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2203     if (maybe_handle_manifest(op,
2204                                write_ordered,
2205                                obc))
2206     return;
2207   }
2208
2209   if (maybe_handle_cache(op,
2210                          write_ordered,
2211                          obc,
2212                          r,
2213                          missing_oid,
2214                          false,
2215                          in_hit_set))
2216     return;
2217
       // Any error except "-ENOENT with an obc" is reported here.  -ENOENT with
       // an obc continues so that rw locks are taken before the error is
       // reported further down.
2218   if (r && (r != -ENOENT || !obc)) {
2219     // copy the reqids for copy get on ENOENT
2220     if (r == -ENOENT &&
2221         (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2222       fill_in_copy_get_noent(op, oid, m->ops[0]);
2223       return;
2224     }
2225     dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2226     if (op->may_write() &&
2227         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2228       record_write_error(op, oid, nullptr, r);
2229     } else {
2230       osd->reply_op_error(op, r);
2231     }
2232     return;
2233   }
2234
       // NOTE(review): obc is dereferenced unconditionally from here on; this
       // assumes find_object_context() always sets obc when it returns 0.
2235   // make sure locator is consistent
2236   object_locator_t oloc(obc->obs.oi.soid);
2237   if (m->get_object_locator() != oloc) {
         // a mismatch is only logged; the op is still processed
2238     dout(10) << " provided locator " << m->get_object_locator()
2239              << " != object's " << obc->obs.oi.soid << dendl;
2240     osd->clog->warn() << "bad locator " << m->get_object_locator()
2241                      << " on object " << oloc
2242                       << " op " << *m;
2243   }
2244
2245   // io blocked on obc?
2246   if (obc->is_blocked() &&
2247       !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2248     wait_for_blocked_object(obc->obs.oi.soid, op);
2249     return;
2250   }
2251
2252   dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2253
2254   for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2255     OSDOp& osd_op = *p;
2256
2257     // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2258     if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2259         m->get_snapid() != CEPH_SNAPDIR) {
2260       dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2261       osd->reply_op_error(op, -EINVAL);
2262       return;
2263     }
2264   }
2265
       // ctx is a raw allocation: every exit path below passes it to
       // reply_ctx(), close_op_ctx() or execute_ctx().
2266   OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2267
2268   if (!obc->obs.exists)
2269     ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2270
2271   /* Due to obc caching, we might have a cached non-existent snapset_obc
2272    * for the snapdir.  If so, we can ignore it.  Subsequent parts of the
2273    * do_op pipeline make decisions based on whether snapset_obc is
2274    * populated.
2275    */
2276   if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2277     ctx->snapset_obc = ObjectContextRef();
2278
       // Locking: SKIPRWLOCKS takes no locks, FLUSH ops take none but require
       // a flush in progress on the object, everything else needs rw locks.
2279   if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2280     dout(20) << __func__ << ": skipping rw locks" << dendl;
2281   } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2282     dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2283
2284     // verify there is in fact a flush in progress
2285     // FIXME: we could make this a stronger test.
2286     map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2287     if (p == flush_ops.end()) {
2288       dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2289       reply_ctx(ctx, -EINVAL);
2290       return;
2291     }
2292   } else if (!get_rw_locks(write_ordered, ctx)) {
2293     dout(20) << __func__ << " waiting for rw locks " << dendl;
2294     op->mark_delayed("waiting for rw locks");
2295     close_op_ctx(ctx);
2296     return;
2297   }
2298   dout(20) << __func__ << " obc " << *obc << dendl;
2299
       // only reachable with r == -ENOENT and an obc (see the check above)
2300   if (r) {
2301     dout(20) << __func__ << " returned an error: " << r << dendl;
2302     close_op_ctx(ctx);
2303     if (op->may_write() &&
2304         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2305       record_write_error(op, oid, nullptr, r);
2306     } else {
2307       osd->reply_op_error(op, r);
2308     }
2309     return;
2310   }
2311
2312   if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2313     ctx->ignore_cache = true;
2314   }
2315
2316   if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2317     // This object is lost. Reading from it returns an error.
2318     dout(20) << __func__ << ": object " << obc->obs.oi.soid
2319              << " is lost" << dendl;
2320     reply_ctx(ctx, -ENFILE);
2321     return;
2322   }
       // pure reads of a non-existent object, or of a whiteout unless the
       // snapdir was requested, get -ENOENT
2323   if (!op->may_write() &&
2324       !op->may_cache() &&
2325       (!obc->obs.exists ||
2326        ((m->get_snapid() != CEPH_SNAPDIR) &&
2327         obc->obs.oi.is_whiteout()))) {
2328     // copy the reqids for copy get on ENOENT
2329     if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2330       fill_in_copy_get_noent(op, oid, m->ops[0]);
2331       close_op_ctx(ctx);
2332       return;
2333     }
2334     reply_ctx(ctx, -ENOENT);
2335     return;
2336   }
2337
2338   op->mark_started();
2339
2340   execute_ctx(ctx);
       // prepare latency: time from dequeue until execute_ctx() returned,
       // accounted overall and per op class (rw / r / w)
2341   utime_t prepare_latency = ceph_clock_now();
2342   prepare_latency -= op->get_dequeued_time();
2343   osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2344   if (op->may_read() && op->may_write()) {
2345     osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2346   } else if (op->may_read()) {
2347     osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2348   } else if (op->may_write() || op->may_cache()) {
2349     osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2350   }
2351
2352   // force recovery of the oldest missing object if too many logs
2353   maybe_force_recovery();
2354 }
2355
2356 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2357   OpRequestRef op,
2358   bool write_ordered,
2359   ObjectContextRef obc)
2360 {
2361   // Proxies a client op on a redirect-manifest object to its target
2362   // (HANDLED_PROXY), or returns NOOP without touching the op.
2363   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2364   if (m->get_flags() & CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2365     dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2366     return cache_result_t::NOOP;
2367   }
2368
2369   if (obc)
2370     dout(10) << __func__ << " " << obc->obs.oi << " "
2371        << (obc->obs.exists ? "exists" : "DNE") << dendl;
2372
2373   // if it is write-ordered and blocked, stop now
2374   if (obc.get() && obc->is_blocked() && write_ordered) {
2375     // we're already doing something with this object
2376     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2377     return cache_result_t::NOOP;
2378   }
2379
2380   // An op that carries SET_REDIRECT is never proxied.  Scan the message's
2381   // op vector in place rather than copying it for every request.
2382   for (const OSDOp& osd_op : m->ops) {
2383     if (osd_op.op.op == CEPH_OSD_OP_SET_REDIRECT) {
2384       return cache_result_t::NOOP;
2385     }
2386   }
2387
2388   assert(obc.get());  // callers must pass an obc; it is dereferenced below
2389   switch (obc->obs.oi.manifest.type) {
2390   case object_manifest_t::TYPE_REDIRECT:
2391     if (op->may_write() || write_ordered) {
2392       do_proxy_write(op, obc->obs.oi.soid, obc);
2393     } else {
2394       do_proxy_read(op, obc);
2395     }
2396     return cache_result_t::HANDLED_PROXY;
2397   case object_manifest_t::TYPE_CHUNKED:
2398   default:
2399     // only redirect manifests are handled; anything else is a bug
2400     assert(0 == "unrecognized manifest type");
2401   }
2402   return cache_result_t::NOOP;
2403 }
2403
2404 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2405                                       MOSDOpReply *orig_reply, int r)
2406 {
2407   dout(20) << __func__ << " r=" << r << dendl;
2408   assert(op->may_write());
2409
2410   // One ERROR log entry for soid carrying the request id and result code.
2411   const MOSDOp *request = static_cast<const MOSDOp*>(op->get_req());
2412   const osd_reqid_t &reqid = request->get_reqid();
2413   mempool::osd_pglog::list<pg_log_entry_t> entries;
2414   entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2415                                    get_next_version(), eversion_t(), 0,
2416                                    reqid, utime_t(), r));
2417
2418   // Completion functor: replies to the client with orig_reply when one was
2419   // supplied, otherwise with a freshly built MOSDOpReply carrying r.
2420   struct ReplySender {
2421     PrimaryLogPG *pg;
2422     OpRequestRef op;
2423     boost::intrusive_ptr<MOSDOpReply> orig_reply;
2424     int r;
2425     ReplySender(PrimaryLogPG *pg_, OpRequestRef op_,
2426                 MOSDOpReply *reply_, int r_)
2427       : pg(pg_), op(op_),
2428         orig_reply(reply_, false /* take over ref */), r(r_)
2429       {}
2430     void operator()() {
2431       ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2432       const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
2433       const int flags =
2434         req->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2435       MOSDOpReply *out = orig_reply.detach();
2436       if (!out)
2437         out = new MOSDOpReply(req, r, pg->get_osdmap()->get_epoch(),
2438                               flags, true);
2439       ldpp_dout(pg, 10) << " sending commit on " << *req << " " << out << dendl;
2440       pg->osd->send_message_osd_client(out, req->get_connection());
2441     }
2442   };
2443
2444   ObcLockManager lock_manager;
2445   submit_log_entries(
2446     entries, std::move(lock_manager),
2447     boost::optional<std::function<void(void)> >(
2448       ReplySender(this, op, orig_reply, r)),
2449     op, r);
2450 }
2451
2452 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2453   OpRequestRef op,
2454   bool write_ordered,
2455   ObjectContextRef obc,
2456   int r, hobject_t missing_oid,
2457   bool must_promote,
2458   bool in_hit_set,
2459   ObjectContextRef *promote_obc)
2460 {
       // Cache-tier dispatch for an op, driven by pool.info.cache_mode.
       //
       // Parameters:
       //   op            - the client op
       //   write_ordered - whether the op is ordered as a write
       //   obc           - object context, may be null
       //   r             - result code supplied by the caller (-ENOENT is
       //                   tested for below)
       //   missing_oid   - object that is missing; defaults to obc's soid
       //                   when empty and obc is set
       //   must_promote  - force promotion (also set by op->need_promote())
       //   in_hit_set    - forwarded to maybe_promote()
       //   promote_obc   - optional out-parameter, forwarded to the promote
       //                   calls
       //
       // Returns:
       //   NOOP                - nothing was done here (caching disabled,
       //                         IGNORE_CACHE flag, blocked write-ordered op,
       //                         logically absent object, cache hit, ...)
       //   REPLIED_WITH_EAGAIN - not primary; -EAGAIN was sent to the client
       //   HANDLED_PROXY       - do_proxy_read()/do_proxy_write() was called
       //   HANDLED_REDIRECT    - do_cache_redirect() was called
       //   BLOCKED_FULL        - block_write_on_full_cache() was called
       //   BLOCKED_PROMOTE     - a promotion was started, or (writeback read
       //                         path) obc was already blocked
2461   // return quickly if caching is not enabled
2462   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2463     return cache_result_t::NOOP;
2464
       // NOTE(review): op is null-checked here but dereferenced
       // unconditionally right below (op->need_promote()); presumably every
       // caller passes a non-null op -- confirm.
2465   if (op &&
2466       op->get_req() &&
2467       op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2468       (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2469        CEPH_OSD_FLAG_IGNORE_CACHE)) {
2470     dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2471     return cache_result_t::NOOP;
2472   }
2473
2474   must_promote = must_promote || op->need_promote();
2475
2476   if (obc)
2477     dout(25) << __func__ << " " << obc->obs.oi << " "
2478              << (obc->obs.exists ? "exists" : "DNE")
2479              << " missing_oid " << missing_oid
2480              << " must_promote " << (int)must_promote
2481              << " in_hit_set " << (int)in_hit_set
2482              << dendl;
2483   else
2484     dout(25) << __func__ << " (no obc)"
2485              << " missing_oid " << missing_oid
2486              << " must_promote " << (int)must_promote
2487              << " in_hit_set " << (int)in_hit_set
2488              << dendl;
2489
2490   // if it is write-ordered and blocked, stop now
2491   if (obc.get() && obc->is_blocked() && write_ordered) {
2492     // we're already doing something with this object
2493     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2494     return cache_result_t::NOOP;
2495   }
2496
2497   if (r == -ENOENT && missing_oid == hobject_t()) {
2498     // we know this object is logically absent (e.g., an undefined clone)
2499     return cache_result_t::NOOP;
2500   }
2501
       // cache hit: the object exists in this pool, nothing to do
2502   if (obc.get() && obc->obs.exists) {
2503     osd->logger->inc(l_osd_op_cache_hit);
2504     return cache_result_t::NOOP;
2505   }
       // everything below is a cache miss; only the primary handles misses
2506   if (!is_primary()) {
2507     dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2508     osd->reply_op_error(op, -EAGAIN);
2509     return cache_result_t::REPLIED_WITH_EAGAIN;
2510   }
2511
2512   if (missing_oid == hobject_t() && obc.get()) {
2513     missing_oid = obc->obs.oi.soid;
2514   }
2515
2516   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2517   const object_locator_t oloc = m->get_object_locator();
2518
2519   if (op->need_skip_handle_cache()) {
2520     return cache_result_t::NOOP;
2521   }
2522
2523   // older versions do not proxy the feature bits.
2524   bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2525     CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
       // promote_op is never assigned in this function, so the maybe_promote()
       // call that uses it passes an empty OpRequestRef
2526   OpRequestRef promote_op;
2527
2528   switch (pool.info.cache_mode) {
2529   case pg_pool_t::CACHEMODE_WRITEBACK:
         // cache full: pure reads are proxied, everything else waits
2530     if (agent_state &&
2531         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2532       if (!op->may_write() && !op->may_cache() &&
2533           !write_ordered && !must_promote) {
2534         dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2535         do_proxy_read(op);
2536         return cache_result_t::HANDLED_PROXY;
2537       }
2538       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2539       block_write_on_full_cache(missing_oid, op);
2540       return cache_result_t::BLOCKED_FULL;
2541     }
2542
         // promote with the op attached when forced, or when there is no hit
         // set and the op does not ask to skip promotion
2543     if (must_promote || (!hit_set && !op->need_skip_promote())) {
2544       promote_object(obc, missing_oid, oloc, op, promote_obc);
2545       return cache_result_t::BLOCKED_PROMOTE;
2546     }
2547
2548     if (op->may_write() || op->may_cache()) {
2549       if (can_proxy_write) {
2550         do_proxy_write(op, missing_oid);
2551       } else {
2552         // promote if can't proxy the write
2553         promote_object(obc, missing_oid, oloc, op, promote_obc);
2554         return cache_result_t::BLOCKED_PROMOTE;
2555       }
2556
2557       // Promote too?
           // (the write was already proxied above, so no op is attached to
           // this promotion)
2558       if (!op->need_skip_promote() &&
2559           maybe_promote(obc, missing_oid, oloc, in_hit_set,
2560                       pool.info.min_write_recency_for_promote,
2561                       OpRequestRef(),
2562                       promote_obc)) {
2563         return cache_result_t::BLOCKED_PROMOTE;
2564       }
2565       return cache_result_t::HANDLED_PROXY;
2566     } else {
2567       do_proxy_read(op);
2568
2569       // Avoid duplicate promotion
2570       if (obc.get() && obc->is_blocked()) {
2571         if (promote_obc)
2572           *promote_obc = obc;
2573         return cache_result_t::BLOCKED_PROMOTE;
2574       }
2575
2576       // Promote too?
           // (the result is ignored: the read has been proxied either way)
2577       if (!op->need_skip_promote()) {
2578         (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2579                             pool.info.min_read_recency_for_promote,
2580                             promote_op, promote_obc);
2581       }
2582
2583       return cache_result_t::HANDLED_PROXY;
2584     }
2585     assert(0 == "unreachable");
2586     return cache_result_t::NOOP;
2587
2588   case pg_pool_t::CACHEMODE_FORWARD:
2589     // FIXME: this mode allows requests to be reordered.
2590     do_cache_redirect(op);
2591     return cache_result_t::HANDLED_REDIRECT;
2592
2593   case pg_pool_t::CACHEMODE_READONLY:
2594     // TODO: clean this case up
2595     if (!obc.get() && r == -ENOENT) {
2596       // we don't have the object and op's a read
2597       promote_object(obc, missing_oid, oloc, op, promote_obc);
2598       return cache_result_t::BLOCKED_PROMOTE;
2599     }
2600     if (!r) { // it must be a write
2601       do_cache_redirect(op);
2602       return cache_result_t::HANDLED_REDIRECT;
2603     }
2604     // crap, there was a failure of some kind
2605     return cache_result_t::NOOP;
2606
2607   case pg_pool_t::CACHEMODE_READFORWARD:
2608     // Do writeback to the cache tier for writes
2609     if (op->may_write() || write_ordered || must_promote) {
2610       if (agent_state &&
2611           agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2612         dout(20) << __func__ << " cache pool full, waiting" << dendl;
2613         block_write_on_full_cache(missing_oid, op);
2614         return cache_result_t::BLOCKED_FULL;
2615       }
2616       promote_object(obc, missing_oid, oloc, op, promote_obc);
2617       return cache_result_t::BLOCKED_PROMOTE;
2618     }
2619
2620     // If it is a read, we can read, we need to forward it
2621     do_cache_redirect(op);
2622     return cache_result_t::HANDLED_REDIRECT;
2623
2624   case pg_pool_t::CACHEMODE_PROXY:
2625     if (!must_promote) {
2626       if (op->may_write() || op->may_cache() || write_ordered) {
             // when the write cannot be proxied, fall through to the forced
             // promotion below
2627         if (can_proxy_write) {
2628           do_proxy_write(op, missing_oid);
2629           return cache_result_t::HANDLED_PROXY;
2630         }
2631       } else {
2632         do_proxy_read(op);
2633         return cache_result_t::HANDLED_PROXY;
2634       }
2635     }
2636     // ugh, we're forced to promote.
2637     if (agent_state &&
2638         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2639       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2640       block_write_on_full_cache(missing_oid, op);
2641       return cache_result_t::BLOCKED_FULL;
2642     }
2643     promote_object(obc, missing_oid, oloc, op, promote_obc);
2644     return cache_result_t::BLOCKED_PROMOTE;
2645
2646   case pg_pool_t::CACHEMODE_READPROXY:
2647     // Do writeback to the cache tier for writes
2648     if (op->may_write() || write_ordered || must_promote) {
2649       if (agent_state &&
2650           agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2651         dout(20) << __func__ << " cache pool full, waiting" << dendl;
2652         block_write_on_full_cache(missing_oid, op);
2653         return cache_result_t::BLOCKED_FULL;
2654       }
2655       promote_object(obc, missing_oid, oloc, op, promote_obc);
2656       return cache_result_t::BLOCKED_PROMOTE;
2657     }
2658
2659     // If it is a read, we can read, we need to proxy it
2660     do_proxy_read(op);
2661     return cache_result_t::HANDLED_PROXY;
2662
2663   default:
2664     assert(0 == "unrecognized cache_mode");
2665   }
2666   return cache_result_t::NOOP;
2667 }
2668
2669 /**
2670  * Start a promotion if the object has been hit recently enough.
2671  *
2672  * @param obc         object context, may be null
2673  * @param missing_oid object id used for hit-set lookups when obc is null
2674  * @param oloc        object locator, passed through to promote_object()
2675  * @param in_hit_set  whether the object is in the current hit set
2676  * @param recency     required recency (see the gate below)
2677  * @param promote_op  op passed through to promote_object()
2678  * @param promote_obc out-parameter passed through to promote_object()
2679  * @return true if promote_object() was called; false if the object was not
2680  *         recent enough or osd->promote_throttle() said to hold off
2681  */
2682 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2683                                  const hobject_t& missing_oid,
2684                                  const object_locator_t& oloc,
2685                                  bool in_hit_set,
2686                                  uint32_t recency,
2687                                  OpRequestRef promote_op,
2688                                  ObjectContextRef *promote_obc)
2689 {
2690   dout(20) << __func__ << " missing_oid " << missing_oid
2691            << "  in_hit_set " << in_hit_set << dendl;
2692
2693   // Recency gate:
2694   //   0   - always eligible
2695   //   1   - eligible only if the object is in the current hit set
2696   //   N>1 - eligible only if the current hit set plus consecutive entries
2697   //         of agent_state->hit_set_map (latest first) add up to N hits
2698   if (recency == 1) {
2699     if (!in_hit_set)
2700       return false;     // not promoting
2701   } else if (recency > 1) {
2702     unsigned hits = in_hit_set ? 1 : 0;
2703     if (hits) {
2704       // Walk the other hit sets in reverse key order; stop at the first miss.
2705       const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2706       map<time_t,HitSetRef>::reverse_iterator it =
2707         agent_state->hit_set_map.rbegin();
2708       while (it != agent_state->hit_set_map.rend() &&
2709              hits < recency &&
2710              it->second->contains(oid)) {
2711         ++hits;
2712         ++it;
2713       }
2714     }
2715     if (hits < recency)
2716       return false;     // not promoting
2717   }
2718
2719   if (osd->promote_throttle()) {
2720     dout(10) << __func__ << " promote throttled" << dendl;
2721     return false;
2722   }
2723   promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2724   return true;
2725 }
2726
2727 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2728 {
2729   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2730   int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2731   MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2732                                        get_osdmap()->get_epoch(), flags, false);
2733   request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2734   reply->set_redirect(redir);
2735   dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2736            << op << dendl;
2737   m->get_connection()->send_message(reply);
2738   return;
2739 }
2740
2741 struct C_ProxyRead : public Context {
2742   PrimaryLogPGRef pg;
2743   hobject_t oid;
2744   epoch_t last_peering_reset;
2745   ceph_tid_t tid;
2746   PrimaryLogPG::ProxyReadOpRef prdop;
2747   utime_t start;
2748   C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2749              const PrimaryLogPG::ProxyReadOpRef& prd)
2750     : pg(p), oid(o), last_peering_reset(lpr),
2751       tid(0), prdop(prd), start(ceph_clock_now())
2752   {}
2753   void finish(int r) override {
2754     if (prdop->canceled)
2755       return;
2756     pg->lock();
2757     if (prdop->canceled) {
2758       pg->unlock();
2759       return;
2760     }
2761     if (last_peering_reset == pg->get_last_peering_reset()) {
2762       pg->finish_proxy_read(oid, tid, r);
2763       pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2764     }
2765     pg->unlock();
2766   }
2767 };
2768
2769 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2770 {
2771   // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2772   // stash the result in the request's OSDOp vector
2773   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2774   object_locator_t oloc;
2775   hobject_t soid;
2776   /* extensible tier */
2777   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2778     switch (obc->obs.oi.manifest.type) {
2779       case object_manifest_t::TYPE_REDIRECT:
2780           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2781           soid = obc->obs.oi.manifest.redirect_target;
2782           break;
2783       case object_manifest_t::TYPE_CHUNKED:
2784       default:
2785         assert(0 == "unrecognized manifest type");
2786     }
2787   } else {
2788   /* proxy */
2789     soid = m->get_hobj();
2790     oloc = object_locator_t(m->get_object_locator());
2791     oloc.pool = pool.info.tier_of;
2792   }
2793   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2794
2795   // pass through some original flags that make sense.
2796   //  - leave out redirection and balancing flags since we are
2797   //    already proxying through the primary
2798   //  - leave off read/write/exec flags that are derived from the op
2799   flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2800                              CEPH_OSD_FLAG_ORDERSNAP |
2801                              CEPH_OSD_FLAG_ENFORCE_SNAPC |
2802                              CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2803
2804   dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2805
2806   ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2807
2808   ObjectOperation obj_op;
2809   obj_op.dup(prdop->ops);
2810
2811   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2812       (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2813     for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2814       ceph_osd_op op = obj_op.ops[i].op;
2815       switch (op.op) {
2816         case CEPH_OSD_OP_READ:
2817         case CEPH_OSD_OP_SYNC_READ:
2818         case CEPH_OSD_OP_SPARSE_READ:
2819         case CEPH_OSD_OP_CHECKSUM:
2820         case CEPH_OSD_OP_CMPEXT:
2821           op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2822                        ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2823       }
2824     }
2825   }
2826
2827   C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2828                                      prdop);
2829   ceph_tid_t tid = osd->objecter->read(
2830     soid.oid, oloc, obj_op,
2831     m->get_snapid(), NULL,
2832     flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2833     &prdop->user_version,
2834     &prdop->data_offset,
2835     m->get_features());
2836   fin->tid = tid;
2837   prdop->objecter_tid = tid;
2838   proxyread_ops[tid] = prdop;
2839   in_progress_proxy_ops[soid].push_back(op);
2840 }
2841
2842 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2843 {
2844   dout(10) << __func__ << " " << oid << " tid " << tid
2845            << " " << cpp_strerror(r) << dendl;
2846
2847   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2848   if (p == proxyread_ops.end()) {
2849     dout(10) << __func__ << " no proxyread_op found" << dendl;
2850     return;
2851   }
2852   ProxyReadOpRef prdop = p->second;
2853   if (tid != prdop->objecter_tid) {
2854     dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2855              << " tid " << prdop->objecter_tid << dendl;
2856     return;
2857   }
2858   if (oid != prdop->soid) {
2859     dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2860              << " soid " << prdop->soid << dendl;
2861     return;
2862   }
2863   proxyread_ops.erase(tid);
2864
2865   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2866   if (q == in_progress_proxy_ops.end()) {
2867     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2868     return;
2869   }
2870   assert(q->second.size());
2871   list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2872                                               q->second.end(),
2873                                               prdop->op);
2874   assert(it != q->second.end());
2875   OpRequestRef op = *it;
2876   q->second.erase(it);
2877   if (q->second.size() == 0) {
2878     in_progress_proxy_ops.erase(oid);
2879   }
2880
2881   osd->logger->inc(l_osd_tier_proxy_read);
2882
2883   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2884   OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2885   ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2886   ctx->user_at_version = prdop->user_version;
2887   ctx->data_off = prdop->data_offset;
2888   ctx->ignore_log_op_stats = true;
2889   complete_read_ctx(r, ctx);
2890 }
2891
2892 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2893 {
2894   map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2895   if (p == in_progress_proxy_ops.end())
2896     return;
2897
2898   list<OpRequestRef>& ls = p->second;
2899   dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2900   requeue_ops(ls);
2901   in_progress_proxy_ops.erase(p);
2902 }
2903
2904 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
2905                                      vector<ceph_tid_t> *tids)
2906 {
2907   dout(10) << __func__ << " " << prdop->soid << dendl;
2908   prdop->canceled = true;
2909
2910   // cancel objecter op, if we can
2911   if (prdop->objecter_tid) {
2912     tids->push_back(prdop->objecter_tid);
2913     for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2914       prdop->ops[i].outdata.clear();
2915     }
2916     proxyread_ops.erase(prdop->objecter_tid);
2917     prdop->objecter_tid = 0;
2918   }
2919 }
2920
2921 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
2922 {
2923   dout(10) << __func__ << dendl;
2924
2925   // cancel proxy reads
2926   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2927   while (p != proxyread_ops.end()) {
2928     cancel_proxy_read((p++)->second, tids);
2929   }
2930
2931   // cancel proxy writes
2932   map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2933   while (q != proxywrite_ops.end()) {
2934     cancel_proxy_write((q++)->second, tids);
2935   }
2936
2937   if (requeue) {
2938     map<hobject_t, list<OpRequestRef>>::iterator p =
2939       in_progress_proxy_ops.begin();
2940     while (p != in_progress_proxy_ops.end()) {
2941       list<OpRequestRef>& ls = p->second;
2942       dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2943                << " requests" << dendl;
2944       requeue_ops(ls);
2945       in_progress_proxy_ops.erase(p++);
2946     }
2947   } else {
2948     in_progress_proxy_ops.clear();
2949   }
2950 }
2951
2952 struct C_ProxyWrite_Commit : public Context {
2953   PrimaryLogPGRef pg;
2954   hobject_t oid;
2955   epoch_t last_peering_reset;
2956   ceph_tid_t tid;
2957   PrimaryLogPG::ProxyWriteOpRef pwop;
2958   C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2959                       const PrimaryLogPG::ProxyWriteOpRef& pw)
2960     : pg(p), oid(o), last_peering_reset(lpr),
2961       tid(0), pwop(pw)
2962   {}
2963   void finish(int r) override {
2964     if (pwop->canceled)
2965       return;
2966     pg->lock();
2967     if (pwop->canceled) {
2968       pg->unlock();
2969       return;
2970     }
2971     if (last_peering_reset == pg->get_last_peering_reset()) {
2972       pg->finish_proxy_write(oid, tid, r);
2973     }
2974     pg->unlock();
2975   }
2976 };
2977
2978 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2979 {
2980   // NOTE: non-const because ProxyWriteOp takes a mutable ref
2981   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2982   object_locator_t oloc;
2983   SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2984   hobject_t soid;
2985   /* extensible tier */
2986   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2987     switch (obc->obs.oi.manifest.type) {
2988       case object_manifest_t::TYPE_REDIRECT:
2989           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2990           soid = obc->obs.oi.manifest.redirect_target;
2991           break;
2992       case object_manifest_t::TYPE_CHUNKED:
2993       default:
2994         assert(0 == "unrecognized manifest type");
2995     }
2996   } else {
2997   /* proxy */
2998     soid = m->get_hobj();
2999     oloc = object_locator_t(m->get_object_locator());
3000     oloc.pool = pool.info.tier_of;
3001   }
3002
3003   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3004   if (!(op->may_write() || op->may_cache())) {
3005     flags |= CEPH_OSD_FLAG_RWORDERED;
3006   }
3007   dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3008
3009   ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3010   pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3011   pwop->mtime = m->get_mtime();
3012
3013   ObjectOperation obj_op;
3014   obj_op.dup(pwop->ops);
3015
3016   C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3017       this, soid, get_last_peering_reset(), pwop);
3018   ceph_tid_t tid = osd->objecter->mutate(
3019     soid.oid, oloc, obj_op, snapc,
3020     ceph::real_clock::from_ceph_timespec(pwop->mtime),
3021     flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3022     &pwop->user_version, pwop->reqid);
3023   fin->tid = tid;
3024   pwop->objecter_tid = tid;
3025   proxywrite_ops[tid] = pwop;
3026   in_progress_proxy_ops[soid].push_back(op);
3027 }
3028
3029 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3030 {
3031   dout(10) << __func__ << " " << oid << " tid " << tid
3032            << " " << cpp_strerror(r) << dendl;
3033
3034   map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3035   if (p == proxywrite_ops.end()) {
3036     dout(10) << __func__ << " no proxywrite_op found" << dendl;
3037     return;
3038   }
3039   ProxyWriteOpRef pwop = p->second;
3040   assert(tid == pwop->objecter_tid);
3041   assert(oid == pwop->soid);
3042
3043   proxywrite_ops.erase(tid);
3044
3045   map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3046   if (q == in_progress_proxy_ops.end()) {
3047     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3048     delete pwop->ctx;
3049     pwop->ctx = NULL;
3050     return;
3051   }
3052   list<OpRequestRef>& in_progress_op = q->second;
3053   assert(in_progress_op.size());
3054   list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3055                                               in_progress_op.end(),
3056                                               pwop->op);
3057   assert(it != in_progress_op.end());
3058   in_progress_op.erase(it);
3059   if (in_progress_op.size() == 0) {
3060     in_progress_proxy_ops.erase(oid);
3061   }
3062
3063   osd->logger->inc(l_osd_tier_proxy_write);
3064
3065   const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3066   assert(m != NULL);
3067
3068   if (!pwop->sent_reply) {
3069     // send commit.
3070     MOSDOpReply *reply = pwop->ctx->reply;
3071     if (reply)
3072       pwop->ctx->reply = NULL;
3073     else {
3074       reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3075       reply->set_reply_versions(eversion_t(), pwop->user_version);
3076     }
3077     reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3078     dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3079     osd->send_message_osd_client(reply, m->get_connection());
3080     pwop->sent_reply = true;
3081     pwop->ctx->op->mark_commit_sent();
3082   }
3083
3084   delete pwop->ctx;
3085   pwop->ctx = NULL;
3086 }
3087
3088 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3089                                       vector<ceph_tid_t> *tids)
3090 {
3091   dout(10) << __func__ << " " << pwop->soid << dendl;
3092   pwop->canceled = true;
3093
3094   // cancel objecter op, if we can
3095   if (pwop->objecter_tid) {
3096     tids->push_back(pwop->objecter_tid);
3097     delete pwop->ctx;
3098     pwop->ctx = NULL;
3099     proxywrite_ops.erase(pwop->objecter_tid);
3100     pwop->objecter_tid = 0;
3101   }
3102 }
3103
3104 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3105   ObjectContextRef obc;
3106   PrimaryLogPG *pg;
3107   utime_t start;
3108 public:
3109   PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3110     : obc(obc_),
3111       pg(pg_),
3112       start(ceph_clock_now()) {}
3113
3114   void finish(PrimaryLogPG::CopyCallbackResults results) override {
3115     PrimaryLogPG::CopyResults *results_data = results.get<1>();
3116     int r = results.get<0>();
3117     pg->finish_promote(r, results_data, obc);
3118     pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3119   }
3120 };
3121
3122 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3123                                   const hobject_t& missing_oid,
3124                                   const object_locator_t& oloc,
3125                                   OpRequestRef op,
3126                                   ObjectContextRef *promote_obc)
3127 {
3128   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3129   assert(hoid != hobject_t());
3130   if (write_blocked_by_scrub(hoid)) {
3131     dout(10) << __func__ << " " << hoid
3132              << " blocked by scrub" << dendl;
3133     if (op) {
3134       waiting_for_scrub.push_back(op);
3135       op->mark_delayed("waiting for scrub");
3136       dout(10) << __func__ << " " << hoid
3137                << " placing op in waiting_for_scrub" << dendl;
3138     } else {
3139       dout(10) << __func__ << " " << hoid
3140                << " no op, dropping on the floor" << dendl;
3141     }
3142     return;
3143   }
3144   if (!obc) { // we need to create an ObjectContext
3145     assert(missing_oid != hobject_t());
3146     obc = get_object_context(missing_oid, true);
3147   }
3148   if (promote_obc)
3149     *promote_obc = obc;
3150
3151   /*
3152    * Before promote complete, if there are  proxy-reads for the object,
3153    * for this case we don't use DONTNEED.
3154    */
3155   unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3156   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3157   if (q == in_progress_proxy_ops.end()) {
3158     src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3159   }
3160
3161   PromoteCallback *cb = new PromoteCallback(obc, this);
3162   object_locator_t my_oloc = oloc;
3163   my_oloc.pool = pool.info.tier_of;
3164
3165   unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3166                    CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3167                    CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3168                    CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3169   start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3170              obc->obs.oi.soid.snap == CEPH_NOSNAP,
3171              src_fadvise_flags, 0);
3172
3173   assert(obc->is_blocked());
3174
3175   if (op)
3176     wait_for_blocked_object(obc->obs.oi.soid, op);
3177   info.stats.stats.sum.num_promote++;
3178 }
3179
3180 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3181 {
3182   FUNCTRACE();
3183   dout(10) << __func__ << " " << ctx << dendl;
3184   ctx->reset_obs(ctx->obc);
3185   ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3186   OpRequestRef op = ctx->op;
3187   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3188   ObjectContextRef obc = ctx->obc;
3189   const hobject_t& soid = obc->obs.oi.soid;
3190
3191   // this method must be idempotent since we may call it several times
3192   // before we finally apply the resulting transaction.
3193   ctx->op_t.reset(new PGTransaction);
3194
3195   if (op->may_write() || op->may_cache()) {
3196     // snap
3197     if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3198         pool.info.is_pool_snaps_mode()) {
3199       // use pool's snapc
3200       ctx->snapc = pool.snapc;
3201     } else {
3202       // client specified snapc
3203       ctx->snapc.seq = m->get_snap_seq();
3204       ctx->snapc.snaps = m->get_snaps();
3205       filter_snapc(ctx->snapc.snaps);
3206     }
3207     if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3208         ctx->snapc.seq < obc->ssc->snapset.seq) {
3209       dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3210                << " < snapset seq " << obc->ssc->snapset.seq
3211                << " on " << obc->obs.oi.soid << dendl;
3212       reply_ctx(ctx, -EOLDSNAPC);
3213       return;
3214     }
3215
3216     // version
3217     ctx->at_version = get_next_version();
3218     ctx->mtime = m->get_mtime();
3219
3220     dout(10) << __func__ << " " << soid << " " << *ctx->ops
3221              << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3222              << " snapc " << ctx->snapc
3223              << " snapset " << obc->ssc->snapset
3224              << dendl;
3225   } else {
3226     dout(10) << __func__ << " " << soid << " " << *ctx->ops
3227              << " ov " << obc->obs.oi.version
3228              << dendl;
3229   }
3230
3231   if (!ctx->user_at_version)
3232     ctx->user_at_version = obc->obs.oi.user_version;
3233   dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3234
3235   if (op->may_read()) {
3236     dout(10) << " taking ondisk_read_lock" << dendl;
3237     obc->ondisk_read_lock();
3238   }
3239
3240   {
3241 #ifdef WITH_LTTNG
3242     osd_reqid_t reqid = ctx->op->get_reqid();
3243 #endif
3244     tracepoint(osd, prepare_tx_enter, reqid.name._type,
3245         reqid.name._num, reqid.tid, reqid.inc);
3246   }
3247
3248   int result = prepare_transaction(ctx);
3249
3250   {
3251 #ifdef WITH_LTTNG
3252     osd_reqid_t reqid = ctx->op->get_reqid();
3253 #endif
3254     tracepoint(osd, prepare_tx_exit, reqid.name._type,
3255         reqid.name._num, reqid.tid, reqid.inc);
3256   }
3257
3258   if (op->may_read()) {
3259     dout(10) << " dropping ondisk_read_lock" << dendl;
3260     obc->ondisk_read_unlock();
3261   }
3262
3263   bool pending_async_reads = !ctx->pending_async_reads.empty();
3264   if (result == -EINPROGRESS || pending_async_reads) {
3265     // come back later.
3266     if (pending_async_reads) {
3267       in_progress_async_reads.push_back(make_pair(op, ctx));
3268       ctx->start_async_reads(this);
3269     }
3270     return;
3271   }
3272
3273   if (result == -EAGAIN) {
3274     // clean up after the ctx
3275     close_op_ctx(ctx);
3276     return;
3277   }
3278
3279   bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3280   // prepare the reply
3281   ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3282                                successful_write);
3283
3284   // Write operations aren't allowed to return a data payload because
3285   // we can't do so reliably. If the client has to resend the request
3286   // and it has already been applied, we will return 0 with no
3287   // payload.  Non-deterministic behavior is no good.  However, it is
3288   // possible to construct an operation that does a read, does a guard
3289   // check (e.g., CMPXATTR), and then a write.  Then we either succeed
3290   // with the write, or return a CMPXATTR and the read value.
3291   if (successful_write) {
3292     // write.  normalize the result code.
3293     dout(20) << " zeroing write result code " << result << dendl;
3294     result = 0;
3295   }
3296   ctx->reply->set_result(result);
3297
3298   // read or error?
3299   if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3300     // finish side-effects
3301     if (result >= 0)
3302       do_osd_op_effects(ctx, m->get_connection());
3303
3304     complete_read_ctx(result, ctx);
3305     return;
3306   }
3307
3308   ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3309
3310   assert(op->may_write() || op->may_cache());
3311
3312   // trim log?
3313   calc_trim_to();
3314
3315   // verify that we are doing this in order?
3316   if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3317       !pool.info.is_tier() && !pool.info.has_tiers()) {
3318     map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3319     ceph_tid_t t = m->get_tid();
3320     client_t n = m->get_source().num();
3321     map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3322     if (p == cm.end()) {
3323       dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3324       cm[n] = t;
3325     } else {
3326       dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3327       if (p->second > t) {
3328         derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3329         assert(0 == "out of order op");
3330       }
3331       p->second = t;
3332     }
3333   }
3334
3335   if (ctx->update_log_only) {
3336     if (result >= 0)
3337       do_osd_op_effects(ctx, m->get_connection());
3338
3339     dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3340     // save just what we need from ctx
3341     MOSDOpReply *reply = ctx->reply;
3342     ctx->reply = nullptr;
3343     reply->claim_op_out_data(*ctx->ops);
3344     reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3345     close_op_ctx(ctx);
3346
3347     if (result == -ENOENT) {
3348       reply->set_enoent_reply_versions(info.last_update,
3349                                        info.last_user_version);
3350     }
3351     reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3352     // append to pg log for dup detection - don't save buffers for now
3353     record_write_error(op, soid, reply, result);
3354     return;
3355   }
3356
3357   // no need to capture PG ref, repop cancel will handle that
3358   // Can capture the ctx by pointer, it's owned by the repop
3359   ctx->register_on_commit(
3360     [m, ctx, this](){
3361       if (ctx->op)
3362         log_op_stats(
3363           ctx);
3364
3365       if (m && !ctx->sent_reply) {
3366         MOSDOpReply *reply = ctx->reply;
3367         if (reply)
3368           ctx->reply = nullptr;
3369         else {
3370           reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3371           reply->set_reply_versions(ctx->at_version,
3372                                     ctx->user_at_version);
3373         }
3374         reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3375         dout(10) << " sending reply on " << *m << " " << reply << dendl;
3376         osd->send_message_osd_client(reply, m->get_connection());
3377         ctx->sent_reply = true;
3378         ctx->op->mark_commit_sent();
3379       }
3380     });
3381   ctx->register_on_success(
3382     [ctx, this]() {
3383       do_osd_op_effects(
3384         ctx,
3385         ctx->op ? ctx->op->get_req()->get_connection() :
3386         ConnectionRef());
3387     });
3388   ctx->register_on_finish(
3389     [ctx, this]() {
3390       delete ctx;
3391     });
3392
3393   // issue replica writes
3394   ceph_tid_t rep_tid = osd->get_tid();
3395
3396   RepGather *repop = new_repop(ctx, obc, rep_tid);
3397
3398   issue_repop(repop, ctx);
3399   eval_repop(repop);
3400   repop->put();
3401 }
3402
3403 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3404   release_object_locks(ctx->lock_manager);
3405
3406   ctx->op_t.reset();
3407
3408   for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3409        ctx->on_finish.erase(p++)) {
3410     (*p)();
3411   }
3412   delete ctx;
3413 }
3414
3415 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3416 {
3417   if (ctx->op)
3418     osd->reply_op_error(ctx->op, r);
3419   close_op_ctx(ctx);
3420 }
3421
3422 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3423 {
3424   if (ctx->op)
3425     osd->reply_op_error(ctx->op, r, v, uv);
3426   close_op_ctx(ctx);
3427 }
3428
3429 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3430 {
3431   OpRequestRef op = ctx->op;
3432   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3433
3434   utime_t now = ceph_clock_now();
3435   utime_t latency = now;
3436   latency -= ctx->op->get_req()->get_recv_stamp();
3437   utime_t process_latency = now;
3438   process_latency -= ctx->op->get_dequeued_time();
3439
3440   uint64_t inb = ctx->bytes_written;
3441   uint64_t outb = ctx->bytes_read;
3442
3443   osd->logger->inc(l_osd_op);
3444
3445   osd->logger->inc(l_osd_op_outb, outb);
3446   osd->logger->inc(l_osd_op_inb, inb);
3447   osd->logger->tinc(l_osd_op_lat, latency);
3448   osd->logger->tinc(l_osd_op_process_lat, process_latency);
3449
3450   if (op->may_read() && op->may_write()) {
3451     osd->logger->inc(l_osd_op_rw);
3452     osd->logger->inc(l_osd_op_rw_inb, inb);
3453     osd->logger->inc(l_osd_op_rw_outb, outb);
3454     osd->logger->tinc(l_osd_op_rw_lat, latency);
3455     osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3456     osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3457     osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3458   } else if (op->may_read()) {
3459     osd->logger->inc(l_osd_op_r);
3460     osd->logger->inc(l_osd_op_r_outb, outb);
3461     osd->logger->tinc(l_osd_op_r_lat, latency);
3462     osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3463     osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3464   } else if (op->may_write() || op->may_cache()) {
3465     osd->logger->inc(l_osd_op_w);
3466     osd->logger->inc(l_osd_op_w_inb, inb);
3467     osd->logger->tinc(l_osd_op_w_lat, latency);
3468     osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3469     osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3470   } else
3471     ceph_abort();
3472
3473   dout(15) << "log_op_stats " << *m
3474            << " inb " << inb
3475            << " outb " << outb
3476            << " lat " << latency << dendl;
3477 }
3478
3479 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3480 {
3481   const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3482   assert(have_same_or_newer_map(m->map_epoch));
3483   assert(m->get_type() == MSG_OSD_SUBOP);
3484   dout(15) << "do_sub_op " << *op->get_req() << dendl;
3485
3486   if (!is_peered()) {
3487     waiting_for_peered.push_back(op);
3488     op->mark_delayed("waiting for active");
3489     return;
3490   }
3491
3492   const OSDOp *first = NULL;
3493   if (m->ops.size() >= 1) {
3494     first = &m->ops[0];
3495   }
3496
3497   if (first) {
3498     switch (first->op.op) {
3499     case CEPH_OSD_OP_DELETE:
3500       sub_op_remove(op);
3501       return;
3502     case CEPH_OSD_OP_SCRUB_RESERVE:
3503       handle_scrub_reserve_request(op);
3504       return;
3505     case CEPH_OSD_OP_SCRUB_UNRESERVE:
3506       handle_scrub_reserve_release(op);
3507       return;
3508     case CEPH_OSD_OP_SCRUB_MAP:
3509       sub_op_scrub_map(op);
3510       return;
3511     }
3512   }
3513 }
3514
3515 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3516 {
3517   const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3518   assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3519   if (r->ops.size() >= 1) {
3520     const OSDOp& first = r->ops[0];
3521     switch (first.op.op) {
3522     case CEPH_OSD_OP_SCRUB_RESERVE:
3523       {
3524         pg_shard_t from = r->from;
3525         bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3526         bool reserved;
3527         ::decode(reserved, p);
3528         if (reserved) {
3529           handle_scrub_reserve_grant(op, from);
3530         } else {
3531           handle_scrub_reserve_reject(op, from);
3532         }
3533       }
3534       return;
3535     }
3536   }
3537 }
3538
3539 void PrimaryLogPG::do_scan(
3540   OpRequestRef op,
3541   ThreadPool::TPHandle &handle)
3542 {
3543   const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3544   assert(m->get_type() == MSG_OSD_PG_SCAN);
3545   dout(10) << "do_scan " << *m << dendl;
3546
3547   op->mark_started();
3548
3549   switch (m->op) {
3550   case MOSDPGScan::OP_SCAN_GET_DIGEST:
3551     {
3552       ostringstream ss;
3553       if (osd->check_backfill_full(ss)) {
3554         dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3555         queue_peering_event(
3556           CephPeeringEvtRef(
3557             std::make_shared<CephPeeringEvt>(
3558               get_osdmap()->get_epoch(),
3559               get_osdmap()->get_epoch(),
3560               BackfillTooFull())));
3561         return;
3562       }
3563
3564       BackfillInterval bi;
3565       bi.begin = m->begin;
3566       // No need to flush, there won't be any in progress writes occuring
3567       // past m->begin
3568       scan_range(
3569         cct->_conf->osd_backfill_scan_min,
3570         cct->_conf->osd_backfill_scan_max,
3571         &bi,
3572         handle);
3573       MOSDPGScan *reply = new MOSDPGScan(
3574         MOSDPGScan::OP_SCAN_DIGEST,
3575         pg_whoami,
3576         get_osdmap()->get_epoch(), m->query_epoch,
3577         spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3578       ::encode(bi.objects, reply->get_data());
3579       osd->send_message_osd_cluster(reply, m->get_connection());
3580     }
3581     break;
3582
3583   case MOSDPGScan::OP_SCAN_DIGEST:
3584     {
3585       pg_shard_t from = m->from;
3586
3587       // Check that from is in backfill_targets vector
3588       assert(is_backfill_targets(from));
3589
3590       BackfillInterval& bi = peer_backfill_info[from];
3591       bi.begin = m->begin;
3592       bi.end = m->end;
3593       bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3594
3595       // take care to preserve ordering!
3596       bi.clear_objects();
3597       ::decode_noclear(bi.objects, p);
3598
3599       if (waiting_on_backfill.erase(from)) {
3600         if (waiting_on_backfill.empty()) {
3601           assert(peer_backfill_info.size() == backfill_targets.size());
3602           finish_recovery_op(hobject_t::get_max());
3603         }
3604       } else {
3605         // we canceled backfill for a while due to a too full, and this
3606         // is an extra response from a non-too-full peer
3607       }
3608     }
3609     break;
3610   }
3611 }
3612
3613 void PrimaryLogPG::do_backfill(OpRequestRef op)
3614 {
3615   const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3616   assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3617   dout(10) << "do_backfill " << *m << dendl;
3618
3619   op->mark_started();
3620
3621   switch (m->op) {
3622   case MOSDPGBackfill::OP_BACKFILL_FINISH:
3623     {
3624       assert(cct->_conf->osd_kill_backfill_at != 1);
3625
3626       MOSDPGBackfill *reply = new MOSDPGBackfill(
3627         MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3628         get_osdmap()->get_epoch(),
3629         m->query_epoch,
3630         spg_t(info.pgid.pgid, get_primary().shard));
3631       reply->set_priority(get_recovery_op_priority());
3632       osd->send_message_osd_cluster(reply, m->get_connection());
3633       queue_peering_event(
3634         CephPeeringEvtRef(
3635           std::make_shared<CephPeeringEvt>(
3636             get_osdmap()->get_epoch(),
3637             get_osdmap()->get_epoch(),
3638             RecoveryDone())));
3639     }
3640     // fall-thru
3641
3642   case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3643     {
3644       assert(cct->_conf->osd_kill_backfill_at != 2);
3645
3646       info.set_last_backfill(m->last_backfill);
3647       info.stats = m->stats;
3648
3649       ObjectStore::Transaction t;
3650       dirty_info = true;
3651       write_if_dirty(t);
3652       int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3653       assert(tr == 0);
3654     }
3655     break;
3656
3657   case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3658     {
3659       assert(is_primary());
3660       assert(cct->_conf->osd_kill_backfill_at != 3);
3661       finish_recovery_op(hobject_t::get_max());
3662     }
3663     break;
3664   }
3665 }
3666
3667 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3668 {
3669   const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3670     op->get_req());
3671   assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3672   dout(7) << __func__ << " " << m->ls << dendl;
3673
3674   op->mark_started();
3675
3676   ObjectStore::Transaction t;
3677   for (auto& p : m->ls) {
3678     remove_snap_mapped_object(t, p.first);
3679   }
3680   int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3681   assert(r == 0);
3682 }
3683
3684 int PrimaryLogPG::trim_object(
3685   bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3686 {
3687   *ctxp = NULL;
3688   // load clone info
3689   bufferlist bl;
3690   ObjectContextRef obc = get_object_context(coid, false, NULL);
3691   if (!obc || !obc->ssc || !obc->ssc->exists) {
3692     osd->clog->error() << __func__ << ": Can not trim " << coid
3693       << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3694     return -ENOENT;
3695   }
3696
3697   hobject_t snapoid(
3698     coid.oid, coid.get_key(),
3699     obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3700     info.pgid.pool(), coid.get_namespace());
3701   ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3702   if (!snapset_obc) {
3703     osd->clog->error() << __func__ << ": Can not trim " << coid
3704       << " repair needed, no snapset obc for " << snapoid;
3705     return -ENOENT;
3706   }
3707
3708   SnapSet& snapset = obc->ssc->snapset;
3709
3710   bool legacy = snapset.is_legacy() ||
3711     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3712
3713   object_info_t &coi = obc->obs.oi;
3714   set<snapid_t> old_snaps;
3715   if (legacy) {
3716     old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3717   } else {
3718     auto p = snapset.clone_snaps.find(coid.snap);
3719     if (p == snapset.clone_snaps.end()) {
3720       osd->clog->error() << "No clone_snaps in snapset " << snapset
3721                          << " for object " << coid << "\n";
3722       return -ENOENT;
3723     }
3724     old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3725                      snapset.clone_snaps[coid.snap].end());
3726   }
3727   if (old_snaps.empty()) {
3728     osd->clog->error() << "No object info snaps for object " << coid;
3729     return -ENOENT;
3730   }
3731
3732   dout(10) << coid << " old_snaps " << old_snaps
3733            << " old snapset " << snapset << dendl;
3734   if (snapset.seq == 0) {
3735     osd->clog->error() << "No snapset.seq for object " << coid;
3736     return -ENOENT;
3737   }
3738
3739   set<snapid_t> new_snaps;
3740   for (set<snapid_t>::iterator i = old_snaps.begin();
3741        i != old_snaps.end();
3742        ++i) {
3743     if (!pool.info.is_removed_snap(*i))
3744       new_snaps.insert(*i);
3745   }
3746
3747   vector<snapid_t>::iterator p = snapset.clones.end();
3748
3749   if (new_snaps.empty()) {
3750     p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3751     if (p == snapset.clones.end()) {
3752       osd->clog->error() << "Snap " << coid.snap << " not in clones";
3753       return -ENOENT;
3754     }
3755   }
3756
3757   OpContextUPtr ctx = simple_opc_create(obc);
3758   ctx->snapset_obc = snapset_obc;
3759
3760   if (!ctx->lock_manager.get_snaptrimmer_write(
3761         coid,
3762         obc,
3763         first)) {
3764     close_op_ctx(ctx.release());
3765     dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3766     return -ENOLCK;
3767   }
3768
3769   if (!ctx->lock_manager.get_snaptrimmer_write(
3770         snapoid,
3771         snapset_obc,
3772         first)) {
3773     close_op_ctx(ctx.release());
3774     dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3775     return -ENOLCK;
3776   }
3777
3778   ctx->at_version = get_next_version();
3779
3780   PGTransaction *t = ctx->op_t.get();
3781
3782   if (new_snaps.empty()) {
3783     // remove clone
3784     dout(10) << coid << " snaps " << old_snaps << " -> "
3785              << new_snaps << " ... deleting" << dendl;
3786
3787     // ...from snapset
3788     assert(p != snapset.clones.end());
3789
3790     snapid_t last = coid.snap;
3791     ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3792
3793     if (p != snapset.clones.begin()) {
3794       // not the oldest... merge overlap into next older clone
3795       vector<snapid_t>::iterator n = p - 1;
3796       hobject_t prev_coid = coid;
3797       prev_coid.snap = *n;
3798       bool adjust_prev_bytes = is_present_clone(prev_coid);
3799
3800       if (adjust_prev_bytes)
3801         ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3802
3803       snapset.clone_overlap[*n].intersection_of(
3804         snapset.clone_overlap[*p]);
3805
3806       if (adjust_prev_bytes)
3807         ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3808     }
3809     ctx->delta_stats.num_objects--;
3810     if (coi.is_dirty())
3811       ctx->delta_stats.num_objects_dirty--;
3812     if (coi.is_omap())
3813       ctx->delta_stats.num_objects_omap--;
3814     if (coi.is_whiteout()) {
3815       dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3816       ctx->delta_stats.num_whiteouts--;
3817     }
3818     ctx->delta_stats.num_object_clones--;
3819     if (coi.is_cache_pinned())
3820       ctx->delta_stats.num_objects_pinned--;
3821     obc->obs.exists = false;
3822
3823     snapset.clones.erase(p);
3824     snapset.clone_overlap.erase(last);
3825     snapset.clone_size.erase(last);
3826     snapset.clone_snaps.erase(last);
3827
3828     ctx->log.push_back(
3829       pg_log_entry_t(
3830         pg_log_entry_t::DELETE,
3831         coid,
3832         ctx->at_version,
3833         ctx->obs->oi.version,
3834         0,
3835         osd_reqid_t(),
3836         ctx->mtime,
3837         0)
3838       );
3839     t->remove(coid);
3840     t->update_snaps(
3841       coid,
3842       old_snaps,
3843       new_snaps);
3844
3845     coi = object_info_t(coid);
3846
3847     ctx->at_version.version++;
3848   } else {
3849     // save adjusted snaps for this object
3850     dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3851     if (legacy) {
3852       coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3853     } else {
3854       snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3855                                                         new_snaps.rend());
3856       // we still do a 'modify' event on this object just to trigger a
3857       // snapmapper.update ... :(
3858     }
3859
3860     coi.prior_version = coi.version;
3861     coi.version = ctx->at_version;
3862     bl.clear();
3863     ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3864     t->setattr(coid, OI_ATTR, bl);
3865
3866     ctx->log.push_back(
3867       pg_log_entry_t(
3868         pg_log_entry_t::MODIFY,
3869         coid,
3870         coi.version,
3871         coi.prior_version,
3872         0,
3873         osd_reqid_t(),
3874         ctx->mtime,
3875         0)
3876       );
3877     ctx->at_version.version++;
3878
3879     t->update_snaps(
3880       coid,
3881       old_snaps,
3882       new_snaps);
3883   }
3884
3885   // save head snapset
3886   dout(10) << coid << " new snapset " << snapset << " on "
3887            << snapset_obc->obs.oi << dendl;
3888   if (snapset.clones.empty() &&
3889       (!snapset.head_exists ||
3890        (snapset_obc->obs.oi.is_whiteout() &&
3891         !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3892         !snapset_obc->obs.oi.is_cache_pinned()))) {
3893     // NOTE: this arguably constitutes minor interference with the
3894     // tiering agent if this is a cache tier since a snap trim event
3895     // is effectively evicting a whiteout we might otherwise want to
3896     // keep around.
3897     dout(10) << coid << " removing " << snapoid << dendl;
3898     ctx->log.push_back(
3899       pg_log_entry_t(
3900         pg_log_entry_t::DELETE,
3901         snapoid,
3902         ctx->at_version,
3903         ctx->snapset_obc->obs.oi.version,
3904         0,
3905         osd_reqid_t(),
3906         ctx->mtime,
3907         0)
3908       );
3909     if (snapoid.is_head()) {
3910       derr << "removing snap head" << dendl;
3911       object_info_t& oi = ctx->snapset_obc->obs.oi;
3912       ctx->delta_stats.num_objects--;
3913       if (oi.is_dirty()) {
3914         ctx->delta_stats.num_objects_dirty--;
3915       }
3916       if (oi.is_omap())
3917         ctx->delta_stats.num_objects_omap--;
3918       if (oi.is_whiteout()) {
3919         dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3920         ctx->delta_stats.num_whiteouts--;
3921       }
3922       if (oi.is_cache_pinned()) {
3923         ctx->delta_stats.num_objects_pinned--;
3924       }
3925     }
3926     ctx->snapset_obc->obs.exists = false;
3927     ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3928     t->remove(snapoid);
3929   } else {
3930     dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3931     snapset.filter(pool.info);
3932     dout(10) << coid << " writing updated snapset on " << snapoid
3933              << ", snapset is " << snapset << dendl;
3934     ctx->log.push_back(
3935       pg_log_entry_t(
3936         pg_log_entry_t::MODIFY,
3937         snapoid,
3938         ctx->at_version,
3939         ctx->snapset_obc->obs.oi.version,
3940         0,
3941         osd_reqid_t(),
3942         ctx->mtime,
3943         0)
3944       );
3945
3946     ctx->snapset_obc->obs.oi.prior_version =
3947       ctx->snapset_obc->obs.oi.version;
3948     ctx->snapset_obc->obs.oi.version = ctx->at_version;
3949
3950     map <string, bufferlist> attrs;
3951     bl.clear();
3952     ::encode(snapset, bl);
3953     attrs[SS_ATTR].claim(bl);
3954
3955     bl.clear();
3956     ::encode(ctx->snapset_obc->obs.oi, bl,
3957              get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3958     attrs[OI_ATTR].claim(bl);
3959     t->setattrs(snapoid, attrs);
3960   }
3961
3962   *ctxp = std::move(ctx);
3963   return 0;
3964 }
3965
3966 void PrimaryLogPG::kick_snap_trim()
3967 {
3968   assert(is_active());
3969   assert(is_primary());
3970   if (is_clean() && !snap_trimq.empty()) {
3971     dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3972     snap_trimmer_machine.process_event(KickTrim());
3973   }
3974 }
3975
3976 void PrimaryLogPG::snap_trimmer_scrub_complete()
3977 {
3978   if (is_primary() && is_active() && is_clean()) {
3979     assert(!snap_trimq.empty());
3980     snap_trimmer_machine.process_event(ScrubComplete());
3981   }
3982 }
3983
3984 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3985 {
3986   if (deleting || pg_has_reset_since(queued)) {
3987     return;
3988   }
3989
3990   assert(is_primary());
3991
3992   dout(10) << "snap_trimmer posting" << dendl;
3993   snap_trimmer_machine.process_event(DoSnapWork());
3994   dout(10) << "snap_trimmer complete" << dendl;
3995   return;
3996 }
3997
3998 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3999 {
4000   __u64 v2;
4001
4002   string v2s(xattr.c_str(), xattr.length());
4003   if (v2s.length())
4004     v2 = strtoull(v2s.c_str(), NULL, 10);
4005   else
4006     v2 = 0;
4007
4008   dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4009
4010   switch (op) {
4011   case CEPH_OSD_CMPXATTR_OP_EQ:
4012     return (v1 == v2);
4013   case CEPH_OSD_CMPXATTR_OP_NE:
4014     return (v1 != v2);
4015   case CEPH_OSD_CMPXATTR_OP_GT:
4016     return (v1 > v2);
4017   case CEPH_OSD_CMPXATTR_OP_GTE:
4018     return (v1 >= v2);
4019   case CEPH_OSD_CMPXATTR_OP_LT:
4020     return (v1 < v2);
4021   case CEPH_OSD_CMPXATTR_OP_LTE:
4022     return (v1 <= v2);
4023   default:
4024     return -EINVAL;
4025   }
4026 }
4027
4028 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4029 {
4030   string v2s(xattr.c_str(), xattr.length());
4031
4032   dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4033
4034   switch (op) {
4035   case CEPH_OSD_CMPXATTR_OP_EQ:
4036     return (v1s.compare(v2s) == 0);
4037   case CEPH_OSD_CMPXATTR_OP_NE:
4038     return (v1s.compare(v2s) != 0);
4039   case CEPH_OSD_CMPXATTR_OP_GT:
4040     return (v1s.compare(v2s) > 0);
4041   case CEPH_OSD_CMPXATTR_OP_GTE:
4042     return (v1s.compare(v2s) >= 0);
4043   case CEPH_OSD_CMPXATTR_OP_LT:
4044     return (v1s.compare(v2s) < 0);
4045   case CEPH_OSD_CMPXATTR_OP_LTE:
4046     return (v1s.compare(v2s) <= 0);
4047   default:
4048     return -EINVAL;
4049   }
4050 }
4051
4052 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4053 {
4054   ceph_osd_op& op = osd_op.op;
4055   vector<OSDOp> write_ops(1);
4056   OSDOp& write_op = write_ops[0];
4057   uint64_t write_length = op.writesame.length;
4058   int result = 0;
4059
4060   if (!write_length)
4061     return 0;
4062
4063   if (!op.writesame.data_length || write_length % op.writesame.data_length)
4064     return -EINVAL;
4065
4066   if (op.writesame.data_length != osd_op.indata.length()) {
4067     derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4068     return -EINVAL;
4069   }
4070
4071   while (write_length) {
4072     write_op.indata.append(osd_op.indata);
4073     write_length -= op.writesame.data_length;
4074   }
4075
4076   write_op.op.op = CEPH_OSD_OP_WRITE;
4077   write_op.op.extent.offset = op.writesame.offset;
4078   write_op.op.extent.length = op.writesame.length;
4079   result = do_osd_ops(ctx, write_ops);
4080   if (result < 0)
4081     derr << "do_writesame do_osd_ops failed " << result << dendl;
4082
4083   return result;
4084 }
4085
4086 // ========================================================================
4087 // low level osd ops
4088
4089 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4090 {
4091   dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4092   bufferlist header, vals;
4093   int r = _get_tmap(ctx, &header, &vals);
4094   if (r < 0) {
4095     if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4096       r = 0;
4097     return r;
4098   }
4099
4100   vector<OSDOp> ops(3);
4101
4102   ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4103   ops[0].op.extent.offset = 0;
4104   ops[0].op.extent.length = 0;
4105
4106   ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4107   ops[1].indata.claim(header);
4108
4109   ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4110   ops[2].indata.claim(vals);
4111
4112   return do_osd_ops(ctx, ops);
4113 }
4114
4115 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4116                                     bufferlist& bl)
4117 {
4118   // decode
4119   bufferlist header;
4120   map<string, bufferlist> m;
4121   if (bl.length()) {
4122     bufferlist::iterator p = bl.begin();
4123     ::decode(header, p);
4124     ::decode(m, p);
4125     assert(p.end());
4126   }
4127
4128   // do the update(s)
4129   while (!bp.end()) {
4130     __u8 op;
4131     string key;
4132     ::decode(op, bp);
4133
4134     switch (op) {
4135     case CEPH_OSD_TMAP_SET: // insert key
4136       {
4137         ::decode(key, bp);
4138         bufferlist data;
4139         ::decode(data, bp);
4140         m[key] = data;
4141       }
4142       break;
4143     case CEPH_OSD_TMAP_RM: // remove key
4144       ::decode(key, bp);
4145       if (!m.count(key)) {
4146         return -ENOENT;
4147       }
4148       m.erase(key);
4149       break;
4150     case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4151       ::decode(key, bp);
4152       m.erase(key);
4153       break;
4154     case CEPH_OSD_TMAP_HDR: // update header
4155       {
4156         ::decode(header, bp);
4157       }
4158       break;
4159     default:
4160       return -EINVAL;
4161     }
4162   }
4163
4164   // reencode
4165   bufferlist obl;
4166   ::encode(header, obl);
4167   ::encode(m, obl);
4168
4169   // write it out
4170   vector<OSDOp> nops(1);
4171   OSDOp& newop = nops[0];
4172   newop.op.op = CEPH_OSD_OP_WRITEFULL;
4173   newop.op.extent.offset = 0;
4174   newop.op.extent.length = obl.length();
4175   newop.indata = obl;
4176   do_osd_ops(ctx, nops);
4177   osd_op.outdata.claim(newop.outdata);
4178   return 0;
4179 }
4180
// Apply a stream of tmap update commands to an object's tmap.
//
// The object is read in full, the existing key/value pairs are merged with
// the commands from `bp` in a single forward pass, and the result is written
// back with a WRITEFULL op.  The single pass only works while the command
// keys arrive in non-decreasing order; as soon as a key smaller than its
// predecessor is seen, `bp` is rewound and the work is handed to
// do_tmapup_slow().
//
// @param bp      iterator over the encoded update commands; an iterator
//                already at its end makes the call a no-op
// @param osd_op  the originating op; receives the outdata of the generated
//                WRITEFULL op
// @return the result of the initial READ op when the merge completes
//         (the write-back only happens if that result is 0, and its own
//         return value is not checked); -EINVAL on a command decode error
//         or unknown command; -EEXIST for TMAP_CREATE on an existing key;
//         -ENOENT for TMAP_RM on a missing key; or whatever
//         do_tmapup_slow() returns on the fallback path.
int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
{
  // remember the start of the command stream for the slow-path fallback
  bufferlist::iterator orig_bp = bp;
  int result = 0;
  if (bp.end()) {
    dout(10) << "tmapup is a no-op" << dendl;
  } else {
    // read the whole object
    vector<OSDOp> nops(1);
    OSDOp& newop = nops[0];
    newop.op.op = CEPH_OSD_OP_READ;
    newop.op.extent.offset = 0;
    newop.op.extent.length = 0;
    result = do_osd_ops(ctx, nops);

    dout(10) << "tmapup read " << newop.outdata.length() << dendl;

    dout(30) << " starting is \n";
    newop.outdata.hexdump(*_dout);
    *_dout << dendl;

    // ip walks the existing tmap; obl accumulates the new object contents
    bufferlist::iterator ip = newop.outdata.begin();
    bufferlist obl;

    dout(30) << "the update command is: \n";
    osd_op.indata.hexdump(*_dout);
    *_dout << dendl;

    // header
    bufferlist header;
    __u32 nkeys = 0;
    if (newop.outdata.length()) {
      ::decode(header, ip);
      ::decode(nkeys, ip);
    }
    dout(10) << "tmapup header " << header.length() << dendl;

    // a header replacement is only recognised as the very first command here
    if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
      ++bp;
      ::decode(header, bp);
      dout(10) << "tmapup new header " << header.length() << dendl;
    }

    ::encode(header, obl);

    dout(20) << "tmapup initial nkeys " << nkeys << dendl;

    // update keys
    bufferlist newkeydata;
    string nextkey, last_in_key;
    bufferlist nextval;
    // nextkey/nextval hold the next not-yet-emitted existing pair, if any
    bool have_next = false;
    if (!ip.end()) {
      have_next = true;
      ::decode(nextkey, ip);
      ::decode(nextval, ip);
    }
    while (!bp.end() && !result) {
      __u8 op;
      string key;
      try {
        ::decode(op, bp);
        ::decode(key, bp);
      }
      catch (buffer::error& e) {
        return -EINVAL;
      }
      if (key < last_in_key) {
        dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
                << "', falling back to an inefficient (unsorted) update" << dendl;
        // restart from the beginning of the commands against the original
        // object contents
        bp = orig_bp;
        return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
      }
      last_in_key = key;

      dout(10) << "tmapup op " << (int)op << " key " << key << dendl;

      // skip existing intervening keys
      bool key_exists = false;
      while (have_next && !key_exists) {
        dout(20) << "  (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
        if (nextkey > key)
          break;
        if (nextkey < key) {
          // copy untouched.
          ::encode(nextkey, newkeydata);
          ::encode(nextval, newkeydata);
          dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
        } else {
          // don't copy; discard old value.  and stop.
          dout(20) << "  drop " << nextkey << " " << nextval.length() << dendl;
          key_exists = true;
          nkeys--;
        }
        // advance to the following existing pair, or note that none is left
        if (!ip.end()) {
          ::decode(nextkey, ip);
          ::decode(nextval, ip);
        } else {
          have_next = false;
        }
      }

      if (op == CEPH_OSD_TMAP_SET) {
        bufferlist val;
        try {
          ::decode(val, bp);
        }
        catch (buffer::error& e) {
          return -EINVAL;
        }
        ::encode(key, newkeydata);
        ::encode(val, newkeydata);
        dout(20) << "   set " << key << " " << val.length() << dendl;
        nkeys++;
      } else if (op == CEPH_OSD_TMAP_CREATE) {
        // like SET, but the key must not already exist
        if (key_exists) {
          return -EEXIST;
        }
        bufferlist val;
        try {
          ::decode(val, bp);
        }
        catch (buffer::error& e) {
          return -EINVAL;
        }
        ::encode(key, newkeydata);
        ::encode(val, newkeydata);
        dout(20) << "   create " << key << " " << val.length() << dendl;
        nkeys++;
      } else if (op == CEPH_OSD_TMAP_RM) {
        // do nothing.
        // (the old pair, if present, was already dropped by the loop above)
        if (!key_exists) {
          return -ENOENT;
        }
      } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
        // do nothing
      } else {
        dout(10) << "  invalid tmap op " << (int)op << dendl;
        return -EINVAL;
      }
    }

    // copy remaining
    if (have_next) {
      ::encode(nextkey, newkeydata);
      ::encode(nextval, newkeydata);
      dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
    }
    // anything not yet decoded from the old object is appended verbatim
    if (!ip.end()) {
      bufferlist rest;
      rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
      dout(20) << "  keep trailing " << rest.length()
               << " at " << newkeydata.length() << dendl;
      newkeydata.claim_append(rest);
    }

    // encode final key count + key data
    dout(20) << "tmapup final nkeys " << nkeys << dendl;
    ::encode(nkeys, obl);
    obl.claim_append(newkeydata);

    // disabled debugging aid: dump and re-decode the result
    if (0) {
      dout(30) << " final is \n";
      obl.hexdump(*_dout);
      *_dout << dendl;

      // sanity check
      bufferlist::iterator tp = obl.begin();
      bufferlist h;
      ::decode(h, tp);
      map<string,bufferlist> d;
      ::decode(d, tp);
      assert(tp.end());
      dout(0) << " **** debug sanity check, looks ok ****" << dendl;
    }

    // write it out
    if (!result) {
      dout(20) << "tmapput write " << obl.length() << dendl;
      // reuse the op that performed the read for the write-back
      newop.op.op = CEPH_OSD_OP_WRITEFULL;
      newop.op.extent.offset = 0;
      newop.op.extent.length = obl.length();
      newop.indata = obl;
      do_osd_ops(ctx, nops);
      osd_op.outdata.claim(newop.outdata);
    }
  }
  return result;
}
4370
// Validate that the extent [offset, offset + length) lies within `max`.
//
// @param offset  start of the extent; must be strictly less than max
// @param length  size of the extent
// @param max     upper bound on offset + length
// @return 0 if the extent fits, -EFBIG otherwise
static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
{
  // Compare against the remaining room (max - offset) rather than computing
  // offset + length, which can wrap around for large uint64_t values and
  // make an out-of-range extent look valid.  offset < max is checked first,
  // so the subtraction cannot underflow.
  if (offset >= max ||
      length > max - offset)
    return -EFBIG;

  return 0;
}
4380
4381 struct FillInVerifyExtent : public Context {
4382   ceph_le64 *r;
4383   int32_t *rval;
4384   bufferlist *outdatap;
4385   boost::optional<uint32_t> maybe_crc;
4386   uint64_t size;
4387   OSDService *osd;
4388   hobject_t soid;
4389   __le32 flags;
4390   FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4391                      boost::optional<uint32_t> mc, uint64_t size,
4392                      OSDService *osd, hobject_t soid, __le32 flags) :
4393     r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4394     size(size), osd(osd), soid(soid), flags(flags) {}
4395   void finish(int len) override {
4396     *r = len;
4397     if (len < 0) {
4398       *rval = len;
4399       return;
4400     }
4401     *rval = 0;
4402
4403     // whole object?  can we verify the checksum?
4404     if (maybe_crc && *r == size) {
4405       uint32_t crc = outdatap->crc32c(-1);
4406       if (maybe_crc != crc) {
4407         osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4408                            << " != expected 0x" << *maybe_crc
4409                            << std::dec << " on " << soid;
4410         if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4411           *rval = -EIO;
4412           *r = 0;
4413         }
4414       }
4415     }
4416   }
4417 };
4418
4419 struct ToSparseReadResult : public Context {
4420   int* result;
4421   bufferlist* data_bl;
4422   uint64_t data_offset;
4423   ceph_le64* len;
4424   ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4425                      ceph_le64* len)
4426     : result(result), data_bl(bl), data_offset(offset),len(len) {}
4427   void finish(int r) override {
4428     if (r < 0) {
4429       *result = r;
4430       return;
4431     }
4432     *result = 0;
4433     *len = r;
4434     bufferlist outdata;
4435     map<uint64_t, uint64_t> extents = {{data_offset, r}};
4436     ::encode(extents, outdata);
4437     ::encode_destructively(*data_bl, outdata);
4438     data_bl->swap(outdata);
4439   }
4440 };
4441
// Join the keys of `m` into one comma-separated string, in map order.
// A separator is only inserted once the accumulated result is non-empty.
template<typename V>
static std::string list_keys(const std::map<std::string, V>& m) {
  std::string joined;
  for (const auto& kv : m) {
    if (!joined.empty())
      joined.push_back(',');
    joined.append(kv.first);
  }
  return joined;
}
4453
// Join the elements of container `m` into one comma-separated string, in
// iteration order.  A separator is only inserted once the accumulated
// result is non-empty.
template<typename T>
static std::string list_entries(const T& m) {
  std::string joined;
  for (const auto& entry : m) {
    if (!joined.empty())
      joined.push_back(',');
    joined.append(entry);
  }
  return joined;
}
4465
4466 void PrimaryLogPG::maybe_create_new_object(
4467   OpContext *ctx,
4468   bool ignore_transaction)
4469 {
4470   ObjectState& obs = ctx->new_obs;
4471   if (!obs.exists) {
4472     ctx->delta_stats.num_objects++;
4473     obs.exists = true;
4474     assert(!obs.oi.is_whiteout());
4475     obs.oi.new_object();
4476     if (!ignore_transaction)
4477       ctx->op_t->create(obs.oi.soid);
4478   } else if (obs.oi.is_whiteout()) {
4479     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4480     ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4481     --ctx->delta_stats.num_whiteouts;
4482   }
4483 }
4484
4485 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4486   OSDOp& osd_op;
4487
4488   ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4489   }
4490
4491   int execute() override {
4492     return osd_op.rval;
4493   }
4494 };
4495
4496 struct C_ChecksumRead : public Context {
4497   PrimaryLogPG *primary_log_pg;
4498   OSDOp &osd_op;
4499   Checksummer::CSumType csum_type;
4500   bufferlist init_value_bl;
4501   ceph_le64 read_length;
4502   bufferlist read_bl;
4503   Context *fill_extent_ctx;
4504
4505   C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4506                  Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4507                  boost::optional<uint32_t> maybe_crc, uint64_t size,
4508                  OSDService *osd, hobject_t soid, __le32 flags)
4509     : primary_log_pg(primary_log_pg), osd_op(osd_op),
4510       csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4511       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4512                                              &read_bl, maybe_crc, size,
4513                                              osd, soid, flags)) {
4514   }
4515   ~C_ChecksumRead() override {
4516     delete fill_extent_ctx;
4517   }
4518
4519   void finish(int r) override {
4520     fill_extent_ctx->complete(r);
4521     fill_extent_ctx = nullptr;
4522
4523     if (osd_op.rval >= 0) {
4524       bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4525       osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4526                                                     &init_value_bl_it, read_bl);
4527     }
4528   }
4529 };
4530
4531 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4532                               bufferlist::iterator *bl_it)
4533 {
4534   dout(20) << __func__ << dendl;
4535   bool skip_data_digest =
4536     (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4537     g_conf->osd_distrust_data_digest;
4538
4539   auto& op = osd_op.op;
4540   if (op.checksum.chunk_size > 0) {
4541     if (op.checksum.length == 0) {
4542       dout(10) << __func__ << ": length required when chunk size provided"
4543                << dendl;
4544       return -EINVAL;
4545     }
4546     if (op.checksum.length % op.checksum.chunk_size != 0) {
4547       dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4548       return -EINVAL;
4549     }
4550   }
4551
4552   auto& oi = ctx->new_obs.oi;
4553   if (op.checksum.offset == 0 && op.checksum.length == 0) {
4554     // zeroed offset+length implies checksum whole object
4555     op.checksum.length = oi.size;
4556   } else if (op.checksum.offset + op.checksum.length > oi.size) {
4557     return -EOVERFLOW;
4558   }
4559
4560   Checksummer::CSumType csum_type;
4561   switch (op.checksum.type) {
4562   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4563     csum_type = Checksummer::CSUM_XXHASH32;
4564     break;
4565   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4566     csum_type = Checksummer::CSUM_XXHASH64;
4567     break;
4568   case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4569     csum_type = Checksummer::CSUM_CRC32C;
4570     break;
4571   default:
4572     dout(10) << __func__ << ": unknown crc type ("
4573              << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4574     return -EINVAL;
4575   }
4576
4577   size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4578   if (bl_it->get_remaining() < csum_init_value_size) {
4579     dout(10) << __func__ << ": init value not provided" << dendl;
4580     return -EINVAL;
4581   }
4582
4583   bufferlist init_value_bl;
4584   init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4585                           csum_init_value_size);
4586   bl_it->advance(csum_init_value_size);
4587
4588   if (pool.info.require_rollback() && op.checksum.length > 0) {
4589     // If there is a data digest and it is possible we are reading
4590     // entire object, pass the digest.
4591     boost::optional<uint32_t> maybe_crc;
4592     if (!skip_data_digest &&
4593         oi.is_data_digest() && op.checksum.offset == 0 &&
4594         op.checksum.length >= oi.size) {
4595       maybe_crc = oi.data_digest;
4596     }
4597
4598     // async read
4599     auto& soid = oi.soid;
4600     auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4601                                            std::move(init_value_bl), maybe_crc,
4602                                            oi.size, osd, soid, op.flags);
4603
4604     ctx->pending_async_reads.push_back({
4605       {op.checksum.offset, op.checksum.length, op.flags},
4606       {&checksum_ctx->read_bl, checksum_ctx}});
4607
4608     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4609     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4610       new ReadFinisher(osd_op));
4611     return -EINPROGRESS;
4612   }
4613
4614   // sync read
4615   std::vector<OSDOp> read_ops(1);
4616   auto& read_op = read_ops[0];
4617   if (op.checksum.length > 0) {
4618     read_op.op.op = CEPH_OSD_OP_READ;
4619     read_op.op.flags = op.flags;
4620     read_op.op.extent.offset = op.checksum.offset;
4621     read_op.op.extent.length = op.checksum.length;
4622     read_op.op.extent.truncate_size = 0;
4623     read_op.op.extent.truncate_seq = 0;
4624
4625     int r = do_osd_ops(ctx, read_ops);
4626     if (r < 0) {
4627       derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4628       return r;
4629     }
4630   }
4631
4632   bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4633   return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4634                          read_op.outdata);
4635 }
4636
4637 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4638                                   Checksummer::CSumType csum_type,
4639                                   bufferlist::iterator *init_value_bl_it,
4640                                   const bufferlist &read_bl) {
4641   dout(20) << __func__ << dendl;
4642
4643   auto& op = osd_op.op;
4644
4645   if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4646     derr << __func__ << ": bytes read " << read_bl.length() << " != "
4647          << op.checksum.length << dendl;
4648     return -EINVAL;
4649   }
4650
4651   size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4652                               op.checksum.chunk_size : read_bl.length());
4653   uint32_t csum_count = (csum_chunk_size > 0 ?
4654                            read_bl.length() / csum_chunk_size : 0);
4655
4656   bufferlist csum;
4657   bufferptr csum_data;
4658   if (csum_count > 0) {
4659     size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4660     csum_data = buffer::create(csum_value_size * csum_count);
4661     csum_data.zero();
4662     csum.append(csum_data);
4663
4664     switch (csum_type) {
4665     case Checksummer::CSUM_XXHASH32:
4666       {
4667         Checksummer::xxhash32::init_value_t init_value;
4668         ::decode(init_value, *init_value_bl_it);
4669         Checksummer::calculate<Checksummer::xxhash32>(
4670           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4671           &csum_data);
4672       }
4673       break;
4674     case Checksummer::CSUM_XXHASH64:
4675       {
4676         Checksummer::xxhash64::init_value_t init_value;
4677         ::decode(init_value, *init_value_bl_it);
4678         Checksummer::calculate<Checksummer::xxhash64>(
4679           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4680           &csum_data);
4681       }
4682       break;
4683     case Checksummer::CSUM_CRC32C:
4684       {
4685         Checksummer::crc32c::init_value_t init_value;
4686         ::decode(init_value, *init_value_bl_it);
4687         Checksummer::calculate<Checksummer::crc32c>(
4688           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4689           &csum_data);
4690       }
4691       break;
4692     default:
4693       break;
4694     }
4695   }
4696
4697   ::encode(csum_count, osd_op.outdata);
4698   osd_op.outdata.claim_append(csum);
4699   return 0;
4700 }
4701
4702 struct C_ExtentCmpRead : public Context {
4703   PrimaryLogPG *primary_log_pg;
4704   OSDOp &osd_op;
4705   ceph_le64 read_length;
4706   bufferlist read_bl;
4707   Context *fill_extent_ctx;
4708
4709   C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4710                   boost::optional<uint32_t> maybe_crc, uint64_t size,
4711                   OSDService *osd, hobject_t soid, __le32 flags)
4712     : primary_log_pg(primary_log_pg), osd_op(osd_op),
4713       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4714                                              &read_bl, maybe_crc, size,
4715                                              osd, soid, flags)) {
4716   }
4717   ~C_ExtentCmpRead() override {
4718     delete fill_extent_ctx;
4719   }
4720
4721   void finish(int r) override {
4722     if (r == -ENOENT) {
4723       osd_op.rval = 0;
4724       read_bl.clear();
4725       delete fill_extent_ctx;
4726     } else {
4727       fill_extent_ctx->complete(r);
4728     }
4729     fill_extent_ctx = nullptr;
4730
4731     if (osd_op.rval >= 0) {
4732       osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4733     }
4734   }
4735 };
4736
4737 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4738 {
4739   dout(20) << __func__ << dendl;
4740   ceph_osd_op& op = osd_op.op;
4741   bool skip_data_digest =
4742     (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4743     g_conf->osd_distrust_data_digest;
4744
4745   auto& oi = ctx->new_obs.oi;
4746   uint64_t size = oi.size;
4747   if ((oi.truncate_seq < op.extent.truncate_seq) &&
4748       (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
4749     size = op.extent.truncate_size;
4750   }
4751
4752   if (op.extent.offset >= size) {
4753     op.extent.length = 0;
4754   } else if (op.extent.offset + op.extent.length > size) {
4755     op.extent.length = size - op.extent.offset;
4756   }
4757
4758   if (op.extent.length == 0) {
4759     dout(20) << __func__ << " zero length extent" << dendl;
4760     return finish_extent_cmp(osd_op, bufferlist{});
4761   } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4762     dout(20) << __func__ << " object DNE" << dendl;
4763     return finish_extent_cmp(osd_op, {});
4764   } else if (pool.info.require_rollback()) {
4765     // If there is a data digest and it is possible we are reading
4766     // entire object, pass the digest.
4767     boost::optional<uint32_t> maybe_crc;
4768     if (!skip_data_digest &&
4769         oi.is_data_digest() && op.checksum.offset == 0 &&
4770         op.checksum.length >= oi.size) {
4771       maybe_crc = oi.data_digest;
4772     }
4773
4774     // async read
4775     auto& soid = oi.soid;
4776     auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4777                                               osd, soid, op.flags);
4778     ctx->pending_async_reads.push_back({
4779       {op.extent.offset, op.extent.length, op.flags},
4780       {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4781
4782     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4783
4784     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4785       new ReadFinisher(osd_op));
4786     return -EINPROGRESS;
4787   }
4788
4789   // sync read
4790   vector<OSDOp> read_ops(1);
4791   OSDOp& read_op = read_ops[0];
4792
4793   read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4794   read_op.op.extent.offset = op.extent.offset;
4795   read_op.op.extent.length = op.extent.length;
4796   read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4797   read_op.op.extent.truncate_size = op.extent.truncate_size;
4798
4799   int result = do_osd_ops(ctx, read_ops);
4800   if (result < 0) {
4801     derr << __func__ << " failed " << result << dendl;
4802     return result;
4803   }
4804   return finish_extent_cmp(osd_op, read_op.outdata);
4805 }
4806
4807 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4808 {
4809   for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4810     char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4811     if (osd_op.indata[idx] != read_byte) {
4812         return (-MAX_ERRNO - idx);
4813     }
4814   }
4815
4816   return 0;
4817 }
4818
4819 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4820   dout(20) << __func__ << dendl;
4821   auto& op = osd_op.op;
4822   auto& oi = ctx->new_obs.oi;
4823   auto& soid = oi.soid;
4824   __u32 seq = oi.truncate_seq;
4825   uint64_t size = oi.size;
4826   bool trimmed_read = false;
4827   bool skip_data_digest =
4828     (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4829     g_conf->osd_distrust_data_digest;
4830
4831   // are we beyond truncate_size?
4832   if ( (seq < op.extent.truncate_seq) &&
4833        (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4834     size = op.extent.truncate_size;
4835
4836   if (op.extent.length == 0) //length is zero mean read the whole object
4837     op.extent.length = size;
4838
4839   if (op.extent.offset >= size) {
4840     op.extent.length = 0;
4841     trimmed_read = true;
4842   } else if (op.extent.offset + op.extent.length > size) {
4843     op.extent.length = size - op.extent.offset;
4844     trimmed_read = true;
4845   }
4846
4847   // read into a buffer
4848   int result = 0;
4849   if (trimmed_read && op.extent.length == 0) {
4850     // read size was trimmed to zero and it is expected to do nothing
4851     // a read operation of 0 bytes does *not* do nothing, this is why
4852     // the trimmed_read boolean is needed
4853   } else if (pool.info.require_rollback()) {
4854     boost::optional<uint32_t> maybe_crc;
4855     // If there is a data digest and it is possible we are reading
4856     // entire object, pass the digest.  FillInVerifyExtent will
4857     // will check the oi.size again.
4858     if (!skip_data_digest &&
4859         oi.is_data_digest() && op.extent.offset == 0 &&
4860         op.extent.length >= oi.size)
4861       maybe_crc = oi.data_digest;
4862     ctx->pending_async_reads.push_back(
4863       make_pair(
4864         boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4865         make_pair(&osd_op.outdata,
4866                   new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4867                                          &osd_op.outdata, maybe_crc, oi.size,
4868                                          osd, soid, op.flags))));
4869     dout(10) << " async_read noted for " << soid << dendl;
4870
4871     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4872       new ReadFinisher(osd_op));
4873   } else {
4874     int r = pgbackend->objects_read_sync(
4875       soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4876     if (r == -EIO) {
4877       r = rep_repair_primary_object(soid, ctx->op);
4878     }
4879     if (r >= 0)
4880       op.extent.length = r;
4881     else {
4882       result = r;
4883       op.extent.length = 0;
4884     }
4885     dout(10) << " read got " << r << " / " << op.extent.length
4886              << " bytes from obj " << soid << dendl;
4887
4888     // whole object?  can we verify the checksum?
4889     if (!skip_data_digest &&
4890         op.extent.length == oi.size && oi.is_data_digest()) {
4891       uint32_t crc = osd_op.outdata.crc32c(-1);
4892       if (oi.data_digest != crc) {
4893         osd->clog->error() << info.pgid << std::hex
4894                            << " full-object read crc 0x" << crc
4895                            << " != expected 0x" << oi.data_digest
4896                            << std::dec << " on " << soid;
4897         // FIXME fall back to replica or something?
4898         result = -EIO;
4899       }
4900     }
4901   }
4902
4903   // XXX the op.extent.length is the requested length for async read
4904   // On error this length is changed to 0 after the error comes back.
4905   ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4906   ctx->delta_stats.num_rd++;
4907   return result;
4908 }
4909
4910 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4911   dout(20) << __func__ << dendl;
4912   auto& op = osd_op.op;
4913   auto& oi = ctx->new_obs.oi;
4914   auto& soid = oi.soid;
4915   bool skip_data_digest =
4916     (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4917     g_conf->osd_distrust_data_digest;
4918
4919   if (op.extent.truncate_seq) {
4920     dout(0) << "sparse_read does not support truncation sequence " << dendl;
4921     return -EINVAL;
4922   }
4923
4924   ++ctx->num_read;
4925   if (pool.info.ec_pool()) {
4926     // translate sparse read to a normal one if not supported
4927     uint64_t offset = op.extent.offset;
4928     uint64_t length = op.extent.length;
4929     if (offset > oi.size) {
4930       length = 0;
4931     } else if (offset + length > oi.size) {
4932       length = oi.size - offset;
4933     }
4934
4935     if (length > 0) {
4936       ctx->pending_async_reads.push_back(
4937         make_pair(
4938           boost::make_tuple(offset, length, op.flags),
4939           make_pair(
4940             &osd_op.outdata,
4941             new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4942                                    &op.extent.length))));
4943       dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4944
4945       ctx->op_finishers[ctx->current_osd_subop_num].reset(
4946         new ReadFinisher(osd_op));
4947     } else {
4948       dout(10) << " sparse read ended up empty for " << soid << dendl;
4949       map<uint64_t, uint64_t> extents;
4950       ::encode(extents, osd_op.outdata);
4951     }
4952   } else {
4953     // read into a buffer
4954     map<uint64_t, uint64_t> m;
4955     uint32_t total_read = 0;
4956     int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4957                                               info.pgid.shard),
4958                                op.extent.offset, op.extent.length, m);
4959     if (r < 0)  {
4960       return r;
4961     }
4962
4963     map<uint64_t, uint64_t>::iterator miter;
4964     bufferlist data_bl;
4965     uint64_t last = op.extent.offset;
4966     for (miter = m.begin(); miter != m.end(); ++miter) {
4967       // verify hole?
4968       if (cct->_conf->osd_verify_sparse_read_holes &&
4969           last < miter->first) {
4970         bufferlist t;
4971         uint64_t len = miter->first - last;
4972         r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4973         if (r < 0) {
4974           osd->clog->error() << coll << " " << soid
4975                              << " sparse-read failed to read: "
4976                              << r;
4977         } else if (!t.is_zero()) {
4978           osd->clog->error() << coll << " " << soid
4979                              << " sparse-read found data in hole "
4980                              << last << "~" << len;
4981         }
4982       }
4983
4984       bufferlist tmpbl;
4985       r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4986                                        op.flags, &tmpbl);
4987       if (r == -EIO) {
4988         r = rep_repair_primary_object(soid, ctx->op);
4989       }
4990       if (r < 0) {
4991         return r;
4992       }
4993
4994       // this is usually happen when we get extent that exceeds the actual file
4995       // size
4996       if (r < (int)miter->second)
4997         miter->second = r;
4998       total_read += r;
4999       dout(10) << "sparse-read " << miter->first << "@" << miter->second
5000                << dendl;
5001       data_bl.claim_append(tmpbl);
5002       last = miter->first + r;
5003     }
5004
5005     if (r < 0) {
5006       return r;
5007     }
5008
5009     // verify trailing hole?
5010     if (cct->_conf->osd_verify_sparse_read_holes) {
5011       uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
5012       if (last < end) {
5013         bufferlist t;
5014         uint64_t len = end - last;
5015         r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5016         if (r < 0) {
5017           osd->clog->error() << coll << " " << soid
5018                              << " sparse-read failed to read: " << r;
5019         } else if (!t.is_zero()) {
5020           osd->clog->error() << coll << " " << soid
5021                              << " sparse-read found data in hole "
5022                              << last << "~" << len;
5023         }
5024       }
5025     }
5026
5027     // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
5028     // Maybe at first, there is no much whole objects. With continued use, more
5029     // and more whole object exist. So from this point, for spare-read add
5030     // checksum make sense.
5031     if (!skip_data_digest &&
5032         total_read == oi.size && oi.is_data_digest()) {
5033       uint32_t crc = data_bl.crc32c(-1);
5034       if (oi.data_digest != crc) {
5035         osd->clog->error() << info.pgid << std::hex
5036           << " full-object read crc 0x" << crc
5037           << " != expected 0x" << oi.data_digest
5038           << std::dec << " on " << soid;
5039         // FIXME fall back to replica or something?
5040         return -EIO;
5041       }
5042     }
5043
5044     op.extent.length = total_read;
5045
5046     ::encode(m, osd_op.outdata); // re-encode since it might be modified
5047     ::encode_destructively(data_bl, osd_op.outdata);
5048
5049     dout(10) << " sparse_read got " << total_read << " bytes from object "
5050              << soid << dendl;
5051   }
5052
5053   ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5054   ctx->delta_stats.num_rd++;
5055   return 0;
5056 }
5057
5058 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5059 {
5060   int result = 0;
5061   SnapSetContext *ssc = ctx->obc->ssc;
5062   ObjectState& obs = ctx->new_obs;
5063   object_info_t& oi = obs.oi;
5064   const hobject_t& soid = oi.soid;
5065   bool skip_data_digest =
5066     (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
5067     g_conf->osd_distrust_data_digest;
5068
5069   PGTransaction* t = ctx->op_t.get();
5070
5071   dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5072
5073   ctx->current_osd_subop_num = 0;
5074   for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5075     OSDOp& osd_op = *p;
5076     ceph_osd_op& op = osd_op.op;
5077
5078     OpFinisher* op_finisher = nullptr;
5079     {
5080       auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5081       if (op_finisher_it != ctx->op_finishers.end()) {
5082         op_finisher = op_finisher_it->second.get();
5083       }
5084     }
5085
5086     // TODO: check endianness (__le32 vs uint32_t, etc.)
5087     // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5088     // but the code in this function seems to treat them as native-endian.  What should the
5089     // tracepoints do?
5090     tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5091
5092     dout(10) << "do_osd_op  " << osd_op << dendl;
5093
5094     bufferlist::iterator bp = osd_op.indata.begin();
5095
5096     // user-visible modifcation?
5097     switch (op.op) {
5098       // non user-visible modifications
5099     case CEPH_OSD_OP_WATCH:
5100     case CEPH_OSD_OP_CACHE_EVICT:
5101     case CEPH_OSD_OP_CACHE_FLUSH:
5102     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5103     case CEPH_OSD_OP_UNDIRTY:
5104     case CEPH_OSD_OP_COPY_FROM:  // we handle user_version update explicitly
5105     case CEPH_OSD_OP_CACHE_PIN:
5106     case CEPH_OSD_OP_CACHE_UNPIN:
5107     case CEPH_OSD_OP_SET_REDIRECT:
5108       break;
5109     default:
5110       if (op.op & CEPH_OSD_OP_MODE_WR)
5111         ctx->user_modify = true;
5112     }
5113
5114     // munge -1 truncate to 0 truncate
5115     if (ceph_osd_op_uses_extent(op.op) &&
5116         op.extent.truncate_seq == 1 &&
5117         op.extent.truncate_size == (-1ULL)) {
5118       op.extent.truncate_size = 0;
5119       op.extent.truncate_seq = 0;
5120     }
5121
5122     // munge ZERO -> TRUNCATE?  (don't munge to DELETE or we risk hosing attributes)
5123     if (op.op == CEPH_OSD_OP_ZERO &&
5124         obs.exists &&
5125         op.extent.offset < cct->_conf->osd_max_object_size &&
5126         op.extent.length >= 1 &&
5127         op.extent.length <= cct->_conf->osd_max_object_size &&
5128         op.extent.offset + op.extent.length >= oi.size) {
5129       if (op.extent.offset >= oi.size) {
5130         // no-op
5131         goto fail;
5132       }
5133       dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5134                << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5135       op.op = CEPH_OSD_OP_TRUNCATE;
5136     }
5137
5138     switch (op.op) {
5139
5140       // --- READS ---
5141
5142     case CEPH_OSD_OP_CMPEXT:
5143       ++ctx->num_read;
5144       tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5145                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5146                  op.extent.length, op.extent.truncate_size,
5147                  op.extent.truncate_seq);
5148
5149       if (op_finisher == nullptr) {
5150         result = do_extent_cmp(ctx, osd_op);
5151       } else {
5152         result = op_finisher->execute();
5153       }
5154       break;
5155
5156     case CEPH_OSD_OP_SYNC_READ:
5157       if (pool.info.require_rollback()) {
5158         result = -EOPNOTSUPP;
5159         break;
5160       }
5161       // fall through
5162     case CEPH_OSD_OP_READ:
5163       ++ctx->num_read;
5164       tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5165                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5166                  op.extent.length, op.extent.truncate_size,
5167                  op.extent.truncate_seq);
5168       if (op_finisher == nullptr) {
5169         if (!ctx->data_off) {
5170           ctx->data_off = op.extent.offset;
5171         }
5172         result = do_read(ctx, osd_op);
5173       } else {
5174         result = op_finisher->execute();
5175       }
5176       break;
5177
5178     case CEPH_OSD_OP_CHECKSUM:
5179       ++ctx->num_read;
5180       {
5181         tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5182                    soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5183                    op.checksum.offset, op.checksum.length,
5184                    op.checksum.chunk_size);
5185
5186         if (op_finisher == nullptr) {
5187           result = do_checksum(ctx, osd_op, &bp);
5188         } else {
5189           result = op_finisher->execute();
5190         }
5191       }
5192       break;
5193
5194     /* map extents */
5195     case CEPH_OSD_OP_MAPEXT:
5196       tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5197       if (pool.info.require_rollback()) {
5198         result = -EOPNOTSUPP;
5199         break;
5200       }
5201       ++ctx->num_read;
5202       {
5203         // read into a buffer
5204         bufferlist bl;
5205         int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5206                                                   info.pgid.shard),
5207                                    op.extent.offset, op.extent.length, bl);
5208         osd_op.outdata.claim(bl);
5209         if (r < 0)
5210           result = r;
5211         else
5212           ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5213         ctx->delta_stats.num_rd++;
5214         dout(10) << " map_extents done on object " << soid << dendl;
5215       }
5216       break;
5217
5218     /* map extents */
5219     case CEPH_OSD_OP_SPARSE_READ:
5220       tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5221                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5222                  op.extent.length, op.extent.truncate_size,
5223                  op.extent.truncate_seq);
5224       if (op_finisher == nullptr) {
5225         result = do_sparse_read(ctx, osd_op);
5226       } else {
5227         result = op_finisher->execute();
5228       }
5229       break;
5230
5231     case CEPH_OSD_OP_CALL:
5232       {
5233         string cname, mname;
5234         bufferlist indata;
5235         try {
5236           bp.copy(op.cls.class_len, cname);
5237           bp.copy(op.cls.method_len, mname);
5238           bp.copy(op.cls.indata_len, indata);
5239         } catch (buffer::error& e) {
5240           dout(10) << "call unable to decode class + method + indata" << dendl;
5241           dout(30) << "in dump: ";
5242           osd_op.indata.hexdump(*_dout);
5243           *_dout << dendl;
5244           result = -EINVAL;
5245           tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5246           break;
5247         }
5248         tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5249
5250         ClassHandler::ClassData *cls;
5251         result = osd->class_handler->open_class(cname, &cls);
5252         assert(result == 0);   // init_op_flags() already verified this works.
5253
5254         ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5255         if (!method) {
5256           dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5257           result = -EOPNOTSUPP;
5258           break;
5259         }
5260
5261         int flags = method->get_flags();
5262         if (flags & CLS_METHOD_WR)
5263           ctx->user_modify = true;
5264
5265         bufferlist outdata;
5266         dout(10) << "call method " << cname << "." << mname << dendl;
5267         int prev_rd = ctx->num_read;
5268         int prev_wr = ctx->num_write;
5269         result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5270
5271         if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5272           derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5273           result = -EIO;
5274           break;
5275         }
5276         if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5277           derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5278           result = -EIO;
5279           break;
5280         }
5281
5282         dout(10) << "method called response length=" << outdata.length() << dendl;
5283         op.extent.length = outdata.length();
5284         osd_op.outdata.claim_append(outdata);
5285         dout(30) << "out dump: ";
5286         osd_op.outdata.hexdump(*_dout);
5287         *_dout << dendl;
5288       }
5289       break;
5290
5291     case CEPH_OSD_OP_STAT:
5292       // note: stat does not require RD
5293       {
5294         tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5295
5296         if (obs.exists && !oi.is_whiteout()) {
5297           ::encode(oi.size, osd_op.outdata);
5298           ::encode(oi.mtime, osd_op.outdata);
5299           dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5300         } else {
5301           result = -ENOENT;
5302           dout(10) << "stat oi object does not exist" << dendl;
5303         }
5304
5305         ctx->delta_stats.num_rd++;
5306       }
5307       break;
5308
5309     case CEPH_OSD_OP_ISDIRTY:
5310       ++ctx->num_read;
5311       {
5312         tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5313         bool is_dirty = obs.oi.is_dirty();
5314         ::encode(is_dirty, osd_op.outdata);
5315         ctx->delta_stats.num_rd++;
5316         result = 0;
5317       }
5318       break;
5319
5320     case CEPH_OSD_OP_UNDIRTY:
5321       ++ctx->num_write;
5322       {
5323         tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5324         if (oi.is_dirty()) {
5325           ctx->undirty = true;  // see make_writeable()
5326           ctx->modify = true;
5327           ctx->delta_stats.num_wr++;
5328         }
5329         result = 0;
5330       }
5331       break;
5332
5333     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5334       ++ctx->num_write;
5335       {
5336         tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5337         if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5338           dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5339           result = -EINVAL;
5340           break;
5341         }
5342         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5343           result = -EINVAL;
5344           break;
5345         }
5346         if (!obs.exists) {
5347           result = 0;
5348           break;
5349         }
5350         if (oi.is_cache_pinned()) {
5351           dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5352           result = -EPERM;
5353           break;
5354         }
5355         if (oi.is_dirty()) {
5356           result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5357           if (result == -EINPROGRESS)
5358             result = -EAGAIN;
5359         } else {
5360           result = 0;
5361         }
5362       }
5363       break;
5364
5365     case CEPH_OSD_OP_CACHE_FLUSH:
5366       ++ctx->num_write;
5367       {
5368         tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5369         if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5370           dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5371           result = -EINVAL;
5372           break;
5373         }
5374         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5375           result = -EINVAL;
5376           break;
5377         }
5378         if (!obs.exists) {
5379           result = 0;
5380           break;
5381         }
5382         if (oi.is_cache_pinned()) {
5383           dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5384           result = -EPERM;
5385           break;
5386         }
5387         hobject_t missing;
5388         if (oi.is_dirty()) {
5389           result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5390           if (result == -EINPROGRESS)
5391             result = -EAGAIN;
5392         } else {
5393           result = 0;
5394         }
5395         // Check special return value which has set missing_return
5396         if (result == -ENOENT) {
5397           dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5398           assert(!missing.is_min());
5399           wait_for_unreadable_object(missing, ctx->op);
5400           // Error code which is used elsewhere when wait_for_unreadable_object() is used
5401           result = -EAGAIN;
5402         }
5403       }
5404       break;
5405
5406     case CEPH_OSD_OP_CACHE_EVICT:
5407       ++ctx->num_write;
5408       {
5409         tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5410         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5411           result = -EINVAL;
5412           break;
5413         }
5414         if (!obs.exists) {
5415           result = 0;
5416           break;
5417         }
5418         if (oi.is_cache_pinned()) {
5419           dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5420           result = -EPERM;
5421           break;
5422         }
5423         if (oi.is_dirty()) {
5424           result = -EBUSY;
5425           break;
5426         }
5427         if (!oi.watchers.empty()) {
5428           result = -EBUSY;
5429           break;
5430         }
5431         if (soid.snap == CEPH_NOSNAP) {
5432           result = _verify_no_head_clones(soid, ssc->snapset);
5433           if (result < 0)
5434             break;
5435         }
5436         result = _delete_oid(ctx, true, false);
5437         if (result >= 0) {
5438           // mark that this is a cache eviction to avoid triggering normal
5439           // make_writeable() clone or snapdir object creation in finish_ctx()
5440           ctx->cache_evict = true;
5441         }
5442         osd->logger->inc(l_osd_tier_evict);
5443       }
5444       break;
5445
5446     case CEPH_OSD_OP_GETXATTR:
5447       ++ctx->num_read;
5448       {
5449         string aname;
5450         bp.copy(op.xattr.name_len, aname);
5451         tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5452         string name = "_" + aname;
5453         int r = getattr_maybe_cache(
5454           ctx->obc,
5455           name,
5456           &(osd_op.outdata));
5457         if (r >= 0) {
5458           op.xattr.value_len = osd_op.outdata.length();
5459           result = 0;
5460           ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5461         } else
5462           result = r;
5463
5464         ctx->delta_stats.num_rd++;
5465       }
5466       break;
5467
5468    case CEPH_OSD_OP_GETXATTRS:
5469       ++ctx->num_read;
5470       {
5471         tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5472         map<string, bufferlist> out;
5473         result = getattrs_maybe_cache(
5474           ctx->obc,
5475           &out);
5476
5477         bufferlist bl;
5478         ::encode(out, bl);
5479         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5480         ctx->delta_stats.num_rd++;
5481         osd_op.outdata.claim_append(bl);
5482       }
5483       break;
5484
5485     case CEPH_OSD_OP_CMPXATTR:
5486       ++ctx->num_read;
5487       {
5488         string aname;
5489         bp.copy(op.xattr.name_len, aname);
5490         tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5491         string name = "_" + aname;
5492         name[op.xattr.name_len + 1] = 0;
5493
5494         bufferlist xattr;
5495         result = getattr_maybe_cache(
5496           ctx->obc,
5497           name,
5498           &xattr);
5499         if (result < 0 && result != -EEXIST && result != -ENODATA)
5500           break;
5501
5502         ctx->delta_stats.num_rd++;
5503         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5504
5505         switch (op.xattr.cmp_mode) {
5506         case CEPH_OSD_CMPXATTR_MODE_STRING:
5507           {
5508             string val;
5509             bp.copy(op.xattr.value_len, val);
5510             val[op.xattr.value_len] = 0;
5511             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5512                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5513             result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5514           }
5515           break;
5516
5517         case CEPH_OSD_CMPXATTR_MODE_U64:
5518           {
5519             uint64_t u64val;
5520             try {
5521               ::decode(u64val, bp);
5522             }
5523             catch (buffer::error& e) {
5524               result = -EINVAL;
5525               goto fail;
5526             }
5527             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5528                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5529             result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5530           }
5531           break;
5532
5533         default:
5534           dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5535           result = -EINVAL;
5536         }
5537
5538         if (!result) {
5539           dout(10) << "comparison returned false" << dendl;
5540           result = -ECANCELED;
5541           break;
5542         }
5543         if (result < 0) {
5544           dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5545           break;
5546         }
5547
5548         dout(10) << "comparison returned true" << dendl;
5549       }
5550       break;
5551
5552     case CEPH_OSD_OP_ASSERT_VER:
5553       ++ctx->num_read;
5554       {
5555         uint64_t ver = op.assert_ver.ver;
5556         tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5557         if (!ver)
5558           result = -EINVAL;
5559         else if (ver < oi.user_version)
5560           result = -ERANGE;
5561         else if (ver > oi.user_version)
5562           result = -EOVERFLOW;
5563       }
5564       break;
5565
5566     case CEPH_OSD_OP_LIST_WATCHERS:
5567       ++ctx->num_read;
5568       {
5569         tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5570         obj_list_watch_response_t resp;
5571
5572         map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5573         for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5574                                        ++oi_iter) {
5575           dout(20) << "key cookie=" << oi_iter->first.first
5576                << " entity=" << oi_iter->first.second << " "
5577                << oi_iter->second << dendl;
5578           assert(oi_iter->first.first == oi_iter->second.cookie);
5579           assert(oi_iter->first.second.is_client());
5580
5581           watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5582                  oi_iter->second.timeout_seconds, oi_iter->second.addr);
5583           resp.entries.push_back(wi);
5584         }
5585
5586         resp.encode(osd_op.outdata, ctx->get_features());
5587         result = 0;
5588
5589         ctx->delta_stats.num_rd++;
5590         break;
5591       }
5592
5593     case CEPH_OSD_OP_LIST_SNAPS:
5594       ++ctx->num_read;
5595       {
5596         tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5597         obj_list_snap_response_t resp;
5598
5599         if (!ssc) {
5600           ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5601         }
5602         assert(ssc);
5603
5604         int clonecount = ssc->snapset.clones.size();
5605         if (ssc->snapset.head_exists)
5606           clonecount++;
5607         resp.clones.reserve(clonecount);
5608         for (auto clone_iter = ssc->snapset.clones.begin();
5609              clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5610           clone_info ci;
5611           ci.cloneid = *clone_iter;
5612
5613           hobject_t clone_oid = soid;
5614           clone_oid.snap = *clone_iter;
5615
5616           if (!ssc->snapset.is_legacy()) {
5617             auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5618             if (p == ssc->snapset.clone_snaps.end()) {
5619               osd->clog->error() << "osd." << osd->whoami
5620                                  << ": inconsistent clone_snaps found for oid "
5621                                  << soid << " clone " << *clone_iter
5622                                  << " snapset " << ssc->snapset;
5623               result = -EINVAL;
5624               break;
5625             }
5626             for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5627               ci.snaps.push_back(*q);
5628             }
5629           } else {
5630             /* No need to take a lock here.  We are only inspecting state cached
5631              * in the ObjectContext, so we aren't performing an actual read unless
5632              * the clone obc is not already loaded (in which case, it cannot have
5633              * an in progress write).  We also do not risk exposing uncommitted
5634              * state since we do have a read lock on the head object or snapdir,
5635              * which we would have to write lock in order to make user visible
5636              * modifications to the snapshot state (snap trim related mutations
5637              * are not user visible).
5638              */
5639             if (is_missing_object(clone_oid)) {
5640               dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5641               wait_for_unreadable_object(clone_oid, ctx->op);
5642               result = -EAGAIN;
5643               break;
5644             }
5645
5646             ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5647             if (!clone_obc) {
5648               if (maybe_handle_cache(
5649                     ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5650                 // promoting the clone
5651                 result = -EAGAIN;
5652               } else {
5653                 osd->clog->error() << "osd." << osd->whoami
5654                                    << ": missing clone " << clone_oid
5655                                    << " for oid "
5656                                    << soid;
5657                 // should not happen
5658                 result = -ENOENT;
5659               }
5660               break;
5661             }
5662             for (vector<snapid_t>::reverse_iterator p =
5663                    clone_obc->obs.oi.legacy_snaps.rbegin();
5664                  p != clone_obc->obs.oi.legacy_snaps.rend();
5665                  ++p) {
5666               ci.snaps.push_back(*p);
5667             }
5668           }
5669
5670           dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5671
5672           map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5673           coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5674           if (coi == ssc->snapset.clone_overlap.end()) {
5675             osd->clog->error() << "osd." << osd->whoami
5676                                << ": inconsistent clone_overlap found for oid "
5677                               << soid << " clone " << *clone_iter;
5678             result = -EINVAL;
5679             break;
5680           }
5681           const interval_set<uint64_t> &o = coi->second;
5682           ci.overlap.reserve(o.num_intervals());
5683           for (interval_set<uint64_t>::const_iterator r = o.begin();
5684                r != o.end(); ++r) {
5685             ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5686                                                          r.get_len()));
5687           }
5688
5689           map<snapid_t, uint64_t>::const_iterator si;
5690           si = ssc->snapset.clone_size.find(ci.cloneid);
5691           if (si == ssc->snapset.clone_size.end()) {
5692             osd->clog->error() << "osd." << osd->whoami
5693                                << ": inconsistent clone_size found for oid "
5694                                << soid << " clone " << *clone_iter;
5695             result = -EINVAL;
5696             break;
5697           }
5698           ci.size = si->second;
5699
5700           resp.clones.push_back(ci);
5701         }
5702         if (result < 0) {
5703           break;
5704         }
5705         if (ssc->snapset.head_exists &&
5706             !ctx->obc->obs.oi.is_whiteout()) {
5707           assert(obs.exists);
5708           clone_info ci;
5709           ci.cloneid = CEPH_NOSNAP;
5710
5711           //Size for HEAD is oi.size
5712           ci.size = oi.size;
5713
5714           resp.clones.push_back(ci);
5715         }
5716         resp.seq = ssc->snapset.seq;
5717
5718         resp.encode(osd_op.outdata);
5719         result = 0;
5720
5721         ctx->delta_stats.num_rd++;
5722         break;
5723       }
5724
5725    case CEPH_OSD_OP_NOTIFY:
5726       ++ctx->num_read;
5727       {
5728         uint32_t timeout;
5729         bufferlist bl;
5730
5731         try {
5732           uint32_t ver; // obsolete
5733           ::decode(ver, bp);
5734           ::decode(timeout, bp);
5735           ::decode(bl, bp);
5736         } catch (const buffer::error &e) {
5737           timeout = 0;
5738         }
5739         tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5740         if (!timeout)
5741           timeout = cct->_conf->osd_default_notify_timeout;
5742
5743         notify_info_t n;
5744         n.timeout = timeout;
5745         n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5746         n.cookie = op.watch.cookie;
5747         n.bl = bl;
5748         ctx->notifies.push_back(n);
5749
5750         // return our unique notify id to the client
5751         ::encode(n.notify_id, osd_op.outdata);
5752       }
5753       break;
5754
5755     case CEPH_OSD_OP_NOTIFY_ACK:
5756       ++ctx->num_read;
5757       {
5758         try {
5759           uint64_t notify_id = 0;
5760           uint64_t watch_cookie = 0;
5761           ::decode(notify_id, bp);
5762           ::decode(watch_cookie, bp);
5763           bufferlist reply_bl;
5764           if (!bp.end()) {
5765             ::decode(reply_bl, bp);
5766           }
5767           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5768           OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5769           ctx->notify_acks.push_back(ack);
5770         } catch (const buffer::error &e) {
5771           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5772           OpContext::NotifyAck ack(
5773             // op.watch.cookie is actually the notify_id for historical reasons
5774             op.watch.cookie
5775             );
5776           ctx->notify_acks.push_back(ack);
5777         }
5778       }
5779       break;
5780
5781     case CEPH_OSD_OP_SETALLOCHINT:
5782       ++ctx->num_write;
5783       {
5784         tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5785         maybe_create_new_object(ctx);
5786         oi.expected_object_size = op.alloc_hint.expected_object_size;
5787         oi.expected_write_size = op.alloc_hint.expected_write_size;
5788         oi.alloc_hint_flags = op.alloc_hint.flags;
5789         t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5790                           op.alloc_hint.expected_write_size,
5791                           op.alloc_hint.flags);
5792         ctx->delta_stats.num_wr++;
5793         result = 0;
5794       }
5795       break;
5796
5797
5798       // --- WRITES ---
5799
5800       // -- object data --
5801
5802     case CEPH_OSD_OP_WRITE:
5803       ++ctx->num_write;
5804       { // write
5805         __u32 seq = oi.truncate_seq;
5806         tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5807         if (op.extent.length != osd_op.indata.length()) {
5808           result = -EINVAL;
5809           break;
5810         }
5811
5812         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5813           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5814
5815         if (pool.info.requires_aligned_append() &&
5816             (op.extent.offset % pool.info.required_alignment() != 0)) {
5817           result = -EOPNOTSUPP;
5818           break;
5819         }
5820
5821         if (!obs.exists) {
5822           if (pool.info.requires_aligned_append() && op.extent.offset) {
5823             result = -EOPNOTSUPP;
5824             break;
5825           }
5826         } else if (op.extent.offset != oi.size &&
5827                    pool.info.requires_aligned_append()) {
5828           result = -EOPNOTSUPP;
5829           break;
5830         }
5831
5832         if (seq && (seq > op.extent.truncate_seq) &&
5833             (op.extent.offset + op.extent.length > oi.size)) {
5834           // old write, arrived after trimtrunc
5835           op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5836           dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5837                    << ", adjusting write length to " << op.extent.length << dendl;
5838           bufferlist t;
5839           t.substr_of(osd_op.indata, 0, op.extent.length);
5840           osd_op.indata.swap(t);
5841         }
5842         if (op.extent.truncate_seq > seq) {
5843           // write arrives before trimtrunc
5844           if (obs.exists && !oi.is_whiteout()) {
5845             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5846                      << ", truncating to " << op.extent.truncate_size << dendl;
5847             t->truncate(soid, op.extent.truncate_size);
5848             oi.truncate_seq = op.extent.truncate_seq;
5849             oi.truncate_size = op.extent.truncate_size;
5850             if (op.extent.truncate_size != oi.size) {
5851               ctx->delta_stats.num_bytes -= oi.size;
5852               ctx->delta_stats.num_bytes += op.extent.truncate_size;
5853               oi.size = op.extent.truncate_size;
5854             }
5855           } else {
5856             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5857                      << ", but object is new" << dendl;
5858             oi.truncate_seq = op.extent.truncate_seq;
5859             oi.truncate_size = op.extent.truncate_size;
5860           }
5861         }
5862         result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5863         if (result < 0)
5864           break;
5865
5866         maybe_create_new_object(ctx);
5867
5868         if (op.extent.length == 0) {
5869           if (op.extent.offset > oi.size) {
5870             t->truncate(
5871               soid, op.extent.offset);
5872           } else {
5873             t->nop(soid);
5874           }
5875         } else {
5876           t->write(
5877             soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5878         }
5879
5880         if (op.extent.offset == 0 && op.extent.length >= oi.size
5881             && !skip_data_digest) {
5882           obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5883         } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
5884           if (skip_data_digest) {
5885             obs.oi.clear_data_digest();
5886           } else {
5887             obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5888           }
5889         } else {
5890           obs.oi.clear_data_digest();
5891         }
5892         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5893                                     op.extent.offset, op.extent.length);
5894
5895       }
5896       break;
5897
5898     case CEPH_OSD_OP_WRITEFULL:
5899       ++ctx->num_write;
5900       { // write full object
5901         tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5902
5903         if (op.extent.length != osd_op.indata.length()) {
5904           result = -EINVAL;
5905           break;
5906         }
5907         result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5908         if (result < 0)
5909           break;
5910
5911         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5912           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5913
5914         maybe_create_new_object(ctx);
5915         if (pool.info.require_rollback()) {
5916           t->truncate(soid, 0);
5917         } else if (obs.exists && op.extent.length < oi.size) {
5918           t->truncate(soid, op.extent.length);
5919         }
5920         if (op.extent.length) {
5921           t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5922         }
5923         if (!skip_data_digest) {
5924           obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5925         } else {
5926           obs.oi.clear_data_digest();
5927         }
5928
5929         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5930             0, op.extent.length, true);
5931       }
5932       break;
5933
5934     case CEPH_OSD_OP_WRITESAME:
5935       ++ctx->num_write;
5936       tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5937       result = do_writesame(ctx, osd_op);
5938       break;
5939
5940     case CEPH_OSD_OP_ROLLBACK :
5941       ++ctx->num_write;
5942       tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5943       result = _rollback_to(ctx, op);
5944       break;
5945
5946     case CEPH_OSD_OP_ZERO:
5947       tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5948       if (pool.info.requires_aligned_append()) {
5949         result = -EOPNOTSUPP;
5950         break;
5951       }
5952       ++ctx->num_write;
5953       { // zero
5954         result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5955         if (result < 0)
5956           break;
5957         assert(op.extent.length);
5958         if (obs.exists && !oi.is_whiteout()) {
5959           t->zero(soid, op.extent.offset, op.extent.length);
5960           interval_set<uint64_t> ch;
5961           ch.insert(op.extent.offset, op.extent.length);
5962           ctx->modified_ranges.union_of(ch);
5963           ctx->delta_stats.num_wr++;
5964           oi.clear_data_digest();
5965         } else {
5966           // no-op
5967         }
5968       }
5969       break;
5970     case CEPH_OSD_OP_CREATE:
5971       ++ctx->num_write;
5972       {
5973         tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5974         int flags = le32_to_cpu(op.flags);
5975         if (obs.exists && !oi.is_whiteout() &&
5976             (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5977           result = -EEXIST; /* this is an exclusive create */
5978         } else {
5979           if (osd_op.indata.length()) {
5980             bufferlist::iterator p = osd_op.indata.begin();
5981             string category;
5982             try {
5983               ::decode(category, p);
5984             }
5985             catch (buffer::error& e) {
5986               result = -EINVAL;
5987               goto fail;
5988             }
5989             // category is no longer implemented.
5990           }
5991           if (result >= 0) {
5992             maybe_create_new_object(ctx);
5993             t->nop(soid);
5994           }
5995         }
5996       }
5997       break;
5998
5999     case CEPH_OSD_OP_TRIMTRUNC:
6000       op.extent.offset = op.extent.truncate_size;
6001       // falling through
6002
6003     case CEPH_OSD_OP_TRUNCATE:
6004       tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6005       if (pool.info.requires_aligned_append()) {
6006         result = -EOPNOTSUPP;
6007         break;
6008       }
6009       ++ctx->num_write;
6010       {
6011         // truncate
6012         if (!obs.exists || oi.is_whiteout()) {
6013           dout(10) << " object dne, truncate is a no-op" << dendl;
6014           break;
6015         }
6016
6017         if (op.extent.offset > cct->_conf->osd_max_object_size) {
6018           result = -EFBIG;
6019           break;
6020         }
6021
6022         if (op.extent.truncate_seq) {
6023           assert(op.extent.offset == op.extent.truncate_size);
6024           if (op.extent.truncate_seq <= oi.truncate_seq) {
6025             dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6026                      << ", no-op" << dendl;
6027             break; // old
6028           }
6029           dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6030                    << ", truncating" << dendl;
6031           oi.truncate_seq = op.extent.truncate_seq;
6032           oi.truncate_size = op.extent.truncate_size;
6033         }
6034
6035         maybe_create_new_object(ctx);
6036         t->truncate(soid, op.extent.offset);
6037         if (oi.size > op.extent.offset) {
6038           interval_set<uint64_t> trim;
6039           trim.insert(op.extent.offset, oi.size-op.extent.offset);
6040           ctx->modified_ranges.union_of(trim);
6041         }
6042         if (op.extent.offset != oi.size) {
6043           ctx->delta_stats.num_bytes -= oi.size;
6044           ctx->delta_stats.num_bytes += op.extent.offset;
6045           oi.size = op.extent.offset;
6046         }
6047         ctx->delta_stats.num_wr++;
6048         // do not set exists, or we will break above DELETE -> TRUNCATE munging.
6049
6050         oi.clear_data_digest();
6051       }
6052       break;
6053
6054     case CEPH_OSD_OP_DELETE:
6055       ++ctx->num_write;
6056       tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6057       {
6058         result = _delete_oid(ctx, false, ctx->ignore_cache);
6059       }
6060       break;
6061
6062     case CEPH_OSD_OP_WATCH:
6063       ++ctx->num_write;
6064       {
6065         tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6066                    op.watch.cookie, op.watch.op);
6067         if (!obs.exists) {
6068           result = -ENOENT;
6069           break;
6070         }
6071         uint64_t cookie = op.watch.cookie;
6072         entity_name_t entity = ctx->reqid.name;
6073         ObjectContextRef obc = ctx->obc;
6074
6075         dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6076                  << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6077                  << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6078         dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6079         dout(10) << "watch: peer_addr="
6080           << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6081
6082         uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6083         if (op.watch.timeout != 0) {
6084           timeout = op.watch.timeout;
6085         }
6086
6087         watch_info_t w(cookie, timeout,
6088           ctx->op->get_req()->get_connection()->get_peer_addr());
6089         if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6090             op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6091           if (oi.watchers.count(make_pair(cookie, entity))) {
6092             dout(10) << " found existing watch " << w << " by " << entity << dendl;
6093           } else {
6094             dout(10) << " registered new watch " << w << " by " << entity << dendl;
6095             oi.watchers[make_pair(cookie, entity)] = w;
6096             t->nop(soid);  // make sure update the object_info on disk!
6097           }
6098           bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6099           ctx->watch_connects.push_back(make_pair(w, will_ping));
6100         } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6101           if (!oi.watchers.count(make_pair(cookie, entity))) {
6102             result = -ENOTCONN;
6103             break;
6104           }
6105           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6106           ctx->watch_connects.push_back(make_pair(w, true));
6107         } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6108           /* Note: WATCH with PING doesn't cause may_write() to return true,
6109            * so if there is nothing else in the transaction, this is going
6110            * to run do_osd_op_effects, but not write out a log entry */
6111           if (!oi.watchers.count(make_pair(cookie, entity))) {
6112             result = -ENOTCONN;
6113             break;
6114           }
6115           map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6116             obc->watchers.find(make_pair(cookie, entity));
6117           if (p == obc->watchers.end() ||
6118               !p->second->is_connected()) {
6119             // client needs to reconnect
6120             result = -ETIMEDOUT;
6121             break;
6122           }
6123           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6124           p->second->got_ping(ceph_clock_now());
6125           result = 0;
6126         } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6127           map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6128             oi.watchers.find(make_pair(cookie, entity));
6129           if (oi_iter != oi.watchers.end()) {
6130             dout(10) << " removed watch " << oi_iter->second << " by "
6131                      << entity << dendl;
6132             oi.watchers.erase(oi_iter);
6133             t->nop(soid);  // update oi on disk
6134             ctx->watch_disconnects.push_back(
6135               watch_disconnect_t(cookie, entity, false));
6136           } else {
6137             dout(10) << " can't remove: no watch by " << entity << dendl;
6138           }
6139         }
6140       }
6141       break;
6142
6143     case CEPH_OSD_OP_CACHE_PIN:
6144       tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6145       if ((!pool.info.is_tier() ||
6146           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6147         result = -EINVAL;
6148         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6149         break;
6150       }
6151       ++ctx->num_write;
6152       {
6153         if (!obs.exists || oi.is_whiteout()) {
6154           result = -ENOENT;
6155           break;
6156         }
6157
6158         if (!oi.is_cache_pinned()) {
6159           oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6160           ctx->modify = true;
6161           ctx->delta_stats.num_objects_pinned++;
6162           ctx->delta_stats.num_wr++;
6163         }
6164         result = 0;
6165       }
6166       break;
6167
6168     case CEPH_OSD_OP_CACHE_UNPIN:
6169       tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6170       if ((!pool.info.is_tier() ||
6171           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6172         result = -EINVAL;
6173         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6174         break;
6175       }
6176       ++ctx->num_write;
6177       {
6178         if (!obs.exists || oi.is_whiteout()) {
6179           result = -ENOENT;
6180           break;
6181         }
6182
6183         if (oi.is_cache_pinned()) {
6184           oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6185           ctx->modify = true;
6186           ctx->delta_stats.num_objects_pinned--;
6187           ctx->delta_stats.num_wr++;
6188         }
6189         result = 0;
6190       }
6191       break;
6192
6193     case CEPH_OSD_OP_SET_REDIRECT:
6194       ++ctx->num_write;
6195       {
6196         if (pool.info.is_tier()) {
6197           result = -EINVAL;
6198           break;
6199         }
6200         if (!obs.exists) {
6201           result = -ENOENT;
6202           break;
6203         }
6204         if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6205           result = -EOPNOTSUPP;
6206           break;
6207         }
6208
6209         object_t target_name;
6210         object_locator_t target_oloc;
6211         snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6212         version_t target_version = op.copy_from.src_version;
6213         try {
6214           ::decode(target_name, bp);
6215           ::decode(target_oloc, bp);
6216         }
6217         catch (buffer::error& e) {
6218           result = -EINVAL;
6219           goto fail;
6220         }
6221         pg_t raw_pg;
6222         get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6223         hobject_t target(target_name, target_oloc.key, target_snapid,
6224                 raw_pg.ps(), raw_pg.pool(),
6225                 target_oloc.nspace);
6226         if (target == soid) {
6227           dout(20) << " set-redirect self is invalid" << dendl;
6228           result = -EINVAL;
6229           break;
6230         }
6231         oi.set_flag(object_info_t::FLAG_MANIFEST);
6232         oi.manifest.redirect_target = target;
6233         oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6234         t->truncate(soid, 0);
6235         if (oi.is_omap() && pool.info.supports_omap()) {
6236           t->omap_clear(soid);
6237           obs.oi.clear_omap_digest();
6238           obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6239         }
6240         ctx->delta_stats.num_bytes -= oi.size;
6241         oi.size = 0;
6242         oi.new_object();
6243         oi.user_version = target_version;
6244         ctx->user_at_version = target_version;
6245         /* rm_attrs */
6246         map<string,bufferlist> rmattrs;
6247         result = getattrs_maybe_cache(ctx->obc,
6248                     &rmattrs);
6249         if (result < 0) {
6250           return result;
6251         }
6252         map<string, bufferlist>::iterator iter;
6253         for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6254           const string& name = iter->first;
6255           t->rmattr(soid, name);
6256         }
6257         dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6258       }
6259
6260       break;
6261
6262       // -- object attrs --
6263
6264     case CEPH_OSD_OP_SETXATTR:
6265       ++ctx->num_write;
6266       {
6267         if (cct->_conf->osd_max_attr_size > 0 &&
6268             op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6269           tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6270           result = -EFBIG;
6271           break;
6272         }
6273         unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6274                                     cct->_conf->osd_max_attr_name_len);
6275         if (op.xattr.name_len > max_name_len) {
6276           result = -ENAMETOOLONG;
6277           break;
6278         }
6279         maybe_create_new_object(ctx);
6280         string aname;
6281         bp.copy(op.xattr.name_len, aname);
6282         tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6283         string name = "_" + aname;
6284         bufferlist bl;
6285         bp.copy(op.xattr.value_len, bl);
6286         t->setattr(soid, name, bl);
6287         ctx->delta_stats.num_wr++;
6288       }
6289       break;
6290
6291     case CEPH_OSD_OP_RMXATTR:
6292       ++ctx->num_write;
6293       {
6294         string aname;
6295         bp.copy(op.xattr.name_len, aname);
6296         tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6297         if (!obs.exists || oi.is_whiteout()) {
6298           result = -ENOENT;
6299           break;
6300         }
6301         string name = "_" + aname;
6302         t->rmattr(soid, name);
6303         ctx->delta_stats.num_wr++;
6304       }
6305       break;
6306
6307
6308       // -- fancy writers --
6309     case CEPH_OSD_OP_APPEND:
6310       {
6311         tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6312         // just do it inline; this works because we are happy to execute
6313         // fancy op on replicas as well.
6314         vector<OSDOp> nops(1);
6315         OSDOp& newop = nops[0];
6316         newop.op.op = CEPH_OSD_OP_WRITE;
6317         newop.op.extent.offset = oi.size;
6318         newop.op.extent.length = op.extent.length;
6319         newop.op.extent.truncate_seq = oi.truncate_seq;
6320         newop.indata = osd_op.indata;
6321         result = do_osd_ops(ctx, nops);
6322         osd_op.outdata.claim(newop.outdata);
6323       }
6324       break;
6325
6326     case CEPH_OSD_OP_STARTSYNC:
6327       tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6328       t->nop(soid);
6329       break;
6330
6331
6332       // -- trivial map --
6333     case CEPH_OSD_OP_TMAPGET:
6334       tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6335       if (pool.info.require_rollback()) {
6336         result = -EOPNOTSUPP;
6337         break;
6338       }
6339       {
6340         vector<OSDOp> nops(1);
6341         OSDOp& newop = nops[0];
6342         newop.op.op = CEPH_OSD_OP_SYNC_READ;
6343         newop.op.extent.offset = 0;
6344         newop.op.extent.length = 0;
6345         do_osd_ops(ctx, nops);
6346         osd_op.outdata.claim(newop.outdata);
6347       }
6348       break;
6349
6350     case CEPH_OSD_OP_TMAPPUT:
6351       tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6352       if (pool.info.require_rollback()) {
6353         result = -EOPNOTSUPP;
6354         break;
6355       }
6356       {
6357         //_dout_lock.Lock();
6358         //osd_op.data.hexdump(*_dout);
6359         //_dout_lock.Unlock();
6360
6361         // verify sort order
6362         bool unsorted = false;
6363         if (true) {
6364           bufferlist header;
6365           ::decode(header, bp);
6366           uint32_t n;
6367           ::decode(n, bp);
6368           string last_key;
6369           while (n--) {
6370             string key;
6371             ::decode(key, bp);
6372             dout(10) << "tmapput key " << key << dendl;
6373             bufferlist val;
6374             ::decode(val, bp);
6375             if (key < last_key) {
6376               dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6377               unsorted = true;
6378               break;
6379             }
6380             last_key = key;
6381           }
6382         }
6383
6384         // write it
6385         vector<OSDOp> nops(1);
6386         OSDOp& newop = nops[0];
6387         newop.op.op = CEPH_OSD_OP_WRITEFULL;
6388         newop.op.extent.offset = 0;
6389         newop.op.extent.length = osd_op.indata.length();
6390         newop.indata = osd_op.indata;
6391
6392         if (unsorted) {
6393           bp = osd_op.indata.begin();
6394           bufferlist header;
6395           map<string, bufferlist> m;
6396           ::decode(header, bp);
6397           ::decode(m, bp);
6398           assert(bp.end());
6399           bufferlist newbl;
6400           ::encode(header, newbl);
6401           ::encode(m, newbl);
6402           newop.indata = newbl;
6403         }
6404         result = do_osd_ops(ctx, nops);
6405         assert(result == 0);
6406       }
6407       break;
6408
6409     case CEPH_OSD_OP_TMAPUP:
6410       tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6411       if (pool.info.require_rollback()) {
6412         result = -EOPNOTSUPP;
6413         break;
6414       }
6415       ++ctx->num_write;
6416       result = do_tmapup(ctx, bp, osd_op);
6417       break;
6418
6419     case CEPH_OSD_OP_TMAP2OMAP:
6420       ++ctx->num_write;
6421       tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6422       result = do_tmap2omap(ctx, op.tmap2omap.flags);
6423       break;
6424
6425       // OMAP Read ops
6426     case CEPH_OSD_OP_OMAPGETKEYS:
6427       ++ctx->num_read;
6428       {
6429         string start_after;
6430         uint64_t max_return;
6431         try {
6432           ::decode(start_after, bp);
6433           ::decode(max_return, bp);
6434         }
6435         catch (buffer::error& e) {
6436           result = -EINVAL;
6437           tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6438           goto fail;
6439         }
6440         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6441           max_return = cct->_conf->osd_max_omap_entries_per_request;
6442         }
6443         tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6444
6445         bufferlist bl;
6446         uint32_t num = 0;
6447         bool truncated = false;
6448         if (oi.is_omap()) {
6449           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6450             coll, ghobject_t(soid)
6451             );
6452           assert(iter);
6453           iter->upper_bound(start_after);
6454           for (num = 0; iter->valid(); ++num, iter->next(false)) {
6455             if (num >= max_return ||
6456                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6457               truncated = true;
6458               break;
6459             }
6460             ::encode(iter->key(), bl);
6461           }
6462         } // else return empty out_set
6463         ::encode(num, osd_op.outdata);
6464         osd_op.outdata.claim_append(bl);
6465         ::encode(truncated, osd_op.outdata);
6466         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6467         ctx->delta_stats.num_rd++;
6468       }
6469       break;
6470
6471     case CEPH_OSD_OP_OMAPGETVALS:
6472       ++ctx->num_read;
6473       {
6474         string start_after;
6475         uint64_t max_return;
6476         string filter_prefix;
6477         try {
6478           ::decode(start_after, bp);
6479           ::decode(max_return, bp);
6480           ::decode(filter_prefix, bp);
6481         }
6482         catch (buffer::error& e) {
6483           result = -EINVAL;
6484           tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6485           goto fail;
6486         }
6487         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6488           max_return = cct->_conf->osd_max_omap_entries_per_request;
6489         }
6490         tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6491
6492         uint32_t num = 0;
6493         bool truncated = false;
6494         bufferlist bl;
6495         if (oi.is_omap()) {
6496           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6497             coll, ghobject_t(soid)
6498             );
6499           if (!iter) {
6500             result = -ENOENT;
6501             goto fail;
6502           }
6503           iter->upper_bound(start_after);
6504           if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6505           for (num = 0;
6506                iter->valid() &&
6507                  iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6508                ++num, iter->next(false)) {
6509             dout(20) << "Found key " << iter->key() << dendl;
6510             if (num >= max_return ||
6511                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6512               truncated = true;
6513               break;
6514             }
6515             ::encode(iter->key(), bl);
6516             ::encode(iter->value(), bl);
6517           }
6518         } // else return empty out_set
6519         ::encode(num, osd_op.outdata);
6520         osd_op.outdata.claim_append(bl);
6521         ::encode(truncated, osd_op.outdata);
6522         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6523         ctx->delta_stats.num_rd++;
6524       }
6525       break;
6526
6527     case CEPH_OSD_OP_OMAPGETHEADER:
6528       tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6529       if (!oi.is_omap()) {
6530         // return empty header
6531         break;
6532       }
6533       ++ctx->num_read;
6534       {
6535         osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6536         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6537         ctx->delta_stats.num_rd++;
6538       }
6539       break;
6540
6541     case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6542       ++ctx->num_read;
6543       {
6544         set<string> keys_to_get;
6545         try {
6546           ::decode(keys_to_get, bp);
6547         }
6548         catch (buffer::error& e) {
6549           result = -EINVAL;
6550           tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6551           goto fail;
6552         }
6553         tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6554         map<string, bufferlist> out;
6555         if (oi.is_omap()) {
6556           osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6557         } // else return empty omap entries
6558         ::encode(out, osd_op.outdata);
6559         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6560         ctx->delta_stats.num_rd++;
6561       }
6562       break;
6563
6564     case CEPH_OSD_OP_OMAP_CMP:
6565       ++ctx->num_read;
6566       {
6567         if (!obs.exists || oi.is_whiteout()) {
6568           result = -ENOENT;
6569           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6570           break;
6571         }
6572         map<string, pair<bufferlist, int> > assertions;
6573         try {
6574           ::decode(assertions, bp);
6575         }
6576         catch (buffer::error& e) {
6577           result = -EINVAL;
6578           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6579           goto fail;
6580         }
6581         tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6582
6583         map<string, bufferlist> out;
6584
6585         if (oi.is_omap()) {
6586           set<string> to_get;
6587           for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6588                i != assertions.end();
6589                ++i)
6590             to_get.insert(i->first);
6591           int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6592                                               to_get, &out);
6593           if (r < 0) {
6594             result = r;
6595             break;
6596           }
6597         } // else leave out empty
6598
6599         //Should set num_rd_kb based on encode length of map
6600         ctx->delta_stats.num_rd++;
6601
6602         int r = 0;
6603         bufferlist empty;
6604         for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6605              i != assertions.end();
6606              ++i) {
6607           auto out_entry = out.find(i->first);
6608           bufferlist &bl = (out_entry != out.end()) ?
6609             out_entry->second : empty;
6610           switch (i->second.second) {
6611           case CEPH_OSD_CMPXATTR_OP_EQ:
6612             if (!(bl == i->second.first)) {
6613               r = -ECANCELED;
6614             }
6615             break;
6616           case CEPH_OSD_CMPXATTR_OP_LT:
6617             if (!(bl < i->second.first)) {
6618               r = -ECANCELED;
6619             }
6620             break;
6621           case CEPH_OSD_CMPXATTR_OP_GT:
6622             if (!(bl > i->second.first)) {
6623               r = -ECANCELED;
6624             }
6625             break;
6626           default:
6627             r = -EINVAL;
6628             break;
6629           }
6630           if (r < 0)
6631             break;
6632         }
6633         if (r < 0) {
6634           result = r;
6635         }
6636       }
6637       break;
6638
6639       // OMAP Write ops
6640     case CEPH_OSD_OP_OMAPSETVALS:
6641       if (!pool.info.supports_omap()) {
6642         result = -EOPNOTSUPP;
6643         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6644         break;
6645       }
6646       ++ctx->num_write;
6647       {
6648         maybe_create_new_object(ctx);
6649         bufferlist to_set_bl;
6650         try {
6651           decode_str_str_map_to_bl(bp, &to_set_bl);
6652         }
6653         catch (buffer::error& e) {
6654           result = -EINVAL;
6655           tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6656           goto fail;
6657         }
6658         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6659         if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6660           dout(20) << "setting vals: " << dendl;
6661           map<string,bufferlist> to_set;
6662           bufferlist::iterator pt = to_set_bl.begin();
6663           ::decode(to_set, pt);
6664           for (map<string, bufferlist>::iterator i = to_set.begin();
6665                i != to_set.end();
6666                ++i) {
6667             dout(20) << "\t" << i->first << dendl;
6668           }
6669         }
6670         t->omap_setkeys(soid, to_set_bl);
6671         ctx->delta_stats.num_wr++;
6672       }
6673       obs.oi.set_flag(object_info_t::FLAG_OMAP);
6674       obs.oi.clear_omap_digest();
6675       break;
6676
6677     case CEPH_OSD_OP_OMAPSETHEADER:
6678       tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6679       if (!pool.info.supports_omap()) {
6680         result = -EOPNOTSUPP;
6681         break;
6682       }
6683       ++ctx->num_write;
6684       {
6685         maybe_create_new_object(ctx);
6686         t->omap_setheader(soid, osd_op.indata);
6687         ctx->delta_stats.num_wr++;
6688       }
6689       obs.oi.set_flag(object_info_t::FLAG_OMAP);
6690       obs.oi.clear_omap_digest();
6691       break;
6692
6693     case CEPH_OSD_OP_OMAPCLEAR:
6694       tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6695       if (!pool.info.supports_omap()) {
6696         result = -EOPNOTSUPP;
6697         break;
6698       }
6699       ++ctx->num_write;
6700       {
6701         if (!obs.exists || oi.is_whiteout()) {
6702           result = -ENOENT;
6703           break;
6704         }
6705         if (oi.is_omap()) {
6706           t->omap_clear(soid);
6707           ctx->delta_stats.num_wr++;
6708           obs.oi.clear_omap_digest();
6709           obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6710         }
6711       }
6712       break;
6713
6714     case CEPH_OSD_OP_OMAPRMKEYS:
6715       if (!pool.info.supports_omap()) {
6716         result = -EOPNOTSUPP;
6717         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6718         break;
6719       }
6720       ++ctx->num_write;
6721       {
6722         if (!obs.exists || oi.is_whiteout()) {
6723           result = -ENOENT;
6724           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6725           break;
6726         }
6727         bufferlist to_rm_bl;
6728         try {
6729           decode_str_set_to_bl(bp, &to_rm_bl);
6730         }
6731         catch (buffer::error& e) {
6732           result = -EINVAL;
6733           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6734           goto fail;
6735         }
6736         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6737         t->omap_rmkeys(soid, to_rm_bl);
6738         ctx->delta_stats.num_wr++;
6739       }
6740       obs.oi.clear_omap_digest();
6741       break;
6742
6743     case CEPH_OSD_OP_COPY_GET:
6744       ++ctx->num_read;
6745       tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6746                  soid.snap.val);
6747       if (op_finisher == nullptr) {
6748         result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6749       } else {
6750         result = op_finisher->execute();
6751       }
6752       break;
6753
6754     case CEPH_OSD_OP_COPY_FROM:
6755       ++ctx->num_write;
6756       {
6757         object_t src_name;
6758         object_locator_t src_oloc;
6759         snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6760         version_t src_version = op.copy_from.src_version;
6761         try {
6762           ::decode(src_name, bp);
6763           ::decode(src_oloc, bp);
6764         }
6765         catch (buffer::error& e) {
6766           result = -EINVAL;
6767           tracepoint(osd,
6768                      do_osd_op_pre_copy_from,
6769                      soid.oid.name.c_str(),
6770                      soid.snap.val,
6771                      "???",
6772                      0,
6773                      "???",
6774                      "???",
6775                      0,
6776                      src_snapid,
6777                      src_version);
6778           goto fail;
6779         }
6780         tracepoint(osd,
6781                    do_osd_op_pre_copy_from,
6782                    soid.oid.name.c_str(),
6783                    soid.snap.val,
6784                    src_name.name.c_str(),
6785                    src_oloc.pool,
6786                    src_oloc.key.c_str(),
6787                    src_oloc.nspace.c_str(),
6788                    src_oloc.hash,
6789                    src_snapid,
6790                    src_version);
6791         if (op_finisher == nullptr) {
6792           // start
6793           pg_t raw_pg;
6794           get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6795           hobject_t src(src_name, src_oloc.key, src_snapid,
6796                         raw_pg.ps(), raw_pg.pool(),
6797                         src_oloc.nspace);
6798           if (src == soid) {
6799             dout(20) << " copy from self is invalid" << dendl;
6800             result = -EINVAL;
6801             break;
6802           }
6803           CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6804           ctx->op_finishers[ctx->current_osd_subop_num].reset(
6805             new CopyFromFinisher(cb));
6806           start_copy(cb, ctx->obc, src, src_oloc, src_version,
6807                      op.copy_from.flags,
6808                      false,
6809                      op.copy_from.src_fadvise_flags,
6810                      op.flags);
6811           result = -EINPROGRESS;
6812         } else {
6813           // finish
6814           result = op_finisher->execute();
6815           assert(result == 0);
6816
6817           // COPY_FROM cannot be executed multiple times -- it must restart
6818           ctx->op_finishers.erase(ctx->current_osd_subop_num);
6819         }
6820       }
6821       break;
6822
6823     default:
6824       tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6825       dout(1) << "unrecognized osd op " << op.op
6826               << " " << ceph_osd_op_name(op.op)
6827               << dendl;
6828       result = -EOPNOTSUPP;
6829     }
6830
6831   fail:
6832     osd_op.rval = result;
6833     tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6834     if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6835       result = 0;
6836
6837     if (result < 0)
6838       break;
6839   }
6840   return result;
6841 }
6842
6843 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6844 {
6845   if (ctx->new_obs.oi.size == 0) {
6846     dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6847     return -ENODATA;
6848   }
6849   vector<OSDOp> nops(1);
6850   OSDOp &newop = nops[0];
6851   newop.op.op = CEPH_OSD_OP_TMAPGET;
6852   do_osd_ops(ctx, nops);
6853   try {
6854     bufferlist::iterator i = newop.outdata.begin();
6855     ::decode(*header, i);
6856     (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6857   } catch (...) {
6858     dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6859              << dendl;
6860     return -EINVAL;
6861   }
6862   dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6863            << dendl;
6864   return 0;
6865 }
6866
6867 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6868                                         const SnapSet& ss)
6869 {
6870   // verify that all clones have been evicted
6871   dout(20) << __func__ << " verifying clones are absent "
6872            << ss << dendl;
6873   for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6874        p != ss.clones.end();
6875        ++p) {
6876     hobject_t clone_oid = soid;
6877     clone_oid.snap = *p;
6878     if (is_missing_object(clone_oid))
6879       return -EBUSY;
6880     ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6881     if (clone_obc && clone_obc->obs.exists) {
6882       dout(10) << __func__ << " cannot evict head before clone "
6883                << clone_oid << dendl;
6884       return -EBUSY;
6885     }
6886     if (copy_ops.count(clone_oid)) {
6887       dout(10) << __func__ << " cannot evict head, pending promote on clone "
6888                << clone_oid << dendl;
6889       return -EBUSY;
6890     }
6891   }
6892   return 0;
6893 }
6894
6895 inline int PrimaryLogPG::_delete_oid(
6896   OpContext *ctx,
6897   bool no_whiteout,     // no whiteouts, no matter what.
6898   bool try_no_whiteout) // try not to whiteout
6899 {
6900   SnapSet& snapset = ctx->new_snapset;
6901   ObjectState& obs = ctx->new_obs;
6902   object_info_t& oi = obs.oi;
6903   const hobject_t& soid = oi.soid;
6904   PGTransaction* t = ctx->op_t.get();
6905
6906   // cache: cache: set whiteout on delete?
6907   bool whiteout = false;
6908   if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6909       && !no_whiteout
6910       && !try_no_whiteout) {
6911     whiteout = true;
6912   }
6913   bool legacy;
6914   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6915     legacy = false;
6916     // in luminous or later, we can't delete the head if there are
6917     // clones. we trust the caller passing no_whiteout has already
6918     // verified they don't exist.
6919     if (!snapset.clones.empty() ||
6920         (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6921       if (no_whiteout) {
6922         dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6923                  << dendl;
6924       } else {
6925         dout(20) << __func__ << " has or will have clones; will whiteout"
6926                  << dendl;
6927         whiteout = true;
6928       }
6929     }
6930   } else {
6931     legacy = true;
6932   }
6933   dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6934            << " no_whiteout=" << (int)no_whiteout
6935            << " try_no_whiteout=" << (int)try_no_whiteout
6936            << dendl;
6937   if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6938     return -ENOENT;
6939
6940   t->remove(soid);
6941
6942   if (oi.size > 0) {
6943     interval_set<uint64_t> ch;
6944     ch.insert(0, oi.size);
6945     ctx->modified_ranges.union_of(ch);
6946   }
6947
6948   ctx->delta_stats.num_wr++;
6949   if (soid.is_snap()) {
6950     assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6951     ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6952   } else {
6953     ctx->delta_stats.num_bytes -= oi.size;
6954   }
6955   oi.size = 0;
6956   oi.new_object();
6957
6958   // disconnect all watchers
6959   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6960          oi.watchers.begin();
6961        p != oi.watchers.end();
6962        ++p) {
6963     dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6964     ctx->watch_disconnects.push_back(
6965       watch_disconnect_t(p->first.first, p->first.second, true));
6966   }
6967   oi.watchers.clear();
6968
6969   if (whiteout) {
6970     dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6971     oi.set_flag(object_info_t::FLAG_WHITEOUT);
6972     ctx->delta_stats.num_whiteouts++;
6973     t->create(soid);
6974     osd->logger->inc(l_osd_tier_whiteout);
6975     return 0;
6976   }
6977
6978   // delete the head
6979   ctx->delta_stats.num_objects--;
6980   if (soid.is_snap())
6981     ctx->delta_stats.num_object_clones--;
6982   if (oi.is_whiteout()) {
6983     dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6984     ctx->delta_stats.num_whiteouts--;
6985     oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6986   }
6987   if (oi.is_cache_pinned()) {
6988     ctx->delta_stats.num_objects_pinned--;
6989   }
6990   if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6991     snapset.head_exists = false;
6992   }
6993   obs.exists = false;
6994   return 0;
6995 }
6996
6997 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6998 {
6999   SnapSet& snapset = ctx->new_snapset;
7000   ObjectState& obs = ctx->new_obs;
7001   object_info_t& oi = obs.oi;
7002   const hobject_t& soid = oi.soid;
7003   PGTransaction* t = ctx->op_t.get();
7004   snapid_t snapid = (uint64_t)op.snap.snapid;
7005   hobject_t missing_oid;
7006
7007   dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7008
7009   ObjectContextRef rollback_to;
7010   int ret = find_object_context(
7011     hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7012               soid.get_namespace()),
7013     &rollback_to, false, false, &missing_oid);
7014   if (ret == -EAGAIN) {
7015     /* clone must be missing */
7016     assert(is_degraded_or_backfilling_object(missing_oid));
7017     dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7018              << missing_oid << " (requested snapid: ) " << snapid << dendl;
7019     block_write_on_degraded_snap(missing_oid, ctx->op);
7020     return ret;
7021   }
7022   {
7023     ObjectContextRef promote_obc;
7024     cache_result_t tier_mode_result;
7025     if (obs.exists && obs.oi.has_manifest()) {
7026       tier_mode_result =
7027         maybe_handle_manifest_detail(
7028           ctx->op,
7029           true,
7030           rollback_to);
7031     } else {
7032       tier_mode_result =
7033         maybe_handle_cache_detail(
7034           ctx->op,
7035           true,
7036           rollback_to,
7037           ret,
7038           missing_oid,
7039           true,
7040           false,
7041           &promote_obc);
7042     }
7043     switch (tier_mode_result) {
7044     case cache_result_t::NOOP:
7045       break;
7046     case cache_result_t::BLOCKED_PROMOTE:
7047       assert(promote_obc);
7048       block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7049       return -EAGAIN;
7050     case cache_result_t::BLOCKED_FULL:
7051       block_write_on_full_cache(soid, ctx->op);
7052       return -EAGAIN;
7053     case cache_result_t::REPLIED_WITH_EAGAIN:
7054       assert(0 == "this can't happen, no rollback on replica");
7055     default:
7056       assert(0 == "must promote was set, other values are not valid");
7057       return -EAGAIN;
7058     }
7059   }
7060
7061   if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7062     // there's no snapshot here, or there's no object.
7063     // if there's no snapshot, we delete the object; otherwise, do nothing.
7064     dout(20) << "_rollback_to deleting head on " << soid.oid
7065              << " because got ENOENT|whiteout on find_object_context" << dendl;
7066     if (ctx->obc->obs.oi.watchers.size()) {
7067       // Cannot delete an object with watchers
7068       ret = -EBUSY;
7069     } else {
7070       _delete_oid(ctx, false, false);
7071       ret = 0;
7072     }
7073   } else if (ret) {
7074     // ummm....huh? It *can't* return anything else at time of writing.
7075     assert(0 == "unexpected error code in _rollback_to");
7076   } else { //we got our context, let's use it to do the rollback!
7077     hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7078     if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7079       dout(20) << "_rollback_to attempted to roll back to a degraded object "
7080                << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
7081       block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7082       ret = -EAGAIN;
7083     } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7084       // rolling back to the head; we just need to clone it.
7085       ctx->modify = true;
7086     } else {
7087       /* 1) Delete current head
7088        * 2) Clone correct snapshot into head
7089        * 3) Calculate clone_overlaps by following overlaps
7090        *    forward from rollback snapshot */
7091       dout(10) << "_rollback_to deleting " << soid.oid
7092                << " and rolling back to old snap" << dendl;
7093
7094       if (obs.exists) {
7095         t->remove(soid);
7096       }
7097       t->clone(soid, rollback_to_sobject);
7098       snapset.head_exists = true;
7099       t->add_obc(rollback_to);
7100
7101       map<snapid_t, interval_set<uint64_t> >::iterator iter =
7102         snapset.clone_overlap.lower_bound(snapid);
7103       interval_set<uint64_t> overlaps = iter->second;
7104       assert(iter != snapset.clone_overlap.end());
7105       for ( ;
7106             iter != snapset.clone_overlap.end();
7107             ++iter)
7108         overlaps.intersection_of(iter->second);
7109
7110       if (obs.oi.size > 0) {
7111         interval_set<uint64_t> modified;
7112         modified.insert(0, obs.oi.size);
7113         overlaps.intersection_of(modified);
7114         modified.subtract(overlaps);
7115         ctx->modified_ranges.union_of(modified);
7116       }
7117
7118       // Adjust the cached objectcontext
7119       maybe_create_new_object(ctx, true);
7120       ctx->delta_stats.num_bytes -= obs.oi.size;
7121       ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7122       obs.oi.size = rollback_to->obs.oi.size;
7123       if (rollback_to->obs.oi.is_data_digest())
7124         obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7125       else
7126         obs.oi.clear_data_digest();
7127       if (rollback_to->obs.oi.is_omap_digest())
7128         obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7129       else
7130         obs.oi.clear_omap_digest();
7131
7132       if (rollback_to->obs.oi.is_omap()) {
7133         dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7134         obs.oi.set_flag(object_info_t::FLAG_OMAP);
7135       } else {
7136         dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7137         obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7138       }
7139
7140       snapset.head_exists = true;
7141     }
7142   }
7143   return ret;
7144 }
7145
7146 void PrimaryLogPG::_make_clone(
7147   OpContext *ctx,
7148   PGTransaction* t,
7149   ObjectContextRef obc,
7150   const hobject_t& head, const hobject_t& coid,
7151   object_info_t *poi)
7152 {
7153   bufferlist bv;
7154   ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7155
7156   t->clone(coid, head);
7157   setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7158   rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7159 }
7160
7161 void PrimaryLogPG::make_writeable(OpContext *ctx)
7162 {
7163   const hobject_t& soid = ctx->obs->oi.soid;
7164   SnapContext& snapc = ctx->snapc;
7165
7166   // clone?
7167   assert(soid.snap == CEPH_NOSNAP);
7168   dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7169            << "  snapc=" << snapc << dendl;
7170
7171   bool was_dirty = ctx->obc->obs.oi.is_dirty();
7172   if (ctx->new_obs.exists) {
7173     // we will mark the object dirty
7174     if (ctx->undirty && was_dirty) {
7175       dout(20) << " clearing DIRTY flag" << dendl;
7176       assert(ctx->new_obs.oi.is_dirty());
7177       ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7178       --ctx->delta_stats.num_objects_dirty;
7179       osd->logger->inc(l_osd_tier_clean);
7180     } else if (!was_dirty && !ctx->undirty) {
7181       dout(20) << " setting DIRTY flag" << dendl;
7182       ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7183       ++ctx->delta_stats.num_objects_dirty;
7184       osd->logger->inc(l_osd_tier_dirty);
7185     }
7186   } else {
7187     if (was_dirty) {
7188       dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7189       ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7190       --ctx->delta_stats.num_objects_dirty;
7191     }
7192   }
7193
7194   if ((ctx->new_obs.exists &&
7195        ctx->new_obs.oi.is_omap()) &&
7196       (!ctx->obc->obs.exists ||
7197        !ctx->obc->obs.oi.is_omap())) {
7198     ++ctx->delta_stats.num_objects_omap;
7199   }
7200   if ((!ctx->new_obs.exists ||
7201        !ctx->new_obs.oi.is_omap()) &&
7202       (ctx->obc->obs.exists &&
7203        ctx->obc->obs.oi.is_omap())) {
7204     --ctx->delta_stats.num_objects_omap;
7205   }
7206
7207   // use newer snapc?
7208   if (ctx->new_snapset.seq > snapc.seq) {
7209     snapc.seq = ctx->new_snapset.seq;
7210     snapc.snaps = ctx->new_snapset.snaps;
7211     filter_snapc(snapc.snaps);
7212     dout(10) << " using newer snapc " << snapc << dendl;
7213   }
7214
7215   if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7216       snapc.snaps.size() &&                 // there are snaps
7217       !ctx->cache_evict &&
7218       snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
7219     // clone
7220     hobject_t coid = soid;
7221     coid.snap = snapc.seq;
7222
7223     unsigned l;
7224     for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7225
7226     vector<snapid_t> snaps(l);
7227     for (unsigned i=0; i<l; i++)
7228       snaps[i] = snapc.snaps[i];
7229
7230     // prepare clone
7231     object_info_t static_snap_oi(coid);
7232     object_info_t *snap_oi;
7233     if (is_primary()) {
7234       ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7235       ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7236       ctx->clone_obc->obs.oi = static_snap_oi;
7237       ctx->clone_obc->obs.exists = true;
7238       ctx->clone_obc->ssc = ctx->obc->ssc;
7239       ctx->clone_obc->ssc->ref++;
7240       if (pool.info.require_rollback())
7241         ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7242       snap_oi = &ctx->clone_obc->obs.oi;
7243       bool got = ctx->lock_manager.get_write_greedy(
7244         coid,
7245         ctx->clone_obc,
7246         ctx->op);
7247       assert(got);
7248       dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7249     } else {
7250       snap_oi = &static_snap_oi;
7251     }
7252     snap_oi->version = ctx->at_version;
7253     snap_oi->prior_version = ctx->obs->oi.version;
7254     snap_oi->copy_user_bits(ctx->obs->oi);
7255
7256     bool legacy = ctx->new_snapset.is_legacy() ||
7257       get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7258     if (legacy) {
7259       snap_oi->legacy_snaps = snaps;
7260     }
7261
7262     _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7263
7264     ctx->delta_stats.num_objects++;
7265     if (snap_oi->is_dirty()) {
7266       ctx->delta_stats.num_objects_dirty++;
7267       osd->logger->inc(l_osd_tier_dirty);
7268     }
7269     if (snap_oi->is_omap())
7270       ctx->delta_stats.num_objects_omap++;
7271     if (snap_oi->is_cache_pinned())
7272       ctx->delta_stats.num_objects_pinned++;
7273     ctx->delta_stats.num_object_clones++;
7274     ctx->new_snapset.clones.push_back(coid.snap);
7275     ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7276     if (!legacy) {
7277       ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7278     }
7279
7280     // clone_overlap should contain an entry for each clone
7281     // (an empty interval_set if there is no overlap)
7282     ctx->new_snapset.clone_overlap[coid.snap];
7283     if (ctx->obs->oi.size)
7284       ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7285
7286     // log clone
7287     dout(10) << " cloning v " << ctx->obs->oi.version
7288              << " to " << coid << " v " << ctx->at_version
7289              << " snaps=" << snaps
7290              << " snapset=" << ctx->new_snapset << dendl;
7291     ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7292                                       ctx->obs->oi.version,
7293                                       ctx->obs->oi.user_version,
7294                                       osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7295     ::encode(snaps, ctx->log.back().snaps);
7296
7297     ctx->at_version.version++;
7298   }
7299
7300   // update most recent clone_overlap and usage stats
7301   if (ctx->new_snapset.clones.size() > 0) {
7302     /* we need to check whether the most recent clone exists, if it's been evicted,
7303      * it's not included in the stats */
7304     hobject_t last_clone_oid = soid;
7305     last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7306     if (is_present_clone(last_clone_oid)) {
7307       interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7308       ctx->modified_ranges.intersection_of(newest_overlap);
7309       // modified_ranges is still in use by the clone
7310       add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7311       newest_overlap.subtract(ctx->modified_ranges);
7312     }
7313   }
7314
7315   // update snapset with latest snap context
7316   ctx->new_snapset.seq = snapc.seq;
7317   ctx->new_snapset.snaps = snapc.snaps;
7318   if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7319     // pessimistic assumption that this is a net-new legacy SnapSet
7320     ctx->delta_stats.num_legacy_snapsets++;
7321     ctx->new_snapset.head_exists = ctx->new_obs.exists;
7322   } else if (ctx->new_snapset.is_legacy()) {
7323     ctx->new_snapset.head_exists = ctx->new_obs.exists;
7324   }
7325   dout(20) << "make_writeable " << soid
7326            << " done, snapset=" << ctx->new_snapset << dendl;
7327 }
7328
7329
7330 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7331                                                interval_set<uint64_t>& modified, uint64_t offset,
7332                                                uint64_t length, bool write_full)
7333 {
7334   interval_set<uint64_t> ch;
7335   if (write_full) {
7336     if (oi.size)
7337       ch.insert(0, oi.size);
7338   } else if (length)
7339     ch.insert(offset, length);
7340   modified.union_of(ch);
7341   if (write_full || offset + length > oi.size) {
7342     uint64_t new_size = offset + length;
7343     delta_stats.num_bytes -= oi.size;
7344     delta_stats.num_bytes += new_size;
7345     oi.size = new_size;
7346   }
7347   delta_stats.num_wr++;
7348   delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7349 }
7350
7351 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7352 {
7353   for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7354     delta_stats.num_bytes += p.get_len();
7355   }
7356 }
7357
7358 void PrimaryLogPG::complete_disconnect_watches(
7359   ObjectContextRef obc,
7360   const list<watch_disconnect_t> &to_disconnect)
7361 {
7362   for (list<watch_disconnect_t>::const_iterator i =
7363          to_disconnect.begin();
7364        i != to_disconnect.end();
7365        ++i) {
7366     pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7367     auto watchers_entry = obc->watchers.find(watcher);
7368     if (watchers_entry != obc->watchers.end()) {
7369       WatchRef watch = watchers_entry->second;
7370       dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7371       obc->watchers.erase(watcher);
7372       watch->remove(i->send_disconnect);
7373     } else {
7374       dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7375                << watcher << dendl;
7376     }
7377   }
7378 }
7379
7380 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7381 {
7382   entity_name_t entity = ctx->reqid.name;
7383   dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7384
7385   // disconnects first
7386   complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7387
7388   assert(conn);
7389
7390   boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7391   if (!session.get())
7392     return;
7393   session->put();  // get_priv() takes a ref, and so does the intrusive_ptr
7394
7395   for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7396        i != ctx->watch_connects.end();
7397        ++i) {
7398     pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7399     dout(15) << "do_osd_op_effects applying watch connect on session "
7400              << session.get() << " watcher " << watcher << dendl;
7401     WatchRef watch;
7402     if (ctx->obc->watchers.count(watcher)) {
7403       dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7404                << dendl;
7405       watch = ctx->obc->watchers[watcher];
7406     } else {
7407       dout(15) << "do_osd_op_effects new watcher " << watcher
7408                << dendl;
7409       watch = Watch::makeWatchRef(
7410         this, osd, ctx->obc, i->first.timeout_seconds,
7411         i->first.cookie, entity, conn->get_peer_addr());
7412       ctx->obc->watchers.insert(
7413         make_pair(
7414           watcher,
7415           watch));
7416     }
7417     watch->connect(conn, i->second);
7418   }
7419
7420   for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7421        p != ctx->notifies.end();
7422        ++p) {
7423     dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7424     ConnectionRef conn(ctx->op->get_req()->get_connection());
7425     NotifyRef notif(
7426       Notify::makeNotifyRef(
7427         conn,
7428         ctx->reqid.name.num(),
7429         p->bl,
7430         p->timeout,
7431         p->cookie,
7432         p->notify_id,
7433         ctx->obc->obs.oi.user_version,
7434         osd));
7435     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7436            ctx->obc->watchers.begin();
7437          i != ctx->obc->watchers.end();
7438          ++i) {
7439       dout(10) << "starting notify on watch " << i->first << dendl;
7440       i->second->start_notify(notif);
7441     }
7442     notif->init();
7443   }
7444
7445   for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7446        p != ctx->notify_acks.end();
7447        ++p) {
7448     if (p->watch_cookie)
7449       dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7450     else
7451       dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7452     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7453            ctx->obc->watchers.begin();
7454          i != ctx->obc->watchers.end();
7455          ++i) {
7456       if (i->first.second != entity) continue;
7457       if (p->watch_cookie &&
7458           p->watch_cookie.get() != i->first.first) continue;
7459       dout(10) << "acking notify on watch " << i->first << dendl;
7460       i->second->notify_ack(p->notify_id, p->reply_bl);
7461     }
7462   }
7463 }
7464
7465 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7466 {
7467   ostringstream ss;
7468   ss << "temp_" << info.pgid << "_" << get_role()
7469      << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7470   hobject_t hoid = target.make_temp_hobject(ss.str());
7471   dout(20) << __func__ << " " << hoid << dendl;
7472   return hoid;
7473 }
7474
7475 hobject_t PrimaryLogPG::get_temp_recovery_object(
7476   const hobject_t& target,
7477   eversion_t version)
7478 {
7479   ostringstream ss;
7480   ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
7481      << "_" << version
7482      << "_" << info.history.same_interval_since
7483      << "_" << target.snap;
7484   // pgid + version + interval + snapid is unique, and short
7485   hobject_t hoid = target.make_temp_hobject(ss.str());
7486   dout(20) << __func__ << " " << hoid << dendl;
7487   return hoid;
7488 }
7489
7490 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7491 {
7492   assert(!ctx->ops->empty());
7493
7494   const hobject_t& soid = ctx->obs->oi.soid;
7495
7496   // valid snap context?
7497   if (!ctx->snapc.is_valid()) {
7498     dout(10) << " invalid snapc " << ctx->snapc << dendl;
7499     return -EINVAL;
7500   }
7501
7502   // prepare the actual mutation
7503   int result = do_osd_ops(ctx, *ctx->ops);
7504   if (result < 0) {
7505     if (ctx->op->may_write() &&
7506         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7507       // need to save the error code in the pg log, to detect dup ops,
7508       // but do nothing else
7509       ctx->update_log_only = true;
7510     }
7511     return result;
7512   }
7513
7514   // read-op?  write-op noop? done?
7515   if (ctx->op_t->empty() && !ctx->modify) {
7516     unstable_stats.add(ctx->delta_stats);
7517     if (ctx->op->may_write() &&
7518         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7519       ctx->update_log_only = true;
7520     }
7521     return result;
7522   }
7523
7524   // check for full
7525   if ((ctx->delta_stats.num_bytes > 0 ||
7526        ctx->delta_stats.num_objects > 0) &&  // FIXME: keys?
7527       (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7528        get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7529     const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7530     if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
7531         m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7532       dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7533                << dendl;
7534     } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7535       // they tried, they failed.
7536       dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7537       return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7538     } else {
7539       // drop request
7540       dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7541       return -EAGAIN;
7542     }
7543   }
7544
7545   // clone, if necessary
7546   if (soid.snap == CEPH_NOSNAP)
7547     make_writeable(ctx);
7548
7549   finish_ctx(ctx,
7550              ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7551              pg_log_entry_t::DELETE);
7552
7553   return result;
7554 }
7555
7556 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7557 {
7558   const hobject_t& soid = ctx->obs->oi.soid;
7559   dout(20) << __func__ << " " << soid << " " << ctx
7560            << " op " << pg_log_entry_t::get_op_name(log_op_type)
7561            << dendl;
7562   utime_t now = ceph_clock_now();
7563
7564   // snapset
7565   bufferlist bss;
7566
7567   if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7568     ::encode(ctx->new_snapset, bss);
7569     assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7570            !ctx->new_snapset.is_legacy());
7571
7572     if (ctx->new_obs.exists) {
7573       if (!ctx->obs->exists) {
7574         if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7575           hobject_t snapoid = soid.get_snapdir();
7576           dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7577           ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7578               ctx->at_version,
7579               ctx->snapset_obc->obs.oi.version,
7580               0, osd_reqid_t(), ctx->mtime, 0));
7581           ctx->op_t->remove(snapoid);
7582
7583           ctx->at_version.version++;
7584
7585           ctx->snapset_obc->obs.exists = false;
7586         }
7587       }
7588     } else if (!ctx->new_snapset.clones.empty() &&
7589                !ctx->cache_evict &&
7590                !ctx->new_snapset.head_exists &&
7591                (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7592       // save snapset on _snap
7593       hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7594                         info.pgid.pool(), soid.get_namespace());
7595       dout(10) << " final snapset " << ctx->new_snapset
7596                << " in " << snapoid << dendl;
7597       assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7598       ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7599                                         ctx->at_version,
7600                                         eversion_t(),
7601                                         0, osd_reqid_t(), ctx->mtime, 0));
7602
7603       if (!ctx->snapset_obc)
7604         ctx->snapset_obc = get_object_context(snapoid, true);
7605       bool got = false;
7606       if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7607         got = ctx->lock_manager.get_write_greedy(
7608           snapoid,
7609           ctx->snapset_obc,
7610           ctx->op);
7611       } else {
7612         assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7613         got = ctx->lock_manager.get_lock_type(
7614           ObjectContext::RWState::RWEXCL,
7615           snapoid,
7616           ctx->snapset_obc,
7617           ctx->op);
7618       }
7619       assert(got);
7620       dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7621       ctx->snapset_obc->obs.exists = true;
7622       ctx->snapset_obc->obs.oi.version = ctx->at_version;
7623       ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7624       ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7625       ctx->snapset_obc->obs.oi.local_mtime = now;
7626
7627       map<string, bufferlist> attrs;
7628       bufferlist bv(sizeof(ctx->new_obs.oi));
7629       ::encode(ctx->snapset_obc->obs.oi, bv,
7630                get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7631       ctx->op_t->create(snapoid);
7632       attrs[OI_ATTR].claim(bv);
7633       attrs[SS_ATTR].claim(bss);
7634       setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7635       ctx->at_version.version++;
7636     }
7637   }
7638
7639   // finish and log the op.
7640   if (ctx->user_modify) {
7641     // update the user_version for any modify ops, except for the watch op
7642     ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7643     /* In order for new clients and old clients to interoperate properly
7644      * when exchanging versions, we need to lower bound the user_version
7645      * (which our new clients pay proper attention to)
7646      * by the at_version (which is all the old clients can ever see). */
7647     if (ctx->at_version.version > ctx->user_at_version)
7648       ctx->user_at_version = ctx->at_version.version;
7649     ctx->new_obs.oi.user_version = ctx->user_at_version;
7650   }
7651   ctx->bytes_written = ctx->op_t->get_bytes_written();
7652
7653   if (ctx->new_obs.exists) {
7654     // on the head object
7655     ctx->new_obs.oi.version = ctx->at_version;
7656     ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7657     ctx->new_obs.oi.last_reqid = ctx->reqid;
7658     if (ctx->mtime != utime_t()) {
7659       ctx->new_obs.oi.mtime = ctx->mtime;
7660       dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7661       ctx->new_obs.oi.local_mtime = now;
7662     } else {
7663       dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7664     }
7665
7666     map <string, bufferlist> attrs;
7667     bufferlist bv(sizeof(ctx->new_obs.oi));
7668     ::encode(ctx->new_obs.oi, bv,
7669              get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7670     attrs[OI_ATTR].claim(bv);
7671
7672     if (soid.snap == CEPH_NOSNAP) {
7673       dout(10) << " final snapset " << ctx->new_snapset
7674                << " in " << soid << dendl;
7675       attrs[SS_ATTR].claim(bss);
7676     } else {
7677       dout(10) << " no snapset (this is a clone)" << dendl;
7678     }
7679     ctx->op_t->setattrs(soid, attrs);
7680   } else {
7681     ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7682   }
7683
7684   bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7685     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7686
7687   // append to log
7688   ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7689                                     ctx->obs->oi.version,
7690                                     ctx->user_at_version, ctx->reqid,
7691                                     ctx->mtime, 0));
7692   if (soid.snap < CEPH_NOSNAP) {
7693     switch (log_op_type) {
7694     case pg_log_entry_t::MODIFY:
7695     case pg_log_entry_t::PROMOTE:
7696     case pg_log_entry_t::CLEAN:
7697       if (legacy_snapset) {
7698         dout(20) << __func__ << " encoding legacy_snaps "
7699                  << ctx->new_obs.oi.legacy_snaps
7700                  << dendl;
7701         ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7702       } else {
7703         dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7704                  << dendl;
7705         ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7706       }
7707       break;
7708     default:
7709       break;
7710     }
7711   }
7712
7713   if (!ctx->extra_reqids.empty()) {
7714     dout(20) << __func__ << "  extra_reqids " << ctx->extra_reqids << dendl;
7715     ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7716   }
7717
7718   // apply new object state.
7719   ctx->obc->obs = ctx->new_obs;
7720
7721   if (soid.is_head() && !ctx->obc->obs.exists &&
7722       (!maintain_ssc || ctx->cache_evict)) {
7723     ctx->obc->ssc->exists = false;
7724     ctx->obc->ssc->snapset = SnapSet();
7725   } else {
7726     ctx->obc->ssc->exists = true;
7727     ctx->obc->ssc->snapset = ctx->new_snapset;
7728   }
7729 }
7730
7731 void PrimaryLogPG::apply_stats(
7732   const hobject_t &soid,
7733   const object_stat_sum_t &delta_stats) {
7734
7735   info.stats.stats.add(delta_stats);
7736
7737   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7738        i != backfill_targets.end();
7739        ++i) {
7740     pg_shard_t bt = *i;
7741     pg_info_t& pinfo = peer_info[bt];
7742     if (soid <= pinfo.last_backfill)
7743       pinfo.stats.stats.add(delta_stats);
7744     else if (soid <= last_backfill_started)
7745       pending_backfill_updates[soid].stats.add(delta_stats);
7746   }
7747
7748   if (is_primary() && scrubber.active) {
7749     if (soid < scrubber.start) {
7750       dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7751                << "," << scrubber.end << ")" << dendl;
7752       scrub_cstat.add(delta_stats);
7753     } else {
7754       dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7755                << "," << scrubber.end << ")" << dendl;
7756     }
7757   }
7758 }
7759
7760 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7761 {
7762   const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7763   assert(ctx->async_reads_complete());
7764
7765   for (vector<OSDOp>::iterator p = ctx->ops->begin();
7766     p != ctx->ops->end() && result >= 0; ++p) {
7767     if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7768       result = p->rval;
7769       break;
7770     }
7771     ctx->bytes_read += p->outdata.length();
7772   }
7773   ctx->reply->claim_op_out_data(*ctx->ops);
7774   ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7775
7776   MOSDOpReply *reply = ctx->reply;
7777   ctx->reply = nullptr;
7778
7779   if (result >= 0) {
7780     if (!ctx->ignore_log_op_stats) {
7781       log_op_stats(ctx);
7782       publish_stats_to_osd();
7783     }
7784
7785     // on read, return the current object version
7786     if (ctx->obs) {
7787       reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7788     } else {
7789       reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7790     }
7791   } else if (result == -ENOENT) {
7792     // on ENOENT, set a floor for what the next user version will be.
7793     reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7794   }
7795
7796   reply->set_result(result);
7797   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7798   osd->send_message_osd_client(reply, m->get_connection());
7799   close_op_ctx(ctx);
7800 }
7801
7802 // ========================================================================
7803 // copyfrom
7804
7805 struct C_Copyfrom : public Context {
7806   PrimaryLogPGRef pg;
7807   hobject_t oid;
7808   epoch_t last_peering_reset;
7809   ceph_tid_t tid;
7810   PrimaryLogPG::CopyOpRef cop;
7811   C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7812              const PrimaryLogPG::CopyOpRef& c)
7813     : pg(p), oid(o), last_peering_reset(lpr),
7814       tid(0), cop(c)
7815   {}
7816   void finish(int r) override {
7817     if (r == -ECANCELED)
7818       return;
7819     pg->lock();
7820     if (last_peering_reset == pg->get_last_peering_reset()) {
7821       pg->process_copy_chunk(oid, tid, r);
7822     }
7823     pg->unlock();
7824   }
7825 };
7826
7827 struct C_CopyFrom_AsyncReadCb : public Context {
7828   OSDOp *osd_op;
7829   object_copy_data_t reply_obj;
7830   uint64_t features;
7831   size_t len;
7832   C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7833     osd_op(osd_op), features(features), len(0) {}
7834   void finish(int r) override {
7835     osd_op->rval = r;
7836     if (r < 0) {
7837       return;
7838     }
7839
7840     assert(len > 0);
7841     assert(len <= reply_obj.data.length());
7842     bufferlist bl;
7843     bl.substr_of(reply_obj.data, 0, len);
7844     reply_obj.data.swap(bl);
7845     ::encode(reply_obj, osd_op->outdata, features);
7846   }
7847 };
7848
7849 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7850                               OSDOp& osd_op, ObjectContextRef &obc)
7851 {
7852   object_info_t& oi = obc->obs.oi;
7853   hobject_t& soid = oi.soid;
7854   int result = 0;
7855   object_copy_cursor_t cursor;
7856   uint64_t out_max;
7857   bool skip_data_digest =
7858     (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
7859     g_conf->osd_distrust_data_digest;
7860
7861   try {
7862     ::decode(cursor, bp);
7863     ::decode(out_max, bp);
7864   }
7865   catch (buffer::error& e) {
7866     result = -EINVAL;
7867     return result;
7868   }
7869
7870   const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7871   uint64_t features = op->get_features();
7872
7873   bool async_read_started = false;
7874   object_copy_data_t _reply_obj;
7875   C_CopyFrom_AsyncReadCb *cb = NULL;
7876   if (pool.info.require_rollback()) {
7877     cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7878   }
7879   object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7880   // size, mtime
7881   reply_obj.size = oi.size;
7882   reply_obj.mtime = oi.mtime;
7883   assert(obc->ssc);
7884   if (soid.snap < CEPH_NOSNAP) {
7885     if (obc->ssc->snapset.is_legacy()) {
7886       reply_obj.snaps = oi.legacy_snaps;
7887     } else {
7888       auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7889       assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7890       reply_obj.snaps = p->second;
7891     }
7892   } else {
7893     reply_obj.snap_seq = obc->ssc->snapset.seq;
7894   }
7895   if (!skip_data_digest && oi.is_data_digest()) {
7896     reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7897     reply_obj.data_digest = oi.data_digest;
7898   }
7899   if (oi.is_omap_digest()) {
7900     reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7901     reply_obj.omap_digest = oi.omap_digest;
7902   }
7903   reply_obj.truncate_seq = oi.truncate_seq;
7904   reply_obj.truncate_size = oi.truncate_size;
7905
7906   // attrs
7907   map<string,bufferlist>& out_attrs = reply_obj.attrs;
7908   if (!cursor.attr_complete) {
7909     result = getattrs_maybe_cache(
7910       ctx->obc,
7911       &out_attrs);
7912     if (result < 0) {
7913       if (cb) {
7914         delete cb;
7915       }
7916       return result;
7917     }
7918     cursor.attr_complete = true;
7919     dout(20) << " got attrs" << dendl;
7920   }
7921
7922   int64_t left = out_max - osd_op.outdata.length();
7923
7924   // data
7925   bufferlist& bl = reply_obj.data;
7926   if (left > 0 && !cursor.data_complete) {
7927     if (cursor.data_offset < oi.size) {
7928       uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7929       if (cb) {
7930         async_read_started = true;
7931         ctx->pending_async_reads.push_back(
7932           make_pair(
7933             boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7934             make_pair(&bl, cb)));
7935         cb->len = max_read;
7936
7937         ctx->op_finishers[ctx->current_osd_subop_num].reset(
7938           new ReadFinisher(osd_op));
7939         result = -EINPROGRESS;
7940
7941         dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7942       } else {
7943         result = pgbackend->objects_read_sync(
7944           oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7945         if (result < 0)
7946           return result;
7947       }
7948       left -= max_read;
7949       cursor.data_offset += max_read;
7950     }
7951     if (cursor.data_offset == oi.size) {
7952       cursor.data_complete = true;
7953       dout(20) << " got data" << dendl;
7954     }
7955     assert(cursor.data_offset <= oi.size);
7956   }
7957
7958   // omap
7959   uint32_t omap_keys = 0;
7960   if (!pool.info.supports_omap() || !oi.is_omap()) {
7961     cursor.omap_complete = true;
7962   } else {
7963     if (left > 0 && !cursor.omap_complete) {
7964       assert(cursor.data_complete);
7965       if (cursor.omap_offset.empty()) {
7966         osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7967                                     &reply_obj.omap_header);
7968       }
7969       bufferlist omap_data;
7970       ObjectMap::ObjectMapIterator iter =
7971         osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7972       assert(iter);
7973       iter->upper_bound(cursor.omap_offset);
7974       for (; iter->valid(); iter->next(false)) {
7975         ++omap_keys;
7976         ::encode(iter->key(), omap_data);
7977         ::encode(iter->value(), omap_data);
7978         left -= iter->key().length() + 4 + iter->value().length() + 4;
7979         if (left <= 0)
7980           break;
7981       }
7982       if (omap_keys) {
7983         ::encode(omap_keys, reply_obj.omap_data);
7984         reply_obj.omap_data.claim_append(omap_data);
7985       }
7986       if (iter->valid()) {
7987         cursor.omap_offset = iter->key();
7988       } else {
7989         cursor.omap_complete = true;
7990         dout(20) << " got omap" << dendl;
7991       }
7992     }
7993   }
7994
7995   if (cursor.is_complete()) {
7996     // include reqids only in the final step.  this is a bit fragile
7997     // but it works...
7998     pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7999     dout(20) << " got reqids" << dendl;
8000   }
8001
8002   dout(20) << " cursor.is_complete=" << cursor.is_complete()
8003            << " " << out_attrs.size() << " attrs"
8004            << " " << bl.length() << " bytes"
8005            << " " << reply_obj.omap_header.length() << " omap header bytes"
8006            << " " << reply_obj.omap_data.length() << " omap data bytes in "
8007            << omap_keys << " keys"
8008            << " " << reply_obj.reqids.size() << " reqids"
8009            << dendl;
8010   reply_obj.cursor = cursor;
8011   if (!async_read_started) {
8012     ::encode(reply_obj, osd_op.outdata, features);
8013   }
8014   if (cb && !async_read_started) {
8015     delete cb;
8016   }
8017
8018   if (result > 0) {
8019     result = 0;
8020   }
8021   return result;
8022 }
8023
8024 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8025                                           OSDOp& osd_op)
8026 {
8027   // NOTE: we take non-const ref here for claim_op_out_data below; we must
8028   // be careful not to modify anything else that will upset a racing
8029   // operator<<
8030   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
8031   uint64_t features = m->get_features();
8032   object_copy_data_t reply_obj;
8033
8034   pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
8035   dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8036   ::encode(reply_obj, osd_op.outdata, features);
8037   osd_op.rval = -ENOENT;
8038   MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
8039   reply->claim_op_out_data(m->ops);
8040   reply->set_result(-ENOENT);
8041   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8042   osd->send_message_osd_client(reply, m->get_connection());
8043 }
8044
8045 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8046                               hobject_t src, object_locator_t oloc,
8047                               version_t version, unsigned flags,
8048                               bool mirror_snapset,
8049                               unsigned src_obj_fadvise_flags,
8050                               unsigned dest_obj_fadvise_flags)
8051 {
8052   const hobject_t& dest = obc->obs.oi.soid;
8053   dout(10) << __func__ << " " << dest
8054            << " from " << src << " " << oloc << " v" << version
8055            << " flags " << flags
8056            << (mirror_snapset ? " mirror_snapset" : "")
8057            << dendl;
8058
8059   assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
8060                              src.snap == CEPH_SNAPDIR));
8061
8062   // cancel a previous in-progress copy?
8063   if (copy_ops.count(dest)) {
8064     // FIXME: if the src etc match, we could avoid restarting from the
8065     // beginning.
8066     CopyOpRef cop = copy_ops[dest];
8067     vector<ceph_tid_t> tids;
8068     cancel_copy(cop, false, &tids);
8069     osd->objecter->op_cancel(tids, -ECANCELED);
8070   }
8071
8072   CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8073                            mirror_snapset, src_obj_fadvise_flags,
8074                            dest_obj_fadvise_flags));
8075   copy_ops[dest] = cop;
8076   obc->start_block();
8077
8078   _copy_some(obc, cop);
8079 }
8080
8081 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8082 {
8083   dout(10) << __func__ << " " << obc << " " << cop << dendl;
8084
8085   unsigned flags = 0;
8086   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8087     flags |= CEPH_OSD_FLAG_FLUSH;
8088   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8089     flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8090   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8091     flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8092   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8093     flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8094   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8095     flags |= CEPH_OSD_FLAG_RWORDERED;
8096
8097   C_GatherBuilder gather(cct);
8098
8099   if (cop->cursor.is_initial() && cop->mirror_snapset) {
8100     // list snaps too.
8101     assert(cop->src.snap == CEPH_NOSNAP);
8102     ObjectOperation op;
8103     op.list_snaps(&cop->results.snapset, NULL);
8104     ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8105                                     CEPH_SNAPDIR, NULL,
8106                                     flags, gather.new_sub(), NULL);
8107     cop->objecter_tid2 = tid;
8108   }
8109
8110   ObjectOperation op;
8111   if (cop->results.user_version) {
8112     op.assert_version(cop->results.user_version);
8113   } else {
8114     // we should learn the version after the first chunk, if we didn't know
8115     // it already!
8116     assert(cop->cursor.is_initial());
8117   }
8118   op.copy_get(&cop->cursor, get_copy_chunk_size(),
8119               &cop->results.object_size, &cop->results.mtime,
8120               &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8121               &cop->results.snaps, &cop->results.snap_seq,
8122               &cop->results.flags,
8123               &cop->results.source_data_digest,
8124               &cop->results.source_omap_digest,
8125               &cop->results.reqids,
8126               &cop->results.truncate_seq,
8127               &cop->results.truncate_size,
8128               &cop->rval);
8129   op.set_last_op_flags(cop->src_obj_fadvise_flags);
8130
8131   C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8132                                    get_last_peering_reset(), cop);
8133   gather.set_finisher(new C_OnFinisher(fin,
8134                                        &osd->objecter_finisher));
8135
8136   ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8137                                   cop->src.snap, NULL,
8138                                   flags,
8139                                   gather.new_sub(),
8140                                   // discover the object version if we don't know it yet
8141                                   cop->results.user_version ? NULL : &cop->results.user_version);
8142   fin->tid = tid;
8143   cop->objecter_tid = tid;
8144   gather.activate();
8145 }
8146
8147 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8148 {
8149   vector<ceph_tid_t> tids;
8150   dout(10) << __func__ << " " << oid << " tid " << tid
8151            << " " << cpp_strerror(r) << dendl;
8152   map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8153   if (p == copy_ops.end()) {
8154     dout(10) << __func__ << " no copy_op found" << dendl;
8155     return;
8156   }
8157   CopyOpRef cop = p->second;
8158   if (tid != cop->objecter_tid) {
8159     dout(10) << __func__ << " tid " << tid << " != cop " << cop
8160              << " tid " << cop->objecter_tid << dendl;
8161     return;
8162   }
8163
8164   if (cop->omap_data.length() || cop->omap_header.length())
8165     cop->results.has_omap = true;
8166
8167   if (r >= 0 && !pool.info.supports_omap() &&
8168       (cop->omap_data.length() || cop->omap_header.length())) {
8169     r = -EOPNOTSUPP;
8170   }
8171   cop->objecter_tid = 0;
8172   cop->objecter_tid2 = 0;  // assume this ordered before us (if it happened)
8173   ObjectContextRef& cobc = cop->obc;
8174
8175   if (r < 0)
8176     goto out;
8177
8178   assert(cop->rval >= 0);
8179
8180   if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8181     // verify snap hasn't been deleted
8182     vector<snapid_t>::iterator p = cop->results.snaps.begin();
8183     while (p != cop->results.snaps.end()) {
8184       if (pool.info.is_removed_snap(*p)) {
8185         dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8186                  << dendl;
8187         for (vector<snapid_t>::iterator q = p + 1;
8188              q != cop->results.snaps.end();
8189              ++q)
8190           *(q - 1) = *q;
8191         cop->results.snaps.resize(cop->results.snaps.size() - 1);
8192       } else {
8193         ++p;
8194       }
8195     }
8196     if (cop->results.snaps.empty()) {
8197       dout(10) << __func__ << " no more snaps for " << oid << dendl;
8198       r = -ENOENT;
8199       goto out;
8200     }
8201   }
8202
8203   assert(cop->rval >= 0);
8204
8205   if (!cop->temp_cursor.data_complete) {
8206     cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8207   }
8208   if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8209     if (cop->omap_header.length()) {
8210       cop->results.omap_digest =
8211         cop->omap_header.crc32c(cop->results.omap_digest);
8212     }
8213     if (cop->omap_data.length()) {
8214       bufferlist keys;
8215       keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8216       cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8217     }
8218   }
8219
8220   if (!cop->temp_cursor.attr_complete) {
8221     for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8222          p != cop->attrs.end();
8223          ++p) {
8224       cop->results.attrs[string("_") + p->first] = p->second;
8225     }
8226     cop->attrs.clear();
8227   }
8228
8229   if (!cop->cursor.is_complete()) {
8230     // write out what we have so far
8231     if (cop->temp_cursor.is_initial()) {
8232       assert(!cop->results.started_temp_obj);
8233       cop->results.started_temp_obj = true;
8234       cop->results.temp_oid = generate_temp_object(oid);
8235       dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8236     }
8237     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8238     OpContextUPtr ctx = simple_opc_create(tempobc);
8239     if (cop->temp_cursor.is_initial()) {
8240       ctx->new_temp_oid = cop->results.temp_oid;
8241     }
8242     _write_copy_chunk(cop, ctx->op_t.get());
8243     simple_opc_submit(std::move(ctx));
8244     dout(10) << __func__ << " fetching more" << dendl;
8245     _copy_some(cobc, cop);
8246     return;
8247   }
8248
8249   // verify digests?
8250   if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8251     dout(20) << __func__ << std::hex
8252       << " got digest: rx data 0x" << cop->results.data_digest
8253       << " omap 0x" << cop->results.omap_digest
8254       << ", source: data 0x" << cop->results.source_data_digest
8255       << " omap 0x" <<  cop->results.source_omap_digest
8256       << std::dec
8257       << " flags " << cop->results.flags
8258       << dendl;
8259   }
8260   if (cop->results.is_data_digest() &&
8261       cop->results.data_digest != cop->results.source_data_digest) {
8262     derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8263          << " != source 0x" << cop->results.source_data_digest << std::dec
8264          << dendl;
8265     osd->clog->error() << info.pgid << " copy from " << cop->src
8266                        << " to " << cop->obc->obs.oi.soid << std::hex
8267                        << " data digest 0x" << cop->results.data_digest
8268                        << " != source 0x" << cop->results.source_data_digest
8269                        << std::dec;
8270     r = -EIO;
8271     goto out;
8272   }
8273   if (cop->results.is_omap_digest() &&
8274       cop->results.omap_digest != cop->results.source_omap_digest) {
8275     derr << __func__ << std::hex
8276          << " omap digest 0x" << cop->results.omap_digest
8277          << " != source 0x" << cop->results.source_omap_digest
8278          << std::dec << dendl;
8279     osd->clog->error() << info.pgid << " copy from " << cop->src
8280                        << " to " << cop->obc->obs.oi.soid << std::hex
8281                        << " omap digest 0x" << cop->results.omap_digest
8282                        << " != source 0x" << cop->results.source_omap_digest
8283                        << std::dec;
8284     r = -EIO;
8285     goto out;
8286   }
8287   if (cct->_conf->osd_debug_inject_copyfrom_error) {
8288     derr << __func__ << " injecting copyfrom failure" << dendl;
8289     r = -EIO;
8290     goto out;
8291   }
8292
8293   cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8294     [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8295       ObjectState& obs = cop->obc->obs;
8296       if (cop->temp_cursor.is_initial()) {
8297         dout(20) << "fill_in_final_tx: writing "
8298                  << "directly to final object" << dendl;
8299         // write directly to final object
8300         cop->results.temp_oid = obs.oi.soid;
8301         _write_copy_chunk(cop, t);
8302       } else {
8303         // finish writing to temp object, then move into place
8304         dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8305         _write_copy_chunk(cop, t);
8306         t->rename(obs.oi.soid, cop->results.temp_oid);
8307       }
8308       t->setattrs(obs.oi.soid, cop->results.attrs);
8309     });
8310
8311   dout(20) << __func__ << " success; committing" << dendl;
8312
8313  out:
8314   dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8315   CopyCallbackResults results(r, &cop->results);
8316   cop->cb->complete(results);
8317
8318   copy_ops.erase(cobc->obs.oi.soid);
8319   cobc->stop_block();
8320
8321   if (r < 0 && cop->results.started_temp_obj) {
8322     dout(10) << __func__ << " deleting partial temp object "
8323              << cop->results.temp_oid << dendl;
8324     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8325     OpContextUPtr ctx = simple_opc_create(tempobc);
8326     ctx->op_t->remove(cop->results.temp_oid);
8327     ctx->discard_temp_oid = cop->results.temp_oid;
8328     simple_opc_submit(std::move(ctx));
8329   }
8330
8331   // cancel and requeue proxy ops on this object
8332   if (!r) {
8333     for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8334         it != proxyread_ops.end();) {
8335       if (it->second->soid == cobc->obs.oi.soid) {
8336         cancel_proxy_read((it++)->second, &tids);
8337       } else {
8338         ++it;
8339       }
8340     }
8341     for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8342          it != proxywrite_ops.end();) {
8343       if (it->second->soid == cobc->obs.oi.soid) {
8344         cancel_proxy_write((it++)->second, &tids);
8345       } else {
8346         ++it;
8347       }
8348     }
8349     osd->objecter->op_cancel(tids, -ECANCELED);
8350     kick_proxy_ops_blocked(cobc->obs.oi.soid);
8351   }
8352
8353   kick_object_context_blocked(cobc);
8354 }
8355
8356 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
8357   vector<ceph_tid_t> tids;
8358   for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8359       it != proxyread_ops.end();) {
8360     if (it->second->soid == oid) {
8361       cancel_proxy_read((it++)->second, &tids);
8362     } else {
8363       ++it;
8364     }
8365   }
8366   for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8367        it != proxywrite_ops.end();) {
8368     if (it->second->soid == oid) {
8369       cancel_proxy_write((it++)->second, &tids);
8370     } else {
8371       ++it;
8372     }
8373   }
8374   osd->objecter->op_cancel(tids, -ECANCELED);
8375   kick_proxy_ops_blocked(oid);
8376 }
8377
8378 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8379 {
8380   dout(20) << __func__ << " " << cop
8381            << " " << cop->attrs.size() << " attrs"
8382            << " " << cop->data.length() << " bytes"
8383            << " " << cop->omap_header.length() << " omap header bytes"
8384            << " " << cop->omap_data.length() << " omap data bytes"
8385            << dendl;
8386   if (!cop->temp_cursor.attr_complete) {
8387     t->create(cop->results.temp_oid);
8388   }
8389   if (!cop->temp_cursor.data_complete) {
8390     assert(cop->data.length() + cop->temp_cursor.data_offset ==
8391            cop->cursor.data_offset);
8392     if (pool.info.requires_aligned_append() &&
8393         !cop->cursor.data_complete) {
8394       /**
8395        * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8396        * to pick it up on the next pass.
8397        */
8398       assert(cop->temp_cursor.data_offset %
8399              pool.info.required_alignment() == 0);
8400       if (cop->data.length() % pool.info.required_alignment() != 0) {
8401         uint64_t to_trim =
8402           cop->data.length() % pool.info.required_alignment();
8403         bufferlist bl;
8404         bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8405         cop->data.swap(bl);
8406         cop->cursor.data_offset -= to_trim;
8407         assert(cop->data.length() + cop->temp_cursor.data_offset ==
8408                cop->cursor.data_offset);
8409       }
8410     }
8411     if (cop->data.length()) {
8412       t->write(
8413         cop->results.temp_oid,
8414         cop->temp_cursor.data_offset,
8415         cop->data.length(),
8416         cop->data,
8417         cop->dest_obj_fadvise_flags);
8418     }
8419     cop->data.clear();
8420   }
8421   if (pool.info.supports_omap()) {
8422     if (!cop->temp_cursor.omap_complete) {
8423       if (cop->omap_header.length()) {
8424         t->omap_setheader(
8425           cop->results.temp_oid,
8426           cop->omap_header);
8427         cop->omap_header.clear();
8428       }
8429       if (cop->omap_data.length()) {
8430         map<string,bufferlist> omap;
8431         bufferlist::iterator p = cop->omap_data.begin();
8432         ::decode(omap, p);
8433         t->omap_setkeys(cop->results.temp_oid, omap);
8434         cop->omap_data.clear();
8435       }
8436     }
8437   } else {
8438     assert(cop->omap_header.length() == 0);
8439     assert(cop->omap_data.length() == 0);
8440   }
8441   cop->temp_cursor = cop->cursor;
8442 }
8443
8444 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8445 {
8446   OpContext *ctx = cb->ctx;
8447   dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8448
8449   ObjectState& obs = ctx->new_obs;
8450   if (obs.exists) {
8451     dout(20) << __func__ << ": exists, removing" << dendl;
8452     ctx->op_t->remove(obs.oi.soid);
8453   } else {
8454     ctx->delta_stats.num_objects++;
8455     obs.exists = true;
8456   }
8457   if (cb->is_temp_obj_used()) {
8458     ctx->discard_temp_oid = cb->results->temp_oid;
8459   }
8460   cb->results->fill_in_final_tx(ctx->op_t.get());
8461
8462   // CopyFromCallback fills this in for us
8463   obs.oi.user_version = ctx->user_at_version;
8464
8465   if (cb->results->is_data_digest()) {
8466     obs.oi.set_data_digest(cb->results->data_digest);
8467   } else {
8468     obs.oi.clear_data_digest();
8469   }
8470   if (cb->results->is_omap_digest()) {
8471     obs.oi.set_omap_digest(cb->results->omap_digest);
8472   } else {
8473     obs.oi.clear_omap_digest();
8474   }
8475
8476   obs.oi.truncate_seq = cb->results->truncate_seq;
8477   obs.oi.truncate_size = cb->results->truncate_size;
8478
8479   ctx->extra_reqids = cb->results->reqids;
8480
8481   // cache: clear whiteout?
8482   if (obs.oi.is_whiteout()) {
8483     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8484     obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8485     --ctx->delta_stats.num_whiteouts;
8486   }
8487
8488   if (cb->results->has_omap) {
8489     dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8490     obs.oi.set_flag(object_info_t::FLAG_OMAP);
8491   } else {
8492     dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8493     obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8494   }
8495
8496   interval_set<uint64_t> ch;
8497   if (obs.oi.size > 0)
8498     ch.insert(0, obs.oi.size);
8499   ctx->modified_ranges.union_of(ch);
8500
8501   if (cb->get_data_size() != obs.oi.size) {
8502     ctx->delta_stats.num_bytes -= obs.oi.size;
8503     obs.oi.size = cb->get_data_size();
8504     ctx->delta_stats.num_bytes += obs.oi.size;
8505   }
8506   ctx->delta_stats.num_wr++;
8507   ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8508
8509   osd->logger->inc(l_osd_copyfrom);
8510 }
8511
8512 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8513                                   ObjectContextRef obc)
8514 {
8515   const hobject_t& soid = obc->obs.oi.soid;
8516   dout(10) << __func__ << " " << soid << " r=" << r
8517            << " uv" << results->user_version << dendl;
8518
8519   if (r == -ECANCELED) {
8520     return;
8521   }
8522
8523   if (r != -ENOENT && soid.is_snap()) {
8524     if (results->snaps.empty()) {
8525       // we must have read "snap" content from the head object in
8526       // the base pool.  use snap_seq to construct what snaps should
8527       // be for this clone (what is was before we evicted the clean
8528       // clone from this pool, and what it will be when we flush and
8529       // the clone eventually happens in the base pool).
8530       SnapSet& snapset = obc->ssc->snapset;
8531       vector<snapid_t>::iterator p = snapset.snaps.begin();
8532       while (p != snapset.snaps.end() && *p > soid.snap)
8533         ++p;
8534       while (p != snapset.snaps.end() && *p > results->snap_seq) {
8535         results->snaps.push_back(*p);
8536         ++p;
8537       }
8538     }
8539
8540     dout(20) << __func__ << " snaps " << results->snaps << dendl;
8541     filter_snapc(results->snaps);
8542
8543     dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8544     if (results->snaps.empty()) {
8545       dout(20) << __func__
8546                << " snaps are empty, clone is invalid,"
8547                << " setting r to ENOENT" << dendl;
8548       r = -ENOENT;
8549     }
8550   }
8551
8552   if (r < 0 && results->started_temp_obj) {
8553     dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8554     ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8555     assert(tempobc);
8556     OpContextUPtr ctx = simple_opc_create(tempobc);
8557     ctx->op_t->remove(results->temp_oid);
8558     simple_opc_submit(std::move(ctx));
8559     results->started_temp_obj = false;
8560   }
8561
8562   if (r == -ENOENT && soid.is_snap()) {
8563     dout(10) << __func__
8564              << ": enoent while trying to promote clone, " << soid
8565              << " must have been trimmed, removing from snapset"
8566              << dendl;
8567     hobject_t head(soid.get_head());
8568     ObjectContextRef obc = get_object_context(head, false);
8569     assert(obc);
8570
8571     OpContextUPtr tctx = simple_opc_create(obc);
8572     tctx->at_version = get_next_version();
8573     filter_snapc(tctx->new_snapset.snaps);
8574     vector<snapid_t> new_clones;
8575     map<snapid_t, vector<snapid_t>> new_clone_snaps;
8576     for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8577          i != tctx->new_snapset.clones.end();
8578          ++i) {
8579       if (*i != soid.snap) {
8580         new_clones.push_back(*i);
8581         auto p = tctx->new_snapset.clone_snaps.find(*i);
8582         if (p != tctx->new_snapset.clone_snaps.end()) {
8583           new_clone_snaps[*i] = p->second;
8584         }
8585       }
8586     }
8587     tctx->new_snapset.clones.swap(new_clones);
8588     tctx->new_snapset.clone_overlap.erase(soid.snap);
8589     tctx->new_snapset.clone_size.erase(soid.snap);
8590     tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8591
8592     // take RWWRITE lock for duration of our local write.  ignore starvation.
8593     if (!tctx->lock_manager.take_write_lock(
8594           head,
8595           obc)) {
8596       assert(0 == "problem!");
8597     }
8598     dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8599
8600     finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8601
8602     simple_opc_submit(std::move(tctx));
8603     return;
8604   }
8605
8606   bool whiteout = false;
8607   if (r == -ENOENT) {
8608     assert(soid.snap == CEPH_NOSNAP); // snap case is above
8609     dout(10) << __func__ << " whiteout " << soid << dendl;
8610     whiteout = true;
8611   }
8612
8613   if (r < 0 && !whiteout) {
8614     derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8615     // pass error to everyone blocked on this object
8616     // FIXME: this is pretty sloppy, but at this point we got
8617     // something unexpected and don't have many other options.
8618     map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8619       waiting_for_blocked_object.find(soid);
8620     if (blocked_iter != waiting_for_blocked_object.end()) {
8621       while (!blocked_iter->second.empty()) {
8622         osd->reply_op_error(blocked_iter->second.front(), r);
8623         blocked_iter->second.pop_front();
8624       }
8625       waiting_for_blocked_object.erase(blocked_iter);
8626     }
8627     return;
8628   }
8629
8630   osd->promote_finish(results->object_size);
8631
8632   OpContextUPtr tctx =  simple_opc_create(obc);
8633   tctx->at_version = get_next_version();
8634
8635   ++tctx->delta_stats.num_objects;
8636   if (soid.snap < CEPH_NOSNAP)
8637     ++tctx->delta_stats.num_object_clones;
8638   tctx->new_obs.exists = true;
8639
8640   tctx->extra_reqids = results->reqids;
8641
8642   bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8643     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8644
8645   if (whiteout) {
8646     // create a whiteout
8647     tctx->op_t->create(soid);
8648     tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8649     ++tctx->delta_stats.num_whiteouts;
8650     dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8651     osd->logger->inc(l_osd_tier_whiteout);
8652   } else {
8653     if (results->has_omap) {
8654       dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8655       tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8656       ++tctx->delta_stats.num_objects_omap;
8657     }
8658
8659     results->fill_in_final_tx(tctx->op_t.get());
8660     if (results->started_temp_obj) {
8661       tctx->discard_temp_oid = results->temp_oid;
8662     }
8663     tctx->new_obs.oi.size = results->object_size;
8664     tctx->new_obs.oi.user_version = results->user_version;
8665     if (results->is_data_digest()) {
8666       tctx->new_obs.oi.set_data_digest(results->data_digest);
8667     } else {
8668       tctx->new_obs.oi.clear_data_digest();
8669     }
8670     if (results->is_omap_digest()) {
8671       tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8672     } else {
8673       tctx->new_obs.oi.clear_omap_digest();
8674     }
8675     tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8676     tctx->new_obs.oi.truncate_size = results->truncate_size;
8677
8678     if (soid.snap != CEPH_NOSNAP) {
8679       if (legacy_snapset) {
8680         tctx->new_obs.oi.legacy_snaps = results->snaps;
8681         assert(!tctx->new_obs.oi.legacy_snaps.empty());
8682       } else {
8683         // it's already in the snapset
8684         assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8685       }
8686       assert(obc->ssc->snapset.clone_size.count(soid.snap));
8687       assert(obc->ssc->snapset.clone_size[soid.snap] ==
8688              results->object_size);
8689       assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8690
8691       tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8692     } else {
8693       tctx->delta_stats.num_bytes += results->object_size;
8694     }
8695   }
8696
8697   if (results->mirror_snapset) {
8698     assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8699     tctx->new_snapset.from_snap_set(
8700       results->snapset,
8701       get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8702   }
8703   tctx->new_snapset.head_exists = true;
8704   dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8705
8706   // take RWWRITE lock for duration of our local write.  ignore starvation.
8707   if (!tctx->lock_manager.take_write_lock(
8708         obc->obs.oi.soid,
8709         obc)) {
8710     assert(0 == "problem!");
8711   }
8712   dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8713
8714   finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8715
8716   simple_opc_submit(std::move(tctx));
8717
8718   osd->logger->inc(l_osd_tier_promote);
8719
8720   if (agent_state &&
8721       agent_state->is_idle())
8722     agent_choose_mode();
8723 }
8724
8725 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
8726                                vector<ceph_tid_t> *tids)
8727 {
8728   dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8729            << " from " << cop->src << " " << cop->oloc
8730            << " v" << cop->results.user_version << dendl;
8731
8732   // cancel objecter op, if we can
8733   if (cop->objecter_tid) {
8734     tids->push_back(cop->objecter_tid);
8735     cop->objecter_tid = 0;
8736     if (cop->objecter_tid2) {
8737       tids->push_back(cop->objecter_tid2);
8738       cop->objecter_tid2 = 0;
8739     }
8740   }
8741
8742   copy_ops.erase(cop->obc->obs.oi.soid);
8743   cop->obc->stop_block();
8744
8745   kick_object_context_blocked(cop->obc);
8746   cop->results.should_requeue = requeue;
8747   CopyCallbackResults result(-ECANCELED, &cop->results);
8748   cop->cb->complete(result);
8749
8750   // There may still be an objecter callback referencing this copy op.
8751   // That callback will not need the obc since it's been canceled, and
8752   // we need the obc reference to go away prior to flush.
8753   cop->obc = ObjectContextRef();
8754 }
8755
8756 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
8757 {
8758   dout(10) << __func__ << dendl;
8759   map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8760   while (p != copy_ops.end()) {
8761     // requeue this op? can I queue up all of them?
8762     cancel_copy((p++)->second, requeue, tids);
8763   }
8764 }
8765
8766
8767 // ========================================================================
8768 // flush
8769 //
8770 // Flush a dirty object in the cache tier by writing it back to the
8771 // base tier.  The sequence looks like:
8772 //
8773 //  * send a copy-from operation to the base tier to copy the current
8774 //    version of the object
8775 //  * base tier will pull the object via (perhaps multiple) copy-get(s)
8776 //  * on completion, we check if the object has been modified.  if so,
8777 //    just reply with -EAGAIN.
8778 //  * try to take a write lock so we can clear the dirty flag.  if this
8779 //    fails, wait and retry
8780 //  * start a repop that clears the bit.
8781 //
8782 // If we have to wait, we will retry by coming back through the
8783 // start_flush method.  We check if a flush is already in progress
8784 // and, if so, try to finish it by rechecking the version and trying
8785 // to clear the dirty bit.
8786 //
8787 // In order for the cache-flush (a write op) to not block the copy-get
8788 // from reading the object, the client *must* set the SKIPRWLOCKS
8789 // flag.
8790 //
8791 // NOTE: normally writes are strictly ordered for the client, but
8792 // flushes are special in that they can be reordered with respect to
8793 // other writes.  In particular, we can't have a flush request block
8794 // an update to the cache pool object!
8795
8796 struct C_Flush : public Context {
8797   PrimaryLogPGRef pg;
8798   hobject_t oid;
8799   epoch_t last_peering_reset;
8800   ceph_tid_t tid;
8801   utime_t start;
8802   C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8803     : pg(p), oid(o), last_peering_reset(lpr),
8804       tid(0), start(ceph_clock_now())
8805   {}
8806   void finish(int r) override {
8807     if (r == -ECANCELED)
8808       return;
8809     pg->lock();
8810     if (last_peering_reset == pg->get_last_peering_reset()) {
8811       pg->finish_flush(oid, tid, r);
8812       pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8813     }
8814     pg->unlock();
8815   }
8816 };
8817
// Start (or join, or retry) a flush of a cache-tier object to the base tier.
//
// @param op        client op requesting the flush; may be empty
// @param obc       object context of the object to flush
// @param blocking  if true, obc->start_block() is called while the flush
//                  is in flight
// @param pmissing  optional out: set to the next older clone when that
//                  clone is missing
// @param on_flush  optional callback stored on the FlushOp
//
// @return -ENOENT      the next older clone is missing (*pmissing set)
//         -EBUSY       the next older clone is still dirty
//         -EAGAIN      piggybacked on an existing compatible flush
//         -EINPROGRESS a new flush was submitted to the base tier
//         otherwise    the result of try_flush_mark_clean() when the same
//                      op is retrying an existing flush
8818 int PrimaryLogPG::start_flush(
8819   OpRequestRef op, ObjectContextRef obc,
8820   bool blocking, hobject_t *pmissing,
8821   boost::optional<std::function<void()>> &&on_flush)
8822 {
8823   const object_info_t& oi = obc->obs.oi;
8824   const hobject_t& soid = oi.soid;
8825   dout(10) << __func__ << " " << soid
8826            << " v" << oi.version
8827            << " uv" << oi.user_version
8828            << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8829            << dendl;
8830
8831   // get a filtered snapset, need to remove removed snaps
8832   SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8833
8834   // verify that the next older clone (if any) is neither missing nor dirty
8835   {
8836     dout(20) << " snapset " << snapset << dendl;
       // walk the clones from the back, skipping every clone >= soid.snap
8837     vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8838     while (p != snapset.clones.rend() && *p >= soid.snap)
8839       ++p;
8840     if (p != snapset.clones.rend()) {
8841       hobject_t next = soid;
8842       next.snap = *p;
8843       assert(next.snap < soid.snap);
8844       if (pg_log.get_missing().is_missing(next)) {
8845         dout(10) << __func__ << " missing clone is " << next << dendl;
8846         if (pmissing)
8847           *pmissing = next;
8848         return -ENOENT;
8849       }
8850       ObjectContextRef older_obc = get_object_context(next, false);
8851       if (older_obc) {
8852         dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8853                  << dendl;
8854         if (older_obc->obs.oi.is_dirty()) {
8855           dout(10) << __func__ << " next oldest clone is dirty: "
8856                    << older_obc->obs.oi << dendl;
8857           return -EBUSY;
8858         }
8859       } else {
8860         dout(20) << __func__ << " next oldest clone " << next
8861                  << " is not present; implicitly clean" << dendl;
8862       }
8863     } else {
8864       dout(20) << __func__ << " no older clones" << dendl;
8865     }
8866   }
8867
8868   if (blocking)
8869     obc->start_block();
8870
     // is there already a flush in flight for this object?
8871   map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8872   if (p != flush_ops.end()) {
8873     FlushOpRef fop = p->second;
8874     if (fop->op == op) {
8875       // we couldn't take the write lock on a cache-try-flush before;
8876       // now we are trying again for the lock.
8877       return try_flush_mark_clean(fop);
8878     }
8879     if (fop->flushed_version == obc->obs.oi.user_version &&
8880         (fop->blocking || !blocking)) {
8881       // nonblocking can join anything
8882       // blocking can only join a blocking flush
8883       dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8884       if (op)
8885         fop->dup_ops.push_back(op);
8886       return -EAGAIN;   // clean up this ctx; op will retry later
8887     }
8888
8889     // cancel current flush since it will fail anyway, or because we
8890     // are blocking and the existing flush is nonblocking.
8891     dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8892     if (fop->op)
8893       osd->reply_op_error(fop->op, -EBUSY);
8894     while (!fop->dup_ops.empty()) {
8895       osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8896       fop->dup_ops.pop_front();
8897     }
8898     vector<ceph_tid_t> tids;
8899     cancel_flush(fop, false, &tids);
8900     osd->objecter->op_cancel(tids, -ECANCELED);
8901   }
8902
8903   /**
8904    * In general, we need to send a delete and a copyfrom.
8905    * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8906    * where 4 is marked as clean.  To flush 10, we have to:
8907    * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8908    * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8909    *
8910    * There is a complicating case.  Suppose there had been a clone 7
8911    * for snaps [7, 6] which has been trimmed since they no longer exist.
8912    * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head.  When we submit
8913    * the delete, the snap will be promoted to 5, and the head will become
8914    * a snapdir.  When the copy-from goes through, we'll end up with
8915    * 8:[8,4,3,2]:[4(4,3,2)]+head.
8916    *
8917    * Another complication is the case where there is an interval change
8918    * after doing the delete and the flush but before marking the object
8919    * clean.  We'll happily delete head and then recreate it at the same
8920    * sequence number, which works out ok.
8921    */
8922
     // snapc:  snap context for the copy-from/remove of this object
     // dsnapc: snap context for the preceding delete (as of the previous clone)
8923   SnapContext snapc, dsnapc;
8924   if (snapset.seq != 0) {
8925     if (soid.snap == CEPH_NOSNAP) {
8926       snapc.seq = snapset.seq;
8927       snapc.snaps = snapset.snaps;
8928     } else {
8929       snapid_t min_included_snap;
8930       if (snapset.is_legacy()) {
8931         min_included_snap = oi.legacy_snaps.back();
8932       } else {
8933         auto p = snapset.clone_snaps.find(soid.snap);
8934         assert(p != snapset.clone_snaps.end());
8935         min_included_snap = p->second.back();
8936       }
8937       snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8938     }
8939
       // find the newest clone that is older than the object being flushed
8940     snapid_t prev_snapc = 0;
8941     for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8942          citer != snapset.clones.rend();
8943          ++citer) {
8944       if (*citer < soid.snap) {
8945         prev_snapc = *citer;
8946         break;
8947       }
8948     }
8949
8950     dsnapc = snapset.get_ssc_as_of(prev_snapc);
8951   }
8952
     // target the same object name in the base pool (pool.info.tier_of)
8953   object_locator_t base_oloc(soid);
8954   base_oloc.pool = pool.info.tier_of;
8955
     // step 1 from the comment above: the delete, only when dsnapc is older
8956   if (dsnapc.seq < snapc.seq) {
8957     ObjectOperation o;
8958     o.remove();
8959     osd->objecter->mutate(
8960       soid.oid,
8961       base_oloc,
8962       o,
8963       dsnapc,
8964       ceph::real_clock::from_ceph_timespec(oi.mtime),
8965       (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8966        CEPH_OSD_FLAG_ENFORCE_SNAPC),
8967       NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8968   }
8969
8970   FlushOpRef fop(std::make_shared<FlushOp>());
8971   fop->obc = obc;
8972   fop->flushed_version = oi.user_version;
8973   fop->blocking = blocking;
8974   fop->on_flush = std::move(on_flush);
8975   fop->op = op;
8976
     // step 2: a whiteout is flushed as a remove, anything else as a copy-from
8977   ObjectOperation o;
8978   if (oi.is_whiteout()) {
8979     fop->removal = true;
8980     o.remove();
8981   } else {
8982     object_locator_t oloc(soid);
8983     o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8984                 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8985                 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8986                 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8987                 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8988                 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8989
8990     // tell the base tier not to cache the data after this op
8991     if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8992       o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8993   }
8994   C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8995
8996   ceph_tid_t tid = osd->objecter->mutate(
8997     soid.oid, base_oloc, o, snapc,
8998     ceph::real_clock::from_ceph_timespec(oi.mtime),
8999     CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
9000     new C_OnFinisher(fin,
9001                      &osd->objecter_finisher));
9002   /* we're under the pg lock and fin->finish() is grabbing that */
9003   fin->tid = tid;
9004   fop->objecter_tid = tid;
9005
     // track the flush and account for it in the PG stats
9006   flush_ops[soid] = fop;
9007   info.stats.stats.sum.num_flush++;
9008   info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
9009   return -EINPROGRESS;
9010 }
9011
9012 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
9013 {
9014   dout(10) << __func__ << " " << oid << " tid " << tid
9015            << " " << cpp_strerror(r) << dendl;
9016   map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
9017   if (p == flush_ops.end()) {
9018     dout(10) << __func__ << " no flush_op found" << dendl;
9019     return;
9020   }
9021   FlushOpRef fop = p->second;
9022   if (tid != fop->objecter_tid) {
9023     dout(10) << __func__ << " tid " << tid << " != fop " << fop
9024              << " tid " << fop->objecter_tid << dendl;
9025     return;
9026   }
9027   ObjectContextRef obc = fop->obc;
9028   fop->objecter_tid = 0;
9029
9030   if (r < 0 && !(r == -ENOENT && fop->removal)) {
9031     if (fop->op)
9032       osd->reply_op_error(fop->op, -EBUSY);
9033     if (fop->blocking) {
9034       obc->stop_block();
9035       kick_object_context_blocked(obc);
9036     }
9037
9038     if (!fop->dup_ops.empty()) {
9039       dout(20) << __func__ << " requeueing dups" << dendl;
9040       requeue_ops(fop->dup_ops);
9041     }
9042     if (fop->on_flush) {
9043       (*(fop->on_flush))();
9044       fop->on_flush = boost::none;
9045     }
9046     flush_ops.erase(oid);
9047     return;
9048   }
9049
9050   r = try_flush_mark_clean(fop);
9051   if (r == -EBUSY && fop->op) {
9052     osd->reply_op_error(fop->op, r);
9053   }
9054 }
9055
9056 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
9057 {
9058   ObjectContextRef obc = fop->obc;
9059   const hobject_t& oid = obc->obs.oi.soid;
9060
9061   if (fop->blocking) {
9062     obc->stop_block();
9063     kick_object_context_blocked(obc);
9064   }
9065
9066   if (fop->flushed_version != obc->obs.oi.user_version ||
9067       !obc->obs.exists) {
9068     if (obc->obs.exists)
9069       dout(10) << __func__ << " flushed_version " << fop->flushed_version
9070                << " != current " << obc->obs.oi.user_version
9071                << dendl;
9072     else
9073       dout(10) << __func__ << " object no longer exists" << dendl;
9074
9075     if (!fop->dup_ops.empty()) {
9076       dout(20) << __func__ << " requeueing dups" << dendl;
9077       requeue_ops(fop->dup_ops);
9078     }
9079     if (fop->on_flush) {
9080       (*(fop->on_flush))();
9081       fop->on_flush = boost::none;
9082     }
9083     flush_ops.erase(oid);
9084     if (fop->blocking)
9085       osd->logger->inc(l_osd_tier_flush_fail);
9086     else
9087       osd->logger->inc(l_osd_tier_try_flush_fail);
9088     return -EBUSY;
9089   }
9090
9091   if (!fop->blocking &&
9092       write_blocked_by_scrub(oid)) {
9093     if (fop->op) {
9094       dout(10) << __func__ << " blocked by scrub" << dendl;
9095       requeue_op(fop->op);
9096       requeue_ops(fop->dup_ops);
9097       return -EAGAIN;    // will retry
9098     } else {
9099       osd->logger->inc(l_osd_tier_try_flush_fail);
9100       vector<ceph_tid_t> tids;
9101       cancel_flush(fop, false, &tids);
9102       osd->objecter->op_cancel(tids, -ECANCELED);
9103       return -ECANCELED;
9104     }
9105   }
9106
9107   // successfully flushed, can we evict this object?
9108   if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9109       agent_maybe_evict(obc, true)) {
9110     osd->logger->inc(l_osd_tier_clean);
9111     if (fop->on_flush) {
9112       (*(fop->on_flush))();
9113       fop->on_flush = boost::none;
9114     }
9115     flush_ops.erase(oid);
9116     return 0;
9117   }
9118
9119   dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9120   OpContextUPtr ctx = simple_opc_create(fop->obc);
9121
9122   // successfully flushed; can we clear the dirty bit?
9123   // try to take the lock manually, since we don't
9124   // have a ctx yet.
9125   if (ctx->lock_manager.get_lock_type(
9126         ObjectContext::RWState::RWWRITE,
9127         oid,
9128         obc,
9129         fop->op)) {
9130     dout(20) << __func__ << " took write lock" << dendl;
9131   } else if (fop->op) {
9132     dout(10) << __func__ << " waiting on write lock " << fop->op << " "
9133              << fop->dup_ops << dendl;
9134     close_op_ctx(ctx.release());
9135     // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
9136     for (auto op : fop->dup_ops) {
9137       bool locked = ctx->lock_manager.get_lock_type(
9138         ObjectContext::RWState::RWWRITE,
9139         oid,
9140         obc,
9141         op);
9142       assert(!locked);
9143     }
9144     return -EAGAIN;    // will retry
9145   } else {
9146     dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9147     close_op_ctx(ctx.release());
9148     osd->logger->inc(l_osd_tier_try_flush_fail);
9149     vector<ceph_tid_t> tids;
9150     cancel_flush(fop, false, &tids);
9151     osd->objecter->op_cancel(tids, -ECANCELED);
9152     return -ECANCELED;
9153   }
9154
9155   if (fop->on_flush) {
9156     ctx->register_on_finish(*(fop->on_flush));
9157     fop->on_flush = boost::none;
9158   }
9159
9160   ctx->at_version = get_next_version();
9161
9162   ctx->new_obs = obc->obs;
9163   ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9164   --ctx->delta_stats.num_objects_dirty;
9165
9166   finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9167
9168   osd->logger->inc(l_osd_tier_clean);
9169
9170   if (!fop->dup_ops.empty() || fop->op) {
9171     dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9172     list<OpRequestRef> ls;
9173     if (fop->op)
9174       ls.push_back(fop->op);
9175     ls.splice(ls.end(), fop->dup_ops);
9176     requeue_ops(ls);
9177   }
9178
9179   simple_opc_submit(std::move(ctx));
9180
9181   flush_ops.erase(oid);
9182
9183   if (fop->blocking)
9184     osd->logger->inc(l_osd_tier_flush);
9185   else
9186     osd->logger->inc(l_osd_tier_try_flush);
9187
9188   return -EINPROGRESS;
9189 }
9190
9191 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
9192                                 vector<ceph_tid_t> *tids)
9193 {
9194   dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9195            << fop->objecter_tid << dendl;
9196   if (fop->objecter_tid) {
9197     tids->push_back(fop->objecter_tid);
9198     fop->objecter_tid = 0;
9199   }
9200   if (fop->io_tids.size()) {
9201     for (auto &p : fop->io_tids) {
9202       tids->push_back(p.second);
9203       p.second = 0;
9204     }
9205   }
9206   if (fop->blocking && fop->obc->is_blocked()) {
9207     fop->obc->stop_block();
9208     kick_object_context_blocked(fop->obc);
9209   }
9210   if (requeue) {
9211     if (fop->op)
9212       requeue_op(fop->op);
9213     requeue_ops(fop->dup_ops);
9214   }
9215   if (fop->on_flush) {
9216     (*(fop->on_flush))();
9217     fop->on_flush = boost::none;
9218   }
9219   flush_ops.erase(fop->obc->obs.oi.soid);
9220 }
9221
9222 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
9223 {
9224   dout(10) << __func__ << dendl;
9225   map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9226   while (p != flush_ops.end()) {
9227     cancel_flush((p++)->second, requeue, tids);
9228   }
9229 }
9230
9231 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9232 {
9233   if (!pool.info.allow_incomplete_clones())
9234     return true;
9235   if (is_missing_object(coid))
9236     return true;
9237   ObjectContextRef obc = get_object_context(coid, false);
9238   return obc && obc->obs.exists;
9239 }
9240
9241 // ========================================================================
9242 // rep op gather
9243
9244 class C_OSD_RepopApplied : public Context {
9245   PrimaryLogPGRef pg;
9246   boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9247 public:
9248   C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9249   : pg(pg), repop(repop) {}
9250   void finish(int) override {
9251     pg->repop_all_applied(repop.get());
9252   }
9253 };
9254
9255
9256 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9257 {
9258   dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9259            << dendl;
9260   assert(!repop->applies_with_commit);
9261   repop->all_applied = true;
9262   if (!repop->rep_aborted) {
9263     eval_repop(repop);
9264   }
9265 }
9266
9267 class C_OSD_RepopCommit : public Context {
9268   PrimaryLogPGRef pg;
9269   boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9270 public:
9271   C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9272     : pg(pg), repop(repop) {}
9273   void finish(int) override {
9274     pg->repop_all_committed(repop.get());
9275   }
9276 };
9277
9278 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9279 {
9280   dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9281            << dendl;
9282   repop->all_committed = true;
9283   if (repop->applies_with_commit) {
9284     assert(!repop->all_applied);
9285     repop->all_applied = true;
9286   }
9287
9288   if (!repop->rep_aborted) {
9289     if (repop->v != eversion_t()) {
9290       last_update_ondisk = repop->v;
9291       last_complete_ondisk = repop->pg_local_last_complete;
9292     }
9293     eval_repop(repop);
9294   }
9295 }
9296
9297 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9298 {
9299   dout(10) << "op_applied version " << applied_version << dendl;
9300   if (applied_version == eversion_t())
9301     return;
9302   assert(applied_version > last_update_applied);
9303   assert(applied_version <= info.last_update);
9304   last_update_applied = applied_version;
9305   if (is_primary()) {
9306     if (scrubber.active) {
9307       if (last_update_applied >= scrubber.subset_last_update) {
9308         if (ops_blocked_by_scrub()) {
9309           requeue_scrub(true);
9310         } else {
9311           requeue_scrub(false);
9312         }
9313
9314       }
9315     } else {
9316       assert(scrubber.start == scrubber.end);
9317     }
9318   } else {
9319     if (scrubber.active_rep_scrub) {
9320       if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9321             scrubber.active_rep_scrub->get_req())->scrub_to) {
9322         osd->enqueue_back(
9323           info.pgid,
9324           PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9325         scrubber.active_rep_scrub = OpRequestRef();
9326       }
9327     }
9328   }
9329 }
9330
// Evaluate the progress of a repop and run whatever callbacks its current
// state allows.
//
//  * all_committed: run (and erase) the on_committed callbacks, then reply
//    to any ops parked in waiting_for_ondisk for this version.
//  * all_applied: run (and erase) the on_applied callbacks.
//  * both: mark the repop done, refresh stats, and retire finished repops
//    from the front of repop_queue, running their on_success callbacks.
//
// Does nothing beyond logging if the repop is already marked rep_done.
9331 void PrimaryLogPG::eval_repop(RepGather *repop)
9332 {
     // m is only used to choose between the two log lines below
9333   const MOSDOp *m = NULL;
9334   if (repop->op)
9335     m = static_cast<const MOSDOp *>(repop->op->get_req());
9336
9337   if (m)
9338     dout(10) << "eval_repop " << *repop
9339              << (repop->rep_done ? " DONE" : "")
9340              << dendl;
9341   else
9342     dout(10) << "eval_repop " << *repop << " (no op)"
9343              << (repop->rep_done ? " DONE" : "")
9344              << dendl;
9345
9346   if (repop->rep_done)
9347     return;
9348
9349   // ondisk?
9350   if (repop->all_committed) {
9351     dout(10) << " commit: " << *repop << dendl;
       // each callback is invoked, then erased, so it cannot run twice
9352     for (auto p = repop->on_committed.begin();
9353          p != repop->on_committed.end();
9354          repop->on_committed.erase(p++)) {
9355       (*p)();
9356     }
9357     // send dup commits, in order
9358     if (waiting_for_ondisk.count(repop->v)) {
         // this version is expected to be the oldest one with waiters
9359       assert(waiting_for_ondisk.begin()->first == repop->v);
9360       for (list<pair<OpRequestRef, version_t> >::iterator i =
9361              waiting_for_ondisk[repop->v].begin();
9362            i != waiting_for_ondisk[repop->v].end();
9363            ++i) {
9364         osd->reply_op_error(i->first, repop->r, repop->v,
9365                             i->second);
9366       }
9367       waiting_for_ondisk.erase(repop->v);
9368     }
9369   }
9370
9371   // applied?
9372   if (repop->all_applied) {
9373     if (repop->applies_with_commit) {
9374       assert(repop->on_applied.empty());
9375     }
9376     dout(10) << " applied: " << *repop << " " << dendl;
9377     for (auto p = repop->on_applied.begin();
9378          p != repop->on_applied.end();
9379          repop->on_applied.erase(p++)) {
9380       (*p)();
9381     }
9382   }
9383
9384   // done.
9385   if (repop->all_applied && repop->all_committed) {
9386     repop->rep_done = true;
9387
9388     publish_stats_to_osd();
9389     calc_min_last_complete_ondisk();
9390
9391     dout(10) << " removing " << *repop << dendl;
9392     assert(!repop_queue.empty());
9393     dout(20) << "   q front is " << *repop_queue.front() << dendl;
9394     if (repop_queue.front() != repop) {
         // finishing out of order is only tolerated for repops that apply
         // with commit; for anything else log loudly and assert
9395       if (!repop->applies_with_commit) {
9396         dout(0) << " removing " << *repop << dendl;
9397         dout(0) << "   q front is " << *repop_queue.front() << dendl;
9398         assert(repop_queue.front() == repop);
9399       }
9400     } else {
         // retire this repop and any already-done repops queued behind it
9401       RepGather *to_remove = nullptr;
9402       while (!repop_queue.empty() &&
9403              (to_remove = repop_queue.front())->rep_done) {
9404         repop_queue.pop_front();
9405         for (auto p = to_remove->on_success.begin();
9406              p != to_remove->on_success.end();
9407              to_remove->on_success.erase(p++)) {
9408           (*p)();
9409         }
9410         remove_repop(to_remove);
9411       }
9412     }
9413   }
9414 }
9415
9416 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9417 {
9418   FUNCTRACE();
9419   const hobject_t& soid = ctx->obs->oi.soid;
9420   dout(7) << "issue_repop rep_tid " << repop->rep_tid
9421           << " o " << soid
9422           << dendl;
9423
9424   repop->v = ctx->at_version;
9425   if (ctx->at_version > eversion_t()) {
9426     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9427          i != actingbackfill.end();
9428          ++i) {
9429       if (*i == get_primary()) continue;
9430       pg_info_t &pinfo = peer_info[*i];
9431       // keep peer_info up to date
9432       if (pinfo.last_complete == pinfo.last_update)
9433         pinfo.last_complete = ctx->at_version;
9434       pinfo.last_update = ctx->at_version;
9435     }
9436   }
9437
9438   ctx->obc->ondisk_write_lock();
9439
9440   bool unlock_snapset_obc = false;
9441   ctx->op_t->add_obc(ctx->obc);
9442   if (ctx->clone_obc) {
9443     ctx->clone_obc->ondisk_write_lock();
9444     ctx->op_t->add_obc(ctx->clone_obc);
9445   }
9446   if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9447       ctx->obc->obs.oi.soid) {
9448     ctx->snapset_obc->ondisk_write_lock();
9449     unlock_snapset_obc = true;
9450     ctx->op_t->add_obc(ctx->snapset_obc);
9451   }
9452
9453   Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9454   Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9455   Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9456     ctx->obc,
9457     ctx->clone_obc,
9458     unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9459   if (!(ctx->log.empty())) {
9460     assert(ctx->at_version >= projected_last_update);
9461     projected_last_update = ctx->at_version;
9462   }
9463   for (auto &&entry: ctx->log) {
9464     projected_log.add(entry);
9465   }
9466   pgbackend->submit_transaction(
9467     soid,
9468     ctx->delta_stats,
9469     ctx->at_version,
9470     std::move(ctx->op_t),
9471     pg_trim_to,
9472     min_last_complete_ondisk,
9473     ctx->log,
9474     ctx->updated_hset_history,
9475     onapplied_sync,
9476     on_all_applied,
9477     on_all_commit,
9478     repop->rep_tid,
9479     ctx->reqid,
9480     ctx->op);
9481 }
9482
9483 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9484   OpContext *ctx, ObjectContextRef obc,
9485   ceph_tid_t rep_tid)
9486 {
9487   if (ctx->op)
9488     dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9489   else
9490     dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9491
9492   RepGather *repop = new RepGather(
9493     ctx, rep_tid, info.last_complete, false);
9494
9495   repop->start = ceph_clock_now();
9496
9497   repop_queue.push_back(&repop->queue_item);
9498   repop->get();
9499
9500   osd->logger->inc(l_osd_op_wip);
9501
9502   dout(10) << __func__ << ": " << *repop << dendl;
9503   return repop;
9504 }
9505
9506 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9507   eversion_t version,
9508   int r,
9509   ObcLockManager &&manager,
9510   OpRequestRef &&op,
9511   boost::optional<std::function<void(void)> > &&on_complete)
9512 {
9513   RepGather *repop = new RepGather(
9514     std::move(manager),
9515     std::move(op),
9516     std::move(on_complete),
9517     osd->get_tid(),
9518     info.last_complete,
9519     true,
9520     r);
9521   repop->v = version;
9522
9523   repop->start = ceph_clock_now();
9524
9525   repop_queue.push_back(&repop->queue_item);
9526
9527   osd->logger->inc(l_osd_op_wip);
9528
9529   dout(10) << __func__ << ": " << *repop << dendl;
9530   return boost::intrusive_ptr<RepGather>(repop);
9531 }
9532
9533 void PrimaryLogPG::remove_repop(RepGather *repop)
9534 {
9535   dout(20) << __func__ << " " << *repop << dendl;
9536
9537   for (auto p = repop->on_finish.begin();
9538        p != repop->on_finish.end();
9539        repop->on_finish.erase(p++)) {
9540     (*p)();
9541   }
9542
9543   release_object_locks(
9544     repop->lock_manager);
9545   repop->put();
9546
9547   osd->logger->dec(l_osd_op_wip);
9548 }
9549
9550 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9551 {
9552   dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9553   ceph_tid_t rep_tid = osd->get_tid();
9554   osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9555   OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9556   ctx->op_t.reset(new PGTransaction());
9557   ctx->mtime = ceph_clock_now();
9558   return ctx;
9559 }
9560
9561 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9562 {
9563   RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9564   dout(20) << __func__ << " " << repop << dendl;
9565   issue_repop(repop, ctx.get());
9566   eval_repop(repop);
9567   calc_trim_to();
9568   repop->put();
9569 }
9570
9571
9572 void PrimaryLogPG::submit_log_entries(
9573   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9574   ObcLockManager &&manager,
9575   boost::optional<std::function<void(void)> > &&_on_complete,
9576   OpRequestRef op,
9577   int r)
9578 {
9579   dout(10) << __func__ << " " << entries << dendl;
9580   assert(is_primary());
9581
9582   eversion_t version;
9583   if (!entries.empty()) {
9584     assert(entries.rbegin()->version >= projected_last_update);
9585     version = projected_last_update = entries.rbegin()->version;
9586   }
9587
9588   boost::intrusive_ptr<RepGather> repop;
9589   boost::optional<std::function<void(void)> > on_complete;
9590   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9591     repop = new_repop(
9592       version,
9593       r,
9594       std::move(manager),
9595       std::move(op),
9596       std::move(_on_complete));
9597   } else {
9598     on_complete = std::move(_on_complete);
9599   }
9600
9601   pgbackend->call_write_ordered(
9602     [this, entries, repop, on_complete]() {
9603       ObjectStore::Transaction t;
9604       eversion_t old_last_update = info.last_update;
9605       merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
9606
9607
9608       set<pg_shard_t> waiting_on;
9609       for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9610            i != actingbackfill.end();
9611            ++i) {
9612         pg_shard_t peer(*i);
9613         if (peer == pg_whoami) continue;
9614         assert(peer_missing.count(peer));
9615         assert(peer_info.count(peer));
9616         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9617           assert(repop);
9618           MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9619             entries,
9620             spg_t(info.pgid.pgid, i->shard),
9621             pg_whoami.shard,
9622             get_osdmap()->get_epoch(),
9623             last_peering_reset,
9624             repop->rep_tid,
9625             pg_trim_to,
9626             min_last_complete_ondisk);
9627           osd->send_message_osd_cluster(
9628             peer.osd, m, get_osdmap()->get_epoch());
9629           waiting_on.insert(peer);
9630         } else {
9631           MOSDPGLog *m = new MOSDPGLog(
9632             peer.shard, pg_whoami.shard,
9633             info.last_update.epoch,
9634             info);
9635           m->log.log = entries;
9636           m->log.tail = old_last_update;
9637           m->log.head = info.last_update;
9638           osd->send_message_osd_cluster(
9639             peer.osd, m, get_osdmap()->get_epoch());
9640         }
9641       }
9642       if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9643         ceph_tid_t rep_tid = repop->rep_tid;
9644         waiting_on.insert(pg_whoami);
9645         log_entry_update_waiting_on.insert(
9646           make_pair(
9647             rep_tid,
9648             LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9649             ));
9650         struct OnComplete : public Context {
9651           PrimaryLogPGRef pg;
9652           ceph_tid_t rep_tid;
9653           epoch_t epoch;
9654           OnComplete(
9655             PrimaryLogPGRef pg,
9656             ceph_tid_t rep_tid,
9657             epoch_t epoch)
9658             : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9659           void finish(int) override {
9660             pg->lock();
9661             if (!pg->pg_has_reset_since(epoch)) {
9662               auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9663               assert(it != pg->log_entry_update_waiting_on.end());
9664               auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9665               assert(it2 != it->second.waiting_on.end());
9666               it->second.waiting_on.erase(it2);
9667               if (it->second.waiting_on.empty()) {
9668                 pg->repop_all_committed(it->second.repop.get());
9669                 pg->log_entry_update_waiting_on.erase(it);
9670               }
9671             }
9672             pg->unlock();
9673           }
9674         };
9675         t.register_on_commit(
9676           new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9677       } else {
9678         if (on_complete) {
9679           struct OnComplete : public Context {
9680             PrimaryLogPGRef pg;
9681             std::function<void(void)> on_complete;
9682             epoch_t epoch;
9683             OnComplete(
9684               PrimaryLogPGRef pg,
9685               const std::function<void(void)> &on_complete,
9686               epoch_t epoch)
9687               : pg(pg),
9688                 on_complete(std::move(on_complete)),
9689                 epoch(epoch) {}
9690             void finish(int) override {
9691               pg->lock();
9692               if (!pg->pg_has_reset_since(epoch))
9693                 on_complete();
9694               pg->unlock();
9695             }
9696           };
9697           t.register_on_complete(
9698             new OnComplete{
9699               this, *on_complete, get_osdmap()->get_epoch()
9700                 });
9701         }
9702       }
9703       t.register_on_applied(
9704         new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9705       int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9706       assert(r == 0);
9707     });
9708
9709   calc_trim_to();
9710 }
9711
9712 void PrimaryLogPG::cancel_log_updates()
9713 {
9714   // get rid of all the LogUpdateCtx so their references to repops are
9715   // dropped
9716   log_entry_update_waiting_on.clear();
9717 }
9718
9719 // -------------------------------------------------------
9720
9721 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9722 {
9723   pair<hobject_t, ObjectContextRef> i;
9724   while (object_contexts.get_next(i.first, &i)) {
9725     ObjectContextRef obc(i.second);
9726     get_obc_watchers(obc, pg_watchers);
9727   }
9728 }
9729
9730 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9731 {
9732   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9733          obc->watchers.begin();
9734         j != obc->watchers.end();
9735         ++j) {
9736     obj_watch_item_t owi;
9737
9738     owi.obj = obc->obs.oi.soid;
9739     owi.wi.addr = j->second->get_peer_addr();
9740     owi.wi.name = j->second->get_entity();
9741     owi.wi.cookie = j->second->get_cookie();
9742     owi.wi.timeout_seconds = j->second->get_timeout();
9743
9744     dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9745       << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9746
9747     pg_watchers.push_back(owi);
9748   }
9749 }
9750
9751 void PrimaryLogPG::check_blacklisted_watchers()
9752 {
9753   dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9754   pair<hobject_t, ObjectContextRef> i;
9755   while (object_contexts.get_next(i.first, &i))
9756     check_blacklisted_obc_watchers(i.second);
9757 }
9758
9759 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9760 {
9761   dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9762   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9763          obc->watchers.begin();
9764         k != obc->watchers.end();
9765         ) {
9766     //Advance iterator now so handle_watch_timeout() can erase element
9767     map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9768     dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9769     entity_addr_t ea = j->second->get_peer_addr();
9770     dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9771     if (get_osdmap()->is_blacklisted(ea)) {
9772       dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9773       assert(j->second->get_pg() == this);
9774       j->second->unregister_cb();
9775       handle_watch_timeout(j->second);
9776     }
9777   }
9778 }
9779
9780 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9781 {
9782   assert(is_active());
9783   assert((recovering.count(obc->obs.oi.soid) ||
9784           !is_missing_object(obc->obs.oi.soid)) ||
9785          (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9786           pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9787             pg_log_entry_t::LOST_REVERT &&
9788           pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9789             obc->obs.oi.version));
9790
9791   dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9792   assert(obc->watchers.empty());
9793   // populate unconnected_watchers
9794   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9795         obc->obs.oi.watchers.begin();
9796        p != obc->obs.oi.watchers.end();
9797        ++p) {
9798     utime_t expire = info.stats.last_became_active;
9799     expire += p->second.timeout_seconds;
9800     dout(10) << "  unconnected watcher " << p->first << " will expire " << expire << dendl;
9801     WatchRef watch(
9802       Watch::makeWatchRef(
9803         this, osd, obc, p->second.timeout_seconds, p->first.first,
9804         p->first.second, p->second.addr));
9805     watch->disconnect();
9806     obc->watchers.insert(
9807       make_pair(
9808         make_pair(p->first.first, p->first.second),
9809         watch));
9810   }
9811   // Look for watchers from blacklisted clients and drop
9812   check_blacklisted_obc_watchers(obc);
9813 }
9814
9815 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9816 {
9817   ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9818   dout(10) << "handle_watch_timeout obc " << obc << dendl;
9819
9820   if (!is_active()) {
9821     dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9822     return;
9823   }
9824   if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9825     callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9826       watch->get_delayed_cb()
9827       );
9828     dout(10) << "handle_watch_timeout waiting for degraded on obj "
9829              << obc->obs.oi.soid
9830              << dendl;
9831     return;
9832   }
9833
9834   if (write_blocked_by_scrub(obc->obs.oi.soid)) {
9835     dout(10) << "handle_watch_timeout waiting for scrub on obj "
9836              << obc->obs.oi.soid
9837              << dendl;
9838     scrubber.add_callback(
9839       watch->get_delayed_cb() // This callback!
9840       );
9841     return;
9842   }
9843
9844   OpContextUPtr ctx = simple_opc_create(obc);
9845   ctx->at_version = get_next_version();
9846
9847   object_info_t& oi = ctx->new_obs.oi;
9848   oi.watchers.erase(make_pair(watch->get_cookie(),
9849                               watch->get_entity()));
9850
9851   list<watch_disconnect_t> watch_disconnects = {
9852     watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9853   };
9854   ctx->register_on_success(
9855     [this, obc, watch_disconnects]() {
9856       complete_disconnect_watches(obc, watch_disconnects);
9857     });
9858
9859
9860   PGTransaction *t = ctx->op_t.get();
9861   ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9862                                     ctx->at_version,
9863                                     oi.version,
9864                                     0,
9865                                     osd_reqid_t(), ctx->mtime, 0));
9866
9867   oi.prior_version = obc->obs.oi.version;
9868   oi.version = ctx->at_version;
9869   bufferlist bl;
9870   ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9871   t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9872
9873   // apply new object state.
9874   ctx->obc->obs = ctx->new_obs;
9875
9876   // no ctx->delta_stats
9877   simple_opc_submit(std::move(ctx));
9878 }
9879
9880 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9881                                                      SnapSetContext *ssc)
9882 {
9883   ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9884   assert(obc->destructor_callback == NULL);
9885   obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9886   obc->obs.oi = oi;
9887   obc->obs.exists = false;
9888   obc->ssc = ssc;
9889   if (ssc)
9890     register_snapset_context(ssc);
9891   dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9892   if (is_active())
9893     populate_obc_watchers(obc);
9894   return obc;
9895 }
9896
9897 ObjectContextRef PrimaryLogPG::get_object_context(
9898   const hobject_t& soid,
9899   bool can_create,
9900   const map<string, bufferlist> *attrs)
9901 {
9902   assert(
9903     attrs || !pg_log.get_missing().is_missing(soid) ||
9904     // or this is a revert... see recover_primary()
9905     (pg_log.get_log().objects.count(soid) &&
9906       pg_log.get_log().objects.find(soid)->second->op ==
9907       pg_log_entry_t::LOST_REVERT));
9908   ObjectContextRef obc = object_contexts.lookup(soid);
9909   osd->logger->inc(l_osd_object_ctx_cache_total);
9910   if (obc) {
9911     osd->logger->inc(l_osd_object_ctx_cache_hit);
9912     dout(10) << __func__ << ": found obc in cache: " << obc
9913              << dendl;
9914   } else {
9915     dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9916     // check disk
9917     bufferlist bv;
9918     if (attrs) {
9919       assert(attrs->count(OI_ATTR));
9920       bv = attrs->find(OI_ATTR)->second;
9921     } else {
9922       int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9923       if (r < 0) {
9924         if (!can_create) {
9925           dout(10) << __func__ << ": no obc for soid "
9926                    << soid << " and !can_create"
9927                    << dendl;
9928           return ObjectContextRef();   // -ENOENT!
9929         }
9930
9931         dout(10) << __func__ << ": no obc for soid "
9932                  << soid << " but can_create"
9933                  << dendl;
9934         // new object.
9935         object_info_t oi(soid);
9936         SnapSetContext *ssc = get_snapset_context(
9937           soid, true, 0, false);
9938         assert(ssc);
9939         obc = create_object_context(oi, ssc);
9940         dout(10) << __func__ << ": " << obc << " " << soid
9941                  << " " << obc->rwstate
9942                  << " oi: " << obc->obs.oi
9943                  << " ssc: " << obc->ssc
9944                  << " snapset: " << obc->ssc->snapset << dendl;
9945         return obc;
9946       }
9947     }
9948
9949     object_info_t oi;
9950     try {
9951       bufferlist::iterator bliter = bv.begin();
9952       ::decode(oi, bliter);
9953     } catch (...) {
9954       dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9955       return ObjectContextRef();   // -ENOENT!
9956     }
9957
9958     assert(oi.soid.pool == (int64_t)info.pgid.pool());
9959
9960     obc = object_contexts.lookup_or_create(oi.soid);
9961     obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9962     obc->obs.oi = oi;
9963     obc->obs.exists = true;
9964
9965     obc->ssc = get_snapset_context(
9966       soid, true,
9967       soid.has_snapset() ? attrs : 0);
9968
9969     if (is_active())
9970       populate_obc_watchers(obc);
9971
9972     if (pool.info.require_rollback()) {
9973       if (attrs) {
9974         obc->attr_cache = *attrs;
9975       } else {
9976         int r = pgbackend->objects_get_attrs(
9977           soid,
9978           &obc->attr_cache);
9979         assert(r == 0);
9980       }
9981     }
9982
9983     dout(10) << __func__ << ": creating obc from disk: " << obc
9984              << dendl;
9985   }
9986
9987   // XXX: Caller doesn't expect this
9988   if (obc->ssc == NULL) {
9989     derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9990     return ObjectContextRef();   // -ENOENT!
9991   }
9992
9993   dout(10) << __func__ << ": " << obc << " " << soid
9994            << " " << obc->rwstate
9995            << " oi: " << obc->obs.oi
9996            << " exists: " << (int)obc->obs.exists
9997            << " ssc: " << obc->ssc
9998            << " snapset: " << obc->ssc->snapset << dendl;
9999   return obc;
10000 }
10001
10002 void PrimaryLogPG::context_registry_on_change()
10003 {
10004   pair<hobject_t, ObjectContextRef> i;
10005   while (object_contexts.get_next(i.first, &i)) {
10006     ObjectContextRef obc(i.second);
10007     if (obc) {
10008       for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10009              obc->watchers.begin();
10010            j != obc->watchers.end();
10011            obc->watchers.erase(j++)) {
10012         j->second->discard();
10013       }
10014     }
10015   }
10016 }
10017
10018
10019 /*
10020  * If we return an error, and set *pmissing, then promoting that
10021  * object may help.
10022  *
10023  * If we return -EAGAIN, we will always set *pmissing to the missing
10024  * object to wait for.
10025  *
10026  * If we return an error but do not set *pmissing, then we know the
10027  * object does not exist.
10028  */
10029 int PrimaryLogPG::find_object_context(const hobject_t& oid,
10030                                       ObjectContextRef *pobc,
10031                                       bool can_create,
10032                                       bool map_snapid_to_clone,
10033                                       hobject_t *pmissing)
10034 {
10035   FUNCTRACE();
10036   assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
10037   // want the head?
10038   if (oid.snap == CEPH_NOSNAP) {
10039     ObjectContextRef obc = get_object_context(oid, can_create);
10040     if (!obc) {
10041       if (pmissing)
10042         *pmissing = oid;
10043       return -ENOENT;
10044     }
10045     dout(10) << "find_object_context " << oid
10046        << " @" << oid.snap
10047        << " oi=" << obc->obs.oi
10048        << dendl;
10049     *pobc = obc;
10050
10051     return 0;
10052   }
10053
10054   hobject_t head = oid.get_head();
10055
10056   // want the snapdir?
10057   if (oid.snap == CEPH_SNAPDIR) {
10058     // return head or snapdir, whichever exists.
10059     ObjectContextRef headobc = get_object_context(head, can_create);
10060     ObjectContextRef obc = headobc;
10061     if (!obc || !obc->obs.exists)
10062       obc = get_object_context(oid, can_create);
10063     if (!obc || !obc->obs.exists) {
10064       // if we have neither, we would want to promote the head.
10065       if (pmissing)
10066         *pmissing = head;
10067       if (pobc)
10068         *pobc = headobc; // may be null
10069       return -ENOENT;
10070     }
10071     dout(10) << "find_object_context " << oid
10072              << " @" << oid.snap
10073              << " oi=" << obc->obs.oi
10074              << dendl;
10075     *pobc = obc;
10076
10077     // always populate ssc for SNAPDIR...
10078     if (!obc->ssc)
10079       obc->ssc = get_snapset_context(
10080         oid, true);
10081     return 0;
10082   }
10083
10084   // we want a snap
10085   if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
10086     dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
10087     return -ENOENT;
10088   }
10089
10090   SnapSetContext *ssc = get_snapset_context(oid, can_create);
10091   if (!ssc || !(ssc->exists || can_create)) {
10092     dout(20) << __func__ << " " << oid << " no snapset" << dendl;
10093     if (pmissing)
10094       *pmissing = head;  // start by getting the head
10095     if (ssc)
10096       put_snapset_context(ssc);
10097     return -ENOENT;
10098   }
10099
10100   if (map_snapid_to_clone) {
10101     dout(10) << "find_object_context " << oid << " @" << oid.snap
10102              << " snapset " << ssc->snapset
10103              << " map_snapid_to_clone=true" << dendl;
10104     if (oid.snap > ssc->snapset.seq) {
10105       // already must be readable
10106       ObjectContextRef obc = get_object_context(head, false);
10107       dout(10) << "find_object_context " << oid << " @" << oid.snap
10108                << " snapset " << ssc->snapset
10109                << " maps to head" << dendl;
10110       *pobc = obc;
10111       put_snapset_context(ssc);
10112       return (obc && obc->obs.exists) ? 0 : -ENOENT;
10113     } else {
10114       vector<snapid_t>::const_iterator citer = std::find(
10115         ssc->snapset.clones.begin(),
10116         ssc->snapset.clones.end(),
10117         oid.snap);
10118       if (citer == ssc->snapset.clones.end()) {
10119         dout(10) << "find_object_context " << oid << " @" << oid.snap
10120                  << " snapset " << ssc->snapset
10121                  << " maps to nothing" << dendl;
10122         put_snapset_context(ssc);
10123         return -ENOENT;
10124       }
10125
10126       dout(10) << "find_object_context " << oid << " @" << oid.snap
10127                << " snapset " << ssc->snapset
10128                << " maps to " << oid << dendl;
10129
10130       if (pg_log.get_missing().is_missing(oid)) {
10131         dout(10) << "find_object_context " << oid << " @" << oid.snap
10132                  << " snapset " << ssc->snapset
10133                  << " " << oid << " is missing" << dendl;
10134         if (pmissing)
10135           *pmissing = oid;
10136         put_snapset_context(ssc);
10137         return -EAGAIN;
10138       }
10139
10140       ObjectContextRef obc = get_object_context(oid, false);
10141       if (!obc || !obc->obs.exists) {
10142         dout(10) << "find_object_context " << oid << " @" << oid.snap
10143                  << " snapset " << ssc->snapset
10144                  << " " << oid << " is not present" << dendl;
10145         if (pmissing)
10146           *pmissing = oid;
10147         put_snapset_context(ssc);
10148         return -ENOENT;
10149       }
10150       dout(10) << "find_object_context " << oid << " @" << oid.snap
10151                << " snapset " << ssc->snapset
10152                << " " << oid << " HIT" << dendl;
10153       *pobc = obc;
10154       put_snapset_context(ssc);
10155       return 0;
10156     }
10157     ceph_abort(); //unreachable
10158   }
10159
10160   dout(10) << "find_object_context " << oid << " @" << oid.snap
10161            << " snapset " << ssc->snapset << dendl;
10162
10163   // head?
10164   if (oid.snap > ssc->snapset.seq) {
10165     if (ssc->snapset.head_exists) {
10166       ObjectContextRef obc = get_object_context(head, false);
10167       dout(10) << "find_object_context  " << head
10168                << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10169                << " -- HIT " << obc->obs
10170                << dendl;
10171       if (!obc->ssc)
10172         obc->ssc = ssc;
10173       else {
10174         assert(ssc == obc->ssc);
10175         put_snapset_context(ssc);
10176       }
10177       *pobc = obc;
10178       return 0;
10179     }
10180     dout(10) << "find_object_context  " << head
10181              << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10182              << " but head dne -- DNE"
10183              << dendl;
10184     put_snapset_context(ssc);
10185     return -ENOENT;
10186   }
10187
10188   // which clone would it be?
10189   unsigned k = 0;
10190   while (k < ssc->snapset.clones.size() &&
10191          ssc->snapset.clones[k] < oid.snap)
10192     k++;
10193   if (k == ssc->snapset.clones.size()) {
10194     dout(10) << "find_object_context  no clones with last >= oid.snap "
10195              << oid.snap << " -- DNE" << dendl;
10196     put_snapset_context(ssc);
10197     return -ENOENT;
10198   }
10199   hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10200                  info.pgid.pool(), oid.get_namespace());
10201
10202   if (pg_log.get_missing().is_missing(soid)) {
10203     dout(20) << "find_object_context  " << soid << " missing, try again later"
10204              << dendl;
10205     if (pmissing)
10206       *pmissing = soid;
10207     put_snapset_context(ssc);
10208     return -EAGAIN;
10209   }
10210
10211   ObjectContextRef obc = get_object_context(soid, false);
10212   if (!obc || !obc->obs.exists) {
10213     if (pmissing)
10214       *pmissing = soid;
10215     put_snapset_context(ssc);
10216     if (is_degraded_or_backfilling_object(soid)) {
10217       dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10218       return -EAGAIN;
10219     } else {
10220       dout(20) << __func__ << " missing clone " << soid << dendl;
10221       return -ENOENT;
10222     }
10223   }
10224
10225   if (!obc->ssc) {
10226     obc->ssc = ssc;
10227   } else {
10228     assert(obc->ssc == ssc);
10229     put_snapset_context(ssc);
10230   }
10231   ssc = 0;
10232
10233   // clone
10234   dout(20) << "find_object_context  " << soid
10235            << " snapset " << obc->ssc->snapset
10236            << " legacy_snaps " << obc->obs.oi.legacy_snaps
10237            << dendl;
10238   snapid_t first, last;
10239   if (obc->ssc->snapset.is_legacy()) {
10240     first = obc->obs.oi.legacy_snaps.back();
10241     last = obc->obs.oi.legacy_snaps.front();
10242   } else {
10243     auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10244     assert(p != obc->ssc->snapset.clone_snaps.end());
10245     if (p->second.empty()) {
10246       dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
10247       assert(!cct->_conf->osd_debug_verify_snaps);
10248       return -ENOENT;
10249     }
10250     first = p->second.back();
10251     last = p->second.front();
10252   }
10253   if (first <= oid.snap) {
10254     dout(20) << "find_object_context  " << soid << " [" << first << "," << last
10255              << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10256     *pobc = obc;
10257     return 0;
10258   } else {
10259     dout(20) << "find_object_context  " << soid << " [" << first << "," << last
10260              << "] does not contain " << oid.snap << " -- DNE" << dendl;
10261     return -ENOENT;
10262   }
10263 }
10264
10265 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10266 {
10267   if (obc->ssc)
10268     put_snapset_context(obc->ssc);
10269 }
10270
10271 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10272 {
10273   object_info_t& oi = obc->obs.oi;
10274
10275   dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10276   object_stat_sum_t stat;
10277
10278   stat.num_bytes += oi.size;
10279
10280   if (oi.soid.snap != CEPH_SNAPDIR)
10281     stat.num_objects++;
10282   if (oi.is_dirty())
10283     stat.num_objects_dirty++;
10284   if (oi.is_whiteout())
10285     stat.num_whiteouts++;
10286   if (oi.is_omap())
10287     stat.num_objects_omap++;
10288   if (oi.is_cache_pinned())
10289     stat.num_objects_pinned++;
10290
10291   if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10292     stat.num_object_clones++;
10293
10294     if (!obc->ssc)
10295       obc->ssc = get_snapset_context(oi.soid, false);
10296     assert(obc->ssc);
10297
10298     // subtract off clone overlap
10299     if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10300       interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10301       for (interval_set<uint64_t>::const_iterator r = o.begin();
10302            r != o.end();
10303            ++r) {
10304         stat.num_bytes -= r.get_len();
10305       }
10306     }
10307   }
10308
10309   // add it in
10310   pgstat->stats.sum.add(stat);
10311 }
10312
10313 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10314 {
10315   const hobject_t& soid = obc->obs.oi.soid;
10316   if (obc->is_blocked()) {
10317     dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10318     return;
10319   }
10320
10321   map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10322   if (p != waiting_for_blocked_object.end()) {
10323     list<OpRequestRef>& ls = p->second;
10324     dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10325     requeue_ops(ls);
10326     waiting_for_blocked_object.erase(p);
10327   }
10328
10329   map<hobject_t, ObjectContextRef>::iterator i =
10330     objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10331   if (i != objects_blocked_on_snap_promotion.end()) {
10332     assert(i->second == obc);
10333     objects_blocked_on_snap_promotion.erase(i);
10334   }
10335
10336   if (obc->requeue_scrub_on_unblock) {
10337     obc->requeue_scrub_on_unblock = false;
10338     requeue_scrub();
10339   }
10340 }
10341
10342 SnapSetContext *PrimaryLogPG::get_snapset_context(
10343   const hobject_t& oid,
10344   bool can_create,
10345   const map<string, bufferlist> *attrs,
10346   bool oid_existed)
10347 {
10348   Mutex::Locker l(snapset_contexts_lock);
10349   SnapSetContext *ssc;
10350   map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10351     oid.get_snapdir());
10352   if (p != snapset_contexts.end()) {
10353     if (can_create || p->second->exists) {
10354       ssc = p->second;
10355     } else {
10356       return NULL;
10357     }
10358   } else {
10359     bufferlist bv;
10360     if (!attrs) {
10361       int r = -ENOENT;
10362       if (!(oid.is_head() && !oid_existed))
10363         r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10364       if (r < 0) {
10365         // try _snapset
10366         if (!(oid.is_snapdir() && !oid_existed))
10367           r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10368         if (r < 0 && !can_create)
10369           return NULL;
10370       }
10371     } else {
10372       assert(attrs->count(SS_ATTR));
10373       bv = attrs->find(SS_ATTR)->second;
10374     }
10375     ssc = new SnapSetContext(oid.get_snapdir());
10376     _register_snapset_context(ssc);
10377     if (bv.length()) {
10378       bufferlist::iterator bvp = bv.begin();
10379       try {
10380         ssc->snapset.decode(bvp);
10381       } catch (buffer::error& e) {
10382         dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10383         return NULL;
10384       }
10385       ssc->exists = true;
10386     } else {
10387       ssc->exists = false;
10388     }
10389   }
10390   assert(ssc);
10391   ssc->ref++;
10392   return ssc;
10393 }
10394
10395 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10396 {
10397   Mutex::Locker l(snapset_contexts_lock);
10398   --ssc->ref;
10399   if (ssc->ref == 0) {
10400     if (ssc->registered)
10401       snapset_contexts.erase(ssc->oid);
10402     delete ssc;
10403   }
10404 }
10405
10406 /** pull - request object from a peer
10407  */
10408
/*
 * Result codes returned by recover_missing().
 */
enum {
  PULL_NONE,   // didn't pull anything
  PULL_OTHER,  // needed to pull something else first (_head or _snapdir)
  PULL_YES     // pulled what the caller wanted
};
10416
10417 int PrimaryLogPG::recover_missing(
10418   const hobject_t &soid, eversion_t v,
10419   int priority,
10420   PGBackend::RecoveryHandle *h)
10421 {
10422   if (missing_loc.is_unfound(soid)) {
10423     dout(7) << "pull " << soid
10424             << " v " << v
10425             << " but it is unfound" << dendl;
10426     return PULL_NONE;
10427   }
10428
10429   if (missing_loc.is_deleted(soid)) {
10430     start_recovery_op(soid);
10431     assert(!recovering.count(soid));
10432     recovering.insert(make_pair(soid, ObjectContextRef()));
10433     epoch_t cur_epoch = get_osdmap()->get_epoch();
10434     remove_missing_object(soid, v, new FunctionContext(
10435      [=](int) {
10436        lock();
10437        if (!pg_has_reset_since(cur_epoch)) {
10438          bool object_missing = false;
10439          for (const auto& shard : actingbackfill) {
10440            if (shard == pg_whoami)
10441              continue;
10442            if (peer_missing[shard].is_missing(soid)) {
10443              dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10444              object_missing = true;
10445              break;
10446            }
10447          }
10448          if (!object_missing) {
10449            object_stat_sum_t stat_diff;
10450            stat_diff.num_objects_recovered = 1;
10451            on_global_recover(soid, stat_diff, true);
10452          } else {
10453            auto recovery_handle = pgbackend->open_recovery_op();
10454            pgbackend->recover_delete_object(soid, v, recovery_handle);
10455            pgbackend->run_recovery_op(recovery_handle, priority);
10456          }
10457        }
10458        unlock();
10459      }));
10460     return PULL_YES;
10461   }
10462
10463   // is this a snapped object?  if so, consult the snapset.. we may not need the entire object!
10464   ObjectContextRef obc;
10465   ObjectContextRef head_obc;
10466   if (soid.snap && soid.snap < CEPH_NOSNAP) {
10467     // do we have the head and/or snapdir?
10468     hobject_t head = soid.get_head();
10469     if (pg_log.get_missing().is_missing(head)) {
10470       if (recovering.count(head)) {
10471         dout(10) << " missing but already recovering head " << head << dendl;
10472         return PULL_NONE;
10473       } else {
10474         int r = recover_missing(
10475           head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10476           h);
10477         if (r != PULL_NONE)
10478           return PULL_OTHER;
10479         return PULL_NONE;
10480       }
10481     }
10482     head = soid.get_snapdir();
10483     if (pg_log.get_missing().is_missing(head)) {
10484       if (recovering.count(head)) {
10485         dout(10) << " missing but already recovering snapdir " << head << dendl;
10486         return PULL_NONE;
10487       } else {
10488         int r = recover_missing(
10489           head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10490           h);
10491         if (r != PULL_NONE)
10492           return PULL_OTHER;
10493         return PULL_NONE;
10494       }
10495     }
10496
10497     // we must have one or the other
10498     head_obc = get_object_context(
10499       soid.get_head(),
10500       false,
10501       0);
10502     if (!head_obc)
10503       head_obc = get_object_context(
10504         soid.get_snapdir(),
10505         false,
10506         0);
10507     assert(head_obc);
10508   }
10509   start_recovery_op(soid);
10510   assert(!recovering.count(soid));
10511   recovering.insert(make_pair(soid, obc));
10512   int r = pgbackend->recover_object(
10513     soid,
10514     v,
10515     head_obc,
10516     obc,
10517     h);
10518   // This is only a pull which shouldn't return an error
10519   assert(r >= 0);
10520   return PULL_YES;
10521 }
10522
10523 void PrimaryLogPG::send_remove_op(
10524   const hobject_t& oid, eversion_t v, pg_shard_t peer)
10525 {
10526   ceph_tid_t tid = osd->get_tid();
10527   osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10528
10529   dout(10) << "send_remove_op " << oid << " from osd." << peer
10530            << " tid " << tid << dendl;
10531
10532   MOSDSubOp *subop = new MOSDSubOp(
10533     rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10534     oid, CEPH_OSD_FLAG_ACK,
10535     get_osdmap()->get_epoch(), tid, v);
10536   subop->ops = vector<OSDOp>(1);
10537   subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10538
10539   osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10540 }
10541
10542 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10543                                          eversion_t v, Context *on_complete)
10544 {
10545   dout(20) << __func__ << " " << soid << " " << v << dendl;
10546   assert(on_complete != nullptr);
10547   // delete locally
10548   ObjectStore::Transaction t;
10549   remove_snap_mapped_object(t, soid);
10550
10551   ObjectRecoveryInfo recovery_info;
10552   recovery_info.soid = soid;
10553   recovery_info.version = v;
10554
10555   epoch_t cur_epoch = get_osdmap()->get_epoch();
10556   t.register_on_complete(new FunctionContext(
10557      [=](int) {
10558        lock();
10559        if (!pg_has_reset_since(cur_epoch)) {
10560          ObjectStore::Transaction t2;
10561          on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10562          t2.register_on_complete(on_complete);
10563          int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10564          assert(r == 0);
10565          unlock();
10566        } else {
10567          unlock();
10568          on_complete->complete(-EAGAIN);
10569        }
10570      }));
10571   int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10572   assert(r == 0);
10573 }
10574
10575 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10576 {
10577   dout(10) << "finish_degraded_object " << oid << dendl;
10578   if (callbacks_for_degraded_object.count(oid)) {
10579     list<Context*> contexts;
10580     contexts.swap(callbacks_for_degraded_object[oid]);
10581     callbacks_for_degraded_object.erase(oid);
10582     for (list<Context*>::iterator i = contexts.begin();
10583          i != contexts.end();
10584          ++i) {
10585       (*i)->complete(0);
10586     }
10587   }
10588   map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10589     oid.get_head());
10590   if (i != objects_blocked_on_degraded_snap.end() &&
10591       i->second == oid.snap)
10592     objects_blocked_on_degraded_snap.erase(i);
10593 }
10594
10595 void PrimaryLogPG::_committed_pushed_object(
10596   epoch_t epoch, eversion_t last_complete)
10597 {
10598   lock();
10599   if (!pg_has_reset_since(epoch)) {
10600     dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10601     last_complete_ondisk = last_complete;
10602
10603     if (last_complete_ondisk == info.last_update) {
10604       if (!is_primary()) {
10605         // Either we are a replica or backfill target.
10606         // we are fully up to date.  tell the primary!
10607         osd->send_message_osd_cluster(
10608           get_primary().osd,
10609           new MOSDPGTrim(
10610             get_osdmap()->get_epoch(),
10611             spg_t(info.pgid.pgid, get_primary().shard),
10612             last_complete_ondisk),
10613           get_osdmap()->get_epoch());
10614       } else {
10615         calc_min_last_complete_ondisk();
10616       }
10617     }
10618
10619   } else {
10620     dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10621   }
10622
10623   unlock();
10624 }
10625
10626 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10627 {
10628   lock();
10629   dout(20) << __func__ << dendl;
10630   if (obc) {
10631     dout(20) << "obc = " << *obc << dendl;
10632   }
10633   assert(active_pushes >= 1);
10634   --active_pushes;
10635
10636   // requeue an active chunky scrub waiting on recovery ops
10637   if (!deleting && active_pushes == 0
10638       && scrubber.is_chunky_scrub_active()) {
10639     if (ops_blocked_by_scrub()) {
10640       requeue_scrub(true);
10641     } else {
10642       requeue_scrub(false);
10643     }
10644   }
10645   unlock();
10646 }
10647
10648 void PrimaryLogPG::_applied_recovered_object_replica()
10649 {
10650   lock();
10651   dout(20) << __func__ << dendl;
10652   assert(active_pushes >= 1);
10653   --active_pushes;
10654
10655   // requeue an active chunky scrub waiting on recovery ops
10656   if (!deleting && active_pushes == 0 &&
10657       scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10658         scrubber.active_rep_scrub->get_req())->chunky) {
10659     osd->enqueue_back(
10660       info.pgid,
10661       PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10662     scrubber.active_rep_scrub = OpRequestRef();
10663   }
10664   unlock();
10665 }
10666
10667 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10668 {
10669   dout(10) << "got missing " << oid << " v " << v << dendl;
10670   pg_log.recover_got(oid, v, info);
10671   if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10672     dout(10) << "last_complete now " << info.last_complete
10673              << " log.complete_to " << pg_log.get_log().complete_to->version
10674              << dendl;
10675   } else {
10676     dout(10) << "last_complete now " << info.last_complete
10677              << " log.complete_to at end" << dendl;
10678     //below is not true in the repair case.
10679     //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
10680     assert(info.last_complete == info.last_update);
10681   }
10682 }
10683
10684 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10685 {
10686   list<pg_shard_t> fl = { pg_whoami };
10687   failed_push(fl, soid);
10688 }
10689
10690 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10691 {
10692   dout(20) << __func__ << ": " << soid << dendl;
10693   assert(recovering.count(soid));
10694   auto obc = recovering[soid];
10695   if (obc) {
10696     list<OpRequestRef> blocked_ops;
10697     obc->drop_recovery_read(&blocked_ops);
10698     requeue_ops(blocked_ops);
10699   }
10700   recovering.erase(soid);
10701   for (auto&& i : from)
10702     missing_loc.remove_location(soid, i);
10703   dout(0) << __func__ << " " << soid << " from shard " << from
10704           << ", reps on " << missing_loc.get_locations(soid)
10705           << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10706   finish_recovery_op(soid);  // close out this attempt,
10707 }
10708
10709 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10710 {
10711   const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10712   assert(m->get_type() == MSG_OSD_SUBOP);
10713   dout(7) << "sub_op_remove " << m->poid << dendl;
10714
10715   op->mark_started();
10716
10717   ObjectStore::Transaction t;
10718   remove_snap_mapped_object(t, m->poid);
10719   int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10720   assert(r == 0);
10721 }
10722
10723 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10724 {
10725   eversion_t v;
10726   pg_missing_item pmi;
10727   bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10728   assert(is_missing);
10729   v = pmi.have;
10730   dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10731
10732   assert(!actingbackfill.empty());
10733   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10734        i != actingbackfill.end();
10735        ++i) {
10736     if (*i == get_primary()) continue;
10737     pg_shard_t peer = *i;
10738     if (!peer_missing[peer].is_missing(oid)) {
10739       continue;
10740     }
10741     eversion_t h = peer_missing[peer].get_items().at(oid).have;
10742     dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10743     if (h > v)
10744       v = h;
10745   }
10746
10747   dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10748   return v;
10749 }
10750
10751 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10752 {
10753   const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10754     op->get_req());
10755   assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10756   ObjectStore::Transaction t;
10757   boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
10758   if (m->pg_trim_to != eversion_t())
10759     op_trim_to = m->pg_trim_to;
10760   if (m->pg_roll_forward_to != eversion_t())
10761     op_roll_forward_to = m->pg_roll_forward_to;
10762
10763   dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
10764
10765   append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
10766   eversion_t new_lcod = info.last_complete;
10767
10768   Context *complete = new FunctionContext(
10769     [=](int) {
10770       const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10771         op->get_req());
10772       lock();
10773       if (!pg_has_reset_since(msg->get_epoch())) {
10774         update_last_complete_ondisk(new_lcod);
10775         MOSDPGUpdateLogMissingReply *reply =
10776           new MOSDPGUpdateLogMissingReply(
10777             spg_t(info.pgid.pgid, primary_shard().shard),
10778             pg_whoami.shard,
10779             msg->get_epoch(),
10780             msg->min_epoch,
10781             msg->get_tid(),
10782             new_lcod);
10783         reply->set_priority(CEPH_MSG_PRIO_HIGH);
10784         msg->get_connection()->send_message(reply);
10785       }
10786       unlock();
10787     });
10788
10789   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10790     t.register_on_commit(complete);
10791   } else {
10792     /* Hack to work around the fact that ReplicatedBackend sends
10793      * ack+commit if commit happens first
10794      *
10795      * This behavior is no longer necessary, but we preserve it so old
10796      * primaries can keep their repops in order */
10797     if (pool.info.ec_pool()) {
10798       t.register_on_complete(complete);
10799     } else {
10800       t.register_on_commit(complete);
10801     }
10802   }
10803   t.register_on_applied(
10804     new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10805   int tr = osd->store->queue_transaction(
10806     osr.get(),
10807     std::move(t),
10808     nullptr);
10809   assert(tr == 0);
10810 }
10811
10812 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10813 {
10814   const MOSDPGUpdateLogMissingReply *m =
10815     static_cast<const MOSDPGUpdateLogMissingReply*>(
10816     op->get_req());
10817   dout(20) << __func__ << " got reply from "
10818            << m->get_from() << dendl;
10819
10820   auto it = log_entry_update_waiting_on.find(m->get_tid());
10821   if (it != log_entry_update_waiting_on.end()) {
10822     if (it->second.waiting_on.count(m->get_from())) {
10823       it->second.waiting_on.erase(m->get_from());
10824       if (m->last_complete_ondisk != eversion_t()) {
10825         update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
10826       }
10827     } else {
10828       osd->clog->error()
10829         << info.pgid << " got reply "
10830         << *m << " from shard we are not waiting for "
10831         << m->get_from();
10832     }
10833
10834     if (it->second.waiting_on.empty()) {
10835       repop_all_committed(it->second.repop.get());
10836       log_entry_update_waiting_on.erase(it);
10837     }
10838   } else {
10839     osd->clog->error()
10840       << info.pgid << " got reply "
10841       << *m << " on unknown tid " << m->get_tid();
10842   }
10843 }
10844
10845 /* Mark all unfound objects as lost.
10846  */
10847 void PrimaryLogPG::mark_all_unfound_lost(
10848   int what,
10849   ConnectionRef con,
10850   ceph_tid_t tid)
10851 {
10852   dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10853   list<hobject_t> oids;
10854
10855   dout(30) << __func__ << ": log before:\n";
10856   pg_log.get_log().print(*_dout);
10857   *_dout << dendl;
10858
10859   mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10860
10861   utime_t mtime = ceph_clock_now();
10862   map<hobject_t, pg_missing_item>::const_iterator m =
10863     missing_loc.get_needs_recovery().begin();
10864   map<hobject_t, pg_missing_item>::const_iterator mend =
10865     missing_loc.get_needs_recovery().end();
10866
10867   ObcLockManager manager;
10868   eversion_t v = get_next_version();
10869   v.epoch = get_osdmap()->get_epoch();
10870   uint64_t num_unfound = missing_loc.num_unfound();
10871   while (m != mend) {
10872     const hobject_t &oid(m->first);
10873     if (!missing_loc.is_unfound(oid)) {
10874       // We only care about unfound objects
10875       ++m;
10876       continue;
10877     }
10878
10879     ObjectContextRef obc;
10880     eversion_t prev;
10881
10882     switch (what) {
10883     case pg_log_entry_t::LOST_MARK:
10884       assert(0 == "actually, not implemented yet!");
10885       break;
10886
10887     case pg_log_entry_t::LOST_REVERT:
10888       prev = pick_newest_available(oid);
10889       if (prev > eversion_t()) {
10890         // log it
10891         pg_log_entry_t e(
10892           pg_log_entry_t::LOST_REVERT, oid, v,
10893           m->second.need, 0, osd_reqid_t(), mtime, 0);
10894         e.reverting_to = prev;
10895         e.mark_unrollbackable();
10896         log_entries.push_back(e);
10897         dout(10) << e << dendl;
10898
10899         // we are now missing the new version; recovery code will sort it out.
10900         ++v.version;
10901         ++m;
10902         break;
10903       }
10904
10905     case pg_log_entry_t::LOST_DELETE:
10906       {
10907         pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10908                          0, osd_reqid_t(), mtime, 0);
10909         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10910           if (pool.info.require_rollback()) {
10911             e.mod_desc.try_rmobject(v.version);
10912           } else {
10913             e.mark_unrollbackable();
10914           }
10915         } // otherwise, just do what we used to do
10916         dout(10) << e << dendl;
10917         log_entries.push_back(e);
10918         oids.push_back(oid);
10919
10920         // If context found mark object as deleted in case
10921         // of racing with new creation.  This can happen if
10922         // object lost and EIO at primary.
10923         obc = object_contexts.lookup(oid);
10924         if (obc)
10925           obc->obs.exists = false;
10926
10927         ++v.version;
10928         ++m;
10929       }
10930       break;
10931
10932     default:
10933       ceph_abort();
10934     }
10935   }
10936
10937   info.stats.stats_invalid = true;
10938
10939   submit_log_entries(
10940     log_entries,
10941     std::move(manager),
10942     boost::optional<std::function<void(void)> >(
10943       [this, oids, con, num_unfound, tid]() {
10944         if (perform_deletes_during_peering()) {
10945           for (auto oid : oids) {
10946             // clear old locations - merge_new_log_entries will have
10947             // handled rebuilding missing_loc for each of these
10948             // objects if we have the RECOVERY_DELETES flag
10949             missing_loc.recovered(oid);
10950           }
10951         }
10952
10953         if (is_recovery_unfound()) {
10954           queue_peering_event(
10955             CephPeeringEvtRef(
10956               std::make_shared<CephPeeringEvt>(
10957               get_osdmap()->get_epoch(),
10958               get_osdmap()->get_epoch(),
10959               DoRecovery())));
10960         } else if (is_backfill_unfound()) {
10961           queue_peering_event(
10962             CephPeeringEvtRef(
10963               std::make_shared<CephPeeringEvt>(
10964               get_osdmap()->get_epoch(),
10965               get_osdmap()->get_epoch(),
10966               RequestBackfill())));
10967         } else {
10968           queue_recovery();
10969         }
10970
10971         stringstream ss;
10972         ss << "pg has " << num_unfound
10973            << " objects unfound and apparently lost marking";
10974         string rs = ss.str();
10975         dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10976         osd->clog->info() << rs;
10977         if (con) {
10978           MCommandReply *reply = new MCommandReply(0, rs);
10979           reply->set_tid(tid);
10980           con->send_message(reply);
10981         }
10982       }),
10983     OpRequestRef());
10984 }
10985
/**
 * Split hook for this PG type.
 *
 * None of the arguments are used here; the function only asserts that
 * repop_queue is empty.
 */
void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
{
  assert(repop_queue.empty());
}
10990
10991 /*
10992  * pg status change notification
10993  */
10994
10995 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10996 {
10997   list<OpRequestRef> rq;
10998
10999   // apply all repops
11000   while (!repop_queue.empty()) {
11001     RepGather *repop = repop_queue.front();
11002     repop_queue.pop_front();
11003     dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11004     repop->rep_aborted = true;
11005     repop->on_applied.clear();
11006     repop->on_committed.clear();
11007     repop->on_success.clear();
11008
11009     if (requeue) {
11010       if (repop->op) {
11011         dout(10) << " requeuing " << *repop->op->get_req() << dendl;
11012         rq.push_back(repop->op);
11013         repop->op = OpRequestRef();
11014       }
11015
11016       // also requeue any dups, interleaved into position
11017       map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
11018         waiting_for_ondisk.find(repop->v);
11019       if (p != waiting_for_ondisk.end()) {
11020         dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11021         for (list<pair<OpRequestRef, version_t> >::iterator i =
11022                p->second.begin();
11023              i != p->second.end();
11024              ++i) {
11025           rq.push_back(i->first);
11026         }
11027         waiting_for_ondisk.erase(p);
11028       }
11029     }
11030
11031     remove_repop(repop);
11032   }
11033
11034   assert(repop_queue.empty());
11035
11036   if (requeue) {
11037     requeue_ops(rq);
11038     if (!waiting_for_ondisk.empty()) {
11039       for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
11040              waiting_for_ondisk.begin();
11041            i != waiting_for_ondisk.end();
11042            ++i) {
11043         for (list<pair<OpRequestRef, version_t> >::iterator j =
11044                i->second.begin();
11045              j != i->second.end();
11046              ++j) {
11047           derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
11048                << i->first << dendl;
11049         }
11050       }
11051       assert(waiting_for_ondisk.empty());
11052     }
11053   }
11054
11055   waiting_for_ondisk.clear();
11056 }
11057
11058 void PrimaryLogPG::on_flushed()
11059 {
11060   assert(flushes_in_progress > 0);
11061   flushes_in_progress--;
11062   if (flushes_in_progress == 0) {
11063     requeue_ops(waiting_for_flush);
11064   }
11065   if (!is_peered() || !is_primary()) {
11066     pair<hobject_t, ObjectContextRef> i;
11067     while (object_contexts.get_next(i.first, &i)) {
11068       derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
11069     }
11070     assert(object_contexts.empty());
11071   }
11072   pgbackend->on_flushed();
11073 }
11074
11075 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
11076 {
11077   dout(10) << "on_removal" << dendl;
11078
11079   // adjust info to backfill
11080   info.set_last_backfill(hobject_t());
11081   pg_log.reset_backfill();
11082   dirty_info = true;
11083
11084
11085   // clear log
11086   PGLogEntryHandler rollbacker{this, t};
11087   pg_log.roll_forward(&rollbacker);
11088
11089   write_if_dirty(*t);
11090
11091   if (!deleting)
11092     on_shutdown();
11093 }
11094
11095 void PrimaryLogPG::clear_async_reads()
11096 {
11097   dout(10) << __func__ << dendl;
11098   for(auto& i : in_progress_async_reads) {
11099     dout(10) << "clear ctx: "
11100              << "OpRequestRef " << i.first
11101              << " OpContext " << i.second
11102              << dendl;
11103     close_op_ctx(i.second);
11104   }
11105 }
11106
/**
 * Tear down this PG's in-memory state for shutdown or removal.
 *
 * The PG is dequeued from the OSD's queues, flagged as deleting, and
 * every kind of in-flight work (recovery queueing, scrub, copy/flush/
 * proxy ops, repops, log updates, backoffs, snap trimming, async reads,
 * reservations) is cancelled or cleared.  Nothing is requeued.  The
 * statements are order-sensitive; see the inline notes.
 */
void PrimaryLogPG::on_shutdown()
{
  dout(10) << "on_shutdown" << dendl;

  // remove from queues
  osd->pg_stat_queue_dequeue(this);
  osd->peering_wq.dequeue(this);

  // handles queue races
  deleting = true;

  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  clear_scrub_reserved();
  scrub_clear_state();

  unreg_next_scrub();

  // collect the tids of all copy/flush/proxy ops (requeue == false) and
  // cancel them at the objecter in one call
  vector<ceph_tid_t> tids;
  cancel_copy_ops(false, &tids);
  cancel_flush_ops(false, &tids);
  cancel_proxy_ops(false, &tids);
  osd->objecter->op_cancel(tids, -ECANCELED);

  // abort queued repops without requeueing their client ops
  apply_and_flush_repops(false);
  cancel_log_updates();
  // we must remove PGRefs, so do this this prior to release_backoffs() callers
  clear_backoffs();
  // clean up snap trim references
  snap_trimmer_machine.process_event(Reset());

  pgbackend->on_change();

  context_registry_on_change();
  object_contexts.clear();

  clear_async_reads();

  osd->remote_reserver.cancel_reservation(info.pgid);
  osd->local_reserver.cancel_reservation(info.pgid);

  clear_primary_state();
  cancel_recovery();
}
11154
11155 void PrimaryLogPG::on_activate()
11156 {
11157   // all clean?
11158   if (needs_recovery()) {
11159     dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11160     queue_peering_event(
11161       CephPeeringEvtRef(
11162         std::make_shared<CephPeeringEvt>(
11163           get_osdmap()->get_epoch(),
11164           get_osdmap()->get_epoch(),
11165           DoRecovery())));
11166   } else if (needs_backfill()) {
11167     dout(10) << "activate queueing backfill" << dendl;
11168     queue_peering_event(
11169       CephPeeringEvtRef(
11170         std::make_shared<CephPeeringEvt>(
11171           get_osdmap()->get_epoch(),
11172           get_osdmap()->get_epoch(),
11173           RequestBackfill())));
11174   } else {
11175     dout(10) << "activate all replicas clean, no recovery" << dendl;
11176     eio_errors_to_process = false;
11177     queue_peering_event(
11178       CephPeeringEvtRef(
11179         std::make_shared<CephPeeringEvt>(
11180           get_osdmap()->get_epoch(),
11181           get_osdmap()->get_epoch(),
11182           AllReplicasRecovered())));
11183   }
11184
11185   publish_stats_to_osd();
11186
11187   if (!backfill_targets.empty()) {
11188     last_backfill_started = earliest_backfill();
11189     new_backfill = true;
11190     assert(!last_backfill_started.is_max());
11191     dout(5) << "on activate: bft=" << backfill_targets
11192            << " from " << last_backfill_started << dendl;
11193     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11194          i != backfill_targets.end();
11195          ++i) {
11196       dout(5) << "target shard " << *i
11197              << " from " << peer_info[*i].last_backfill
11198              << dendl;
11199     }
11200   }
11201
11202   hit_set_setup();
11203   agent_setup();
11204 }
11205
11206 void PrimaryLogPG::_on_new_interval()
11207 {
11208   dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11209   if (!pg_log.get_missing().may_include_deletes &&
11210       get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11211     pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11212   }
11213   assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11214 }
11215
/**
 * Reset per-interval state when the PG's interval changes.
 *
 * Every queue of waiting ops is drained: while this OSD is primary the
 * ops are requeued, otherwise they are dropped.  In-flight copy/flush/
 * proxy ops, async reads, repops and log updates are cancelled, backend
 * and scrub state is cleaned up via `t`, and cached object contexts are
 * discarded.  The statements are order-sensitive; see the inline notes.
 *
 * @param t  transaction handed to the backend and scrubber cleanup
 */
void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
{
  dout(10) << "on_change" << dendl;

  if (hit_set && hit_set->insert_count() == 0) {
    dout(20) << " discarding empty hit_set" << dendl;
    hit_set_clear();
  }

  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  // requeue everything in the reverse order they should be
  // reexamined.
  requeue_ops(waiting_for_peered);
  requeue_ops(waiting_for_flush);
  requeue_ops(waiting_for_active);

  clear_scrub_reserved();

  // collect the tids of all copy/flush/proxy ops (requeueing only while
  // primary) and cancel them at the objecter in one call
  vector<ceph_tid_t> tids;
  cancel_copy_ops(is_primary(), &tids);
  cancel_flush_ops(is_primary(), &tids);
  cancel_proxy_ops(is_primary(), &tids);
  osd->objecter->op_cancel(tids, -ECANCELED);

  // requeue object waiters
  for (auto& p : waiting_for_unreadable_object) {
    release_backoffs(p.first);
  }
  if (is_primary()) {
    requeue_object_waiters(waiting_for_unreadable_object);
  } else {
    waiting_for_unreadable_object.clear();
  }
  // each entry is erased by the loop's increment expression after its
  // body has run, so p->first stays valid for the whole body
  for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
       p != waiting_for_degraded_object.end();
       waiting_for_degraded_object.erase(p++)) {
    release_backoffs(p->first);
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
    finish_degraded_object(p->first);
  }

  // requeues waiting_for_scrub
  scrub_clear_state();

  for (auto p = waiting_for_blocked_object.begin();
       p != waiting_for_blocked_object.end();
       waiting_for_blocked_object.erase(p++)) {
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
  }
  // finish_degraded_object() erases the callbacks_for_degraded_object
  // entry for the key it is given, so the iterator is advanced before the
  // call and the loop has no increment expression of its own
  for (auto i = callbacks_for_degraded_object.begin();
       i != callbacks_for_degraded_object.end();
    ) {
    finish_degraded_object((i++)->first);
  }
  assert(callbacks_for_degraded_object.empty());

  if (is_primary()) {
    requeue_ops(waiting_for_cache_not_full);
  } else {
    waiting_for_cache_not_full.clear();
  }
  objects_blocked_on_cache_full.clear();

  // close every in-progress async read; entries are erased as we go
  for (list<pair<OpRequestRef, OpContext*> >::iterator i =
         in_progress_async_reads.begin();
       i != in_progress_async_reads.end();
       in_progress_async_reads.erase(i++)) {
    close_op_ctx(i->second);
    if (is_primary())
      requeue_op(i->first);
  }

  // this will requeue ops we were working on but didn't finish, and
  // any dups
  apply_and_flush_repops(is_primary());
  cancel_log_updates();

  // do this *after* apply_and_flush_repops so that we catch any newly
  // registered watches.
  context_registry_on_change();

  pgbackend->on_change_cleanup(t);
  scrubber.cleanup_store(t);
  pgbackend->on_change();

  // clear snap_trimmer state
  snap_trimmer_machine.process_event(Reset());

  debug_op_order.clear();
  unstable_stats.clear();

  // we don't want to cache object_contexts through the interval change
  // NOTE: we actually assert that all currently live references are dead
  // by the time the flush for the next interval completes.
  object_contexts.clear();

  // should have been cleared above by finishing all of the degraded objects
  assert(objects_blocked_on_degraded_snap.empty());
}
11325
11326 void PrimaryLogPG::on_role_change()
11327 {
11328   dout(10) << "on_role_change" << dendl;
11329   if (get_role() != 0 && hit_set) {
11330     dout(10) << " clearing hit set" << dendl;
11331     hit_set_clear();
11332   }
11333 }
11334
11335 void PrimaryLogPG::on_pool_change()
11336 {
11337   dout(10) << __func__ << dendl;
11338   // requeue cache full waiters just in case the cache_mode is
11339   // changing away from writeback mode.  note that if we are not
11340   // active the normal requeuing machinery is sufficient (and properly
11341   // ordered).
11342   if (is_active() &&
11343       pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11344       !waiting_for_cache_not_full.empty()) {
11345     dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11346              << dendl;
11347     requeue_ops(waiting_for_cache_not_full);
11348     objects_blocked_on_cache_full.clear();
11349   }
11350   hit_set_setup();
11351   agent_setup();
11352 }
11353
11354 // clear state.  called on recovery completion AND cancellation.
11355 void PrimaryLogPG::_clear_recovery_state()
11356 {
11357   missing_loc.clear();
11358 #ifdef DEBUG_RECOVERY_OIDS
11359   recovering_oids.clear();
11360 #endif
11361   last_backfill_started = hobject_t();
11362   set<hobject_t>::iterator i = backfills_in_flight.begin();
11363   while (i != backfills_in_flight.end()) {
11364     assert(recovering.count(*i));
11365     backfills_in_flight.erase(i++);
11366   }
11367
11368   list<OpRequestRef> blocked_ops;
11369   for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11370        i != recovering.end();
11371        recovering.erase(i++)) {
11372     if (i->second) {
11373       i->second->drop_recovery_read(&blocked_ops);
11374       requeue_ops(blocked_ops);
11375     }
11376   }
11377   assert(backfills_in_flight.empty());
11378   pending_backfill_updates.clear();
11379   assert(recovering.empty());
11380   pgbackend->clear_recovery_state();
11381 }
11382
11383 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11384 {
11385   dout(20) << __func__ << ": " << soid << dendl;
11386   assert(recovering.count(soid));
11387   ObjectContextRef obc = recovering[soid];
11388   if (obc) {
11389     list<OpRequestRef> blocked_ops;
11390     obc->drop_recovery_read(&blocked_ops);
11391     requeue_ops(blocked_ops);
11392   }
11393   recovering.erase(soid);
11394   finish_recovery_op(soid);
11395   release_backoffs(soid);
11396   if (waiting_for_degraded_object.count(soid)) {
11397     dout(20) << " kicking degraded waiters on " << soid << dendl;
11398     requeue_ops(waiting_for_degraded_object[soid]);
11399     waiting_for_degraded_object.erase(soid);
11400   }
11401   if (waiting_for_unreadable_object.count(soid)) {
11402     dout(20) << " kicking unreadable waiters on " << soid << dendl;
11403     requeue_ops(waiting_for_unreadable_object[soid]);
11404     waiting_for_unreadable_object.erase(soid);
11405   }
11406   if (is_missing_object(soid))
11407     pg_log.set_last_requested(0); // get recover_primary to start over
11408   finish_degraded_object(soid);
11409 }
11410
11411 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11412 {
11413   /*
11414    * check that any peers we are planning to (or currently) pulling
11415    * objects from are dealt with.
11416    */
11417   missing_loc.check_recovery_sources(osdmap);
11418   pgbackend->check_recovery_sources(osdmap);
11419
11420   for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11421        i != peer_log_requested.end();
11422        ) {
11423     if (!osdmap->is_up(i->osd)) {
11424       dout(10) << "peer_log_requested removing " << *i << dendl;
11425       peer_log_requested.erase(i++);
11426     } else {
11427       ++i;
11428     }
11429   }
11430
11431   for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11432        i != peer_missing_requested.end();
11433        ) {
11434     if (!osdmap->is_up(i->osd)) {
11435       dout(10) << "peer_missing_requested removing " << *i << dendl;
11436       peer_missing_requested.erase(i++);
11437     } else {
11438       ++i;
11439     }
11440   }
11441 }
11442
11443 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11444 {
11445   set<pg_shard_t> now_down;
11446   for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11447        p != missing_loc_sources.end();
11448        ) {
11449     if (osdmap->is_up(p->osd)) {
11450       ++p;
11451       continue;
11452     }
11453     ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11454     now_down.insert(*p);
11455     missing_loc_sources.erase(p++);
11456   }
11457
11458   if (now_down.empty()) {
11459     ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11460   } else {
11461     ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11462                        << missing_loc_sources << dendl;
11463
11464     // filter missing_loc
11465     map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11466     while (p != missing_loc.end()) {
11467       set<pg_shard_t>::iterator q = p->second.begin();
11468       while (q != p->second.end())
11469         if (now_down.count(*q)) {
11470           p->second.erase(q++);
11471         } else {
11472           ++q;
11473         }
11474       if (p->second.empty())
11475         missing_loc.erase(p++);
11476       else
11477         ++p;
11478     }
11479   }
11480 }
11481
11482
11483 bool PrimaryLogPG::start_recovery_ops(
11484   uint64_t max,
11485   ThreadPool::TPHandle &handle,
11486   uint64_t *ops_started)
11487 {
11488   uint64_t& started = *ops_started;
11489   started = 0;
11490   bool work_in_progress = false;
11491   assert(is_primary());
11492
11493   if (!state_test(PG_STATE_RECOVERING) &&
11494       !state_test(PG_STATE_BACKFILLING)) {
11495     /* TODO: I think this case is broken and will make do_recovery()
11496      * unhappy since we're returning false */
11497     dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11498     return false;
11499   }
11500
11501   const auto &missing = pg_log.get_missing();
11502
11503   unsigned int num_missing = missing.num_missing();
11504   uint64_t num_unfound = get_num_unfound();
11505
11506   if (num_missing == 0) {
11507     info.last_complete = info.last_update;
11508   }
11509
11510   if (num_missing == num_unfound) {
11511     // All of the missing objects we have are unfound.
11512     // Recover the replicas.
11513     started = recover_replicas(max, handle);
11514   }
11515   if (!started) {
11516     // We still have missing objects that we should grab from replicas.
11517     started += recover_primary(max, handle);
11518   }
11519   if (!started && num_unfound != get_num_unfound()) {
11520     // second chance to recovery replicas
11521     started = recover_replicas(max, handle);
11522   }
11523
11524   if (started)
11525     work_in_progress = true;
11526
11527   bool deferred_backfill = false;
11528   if (recovering.empty() &&
11529       state_test(PG_STATE_BACKFILLING) &&
11530       !backfill_targets.empty() && started < max &&
11531       missing.num_missing() == 0 &&
11532       waiting_on_backfill.empty()) {
11533     if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11534       dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11535       deferred_backfill = true;
11536     } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11537                !is_degraded())  {
11538       dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11539       deferred_backfill = true;
11540     } else if (!backfill_reserved) {
11541       dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11542       if (!backfill_reserving) {
11543         dout(10) << "queueing RequestBackfill" << dendl;
11544         backfill_reserving = true;
11545         queue_peering_event(
11546           CephPeeringEvtRef(
11547             std::make_shared<CephPeeringEvt>(
11548               get_osdmap()->get_epoch(),
11549               get_osdmap()->get_epoch(),
11550               RequestBackfill())));
11551       }
11552       deferred_backfill = true;
11553     } else {
11554       started += recover_backfill(max - started, handle, &work_in_progress);
11555     }
11556   }
11557
11558   dout(10) << " started " << started << dendl;
11559   osd->logger->inc(l_osd_rop, started);
11560
11561   if (!recovering.empty() ||
11562       work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11563     return work_in_progress;
11564
11565   assert(recovering.empty());
11566   assert(recovery_ops_active == 0);
11567
11568   dout(10) << __func__ << " needs_recovery: "
11569            << missing_loc.get_needs_recovery()
11570            << dendl;
11571   dout(10) << __func__ << " missing_loc: "
11572            << missing_loc.get_missing_locs()
11573            << dendl;
11574   int unfound = get_num_unfound();
11575   if (unfound) {
11576     dout(10) << " still have " << unfound << " unfound" << dendl;
11577     return work_in_progress;
11578   }
11579
11580   if (missing.num_missing() > 0) {
11581     // this shouldn't happen!
11582     osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11583                        << missing.num_missing() << ": " << missing.get_items();
11584     return work_in_progress;
11585   }
11586
11587   if (needs_recovery()) {
11588     // this shouldn't happen!
11589     // We already checked num_missing() so we must have missing replicas
11590     osd->clog->error() << info.pgid
11591                        << " Unexpected Error: recovery ending with missing replicas";
11592     return work_in_progress;
11593   }
11594
11595   if (state_test(PG_STATE_RECOVERING)) {
11596     state_clear(PG_STATE_RECOVERING);
11597     state_clear(PG_STATE_FORCED_RECOVERY);
11598     if (needs_backfill()) {
11599       dout(10) << "recovery done, queuing backfill" << dendl;
11600       queue_peering_event(
11601         CephPeeringEvtRef(
11602           std::make_shared<CephPeeringEvt>(
11603             get_osdmap()->get_epoch(),
11604             get_osdmap()->get_epoch(),
11605             RequestBackfill())));
11606     } else {
11607       dout(10) << "recovery done, no backfill" << dendl;
11608       eio_errors_to_process = false;
11609       state_clear(PG_STATE_FORCED_BACKFILL);
11610       queue_peering_event(
11611         CephPeeringEvtRef(
11612           std::make_shared<CephPeeringEvt>(
11613             get_osdmap()->get_epoch(),
11614             get_osdmap()->get_epoch(),
11615             AllReplicasRecovered())));
11616     }
11617   } else { // backfilling
11618     state_clear(PG_STATE_BACKFILLING);
11619     state_clear(PG_STATE_FORCED_BACKFILL);
11620     state_clear(PG_STATE_FORCED_RECOVERY);
11621     dout(10) << "recovery done, backfill done" << dendl;
11622     eio_errors_to_process = false;
11623     queue_peering_event(
11624       CephPeeringEvtRef(
11625         std::make_shared<CephPeeringEvt>(
11626           get_osdmap()->get_epoch(),
11627           get_osdmap()->get_epoch(),
11628           Backfilled())));
11629   }
11630
11631   return false;
11632 }
11633
11634 /**
11635  * do one recovery op.
11636  * return true if done, false if nothing left to do.
11637  */
11638 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11639 {
11640   assert(is_primary());
11641
11642   const auto &missing = pg_log.get_missing();
11643
11644   dout(10) << "recover_primary recovering " << recovering.size()
11645            << " in pg" << dendl;
11646   dout(10) << "recover_primary " << missing << dendl;
11647   dout(25) << "recover_primary " << missing.get_items() << dendl;
11648
11649   // look at log!
11650   pg_log_entry_t *latest = 0;
11651   unsigned started = 0;
11652   int skipped = 0;
11653
11654   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11655   map<version_t, hobject_t>::const_iterator p =
11656     missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11657   while (p != missing.get_rmissing().end()) {
11658     handle.reset_tp_timeout();
11659     hobject_t soid;
11660     version_t v = p->first;
11661
11662     if (pg_log.get_log().objects.count(p->second)) {
11663       latest = pg_log.get_log().objects.find(p->second)->second;
11664       assert(latest->is_update() || latest->is_delete());
11665       soid = latest->soid;
11666     } else {
11667       latest = 0;
11668       soid = p->second;
11669     }
11670     const pg_missing_item& item = missing.get_items().find(p->second)->second;
11671     ++p;
11672
11673     hobject_t head = soid.get_head();
11674
11675     eversion_t need = item.need;
11676
11677     dout(10) << "recover_primary "
11678              << soid << " " << item.need
11679              << (missing.is_missing(soid) ? " (missing)":"")
11680              << (missing.is_missing(head) ? " (missing head)":"")
11681              << (recovering.count(soid) ? " (recovering)":"")
11682              << (recovering.count(head) ? " (recovering head)":"")
11683              << dendl;
11684
11685     if (latest) {
11686       switch (latest->op) {
11687       case pg_log_entry_t::CLONE:
11688         /*
11689          * Handling for this special case removed for now, until we
11690          * can correctly construct an accurate SnapSet from the old
11691          * one.
11692          */
11693         break;
11694
11695       case pg_log_entry_t::LOST_REVERT:
11696         {
11697           if (item.have == latest->reverting_to) {
11698             ObjectContextRef obc = get_object_context(soid, true);
11699
11700             if (obc->obs.oi.version == latest->version) {
11701               // I'm already reverting
11702               dout(10) << " already reverting " << soid << dendl;
11703             } else {
11704               dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11705               obc->ondisk_write_lock();
11706               obc->obs.oi.version = latest->version;
11707
11708               ObjectStore::Transaction t;
11709               bufferlist b2;
11710               obc->obs.oi.encode(
11711                 b2,
11712                 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11713               assert(!pool.info.require_rollback());
11714               t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11715
11716               recover_got(soid, latest->version);
11717               missing_loc.add_location(soid, pg_whoami);
11718
11719               ++active_pushes;
11720
11721               osd->store->queue_transaction(osr.get(), std::move(t),
11722                                             new C_OSD_AppliedRecoveredObject(this, obc),
11723                                             new C_OSD_CommittedPushedObject(
11724                                               this,
11725                                               get_osdmap()->get_epoch(),
11726                                               info.last_complete),
11727                                             new C_OSD_OndiskWriteUnlock(obc));
11728               continue;
11729             }
11730           } else {
11731             /*
11732              * Pull the old version of the object.  Update missing_loc here to have the location
11733              * of the version we want.
11734              *
11735              * This doesn't use the usual missing_loc paths, but that's okay:
11736              *  - if we have it locally, we hit the case above, and go from there.
11737              *  - if we don't, we always pass through this case during recovery and set up the location
11738              *    properly.
11739              *  - this way we don't need to mangle the missing code to be general about needing an old
11740              *    version...
11741              */
11742             eversion_t alternate_need = latest->reverting_to;
11743             dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11744
11745             for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11746                  p != peer_missing.end();
11747                  ++p)
11748               if (p->second.is_missing(soid, need) &&
11749                   p->second.get_items().at(soid).have == alternate_need) {
11750                 missing_loc.add_location(soid, p->first);
11751               }
11752             dout(10) << " will pull " << alternate_need << " or " << need
11753                      << " from one of " << missing_loc.get_locations(soid)
11754                      << dendl;
11755           }
11756         }
11757         break;
11758       }
11759     }
11760
11761     if (!recovering.count(soid)) {
11762       if (recovering.count(head)) {
11763         ++skipped;
11764       } else {
11765         int r = recover_missing(
11766           soid, need, get_recovery_op_priority(), h);
11767         switch (r) {
11768         case PULL_YES:
11769           ++started;
11770           break;
11771         case PULL_OTHER:
11772           ++started;
11773         case PULL_NONE:
11774           ++skipped;
11775           break;
11776         default:
11777           ceph_abort();
11778         }
11779         if (started >= max)
11780           break;
11781       }
11782     }
11783
11784     // only advance last_requested if we haven't skipped anything
11785     if (!skipped)
11786       pg_log.set_last_requested(v);
11787   }
11788
11789   pgbackend->run_recovery_op(h, get_recovery_op_priority());
11790   return started;
11791 }
11792
11793 bool PrimaryLogPG::primary_error(
11794   const hobject_t& soid, eversion_t v)
11795 {
11796   pg_log.missing_add(soid, v, eversion_t());
11797   pg_log.set_last_requested(0);
11798   missing_loc.remove_location(soid, pg_whoami);
11799   bool uhoh = true;
11800   assert(!actingbackfill.empty());
11801   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11802        i != actingbackfill.end();
11803        ++i) {
11804     if (*i == get_primary()) continue;
11805     pg_shard_t peer = *i;
11806     if (!peer_missing[peer].is_missing(soid, v)) {
11807       missing_loc.add_location(soid, peer);
11808       dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11809                << ", there should be a copy on shard " << peer << dendl;
11810       uhoh = false;
11811     }
11812   }
11813   if (uhoh)
11814     osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11815   else
11816     osd->clog->error() << info.pgid << " missing primary copy of " << soid
11817                          << ", will try copies on " << missing_loc.get_locations(soid);
11818   return uhoh;
11819 }
11820
11821 int PrimaryLogPG::prep_object_replica_deletes(
11822   const hobject_t& soid, eversion_t v,
11823   PGBackend::RecoveryHandle *h)
11824 {
11825   assert(is_primary());
11826   dout(10) << __func__ << ": on " << soid << dendl;
11827
11828   start_recovery_op(soid);
11829   assert(!recovering.count(soid));
11830   recovering.insert(make_pair(soid, ObjectContextRef()));
11831
11832   pgbackend->recover_delete_object(soid, v, h);
11833   return 1;
11834 }
11835
11836 int PrimaryLogPG::prep_object_replica_pushes(
11837   const hobject_t& soid, eversion_t v,
11838   PGBackend::RecoveryHandle *h)
11839 {
11840   assert(is_primary());
11841   dout(10) << __func__ << ": on " << soid << dendl;
11842
11843   // NOTE: we know we will get a valid oloc off of disk here.
11844   ObjectContextRef obc = get_object_context(soid, false);
11845   if (!obc) {
11846     primary_error(soid, v);
11847     return 0;
11848   }
11849
11850   if (!obc->get_recovery_read()) {
11851     dout(20) << "recovery delayed on " << soid
11852              << "; could not get rw_manager lock" << dendl;
11853     return 0;
11854   } else {
11855     dout(20) << "recovery got recovery read lock on " << soid
11856              << dendl;
11857   }
11858
11859   start_recovery_op(soid);
11860   assert(!recovering.count(soid));
11861   recovering.insert(make_pair(soid, obc));
11862
11863   /* We need this in case there is an in progress write on the object.  In fact,
11864    * the only possible write is an update to the xattr due to a lost_revert --
11865    * a client write would be blocked since the object is degraded.
11866    * In almost all cases, therefore, this lock should be uncontended.
11867    */
11868   obc->ondisk_read_lock();
11869   int r = pgbackend->recover_object(
11870     soid,
11871     v,
11872     ObjectContextRef(),
11873     obc, // has snapset context
11874     h);
11875   obc->ondisk_read_unlock();
11876   if (r < 0) {
11877     dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11878     primary_failed(soid);
11879     primary_error(soid, v);
11880     return 0;
11881   }
11882   return 1;
11883 }
11884
11885 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11886 {
11887   dout(10) << __func__ << "(" << max << ")" << dendl;
11888   uint64_t started = 0;
11889
11890   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11891
11892   // this is FAR from an optimal recovery order.  pretty lame, really.
11893   assert(!actingbackfill.empty());
11894   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11895        i != actingbackfill.end();
11896        ++i) {
11897     if (*i == get_primary()) continue;
11898     pg_shard_t peer = *i;
11899     map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11900     assert(pm != peer_missing.end());
11901     map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11902     assert(pi != peer_info.end());
11903     size_t m_sz = pm->second.num_missing();
11904
11905     dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11906     dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11907
11908     // oldest first!
11909     const pg_missing_t &m(pm->second);
11910     for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11911          p != m.get_rmissing().end() && started < max;
11912            ++p) {
11913       handle.reset_tp_timeout();
11914       const hobject_t soid(p->second);
11915
11916       if (missing_loc.is_unfound(soid)) {
11917         dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11918         continue;
11919       }
11920
11921       if (soid > pi->second.last_backfill) {
11922         if (!recovering.count(soid)) {
11923           derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11924           derr << __func__ << ": object added to missing set for backfill, but "
11925                << "is not in recovering, error!" << dendl;
11926           ceph_abort();
11927         }
11928         continue;
11929       }
11930
11931       if (recovering.count(soid)) {
11932         dout(10) << __func__ << ": already recovering " << soid << dendl;
11933         continue;
11934       }
11935
11936       if (missing_loc.is_deleted(soid)) {
11937         dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11938         map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11939         started += prep_object_replica_deletes(soid, r->second.need, h);
11940         continue;
11941       }
11942
11943       if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11944         dout(10) << __func__ << ": " << soid.get_head()
11945                  << " still missing on primary" << dendl;
11946         continue;
11947       }
11948
11949       if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11950         dout(10) << __func__ << ": " << soid.get_snapdir()
11951                  << " still missing on primary" << dendl;
11952         continue;
11953       }
11954
11955       if (pg_log.get_missing().is_missing(soid)) {
11956         dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11957         continue;
11958       }
11959
11960       dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11961       map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11962       started += prep_object_replica_pushes(soid, r->second.need,
11963                                             h);
11964     }
11965   }
11966
11967   pgbackend->run_recovery_op(h, get_recovery_op_priority());
11968   return started;
11969 }
11970
11971 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11972 {
11973   hobject_t e = hobject_t::get_max();
11974   for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11975        i != backfill_targets.end();
11976        ++i) {
11977     pg_shard_t peer = *i;
11978     map<pg_shard_t, BackfillInterval>::const_iterator iter =
11979       peer_backfill_info.find(peer);
11980     assert(iter != peer_backfill_info.end());
11981     if (iter->second.begin < e)
11982       e = iter->second.begin;
11983   }
11984   return e;
11985 }
11986
11987 bool PrimaryLogPG::all_peer_done() const
11988 {
11989   // Primary hasn't got any more objects
11990   assert(backfill_info.empty());
11991
11992   for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11993        i != backfill_targets.end();
11994        ++i) {
11995     pg_shard_t bt = *i;
11996     map<pg_shard_t, BackfillInterval>::const_iterator piter =
11997       peer_backfill_info.find(bt);
11998     assert(piter != peer_backfill_info.end());
11999     const BackfillInterval& pbi = piter->second;
12000     // See if peer has more to process
12001     if (!pbi.extends_to_end() || !pbi.empty())
12002         return false;
12003   }
12004   return true;
12005 }
12006
12007 /**
12008  * recover_backfill
12009  *
12010  * Invariants:
12011  *
12012  * backfilled: fully pushed to replica or present in replica's missing set (both
12013  * our copy and theirs).
12014  *
12015  * All objects on a backfill_target in
12016  * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12017  * objects have been actually deleted and all logically-valid objects are replicated.
12018  * There may be PG objects in this interval yet to be backfilled.
12019  *
12020  * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12021  * backfill_targets.  There may be objects on backfill_target(s) yet to be deleted.
12022  *
12023  * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
12024  *     backfill_info.begin) in PG are backfilled.  No deleted objects in this
12025  * interval remain on the backfill target.
12026  *
12027  * For a backfill target, all objects <= peer_info[target].last_backfill
12028  * have been backfilled to target
12029  *
12030  * There *MAY* be missing/outdated objects between last_backfill_started and
12031  * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12032  * io created objects since the last scan.  For this reason, we call
12033  * update_range() again before continuing backfill.
12034  */
12035 uint64_t PrimaryLogPG::recover_backfill(
12036   uint64_t max,
12037   ThreadPool::TPHandle &handle, bool *work_started)
12038 {
12039   dout(10) << "recover_backfill (" << max << ")"
12040            << " bft=" << backfill_targets
12041            << " last_backfill_started " << last_backfill_started
12042            << (new_backfill ? " new_backfill":"")
12043            << dendl;
12044   assert(!backfill_targets.empty());
12045
12046   // Initialize from prior backfill state
12047   if (new_backfill) {
12048     // on_activate() was called prior to getting here
12049     assert(last_backfill_started == earliest_backfill());
12050     new_backfill = false;
12051
12052     // initialize BackfillIntervals
12053     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12054          i != backfill_targets.end();
12055          ++i) {
12056       peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
12057     }
12058     backfill_info.reset(last_backfill_started);
12059
12060     backfills_in_flight.clear();
12061     pending_backfill_updates.clear();
12062   }
12063
12064   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12065        i != backfill_targets.end();
12066        ++i) {
12067     dout(10) << "peer osd." << *i
12068            << " info " << peer_info[*i]
12069            << " interval " << peer_backfill_info[*i].begin
12070            << "-" << peer_backfill_info[*i].end
12071            << " " << peer_backfill_info[*i].objects.size() << " objects"
12072            << dendl;
12073   }
12074
12075   // update our local interval to cope with recent changes
12076   backfill_info.begin = last_backfill_started;
12077   update_range(&backfill_info, handle);
12078
12079   unsigned ops = 0;
12080   vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
12081   set<hobject_t> add_to_stat;
12082
12083   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12084        i != backfill_targets.end();
12085        ++i) {
12086     peer_backfill_info[*i].trim_to(
12087       std::max(peer_info[*i].last_backfill, last_backfill_started));
12088   }
12089   backfill_info.trim_to(last_backfill_started);
12090
12091   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
12092   while (ops < max) {
12093     if (backfill_info.begin <= earliest_peer_backfill() &&
12094         !backfill_info.extends_to_end() && backfill_info.empty()) {
12095       hobject_t next = backfill_info.end;
12096       backfill_info.reset(next);
12097       backfill_info.end = hobject_t::get_max();
12098       update_range(&backfill_info, handle);
12099       backfill_info.trim();
12100     }
12101
12102     dout(20) << "   my backfill interval " << backfill_info << dendl;
12103
12104     bool sent_scan = false;
12105     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12106          i != backfill_targets.end();
12107          ++i) {
12108       pg_shard_t bt = *i;
12109       BackfillInterval& pbi = peer_backfill_info[bt];
12110
12111       dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
12112       if (pbi.begin <= backfill_info.begin &&
12113           !pbi.extends_to_end() && pbi.empty()) {
12114         dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
12115         epoch_t e = get_osdmap()->get_epoch();
12116         MOSDPGScan *m = new MOSDPGScan(
12117           MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
12118           spg_t(info.pgid.pgid, bt.shard),
12119           pbi.end, hobject_t());
12120         osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12121         assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
12122         waiting_on_backfill.insert(bt);
12123         sent_scan = true;
12124       }
12125     }
12126
12127     // Count simultaneous scans as a single op and let those complete
12128     if (sent_scan) {
12129       ops++;
12130       start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12131       break;
12132     }
12133
12134     if (backfill_info.empty() && all_peer_done()) {
12135       dout(10) << " reached end for both local and all peers" << dendl;
12136       break;
12137     }
12138
12139     // Get object within set of peers to operate on and
12140     // the set of targets for which that object applies.
12141     hobject_t check = earliest_peer_backfill();
12142
12143     if (check < backfill_info.begin) {
12144
12145       set<pg_shard_t> check_targets;
12146       for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12147            i != backfill_targets.end();
12148            ++i) {
12149         pg_shard_t bt = *i;
12150         BackfillInterval& pbi = peer_backfill_info[bt];
12151         if (pbi.begin == check)
12152           check_targets.insert(bt);
12153       }
12154       assert(!check_targets.empty());
12155
12156       dout(20) << " BACKFILL removing " << check
12157                << " from peers " << check_targets << dendl;
12158       for (set<pg_shard_t>::iterator i = check_targets.begin();
12159            i != check_targets.end();
12160            ++i) {
12161         pg_shard_t bt = *i;
12162         BackfillInterval& pbi = peer_backfill_info[bt];
12163         assert(pbi.begin == check);
12164
12165         to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12166         pbi.pop_front();
12167       }
12168
12169       /* This requires a bit of explanation.  We compare head against
12170        * last_backfill to determine whether to send an operation
12171        * to the replica.  A single write operation can touch up to three
12172        * objects: head, the snapdir, and a new clone which sorts closer to
12173        * head than any existing clone.  If last_backfill points at a clone,
12174        * the transaction won't be sent and all 3 must lie on the right side
12175        * of the line (i.e., we'll backfill them later).  If last_backfill
12176        * points at snapdir, it sorts greater than head, so we send the
12177        * transaction which is correct because all three must lie to the left
12178        * of the line.
12179        *
12180        * If it points at head, we have a bit of an issue.  If head actually
12181        * exists, no problem, because any transaction which touches snapdir
12182        * must end up creating it (and deleting head), so sending the
12183        * operation won't pose a problem -- we'll end up having to scan it,
12184        * but it'll end up being the right version so we won't bother to
12185        * rebackfill it.  However, if head doesn't exist, any write on head
12186        * will remove snapdir.  For a replicated pool, this isn't a problem,
12187        * ENOENT on remove isn't an issue and it's in backfill future anyway.
12188        * It only poses a problem for EC pools, because we never just delete
12189        * an object, we rename it into a rollback object.  That operation
12190        * will end up crashing the osd with ENOENT.  Tolerating the failure
12191        * wouldn't work either, even if snapdir exists, we'd be creating a
12192        * rollback object past the last_backfill line which wouldn't get
12193        * cleaned up (no rollback objects past the last_backfill line is an
12194        * existing important invariant).  Thus, let's avoid the whole issue
12195        * by just not updating last_backfill_started here if head doesn't
12196        * exist and snapdir does.  We aren't using up a recovery count here,
12197        * so we're going to recover snapdir immediately anyway.  We'll only
12198        * fail "backward" if we fail to get the rw lock and that just means
12199        * we'll re-process this section of the hash space again.
12200        *
12201        * I'm choosing this hack here because the really "correct" answer is
12202        * going to be to unify snapdir and head into a single object (a
12203        * snapdir is really just a confusing way to talk about head existing
12204        * as a whiteout), but doing that is going to be a somewhat larger
12205        * undertaking.
12206        *
12207        * @see http://tracker.ceph.com/issues/17668
12208        */
12209       if (!(check.is_head() &&
12210             backfill_info.begin.is_snapdir() &&
12211             check == backfill_info.begin.get_head()))
12212         last_backfill_started = check;
12213
12214       // Don't increment ops here because deletions
12215       // are cheap and not replied to unlike real recovery_ops,
12216       // and we can't increment ops without requeueing ourself
12217       // for recovery.
12218     } else {
12219       eversion_t& obj_v = backfill_info.objects.begin()->second;
12220
12221       vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12222       for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12223            i != backfill_targets.end();
12224            ++i) {
12225         pg_shard_t bt = *i;
12226         BackfillInterval& pbi = peer_backfill_info[bt];
12227         // Find all check peers that have the wrong version
12228         if (check == backfill_info.begin && check == pbi.begin) {
12229           if (pbi.objects.begin()->second != obj_v) {
12230             need_ver_targs.push_back(bt);
12231           } else {
12232             keep_ver_targs.push_back(bt);
12233           }
12234         } else {
12235           pg_info_t& pinfo = peer_info[bt];
12236
12237           // Only include peers that we've caught up to their backfill line
12238           // otherwise, they only appear to be missing this object
12239           // because their pbi.begin > backfill_info.begin.
12240           if (backfill_info.begin > pinfo.last_backfill)
12241             missing_targs.push_back(bt);
12242           else
12243             skip_targs.push_back(bt);
12244         }
12245       }
12246
12247       if (!keep_ver_targs.empty()) {
12248         // These peers have version obj_v
12249         dout(20) << " BACKFILL keeping " << check
12250                  << " with ver " << obj_v
12251                  << " on peers " << keep_ver_targs << dendl;
12252         //assert(!waiting_for_degraded_object.count(check));
12253       }
12254       if (!need_ver_targs.empty() || !missing_targs.empty()) {
12255         ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12256         assert(obc);
12257         if (obc->get_recovery_read()) {
12258           if (!need_ver_targs.empty()) {
12259             dout(20) << " BACKFILL replacing " << check
12260                    << " with ver " << obj_v
12261                    << " to peers " << need_ver_targs << dendl;
12262           }
12263           if (!missing_targs.empty()) {
12264             dout(20) << " BACKFILL pushing " << backfill_info.begin
12265                  << " with ver " << obj_v
12266                  << " to peers " << missing_targs << dendl;
12267           }
12268           vector<pg_shard_t> all_push = need_ver_targs;
12269           all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12270
12271           handle.reset_tp_timeout();
12272           int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12273           if (r < 0) {
12274             *work_started = true;
12275             dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12276             break;
12277           }
12278           ops++;
12279         } else {
12280           *work_started = true;
12281           dout(20) << "backfill blocking on " << backfill_info.begin
12282                    << "; could not get rw_manager lock" << dendl;
12283           break;
12284         }
12285       }
12286       dout(20) << "need_ver_targs=" << need_ver_targs
12287                << " keep_ver_targs=" << keep_ver_targs << dendl;
12288       dout(20) << "backfill_targets=" << backfill_targets
12289                << " missing_targs=" << missing_targs
12290                << " skip_targs=" << skip_targs << dendl;
12291
12292       last_backfill_started = backfill_info.begin;
12293       add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12294       backfill_info.pop_front();
12295       vector<pg_shard_t> check_targets = need_ver_targs;
12296       check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12297       for (vector<pg_shard_t>::iterator i = check_targets.begin();
12298            i != check_targets.end();
12299            ++i) {
12300         pg_shard_t bt = *i;
12301         BackfillInterval& pbi = peer_backfill_info[bt];
12302         pbi.pop_front();
12303       }
12304     }
12305   }
12306
12307   hobject_t backfill_pos =
12308     std::min(backfill_info.begin, earliest_peer_backfill());
12309
12310   for (set<hobject_t>::iterator i = add_to_stat.begin();
12311        i != add_to_stat.end();
12312        ++i) {
12313     ObjectContextRef obc = get_object_context(*i, false);
12314     assert(obc);
12315     pg_stat_t stat;
12316     add_object_context_to_pg_stat(obc, &stat);
12317     pending_backfill_updates[*i] = stat;
12318   }
12319   if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12320     map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12321     for (unsigned i = 0; i < to_remove.size(); ++i) {
12322       handle.reset_tp_timeout();
12323       const hobject_t& oid = to_remove[i].get<0>();
12324       eversion_t v = to_remove[i].get<1>();
12325       pg_shard_t peer = to_remove[i].get<2>();
12326       MOSDPGBackfillRemove *m;
12327       auto it = reqs.find(peer);
12328       if (it != reqs.end()) {
12329         m = it->second;
12330       } else {
12331         m = reqs[peer] = new MOSDPGBackfillRemove(
12332           spg_t(info.pgid.pgid, peer.shard),
12333           get_osdmap()->get_epoch());
12334       }
12335       m->ls.push_back(make_pair(oid, v));
12336
12337       if (oid <= last_backfill_started)
12338         pending_backfill_updates[oid]; // add empty stat!
12339     }
12340     for (auto p : reqs) {
12341       osd->send_message_osd_cluster(p.first.osd, p.second,
12342                                     get_osdmap()->get_epoch());
12343     }
12344   } else {
12345     // for jewel targets
12346     for (unsigned i = 0; i < to_remove.size(); ++i) {
12347       handle.reset_tp_timeout();
12348
12349       // ordered before any subsequent updates
12350       send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12351                      to_remove[i].get<2>());
12352
12353       if (to_remove[i].get<0>() <= last_backfill_started)
12354         pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12355     }
12356   }
12357
12358   pgbackend->run_recovery_op(h, get_recovery_op_priority());
12359
12360   dout(5) << "backfill_pos is " << backfill_pos << dendl;
12361   for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12362        i != backfills_in_flight.end();
12363        ++i) {
12364     dout(20) << *i << " is still in flight" << dendl;
12365   }
12366
12367   hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12368     backfill_pos : *(backfills_in_flight.begin());
12369   hobject_t new_last_backfill = earliest_backfill();
12370   dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12371   for (map<hobject_t, pg_stat_t>::iterator i =
12372          pending_backfill_updates.begin();
12373        i != pending_backfill_updates.end() &&
12374          i->first < next_backfill_to_complete;
12375        pending_backfill_updates.erase(i++)) {
12376     dout(20) << " pending_backfill_update " << i->first << dendl;
12377     assert(i->first > new_last_backfill);
12378     for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12379          j != backfill_targets.end();
12380          ++j) {
12381       pg_shard_t bt = *j;
12382       pg_info_t& pinfo = peer_info[bt];
12383       //Add stats to all peers that were missing object
12384       if (i->first > pinfo.last_backfill)
12385         pinfo.stats.add(i->second);
12386     }
12387     new_last_backfill = i->first;
12388   }
12389   dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12390
12391   assert(!pending_backfill_updates.empty() ||
12392          new_last_backfill == last_backfill_started);
12393   if (pending_backfill_updates.empty() &&
12394       backfill_pos.is_max()) {
12395     assert(backfills_in_flight.empty());
12396     new_last_backfill = backfill_pos;
12397     last_backfill_started = backfill_pos;
12398   }
12399   dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12400
12401   // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12402   // all the backfill targets.  Otherwise, we will move last_backfill up on
12403   // those targets need it and send OP_BACKFILL_PROGRESS to them.
12404   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12405        i != backfill_targets.end();
12406        ++i) {
12407     pg_shard_t bt = *i;
12408     pg_info_t& pinfo = peer_info[bt];
12409
12410     if (new_last_backfill > pinfo.last_backfill) {
12411       pinfo.set_last_backfill(new_last_backfill);
12412       epoch_t e = get_osdmap()->get_epoch();
12413       MOSDPGBackfill *m = NULL;
12414       if (pinfo.last_backfill.is_max()) {
12415         m = new MOSDPGBackfill(
12416           MOSDPGBackfill::OP_BACKFILL_FINISH,
12417           e,
12418           last_peering_reset,
12419           spg_t(info.pgid.pgid, bt.shard));
12420         // Use default priority here, must match sub_op priority
12421         /* pinfo.stats might be wrong if we did log-based recovery on the
12422          * backfilled portion in addition to continuing backfill.
12423          */
12424         pinfo.stats = info.stats;
12425         start_recovery_op(hobject_t::get_max());
12426       } else {
12427         m = new MOSDPGBackfill(
12428           MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12429           e,
12430           last_peering_reset,
12431           spg_t(info.pgid.pgid, bt.shard));
12432         // Use default priority here, must match sub_op priority
12433       }
12434       m->last_backfill = pinfo.last_backfill;
12435       m->stats = pinfo.stats;
12436       osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12437       dout(10) << " peer " << bt
12438                << " num_objects now " << pinfo.stats.stats.sum.num_objects
12439                << " / " << info.stats.stats.sum.num_objects << dendl;
12440     }
12441   }
12442
12443   if (ops)
12444     *work_started = true;
12445   return ops;
12446 }
12447
12448 int PrimaryLogPG::prep_backfill_object_push(
12449   hobject_t oid, eversion_t v,
12450   ObjectContextRef obc,
12451   vector<pg_shard_t> peers,
12452   PGBackend::RecoveryHandle *h)
12453 {
12454   dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12455   assert(!peers.empty());
12456
12457   backfills_in_flight.insert(oid);
12458   for (unsigned int i = 0 ; i < peers.size(); ++i) {
12459     map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12460     assert(bpm != peer_missing.end());
12461     bpm->second.add(oid, eversion_t(), eversion_t(), false);
12462   }
12463
12464   assert(!recovering.count(oid));
12465
12466   start_recovery_op(oid);
12467   recovering.insert(make_pair(oid, obc));
12468
12469   // We need to take the read_lock here in order to flush in-progress writes
12470   obc->ondisk_read_lock();
12471   int r = pgbackend->recover_object(
12472     oid,
12473     v,
12474     ObjectContextRef(),
12475     obc,
12476     h);
12477   obc->ondisk_read_unlock();
12478   if (r < 0) {
12479     dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12480     primary_failed(oid);
12481     primary_error(oid, v);
12482     backfills_in_flight.erase(oid);
12483     missing_loc.add_missing(oid, v, eversion_t());
12484   }
12485   return r;
12486 }
12487
12488 void PrimaryLogPG::update_range(
12489   BackfillInterval *bi,
12490   ThreadPool::TPHandle &handle)
12491 {
12492   int local_min = cct->_conf->osd_backfill_scan_min;
12493   int local_max = cct->_conf->osd_backfill_scan_max;
12494
12495   if (bi->version < info.log_tail) {
12496     dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12497              << dendl;
12498     osr->flush();
12499     if (last_update_applied >= info.log_tail) {
12500       bi->version = last_update_applied;
12501     } else {
12502       bi->version = info.last_update;
12503     }
12504     scan_range(local_min, local_max, bi, handle);
12505   }
12506
12507   if (bi->version >= projected_last_update) {
12508     dout(10) << __func__<< ": bi is current " << dendl;
12509     assert(bi->version == projected_last_update);
12510   } else if (bi->version >= info.log_tail) {
12511     if (pg_log.get_log().empty() && projected_log.empty()) {
12512       /* Because we don't move log_tail on split, the log might be
12513        * empty even if log_tail != last_update.  However, the only
12514        * way to get here with an empty log is if log_tail is actually
12515        * eversion_t(), because otherwise the entry which changed
12516        * last_update since the last scan would have to be present.
12517        */
12518       assert(bi->version == eversion_t());
12519       return;
12520     }
12521
12522     dout(10) << __func__<< ": bi is old, (" << bi->version
12523              << ") can be updated with log to projected_last_update "
12524              << projected_last_update << dendl;
12525
12526     auto func = [&](const pg_log_entry_t &e) {
12527       dout(10) << __func__ << ": updating from version " << e.version
12528                << dendl;
12529       const hobject_t &soid = e.soid;
12530       if (soid >= bi->begin &&
12531           soid < bi->end) {
12532         if (e.is_update()) {
12533           dout(10) << __func__ << ": " << e.soid << " updated to version "
12534                    << e.version << dendl;
12535           bi->objects.erase(e.soid);
12536           bi->objects.insert(
12537             make_pair(
12538               e.soid,
12539               e.version));
12540         } else if (e.is_delete()) {
12541           dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12542           bi->objects.erase(e.soid);
12543         }
12544       }
12545     };
12546     dout(10) << "scanning pg log first" << dendl;
12547     pg_log.get_log().scan_log_after(bi->version, func);
12548     dout(10) << "scanning projected log" << dendl;
12549     projected_log.scan_log_after(bi->version, func);
12550     bi->version = projected_last_update;
12551   } else {
12552     assert(0 == "scan_range should have raised bi->version past log_tail");
12553   }
12554 }
12555
12556 void PrimaryLogPG::scan_range(
12557   int min, int max, BackfillInterval *bi,
12558   ThreadPool::TPHandle &handle)
12559 {
12560   assert(is_locked());
12561   dout(10) << "scan_range from " << bi->begin << dendl;
12562   bi->clear_objects();
12563
12564   vector<hobject_t> ls;
12565   ls.reserve(max);
12566   int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12567   assert(r >= 0);
12568   dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12569   dout(20) << ls << dendl;
12570
12571   for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12572     handle.reset_tp_timeout();
12573     ObjectContextRef obc;
12574     if (is_primary())
12575       obc = object_contexts.lookup(*p);
12576     if (obc) {
12577       bi->objects[*p] = obc->obs.oi.version;
12578       dout(20) << "  " << *p << " " << obc->obs.oi.version << dendl;
12579     } else {
12580       bufferlist bl;
12581       int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12582
12583       /* If the object does not exist here, it must have been removed
12584          * between the collection_list_partial and here.  This can happen
12585          * for the first item in the range, which is usually last_backfill.
12586          */
12587       if (r == -ENOENT)
12588         continue;
12589
12590       assert(r >= 0);
12591       object_info_t oi(bl);
12592       bi->objects[*p] = oi.version;
12593       dout(20) << "  " << *p << " " << oi.version << dendl;
12594     }
12595   }
12596 }
12597
12598
12599 /** check_local
12600  *
12601  * verifies that stray objects have been deleted
12602  */
12603 void PrimaryLogPG::check_local()
12604 {
12605   dout(10) << __func__ << dendl;
12606
12607   assert(info.last_update >= pg_log.get_tail());  // otherwise we need some help!
12608
12609   if (!cct->_conf->osd_debug_verify_stray_on_activate)
12610     return;
12611
12612   // just scan the log.
12613   set<hobject_t> did;
12614   for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12615        p != pg_log.get_log().log.rend();
12616        ++p) {
12617     if (did.count(p->soid))
12618       continue;
12619     did.insert(p->soid);
12620
12621     if (p->is_delete() && !is_missing_object(p->soid)) {
12622       dout(10) << " checking " << p->soid
12623                << " at " << p->version << dendl;
12624       struct stat st;
12625       int r = osd->store->stat(
12626         ch,
12627         ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12628         &st);
12629       if (r != -ENOENT) {
12630         derr << __func__ << " " << p->soid << " exists, but should have been "
12631              << "deleted" << dendl;
12632         assert(0 == "erroneously present object");
12633       }
12634     } else {
12635       // ignore old(+missing) objects
12636     }
12637   }
12638 }
12639
12640
12641
12642 // ===========================
12643 // hit sets
12644
12645 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12646 {
12647   ostringstream ss;
12648   ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12649   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12650                  info.pgid.ps(), info.pgid.pool(),
12651                  cct->_conf->osd_hit_set_namespace);
12652   dout(20) << __func__ << " " << hoid << dendl;
12653   return hoid;
12654 }
12655
12656 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12657                                                    utime_t end,
12658                                                    bool using_gmt)
12659 {
12660   ostringstream ss;
12661   ss << "hit_set_" << info.pgid.pgid << "_archive_";
12662   if (using_gmt) {
12663     start.gmtime(ss) << "_";
12664     end.gmtime(ss);
12665   } else {
12666     start.localtime(ss) << "_";
12667     end.localtime(ss);
12668   }
12669   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12670                  info.pgid.ps(), info.pgid.pool(),
12671                  cct->_conf->osd_hit_set_namespace);
12672   dout(20) << __func__ << " " << hoid << dendl;
12673   return hoid;
12674 }
12675
12676 void PrimaryLogPG::hit_set_clear()
12677 {
12678   dout(20) << __func__ << dendl;
12679   hit_set.reset();
12680   hit_set_start_stamp = utime_t();
12681 }
12682
12683 void PrimaryLogPG::hit_set_setup()
12684 {
12685   if (!is_active() ||
12686       !is_primary()) {
12687     hit_set_clear();
12688     return;
12689   }
12690
12691   if (is_active() && is_primary() &&
12692       (!pool.info.hit_set_count ||
12693        !pool.info.hit_set_period ||
12694        pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12695     hit_set_clear();
12696
12697     // only primary is allowed to remove all the hit set objects
12698     hit_set_remove_all();
12699     return;
12700   }
12701
12702   // FIXME: discard any previous data for now
12703   hit_set_create();
12704
12705   // include any writes we know about from the pg log.  this doesn't
12706   // capture reads, but it is better than nothing!
12707   hit_set_apply_log();
12708 }
12709
12710 void PrimaryLogPG::hit_set_remove_all()
12711 {
12712   // If any archives are degraded we skip this
12713   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12714        p != info.hit_set.history.end();
12715        ++p) {
12716     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12717
12718     // Once we hit a degraded object just skip
12719     if (is_degraded_or_backfilling_object(aoid))
12720       return;
12721     if (write_blocked_by_scrub(aoid))
12722       return;
12723   }
12724
12725   if (!info.hit_set.history.empty()) {
12726     list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12727     assert(p != info.hit_set.history.rend());
12728     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12729     assert(!is_degraded_or_backfilling_object(oid));
12730     ObjectContextRef obc = get_object_context(oid, false);
12731     assert(obc);
12732
12733     OpContextUPtr ctx = simple_opc_create(obc);
12734     ctx->at_version = get_next_version();
12735     ctx->updated_hset_history = info.hit_set;
12736     utime_t now = ceph_clock_now();
12737     ctx->mtime = now;
12738     hit_set_trim(ctx, 0);
12739     simple_opc_submit(std::move(ctx));
12740   }
12741
12742   info.hit_set = pg_hit_set_history_t();
12743   if (agent_state) {
12744     agent_state->discard_hit_sets();
12745   }
12746 }
12747
12748 void PrimaryLogPG::hit_set_create()
12749 {
12750   utime_t now = ceph_clock_now();
12751   // make a copy of the params to modify
12752   HitSet::Params params(pool.info.hit_set_params);
12753
12754   dout(20) << __func__ << " " << params << dendl;
12755   if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12756     BloomHitSet::Params *p =
12757       static_cast<BloomHitSet::Params*>(params.impl.get());
12758
12759     // convert false positive rate so it holds up across the full period
12760     p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12761     if (p->get_fpp() <= 0.0)
12762       p->set_fpp(.01);  // fpp cannot be zero!
12763
12764     // if we don't have specified size, estimate target size based on the
12765     // previous bin!
12766     if (p->target_size == 0 && hit_set) {
12767       utime_t dur = now - hit_set_start_stamp;
12768       unsigned unique = hit_set->approx_unique_insert_count();
12769       dout(20) << __func__ << " previous set had approx " << unique
12770                << " unique items over " << dur << " seconds" << dendl;
12771       p->target_size = (double)unique * (double)pool.info.hit_set_period
12772                      / (double)dur;
12773     }
12774     if (p->target_size <
12775         static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12776       p->target_size = cct->_conf->osd_hit_set_min_size;
12777
12778     if (p->target_size
12779         > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12780       p->target_size = cct->_conf->osd_hit_set_max_size;
12781
12782     p->seed = now.sec();
12783
12784     dout(10) << __func__ << " target_size " << p->target_size
12785              << " fpp " << p->get_fpp() << dendl;
12786   }
12787   hit_set.reset(new HitSet(params));
12788   hit_set_start_stamp = now;
12789 }
12790
12791 /**
12792  * apply log entries to set
12793  *
12794  * this would only happen after peering, to at least capture writes
12795  * during an interval that was potentially lost.
12796  */
12797 bool PrimaryLogPG::hit_set_apply_log()
12798 {
12799   if (!hit_set)
12800     return false;
12801
12802   eversion_t to = info.last_update;
12803   eversion_t from = info.hit_set.current_last_update;
12804   if (to <= from) {
12805     dout(20) << __func__ << " no update" << dendl;
12806     return false;
12807   }
12808
12809   dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
12810   list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12811   while (p != pg_log.get_log().log.rend() && p->version > to)
12812     ++p;
12813   while (p != pg_log.get_log().log.rend() && p->version > from) {
12814     hit_set->insert(p->soid);
12815     ++p;
12816   }
12817
12818   return true;
12819 }
12820
12821 void PrimaryLogPG::hit_set_persist()
12822 {
12823   dout(10) << __func__  << dendl;
12824   bufferlist bl;
12825   unsigned max = pool.info.hit_set_count;
12826
12827   utime_t now = ceph_clock_now();
12828   hobject_t oid;
12829
12830   // If any archives are degraded we skip this persist request
12831   // account for the additional entry being added below
12832   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12833        p != info.hit_set.history.end();
12834        ++p) {
12835     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12836
12837     // Once we hit a degraded object just skip further trim
12838     if (is_degraded_or_backfilling_object(aoid))
12839       return;
12840     if (write_blocked_by_scrub(aoid))
12841       return;
12842   }
12843
12844   // If backfill is in progress and we could possibly overlap with the
12845   // hit_set_* objects, back off.  Since these all have
12846   // hobject_t::hash set to pgid.ps(), and those sort first, we can
12847   // look just at that.  This is necessary because our transactions
12848   // may include a modify of the new hit_set *and* a delete of the
12849   // old one, and this may span the backfill boundary.
12850   for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12851        p != backfill_targets.end();
12852        ++p) {
12853     assert(peer_info.count(*p));
12854     const pg_info_t& pi = peer_info[*p];
12855     if (pi.last_backfill == hobject_t() ||
12856         pi.last_backfill.get_hash() == info.pgid.ps()) {
12857       dout(10) << __func__ << " backfill target osd." << *p
12858                << " last_backfill has not progressed past pgid ps"
12859                << dendl;
12860       return;
12861     }
12862   }
12863
12864
12865   pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12866   new_hset.begin = hit_set_start_stamp;
12867   new_hset.end = now;
12868   oid = get_hit_set_archive_object(
12869     new_hset.begin,
12870     new_hset.end,
12871     new_hset.using_gmt);
12872
12873   // If the current object is degraded we skip this persist request
12874   if (write_blocked_by_scrub(oid))
12875     return;
12876
12877   hit_set->seal();
12878   ::encode(*hit_set, bl);
12879   dout(20) << __func__ << " archive " << oid << dendl;
12880
12881   if (agent_state) {
12882     agent_state->add_hit_set(new_hset.begin, hit_set);
12883     uint32_t size = agent_state->hit_set_map.size();
12884     if (size >= pool.info.hit_set_count) {
12885       size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12886     }
12887     hit_set_in_memory_trim(size);
12888   }
12889
12890   ObjectContextRef obc = get_object_context(oid, true);
12891   OpContextUPtr ctx = simple_opc_create(obc);
12892
12893   ctx->at_version = get_next_version();
12894   ctx->updated_hset_history = info.hit_set;
12895   pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12896
12897   updated_hit_set_hist.current_last_update = info.last_update;
12898   new_hset.version = ctx->at_version;
12899
12900   updated_hit_set_hist.history.push_back(new_hset);
12901   hit_set_create();
12902
12903   // fabricate an object_info_t and SnapSet
12904   obc->obs.oi.version = ctx->at_version;
12905   obc->obs.oi.mtime = now;
12906   obc->obs.oi.size = bl.length();
12907   obc->obs.exists = true;
12908   obc->obs.oi.set_data_digest(bl.crc32c(-1));
12909
12910   ctx->new_obs = obc->obs;
12911
12912   obc->ssc->snapset.head_exists = true;
12913   ctx->new_snapset = obc->ssc->snapset;
12914
12915   ctx->delta_stats.num_objects++;
12916   ctx->delta_stats.num_objects_hit_set_archive++;
12917   ctx->delta_stats.num_bytes += bl.length();
12918   ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12919
12920   bufferlist bss;
12921   ::encode(ctx->new_snapset, bss);
12922   bufferlist boi(sizeof(ctx->new_obs.oi));
12923   ::encode(ctx->new_obs.oi, boi,
12924            get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12925
12926   ctx->op_t->create(oid);
12927   if (bl.length()) {
12928     ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12929   }
12930   map <string, bufferlist> attrs;
12931   attrs[OI_ATTR].claim(boi);
12932   attrs[SS_ATTR].claim(bss);
12933   setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12934   ctx->log.push_back(
12935     pg_log_entry_t(
12936       pg_log_entry_t::MODIFY,
12937       oid,
12938       ctx->at_version,
12939       eversion_t(),
12940       0,
12941       osd_reqid_t(),
12942       ctx->mtime,
12943       0)
12944     );
12945
12946   hit_set_trim(ctx, max);
12947
12948   simple_opc_submit(std::move(ctx));
12949 }
12950
12951 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12952 {
        // Shrink the pending hit set history (ctx->updated_hset_history) until
        // it holds at most `max` entries.  Entries are taken from the front of
        // the list.  For each one the matching archive object is removed via
        // ctx->op_t, a DELETE entry is appended to ctx->log, and the object and
        // byte counters in ctx->delta_stats are decremented.
        //
        // ctx: op context being assembled by the caller; its
        //      updated_hset_history must already be set (asserted).
        // max: number of history entries allowed to remain.
12953   assert(ctx->updated_hset_history);
12954   pg_hit_set_history_t &updated_hit_set_hist =
12955     *(ctx->updated_hset_history);
        // num tracks the number of entries still in the list; exactly one is
        // popped per iteration.
12956   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12957     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12958     assert(p != updated_hit_set_hist.history.end());
12959     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12960
12961     assert(!is_degraded_or_backfilling_object(oid));
12962
12963     dout(20) << __func__ << " removing " << oid << dendl;
          // Every removal is logged under its own version, one past the last
          // version used in this ctx.
12964     ++ctx->at_version.version;
12965     ctx->log.push_back(
12966         pg_log_entry_t(pg_log_entry_t::DELETE,
12967                        oid,
12968                        ctx->at_version,
12969                        p->version,
12970                        0,
12971                        osd_reqid_t(),
12972                        ctx->mtime,
12973                        0));
12974
12975     ctx->op_t->remove(oid);
          // p is invalidated here; it is not used again in this iteration.
12976     updated_hit_set_hist.history.pop_front();
12977
          // The object's recorded size is needed to back its bytes out of the
          // stats.  NOTE(review): the `false` argument presumably means "do not
          // create a context if none exists" -- confirm against
          // get_object_context().
12978     ObjectContextRef obc = get_object_context(oid, false);
12979     assert(obc);
12980     --ctx->delta_stats.num_objects;
12981     --ctx->delta_stats.num_objects_hit_set_archive;
12982     ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12983     ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12984   }
12985 }
12986
12987 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12988 {
        // Drop hit sets from agent_state->hit_set_map, one at a time via
        // remove_oldest_hit_set(), until at most max_in_memory remain.
        // agent_state is dereferenced unconditionally, so the caller must make
        // sure it is set.
12989   while (agent_state->hit_set_map.size() > max_in_memory) {
12990     agent_state->remove_oldest_hit_set();
12991   }
12992 }
12993
12994
12995 // =======================================
12996 // cache agent
12997
12998 void PrimaryLogPG::agent_setup()
12999 {
        // Create (or keep) the tier agent state for this PG and choose its
        // flush/evict modes.  The caller must hold the PG lock (asserted).
        //
        // The agent only exists while this PG is active, is the primary, the
        // pool has a cache mode other than CACHEMODE_NONE, and the pool is a
        // tier of a base pool that is present in the current osdmap.  When any
        // of these does not hold, existing agent state is cleared instead.
13000   assert(is_locked());
13001   if (!is_active() ||
13002       !is_primary() ||
13003       pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13004       pool.info.tier_of < 0 ||
13005       !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13006     agent_clear();
13007     return;
13008   }
13009   if (!agent_state) {
13010     agent_state.reset(new TierAgentState);
13011
13012     // choose random starting position
13013     agent_state->position = hobject_t();
13014     agent_state->position.pool = info.pgid.pool();
13015     agent_state->position.set_hash(pool.info.get_random_pg_position(
13016       info.pgid.pgid,
13017       rand()));
          // Remember where the scan began; agent_work() compares against this
          // to notice a full pass over the hash space.
13018     agent_state->start = agent_state->position;
13019
13020     dout(10) << __func__ << " allocated new state, position "
13021              << agent_state->position << dendl;
13022   } else {
13023     dout(10) << __func__ << " keeping existing state" << dendl;
13024   }
13025
        // Only a warning is emitted here; agent_choose_mode() is still called
        // below and is what keeps the agent idle while stats are invalid.
13026   if (info.stats.stats_invalid) {
13027     osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13028   }
13029
13030   agent_choose_mode();
13031 }
13032
13033 void PrimaryLogPG::agent_clear()
13034 {
        // Stop the agent (if it is running) and then release its state.  The
        // order matters: agent_stop() reads agent_state before it is reset.
13035   agent_stop();
13036   agent_state.reset(NULL);
13037 }
13038
13039 // Return false if no objects operated on since start of object hash space
13040 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13041 {
        // One pass of the tiering agent over this PG: list a batch of objects
        // starting at agent_state->position, try to evict or flush each of
        // them, then advance the position for the next call.
        //
        // start_max:         scanning stops once this many operations have
        //                    been started during this call.
        // agent_flush_quota: number of flushes this call may start; the local
        //                    copy is decremented for every flush started.
        //
        // Takes and releases the PG lock itself.  Returns false only when a
        // full wrap of the hash space completed without any operation being
        // started (the agent is then put into the delayed state).  Returns
        // true otherwise, including when there is no agent state or the agent
        // is idle.
13042   lock();
13043   if (!agent_state) {
13044     dout(10) << __func__ << " no agent state, stopping" << dendl;
13045     unlock();
13046     return true;
13047   }
13048
13049   assert(!deleting);
13050
13051   if (agent_state->is_idle()) {
13052     dout(10) << __func__ << " idle, stopping" << dendl;
13053     unlock();
13054     return true;
13055   }
13056
13057   osd->logger->inc(l_osd_agent_wake);
13058
13059   dout(10) << __func__
13060            << " max " << start_max
13061            << ", flush " << agent_state->get_flush_mode_name()
13062            << ", evict " << agent_state->get_evict_mode_name()
13063            << ", pos " << agent_state->position
13064            << dendl;
13065   assert(is_primary());
13066   assert(is_active());
13067
13068   agent_load_hit_sets();
13069
13070   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13071   assert(base_pool);
13072
        // Bounds on how many objects a single listing call should return.
13073   int ls_min = 1;
13074   int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13075
13076   // list some objects.  this conveniently lists clones (oldest to
13077   // newest) before heads... the same order we want to flush in.
13078   //
13079   // NOTE: do not flush the Sequencer.  we will assume that the
13080   // listing we get back is imprecise.
13081   vector<hobject_t> ls;
13082   hobject_t next;
13083   int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13084                                           &ls, &next);
13085   assert(r >= 0);
13086   dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
        // Number of evict/flush operations started during this call.
13087   int started = 0;
13088   for (vector<hobject_t>::iterator p = ls.begin();
13089        p != ls.end();
13090        ++p) {
          // Each of the checks below filters out an object the agent must not
          // touch right now; every such skip is counted in l_osd_agent_skip.
13091     if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13092       dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13093       osd->logger->inc(l_osd_agent_skip);
13094       continue;
13095     }
13096     if (is_degraded_or_backfilling_object(*p)) {
13097       dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13098       osd->logger->inc(l_osd_agent_skip);
13099       continue;
13100     }
13101     if (is_missing_object(p->get_head())) {
13102       dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13103       osd->logger->inc(l_osd_agent_skip);
13104       continue;
13105     }
13106     ObjectContextRef obc = get_object_context(*p, false, NULL);
13107     if (!obc) {
13108       // we didn't flush; we may miss something here.
13109       dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13110       osd->logger->inc(l_osd_agent_skip);
13111       continue;
13112     }
13113     if (!obc->obs.exists) {
13114       dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13115       osd->logger->inc(l_osd_agent_skip);
13116       continue;
13117     }
13118     if (range_intersects_scrub(obc->obs.oi.soid,
13119                                obc->obs.oi.soid.get_head())) {
13120       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13121       osd->logger->inc(l_osd_agent_skip);
13122       continue;
13123     }
13124     if (obc->is_blocked()) {
13125       dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13126       osd->logger->inc(l_osd_agent_skip);
13127       continue;
13128     }
13129     if (obc->is_request_pending()) {
13130       dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13131       osd->logger->inc(l_osd_agent_skip);
13132       continue;
13133     }
13134
13135     // be careful flushing omap to an EC pool.
13136     if (!base_pool->supports_omap() &&
13137         obc->obs.oi.is_omap()) {
13138       dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
13139       osd->logger->inc(l_osd_agent_skip);
13140       continue;
13141     }
13142
          // Eviction is attempted first.  A flush is only considered when no
          // eviction was started for this object and flush quota remains.
13143     if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
13144         agent_maybe_evict(obc, false))
13145       ++started;
13146     else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
13147              agent_flush_quota > 0 && agent_maybe_flush(obc)) {
13148       ++started;
13149       --agent_flush_quota;
13150     }
13151     if (started >= start_max) {
13152       // If finishing early, set "next" to the next object
13153       if (++p != ls.end())
13154         next = *p;
13155       break;
13156     }
13157   }
13158
        // Once hist_age exceeds osd_agent_hist_halflife it is reset and the
        // temperature histogram is decayed.  The log line mentions atime as
        // well, but only temp_hist is decayed here.
13159   if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
13160     dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13161     agent_state->hist_age = 0;
13162     agent_state->temp_hist.decay();
13163   }
13164
13165   // Total objects operated on so far
13166   int total_started = agent_state->started + started;
13167   bool need_delay = false;
13168
13169   dout(20) << __func__ << " start pos " << agent_state->position
13170     << " next start pos " << next
13171     << " started " << total_started << dendl;
13172
13173   // See if we've made a full pass over the object hash space
13174   // This might check at most ls_max objects a second time to notice that
13175   // we've checked every object at least once.
13176   if (agent_state->position < agent_state->start &&
13177       next >= agent_state->start) {
13178     dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
          // Nothing was started during the whole pass: back off.  Otherwise
          // begin counting afresh for the next pass.
13179     if (total_started == 0)
13180       need_delay = true;
13181     else
13182       total_started = 0;
13183     agent_state->start = next;
13184   }
13185   agent_state->started = total_started;
13186
13187   // See if we are starting from beginning
13188   if (next.is_max())
13189     agent_state->position = hobject_t();
13190   else
13191     agent_state->position = next;
13192
13193   // Discard old in memory HitSets
13194   hit_set_in_memory_trim(pool.info.hit_set_count);
13195
13196   if (need_delay) {
13197     assert(agent_state->delaying == false);
13198     agent_delay();
13199     unlock();
13200     return false;
13201   }
13202   agent_choose_mode();
13203   unlock();
13204   return true;
13205 }
13206
13207 void PrimaryLogPG::agent_load_hit_sets()
13208 {
        // Make sure the archived hit sets listed in info.hit_set.history are
        // present in agent_state->hit_set_map, reading and decoding any that
        // are missing.  The map is keyed by the begin time (in seconds) of each
        // hit set.  Does nothing while evict mode is idle.
        //
        // Loading stops at the first entry that cannot be loaded (pool is not
        // replicated, object unreadable, or no object context); the remaining
        // entries are left for a later call.
13209   if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13210     return;
13211   }
13212
        // Cheap size comparison first: only walk the history when the map has
        // fewer entries than the history.
13213   if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13214     dout(10) << __func__ << dendl;
13215     for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13216          p != info.hit_set.history.end(); ++p) {
13217       if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13218         dout(10) << __func__ << " loading " << p->begin << "-"
13219                  << p->end << dendl;
13220         if (!pool.info.is_replicated()) {
13221           // FIXME: EC not supported here yet
13222           derr << __func__ << " on non-replicated pool" << dendl;
13223           break;
13224         }
13225
13226         hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13227         if (is_unreadable_object(oid)) {
13228           dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13229           break;
13230         }
13231
13232         ObjectContextRef obc = get_object_context(oid, false);
13233         if (!obc) {
13234           derr << __func__ << ": could not load hitset " << oid << dendl;
13235           break;
13236         }
13237
13238         bufferlist bl;
13239         {
                // Read the archive object straight from the store while holding
                // the object's on-disk read lock.  NOTE(review): offset 0 with
                // length 0 presumably means "read the whole object" -- confirm
                // against the ObjectStore::read() contract.
13240           obc->ondisk_read_lock();
13241           int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13242           assert(r >= 0);
13243           obc->ondisk_read_unlock();
13244         }
13245         HitSetRef hs(new HitSet);
13246         bufferlist::iterator pbl = bl.begin();
13247         ::decode(*hs, pbl);
13248         agent_state->add_hit_set(p->begin.sec(), hs);
13249       }
13250     }
13251   }
13252 }
13253
13254 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13255 {
        // Try to start an agent-initiated flush of the object behind obc.
        //
        // Returns true only when start_flush() reports -EINPROGRESS, i.e. a
        // flush is now in flight.  Returns false (and bumps l_osd_agent_skip)
        // when the object is clean, cache-pinned, too recently modified, is
        // already being flushed by the agent, or start_flush() returned
        // anything else.
13256   if (!obc->obs.oi.is_dirty()) {
13257     dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13258     osd->logger->inc(l_osd_agent_skip);
13259     return false;
13260   }
13261   if (obc->obs.oi.is_cache_pinned()) {
13262     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13263     osd->logger->inc(l_osd_agent_skip);
13264     return false;
13265   }
13266
        // Age check: use local_mtime when it is set, otherwise fall back to
        // mtime.
13267   utime_t now = ceph_clock_now();
13268   utime_t ob_local_mtime;
13269   if (obc->obs.oi.local_mtime != utime_t()) {
13270     ob_local_mtime = obc->obs.oi.local_mtime;
13271   } else {
13272     ob_local_mtime = obc->obs.oi.mtime;
13273   }
        // The minimum flush age is not enforced when evict mode is FULL, nor
        // for objects whose snap is not CEPH_NOSNAP.
13274   bool evict_mode_full =
13275     (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13276   if (!evict_mode_full &&
13277       obc->obs.oi.soid.snap == CEPH_NOSNAP &&  // snaps immutable; don't delay
13278       (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13279     dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13280     osd->logger->inc(l_osd_agent_skip);
13281     return false;
13282   }
13283
13284   if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13285     dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13286     osd->logger->inc(l_osd_agent_skip);
13287     return false;
13288   }
13289
13290   dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13291
13292   // FIXME: flush anything dirty, regardless of what distribution of
13293   // ages we expect.
13294
        // Register the op with the OSD before starting the flush; on_flush
        // undoes that registration when the flush completes.
13295   hobject_t oid = obc->obs.oi.soid;
13296   osd->agent_start_op(oid);
13297   // no need to capture a pg ref, can't outlive fop or ctx
13298   std::function<void()> on_flush = [this, oid]() {
13299     osd->agent_finish_op(oid);
13300   };
13301
13302   int result = start_flush(
13303     OpRequestRef(), obc, false, NULL,
13304     on_flush);
13305   if (result != -EINPROGRESS) {
          // The flush did not go asynchronous, so run the completion by hand to
          // balance the agent_start_op() above.
13306     on_flush();
13307     dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13308       << " with " << result << dendl;
13309     osd->logger->inc(l_osd_agent_skip);
13310     return false;
13311   }
13312
13313   osd->logger->inc(l_osd_agent_flush);
13314   return true;
13315 }
13316
13317 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13318 {
        // Try to evict the object behind obc from the cache tier.
        //
        // obc:         context of the candidate object.
        // after_flush: when true, a dirty flag on the object does not prevent
        //              eviction.
        //
        // Returns true when a delete for the object was submitted; false when
        // the object was skipped (dirty, watched, blocked, cache-pinned, head
        // with clones, too young, not selected by the temperature check, or
        // its write lock could not be taken).
13319   const hobject_t& soid = obc->obs.oi.soid;
13320   if (!after_flush && obc->obs.oi.is_dirty()) {
13321     dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13322     return false;
13323   }
13324   if (!obc->obs.oi.watchers.empty()) {
13325     dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13326     return false;
13327   }
13328   if (obc->is_blocked()) {
13329     dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13330     return false;
13331   }
13332   if (obc->obs.oi.is_cache_pinned()) {
13333     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13334     return false;
13335   }
13336
        // Head objects are only evicted when _verify_no_head_clones() accepts
        // their snapset.
13337   if (soid.snap == CEPH_NOSNAP) {
13338     int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13339     if (result < 0) {
13340       dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13341       return false;
13342     }
13343   }
13344
        // Age and temperature checks are bypassed entirely in EVICT_MODE_FULL.
13345   if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13346     // is this object older than cache_min_evict_age?
13347     utime_t now = ceph_clock_now();
13348     utime_t ob_local_mtime;
13349     if (obc->obs.oi.local_mtime != utime_t()) {
13350       ob_local_mtime = obc->obs.oi.local_mtime;
13351     } else {
13352       ob_local_mtime = obc->obs.oi.mtime;
13353     }
13354     if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13355       dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
            // NOTE(review): this is the only skip path in this function that
            // bumps l_osd_agent_skip -- confirm that is intended.
13356       osd->logger->inc(l_osd_agent_skip);
13357       return false;
13358     }
13359     // is this object old and/or cold enough?
          // temp stays 0 when there is no current hit_set; the sample is added
          // to the histogram either way.
13360     int temp = 0;
13361     uint64_t temp_upper = 0, temp_lower = 0;
13362     if (hit_set)
13363       agent_estimate_temp(soid, &temp);
13364     agent_state->temp_hist.add(temp);
13365     agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13366
13367     dout(20) << __func__
13368              << " temp " << temp
13369              << " pos " << temp_lower << "-" << temp_upper
13370              << ", evict_effort " << agent_state->evict_effort
13371              << dendl;
          // The statements between dout(30) and dendl write into the same log
          // entry through _dout.  NOTE(review): presumably the dout macro opens
          // a scope that dendl closes, so the Formatter is only built when
          // level-30 logging is enabled -- confirm against the macro
          // definitions.
13372     dout(30) << "agent_state:\n";
13373     Formatter *f = Formatter::create("");
13374     f->open_object_section("agent_state");
13375     agent_state->dump(f);
13376     f->close_section();
13377     f->flush(*_dout);
13378     delete f;
13379     *_dout << dendl;
13380
          // Keep the object unless its upper histogram position (in millionths)
          // is within evict_effort of 1000000.
13381     if (1000000 - temp_upper >= agent_state->evict_effort)
13382       return false;
13383   }
13384
13385   dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13386   OpContextUPtr ctx = simple_opc_create(obc);
13387
        // Eviction needs the object's write lock; if it cannot be taken, the
        // freshly created ctx is handed to close_op_ctx() and the object is
        // skipped.
13388   if (!ctx->lock_manager.get_lock_type(
13389         ObjectContext::RWState::RWWRITE,
13390         obc->obs.oi.soid,
13391         obc,
13392         OpRequestRef())) {
13393     close_op_ctx(ctx.release());
13394     dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13395     return false;
13396   }
13397
        // Pair the evict-op start with a finish callback registered on the ctx.
13398   osd->agent_start_evict_op();
13399   ctx->register_on_finish(
13400     [this]() {
13401       osd->agent_finish_evict_op();
13402     });
13403
13404   ctx->at_version = get_next_version();
13405   assert(ctx->new_obs.exists);
13406   int r = _delete_oid(ctx.get(), true, false);
        // Stat adjustments made here on top of whatever _delete_oid() did.
13407   if (obc->obs.oi.is_omap())
13408     ctx->delta_stats.num_objects_omap--;
13409   ctx->delta_stats.num_evict++;
13410   ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13411   if (obc->obs.oi.is_dirty())
13412     --ctx->delta_stats.num_objects_dirty;
13413   assert(r == 0);
13414   finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13415   simple_opc_submit(std::move(ctx));
13416   osd->logger->inc(l_osd_tier_evict);
13417   osd->logger->inc(l_osd_agent_evict);
13418   return true;
13419 }
13420
13421 void PrimaryLogPG::agent_stop()
13422 {
        // If the agent has state and is not already idle, force both flush and
        // evict modes to idle and call osd->agent_disable_pg() for this PG with
        // its current evict_effort.  Otherwise this is a no-op apart from the
        // log line.
13423   dout(20) << __func__ << dendl;
13424   if (agent_state && !agent_state->is_idle()) {
13425     agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13426     agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13427     osd->agent_disable_pg(this, agent_state->evict_effort);
13428   }
13429 }
13430
13431 void PrimaryLogPG::agent_delay()
13432 {
        // Put a non-idle agent into the delayed state: mark it as delaying and
        // call osd->agent_disable_pg() for this PG.  Unlike agent_stop(), the
        // flush/evict modes are left untouched.  Must not be called while a
        // delay is already in progress (asserted).
13433   dout(20) << __func__ << dendl;
13434   if (agent_state && !agent_state->is_idle()) {
13435     assert(agent_state->delaying == false);
13436     agent_state->delaying = true;
13437     osd->agent_disable_pg(this, agent_state->evict_effort);
13438   }
13439 }
13440
13441 void PrimaryLogPG::agent_choose_mode_restart()
13442 {
        // End a delay: under the PG lock, clear the delaying flag and recompute
        // the agent modes with restart=true.  Does nothing if there is no agent
        // state or the agent is not currently delaying.
13443   dout(20) << __func__ << dendl;
13444   lock();
13445   if (agent_state && agent_state->delaying) {
13446     agent_state->delaying = false;
13447     agent_choose_mode(true);
13448   }
13449   unlock();
13450 }
13451
13452 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13453 {
        // Recompute the agent's flush mode, evict mode and evict effort from
        // the PG's current stats and the pool's targets.  Any change is
        // recorded in agent_state and in the per-PG mode counters, and the
        // OSD's agent queue is told to enable, disable or adjust this PG.
        //
        // restart: true when resuming after a delay.  The raised ("idle")
        //          thresholds are then used, and a non-idle PG is re-enabled
        //          in the agent queue even if it was not idle before.
        // op:      optional request that is requeued together with the waiting
        //          lists when evict mode leaves EVICT_MODE_FULL.
        //
        // Returns true iff ops were requeued because evict mode left
        // EVICT_MODE_FULL while the PG is active.  agent_state must be set.
13454   bool requeued = false;
13455   // Let delay play out
13456   if (agent_state->delaying) {
13457     dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13458     return requeued;
13459   }
13460
        // Defaults; these stay in effect when the calculation below is skipped.
13461   TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13462   TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13463   unsigned evict_effort = 0;
13464
13465   if (info.stats.stats_invalid) {
13466     // idle; stats can't be trusted until we scrub.
13467     dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13468     goto skip_calc;
13469   }
13470
        // The braces give the calculation its own scope so that the goto above
        // does not jump past initialized declarations.
13471   {
13472   uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13473   assert(divisor > 0);
13474
13475   // adjust (effective) user objects down based on the number
13476   // of HitSet objects, which should not count toward our total since
13477   // they cannot be flushed.
13478   uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13479
13480   // also exclude omap objects if ec backing pool
13481   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13482   assert(base_pool);
13483   if (!base_pool->supports_omap())
13484     unflushable += info.stats.stats.sum.num_objects_omap;
13485
13486   uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13487   if (num_user_objects > unflushable)
13488     num_user_objects -= unflushable;
13489   else
13490     num_user_objects = 0;
13491
13492   uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13493   uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
        // Clamp at zero, as is done for the object count above: a plain
        // unsigned subtraction would wrap to a huge value if the archive bytes
        // ever exceeded num_bytes, and the PG would then look completely full.
13494   num_user_bytes -= MIN(num_user_bytes, unflushable_bytes);
13495   uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13496   num_user_bytes += num_overhead_bytes;
13497
13498   // also reduce the num_dirty by num_objects_omap
13499   int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13500   if (!base_pool->supports_omap()) {
13501     if (num_dirty > info.stats.stats.sum.num_objects_omap)
13502       num_dirty -= info.stats.stats.sum.num_objects_omap;
13503     else
13504       num_dirty = 0;
13505   }
13506
13507   dout(10) << __func__
13508            << " flush_mode: "
13509            << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13510            << " evict_mode: "
13511            << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13512            << " num_objects: " << info.stats.stats.sum.num_objects
13513            << " num_bytes: " << info.stats.stats.sum.num_bytes
13514            << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13515            << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13516            << " num_dirty: " << num_dirty
13517            << " num_user_objects: " << num_user_objects
13518            << " num_user_bytes: " << num_user_bytes
13519            << " num_overhead_bytes: " << num_overhead_bytes
13520            << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13521            << " pool.info.target_max_objects: " << pool.info.target_max_objects
13522            << dendl;
13523
13524   // get dirty, full ratios
        // Both ratios are in millionths of this PG's share of the pool target
        // (target / divisor, at least 1).  The byte-based estimate multiplies
        // object counts by the average object size; when an object target is
        // also set, the larger of the two ratios is kept.
13525   uint64_t dirty_micro = 0;
13526   uint64_t full_micro = 0;
13527   if (pool.info.target_max_bytes && num_user_objects > 0) {
13528     uint64_t avg_size = num_user_bytes / num_user_objects;
13529     dirty_micro =
13530       num_dirty * avg_size * 1000000 /
13531       MAX(pool.info.target_max_bytes / divisor, 1);
13532     full_micro =
13533       num_user_objects * avg_size * 1000000 /
13534       MAX(pool.info.target_max_bytes / divisor, 1);
13535   }
13536   if (pool.info.target_max_objects > 0) {
13537     uint64_t dirty_objects_micro =
13538       num_dirty * 1000000 /
13539       MAX(pool.info.target_max_objects / divisor, 1);
13540     if (dirty_objects_micro > dirty_micro)
13541       dirty_micro = dirty_objects_micro;
13542     uint64_t full_objects_micro =
13543       num_user_objects * 1000000 /
13544       MAX(pool.info.target_max_objects / divisor, 1);
13545     if (full_objects_micro > full_micro)
13546       full_micro = full_objects_micro;
13547   }
13548   dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13549            << " full " << ((float)full_micro / 1000000.0)
13550            << dendl;
13551
13552   // flush mode
        // Hysteresis: while idle (or restarting) the thresholds are raised by
        // the slop, and while already flushing they are lowered by it, so the
        // mode does not flap around the target.  The same slop, derived from
        // the low target, is applied to both the low and the high target.
13553   uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13554   uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13555   uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13556   if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13557     flush_target += flush_slop;
13558     flush_high_target += flush_slop;
13559   } else {
13560     flush_target -= MIN(flush_target, flush_slop);
13561     flush_high_target -= MIN(flush_high_target, flush_slop);
13562   }
13563
13564   if (dirty_micro > flush_high_target) {
13565     flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13566   } else if (dirty_micro > flush_target) {
13567     flush_mode = TierAgentState::FLUSH_MODE_LOW;
13568   }
13569
13570   // evict mode
        // Same hysteresis scheme as for the flush targets above.
13571   uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13572   uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13573   if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13574     evict_target += evict_slop;
13575   else
13576     evict_target -= MIN(evict_target, evict_slop);
13577
13578   if (full_micro > 1000000) {
13579     // evict anything clean
13580     evict_mode = TierAgentState::EVICT_MODE_FULL;
13581     evict_effort = 1000000;
13582   } else if (full_micro > evict_target) {
13583     // set effort in [0..1] range (in millionths) based on where we are
13584     // between evict_target and completely full
          // In this branch evict_target < full_micro <= 1000000, so span is
          // greater than zero.
13584     evict_mode = TierAgentState::EVICT_MODE_SOME;
13585     uint64_t over = full_micro - evict_target;
13586     uint64_t span  = 1000000 - evict_target;
13587     evict_effort = MAX(over * 1000000 / span,
13588                        (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13589
13590     // quantize effort to avoid too much reordering in the agent_queue.
13591     uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13592     assert(inc > 0);
13593     uint64_t was = evict_effort;
13594     evict_effort -= evict_effort % inc;
13595     if (evict_effort < inc)
13596       evict_effort = inc;
13597     assert(evict_effort >= inc && evict_effort <= 1000000);
13598     dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13599   }
13600   }
13601
13602   skip_calc:
        // Remember whether the agent was idle before the new modes are
        // applied; this decides between enable/disable/adjust at the end.
13603   bool old_idle = agent_state->is_idle();
13604   if (flush_mode != agent_state->flush_mode) {
13605     dout(5) << __func__ << " flush_mode "
13606             << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13607             << " -> "
13608             << TierAgentState::get_flush_mode_name(flush_mode)
13609             << dendl;
          // Set the indicator for the new mode, then clear the one for the old
          // mode.  The two modes differ here, so the clear cannot undo the set.
13610     if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13611       osd->agent_inc_high_count();
13612       info.stats.stats.sum.num_flush_mode_high = 1;
13613     } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13614       info.stats.stats.sum.num_flush_mode_low = 1;
13615     }
13616     if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13617       osd->agent_dec_high_count();
13618       info.stats.stats.sum.num_flush_mode_high = 0;
13619     } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13620       info.stats.stats.sum.num_flush_mode_low = 0;
13621     }
13622     agent_state->flush_mode = flush_mode;
13623   }
13624   if (evict_mode != agent_state->evict_mode) {
13625     dout(5) << __func__ << " evict_mode "
13626             << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13627             << " -> "
13628             << TierAgentState::get_evict_mode_name(evict_mode)
13629             << dendl;
          // Leaving FULL: the cache has room again, so requeue the ops that
          // were waiting and forget which objects were blocked on a full cache.
13630     if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13631         is_active()) {
13632       if (op)
13633         requeue_op(op);
13634       requeue_ops(waiting_for_flush);
13635       requeue_ops(waiting_for_active);
13636       requeue_ops(waiting_for_scrub);
13637       requeue_ops(waiting_for_cache_not_full);
13638       objects_blocked_on_cache_full.clear();
13639       requeued = true;
13640     }
13641     if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13642       info.stats.stats.sum.num_evict_mode_some = 1;
13643     } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13644       info.stats.stats.sum.num_evict_mode_full = 1;
13645     }
13646     if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13647       info.stats.stats.sum.num_evict_mode_some = 0;
13648     } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13649       info.stats.stats.sum.num_evict_mode_full = 0;
13650     }
13651     agent_state->evict_mode = evict_mode;
13652   }
13653   uint64_t old_effort = agent_state->evict_effort;
13654   if (evict_effort != agent_state->evict_effort) {
13655     dout(5) << __func__ << " evict_effort "
13656             << ((float)agent_state->evict_effort / 1000000.0)
13657             << " -> "
13658             << ((float)evict_effort / 1000000.0)
13659             << dendl;
13660     agent_state->evict_effort = evict_effort;
13661   }
13662
13663   // NOTE: we are using evict_effort as a proxy for *all* agent effort
13664   // (including flush).  This is probably fine (they should be
13665   // correlated) but it is not precisely correct.
        // Became idle: disable the PG, unless this is a restart or it was
        // already idle.  Otherwise: enable it when restarting or coming out of
        // idle, or adjust it when only the effort changed.
13666   if (agent_state->is_idle()) {
13667     if (!restart && !old_idle) {
13668       osd->agent_disable_pg(this, old_effort);
13669     }
13670   } else {
13671     if (restart || old_idle) {
13672       osd->agent_enable_pg(this, agent_state->evict_effort);
13673     } else if (old_effort != agent_state->evict_effort) {
13674       osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13675     }
13676   }
13677   return requeued;
13678 }
13679
13680 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13681 {
        // Estimate how "hot" oid is and store the result in *temp.
        //
        // The estimate starts at 1000000 if the current hit_set contains the
        // object (0 otherwise).  The in-memory hit sets are then walked in
        // reverse key order, and for every set that contains the object
        // pool.info.get_grade(i) is added, where i is the position of that set
        // in the walk.  The walk ends after hit_set_search_last_n matching
        // sets, or when the map is exhausted.
        //
        // Both hit_set and temp must be non-null (asserted).
13682   assert(hit_set);
13683   assert(temp);
13684   *temp = 0;
13685   if (hit_set->contains(oid))
13686     *temp = 1000000;
13687   unsigned i = 0;
        // last_n is a budget of matches, not of sets examined: it is only
        // decremented when a set contains the object.
13688   int last_n = pool.info.hit_set_search_last_n;
13689   for (map<time_t,HitSetRef>::reverse_iterator p =
13690        agent_state->hit_set_map.rbegin(); last_n > 0 &&
13691        p != agent_state->hit_set_map.rend(); ++p, ++i) {
13692     if (p->second->contains(oid)) {
13693       *temp += pool.info.get_grade(i);
13694       --last_n;
13695     }
13696   }
13697 }
13698
13699 // Dup op detection
13700
13701 bool PrimaryLogPG::already_complete(eversion_t v)
13702 {
        // Returns true when no repop in repop_queue with a version at or below
        // v is still waiting to be fully committed.  The queue is walked from
        // the front; the walk stops at the first entry whose version is past
        // v.  Entries with an empty version are ignored.
13703   dout(20) << __func__ << ": " << v << dendl;
13704   for (xlist<RepGather*>::iterator i = repop_queue.begin();
13705        !i.end();
13706        ++i) {
13707     dout(20) << __func__ << ": " << **i << dendl;
13708     // skip copy from temp object ops
13709     if ((*i)->v == eversion_t()) {
13710       dout(20) << __func__ << ": " << **i
13711                << " version is empty" << dendl;
13712       continue;
13713     }
13714     if ((*i)->v > v) {
13715       dout(20) << __func__ << ": " << **i
13716                << " (*i)->v past v" << dendl;
13717       break;
13718     }
13719     if (!(*i)->all_committed) {
13720       dout(20) << __func__ << ": " << **i
13721                << " not committed, returning false"
13722                << dendl;
13723       return false;
13724     }
13725   }
13726   dout(20) << __func__ << ": returning true" << dendl;
13727   return true;
13728 }
13729
13730 bool PrimaryLogPG::already_ack(eversion_t v)
13731 {
        // Returns true when no repop in repop_queue with a version at or below
        // v is still waiting to be fully applied.  Same walk as
        // already_complete(), but testing all_applied instead of
        // all_committed.
13732   dout(20) << __func__ << ": " << v << dendl;
13733   for (xlist<RepGather*>::iterator i = repop_queue.begin();
13734        !i.end();
13735        ++i) {
13736     // skip copy from temp object ops
13737     if ((*i)->v == eversion_t()) {
13738       dout(20) << __func__ << ": " << **i
13739                << " version is empty" << dendl;
13740       continue;
13741     }
13742     if ((*i)->v > v) {
13743       dout(20) << __func__ << ": " << **i
13744                << " (*i)->v past v" << dendl;
13745       break;
13746     }
13747     if (!(*i)->all_applied) {
13748       dout(20) << __func__ << ": " << **i
13749                << " not applied, returning false"
13750                << dendl;
13751       return false;
13752     }
13753   }
13754   dout(20) << __func__ << ": returning true" << dendl;
13755   return true;
13756 }
13757
13758
13759 // ==========================================================================================
13760 // SCRUB
13761
13762
13763 bool PrimaryLogPG::_range_available_for_scrub(
13764   const hobject_t &begin, const hobject_t &end)
13765 {
        // Check whether the range starting at `begin` and ending before `end`
        // can be scrubbed now.  The object contexts held in object_contexts
        // are walked from `begin`; if one inside the range is blocked, it is
        // flagged with requeue_scrub_on_unblock and false is returned.
        // Returns true when no blocked context was found.
13766   pair<hobject_t, ObjectContextRef> next;
        // The lookup for `begin` itself may yield a null context; the loop
        // tolerates that and simply moves on.
13767   next.second = object_contexts.lookup(begin);
13768   next.first = begin;
13769   bool more = true;
13770   while (more && next.first < end) {
13771     if (next.second && next.second->is_blocked()) {
13772       next.second->requeue_scrub_on_unblock = true;
13773       dout(10) << __func__ << ": scrub delayed, "
13774                << next.first << " is blocked"
13775                << dendl;
13776       return false;
13777     }
          // NOTE(review): get_next() presumably advances to the next cached
          // entry after next.first and returns false when there is none --
          // confirm against the object_contexts container.
13778     more = object_contexts.get_next(next.first, &next);
13779   }
13780   return true;
13781 }
13782
13783 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13784                          const vector<snapid_t>::reverse_iterator &curclone) {
13785     return snapset && curclone != snapset.get().clones.rend();
13786 }
13787
13788 void PrimaryLogPG::log_missing(unsigned missing,
13789                         const boost::optional<hobject_t> &head,
13790                         LogChannelRef clog,
13791                         const spg_t &pgid,
13792                         const char *func,
13793                         const char *mode,
13794                         bool allow_incomplete_clones)
13795 {
13796   assert(head);
13797   if (allow_incomplete_clones) {
13798     dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13799                << " skipped " << missing << " clone(s) in cache tier" << dendl;
13800   } else {
13801     clog->info() << mode << " " << pgid << " " << head.get()
13802                        << " " << missing << " missing clone(s)";
13803   }
13804 }
13805
13806 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13807   const boost::optional<SnapSet> &snapset,
13808   LogChannelRef clog,
13809   const spg_t &pgid,
13810   const char *mode,
13811   bool allow_incomplete_clones,
13812   boost::optional<snapid_t> target,
13813   vector<snapid_t>::reverse_iterator *curclone,
13814   inconsistent_snapset_wrapper &e)
13815 {
13816   assert(head);
13817   assert(snapset);
13818   unsigned missing = 0;
13819
13820   // NOTE: clones are in descending order, thus **curclone > target test here
13821   hobject_t next_clone(head.get());
13822   while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13823     ++missing;
13824     // it is okay to be missing one or more clones in a cache tier.
13825     // skip higher-numbered clones in the list.
13826     if (!allow_incomplete_clones) {
13827       next_clone.snap = **curclone;
13828       clog->error() << mode << " " << pgid << " " << head.get()
13829                          << " expected clone " << next_clone << " " << missing
13830                          << " missing";
13831       ++scrubber.shallow_errors;
13832       e.set_clone_missing(next_clone.snap);
13833     }
13834     // Clones are descending
13835     ++(*curclone);
13836   }
13837   return missing;
13838 }
13839
13840 /*
13841  * Validate consistency of the object info and snap sets.
13842  *
13843  * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13844  * the comparison of the objects is against multiple snapset.clones. There are
13845  * multiple clone lists and in between lists we expect head or snapdir.
13846  *
13847  * Example
13848  *
13849  * objects              expected
13850  * =======              =======
13851  * obj1 snap 1          head/snapdir, unexpected obj1 snap 1
13852  * obj2 head            head/snapdir, head ok
13853  *              [SnapSet clones 6 4 2 1]
13854  * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
13855  * obj2 snap 6          obj2 snap 6, match
13856  * obj2 snap 4          obj2 snap 4, match
13857  * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13858  *              [Snapset clones 3 1]
13859  * obj3 snap 3          obj3 snap 3 match
13860  * obj3 snap 1          obj3 snap 1 match
13861  * obj4 snapdir         head/snapdir, snapdir ok
13862  *              [Snapset clones 4]
13863  * EOL                  obj4 snap 4, (expected)
13864  */
13865 void PrimaryLogPG::scrub_snapshot_metadata(
13866   ScrubMap &scrubmap,
13867   const map<hobject_t,
13868             pair<boost::optional<uint32_t>,
13869                  boost::optional<uint32_t>>> &missing_digest)
13870 {
13871   dout(10) << __func__ << dendl;
13872
13873   coll_t c(info.pgid);
13874   bool repair = state_test(PG_STATE_REPAIR);
13875   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13876   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13877   boost::optional<snapid_t> all_clones;   // Unspecified snapid_t or boost::none
13878
13879   /// snapsets to repair
13880   map<hobject_t,SnapSet> snapset_to_repair;
13881
13882   // traverse in reverse order.
13883   boost::optional<hobject_t> head;
13884   boost::optional<SnapSet> snapset; // If initialized so will head (above)
13885   vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13886   unsigned missing = 0;
13887   inconsistent_snapset_wrapper soid_error, head_error;
13888   unsigned soid_error_count = 0;
13889
13890   bufferlist last_data;
13891
13892   for (map<hobject_t,ScrubMap::object>::reverse_iterator
13893        p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13894     const hobject_t& soid = p->first;
13895     soid_error = inconsistent_snapset_wrapper{soid};
13896     object_stat_sum_t stat;
13897     boost::optional<object_info_t> oi;
13898
13899     if (!soid.is_snapdir())
13900       stat.num_objects++;
13901
13902     if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13903       stat.num_objects_hit_set_archive++;
13904
13905     if (soid.is_snap()) {
13906       // it's a clone
13907       stat.num_object_clones++;
13908     }
13909
13910     // basic checks.
13911     if (p->second.attrs.count(OI_ATTR) == 0) {
13912       oi = boost::none;
13913       osd->clog->error() << mode << " " << info.pgid << " " << soid
13914                         << " no '" << OI_ATTR << "' attr";
13915       ++scrubber.shallow_errors;
13916       soid_error.set_info_missing();
13917     } else {
13918       bufferlist bv;
13919       bv.push_back(p->second.attrs[OI_ATTR]);
13920       try {
13921         oi = object_info_t(); // Initialize optional<> before decode into it
13922         oi.get().decode(bv);
13923       } catch (buffer::error& e) {
13924         oi = boost::none;
13925         osd->clog->error() << mode << " " << info.pgid << " " << soid
13926                 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13927         ++scrubber.shallow_errors;
13928         soid_error.set_info_corrupted();
13929         soid_error.set_info_missing(); // Not available too
13930       }
13931     }
13932
13933     if (oi) {
13934       if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13935         osd->clog->error() << mode << " " << info.pgid << " " << soid
13936                            << " on disk size (" << p->second.size
13937                            << ") does not match object info size ("
13938                            << oi->size << ") adjusted for ondisk to ("
13939                            << pgbackend->be_get_ondisk_size(oi->size)
13940                            << ")";
13941         soid_error.set_size_mismatch();
13942         ++scrubber.shallow_errors;
13943       }
13944
13945       dout(20) << mode << "  " << soid << " " << oi.get() << dendl;
13946
13947       // A clone num_bytes will be added later when we have snapset
13948       if (!soid.is_snap()) {
13949         stat.num_bytes += oi->size;
13950       }
13951       if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13952         stat.num_bytes_hit_set_archive += oi->size;
13953
13954       if (!soid.is_snapdir()) {
13955         if (oi->is_dirty())
13956           ++stat.num_objects_dirty;
13957         if (oi->is_whiteout())
13958           ++stat.num_whiteouts;
13959         if (oi->is_omap())
13960           ++stat.num_objects_omap;
13961         if (oi->is_cache_pinned())
13962           ++stat.num_objects_pinned;
13963       }
13964     } else {
13965       // pessimistic assumption that this object might contain a
13966       // legacy SnapSet
13967       stat.num_legacy_snapsets++;
13968     }
13969
13970     // Check for any problems while processing clones
13971     if (doing_clones(snapset, curclone)) {
13972       boost::optional<snapid_t> target;
13973       // Expecting an object with snap for current head
13974       if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13975
13976         dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13977                  << soid << " while processing " << head.get() << dendl;
13978
13979         target = all_clones;
13980       } else {
13981         assert(soid.is_snap());
13982         target = soid.snap;
13983       }
13984
13985       // Log any clones we were expecting to be there up to target
13986       // This will set missing, but will be a no-op if snap.soid == *curclone.
13987       missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13988                         pool.info.allow_incomplete_clones(), target, &curclone,
13989                         head_error);
13990     }
13991     bool expected;
13992     // Check doing_clones() again in case we ran process_clones_to()
13993     if (doing_clones(snapset, curclone)) {
13994       // A head/snapdir would have processed all clones above
13995       // or all greater than *curclone.
13996       assert(soid.is_snap() && *curclone <= soid.snap);
13997
13998       // After processing above clone snap should match the expected curclone
13999       expected = (*curclone == soid.snap);
14000     } else {
14001       // If we aren't doing clones any longer, then expecting head/snapdir
14002       expected = soid.has_snapset();
14003     }
14004     if (!expected) {
14005       // If we couldn't read the head's snapset, just ignore clones
14006       if (head && !snapset) {
14007         osd->clog->error() << mode << " " << info.pgid << " " << soid
14008                           << " clone ignored due to missing snapset";
14009       } else {
14010         osd->clog->error() << mode << " " << info.pgid << " " << soid
14011                            << " is an unexpected clone";
14012       }
14013       ++scrubber.shallow_errors;
14014       soid_error.set_headless();
14015       scrubber.store->add_snap_error(pool.id, soid_error);
14016       ++soid_error_count;
14017       if (head && soid.get_head() == head->get_head())
14018         head_error.set_clone(soid.snap);
14019       continue;
14020     }
14021
14022     // new snapset?
14023     if (soid.has_snapset()) {
14024
14025       if (missing) {
14026         log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14027                     pool.info.allow_incomplete_clones());
14028       }
14029
14030       // Save previous head error information
14031       if (head && (head_error.errors || soid_error_count))
14032         scrubber.store->add_snap_error(pool.id, head_error);
14033       // Set this as a new head object
14034       head = soid;
14035       missing = 0;
14036       head_error = soid_error;
14037       soid_error_count = 0;
14038
14039       dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14040
14041       if (p->second.attrs.count(SS_ATTR) == 0) {
14042         osd->clog->error() << mode << " " << info.pgid << " " << soid
14043                           << " no '" << SS_ATTR << "' attr";
14044         ++scrubber.shallow_errors;
14045         snapset = boost::none;
14046         head_error.set_snapset_missing();
14047       } else {
14048         bufferlist bl;
14049         bl.push_back(p->second.attrs[SS_ATTR]);
14050         bufferlist::iterator blp = bl.begin();
14051         try {
14052           snapset = SnapSet(); // Initialize optional<> before decoding into it
14053           ::decode(snapset.get(), blp);
14054           head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14055         } catch (buffer::error& e) {
14056           snapset = boost::none;
14057           osd->clog->error() << mode << " " << info.pgid << " " << soid
14058                 << " can't decode '" << SS_ATTR << "' attr " << e.what();
14059           ++scrubber.shallow_errors;
14060           head_error.set_snapset_corrupted();
14061         }
14062       }
14063
14064       if (snapset) {
14065         // what will be next?
14066         curclone = snapset->clones.rbegin();
14067
14068         if (!snapset->clones.empty()) {
14069           dout(20) << "  snapset " << snapset.get() << dendl;
14070           if (snapset->seq == 0) {
14071             osd->clog->error() << mode << " " << info.pgid << " " << soid
14072                                << " snaps.seq not set";
14073             ++scrubber.shallow_errors;
14074             head_error.set_snapset_error();
14075           }
14076         }
14077
14078         if (soid.is_head() && !snapset->head_exists) {
14079           osd->clog->error() << mode << " " << info.pgid << " " << soid
14080                           << " snapset.head_exists=false, but head exists";
14081           ++scrubber.shallow_errors;
14082           head_error.set_head_mismatch();
14083           // Fix head_exists locally so is_legacy() returns correctly
14084           snapset->head_exists = true;
14085         }
14086         if (soid.is_snapdir() && snapset->head_exists) {
14087           osd->clog->error() << mode << " " << info.pgid << " " << soid
14088                           << " snapset.head_exists=true, but snapdir exists";
14089           ++scrubber.shallow_errors;
14090           head_error.set_head_mismatch();
14091           // For symmetry fix this too, but probably doesn't matter
14092           snapset->head_exists = false;
14093         }
14094
14095         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
14096           if (soid.is_snapdir()) {
14097             dout(10) << " will move snapset to head from " << soid << dendl;
14098             snapset_to_repair[soid.get_head()] = *snapset;
14099           } else if (snapset->is_legacy()) {
14100             dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
14101                      << dendl;
14102             snapset_to_repair[soid.get_head()] = *snapset;
14103           }
14104         } else {
14105           stat.num_legacy_snapsets++;
14106         }
14107       } else {
14108         // pessimistic assumption that this object might contain a
14109         // legacy SnapSet
14110         stat.num_legacy_snapsets++;
14111       }
14112     } else {
14113       assert(soid.is_snap());
14114       assert(head);
14115       assert(snapset);
14116       assert(soid.snap == *curclone);
14117
14118       dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14119
14120       if (snapset->clone_size.count(soid.snap) == 0) {
14121         osd->clog->error() << mode << " " << info.pgid << " " << soid
14122                            << " is missing in clone_size";
14123         ++scrubber.shallow_errors;
14124         soid_error.set_size_mismatch();
14125       } else {
14126         if (oi && oi->size != snapset->clone_size[soid.snap]) {
14127           osd->clog->error() << mode << " " << info.pgid << " " << soid
14128                              << " size " << oi->size << " != clone_size "
14129                              << snapset->clone_size[*curclone];
14130           ++scrubber.shallow_errors;
14131           soid_error.set_size_mismatch();
14132         }
14133
14134         if (snapset->clone_overlap.count(soid.snap) == 0) {
14135           osd->clog->error() << mode << " " << info.pgid << " " << soid
14136                              << " is missing in clone_overlap";
14137           ++scrubber.shallow_errors;
14138           soid_error.set_size_mismatch();
14139         } else {
14140           // This checking is based on get_clone_bytes().  The first 2 asserts
14141           // can't happen because we know we have a clone_size and
14142           // a clone_overlap.  Now we check that the interval_set won't
14143           // cause the last assert.
14144           uint64_t size = snapset->clone_size.find(soid.snap)->second;
14145           const interval_set<uint64_t> &overlap =
14146                 snapset->clone_overlap.find(soid.snap)->second;
14147           bool bad_interval_set = false;
14148           for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14149                i != overlap.end(); ++i) {
14150             if (size < i.get_len()) {
14151               bad_interval_set = true;
14152               break;
14153             }
14154             size -= i.get_len();
14155           }
14156
14157           if (bad_interval_set) {
14158             osd->clog->error() << mode << " " << info.pgid << " " << soid
14159                                << " bad interval_set in clone_overlap";
14160             ++scrubber.shallow_errors;
14161             soid_error.set_size_mismatch();
14162           } else {
14163             stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14164           }
14165         }
14166       }
14167
14168       // migrate legacy_snaps to snapset?
14169       auto p = snapset_to_repair.find(soid.get_head());
14170       if (p != snapset_to_repair.end()) {
14171         if (!oi || oi->legacy_snaps.empty()) {
14172           osd->clog->error() << mode << " " << info.pgid << " " << soid
14173                              << " has no oi or legacy_snaps; cannot convert "
14174                              << *snapset;
14175           ++scrubber.shallow_errors;
14176         } else {
14177           dout(20) << __func__ << "   copying legacy_snaps " << oi->legacy_snaps
14178                    << " to snapset " << p->second << dendl;
14179           p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
14180         }
14181       }
14182
14183       // what's next?
14184       ++curclone;
14185       if (soid_error.errors) {
14186         scrubber.store->add_snap_error(pool.id, soid_error);
14187         ++soid_error_count;
14188       }
14189     }
14190
14191     scrub_cstat.add(stat);
14192   }
14193
14194   if (doing_clones(snapset, curclone)) {
14195     dout(10) << __func__ << " " << mode << " " << info.pgid
14196              << " No more objects while processing " << head.get() << dendl;
14197
14198     missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14199                       pool.info.allow_incomplete_clones(), all_clones, &curclone,
14200                       head_error);
14201   }
14202   // There could be missing found by the test above or even
14203   // before dropping out of the loop for the last head.
14204   if (missing) {
14205     log_missing(missing, head, osd->clog, info.pgid, __func__,
14206                 mode, pool.info.allow_incomplete_clones());
14207   }
14208   if (head && (head_error.errors || soid_error_count))
14209     scrubber.store->add_snap_error(pool.id, head_error);
14210
14211   for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
14212     if (p->first.is_snapdir())
14213       continue;
14214     dout(10) << __func__ << " recording digests for " << p->first << dendl;
14215     ObjectContextRef obc = get_object_context(p->first, false);
14216     if (!obc) {
14217       osd->clog->error() << info.pgid << " " << mode
14218                          << " cannot get object context for object "
14219                          << p->first;
14220       continue;
14221     } else if (obc->obs.oi.soid != p->first) {
14222       osd->clog->error() << info.pgid << " " << mode
14223                          << " object " << p->first
14224                          << " has a valid oi attr with a mismatched name, "
14225                          << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14226       continue;
14227     }
14228     OpContextUPtr ctx = simple_opc_create(obc);
14229     ctx->at_version = get_next_version();
14230     ctx->mtime = utime_t();      // do not update mtime
14231     if (p->second.first) {
14232       ctx->new_obs.oi.set_data_digest(*p->second.first);
14233     } else {
14234       ctx->new_obs.oi.clear_data_digest();
14235     }
14236     if (p->second.second) {
14237       ctx->new_obs.oi.set_omap_digest(*p->second.second);
14238     } else {
14239       ctx->new_obs.oi.clear_omap_digest();
14240     }
14241     finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14242
14243     ctx->register_on_success(
14244       [this]() {
14245         dout(20) << "updating scrub digest" << dendl;
14246         if (--scrubber.num_digest_updates_pending == 0) {
14247           requeue_scrub();
14248         }
14249       });
14250
14251     simple_opc_submit(std::move(ctx));
14252     ++scrubber.num_digest_updates_pending;
14253   }
14254   for (auto& p : snapset_to_repair) {
14255     // cache pools may not have the clones, which means we won't know
14256     // what snaps they have.  fake out the clone_snaps entries anyway (with
14257     // blank snap lists).
14258     p.second.head_exists = true;
14259     if (pool.info.allow_incomplete_clones()) {
14260       for (auto s : p.second.clones) {
14261         if (p.second.clone_snaps.count(s) == 0) {
14262           dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14263                    << s << dendl;
14264           p.second.clone_snaps[s];
14265         }
14266       }
14267     }
14268     if (p.second.clones.size() != p.second.clone_snaps.size() ||
14269         p.second.is_legacy()) {
14270       // this happens if we encounter other errors above, like a missing
14271       // or extra clone.
14272       dout(10) << __func__ << " not writing snapset to " << p.first
14273                << " snapset " << p.second << " clones " << p.second.clones
14274                << "; didn't convert fully" << dendl;
14275       scrub_cstat.sum.num_legacy_snapsets++;
14276       continue;
14277     }
14278     dout(10) << __func__ << " writing snapset to " << p.first
14279              << " " << p.second << dendl;
14280     ObjectContextRef obc = get_object_context(p.first, true);
14281     if (!obc) {
14282       osd->clog->error() << info.pgid << " " << mode
14283                          << " cannot get object context for object "
14284                          << p.first;
14285       continue;
14286     } else if (obc->obs.oi.soid != p.first) {
14287       osd->clog->error() << info.pgid << " " << mode
14288                          << " object " << p.first
14289                          << " has a valid oi attr with a mismatched name, "
14290                          << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14291       continue;
14292     }
14293     ObjectContextRef snapset_obc;
14294     if (!obc->obs.exists) {
14295       snapset_obc = get_object_context(p.first.get_snapdir(), false);
14296       if (!snapset_obc) {
14297         osd->clog->error() << info.pgid << " " << mode
14298                            << " cannot get object context for "
14299                            << p.first.get_snapdir();
14300         continue;
14301       }
14302     }
14303     OpContextUPtr ctx = simple_opc_create(obc);
14304     PGTransaction *t = ctx->op_t.get();
14305     ctx->snapset_obc = snapset_obc;
14306     ctx->at_version = get_next_version();
14307     ctx->mtime = utime_t();      // do not update mtime
14308     ctx->new_snapset = p.second;
14309     if (!ctx->new_obs.exists) {
14310       dout(20) << __func__ << "   making " << p.first << " a whiteout" << dendl;
14311       ctx->new_obs.exists = true;
14312       ctx->new_snapset.head_exists = true;
14313       ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14314       ++ctx->delta_stats.num_whiteouts;
14315       ++ctx->delta_stats.num_objects;
14316       t->create(p.first);
14317       if (p.first < scrubber.start) {
14318         dout(20) << __func__ << " kludging around update outside of scrub range"
14319                  << dendl;
14320       } else {
14321         scrub_cstat.add(ctx->delta_stats);
14322       }
14323     }
14324     dout(20) << __func__ << "   final snapset " << ctx->new_snapset << dendl;
14325     assert(!ctx->new_snapset.is_legacy());
14326     finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14327     ctx->register_on_success(
14328       [this]() {
14329         dout(20) << "updating snapset" << dendl;
14330         if (--scrubber.num_digest_updates_pending == 0) {
14331           requeue_scrub();
14332         }
14333       });
14334
14335     simple_opc_submit(std::move(ctx));
14336     ++scrubber.num_digest_updates_pending;
14337   }
14338
14339   dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14340 }
14341
14342 void PrimaryLogPG::_scrub_clear_state()
14343 {
14344   scrub_cstat = object_stat_collection_t();
14345 }
14346
14347 void PrimaryLogPG::_scrub_finish()
14348 {
14349   bool repair = state_test(PG_STATE_REPAIR);
14350   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14351   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14352
14353   if (info.stats.stats_invalid) {
14354     info.stats.stats = scrub_cstat;
14355     info.stats.stats_invalid = false;
14356
14357     if (agent_state)
14358       agent_choose_mode();
14359   }
14360
14361   dout(10) << mode << " got "
14362            << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14363            << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14364            << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14365            << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14366            << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14367            << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14368            << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14369            << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14370            << dendl;
14371
14372   if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14373       scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14374       (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14375        !info.stats.dirty_stats_invalid) ||
14376       (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14377        !info.stats.omap_stats_invalid) ||
14378       (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14379        !info.stats.pin_stats_invalid) ||
14380       (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14381        !info.stats.hitset_stats_invalid) ||
14382       (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14383        !info.stats.hitset_bytes_stats_invalid) ||
14384       scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14385       scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14386     osd->clog->error() << info.pgid << " " << mode
14387                       << " stat mismatch, got "
14388                       << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14389                       << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14390                       << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14391                       << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14392                       << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14393                       << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14394                       << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14395                       << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14396                       << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14397     ++scrubber.shallow_errors;
14398
14399     if (repair) {
14400       ++scrubber.fixed;
14401       info.stats.stats = scrub_cstat;
14402       info.stats.dirty_stats_invalid = false;
14403       info.stats.omap_stats_invalid = false;
14404       info.stats.hitset_stats_invalid = false;
14405       info.stats.hitset_bytes_stats_invalid = false;
14406       publish_stats_to_osd();
14407       share_pg_info();
14408     }
14409   } else if (scrub_cstat.sum.num_legacy_snapsets !=
14410              info.stats.stats.sum.num_legacy_snapsets) {
14411     osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14412                       << " from " << info.stats.stats.sum.num_legacy_snapsets
14413                       << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14414     info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14415     publish_stats_to_osd();
14416     share_pg_info();
14417   }
14418   // Clear object context cache to get repair information
14419   if (repair)
14420     object_contexts.clear();
14421 }
14422
14423 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14424 {
14425     return osd->check_osdmap_full(missing_on);
14426 }
14427
14428 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14429 {
14430   // Only supports replicated pools
14431   assert(!pool.info.require_rollback());
14432   assert(is_primary());
14433
14434   dout(10) << __func__ << " " << soid
14435            << " peers osd.{" << actingbackfill << "}" << dendl;
14436
14437   if (!is_clean()) {
14438     block_for_clean(soid, op);
14439     return -EAGAIN;
14440   }
14441
14442   assert(!pg_log.get_missing().is_missing(soid));
14443   bufferlist bv;
14444   object_info_t oi;
14445   eversion_t v;
14446   int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14447   if (r < 0) {
14448     // Leave v and try to repair without a version, getting attr failed
14449     dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14450             << soid << " error=" << r << dendl;
14451   } else try {
14452     bufferlist::iterator bliter = bv.begin();
14453     ::decode(oi, bliter);
14454     v = oi.version;
14455   } catch (...) {
14456     // Leave v as default constructed. This will fail when sent to older OSDs, but
14457     // not much worse than failing here.
14458     dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14459   }
14460
14461   missing_loc.add_missing(soid, v, eversion_t());
14462   if (primary_error(soid, v)) {
14463     dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14464     // XXX: If we knew that there is no down osd which could include this
14465     // object, it would be nice if we could return EIO here.
14466     // If a "never fail" flag was available, that could be used
14467     // for rbd to NOT return EIO until object marked lost.
14468
14469     // Drop through to save this op in case an osd comes up with the object.
14470   }
14471
14472   // Restart the op after object becomes readable again
14473   waiting_for_unreadable_object[soid].push_back(op);
14474   op->mark_delayed("waiting for missing object");
14475
14476   if (!eio_errors_to_process) {
14477     eio_errors_to_process = true;
14478     assert(is_clean());
14479     queue_peering_event(
14480         CephPeeringEvtRef(
14481           std::make_shared<CephPeeringEvt>(
14482           get_osdmap()->get_epoch(),
14483           get_osdmap()->get_epoch(),
14484           DoRecovery())));
14485   } else {
14486     // A prior error must have already cleared clean state and queued recovery
14487     // or a map change has triggered re-peering.
14488     // Not inlining the recovery by calling maybe_kick_recovery(soid);
14489     dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
14490   }
14491
14492   return -EAGAIN;
14493 }
14494
/*---SnapTrimmer Logging---*/
// From here on log lines are prefixed with gen_prefix() of the `pg` that is
// in scope at the logging call site.
#undef dout_prefix
#define dout_prefix \
  *_dout << pg->gen_prefix()
14498
14499 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14500 {
14501   ldout(pg->cct, 20) << "enter " << state_name << dendl;
14502 }
14503
14504 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14505 {
14506   ldout(pg->cct, 20) << "exit " << state_name << dendl;
14507 }
14508
/*---SnapTrimmer states---*/
// Inside the state classes the PG is reached through the SnapTrimmer context,
// and the current state's name is appended to the prefix.
#undef dout_prefix
#define dout_prefix                                      \
  (*_dout << context< SnapTrimmer >().pg->gen_prefix()   \
          << "SnapTrimmer state<" << get_state_name() << ">: ")
14513
14514 /* NotTrimming */
14515 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14516   : my_base(ctx),
14517     NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14518 {
14519   context< SnapTrimmer >().log_enter(state_name);
14520 }
14521
14522 void PrimaryLogPG::NotTrimming::exit()
14523 {
14524   context< SnapTrimmer >().log_exit(state_name, enter_time);
14525 }
14526
14527 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14528 {
14529   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14530   ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14531
14532   if (!(pg->is_primary() && pg->is_active())) {
14533     ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14534     return discard_event();
14535   }
14536   if (!pg->is_clean() ||
14537       pg->snap_trimq.empty()) {
14538     ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14539     return discard_event();
14540   }
14541   if (pg->scrubber.active) {
14542     ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14543     return transit< WaitScrub >();
14544   } else {
14545     return transit< Trimming >();
14546   }
14547 }
14548
14549 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14550 {
14551   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14552   ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14553
14554   pending = nullptr;
14555   if (!context< SnapTrimmer >().can_trim()) {
14556     post_event(KickTrim());
14557     return transit< NotTrimming >();
14558   }
14559
14560   context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14561   ldout(pg->cct, 10) << "NotTrimming: trimming "
14562                      << pg->snap_trimq.range_start()
14563                      << dendl;
14564   return transit< AwaitAsyncWork >();
14565 }
14566
14567 /* AwaitAsyncWork */
14568 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14569   : my_base(ctx),
14570     NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14571 {
14572   auto *pg = context< SnapTrimmer >().pg;
14573   context< SnapTrimmer >().log_enter(state_name);
14574   context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
14575   pg->state_set(PG_STATE_SNAPTRIM);
14576   pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14577   pg->publish_stats_to_osd();
14578 }
14579
14580 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14581 {
14582   PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14583   snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14584   auto &in_flight = context<Trimming>().in_flight;
14585   assert(in_flight.empty());
14586
14587   assert(pg->is_primary() && pg->is_active());
14588   if (!context< SnapTrimmer >().can_trim()) {
14589     ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14590     post_event(KickTrim());
14591     return transit< NotTrimming >();
14592   }
14593
14594   ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14595
14596   vector<hobject_t> to_trim;
14597   unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14598   to_trim.reserve(max);
14599   int r = pg->snap_mapper.get_next_objects_to_trim(
14600     snap_to_trim,
14601     max,
14602     &to_trim);
14603   if (r != 0 && r != -ENOENT) {
14604     lderr(pg->cct) << "get_next_objects_to_trim returned "
14605                    << cpp_strerror(r) << dendl;
14606     assert(0 == "get_next_objects_to_trim returned an invalid code");
14607   } else if (r == -ENOENT) {
14608     // Done!
14609     ldout(pg->cct, 10) << "got ENOENT" << dendl;
14610
14611     ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14612                        << " to purged_snaps"
14613                        << dendl;
14614     pg->info.purged_snaps.insert(snap_to_trim);
14615     pg->snap_trimq.erase(snap_to_trim);
14616     ldout(pg->cct, 10) << "purged_snaps now "
14617                        << pg->info.purged_snaps << ", snap_trimq now "
14618                        << pg->snap_trimq << dendl;
14619
14620     ObjectStore::Transaction t;
14621     pg->dirty_big_info = true;
14622     pg->write_if_dirty(t);
14623     int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14624     assert(tr == 0);
14625
14626     pg->share_pg_info();
14627     post_event(KickTrim());
14628     return transit< NotTrimming >();
14629   }
14630   assert(!to_trim.empty());
14631
14632   for (auto &&object: to_trim) {
14633     // Get next
14634     ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14635     OpContextUPtr ctx;
14636     int error = pg->trim_object(in_flight.empty(), object, &ctx);
14637     if (error) {
14638       if (error == -ENOLCK) {
14639         ldout(pg->cct, 10) << "could not get write lock on obj "
14640                            << object << dendl;
14641       } else {
14642         pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14643         ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14644       }
14645       if (!in_flight.empty()) {
14646         ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14647         return transit< WaitRepops >();
14648       }
14649       if (error == -ENOLCK) {
14650         ldout(pg->cct, 10) << "waiting for it to clear"
14651                            << dendl;
14652         return transit< WaitRWLock >();
14653       } else {
14654         return transit< NotTrimming >();
14655       }
14656     }
14657
14658     in_flight.insert(object);
14659     ctx->register_on_success(
14660       [pg, object, &in_flight]() {
14661         assert(in_flight.find(object) != in_flight.end());
14662         in_flight.erase(object);
14663         if (in_flight.empty()) {
14664           if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14665             pg->snap_trimmer_machine.process_event(Reset());
14666           } else {
14667             pg->snap_trimmer_machine.process_event(RepopsComplete());
14668           }
14669         }
14670       });
14671
14672     pg->simple_opc_submit(std::move(ctx));
14673   }
14674
14675   return transit< WaitRepops >();
14676 }
14677
14678 void PrimaryLogPG::setattr_maybe_cache(
14679   ObjectContextRef obc,
14680   OpContext *op,
14681   PGTransaction *t,
14682   const string &key,
14683   bufferlist &val)
14684 {
14685   t->setattr(obc->obs.oi.soid, key, val);
14686 }
14687
14688 void PrimaryLogPG::setattrs_maybe_cache(
14689   ObjectContextRef obc,
14690   OpContext *op,
14691   PGTransaction *t,
14692   map<string, bufferlist> &attrs)
14693 {
14694   t->setattrs(obc->obs.oi.soid, attrs);
14695 }
14696
14697 void PrimaryLogPG::rmattr_maybe_cache(
14698   ObjectContextRef obc,
14699   OpContext *op,
14700   PGTransaction *t,
14701   const string &key)
14702 {
14703   t->rmattr(obc->obs.oi.soid, key);
14704 }
14705
14706 int PrimaryLogPG::getattr_maybe_cache(
14707   ObjectContextRef obc,
14708   const string &key,
14709   bufferlist *val)
14710 {
14711   if (pool.info.require_rollback()) {
14712     map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14713     if (i != obc->attr_cache.end()) {
14714       if (val)
14715         *val = i->second;
14716       return 0;
14717     } else {
14718       return -ENODATA;
14719     }
14720   }
14721   return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14722 }
14723
14724 int PrimaryLogPG::getattrs_maybe_cache(
14725   ObjectContextRef obc,
14726   map<string, bufferlist> *out)
14727 {
14728   int r = 0;
14729   assert(out);
14730   if (pool.info.require_rollback()) {
14731     *out = obc->attr_cache;
14732   } else {
14733     r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14734   }
14735   map<string, bufferlist> tmp;
14736   for (map<string, bufferlist>::iterator i = out->begin();
14737        i != out->end();
14738        ++i) {
14739     if (i->first.size() > 1 && i->first[0] == '_')
14740       tmp[i->first.substr(1, i->first.size())].claim(i->second);
14741   }
14742   tmp.swap(*out);
14743   return r;
14744 }
14745
14746 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14747     return osd->check_failsafe_full(ss);
14748 }
14749
14750 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14751 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14752
#ifdef PG_DEBUG_REFS
// Reference-debugging hooks: forward to the PG's id-tracked get/put.
uint64_t get_with_id(PrimaryLogPG *pg)
{
  return pg->get_with_id();
}

void put_with_id(PrimaryLogPG *pg, uint64_t id)
{
  pg->put_with_id(id);
}
#endif
14757
14758 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14759 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }