// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};

template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}
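
// "Blessing" a completion wraps it so that, when it finally fires, it first
// takes the PG lock and verifies that the PG has not gone through an
// interval change (pg_has_reset_since) since the wrapper was created; a
// stale completion is dropped instead of running against state it no longer
// matches. A hedged usage sketch (C_MyCleanup is a hypothetical Context
// subclass, not something defined in this file):
//
//   Context *cb = bless_context(new C_MyCleanup(/* ... */));
//   t->register_on_applied(cb);  // safe even if the PG resets first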

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();

    // Restart the op context now that all reads have been
    // completed. Read failures will be handled by the op finisher
    pg->execute_ctx(this);
  }
}

class CopyFromCallback : public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results = nullptr;
  PrimaryLogPG::OpContext *ctx;
  OSDOp &osd_op;

  CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
    : ctx(ctx), osd_op(osd_op) {
  }
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    } else {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
};

struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
  CopyFromCallback *copy_from_callback;

  CopyFromFinisher(CopyFromCallback *copy_from_callback)
    : copy_from_callback(copy_from_callback) {
  }

  int execute() override {
    // instance will be destructed after this method completes
    copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
    return 0;
  }
};
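
// The copy-from flow ties these two pieces together: start_copy() is handed
// a CopyFromCallback, which re-queues the op context once the object data
// has been fetched, and a CopyFromFinisher then commits the copied data via
// finish_copyfrom(). A hedged sketch of how the COPY_FROM op wires them up
// (argument list abbreviated; see do_osd_ops for the real call):
//
//   CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
//   ctx->op_finishers[ctx->current_osd_subop_num].reset(
//     new CopyFromFinisher(cb));
//   start_copy(cb, ctx->obc, src, src_oloc, src_version, flags, /* ... */);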

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    bool error = false;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      if (p != recovery_info.ss.clone_snaps.end()) {
        snaps.insert(p->second.begin(), p->second.end());
      } else {
        derr << __func__ << " " << hoid << " had no clone_snaps" << dendl;
        error = true;
      }
    }
    if (!error) {
      dout(20) << " snaps " << snaps << dendl;
      snap_mapper.add_oid(
        recovery_info.soid,
        snaps,
        &_t);
    }
  }
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));
  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff,
  bool is_delete)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  if (!is_delete) {
    // recover missing won't have had an obc, but it gets filled in
    // during on_local_recover
    assert(i->second);
    list<OpRequestRef> requeue_list;
    i->second->drop_recovery_read(&requeue_list);
    requeue_ops(requeue_list);
  }

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::on_primary_error(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  primary_failed(oid);
  primary_error(oid, v);
  backfill_add_missing(oid, v);
}

void PrimaryLogPG::backfill_add_missing(
  const hobject_t &oid,
  eversion_t v)
{
  dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
  backfills_in_flight.erase(oid);
  missing_loc.add_missing(oid, v, eversion_t());
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else if (missing_loc.is_deleted(soid)) {
      prep_object_replica_deletes(soid, v, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}
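
// In other words, an object counts as degraded or backfilling while (a)
// someone is already waiting on it, (b) the primary itself is missing it,
// (c) any shard in actingbackfill is missing it, or (d) it falls inside the
// window a backfill target is actively copying: at or past that peer's
// last_backfill but not past last_backfill_started, with a push in flight.
// Writes to such objects must wait so that replicas never diverge
// mid-recovery.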

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_for_clean(
  const hobject_t& oid, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid
           << " on primary repair" << dendl;
  waiting_for_clean_to_primary_repair.push_back(op);
  op->mark_delayed("waiting for clean to repair");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILLING |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}
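
// The trigger threshold is osd_max_pg_log_entries *
// osd_force_recovery_pg_log_entries_factor. As a hedged example with
// hypothetical values: osd_max_pg_log_entries = 10000 and a factor of 1.3
// would force recovery of the oldest missing object once the log holds
// roughly 13000 entries, before the log grows so long that a lagging peer
// would have to fall back to backfill instead of log-based recovery.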

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}
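
// The "plain" filter is thus an exact byte-for-byte match: an object passes
// only if the named xattr exists and its value is identical in both length
// and content to the value the client encoded into the filter parameters.
// Prefix or substring matching would require a custom object-class filter
// (see get_pgls_filter below).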

bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}
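
// The filter type string selects the implementation: "parent" and "plain"
// are the two built-ins, and anything of the form "<class>.<filter>" is
// resolved through the object-class handler. As a hedged illustration, a
// client requesting a hypothetical filter "myclass.bigobjects" would cause
// this code to open_class("myclass") and then look up the "bigobjects"
// filter that class registered when it was loaded.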


// ==========================================================

int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const auto &missing = pg_log.get_missing();
  string prefix;
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("snap_trimq_len", snap_trimq.size());
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << prefix;
  return -EINVAL;
}
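
// These handlers back the per-PG admin commands. A usage sketch (CLI form;
// pgid and offset are placeholders, exact output depends on the cluster):
//
//   ceph pg <pgid> query
//   ceph pg <pgid> list_missing [<offset-json>]
//   ceph pg <pgid> mark_unfound_lost revert|delete
//
// "query" dumps state and peering info, "list_missing" pages through
// unfound objects starting after the supplied offset, and
// "mark_unfound_lost" either reverts unfound objects to a prior version or
// deletes them (revert is refused for EC pools, as above).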

// ==========================================================

void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}
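
// Both listing paths above merge two sorted streams: the objects actually
// present on disk (sentries, from objects_list_partial) and the objects
// this replica is still missing (from the PG log). Merging in hobject sort
// order means a listing taken during recovery still reports objects that
// exist logically but have not been pushed here yet, and the returned
// handle lets the client resume exactly where the previous page ended.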

int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILLING |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = MIN(pg_log.get_log().approx_size() - target,
                             cct->_conf->osd_pg_log_trim_max);
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min &&
        cct->_conf->osd_pg_log_trim_max >= cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}
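
// Trimming is bounded on both sides: never past "limit" (the older of
// min_last_complete_ondisk and the log's can_rollback_to), never more than
// osd_pg_log_trim_max entries at once, and not at all if fewer than
// osd_pg_log_trim_min entries would go. A hedged worked example with
// hypothetical values: target 3000, log size 10000, trim_max 10000 and
// trim_min 100 give num_to_trim = 7000, clamped earlier if the limit
// version is reached while walking from the oldest entry.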

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}
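
// The acked range is the intersection of the client's reported range and
// this PG's current bounds; the PG may have split since the backoff was
// sent, so clamping [begin, end) avoids acknowledging an interval that now
// belongs to a sibling PG.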

void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for flush on " << op << dendl;
    waiting_for_flush.push_back(op);
    op->mark_delayed("waiting for flush");
    return;
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}
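
// do_request is the common admission path: ops are gated, in order, on
// (1) OSDMap freshness, (2) session backoffs, (3) peering state, and
// (4) in-flight flushes, and only then dispatched by message type. The
// waiting_for_map queue is checked first even when our map is new enough,
// so that an op never overtakes an earlier op from the same source that is
// still waiting for a map.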

hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

/** do_op - do an op
 * pg lock will be held (if multithreaded)
 * osd_lock NOT held.
 */
1855 void PrimaryLogPG::do_op(OpRequestRef& op)
1856 {
1857 FUNCTRACE();
1858 // NOTE: take a non-const pointer here; we must be careful not to
1859 // change anything that will break other reads on m (operator<<).
1860 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1861 assert(m->get_type() == CEPH_MSG_OSD_OP);
1862 if (m->finish_decode()) {
1863 op->reset_desc(); // for TrackedOp
1864 m->clear_payload();
1865 }
1866
1867 dout(20) << __func__ << ": op " << *m << dendl;
1868
1869 hobject_t head = m->get_hobj();
1870 head.snap = CEPH_NOSNAP;
1871
1872 if (!info.pgid.pgid.contains(
1873 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1874 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1875 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1876 << std::hex << head.get_hash() << std::dec << dendl;
1877 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1878 << " op " << *m;
1879 assert(!cct->_conf->osd_debug_misdirected_ops);
1880 return;
1881 }
1882
1883 bool can_backoff =
1884 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1885 SessionRef session;
1886 if (can_backoff) {
1887 session = static_cast<Session*>(m->get_connection()->get_priv());
1888 if (!session.get()) {
1889 dout(10) << __func__ << " no session" << dendl;
1890 return;
1891 }
1892 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1893
1894 if (session->check_backoff(cct, info.pgid, head, m)) {
1895 return;
1896 }
1897 }
1898
1899 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1900 // not implemented.
1901 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1902 osd->reply_op_error(op, -EINVAL);
1903 return;
1904 }
1905
1906 if (op->rmw_flags == 0) {
1907 int r = osd->osd->init_op_flags(op);
1908 if (r) {
1909 osd->reply_op_error(op, r);
1910 return;
1911 }
1912 }
1913
1914 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1915 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1916 op->may_read() &&
1917 !(op->may_write() || op->may_cache())) {
1918 // balanced reads; any replica will do
1919 if (!(is_primary() || is_replica())) {
1920 osd->handle_misdirected_op(this, op);
1921 return;
1922 }
1923 } else {
1924 // normal case; must be primary
1925 if (!is_primary()) {
1926 osd->handle_misdirected_op(this, op);
1927 return;
1928 }
1929 }
1930
1931 if (!op_has_sufficient_caps(op)) {
1932 osd->reply_op_error(op, -EPERM);
1933 return;
1934 }
1935
1936 if (op->includes_pg_op()) {
1937 return do_pg_op(op);
1938 }
1939
1940 // object name too long?
1941 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1942 dout(4) << "do_op name is longer than "
1943 << cct->_conf->osd_max_object_name_len
1944 << " bytes" << dendl;
1945 osd->reply_op_error(op, -ENAMETOOLONG);
1946 return;
1947 }
1948 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1949 dout(4) << "do_op locator is longer than "
1950 << cct->_conf->osd_max_object_name_len
1951 << " bytes" << dendl;
1952 osd->reply_op_error(op, -ENAMETOOLONG);
1953 return;
1954 }
1955 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1956 dout(4) << "do_op namespace is longer than "
1957 << cct->_conf->osd_max_object_namespace_len
1958 << " bytes" << dendl;
1959 osd->reply_op_error(op, -ENAMETOOLONG);
1960 return;
1961 }
1962
1963 if (int r = osd->store->validate_hobject_key(head)) {
1964 dout(4) << "do_op object " << head << " invalid for backing store: "
1965 << r << dendl;
1966 osd->reply_op_error(op, r);
1967 return;
1968 }
1969
1970 // blacklisted?
1971 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1972 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1973 osd->reply_op_error(op, -EBLACKLISTED);
1974 return;
1975 }
1976
1977 // order this op as a write?
1978 bool write_ordered = op->rwordered();
1979
1980 // discard due to cluster full transition? (we discard any op that
1981 // originates before the cluster or pool is marked full; the client
1982 // will resend after the full flag is removed or if they expect the
1983 // op to succeed despite being full). The exceptions are FULL_FORCE
1984 // and FULL_TRY ops, which bypass all full checks anyway, so there is
1985 // no reason to discard them. If this op isn't write-ordered, we skip
1986 // this check.
1987 // FIXME: we exclude mds writes for now.
1988 if (write_ordered && !(m->get_source().is_mds() ||
1989 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1990 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1991 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1992 dout(10) << __func__ << " discarding op sent before full " << m << " "
1993 << *m << dendl;
1994 return;
1995 }
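
// A client-side sketch of the exception noted above: a librados user can opt
// into FULL_TRY so its ops are not discarded (or blocked) across a full
// transition. set_osdmap_full_try() is the librados call as of Luminous;
// treat the exact name and availability as an assumption to verify against
// your librados.hpp.
#if 0
#include <rados/librados.hpp>

int write_despite_full(librados::IoCtx& ioctx)
{
  ioctx.set_osdmap_full_try();        // tag subsequent ops with FULL_TRY
  librados::bufferlist bl;
  bl.append("payload");
  return ioctx.write_full("example_object", bl);  // may proceed even when full
}
#endif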
1996 // The MDS should have stopped writing before this point.
1997 // We can't allow the OSD to become non-startable even if the MDS
1998 // is still writing as part of file removals.
1999 ostringstream ss;
2000 if (write_ordered && osd->check_failsafe_full(ss)) {
2001 dout(10) << __func__ << " fail-safe full check failed, dropping request"
2002 << ss.str()
2003 << dendl;
2004 return;
2005 }
2006 int64_t poolid = get_pgid().pool();
2007 if (op->may_write()) {
2008
2009 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
2010 if (!pi) {
2011 return;
2012 }
2013
2014 // invalid?
2015 if (m->get_snapid() != CEPH_NOSNAP) {
2016 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
2017 osd->reply_op_error(op, -EINVAL);
2018 return;
2019 }
2020
2021 // too big?
2022 if (cct->_conf->osd_max_write_size &&
2023 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2024 // journal can't hold commit!
2025 derr << "do_op msg data len " << m->get_data_len()
2026 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2027 << " on " << *m << dendl;
2028 osd->reply_op_error(op, -OSD_WRITETOOBIG);
2029 return;
2030 }
2031 }
2032
2033 dout(10) << "do_op " << *m
2034 << (op->may_write() ? " may_write" : "")
2035 << (op->may_read() ? " may_read" : "")
2036 << (op->may_cache() ? " may_cache" : "")
2037 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2038 << " flags " << ceph_osd_flag_string(m->get_flags())
2039 << dendl;
2040
2041 // missing object?
2042 if (is_unreadable_object(head)) {
2043 if (!is_primary()) {
2044 osd->reply_op_error(op, -EAGAIN);
2045 return;
2046 }
2047 if (can_backoff &&
2048 (g_conf->osd_backoff_on_degraded ||
2049 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2050 add_backoff(session, head, head);
2051 maybe_kick_recovery(head);
2052 } else {
2053 wait_for_unreadable_object(head, op);
2054 }
2055 return;
2056 }
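
// The backoff-versus-queue choice above, reduced to a pure function. A
// backoff can only be sent when the client advertised
// CEPH_FEATURE_RADOS_BACKOFF; otherwise the op has to wait on the OSD. This
// is a simplified sketch of the branch logic, not the real interface.
#if 0
enum class MissingAction { SendBackoff, QueueLocally };

MissingAction on_unreadable(bool can_backoff,
                            bool backoff_on_degraded,  // osd_backoff_on_degraded
                            bool backoff_on_unfound,   // osd_backoff_on_unfound
                            bool object_unfound)
{
  if (can_backoff &&
      (backoff_on_degraded || (backoff_on_unfound && object_unfound)))
    return MissingAction::SendBackoff;  // push the wait back to the client
  return MissingAction::QueueLocally;   // hold the op until recovery catches up
}
#endif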
2057
2058 // degraded object?
2059 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2060 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2061 add_backoff(session, head, head);
2062 maybe_kick_recovery(head);
2063 } else {
2064 wait_for_degraded_object(head, op);
2065 }
2066 return;
2067 }
2068
2069 if (write_ordered && scrubber.is_chunky_scrub_active() &&
2070 write_blocked_by_scrub(head)) {
2071 dout(20) << __func__ << ": waiting for scrub" << dendl;
2072 waiting_for_scrub.push_back(op);
2073 op->mark_delayed("waiting for scrub");
2074 return;
2075 }
2076
2077 // blocked on snap?
2078 map<hobject_t, snapid_t>::iterator blocked_iter =
2079 objects_blocked_on_degraded_snap.find(head);
2080 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2081 hobject_t to_wait_on(head);
2082 to_wait_on.snap = blocked_iter->second;
2083 wait_for_degraded_object(to_wait_on, op);
2084 return;
2085 }
2086 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2087 objects_blocked_on_snap_promotion.find(head);
2088 if (write_ordered &&
2089 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2090 wait_for_blocked_object(
2091 blocked_snap_promote_iter->second->obs.oi.soid,
2092 op);
2093 return;
2094 }
2095 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2096 block_write_on_full_cache(head, op);
2097 return;
2098 }
2099
2100 // missing snapdir?
2101 hobject_t snapdir = head.get_snapdir();
2102
2103 if (is_unreadable_object(snapdir)) {
2104 wait_for_unreadable_object(snapdir, op);
2105 return;
2106 }
2107
2108 // degraded object?
2109 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2110 wait_for_degraded_object(snapdir, op);
2111 return;
2112 }
2113
2114 // dup/resent?
2115 if (op->may_write() || op->may_cache()) {
2116 // warning: we will get back *a* request for this reqid, but not
2117 // necessarily the most recent. this happens with flush and
2118 // promote ops, but we can't possibly have both in our log where
2119 // the original request is still not stable on disk, so for our
2120 // purposes here it doesn't matter which one we get.
2121 eversion_t version;
2122 version_t user_version;
2123 int return_code = 0;
2124 bool got = check_in_progress_op(
2125 m->get_reqid(), &version, &user_version, &return_code);
2126 if (got) {
2127 dout(3) << __func__ << " dup " << m->get_reqid()
2128 << " version " << version << dendl;
2129 if (already_complete(version)) {
2130 osd->reply_op_error(op, return_code, version, user_version);
2131 } else {
2132 dout(10) << " waiting for " << version << " to commit" << dendl;
2133 // always queue ondisk waiters, so that we can requeue if needed
2134 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2135 op->mark_delayed("waiting for ondisk");
2136 }
2137 return;
2138 }
2139 }
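
// A standalone sketch of the dup check above: completed (or in-flight) ops
// are indexed by client reqid; a hit either replies immediately with the
// recorded result or waits for that version to commit. Reqid/Completed are
// hypothetical simplifications of osd_reqid_t and the pg log entry.
#if 0
#include <map>
#include <optional>
#include <tuple>

struct Reqid {
  long client, tid;
  bool operator<(const Reqid& o) const {
    return std::tie(client, tid) < std::tie(o.client, o.tid);
  }
};
struct Completed { unsigned version; int result; };

std::optional<Completed> check_dup(const std::map<Reqid, Completed>& log,
                                   const Reqid& r)
{
  auto it = log.find(r);
  if (it == log.end())
    return std::nullopt;  // fresh request: execute it
  return it->second;      // dup: reply (or wait) with the recorded result
}
#endif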
2140
2141 ObjectContextRef obc;
2142 bool can_create = op->may_write() || op->may_cache();
2143 hobject_t missing_oid;
2144 const hobject_t& oid = m->get_hobj();
2145
2146 // io blocked on obc?
2147 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2148 maybe_await_blocked_snapset(oid, op)) {
2149 return;
2150 }
2151
2152 int r = find_object_context(
2153 oid, &obc, can_create,
2154 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2155 &missing_oid);
2156
2157 if (r == -EAGAIN) {
2158 // If we're not the primary for this PG, we just return -EAGAIN. Otherwise,
2159 // we have to wait for the object.
2160 if (is_primary()) {
2161 // missing the specific snap we need; requeue and wait.
2162 assert(!op->may_write()); // only happens on a read/cache
2163 wait_for_unreadable_object(missing_oid, op);
2164 return;
2165 }
2166 } else if (r == 0) {
2167 if (is_unreadable_object(obc->obs.oi.soid)) {
2168 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2169 << " is unreadable, waiting" << dendl;
2170 wait_for_unreadable_object(obc->obs.oi.soid, op);
2171 return;
2172 }
2173
2174 // degraded object? (the check above was for head; this could be a clone)
2175 if (write_ordered &&
2176 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2177 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2178 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2179 << " is degraded, waiting" << dendl;
2180 wait_for_degraded_object(obc->obs.oi.soid, op);
2181 return;
2182 }
2183 }
2184
2185 bool in_hit_set = false;
2186 if (hit_set) {
2187 if (obc.get()) {
2188 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2189 in_hit_set = true;
2190 } else {
2191 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2192 in_hit_set = true;
2193 }
2194 if (!op->hitset_inserted) {
2195 hit_set->insert(oid);
2196 op->hitset_inserted = true;
2197 if (hit_set->is_full() ||
2198 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2199 hit_set_persist();
2200 }
2201 }
2202 }
2203
2204 if (agent_state) {
2205 if (agent_choose_mode(false, op))
2206 return;
2207 }
2208
2209 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2210 if (maybe_handle_manifest(op,
2211 write_ordered,
2212 obc))
2213 return;
2214 }
2215
2216 if (maybe_handle_cache(op,
2217 write_ordered,
2218 obc,
2219 r,
2220 missing_oid,
2221 false,
2222 in_hit_set))
2223 return;
2224
2225 if (r && (r != -ENOENT || !obc)) {
2226 // copy the reqids for copy get on ENOENT
2227 if (r == -ENOENT &&
2228 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2229 fill_in_copy_get_noent(op, oid, m->ops[0]);
2230 return;
2231 }
2232 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2233 if (op->may_write() &&
2234 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2235 record_write_error(op, oid, nullptr, r);
2236 } else {
2237 osd->reply_op_error(op, r);
2238 }
2239 return;
2240 }
2241
2242 // make sure locator is consistent
2243 object_locator_t oloc(obc->obs.oi.soid);
2244 if (m->get_object_locator() != oloc) {
2245 dout(10) << " provided locator " << m->get_object_locator()
2246 << " != object's " << obc->obs.oi.soid << dendl;
2247 osd->clog->warn() << "bad locator " << m->get_object_locator()
2248 << " on object " << oloc
2249 << " op " << *m;
2250 }
2251
2252 // io blocked on obc?
2253 if (obc->is_blocked() &&
2254 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2255 wait_for_blocked_object(obc->obs.oi.soid, op);
2256 return;
2257 }
2258
2259 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2260
2261 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2262 OSDOp& osd_op = *p;
2263
2264 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2265 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2266 m->get_snapid() != CEPH_SNAPDIR) {
2267 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2268 osd->reply_op_error(op, -EINVAL);
2269 return;
2270 }
2271 }
2272
2273 OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2274
2275 if (!obc->obs.exists)
2276 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2277
2278 /* Due to obc caching, we might have a cached non-existent snapset_obc
2279 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2280 * do_op pipeline make decisions based on whether snapset_obc is
2281 * populated.
2282 */
2283 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2284 ctx->snapset_obc = ObjectContextRef();
2285
2286 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2287 dout(20) << __func__ << ": skipping rw locks" << dendl;
2288 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2289 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2290
2291 // verify there is in fact a flush in progress
2292 // FIXME: we could make this a stronger test.
2293 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2294 if (p == flush_ops.end()) {
2295 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2296 reply_ctx(ctx, -EINVAL);
2297 return;
2298 }
2299 } else if (!get_rw_locks(write_ordered, ctx)) {
2300 dout(20) << __func__ << " waiting for rw locks " << dendl;
2301 op->mark_delayed("waiting for rw locks");
2302 close_op_ctx(ctx);
2303 return;
2304 }
2305 dout(20) << __func__ << " obc " << *obc << dendl;
2306
2307 if (r) {
2308 dout(20) << __func__ << " returned an error: " << r << dendl;
2309 close_op_ctx(ctx);
2310 if (op->may_write() &&
2311 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2312 record_write_error(op, oid, nullptr, r);
2313 } else {
2314 osd->reply_op_error(op, r);
2315 }
2316 return;
2317 }
2318
2319 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2320 ctx->ignore_cache = true;
2321 }
2322
2323 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2324 // This object is lost. Reading from it returns an error.
2325 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2326 << " is lost" << dendl;
2327 reply_ctx(ctx, -ENFILE);
2328 return;
2329 }
2330 if (!op->may_write() &&
2331 !op->may_cache() &&
2332 (!obc->obs.exists ||
2333 ((m->get_snapid() != CEPH_SNAPDIR) &&
2334 obc->obs.oi.is_whiteout()))) {
2335 // copy the reqids for copy get on ENOENT
2336 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2337 fill_in_copy_get_noent(op, oid, m->ops[0]);
2338 close_op_ctx(ctx);
2339 return;
2340 }
2341 reply_ctx(ctx, -ENOENT);
2342 return;
2343 }
2344
2345 op->mark_started();
2346
2347 execute_ctx(ctx);
2348 utime_t prepare_latency = ceph_clock_now();
2349 prepare_latency -= op->get_dequeued_time();
2350 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2351 if (op->may_read() && op->may_write()) {
2352 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2353 } else if (op->may_read()) {
2354 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2355 } else if (op->may_write() || op->may_cache()) {
2356 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2357 }
2358
2359 // force recovery of the oldest missing object if too many logs
2360 maybe_force_recovery();
2361 }
2362
2363 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2364 OpRequestRef op,
2365 bool write_ordered,
2366 ObjectContextRef obc)
2367 {
2368 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2369 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2370 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2371 return cache_result_t::NOOP;
2372 }
2373
2374 if (obc)
2375 dout(10) << __func__ << " " << obc->obs.oi << " "
2376 << (obc->obs.exists ? "exists" : "DNE")
2377 << dendl;
2378
2379 // if it is write-ordered and blocked, stop now
2380 if (obc.get() && obc->is_blocked() && write_ordered) {
2381 // we're already doing something with this object
2382 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2383 return cache_result_t::NOOP;
2384 }
2385
2386 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2387 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2388 OSDOp& osd_op = *p;
2389 ceph_osd_op& op = osd_op.op;
2390 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2391 return cache_result_t::NOOP;
2392 }
2393 }
2394
2395 switch (obc->obs.oi.manifest.type) {
2396 case object_manifest_t::TYPE_REDIRECT:
2397 if (op->may_write() || write_ordered) {
2398 do_proxy_write(op, obc->obs.oi.soid, obc);
2399 } else {
2400 do_proxy_read(op, obc);
2401 }
2402 return cache_result_t::HANDLED_PROXY;
2403 case object_manifest_t::TYPE_CHUNKED:
2404 default:
2405 assert(0 == "unrecognized manifest type");
2406 }
2407
2408 return cache_result_t::NOOP;
2409 }
2410
2411 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2412 MOSDOpReply *orig_reply, int r)
2413 {
2414 dout(20) << __func__ << " r=" << r << dendl;
2415 assert(op->may_write());
2416 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2417 mempool::osd_pglog::list<pg_log_entry_t> entries;
2418 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2419 get_next_version(), eversion_t(), 0,
2420 reqid, utime_t(), r));
2421
2422 struct OnComplete {
2423 PrimaryLogPG *pg;
2424 OpRequestRef op;
2425 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2426 int r;
2427 OnComplete(
2428 PrimaryLogPG *pg,
2429 OpRequestRef op,
2430 MOSDOpReply *orig_reply,
2431 int r)
2432 : pg(pg), op(op),
2433 orig_reply(orig_reply, false /* take over ref */), r(r)
2434 {}
2435 void operator()() {
2436 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2437 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2438 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2439 MOSDOpReply *reply = orig_reply.detach();
2440 if (reply == nullptr) {
2441 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2442 flags, true);
2443 }
2444 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2445 pg->osd->send_message_osd_client(reply, m->get_connection());
2446 }
2447 };
2448
2449 ObcLockManager lock_manager;
2450 submit_log_entries(
2451 entries,
2452 std::move(lock_manager),
2453 boost::optional<std::function<void(void)> >(
2454 OnComplete(this, op, orig_reply, r)),
2455 op,
2456 r);
2457 }
2458
2459 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2460 OpRequestRef op,
2461 bool write_ordered,
2462 ObjectContextRef obc,
2463 int r, hobject_t missing_oid,
2464 bool must_promote,
2465 bool in_hit_set,
2466 ObjectContextRef *promote_obc)
2467 {
2468 // return quickly if caching is not enabled
2469 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2470 return cache_result_t::NOOP;
2471
2472 if (op &&
2473 op->get_req() &&
2474 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2475 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2476 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2477 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2478 return cache_result_t::NOOP;
2479 }
2480
2481 must_promote = must_promote || op->need_promote();
2482
2483 if (obc)
2484 dout(25) << __func__ << " " << obc->obs.oi << " "
2485 << (obc->obs.exists ? "exists" : "DNE")
2486 << " missing_oid " << missing_oid
2487 << " must_promote " << (int)must_promote
2488 << " in_hit_set " << (int)in_hit_set
2489 << dendl;
2490 else
2491 dout(25) << __func__ << " (no obc)"
2492 << " missing_oid " << missing_oid
2493 << " must_promote " << (int)must_promote
2494 << " in_hit_set " << (int)in_hit_set
2495 << dendl;
2496
2497 // if it is write-ordered and blocked, stop now
2498 if (obc.get() && obc->is_blocked() && write_ordered) {
2499 // we're already doing something with this object
2500 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2501 return cache_result_t::NOOP;
2502 }
2503
2504 if (r == -ENOENT && missing_oid == hobject_t()) {
2505 // we know this object is logically absent (e.g., an undefined clone)
2506 return cache_result_t::NOOP;
2507 }
2508
2509 if (obc.get() && obc->obs.exists) {
2510 osd->logger->inc(l_osd_op_cache_hit);
2511 return cache_result_t::NOOP;
2512 }
2513 if (!is_primary()) {
2514 dout(20) << __func__ << " cache miss; ask the primary" << dendl;
2515 osd->reply_op_error(op, -EAGAIN);
2516 return cache_result_t::REPLIED_WITH_EAGAIN;
2517 }
2518
2519 if (missing_oid == hobject_t() && obc.get()) {
2520 missing_oid = obc->obs.oi.soid;
2521 }
2522
2523 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2524 const object_locator_t oloc = m->get_object_locator();
2525
2526 if (op->need_skip_handle_cache()) {
2527 return cache_result_t::NOOP;
2528 }
2529
2530 // older versions do not proxy the feature bits.
2531 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2532 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2533 OpRequestRef promote_op;
2534
2535 switch (pool.info.cache_mode) {
2536 case pg_pool_t::CACHEMODE_WRITEBACK:
2537 if (agent_state &&
2538 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2539 if (!op->may_write() && !op->may_cache() &&
2540 !write_ordered && !must_promote) {
2541 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2542 do_proxy_read(op);
2543 return cache_result_t::HANDLED_PROXY;
2544 }
2545 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2546 block_write_on_full_cache(missing_oid, op);
2547 return cache_result_t::BLOCKED_FULL;
2548 }
2549
2550 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2551 promote_object(obc, missing_oid, oloc, op, promote_obc);
2552 return cache_result_t::BLOCKED_PROMOTE;
2553 }
2554
2555 if (op->may_write() || op->may_cache()) {
2556 if (can_proxy_write) {
2557 do_proxy_write(op, missing_oid);
2558 } else {
2559 // promote if can't proxy the write
2560 promote_object(obc, missing_oid, oloc, op, promote_obc);
2561 return cache_result_t::BLOCKED_PROMOTE;
2562 }
2563
2564 // Promote too?
2565 if (!op->need_skip_promote() &&
2566 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2567 pool.info.min_write_recency_for_promote,
2568 OpRequestRef(),
2569 promote_obc)) {
2570 return cache_result_t::BLOCKED_PROMOTE;
2571 }
2572 return cache_result_t::HANDLED_PROXY;
2573 } else {
2574 do_proxy_read(op);
2575
2576 // Avoid duplicate promotion
2577 if (obc.get() && obc->is_blocked()) {
2578 if (promote_obc)
2579 *promote_obc = obc;
2580 return cache_result_t::BLOCKED_PROMOTE;
2581 }
2582
2583 // Promote too?
2584 if (!op->need_skip_promote()) {
2585 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2586 pool.info.min_read_recency_for_promote,
2587 promote_op, promote_obc);
2588 }
2589
2590 return cache_result_t::HANDLED_PROXY;
2591 }
2592 assert(0 == "unreachable");
2593 return cache_result_t::NOOP;
2594
2595 case pg_pool_t::CACHEMODE_FORWARD:
2596 // FIXME: this mode allows requests to be reordered.
2597 do_cache_redirect(op);
2598 return cache_result_t::HANDLED_REDIRECT;
2599
2600 case pg_pool_t::CACHEMODE_READONLY:
2601 // TODO: clean this case up
2602 if (!obc.get() && r == -ENOENT) {
2603 // we don't have the object and the op is a read
2604 promote_object(obc, missing_oid, oloc, op, promote_obc);
2605 return cache_result_t::BLOCKED_PROMOTE;
2606 }
2607 if (!r) { // it must be a write
2608 do_cache_redirect(op);
2609 return cache_result_t::HANDLED_REDIRECT;
2610 }
2611 // crap, there was a failure of some kind
2612 return cache_result_t::NOOP;
2613
2614 case pg_pool_t::CACHEMODE_READFORWARD:
2615 // Do writeback to the cache tier for writes
2616 if (op->may_write() || write_ordered || must_promote) {
2617 if (agent_state &&
2618 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2619 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2620 block_write_on_full_cache(missing_oid, op);
2621 return cache_result_t::BLOCKED_FULL;
2622 }
2623 promote_object(obc, missing_oid, oloc, op, promote_obc);
2624 return cache_result_t::BLOCKED_PROMOTE;
2625 }
2626
2627 // This is a read; forward (redirect) it to the base tier.
2628 do_cache_redirect(op);
2629 return cache_result_t::HANDLED_REDIRECT;
2630
2631 case pg_pool_t::CACHEMODE_PROXY:
2632 if (!must_promote) {
2633 if (op->may_write() || op->may_cache() || write_ordered) {
2634 if (can_proxy_write) {
2635 do_proxy_write(op, missing_oid);
2636 return cache_result_t::HANDLED_PROXY;
2637 }
2638 } else {
2639 do_proxy_read(op);
2640 return cache_result_t::HANDLED_PROXY;
2641 }
2642 }
2643 // ugh, we're forced to promote.
2644 if (agent_state &&
2645 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2646 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2647 block_write_on_full_cache(missing_oid, op);
2648 return cache_result_t::BLOCKED_FULL;
2649 }
2650 promote_object(obc, missing_oid, oloc, op, promote_obc);
2651 return cache_result_t::BLOCKED_PROMOTE;
2652
2653 case pg_pool_t::CACHEMODE_READPROXY:
2654 // Do writeback to the cache tier for writes
2655 if (op->may_write() || write_ordered || must_promote) {
2656 if (agent_state &&
2657 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2658 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2659 block_write_on_full_cache(missing_oid, op);
2660 return cache_result_t::BLOCKED_FULL;
2661 }
2662 promote_object(obc, missing_oid, oloc, op, promote_obc);
2663 return cache_result_t::BLOCKED_PROMOTE;
2664 }
2665
2666 // This is a read; proxy it to the base tier.
2667 do_proxy_read(op);
2668 return cache_result_t::HANDLED_PROXY;
2669
2670 default:
2671 assert(0 == "unrecognized cache_mode");
2672 }
2673 return cache_result_t::NOOP;
2674 }
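
// The per-mode handling above, condensed to a miss-dispatch table for the
// common cases (no must_promote, cache not full, proxying supported). Modes
// and actions are hypothetical simplified enums; WRITEBACK and PROXY may also
// kick off an asynchronous promote in the real code.
#if 0
enum class Mode { Writeback, Forward, Readonly, Readforward, Proxy, Readproxy };
enum class Action { Proxy, Redirect, BlockOnPromote };

Action miss_action(Mode mode, bool is_write)
{
  switch (mode) {
  case Mode::Writeback:   return Action::Proxy;  // plus possible async promote
  case Mode::Forward:     return Action::Redirect;
  case Mode::Readonly:    return is_write ? Action::Redirect
                                          : Action::BlockOnPromote;
  case Mode::Readforward: return is_write ? Action::BlockOnPromote
                                          : Action::Redirect;
  case Mode::Proxy:       return Action::Proxy;
  case Mode::Readproxy:   return is_write ? Action::BlockOnPromote
                                          : Action::Proxy;
  }
  return Action::BlockOnPromote;  // unreachable with a valid mode
}
#endif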
2675
2676 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2677 const hobject_t& missing_oid,
2678 const object_locator_t& oloc,
2679 bool in_hit_set,
2680 uint32_t recency,
2681 OpRequestRef promote_op,
2682 ObjectContextRef *promote_obc)
2683 {
2684 dout(20) << __func__ << " missing_oid " << missing_oid
2685 << " in_hit_set " << in_hit_set << dendl;
2686
2687 switch (recency) {
2688 case 0:
2689 break;
2690 case 1:
2691 // Check if in the current hit set
2692 if (in_hit_set) {
2693 break;
2694 } else {
2695 // not promoting
2696 return false;
2697 }
2698 break;
2699 default:
2700 {
2701 unsigned count = (int)in_hit_set;
2702 if (count) {
2703 // Check if in other hit sets
2704 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2705 for (map<time_t,HitSetRef>::reverse_iterator itor =
2706 agent_state->hit_set_map.rbegin();
2707 itor != agent_state->hit_set_map.rend();
2708 ++itor) {
2709 if (!itor->second->contains(oid)) {
2710 break;
2711 }
2712 ++count;
2713 if (count >= recency) {
2714 break;
2715 }
2716 }
2717 }
2718 if (count >= recency) {
2719 break;
2720 }
2721 return false; // not promoting
2722 }
2723 break;
2724 }
2725
2726 if (osd->promote_throttle()) {
2727 dout(10) << __func__ << " promote throttled" << dendl;
2728 return false;
2729 }
2730 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2731 return true;
2732 }
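
// The recency rule above as a pure function: recency 0 promotes on any
// access; recency 1 requires a hit in the current hit set; higher values
// additionally require an unbroken streak through the most recent archived
// hit sets, newest first. The vector<bool> input is a hypothetical stand-in
// for membership tests against agent_state->hit_set_map.
#if 0
#include <vector>

bool recent_enough(bool in_current_hit_set,
                   const std::vector<bool>& in_archived_newest_first,
                   unsigned recency)
{
  if (recency == 0)
    return true;                 // promote unconditionally
  if (!in_current_hit_set)
    return false;
  unsigned count = 1;
  for (bool hit : in_archived_newest_first) {
    if (!hit)
      break;                     // streak broken: stop counting
    if (++count >= recency)
      break;
  }
  return count >= recency;
}
#endif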
2733
2734 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2735 {
2736 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2737 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2738 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2739 get_osdmap()->get_epoch(), flags, false);
2740 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2741 reply->set_redirect(redir);
2742 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2743 << op << dendl;
2744 m->get_connection()->send_message(reply);
2745 return;
2746 }
2747
2748 struct C_ProxyRead : public Context {
2749 PrimaryLogPGRef pg;
2750 hobject_t oid;
2751 epoch_t last_peering_reset;
2752 ceph_tid_t tid;
2753 PrimaryLogPG::ProxyReadOpRef prdop;
2754 utime_t start;
2755 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2756 const PrimaryLogPG::ProxyReadOpRef& prd)
2757 : pg(p), oid(o), last_peering_reset(lpr),
2758 tid(0), prdop(prd), start(ceph_clock_now())
2759 {}
2760 void finish(int r) override {
2761 if (prdop->canceled)
2762 return;
2763 pg->lock();
2764 if (prdop->canceled) {
2765 pg->unlock();
2766 return;
2767 }
2768 if (last_peering_reset == pg->get_last_peering_reset()) {
2769 pg->finish_proxy_read(oid, tid, r);
2770 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2771 }
2772 pg->unlock();
2773 }
2774 };
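
// The cancellation pattern used by the completion above, in isolation: check
// `canceled` once without the lock as a fast path, then recheck under the PG
// lock, since cancel_proxy_read() can race with the first check. Sketched
// with std::mutex standing in for the PG lock.
#if 0
#include <mutex>

struct CancelableCompletion {
  std::mutex& pg_lock;
  const bool& canceled;

  template <typename F>
  void finish(F&& apply) {
    if (canceled)
      return;                           // unlocked fast path
    std::lock_guard<std::mutex> l(pg_lock);
    if (canceled)
      return;                           // recheck now that we hold the lock
    apply();                            // safe: cancellation can't race us here
  }
};
#endif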
2775
2776 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2777 {
2778 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2779 // stash the result in the request's OSDOp vector
2780 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2781 object_locator_t oloc;
2782 hobject_t soid;
2783 /* extensible tier */
2784 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2785 switch (obc->obs.oi.manifest.type) {
2786 case object_manifest_t::TYPE_REDIRECT:
2787 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2788 soid = obc->obs.oi.manifest.redirect_target;
2789 break;
2790 case object_manifest_t::TYPE_CHUNKED:
2791 default:
2792 assert(0 == "unrecognized manifest type");
2793 }
2794 } else {
2795 /* proxy */
2796 soid = m->get_hobj();
2797 oloc = object_locator_t(m->get_object_locator());
2798 oloc.pool = pool.info.tier_of;
2799 }
2800 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2801
2802 // pass through some original flags that make sense.
2803 // - leave out redirection and balancing flags since we are
2804 // already proxying through the primary
2805 // - leave off read/write/exec flags that are derived from the op
2806 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2807 CEPH_OSD_FLAG_ORDERSNAP |
2808 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2809 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2810
2811 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2812
2813 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2814
2815 ObjectOperation obj_op;
2816 obj_op.dup(prdop->ops);
2817
2818 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2819 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2820 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2821 ceph_osd_op op = obj_op.ops[i].op;
2822 switch (op.op) {
2823 case CEPH_OSD_OP_READ:
2824 case CEPH_OSD_OP_SYNC_READ:
2825 case CEPH_OSD_OP_SPARSE_READ:
2826 case CEPH_OSD_OP_CHECKSUM:
2827 case CEPH_OSD_OP_CMPEXT:
2828 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2829 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2830 }
2831 }
2832 }
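
// The flag rewrite above in isolation: under a non-full writeback tier,
// proxied reads are tagged FADVISE_SEQUENTIAL and stripped of
// FADVISE_DONTNEED/NOCACHE so the base tier keeps the data cacheable for the
// promote that may follow. Bit positions here are hypothetical.
#if 0
#include <cstdint>

constexpr std::uint32_t FADVISE_SEQUENTIAL = 1u << 0;
constexpr std::uint32_t FADVISE_DONTNEED   = 1u << 1;
constexpr std::uint32_t FADVISE_NOCACHE    = 1u << 2;

constexpr std::uint32_t adjust_proxy_read_flags(std::uint32_t flags)
{
  return (flags | FADVISE_SEQUENTIAL) &
         ~(FADVISE_DONTNEED | FADVISE_NOCACHE);
}

static_assert(adjust_proxy_read_flags(FADVISE_DONTNEED) == FADVISE_SEQUENTIAL,
              "DONTNEED is cleared and SEQUENTIAL set");
#endif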
2833
2834 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2835 prdop);
2836 ceph_tid_t tid = osd->objecter->read(
2837 soid.oid, oloc, obj_op,
2838 m->get_snapid(), NULL,
2839 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2840 &prdop->user_version,
2841 &prdop->data_offset,
2842 m->get_features());
2843 fin->tid = tid;
2844 prdop->objecter_tid = tid;
2845 proxyread_ops[tid] = prdop;
2846 in_progress_proxy_ops[soid].push_back(op);
2847 }
2848
2849 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2850 {
2851 dout(10) << __func__ << " " << oid << " tid " << tid
2852 << " " << cpp_strerror(r) << dendl;
2853
2854 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2855 if (p == proxyread_ops.end()) {
2856 dout(10) << __func__ << " no proxyread_op found" << dendl;
2857 return;
2858 }
2859 ProxyReadOpRef prdop = p->second;
2860 if (tid != prdop->objecter_tid) {
2861 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2862 << " tid " << prdop->objecter_tid << dendl;
2863 return;
2864 }
2865 if (oid != prdop->soid) {
2866 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2867 << " soid " << prdop->soid << dendl;
2868 return;
2869 }
2870 proxyread_ops.erase(tid);
2871
2872 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2873 if (q == in_progress_proxy_ops.end()) {
2874 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2875 return;
2876 }
2877 assert(q->second.size());
2878 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2879 q->second.end(),
2880 prdop->op);
2881 assert(it != q->second.end());
2882 OpRequestRef op = *it;
2883 q->second.erase(it);
2884 if (q->second.size() == 0) {
2885 in_progress_proxy_ops.erase(oid);
2886 }
2887
2888 osd->logger->inc(l_osd_tier_proxy_read);
2889
2890 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2891 OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2892 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2893 ctx->user_at_version = prdop->user_version;
2894 ctx->data_off = prdop->data_offset;
2895 ctx->ignore_log_op_stats = true;
2896 complete_read_ctx(r, ctx);
2897 }
2898
2899 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2900 {
2901 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2902 if (p == in_progress_proxy_ops.end())
2903 return;
2904
2905 list<OpRequestRef>& ls = p->second;
2906 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2907 requeue_ops(ls);
2908 in_progress_proxy_ops.erase(p);
2909 }
2910
2911 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop,
2912 vector<ceph_tid_t> *tids)
2913 {
2914 dout(10) << __func__ << " " << prdop->soid << dendl;
2915 prdop->canceled = true;
2916
2917 // cancel objecter op, if we can
2918 if (prdop->objecter_tid) {
2919 tids->push_back(prdop->objecter_tid);
2920 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2921 prdop->ops[i].outdata.clear();
2922 }
2923 proxyread_ops.erase(prdop->objecter_tid);
2924 prdop->objecter_tid = 0;
2925 }
2926 }
2927
2928 void PrimaryLogPG::cancel_proxy_ops(bool requeue, vector<ceph_tid_t> *tids)
2929 {
2930 dout(10) << __func__ << dendl;
2931
2932 // cancel proxy reads
2933 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2934 while (p != proxyread_ops.end()) {
2935 cancel_proxy_read((p++)->second, tids);
2936 }
2937
2938 // cancel proxy writes
2939 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2940 while (q != proxywrite_ops.end()) {
2941 cancel_proxy_write((q++)->second, tids);
2942 }
2943
2944 if (requeue) {
2945 map<hobject_t, list<OpRequestRef>>::iterator p =
2946 in_progress_proxy_ops.begin();
2947 while (p != in_progress_proxy_ops.end()) {
2948 list<OpRequestRef>& ls = p->second;
2949 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2950 << " requests" << dendl;
2951 requeue_ops(ls);
2952 in_progress_proxy_ops.erase(p++);
2953 }
2954 } else {
2955 in_progress_proxy_ops.clear();
2956 }
2957 }
2958
2959 struct C_ProxyWrite_Commit : public Context {
2960 PrimaryLogPGRef pg;
2961 hobject_t oid;
2962 epoch_t last_peering_reset;
2963 ceph_tid_t tid;
2964 PrimaryLogPG::ProxyWriteOpRef pwop;
2965 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2966 const PrimaryLogPG::ProxyWriteOpRef& pw)
2967 : pg(p), oid(o), last_peering_reset(lpr),
2968 tid(0), pwop(pw)
2969 {}
2970 void finish(int r) override {
2971 if (pwop->canceled)
2972 return;
2973 pg->lock();
2974 if (pwop->canceled) {
2975 pg->unlock();
2976 return;
2977 }
2978 if (last_peering_reset == pg->get_last_peering_reset()) {
2979 pg->finish_proxy_write(oid, tid, r);
2980 }
2981 pg->unlock();
2982 }
2983 };
2984
2985 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2986 {
2987 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2988 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2989 object_locator_t oloc;
2990 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2991 hobject_t soid;
2992 /* extensible tier */
2993 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2994 switch (obc->obs.oi.manifest.type) {
2995 case object_manifest_t::TYPE_REDIRECT:
2996 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2997 soid = obc->obs.oi.manifest.redirect_target;
2998 break;
2999 case object_manifest_t::TYPE_CHUNKED:
3000 default:
3001 assert(0 == "unrecognized manifest type");
3002 }
3003 } else {
3004 /* proxy */
3005 soid = m->get_hobj();
3006 oloc = object_locator_t(m->get_object_locator());
3007 oloc.pool = pool.info.tier_of;
3008 }
3009
3010 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
3011 if (!(op->may_write() || op->may_cache())) {
3012 flags |= CEPH_OSD_FLAG_RWORDERED;
3013 }
3014 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
3015
3016 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
3017 pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
3018 pwop->mtime = m->get_mtime();
3019
3020 ObjectOperation obj_op;
3021 obj_op.dup(pwop->ops);
3022
3023 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
3024 this, soid, get_last_peering_reset(), pwop);
3025 ceph_tid_t tid = osd->objecter->mutate(
3026 soid.oid, oloc, obj_op, snapc,
3027 ceph::real_clock::from_ceph_timespec(pwop->mtime),
3028 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3029 &pwop->user_version, pwop->reqid);
3030 fin->tid = tid;
3031 pwop->objecter_tid = tid;
3032 proxywrite_ops[tid] = pwop;
3033 in_progress_proxy_ops[soid].push_back(op);
3034 }
3035
3036 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3037 {
3038 dout(10) << __func__ << " " << oid << " tid " << tid
3039 << " " << cpp_strerror(r) << dendl;
3040
3041 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3042 if (p == proxywrite_ops.end()) {
3043 dout(10) << __func__ << " no proxywrite_op found" << dendl;
3044 return;
3045 }
3046 ProxyWriteOpRef pwop = p->second;
3047 assert(tid == pwop->objecter_tid);
3048 assert(oid == pwop->soid);
3049
3050 proxywrite_ops.erase(tid);
3051
3052 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3053 if (q == in_progress_proxy_ops.end()) {
3054 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3055 delete pwop->ctx;
3056 pwop->ctx = NULL;
3057 return;
3058 }
3059 list<OpRequestRef>& in_progress_op = q->second;
3060 assert(in_progress_op.size());
3061 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3062 in_progress_op.end(),
3063 pwop->op);
3064 assert(it != in_progress_op.end());
3065 in_progress_op.erase(it);
3066 if (in_progress_op.size() == 0) {
3067 in_progress_proxy_ops.erase(oid);
3068 }
3069
3070 osd->logger->inc(l_osd_tier_proxy_write);
3071
3072 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3073 assert(m != NULL);
3074
3075 if (!pwop->sent_reply) {
3076 // send commit.
3077 MOSDOpReply *reply = pwop->ctx->reply;
3078 if (reply)
3079 pwop->ctx->reply = NULL;
3080 else {
3081 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3082 reply->set_reply_versions(eversion_t(), pwop->user_version);
3083 }
3084 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3085 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3086 osd->send_message_osd_client(reply, m->get_connection());
3087 pwop->sent_reply = true;
3088 pwop->ctx->op->mark_commit_sent();
3089 }
3090
3091 delete pwop->ctx;
3092 pwop->ctx = NULL;
3093 }
3094
3095 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop,
3096 vector<ceph_tid_t> *tids)
3097 {
3098 dout(10) << __func__ << " " << pwop->soid << dendl;
3099 pwop->canceled = true;
3100
3101 // cancel objecter op, if we can
3102 if (pwop->objecter_tid) {
3103 tids->push_back(pwop->objecter_tid);
3104 delete pwop->ctx;
3105 pwop->ctx = NULL;
3106 proxywrite_ops.erase(pwop->objecter_tid);
3107 pwop->objecter_tid = 0;
3108 }
3109 }
3110
3111 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3112 ObjectContextRef obc;
3113 PrimaryLogPG *pg;
3114 utime_t start;
3115 public:
3116 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3117 : obc(obc_),
3118 pg(pg_),
3119 start(ceph_clock_now()) {}
3120
3121 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3122 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3123 int r = results.get<0>();
3124 pg->finish_promote(r, results_data, obc);
3125 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3126 }
3127 };
3128
3129 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3130 const hobject_t& missing_oid,
3131 const object_locator_t& oloc,
3132 OpRequestRef op,
3133 ObjectContextRef *promote_obc)
3134 {
3135 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3136 assert(hoid != hobject_t());
3137 if (write_blocked_by_scrub(hoid)) {
3138 dout(10) << __func__ << " " << hoid
3139 << " blocked by scrub" << dendl;
3140 if (op) {
3141 waiting_for_scrub.push_back(op);
3142 op->mark_delayed("waiting for scrub");
3143 dout(10) << __func__ << " " << hoid
3144 << " placing op in waiting_for_scrub" << dendl;
3145 } else {
3146 dout(10) << __func__ << " " << hoid
3147 << " no op, dropping on the floor" << dendl;
3148 }
3149 return;
3150 }
3151 if (!obc) { // we need to create an ObjectContext
3152 assert(missing_oid != hobject_t());
3153 obc = get_object_context(missing_oid, true);
3154 }
3155 if (promote_obc)
3156 *promote_obc = obc;
3157
3158 /*
3159 * If there are in-flight proxy reads for this object, skip DONTNEED on
3160 * the promote's source read so the base tier keeps the data cached.
3161 */
3162 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3163 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3164 if (q == in_progress_proxy_ops.end()) {
3165 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3166 }
3167
3168 PromoteCallback *cb = new PromoteCallback(obc, this);
3169 object_locator_t my_oloc = oloc;
3170 my_oloc.pool = pool.info.tier_of;
3171
3172 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3173 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3174 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3175 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3176 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3177 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3178 src_fadvise_flags, 0);
3179
3180 assert(obc->is_blocked());
3181
3182 if (op)
3183 wait_for_blocked_object(obc->obs.oi.soid, op);
3184 info.stats.stats.sum.num_promote++;
3185 }
3186
3187 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3188 {
3189 FUNCTRACE();
3190 dout(10) << __func__ << " " << ctx << dendl;
3191 ctx->reset_obs(ctx->obc);
3192 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3193 OpRequestRef op = ctx->op;
3194 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3195 ObjectContextRef obc = ctx->obc;
3196 const hobject_t& soid = obc->obs.oi.soid;
3197
3198 // this method must be idempotent since we may call it several times
3199 // before we finally apply the resulting transaction.
3200 ctx->op_t.reset(new PGTransaction);
3201
3202 if (op->may_write() || op->may_cache()) {
3203 // snap
3204 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3205 pool.info.is_pool_snaps_mode()) {
3206 // use pool's snapc
3207 ctx->snapc = pool.snapc;
3208 } else {
3209 // client specified snapc
3210 ctx->snapc.seq = m->get_snap_seq();
3211 ctx->snapc.snaps = m->get_snaps();
3212 filter_snapc(ctx->snapc.snaps);
3213 }
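
// A client-side sketch of where the "client specified snapc" branch above
// gets its data: with self-managed snapshots, a librados user attaches a
// snap context (seq plus snap ids, newest first) to its writes; in pool-snaps
// mode the pool's snapc is used instead, as in the first branch. The exact
// librados calls are an assumption to verify against your headers.
#if 0
#include <rados/librados.hpp>
#include <vector>

int write_with_snap_context(librados::IoCtx& ioctx)
{
  librados::snap_t snapid = 0;
  int r = ioctx.selfmanaged_snap_create(&snapid);   // allocate a snap id
  if (r < 0)
    return r;
  std::vector<librados::snap_t> snaps = {snapid};   // newest first
  r = ioctx.selfmanaged_snap_set_write_ctx(snapid, snaps);
  if (r < 0)
    return r;
  librados::bufferlist bl;
  bl.append("data");
  return ioctx.write_full("example_object", bl);    // write carries the snapc
}
#endif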
3214 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3215 ctx->snapc.seq < obc->ssc->snapset.seq) {
3216 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3217 << " < snapset seq " << obc->ssc->snapset.seq
3218 << " on " << obc->obs.oi.soid << dendl;
3219 reply_ctx(ctx, -EOLDSNAPC);
3220 return;
3221 }
3222
3223 // version
3224 ctx->at_version = get_next_version();
3225 ctx->mtime = m->get_mtime();
3226
3227 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3228 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3229 << " snapc " << ctx->snapc
3230 << " snapset " << obc->ssc->snapset
3231 << dendl;
3232 } else {
3233 dout(10) << __func__ << " " << soid << " " << *ctx->ops
3234 << " ov " << obc->obs.oi.version
3235 << dendl;
3236 }
3237
3238 if (!ctx->user_at_version)
3239 ctx->user_at_version = obc->obs.oi.user_version;
3240 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3241
3242 if (op->may_read()) {
3243 dout(10) << " taking ondisk_read_lock" << dendl;
3244 obc->ondisk_read_lock();
3245 }
3246
3247 {
3248 #ifdef WITH_LTTNG
3249 osd_reqid_t reqid = ctx->op->get_reqid();
3250 #endif
3251 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3252 reqid.name._num, reqid.tid, reqid.inc);
3253 }
3254
3255 int result = prepare_transaction(ctx);
3256
3257 {
3258 #ifdef WITH_LTTNG
3259 osd_reqid_t reqid = ctx->op->get_reqid();
3260 #endif
3261 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3262 reqid.name._num, reqid.tid, reqid.inc);
3263 }
3264
3265 if (op->may_read()) {
3266 dout(10) << " dropping ondisk_read_lock" << dendl;
3267 obc->ondisk_read_unlock();
3268 }
3269
3270 bool pending_async_reads = !ctx->pending_async_reads.empty();
3271 if (result == -EINPROGRESS || pending_async_reads) {
3272 // come back later.
3273 if (pending_async_reads) {
3274 in_progress_async_reads.push_back(make_pair(op, ctx));
3275 ctx->start_async_reads(this);
3276 }
3277 return;
3278 }
3279
3280 if (result == -EAGAIN) {
3281 // clean up after the ctx
3282 close_op_ctx(ctx);
3283 return;
3284 }
3285
3286 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3287 // prepare the reply
3288 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3289 successful_write);
3290
3291 // Write operations aren't allowed to return a data payload because
3292 // we can't do so reliably. If the client has to resend the request
3293 // and it has already been applied, we will return 0 with no
3294 // payload. Non-deterministic behavior is no good. However, it is
3295 // possible to construct an operation that does a read, does a guard
3296 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3297 // with the write, or fail the CMPXATTR guard and return the read value.
3298 if (successful_write) {
3299 // write. normalize the result code.
3300 dout(20) << " zeroing write result code " << result << dendl;
3301 result = 0;
3302 }
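
// A client-side sketch of the guard-then-write combination the comment above
// allows: a CMPXATTR guard in the same compound op as the write. If the
// guard fails, the op returns the comparison error (typically -ECANCELED)
// rather than a write result. Method names follow librados.hpp; treat them
// as assumptions to verify.
#if 0
#include <rados/librados.hpp>

int guarded_write(librados::IoCtx& ioctx)
{
  librados::ObjectWriteOperation op;
  librados::bufferlist expected, data;
  expected.append("v1");
  data.append("new contents");
  op.cmpxattr("version", LIBRADOS_CMPXATTR_OP_EQ, expected);  // guard first
  op.write_full(data);                                        // then write
  return ioctx.operate("example_object", &op);
}
#endif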
3303 ctx->reply->set_result(result);
3304
3305 // read or error?
3306 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3307 // finish side-effects
3308 if (result >= 0)
3309 do_osd_op_effects(ctx, m->get_connection());
3310
3311 complete_read_ctx(result, ctx);
3312 return;
3313 }
3314
3315 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3316
3317 assert(op->may_write() || op->may_cache());
3318
3319 // trim log?
3320 calc_trim_to();
3321
3322 // verify that we are doing this in order?
3323 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3324 !pool.info.is_tier() && !pool.info.has_tiers()) {
3325 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3326 ceph_tid_t t = m->get_tid();
3327 client_t n = m->get_source().num();
3328 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3329 if (p == cm.end()) {
3330 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3331 cm[n] = t;
3332 } else {
3333 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3334 if (p->second > t) {
3335 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3336 assert(0 == "out of order op");
3337 }
3338 p->second = t;
3339 }
3340 }
3341
3342 if (ctx->update_log_only) {
3343 if (result >= 0)
3344 do_osd_op_effects(ctx, m->get_connection());
3345
3346 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3347 // save just what we need from ctx
3348 MOSDOpReply *reply = ctx->reply;
3349 ctx->reply = nullptr;
3350 reply->claim_op_out_data(*ctx->ops);
3351 reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3352 close_op_ctx(ctx);
3353
3354 if (result == -ENOENT) {
3355 reply->set_enoent_reply_versions(info.last_update,
3356 info.last_user_version);
3357 }
3358 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3359 // append to pg log for dup detection - don't save buffers for now
3360 record_write_error(op, soid, reply, result);
3361 return;
3362 }
3363
3364 // no need to capture a PG ref; repop cancel will handle that
3365 // we can capture ctx by pointer, since it's owned by the repop
3366 ctx->register_on_commit(
3367 [m, ctx, this](){
3368 if (ctx->op)
3369 log_op_stats(
3370 ctx);
3371
3372 if (m && !ctx->sent_reply) {
3373 MOSDOpReply *reply = ctx->reply;
3374 if (reply)
3375 ctx->reply = nullptr;
3376 else {
3377 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3378 reply->set_reply_versions(ctx->at_version,
3379 ctx->user_at_version);
3380 }
3381 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3382 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3383 osd->send_message_osd_client(reply, m->get_connection());
3384 ctx->sent_reply = true;
3385 ctx->op->mark_commit_sent();
3386 }
3387 });
3388 ctx->register_on_success(
3389 [ctx, this]() {
3390 do_osd_op_effects(
3391 ctx,
3392 ctx->op ? ctx->op->get_req()->get_connection() :
3393 ConnectionRef());
3394 });
3395 ctx->register_on_finish(
3396 [ctx, this]() {
3397 delete ctx;
3398 });
3399
3400 // issue replica writes
3401 ceph_tid_t rep_tid = osd->get_tid();
3402
3403 RepGather *repop = new_repop(ctx, obc, rep_tid);
3404
3405 issue_repop(repop, ctx);
3406 eval_repop(repop);
3407 repop->put();
3408 }
3409
3410 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3411 release_object_locks(ctx->lock_manager);
3412
3413 ctx->op_t.reset();
3414
3415 for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3416 ctx->on_finish.erase(p++)) {
3417 (*p)();
3418 }
3419 delete ctx;
3420 }
3421
3422 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3423 {
3424 if (ctx->op)
3425 osd->reply_op_error(ctx->op, r);
3426 close_op_ctx(ctx);
3427 }
3428
3429 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3430 {
3431 if (ctx->op)
3432 osd->reply_op_error(ctx->op, r, v, uv);
3433 close_op_ctx(ctx);
3434 }
3435
3436 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3437 {
3438 OpRequestRef op = ctx->op;
3439 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3440
3441 utime_t now = ceph_clock_now();
3442 utime_t latency = now;
3443 latency -= ctx->op->get_req()->get_recv_stamp();
3444 utime_t process_latency = now;
3445 process_latency -= ctx->op->get_dequeued_time();
3446
3447 uint64_t inb = ctx->bytes_written;
3448 uint64_t outb = ctx->bytes_read;
3449
3450 osd->logger->inc(l_osd_op);
3451
3452 osd->logger->inc(l_osd_op_outb, outb);
3453 osd->logger->inc(l_osd_op_inb, inb);
3454 osd->logger->tinc(l_osd_op_lat, latency);
3455 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3456
3457 if (op->may_read() && op->may_write()) {
3458 osd->logger->inc(l_osd_op_rw);
3459 osd->logger->inc(l_osd_op_rw_inb, inb);
3460 osd->logger->inc(l_osd_op_rw_outb, outb);
3461 osd->logger->tinc(l_osd_op_rw_lat, latency);
3462 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3463 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3464 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3465 } else if (op->may_read()) {
3466 osd->logger->inc(l_osd_op_r);
3467 osd->logger->inc(l_osd_op_r_outb, outb);
3468 osd->logger->tinc(l_osd_op_r_lat, latency);
3469 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3470 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3471 } else if (op->may_write() || op->may_cache()) {
3472 osd->logger->inc(l_osd_op_w);
3473 osd->logger->inc(l_osd_op_w_inb, inb);
3474 osd->logger->tinc(l_osd_op_w_lat, latency);
3475 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3476 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3477 } else
3478 ceph_abort();
3479
3480 dout(15) << "log_op_stats " << *m
3481 << " inb " << inb
3482 << " outb " << outb
3483 << " lat " << latency << dendl;
3484 }
3485
3486 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3487 {
3488 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3489 assert(have_same_or_newer_map(m->map_epoch));
3490 assert(m->get_type() == MSG_OSD_SUBOP);
3491 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3492
3493 if (!is_peered()) {
3494 waiting_for_peered.push_back(op);
3495 op->mark_delayed("waiting for active");
3496 return;
3497 }
3498
3499 const OSDOp *first = NULL;
3500 if (m->ops.size() >= 1) {
3501 first = &m->ops[0];
3502 }
3503
3504 if (first) {
3505 switch (first->op.op) {
3506 case CEPH_OSD_OP_DELETE:
3507 sub_op_remove(op);
3508 return;
3509 case CEPH_OSD_OP_SCRUB_RESERVE:
3510 handle_scrub_reserve_request(op);
3511 return;
3512 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3513 handle_scrub_reserve_release(op);
3514 return;
3515 case CEPH_OSD_OP_SCRUB_MAP:
3516 sub_op_scrub_map(op);
3517 return;
3518 }
3519 }
3520 }
3521
3522 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3523 {
3524 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3525 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3526 if (r->ops.size() >= 1) {
3527 const OSDOp& first = r->ops[0];
3528 switch (first.op.op) {
3529 case CEPH_OSD_OP_SCRUB_RESERVE:
3530 {
3531 pg_shard_t from = r->from;
3532 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3533 bool reserved;
3534 ::decode(reserved, p);
3535 if (reserved) {
3536 handle_scrub_reserve_grant(op, from);
3537 } else {
3538 handle_scrub_reserve_reject(op, from);
3539 }
3540 }
3541 return;
3542 }
3543 }
3544 }
3545
3546 void PrimaryLogPG::do_scan(
3547 OpRequestRef op,
3548 ThreadPool::TPHandle &handle)
3549 {
3550 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3551 assert(m->get_type() == MSG_OSD_PG_SCAN);
3552 dout(10) << "do_scan " << *m << dendl;
3553
3554 op->mark_started();
3555
3556 switch (m->op) {
3557 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3558 {
3559 ostringstream ss;
3560 if (osd->check_backfill_full(ss)) {
3561 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3562 queue_peering_event(
3563 CephPeeringEvtRef(
3564 std::make_shared<CephPeeringEvt>(
3565 get_osdmap()->get_epoch(),
3566 get_osdmap()->get_epoch(),
3567 BackfillTooFull())));
3568 return;
3569 }
3570
3571 BackfillInterval bi;
3572 bi.begin = m->begin;
3573 // No need to flush; there won't be any in-progress writes occurring
3574 // past m->begin
3575 scan_range(
3576 cct->_conf->osd_backfill_scan_min,
3577 cct->_conf->osd_backfill_scan_max,
3578 &bi,
3579 handle);
3580 MOSDPGScan *reply = new MOSDPGScan(
3581 MOSDPGScan::OP_SCAN_DIGEST,
3582 pg_whoami,
3583 get_osdmap()->get_epoch(), m->query_epoch,
3584 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3585 ::encode(bi.objects, reply->get_data());
3586 osd->send_message_osd_cluster(reply, m->get_connection());
3587 }
3588 break;
3589
3590 case MOSDPGScan::OP_SCAN_DIGEST:
3591 {
3592 pg_shard_t from = m->from;
3593
3594 // Check that from is in backfill_targets vector
3595 assert(is_backfill_targets(from));
3596
3597 BackfillInterval& bi = peer_backfill_info[from];
3598 bi.begin = m->begin;
3599 bi.end = m->end;
3600 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3601
3602 // take care to preserve ordering!
3603 bi.clear_objects();
3604 ::decode_noclear(bi.objects, p);
3605
3606 if (waiting_on_backfill.erase(from)) {
3607 if (waiting_on_backfill.empty()) {
3608 assert(peer_backfill_info.size() == backfill_targets.size());
3609 finish_recovery_op(hobject_t::get_max());
3610 }
3611 } else {
3612 // we canceled backfill for a while due to a too-full condition, and
3613 // this is an extra response from a peer that was not too full
3614 }
3615 }
3616 break;
3617 }
3618 }
3619
3620 void PrimaryLogPG::do_backfill(OpRequestRef op)
3621 {
3622 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3623 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3624 dout(10) << "do_backfill " << *m << dendl;
3625
3626 op->mark_started();
3627
3628 switch (m->op) {
3629 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3630 {
3631 assert(cct->_conf->osd_kill_backfill_at != 1);
3632
3633 MOSDPGBackfill *reply = new MOSDPGBackfill(
3634 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3635 get_osdmap()->get_epoch(),
3636 m->query_epoch,
3637 spg_t(info.pgid.pgid, get_primary().shard));
3638 reply->set_priority(get_recovery_op_priority());
3639 osd->send_message_osd_cluster(reply, m->get_connection());
3640 queue_peering_event(
3641 CephPeeringEvtRef(
3642 std::make_shared<CephPeeringEvt>(
3643 get_osdmap()->get_epoch(),
3644 get_osdmap()->get_epoch(),
3645 RecoveryDone())));
3646 }
3647 // fall-thru
3648
3649 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3650 {
3651 assert(cct->_conf->osd_kill_backfill_at != 2);
3652
3653 info.set_last_backfill(m->last_backfill);
3654 info.stats = m->stats;
3655
3656 ObjectStore::Transaction t;
3657 dirty_info = true;
3658 write_if_dirty(t);
3659 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3660 assert(tr == 0);
3661 }
3662 break;
3663
3664 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3665 {
3666 assert(is_primary());
3667 assert(cct->_conf->osd_kill_backfill_at != 3);
3668 finish_recovery_op(hobject_t::get_max());
3669 }
3670 break;
3671 }
3672 }
3673
3674 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3675 {
3676 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3677 op->get_req());
3678 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3679 dout(7) << __func__ << " " << m->ls << dendl;
3680
3681 op->mark_started();
3682
3683 ObjectStore::Transaction t;
3684 for (auto& p : m->ls) {
3685 remove_snap_mapped_object(t, p.first);
3686 }
3687 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3688 assert(r == 0);
3689 }
3690
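// (editor's note) A minimal sketch of how a caller might drive trim_object();
// illustrative only — the surrounding snap-trimmer plumbing and the
// simple_opc_submit() hand-off are assumed rather than shown:
//
//   PrimaryLogPG::OpContextUPtr ctx;
//   int r = trim_object(true, coid, &ctx);   // coid: the clone being trimmed
//   if (r == -ENOLCK) {
//     // locks unavailable; requeue and retry later
//   } else if (r == 0) {
//     simple_opc_submit(std::move(ctx));     // submit the prepared txn
//   }  // other negative values indicate inconsistencies needing repair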
3691 int PrimaryLogPG::trim_object(
3692 bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3693 {
3694 *ctxp = NULL;
3695 // load clone info
3696 bufferlist bl;
3697 ObjectContextRef obc = get_object_context(coid, false, NULL);
3698 if (!obc || !obc->ssc || !obc->ssc->exists) {
3699 osd->clog->error() << __func__ << ": Cannot trim " << coid
3700 << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3701 return -ENOENT;
3702 }
3703
3704 hobject_t snapoid(
3705 coid.oid, coid.get_key(),
3706 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3707 info.pgid.pool(), coid.get_namespace());
3708 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3709 if (!snapset_obc) {
3710 osd->clog->error() << __func__ << ": Cannot trim " << coid
3711 << " repair needed, no snapset obc for " << snapoid;
3712 return -ENOENT;
3713 }
3714
3715 SnapSet& snapset = obc->ssc->snapset;
3716
3717 bool legacy = snapset.is_legacy() ||
3718 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3719
3720 object_info_t &coi = obc->obs.oi;
3721 set<snapid_t> old_snaps;
3722 if (legacy) {
3723 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3724 } else {
3725 auto p = snapset.clone_snaps.find(coid.snap);
3726 if (p == snapset.clone_snaps.end()) {
3727 osd->clog->error() << "No clone_snaps in snapset " << snapset
3728 << " for object " << coid << "\n";
3729 return -ENOENT;
3730 }
3731 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3732 snapset.clone_snaps[coid.snap].end());
3733 }
3734 if (old_snaps.empty()) {
3735 osd->clog->error() << "No object info snaps for object " << coid;
3736 return -ENOENT;
3737 }
3738
3739 dout(10) << coid << " old_snaps " << old_snaps
3740 << " old snapset " << snapset << dendl;
3741 if (snapset.seq == 0) {
3742 osd->clog->error() << "No snapset.seq for object " << coid;
3743 return -ENOENT;
3744 }
3745
3746 set<snapid_t> new_snaps;
3747 for (set<snapid_t>::iterator i = old_snaps.begin();
3748 i != old_snaps.end();
3749 ++i) {
3750 if (!pool.info.is_removed_snap(*i))
3751 new_snaps.insert(*i);
3752 }
3753
3754 vector<snapid_t>::iterator p = snapset.clones.end();
3755
3756 if (new_snaps.empty()) {
3757 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3758 if (p == snapset.clones.end()) {
3759 osd->clog->error() << "Snap " << coid.snap << " not in clones";
3760 return -ENOENT;
3761 }
3762 }
3763
3764 OpContextUPtr ctx = simple_opc_create(obc);
3765 ctx->snapset_obc = snapset_obc;
3766
3767 if (!ctx->lock_manager.get_snaptrimmer_write(
3768 coid,
3769 obc,
3770 first)) {
3771 close_op_ctx(ctx.release());
3772 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3773 return -ENOLCK;
3774 }
3775
3776 if (!ctx->lock_manager.get_snaptrimmer_write(
3777 snapoid,
3778 snapset_obc,
3779 first)) {
3780 close_op_ctx(ctx.release());
3781 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3782 return -ENOLCK;
3783 }
3784
3785 ctx->at_version = get_next_version();
3786
3787 PGTransaction *t = ctx->op_t.get();
3788
3789 if (new_snaps.empty()) {
3790 // remove clone
3791 dout(10) << coid << " snaps " << old_snaps << " -> "
3792 << new_snaps << " ... deleting" << dendl;
3793
3794 // ...from snapset
3795 assert(p != snapset.clones.end());
3796
3797 snapid_t last = coid.snap;
3798 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3799
3800 if (p != snapset.clones.begin()) {
3801 // not the oldest... merge overlap into next older clone
3802 vector<snapid_t>::iterator n = p - 1;
3803 hobject_t prev_coid = coid;
3804 prev_coid.snap = *n;
3805 bool adjust_prev_bytes = is_present_clone(prev_coid);
3806
3807 if (adjust_prev_bytes)
3808 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3809
3810 snapset.clone_overlap[*n].intersection_of(
3811 snapset.clone_overlap[*p]);
3812
3813 if (adjust_prev_bytes)
3814 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3815 }
3816 ctx->delta_stats.num_objects--;
3817 if (coi.is_dirty())
3818 ctx->delta_stats.num_objects_dirty--;
3819 if (coi.is_omap())
3820 ctx->delta_stats.num_objects_omap--;
3821 if (coi.is_whiteout()) {
3822 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3823 ctx->delta_stats.num_whiteouts--;
3824 }
3825 ctx->delta_stats.num_object_clones--;
3826 if (coi.is_cache_pinned())
3827 ctx->delta_stats.num_objects_pinned--;
3828 obc->obs.exists = false;
3829
3830 snapset.clones.erase(p);
3831 snapset.clone_overlap.erase(last);
3832 snapset.clone_size.erase(last);
3833 snapset.clone_snaps.erase(last);
3834
3835 ctx->log.push_back(
3836 pg_log_entry_t(
3837 pg_log_entry_t::DELETE,
3838 coid,
3839 ctx->at_version,
3840 ctx->obs->oi.version,
3841 0,
3842 osd_reqid_t(),
3843 ctx->mtime,
3844 0)
3845 );
3846 t->remove(coid);
3847 t->update_snaps(
3848 coid,
3849 old_snaps,
3850 new_snaps);
3851
3852 coi = object_info_t(coid);
3853
3854 ctx->at_version.version++;
3855 } else {
3856 // save adjusted snaps for this object
3857 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3858 if (legacy) {
3859 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3860 } else {
3861 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3862 new_snaps.rend());
3863 // we still do a 'modify' event on this object just to trigger a
3864 // snapmapper.update ... :(
3865 }
3866
3867 coi.prior_version = coi.version;
3868 coi.version = ctx->at_version;
3869 bl.clear();
3870 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3871 t->setattr(coid, OI_ATTR, bl);
3872
3873 ctx->log.push_back(
3874 pg_log_entry_t(
3875 pg_log_entry_t::MODIFY,
3876 coid,
3877 coi.version,
3878 coi.prior_version,
3879 0,
3880 osd_reqid_t(),
3881 ctx->mtime,
3882 0)
3883 );
3884 ctx->at_version.version++;
3885
3886 t->update_snaps(
3887 coid,
3888 old_snaps,
3889 new_snaps);
3890 }
3891
3892 // save head snapset
3893 dout(10) << coid << " new snapset " << snapset << " on "
3894 << snapset_obc->obs.oi << dendl;
3895 if (snapset.clones.empty() &&
3896 (!snapset.head_exists ||
3897 (snapset_obc->obs.oi.is_whiteout() &&
3898 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3899 !snapset_obc->obs.oi.is_cache_pinned()))) {
3900 // NOTE: this arguably constitutes minor interference with the
3901 // tiering agent if this is a cache tier since a snap trim event
3902 // is effectively evicting a whiteout we might otherwise want to
3903 // keep around.
3904 dout(10) << coid << " removing " << snapoid << dendl;
3905 ctx->log.push_back(
3906 pg_log_entry_t(
3907 pg_log_entry_t::DELETE,
3908 snapoid,
3909 ctx->at_version,
3910 ctx->snapset_obc->obs.oi.version,
3911 0,
3912 osd_reqid_t(),
3913 ctx->mtime,
3914 0)
3915 );
3916 if (snapoid.is_head()) {
3917 derr << "removing snap head" << dendl;
3918 object_info_t& oi = ctx->snapset_obc->obs.oi;
3919 ctx->delta_stats.num_objects--;
3920 if (oi.is_dirty()) {
3921 ctx->delta_stats.num_objects_dirty--;
3922 }
3923 if (oi.is_omap())
3924 ctx->delta_stats.num_objects_omap--;
3925 if (oi.is_whiteout()) {
3926 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3927 ctx->delta_stats.num_whiteouts--;
3928 }
3929 if (oi.is_cache_pinned()) {
3930 ctx->delta_stats.num_objects_pinned--;
3931 }
3932 }
3933 ctx->snapset_obc->obs.exists = false;
3934 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3935 t->remove(snapoid);
3936 } else {
3937 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3938 snapset.filter(pool.info);
3939 dout(10) << coid << " writing updated snapset on " << snapoid
3940 << ", snapset is " << snapset << dendl;
3941 ctx->log.push_back(
3942 pg_log_entry_t(
3943 pg_log_entry_t::MODIFY,
3944 snapoid,
3945 ctx->at_version,
3946 ctx->snapset_obc->obs.oi.version,
3947 0,
3948 osd_reqid_t(),
3949 ctx->mtime,
3950 0)
3951 );
3952
3953 ctx->snapset_obc->obs.oi.prior_version =
3954 ctx->snapset_obc->obs.oi.version;
3955 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3956
3957 map <string, bufferlist> attrs;
3958 bl.clear();
3959 ::encode(snapset, bl);
3960 attrs[SS_ATTR].claim(bl);
3961
3962 bl.clear();
3963 ::encode(ctx->snapset_obc->obs.oi, bl,
3964 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3965 attrs[OI_ATTR].claim(bl);
3966 t->setattrs(snapoid, attrs);
3967 }
3968
3969 *ctxp = std::move(ctx);
3970 return 0;
3971 }
3972
3973 void PrimaryLogPG::kick_snap_trim()
3974 {
3975 assert(is_active());
3976 assert(is_primary());
3977 if (is_clean() && !snap_trimq.empty()) {
3978 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3979 snap_trimmer_machine.process_event(KickTrim());
3980 }
3981 }
3982
3983 void PrimaryLogPG::snap_trimmer_scrub_complete()
3984 {
3985 if (is_primary() && is_active() && is_clean()) {
3986 assert(!snap_trimq.empty());
3987 snap_trimmer_machine.process_event(ScrubComplete());
3988 }
3989 }
3990
3991 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3992 {
3993 if (deleting || pg_has_reset_since(queued)) {
3994 return;
3995 }
3996
3997 assert(is_primary());
3998
3999 dout(10) << "snap_trimmer posting" << dendl;
4000 snap_trimmer_machine.process_event(DoSnapWork());
4001 dout(10) << "snap_trimmer complete" << dendl;
4002 return;
4003 }
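// (editor's note) The snap-trim state machine above is driven by three
// events: KickTrim (the PG is clean and snap_trimq is non-empty),
// ScrubComplete (a scrub finished while trimming waited on it), and
// DoSnapWork (queued trim work is executed by snap_trimmer()).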
4004
4005 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
4006 {
4007 __u64 v2;
4008
4009 string v2s(xattr.c_str(), xattr.length());
4010 if (v2s.length())
4011 v2 = strtoull(v2s.c_str(), NULL, 10);
4012 else
4013 v2 = 0;
4014
4015 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
4016
4017 switch (op) {
4018 case CEPH_OSD_CMPXATTR_OP_EQ:
4019 return (v1 == v2);
4020 case CEPH_OSD_CMPXATTR_OP_NE:
4021 return (v1 != v2);
4022 case CEPH_OSD_CMPXATTR_OP_GT:
4023 return (v1 > v2);
4024 case CEPH_OSD_CMPXATTR_OP_GTE:
4025 return (v1 >= v2);
4026 case CEPH_OSD_CMPXATTR_OP_LT:
4027 return (v1 < v2);
4028 case CEPH_OSD_CMPXATTR_OP_LTE:
4029 return (v1 <= v2);
4030 default:
4031 return -EINVAL;
4032 }
4033 }
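// (editor's note) The stored xattr is parsed as a decimal string (an empty
// xattr compares as 0). For example, with an xattr containing "42" and
// v1 = 42:
//
//   do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_GTE, 42, xattr);  // returns 1
//   do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_GT,  42, xattr);  // returns 0
//
// An unrecognized op yields -EINVAL.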
4034
4035 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4036 {
4037 string v2s(xattr.c_str(), xattr.length());
4038
4039 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4040
4041 switch (op) {
4042 case CEPH_OSD_CMPXATTR_OP_EQ:
4043 return (v1s.compare(v2s) == 0);
4044 case CEPH_OSD_CMPXATTR_OP_NE:
4045 return (v1s.compare(v2s) != 0);
4046 case CEPH_OSD_CMPXATTR_OP_GT:
4047 return (v1s.compare(v2s) > 0);
4048 case CEPH_OSD_CMPXATTR_OP_GTE:
4049 return (v1s.compare(v2s) >= 0);
4050 case CEPH_OSD_CMPXATTR_OP_LT:
4051 return (v1s.compare(v2s) < 0);
4052 case CEPH_OSD_CMPXATTR_OP_LTE:
4053 return (v1s.compare(v2s) <= 0);
4054 default:
4055 return -EINVAL;
4056 }
4057 }
4058
4059 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4060 {
4061 ceph_osd_op& op = osd_op.op;
4062 vector<OSDOp> write_ops(1);
4063 OSDOp& write_op = write_ops[0];
4064 uint64_t write_length = op.writesame.length;
4065 int result = 0;
4066
4067 if (!write_length)
4068 return 0;
4069
4070 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4071 return -EINVAL;
4072
4073 if (op.writesame.data_length != osd_op.indata.length()) {
4074 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4075 return -EINVAL;
4076 }
4077
4078 while (write_length) {
4079 write_op.indata.append(osd_op.indata);
4080 write_length -= op.writesame.data_length;
4081 }
4082
4083 write_op.op.op = CEPH_OSD_OP_WRITE;
4084 write_op.op.extent.offset = op.writesame.offset;
4085 write_op.op.extent.length = op.writesame.length;
4086 result = do_osd_ops(ctx, write_ops);
4087 if (result < 0)
4088 derr << "do_writesame do_osd_ops failed " << result << dendl;
4089
4090 return result;
4091 }
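// (editor's note) do_writesame() expands the request into a single ordinary
// CEPH_OSD_OP_WRITE whose payload repeats the input pattern. With
// illustrative values offset = 0, length = 4096, data_length = 512 and a
// 512-byte indata buffer, the result is one 4096-byte write built from eight
// copies of the pattern. length must be a multiple of data_length and
// data_length must equal indata.length(), else -EINVAL; length == 0 is a
// no-op returning 0.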
4092
4093 // ========================================================================
4094 // low level osd ops
4095
4096 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4097 {
4098 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4099 bufferlist header, vals;
4100 int r = _get_tmap(ctx, &header, &vals);
4101 if (r < 0) {
4102 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4103 r = 0;
4104 return r;
4105 }
4106
4107 vector<OSDOp> ops(3);
4108
4109 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4110 ops[0].op.extent.offset = 0;
4111 ops[0].op.extent.length = 0;
4112
4113 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4114 ops[1].indata.claim(header);
4115
4116 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4117 ops[2].indata.claim(vals);
4118
4119 return do_osd_ops(ctx, ops);
4120 }
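// (editor's note) The conversion above rewrites a TMAP object in place as
// three ops on the same object: truncate to 0 (drop the old TMAP bytes),
// OMAPSETHEADER with the decoded header, and OMAPSETVALS with the decoded
// key/value map; -ENODATA is tolerated when CEPH_OSD_TMAP2OMAP_NULLOK is set.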
4121
4122 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4123 bufferlist& bl)
4124 {
4125 // decode
4126 bufferlist header;
4127 map<string, bufferlist> m;
4128 if (bl.length()) {
4129 bufferlist::iterator p = bl.begin();
4130 ::decode(header, p);
4131 ::decode(m, p);
4132 assert(p.end());
4133 }
4134
4135 // do the update(s)
4136 while (!bp.end()) {
4137 __u8 op;
4138 string key;
4139 ::decode(op, bp);
4140
4141 switch (op) {
4142 case CEPH_OSD_TMAP_SET: // insert key
4143 {
4144 ::decode(key, bp);
4145 bufferlist data;
4146 ::decode(data, bp);
4147 m[key] = data;
4148 }
4149 break;
4150 case CEPH_OSD_TMAP_RM: // remove key
4151 ::decode(key, bp);
4152 if (!m.count(key)) {
4153 return -ENOENT;
4154 }
4155 m.erase(key);
4156 break;
4157 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4158 ::decode(key, bp);
4159 m.erase(key);
4160 break;
4161 case CEPH_OSD_TMAP_HDR: // update header
4162 {
4163 ::decode(header, bp);
4164 }
4165 break;
4166 default:
4167 return -EINVAL;
4168 }
4169 }
4170
4171 // reencode
4172 bufferlist obl;
4173 ::encode(header, obl);
4174 ::encode(m, obl);
4175
4176 // write it out
4177 vector<OSDOp> nops(1);
4178 OSDOp& newop = nops[0];
4179 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4180 newop.op.extent.offset = 0;
4181 newop.op.extent.length = obl.length();
4182 newop.indata = obl;
4183 do_osd_ops(ctx, nops);
4184 osd_op.outdata.claim(newop.outdata);
4185 return 0;
4186 }
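// (editor's note) A hedged sketch of the TMAP update payload decoded by
// do_tmapup_slow()/do_tmapup(): a sequence of opcode-tagged records, e.g.
//
//   bufferlist bp;
//   ::encode((__u8)CEPH_OSD_TMAP_SET, bp);   // insert or overwrite a key
//   ::encode(string("mykey"), bp);           // key name (illustrative)
//   ::encode(value_bl, bp);                  // value bufferlist
//   ::encode((__u8)CEPH_OSD_TMAP_RM, bp);    // remove a key (no value)
//   ::encode(string("oldkey"), bp);
//
// The object itself stores the encoded header bufferlist followed by the
// map<string, bufferlist> of keys, as decoded at the top of do_tmapup_slow().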
4187
4188 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4189 {
4190 bufferlist::iterator orig_bp = bp;
4191 int result = 0;
4192 if (bp.end()) {
4193 dout(10) << "tmapup is a no-op" << dendl;
4194 } else {
4195 // read the whole object
4196 vector<OSDOp> nops(1);
4197 OSDOp& newop = nops[0];
4198 newop.op.op = CEPH_OSD_OP_READ;
4199 newop.op.extent.offset = 0;
4200 newop.op.extent.length = 0;
4201 result = do_osd_ops(ctx, nops);
4202
4203 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4204
4205 dout(30) << " starting is \n";
4206 newop.outdata.hexdump(*_dout);
4207 *_dout << dendl;
4208
4209 bufferlist::iterator ip = newop.outdata.begin();
4210 bufferlist obl;
4211
4212 dout(30) << "the update command is: \n";
4213 osd_op.indata.hexdump(*_dout);
4214 *_dout << dendl;
4215
4216 // header
4217 bufferlist header;
4218 __u32 nkeys = 0;
4219 if (newop.outdata.length()) {
4220 ::decode(header, ip);
4221 ::decode(nkeys, ip);
4222 }
4223 dout(10) << "tmapup header " << header.length() << dendl;
4224
4225 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4226 ++bp;
4227 ::decode(header, bp);
4228 dout(10) << "tmapup new header " << header.length() << dendl;
4229 }
4230
4231 ::encode(header, obl);
4232
4233 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4234
4235 // update keys
4236 bufferlist newkeydata;
4237 string nextkey, last_in_key;
4238 bufferlist nextval;
4239 bool have_next = false;
4240 if (!ip.end()) {
4241 have_next = true;
4242 ::decode(nextkey, ip);
4243 ::decode(nextval, ip);
4244 }
4245 while (!bp.end() && !result) {
4246 __u8 op;
4247 string key;
4248 try {
4249 ::decode(op, bp);
4250 ::decode(key, bp);
4251 }
4252 catch (buffer::error& e) {
4253 return -EINVAL;
4254 }
4255 if (key < last_in_key) {
4256 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4257 << "', falling back to an inefficient (unsorted) update" << dendl;
4258 bp = orig_bp;
4259 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4260 }
4261 last_in_key = key;
4262
4263 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4264
4265 // skip existing intervening keys
4266 bool key_exists = false;
4267 while (have_next && !key_exists) {
4268 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4269 if (nextkey > key)
4270 break;
4271 if (nextkey < key) {
4272 // copy untouched.
4273 ::encode(nextkey, newkeydata);
4274 ::encode(nextval, newkeydata);
4275 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4276 } else {
4277 // don't copy; discard the old value and stop.
4278 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4279 key_exists = true;
4280 nkeys--;
4281 }
4282 if (!ip.end()) {
4283 ::decode(nextkey, ip);
4284 ::decode(nextval, ip);
4285 } else {
4286 have_next = false;
4287 }
4288 }
4289
4290 if (op == CEPH_OSD_TMAP_SET) {
4291 bufferlist val;
4292 try {
4293 ::decode(val, bp);
4294 }
4295 catch (buffer::error& e) {
4296 return -EINVAL;
4297 }
4298 ::encode(key, newkeydata);
4299 ::encode(val, newkeydata);
4300 dout(20) << " set " << key << " " << val.length() << dendl;
4301 nkeys++;
4302 } else if (op == CEPH_OSD_TMAP_CREATE) {
4303 if (key_exists) {
4304 return -EEXIST;
4305 }
4306 bufferlist val;
4307 try {
4308 ::decode(val, bp);
4309 }
4310 catch (buffer::error& e) {
4311 return -EINVAL;
4312 }
4313 ::encode(key, newkeydata);
4314 ::encode(val, newkeydata);
4315 dout(20) << " create " << key << " " << val.length() << dendl;
4316 nkeys++;
4317 } else if (op == CEPH_OSD_TMAP_RM) {
4318 // do nothing.
4319 if (!key_exists) {
4320 return -ENOENT;
4321 }
4322 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4323 // do nothing
4324 } else {
4325 dout(10) << " invalid tmap op " << (int)op << dendl;
4326 return -EINVAL;
4327 }
4328 }
4329
4330 // copy remaining
4331 if (have_next) {
4332 ::encode(nextkey, newkeydata);
4333 ::encode(nextval, newkeydata);
4334 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4335 }
4336 if (!ip.end()) {
4337 bufferlist rest;
4338 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4339 dout(20) << " keep trailing " << rest.length()
4340 << " at " << newkeydata.length() << dendl;
4341 newkeydata.claim_append(rest);
4342 }
4343
4344 // encode final key count + key data
4345 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4346 ::encode(nkeys, obl);
4347 obl.claim_append(newkeydata);
4348
4349 if (0) {
4350 dout(30) << " final is \n";
4351 obl.hexdump(*_dout);
4352 *_dout << dendl;
4353
4354 // sanity check
4355 bufferlist::iterator tp = obl.begin();
4356 bufferlist h;
4357 ::decode(h, tp);
4358 map<string,bufferlist> d;
4359 ::decode(d, tp);
4360 assert(tp.end());
4361 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4362 }
4363
4364 // write it out
4365 if (!result) {
4366 dout(20) << "tmapput write " << obl.length() << dendl;
4367 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4368 newop.op.extent.offset = 0;
4369 newop.op.extent.length = obl.length();
4370 newop.indata = obl;
4371 do_osd_ops(ctx, nops);
4372 osd_op.outdata.claim(newop.outdata);
4373 }
4374 }
4375 return result;
4376 }
4377
4378 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4379 {
4380 if (offset >= max ||
4381 length > max ||
4382 offset + length > max)
4383 return -EFBIG;
4384
4385 return 0;
4386 }
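// (editor's note) Illustration, assuming max is the configured maximum
// object size (as at this function's typical call sites):
// check_offset_and_length(max - 1, 2, max) returns -EFBIG because the
// extent would end past the limit; fully in-range extents return 0.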
4387
4388 struct FillInVerifyExtent : public Context {
4389 ceph_le64 *r;
4390 int32_t *rval;
4391 bufferlist *outdatap;
4392 boost::optional<uint32_t> maybe_crc;
4393 uint64_t size;
4394 OSDService *osd;
4395 hobject_t soid;
4396 __le32 flags;
4397 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4398 boost::optional<uint32_t> mc, uint64_t size,
4399 OSDService *osd, hobject_t soid, __le32 flags) :
4400 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4401 size(size), osd(osd), soid(soid), flags(flags) {}
4402 void finish(int len) override {
4403 *r = len;
4404 if (len < 0) {
4405 *rval = len;
4406 return;
4407 }
4408 *rval = 0;
4409
4410 // whole object? can we verify the checksum?
4411 if (maybe_crc && *r == size) {
4412 uint32_t crc = outdatap->crc32c(-1);
4413 if (maybe_crc != crc) {
4414 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4415 << " != expected 0x" << *maybe_crc
4416 << std::dec << " on " << soid;
4417 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4418 *rval = -EIO;
4419 *r = 0;
4420 }
4421 }
4422 }
4423 }
4424 };
4425
4426 struct ToSparseReadResult : public Context {
4427 int* result;
4428 bufferlist* data_bl;
4429 uint64_t data_offset;
4430 ceph_le64* len;
4431 ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4432 ceph_le64* len)
4433 : result(result), data_bl(bl), data_offset(offset),len(len) {}
4434 void finish(int r) override {
4435 if (r < 0) {
4436 *result = r;
4437 return;
4438 }
4439 *result = 0;
4440 *len = r;
4441 bufferlist outdata;
4442 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4443 ::encode(extents, outdata);
4444 ::encode_destructively(*data_bl, outdata);
4445 data_bl->swap(outdata);
4446 }
4447 };
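// (editor's note) ToSparseReadResult repackages a plain read as a
// sparse-read reply: an extent map with a single entry followed by the data.
// A hypothetical client-side decode of the resulting buffer:
//
//   bufferlist::iterator p = osd_op.outdata.begin();
//   map<uint64_t, uint64_t> extents;
//   ::decode(extents, p);        // {{data_offset, bytes_read}}
//   bufferlist data;
//   ::decode(data, p);           // the payload itself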
4448
4449 template<typename V>
4450 static string list_keys(const map<string, V>& m) {
4451 string s;
4452 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4453 if (!s.empty()) {
4454 s.push_back(',');
4455 }
4456 s.append(itr->first);
4457 }
4458 return s;
4459 }
4460
4461 template<typename T>
4462 static string list_entries(const T& m) {
4463 string s;
4464 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4465 if (!s.empty()) {
4466 s.push_back(',');
4467 }
4468 s.append(*itr);
4469 }
4470 return s;
4471 }
4472
4473 void PrimaryLogPG::maybe_create_new_object(
4474 OpContext *ctx,
4475 bool ignore_transaction)
4476 {
4477 ObjectState& obs = ctx->new_obs;
4478 if (!obs.exists) {
4479 ctx->delta_stats.num_objects++;
4480 obs.exists = true;
4481 assert(!obs.oi.is_whiteout());
4482 obs.oi.new_object();
4483 if (!ignore_transaction)
4484 ctx->op_t->create(obs.oi.soid);
4485 } else if (obs.oi.is_whiteout()) {
4486 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4487 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4488 --ctx->delta_stats.num_whiteouts;
4489 }
4490 }
4491
4492 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4493 OSDOp& osd_op;
4494
4495 ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4496 }
4497
4498 int execute() override {
4499 return osd_op.rval;
4500 }
4501 };
4502
4503 struct C_ChecksumRead : public Context {
4504 PrimaryLogPG *primary_log_pg;
4505 OSDOp &osd_op;
4506 Checksummer::CSumType csum_type;
4507 bufferlist init_value_bl;
4508 ceph_le64 read_length;
4509 bufferlist read_bl;
4510 Context *fill_extent_ctx;
4511
4512 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4513 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4514 boost::optional<uint32_t> maybe_crc, uint64_t size,
4515 OSDService *osd, hobject_t soid, __le32 flags)
4516 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4517 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4518 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4519 &read_bl, maybe_crc, size,
4520 osd, soid, flags)) {
4521 }
4522 ~C_ChecksumRead() override {
4523 delete fill_extent_ctx;
4524 }
4525
4526 void finish(int r) override {
4527 fill_extent_ctx->complete(r);
4528 fill_extent_ctx = nullptr;
4529
4530 if (osd_op.rval >= 0) {
4531 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4532 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4533 &init_value_bl_it, read_bl);
4534 }
4535 }
4536 };
4537
4538 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4539 bufferlist::iterator *bl_it)
4540 {
4541 dout(20) << __func__ << dendl;
4542 bool skip_data_digest =
4543 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4544 g_conf->osd_distrust_data_digest;
4545
4546 auto& op = osd_op.op;
4547 if (op.checksum.chunk_size > 0) {
4548 if (op.checksum.length == 0) {
4549 dout(10) << __func__ << ": length required when chunk size provided"
4550 << dendl;
4551 return -EINVAL;
4552 }
4553 if (op.checksum.length % op.checksum.chunk_size != 0) {
4554 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4555 return -EINVAL;
4556 }
4557 }
4558
4559 auto& oi = ctx->new_obs.oi;
4560 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4561 // zeroed offset+length implies checksum whole object
4562 op.checksum.length = oi.size;
4563 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4564 return -EOVERFLOW;
4565 }
4566
4567 Checksummer::CSumType csum_type;
4568 switch (op.checksum.type) {
4569 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4570 csum_type = Checksummer::CSUM_XXHASH32;
4571 break;
4572 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4573 csum_type = Checksummer::CSUM_XXHASH64;
4574 break;
4575 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4576 csum_type = Checksummer::CSUM_CRC32C;
4577 break;
4578 default:
4579 dout(10) << __func__ << ": unknown crc type ("
4580 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4581 return -EINVAL;
4582 }
4583
4584 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4585 if (bl_it->get_remaining() < csum_init_value_size) {
4586 dout(10) << __func__ << ": init value not provided" << dendl;
4587 return -EINVAL;
4588 }
4589
4590 bufferlist init_value_bl;
4591 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4592 csum_init_value_size);
4593 bl_it->advance(csum_init_value_size);
4594
4595 if (pool.info.require_rollback() && op.checksum.length > 0) {
4596 // If there is a data digest and it is possible we are reading the
4597 // entire object, pass the digest.
4598 boost::optional<uint32_t> maybe_crc;
4599 if (!skip_data_digest &&
4600 oi.is_data_digest() && op.checksum.offset == 0 &&
4601 op.checksum.length >= oi.size) {
4602 maybe_crc = oi.data_digest;
4603 }
4604
4605 // async read
4606 auto& soid = oi.soid;
4607 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4608 std::move(init_value_bl), maybe_crc,
4609 oi.size, osd, soid, op.flags);
4610
4611 ctx->pending_async_reads.push_back({
4612 {op.checksum.offset, op.checksum.length, op.flags},
4613 {&checksum_ctx->read_bl, checksum_ctx}});
4614
4615 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4616 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4617 new ReadFinisher(osd_op));
4618 return -EINPROGRESS;
4619 }
4620
4621 // sync read
4622 std::vector<OSDOp> read_ops(1);
4623 auto& read_op = read_ops[0];
4624 if (op.checksum.length > 0) {
4625 read_op.op.op = CEPH_OSD_OP_READ;
4626 read_op.op.flags = op.flags;
4627 read_op.op.extent.offset = op.checksum.offset;
4628 read_op.op.extent.length = op.checksum.length;
4629 read_op.op.extent.truncate_size = 0;
4630 read_op.op.extent.truncate_seq = 0;
4631
4632 int r = do_osd_ops(ctx, read_ops);
4633 if (r < 0) {
4634 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4635 return r;
4636 }
4637 }
4638
4639 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4640 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4641 read_op.outdata);
4642 }
4643
4644 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4645 Checksummer::CSumType csum_type,
4646 bufferlist::iterator *init_value_bl_it,
4647 const bufferlist &read_bl) {
4648 dout(20) << __func__ << dendl;
4649
4650 auto& op = osd_op.op;
4651
4652 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4653 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4654 << op.checksum.length << dendl;
4655 return -EINVAL;
4656 }
4657
4658 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4659 op.checksum.chunk_size : read_bl.length());
4660 uint32_t csum_count = (csum_chunk_size > 0 ?
4661 read_bl.length() / csum_chunk_size : 0);
4662
4663 bufferlist csum;
4664 bufferptr csum_data;
4665 if (csum_count > 0) {
4666 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4667 csum_data = buffer::create(csum_value_size * csum_count);
4668 csum_data.zero();
4669 csum.append(csum_data);
4670
4671 switch (csum_type) {
4672 case Checksummer::CSUM_XXHASH32:
4673 {
4674 Checksummer::xxhash32::init_value_t init_value;
4675 ::decode(init_value, *init_value_bl_it);
4676 Checksummer::calculate<Checksummer::xxhash32>(
4677 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4678 &csum_data);
4679 }
4680 break;
4681 case Checksummer::CSUM_XXHASH64:
4682 {
4683 Checksummer::xxhash64::init_value_t init_value;
4684 ::decode(init_value, *init_value_bl_it);
4685 Checksummer::calculate<Checksummer::xxhash64>(
4686 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4687 &csum_data);
4688 }
4689 break;
4690 case Checksummer::CSUM_CRC32C:
4691 {
4692 Checksummer::crc32c::init_value_t init_value;
4693 ::decode(init_value, *init_value_bl_it);
4694 Checksummer::calculate<Checksummer::crc32c>(
4695 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4696 &csum_data);
4697 }
4698 break;
4699 default:
4700 break;
4701 }
4702 }
4703
4704 ::encode(csum_count, osd_op.outdata);
4705 osd_op.outdata.claim_append(csum);
4706 return 0;
4707 }
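// (editor's note) The reply format produced above: a __u32 csum_count
// followed by csum_count packed checksum values, each
// Checksummer::get_csum_value_size(csum_type) bytes wide. For example, a
// CRC32C request over a 16 KiB extent with chunk_size = 4 KiB yields
// csum_count = 4 and four 4-byte values; with chunk_size = 0 the whole
// extent is treated as one chunk.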
4708
4709 struct C_ExtentCmpRead : public Context {
4710 PrimaryLogPG *primary_log_pg;
4711 OSDOp &osd_op;
4712 ceph_le64 read_length;
4713 bufferlist read_bl;
4714 Context *fill_extent_ctx;
4715
4716 C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4717 boost::optional<uint32_t> maybe_crc, uint64_t size,
4718 OSDService *osd, hobject_t soid, __le32 flags)
4719 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4720 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4721 &read_bl, maybe_crc, size,
4722 osd, soid, flags)) {
4723 }
4724 ~C_ExtentCmpRead() override {
4725 delete fill_extent_ctx;
4726 }
4727
4728 void finish(int r) override {
4729 if (r == -ENOENT) {
4730 osd_op.rval = 0;
4731 read_bl.clear();
4732 delete fill_extent_ctx;
4733 } else {
4734 fill_extent_ctx->complete(r);
4735 }
4736 fill_extent_ctx = nullptr;
4737
4738 if (osd_op.rval >= 0) {
4739 osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4740 }
4741 }
4742 };
4743
4744 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4745 {
4746 dout(20) << __func__ << dendl;
4747 ceph_osd_op& op = osd_op.op;
4748 bool skip_data_digest =
4749 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4750 g_conf->osd_distrust_data_digest;
4751
4752 auto& oi = ctx->new_obs.oi;
4753 uint64_t size = oi.size;
4754 if ((oi.truncate_seq < op.extent.truncate_seq) &&
4755 (op.extent.offset + op.extent.length > op.extent.truncate_size)) {
4756 size = op.extent.truncate_size;
4757 }
4758
4759 if (op.extent.offset >= size) {
4760 op.extent.length = 0;
4761 } else if (op.extent.offset + op.extent.length > size) {
4762 op.extent.length = size - op.extent.offset;
4763 }
4764
4765 if (op.extent.length == 0) {
4766 dout(20) << __func__ << " zero length extent" << dendl;
4767 return finish_extent_cmp(osd_op, bufferlist{});
4768 } else if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4769 dout(20) << __func__ << " object DNE" << dendl;
4770 return finish_extent_cmp(osd_op, {});
4771 } else if (pool.info.require_rollback()) {
4772 // If there is a data digest and it is possible we are reading the
4773 // entire object, pass the digest.
4774 boost::optional<uint32_t> maybe_crc;
4775 if (!skip_data_digest &&
4776 oi.is_data_digest() && op.checksum.offset == 0 &&
4777 op.checksum.length >= oi.size) {
4778 maybe_crc = oi.data_digest;
4779 }
4780
4781 // async read
4782 auto& soid = oi.soid;
4783 auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4784 osd, soid, op.flags);
4785 ctx->pending_async_reads.push_back({
4786 {op.extent.offset, op.extent.length, op.flags},
4787 {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4788
4789 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4790
4791 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4792 new ReadFinisher(osd_op));
4793 return -EINPROGRESS;
4794 }
4795
4796 // sync read
4797 vector<OSDOp> read_ops(1);
4798 OSDOp& read_op = read_ops[0];
4799
4800 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4801 read_op.op.extent.offset = op.extent.offset;
4802 read_op.op.extent.length = op.extent.length;
4803 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4804 read_op.op.extent.truncate_size = op.extent.truncate_size;
4805
4806 int result = do_osd_ops(ctx, read_ops);
4807 if (result < 0) {
4808 derr << __func__ << " failed " << result << dendl;
4809 return result;
4810 }
4811 return finish_extent_cmp(osd_op, read_op.outdata);
4812 }
4813
4814 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4815 {
4816 for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4817 char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4818 if (osd_op.indata[idx] != read_byte) {
4819 return (-MAX_ERRNO - idx);
4820 }
4821 }
4822
4823 return 0;
4824 }
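// (editor's note) On mismatch, finish_extent_cmp() encodes the offset of the
// first differing byte into the error value as -MAX_ERRNO - idx; a caller
// could recover it as (illustrative only, where rval is the op's return
// value):
//
//   uint64_t mismatch_off = static_cast<uint64_t>(-rval - MAX_ERRNO);
//
// Bytes beyond the end of the read buffer compare against zero.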
4825
4826 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4827 dout(20) << __func__ << dendl;
4828 auto& op = osd_op.op;
4829 auto& oi = ctx->new_obs.oi;
4830 auto& soid = oi.soid;
4831 __u32 seq = oi.truncate_seq;
4832 uint64_t size = oi.size;
4833 bool trimmed_read = false;
4834 bool skip_data_digest =
4835 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4836 g_conf->osd_distrust_data_digest;
4837
4838 // are we beyond truncate_size?
4839 if ( (seq < op.extent.truncate_seq) &&
4840 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4841 size = op.extent.truncate_size;
4842
4843 if (op.extent.length == 0) // length of zero means read the whole object
4844 op.extent.length = size;
4845
4846 if (op.extent.offset >= size) {
4847 op.extent.length = 0;
4848 trimmed_read = true;
4849 } else if (op.extent.offset + op.extent.length > size) {
4850 op.extent.length = size - op.extent.offset;
4851 trimmed_read = true;
4852 }
4853
4854 // read into a buffer
4855 int result = 0;
4856 if (trimmed_read && op.extent.length == 0) {
4857 // the read was trimmed to zero, so nothing should be done here; note
4858 // that a read of 0 bytes is *not* a no-op (it means "read the whole
4859 // object"), which is why the trimmed_read flag is needed
4860 } else if (pool.info.require_rollback()) {
4861 boost::optional<uint32_t> maybe_crc;
4862 // If there is a data digest and it is possible we are reading the
4863 // entire object, pass the digest. FillInVerifyExtent will
4864 // check the oi.size again.
4865 if (!skip_data_digest &&
4866 oi.is_data_digest() && op.extent.offset == 0 &&
4867 op.extent.length >= oi.size)
4868 maybe_crc = oi.data_digest;
4869 ctx->pending_async_reads.push_back(
4870 make_pair(
4871 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4872 make_pair(&osd_op.outdata,
4873 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4874 &osd_op.outdata, maybe_crc, oi.size,
4875 osd, soid, op.flags))));
4876 dout(10) << " async_read noted for " << soid << dendl;
4877
4878 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4879 new ReadFinisher(osd_op));
4880 } else {
4881 int r = pgbackend->objects_read_sync(
4882 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4883 // whole object? can we verify the checksum?
4884 if (!skip_data_digest && r >= 0 && op.extent.offset == 0 &&
4885 (uint64_t)r == oi.size && oi.is_data_digest()) {
4886 uint32_t crc = osd_op.outdata.crc32c(-1);
4887 if (oi.data_digest != crc) {
4888 osd->clog->error() << info.pgid << std::hex
4889 << " full-object read crc 0x" << crc
4890 << " != expected 0x" << oi.data_digest
4891 << std::dec << " on " << soid;
4892 r = -EIO; // try repair later
4893 }
4894 }
4895 if (r == -EIO) {
4896 r = rep_repair_primary_object(soid, ctx->op);
4897 }
4898 if (r >= 0)
4899 op.extent.length = r;
4900 else {
4901 result = r;
4902 op.extent.length = 0;
4903 }
4904 dout(10) << " read got " << r << " / " << op.extent.length
4905 << " bytes from obj " << soid << dendl;
4906 }
4907
4908 // XXX for an async read, op.extent.length is the requested length;
4909 // on error it is changed to 0 after the error comes back.
4910 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4911 ctx->delta_stats.num_rd++;
4912 return result;
4913 }
4914
4915 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4916 dout(20) << __func__ << dendl;
4917 auto& op = osd_op.op;
4918 auto& oi = ctx->new_obs.oi;
4919 auto& soid = oi.soid;
4920 bool skip_data_digest =
4921 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
4922 g_conf->osd_distrust_data_digest;
4923
4924 if (op.extent.truncate_seq) {
4925 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4926 return -EINVAL;
4927 }
4928
4929 ++ctx->num_read;
4930 if (pool.info.ec_pool()) {
4931 // translate sparse read to a normal one if not supported
4932 uint64_t offset = op.extent.offset;
4933 uint64_t length = op.extent.length;
4934 if (offset > oi.size) {
4935 length = 0;
4936 } else if (offset + length > oi.size) {
4937 length = oi.size - offset;
4938 }
4939
4940 if (length > 0) {
4941 ctx->pending_async_reads.push_back(
4942 make_pair(
4943 boost::make_tuple(offset, length, op.flags),
4944 make_pair(
4945 &osd_op.outdata,
4946 new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4947 &op.extent.length))));
4948 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4949
4950 ctx->op_finishers[ctx->current_osd_subop_num].reset(
4951 new ReadFinisher(osd_op));
4952 } else {
4953 dout(10) << " sparse read ended up empty for " << soid << dendl;
4954 map<uint64_t, uint64_t> extents;
4955 ::encode(extents, osd_op.outdata);
4956 }
4957 } else {
4958 // read into a buffer
4959 map<uint64_t, uint64_t> m;
4960 uint32_t total_read = 0;
4961 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4962 info.pgid.shard),
4963 op.extent.offset, op.extent.length, m);
4964 if (r < 0) {
4965 return r;
4966 }
4967
4968 map<uint64_t, uint64_t>::iterator miter;
4969 bufferlist data_bl;
4970 uint64_t last = op.extent.offset;
4971 for (miter = m.begin(); miter != m.end(); ++miter) {
4972 // verify hole?
4973 if (cct->_conf->osd_verify_sparse_read_holes &&
4974 last < miter->first) {
4975 bufferlist t;
4976 uint64_t len = miter->first - last;
4977 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4978 if (r < 0) {
4979 osd->clog->error() << coll << " " << soid
4980 << " sparse-read failed to read: "
4981 << r;
4982 } else if (!t.is_zero()) {
4983 osd->clog->error() << coll << " " << soid
4984 << " sparse-read found data in hole "
4985 << last << "~" << len;
4986 }
4987 }
4988
4989 bufferlist tmpbl;
4990 r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4991 op.flags, &tmpbl);
4992 if (r == -EIO) {
4993 r = rep_repair_primary_object(soid, ctx->op);
4994 }
4995 if (r < 0) {
4996 return r;
4997 }
4998
4999 // this usually happens when we get an extent that exceeds the actual
5000 // file size
5001 if (r < (int)miter->second)
5002 miter->second = r;
5003 total_read += r;
5004 dout(10) << "sparse-read " << miter->first << "@" << miter->second
5005 << dendl;
5006 data_bl.claim_append(tmpbl);
5007 last = miter->first + r;
5008 }
5009
5010 if (r < 0) {
5011 return r;
5012 }
5013
5014 // verify trailing hole?
5015 if (cct->_conf->osd_verify_sparse_read_holes) {
5016 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
5017 if (last < end) {
5018 bufferlist t;
5019 uint64_t len = end - last;
5020 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
5021 if (r < 0) {
5022 osd->clog->error() << coll << " " << soid
5023 << " sparse-read failed to read: " << r;
5024 } else if (!t.is_zero()) {
5025 osd->clog->error() << coll << " " << soid
5026 << " sparse-read found data in hole "
5027 << last << "~" << len;
5028 }
5029 }
5030 }
5031
5032 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read,
5033 // and while few objects may be fully written at first, with continued use
5034 // more and more whole objects exist, so verifying the full-object digest
5035 // on sparse reads makes sense.
5036 if (!skip_data_digest &&
5037 total_read == oi.size && oi.is_data_digest()) {
5038 uint32_t crc = data_bl.crc32c(-1);
5039 if (oi.data_digest != crc) {
5040 osd->clog->error() << info.pgid << std::hex
5041 << " full-object read crc 0x" << crc
5042 << " != expected 0x" << oi.data_digest
5043 << std::dec << " on " << soid;
5044 r = rep_repair_primary_object(soid, ctx->op);
5045 if (r < 0) {
5046 return r;
5047 }
5048 }
5049 }
5050
5051 op.extent.length = total_read;
5052
5053 ::encode(m, osd_op.outdata); // re-encode since it might be modified
5054 ::encode_destructively(data_bl, osd_op.outdata);
5055
5056 dout(10) << " sparse_read got " << total_read << " bytes from object "
5057 << soid << dendl;
5058 }
5059
5060 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5061 ctx->delta_stats.num_rd++;
5062 return 0;
5063 }
5064
5065 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5066 {
5067 int result = 0;
5068 SnapSetContext *ssc = ctx->obc->ssc;
5069 ObjectState& obs = ctx->new_obs;
5070 object_info_t& oi = obs.oi;
5071 const hobject_t& soid = oi.soid;
5072 bool skip_data_digest =
5073 osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest;
5074
5075 PGTransaction* t = ctx->op_t.get();
5076
5077 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5078
5079 ctx->current_osd_subop_num = 0;
5080 for (auto p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++, ctx->processed_subop_count++) {
5081 OSDOp& osd_op = *p;
5082 ceph_osd_op& op = osd_op.op;
5083
5084 OpFinisher* op_finisher = nullptr;
5085 {
5086 auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5087 if (op_finisher_it != ctx->op_finishers.end()) {
5088 op_finisher = op_finisher_it->second.get();
5089 }
5090 }
5091
5092 // TODO: check endianness (__le32 vs uint32_t, etc.)
5093 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5094 // but the code in this function seems to treat them as native-endian. What should the
5095 // tracepoints do?
5096 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5097
5098 dout(10) << "do_osd_op " << osd_op << dendl;
5099
5100 bufferlist::iterator bp = osd_op.indata.begin();
5101
5102 // user-visible modification?
5103 switch (op.op) {
5104 // non user-visible modifications
5105 case CEPH_OSD_OP_WATCH:
5106 case CEPH_OSD_OP_CACHE_EVICT:
5107 case CEPH_OSD_OP_CACHE_FLUSH:
5108 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5109 case CEPH_OSD_OP_UNDIRTY:
5110 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
5111 case CEPH_OSD_OP_CACHE_PIN:
5112 case CEPH_OSD_OP_CACHE_UNPIN:
5113 case CEPH_OSD_OP_SET_REDIRECT:
5114 break;
5115 default:
5116 if (op.op & CEPH_OSD_OP_MODE_WR)
5117 ctx->user_modify = true;
5118 }
5119
5120 // munge -1 truncate to 0 truncate
5121 if (ceph_osd_op_uses_extent(op.op) &&
5122 op.extent.truncate_seq == 1 &&
5123 op.extent.truncate_size == (-1ULL)) {
5124 op.extent.truncate_size = 0;
5125 op.extent.truncate_seq = 0;
5126 }
5127
5128 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
5129 if (op.op == CEPH_OSD_OP_ZERO &&
5130 obs.exists &&
5131 op.extent.offset < cct->_conf->osd_max_object_size &&
5132 op.extent.length >= 1 &&
5133 op.extent.length <= cct->_conf->osd_max_object_size &&
5134 op.extent.offset + op.extent.length >= oi.size) {
5135 if (op.extent.offset >= oi.size) {
5136 // no-op
5137 goto fail;
5138 }
5139 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5140 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5141 op.op = CEPH_OSD_OP_TRUNCATE;
5142 }
5143
5144 switch (op.op) {
5145
5146 // --- READS ---
5147
5148 case CEPH_OSD_OP_CMPEXT:
5149 ++ctx->num_read;
5150 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5151 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5152 op.extent.length, op.extent.truncate_size,
5153 op.extent.truncate_seq);
5154
5155 if (op_finisher == nullptr) {
5156 result = do_extent_cmp(ctx, osd_op);
5157 } else {
5158 result = op_finisher->execute();
5159 }
5160 break;
5161
5162 case CEPH_OSD_OP_SYNC_READ:
5163 if (pool.info.require_rollback()) {
5164 result = -EOPNOTSUPP;
5165 break;
5166 }
5167 // fall through
5168 case CEPH_OSD_OP_READ:
5169 ++ctx->num_read;
5170 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5171 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5172 op.extent.length, op.extent.truncate_size,
5173 op.extent.truncate_seq);
5174 if (op_finisher == nullptr) {
5175 if (!ctx->data_off) {
5176 ctx->data_off = op.extent.offset;
5177 }
5178 result = do_read(ctx, osd_op);
5179 } else {
5180 result = op_finisher->execute();
5181 }
5182 break;
5183
5184 case CEPH_OSD_OP_CHECKSUM:
5185 ++ctx->num_read;
5186 {
5187 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5188 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5189 op.checksum.offset, op.checksum.length,
5190 op.checksum.chunk_size);
5191
5192 if (op_finisher == nullptr) {
5193 result = do_checksum(ctx, osd_op, &bp);
5194 } else {
5195 result = op_finisher->execute();
5196 }
5197 }
5198 break;
5199
5200 /* map extents */
5201 case CEPH_OSD_OP_MAPEXT:
5202 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5203 if (pool.info.require_rollback()) {
5204 result = -EOPNOTSUPP;
5205 break;
5206 }
5207 ++ctx->num_read;
5208 {
5209 // read into a buffer
5210 bufferlist bl;
5211 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5212 info.pgid.shard),
5213 op.extent.offset, op.extent.length, bl);
5214 osd_op.outdata.claim(bl);
5215 if (r < 0)
5216 result = r;
5217 else
5218 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5219 ctx->delta_stats.num_rd++;
5220 dout(10) << " map_extents done on object " << soid << dendl;
5221 }
5222 break;
5223
5224 /* map extents */
5225 case CEPH_OSD_OP_SPARSE_READ:
5226 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5227 soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5228 op.extent.length, op.extent.truncate_size,
5229 op.extent.truncate_seq);
5230 if (op_finisher == nullptr) {
5231 result = do_sparse_read(ctx, osd_op);
5232 } else {
5233 result = op_finisher->execute();
5234 }
5235 break;
5236
5237 case CEPH_OSD_OP_CALL:
5238 {
5239 string cname, mname;
5240 bufferlist indata;
5241 try {
5242 bp.copy(op.cls.class_len, cname);
5243 bp.copy(op.cls.method_len, mname);
5244 bp.copy(op.cls.indata_len, indata);
5245 } catch (buffer::error& e) {
5246 dout(10) << "call unable to decode class + method + indata" << dendl;
5247 dout(30) << "in dump: ";
5248 osd_op.indata.hexdump(*_dout);
5249 *_dout << dendl;
5250 result = -EINVAL;
5251 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5252 break;
5253 }
5254 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5255
5256 ClassHandler::ClassData *cls;
5257 result = osd->class_handler->open_class(cname, &cls);
5258 assert(result == 0); // init_op_flags() already verified this works.
5259
5260 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5261 if (!method) {
5262 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5263 result = -EOPNOTSUPP;
5264 break;
5265 }
5266
5267 int flags = method->get_flags();
5268 if (flags & CLS_METHOD_WR)
5269 ctx->user_modify = true;
5270
5271 bufferlist outdata;
5272 dout(10) << "call method " << cname << "." << mname << dendl;
5273 int prev_rd = ctx->num_read;
5274 int prev_wr = ctx->num_write;
5275 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5276
5277 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5278 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5279 result = -EIO;
5280 break;
5281 }
5282 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5283 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5284 result = -EIO;
5285 break;
5286 }
5287
5288 dout(10) << "method called response length=" << outdata.length() << dendl;
5289 op.extent.length = outdata.length();
5290 osd_op.outdata.claim_append(outdata);
5291 dout(30) << "out dump: ";
5292 osd_op.outdata.hexdump(*_dout);
5293 *_dout << dendl;
5294 }
5295 break;
5296
5297 case CEPH_OSD_OP_STAT:
5298 // note: stat does not require RD
5299 {
5300 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5301
5302 if (obs.exists && !oi.is_whiteout()) {
5303 ::encode(oi.size, osd_op.outdata);
5304 ::encode(oi.mtime, osd_op.outdata);
5305 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5306 } else {
5307 result = -ENOENT;
5308 dout(10) << "stat oi object does not exist" << dendl;
5309 }
5310
5311 ctx->delta_stats.num_rd++;
5312 }
5313 break;
5314
5315 case CEPH_OSD_OP_ISDIRTY:
5316 ++ctx->num_read;
5317 {
5318 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5319 bool is_dirty = obs.oi.is_dirty();
5320 ::encode(is_dirty, osd_op.outdata);
5321 ctx->delta_stats.num_rd++;
5322 result = 0;
5323 }
5324 break;
5325
5326 case CEPH_OSD_OP_UNDIRTY:
5327 ++ctx->num_write;
5328 {
5329 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5330 if (oi.is_dirty()) {
5331 ctx->undirty = true; // see make_writeable()
5332 ctx->modify = true;
5333 ctx->delta_stats.num_wr++;
5334 }
5335 result = 0;
5336 }
5337 break;
5338
5339 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5340 ++ctx->num_write;
5341 {
5342 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5343 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5344 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5345 result = -EINVAL;
5346 break;
5347 }
5348 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5349 result = -EINVAL;
5350 break;
5351 }
5352 if (!obs.exists) {
5353 result = 0;
5354 break;
5355 }
5356 if (oi.is_cache_pinned()) {
5357 dout(10) << "cache-try-flush on a pinned object, consider unpinning this object first" << dendl;
5358 result = -EPERM;
5359 break;
5360 }
5361 if (oi.is_dirty()) {
5362 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5363 if (result == -EINPROGRESS)
5364 result = -EAGAIN;
5365 } else {
5366 result = 0;
5367 }
5368 }
5369 break;
5370
5371 case CEPH_OSD_OP_CACHE_FLUSH:
5372 ++ctx->num_write;
5373 {
5374 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5375 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5376 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5377 result = -EINVAL;
5378 break;
5379 }
5380 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5381 result = -EINVAL;
5382 break;
5383 }
5384 if (!obs.exists) {
5385 result = 0;
5386 break;
5387 }
5388 if (oi.is_cache_pinned()) {
5389 dout(10) << "cache-flush on a pinned object, consider unpinning this object first" << dendl;
5390 result = -EPERM;
5391 break;
5392 }
5393 hobject_t missing;
5394 if (oi.is_dirty()) {
5395 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5396 if (result == -EINPROGRESS)
5397 result = -EAGAIN;
5398 } else {
5399 result = 0;
5400 }
5401 // Check for the special return value indicating that 'missing' was set
5402 if (result == -ENOENT) {
5403 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5404 assert(!missing.is_min());
5405 wait_for_unreadable_object(missing, ctx->op);
5406 // Error code which is used elsewhere when wait_for_unreadable_object() is used
5407 result = -EAGAIN;
5408 }
5409 }
5410 break;
5411
5412 case CEPH_OSD_OP_CACHE_EVICT:
5413 ++ctx->num_write;
5414 {
5415 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5416 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5417 result = -EINVAL;
5418 break;
5419 }
5420 if (!obs.exists) {
5421 result = 0;
5422 break;
5423 }
5424 if (oi.is_cache_pinned()) {
5425 dout(10) << "cache-evict on a pinned object, consider unpinning this object first" << dendl;
5426 result = -EPERM;
5427 break;
5428 }
5429 if (oi.is_dirty()) {
5430 result = -EBUSY;
5431 break;
5432 }
5433 if (!oi.watchers.empty()) {
5434 result = -EBUSY;
5435 break;
5436 }
5437 if (soid.snap == CEPH_NOSNAP) {
5438 result = _verify_no_head_clones(soid, ssc->snapset);
5439 if (result < 0)
5440 break;
5441 }
5442 result = _delete_oid(ctx, true, false);
5443 if (result >= 0) {
5444 // mark that this is a cache eviction to avoid triggering normal
5445 // make_writeable() clone or snapdir object creation in finish_ctx()
5446 ctx->cache_evict = true;
5447 }
5448 osd->logger->inc(l_osd_tier_evict);
5449 }
5450 break;
5451
5452 case CEPH_OSD_OP_GETXATTR:
5453 ++ctx->num_read;
5454 {
5455 string aname;
5456 bp.copy(op.xattr.name_len, aname);
5457 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5458 string name = "_" + aname;
5459 int r = getattr_maybe_cache(
5460 ctx->obc,
5461 name,
5462 &(osd_op.outdata));
5463 if (r >= 0) {
5464 op.xattr.value_len = osd_op.outdata.length();
5465 result = 0;
5466 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5467 } else
5468 result = r;
5469
5470 ctx->delta_stats.num_rd++;
5471 }
5472 break;
5473
5474 case CEPH_OSD_OP_GETXATTRS:
5475 ++ctx->num_read;
5476 {
5477 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5478 map<string, bufferlist> out;
5479 result = getattrs_maybe_cache(
5480 ctx->obc,
5481 &out);
5482
5483 bufferlist bl;
5484 ::encode(out, bl);
5485 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5486 ctx->delta_stats.num_rd++;
5487 osd_op.outdata.claim_append(bl);
5488 }
5489 break;
5490
5491 case CEPH_OSD_OP_CMPXATTR:
5492 ++ctx->num_read;
5493 {
5494 string aname;
5495 bp.copy(op.xattr.name_len, aname);
5496 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5497 string name = "_" + aname;
5498 name[op.xattr.name_len + 1] = 0;
5499
5500 bufferlist xattr;
5501 result = getattr_maybe_cache(
5502 ctx->obc,
5503 name,
5504 &xattr);
5505 if (result < 0 && result != -EEXIST && result != -ENODATA)
5506 break;
5507
5508 ctx->delta_stats.num_rd++;
5509 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5510
5511 switch (op.xattr.cmp_mode) {
5512 case CEPH_OSD_CMPXATTR_MODE_STRING:
5513 {
5514 string val;
5515 bp.copy(op.xattr.value_len, val);
5516 val[op.xattr.value_len] = 0;
5517 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5518 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5519 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5520 }
5521 break;
5522
5523 case CEPH_OSD_CMPXATTR_MODE_U64:
5524 {
5525 uint64_t u64val;
5526 try {
5527 ::decode(u64val, bp);
5528 }
5529 catch (buffer::error& e) {
5530 result = -EINVAL;
5531 goto fail;
5532 }
5533 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5534 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5535 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5536 }
5537 break;
5538
5539 default:
5540 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5541 result = -EINVAL;
5542 }
5543
5544 if (!result) {
5545 dout(10) << "comparison returned false" << dendl;
5546 result = -ECANCELED;
5547 break;
5548 }
5549 if (result < 0) {
5550 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5551 break;
5552 }
5553
5554 dout(10) << "comparison returned true" << dendl;
5555 }
5556 break;
5557
5558 case CEPH_OSD_OP_ASSERT_VER:
5559 ++ctx->num_read;
5560 {
5561 uint64_t ver = op.assert_ver.ver;
5562 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5563 if (!ver)
5564 result = -EINVAL;
5565 else if (ver < oi.user_version)
5566 result = -ERANGE;
5567 else if (ver > oi.user_version)
5568 result = -EOVERFLOW;
5569 }
5570 break;
5571
5572 case CEPH_OSD_OP_LIST_WATCHERS:
5573 ++ctx->num_read;
5574 {
5575 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5576 obj_list_watch_response_t resp;
5577
5578 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5579 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5580 ++oi_iter) {
5581 dout(20) << "key cookie=" << oi_iter->first.first
5582 << " entity=" << oi_iter->first.second << " "
5583 << oi_iter->second << dendl;
5584 assert(oi_iter->first.first == oi_iter->second.cookie);
5585 assert(oi_iter->first.second.is_client());
5586
5587 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5588 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5589 resp.entries.push_back(wi);
5590 }
5591
5592 resp.encode(osd_op.outdata, ctx->get_features());
5593 result = 0;
5594
5595 ctx->delta_stats.num_rd++;
5596 break;
5597 }
5598
5599 case CEPH_OSD_OP_LIST_SNAPS:
5600 ++ctx->num_read;
5601 {
5602 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5603 obj_list_snap_response_t resp;
5604
5605 if (!ssc) {
5606 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5607 }
5608 assert(ssc);
5609
5610 int clonecount = ssc->snapset.clones.size();
5611 if (ssc->snapset.head_exists)
5612 clonecount++;
5613 resp.clones.reserve(clonecount);
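// Build one clone_info entry per clone (oldest first), each carrying its
// snap ids, its data overlap with the next newer version, and its size;
// the head, if present, is appended last with cloneid CEPH_NOSNAP.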
5614 for (auto clone_iter = ssc->snapset.clones.begin();
5615 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5616 clone_info ci;
5617 ci.cloneid = *clone_iter;
5618
5619 hobject_t clone_oid = soid;
5620 clone_oid.snap = *clone_iter;
5621
5622 if (!ssc->snapset.is_legacy()) {
5623 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5624 if (p == ssc->snapset.clone_snaps.end()) {
5625 osd->clog->error() << "osd." << osd->whoami
5626 << ": inconsistent clone_snaps found for oid "
5627 << soid << " clone " << *clone_iter
5628 << " snapset " << ssc->snapset;
5629 result = -EINVAL;
5630 break;
5631 }
5632 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5633 ci.snaps.push_back(*q);
5634 }
5635 } else {
5636 /* No need to take a lock here. We are only inspecting state cached
5637 * in the ObjectContext, so we aren't performing an actual read unless
5638 * the clone obc is not already loaded (in which case, it cannot have
5639 * an in-progress write). We also do not risk exposing uncommitted
5640 * state since we do have a read lock on the head object or snapdir,
5641 * which we would have to write lock in order to make user-visible
5642 * modifications to the snapshot state (snap trim related mutations
5643 * are not user visible).
5644 */
5645 if (is_missing_object(clone_oid)) {
5646 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5647 wait_for_unreadable_object(clone_oid, ctx->op);
5648 result = -EAGAIN;
5649 break;
5650 }
5651
5652 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5653 if (!clone_obc) {
5654 if (maybe_handle_cache(
5655 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5656 // promoting the clone
5657 result = -EAGAIN;
5658 } else {
5659 osd->clog->error() << "osd." << osd->whoami
5660 << ": missing clone " << clone_oid
5661 << " for oid "
5662 << soid;
5663 // should not happen
5664 result = -ENOENT;
5665 }
5666 break;
5667 }
5668 for (vector<snapid_t>::reverse_iterator p =
5669 clone_obc->obs.oi.legacy_snaps.rbegin();
5670 p != clone_obc->obs.oi.legacy_snaps.rend();
5671 ++p) {
5672 ci.snaps.push_back(*p);
5673 }
5674 }
5675
5676 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5677
5678 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5679 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5680 if (coi == ssc->snapset.clone_overlap.end()) {
5681 osd->clog->error() << "osd." << osd->whoami
5682 << ": inconsistent clone_overlap found for oid "
5683 << soid << " clone " << *clone_iter;
5684 result = -EINVAL;
5685 break;
5686 }
5687 const interval_set<uint64_t> &o = coi->second;
5688 ci.overlap.reserve(o.num_intervals());
5689 for (interval_set<uint64_t>::const_iterator r = o.begin();
5690 r != o.end(); ++r) {
5691 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5692 r.get_len()));
5693 }
5694
5695 map<snapid_t, uint64_t>::const_iterator si;
5696 si = ssc->snapset.clone_size.find(ci.cloneid);
5697 if (si == ssc->snapset.clone_size.end()) {
5698 osd->clog->error() << "osd." << osd->whoami
5699 << ": inconsistent clone_size found for oid "
5700 << soid << " clone " << *clone_iter;
5701 result = -EINVAL;
5702 break;
5703 }
5704 ci.size = si->second;
5705
5706 resp.clones.push_back(ci);
5707 }
5708 if (result < 0) {
5709 break;
5710 }
5711 if (ssc->snapset.head_exists &&
5712 !ctx->obc->obs.oi.is_whiteout()) {
5713 assert(obs.exists);
5714 clone_info ci;
5715 ci.cloneid = CEPH_NOSNAP;
5716
5717 // Size for HEAD is oi.size
5718 ci.size = oi.size;
5719
5720 resp.clones.push_back(ci);
5721 }
5722 resp.seq = ssc->snapset.seq;
5723
5724 resp.encode(osd_op.outdata);
5725 result = 0;
5726
5727 ctx->delta_stats.num_rd++;
5728 break;
5729 }
5730
5731 case CEPH_OSD_OP_NOTIFY:
5732 ++ctx->num_read;
5733 {
5734 uint32_t timeout;
5735 bufferlist bl;
5736
5737 try {
5738 uint32_t ver; // obsolete
5739 ::decode(ver, bp);
5740 ::decode(timeout, bp);
5741 ::decode(bl, bp);
5742 } catch (const buffer::error &e) {
5743 timeout = 0;
5744 }
5745 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5746 if (!timeout)
5747 timeout = cct->_conf->osd_default_notify_timeout;
5748
5749 notify_info_t n;
5750 n.timeout = timeout;
5751 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5752 n.cookie = op.watch.cookie;
5753 n.bl = bl;
5754 ctx->notifies.push_back(n);
5755
5756 // return our unique notify id to the client
5757 ::encode(n.notify_id, osd_op.outdata);
5758 }
5759 break;
5760
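// NOTIFY_ACK is the companion to NOTIFY: the client echoes the notify_id
// (and optionally the watch cookie and a reply payload) so the OSD can
// complete the in-flight notify. Older clients encode nothing in the
// payload and pass the notify_id in op.watch.cookie instead.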
5761 case CEPH_OSD_OP_NOTIFY_ACK:
5762 ++ctx->num_read;
5763 {
5764 try {
5765 uint64_t notify_id = 0;
5766 uint64_t watch_cookie = 0;
5767 ::decode(notify_id, bp);
5768 ::decode(watch_cookie, bp);
5769 bufferlist reply_bl;
5770 if (!bp.end()) {
5771 ::decode(reply_bl, bp);
5772 }
5773 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5774 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5775 ctx->notify_acks.push_back(ack);
5776 } catch (const buffer::error &e) {
5777 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5778 OpContext::NotifyAck ack(
5779 // op.watch.cookie is actually the notify_id for historical reasons
5780 op.watch.cookie
5781 );
5782 ctx->notify_acks.push_back(ack);
5783 }
5784 }
5785 break;
5786
5787 case CEPH_OSD_OP_SETALLOCHINT:
5788 ++ctx->num_write;
5789 {
5790 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5791 maybe_create_new_object(ctx);
5792 oi.expected_object_size = op.alloc_hint.expected_object_size;
5793 oi.expected_write_size = op.alloc_hint.expected_write_size;
5794 oi.alloc_hint_flags = op.alloc_hint.flags;
5795 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5796 op.alloc_hint.expected_write_size,
5797 op.alloc_hint.flags);
5798 ctx->delta_stats.num_wr++;
5799 result = 0;
5800 }
5801 break;
5802
5803
5804 // --- WRITES ---
5805
5806 // -- object data --
5807
5808 case CEPH_OSD_OP_WRITE:
5809 ++ctx->num_write;
5810 { // write
5811 __u32 seq = oi.truncate_seq;
5812 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5813 if (op.extent.length != osd_op.indata.length()) {
5814 result = -EINVAL;
5815 break;
5816 }
5817
5818 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5819 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5820
5821 if (pool.info.requires_aligned_append() &&
5822 (op.extent.offset % pool.info.required_alignment() != 0)) {
5823 result = -EOPNOTSUPP;
5824 break;
5825 }
5826
5827 if (!obs.exists) {
5828 if (pool.info.requires_aligned_append() && op.extent.offset) {
5829 result = -EOPNOTSUPP;
5830 break;
5831 }
5832 } else if (op.extent.offset != oi.size &&
5833 pool.info.requires_aligned_append()) {
5834 result = -EOPNOTSUPP;
5835 break;
5836 }
5837
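// Reconcile the write with any racing truncate using truncate_seq.
// A worked example (illustrative): if the object was truncated with
// seq 5 and a write stamped with seq 4 arrives afterwards, the write is
// clipped so it cannot resurrect bytes past the truncated size;
// conversely, a write stamped with seq 6 applies its truncate first and
// then writes.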
5838 if (seq && (seq > op.extent.truncate_seq) &&
5839 (op.extent.offset + op.extent.length > oi.size)) {
5840 // old write, arrived after trimtrunc
5841 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5842 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5843 << ", adjusting write length to " << op.extent.length << dendl;
5844 bufferlist t;
5845 t.substr_of(osd_op.indata, 0, op.extent.length);
5846 osd_op.indata.swap(t);
5847 }
5848 if (op.extent.truncate_seq > seq) {
5849 // write arrives before trimtrunc
5850 if (obs.exists && !oi.is_whiteout()) {
5851 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5852 << ", truncating to " << op.extent.truncate_size << dendl;
5853 t->truncate(soid, op.extent.truncate_size);
5854 oi.truncate_seq = op.extent.truncate_seq;
5855 oi.truncate_size = op.extent.truncate_size;
5856 if (op.extent.truncate_size != oi.size) {
5857 ctx->delta_stats.num_bytes -= oi.size;
5858 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5859 oi.size = op.extent.truncate_size;
5860 }
5861 } else {
5862 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5863 << ", but object is new" << dendl;
5864 oi.truncate_seq = op.extent.truncate_seq;
5865 oi.truncate_size = op.extent.truncate_size;
5866 }
5867 }
5868 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5869 if (result < 0)
5870 break;
5871
5872 maybe_create_new_object(ctx);
5873
5874 if (op.extent.length == 0) {
5875 if (op.extent.offset > oi.size) {
5876 t->truncate(
5877 soid, op.extent.offset);
5878 } else {
5879 t->nop(soid);
5880 }
5881 } else {
5882 t->write(
5883 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5884 }
5885
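// Maintain the whole-object data digest where we can do so cheaply: a
// full overwrite establishes a fresh crc32c, an append to a
// digest-bearing object extends the running crc, and any other partial
// write invalidates the digest.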
5886 if (op.extent.offset == 0 && op.extent.length >= oi.size
5887 && !skip_data_digest) {
5888 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5889 } else if (op.extent.offset == oi.size && obs.oi.is_data_digest()) {
5890 if (skip_data_digest) {
5891 obs.oi.clear_data_digest();
5892 } else {
5893 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5894 }
5895 } else {
5896 obs.oi.clear_data_digest();
5897 }
5898 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5899 op.extent.offset, op.extent.length);
5900
5901 }
5902 break;
5903
5904 case CEPH_OSD_OP_WRITEFULL:
5905 ++ctx->num_write;
5906 { // write full object
5907 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5908
5909 if (op.extent.length != osd_op.indata.length()) {
5910 result = -EINVAL;
5911 break;
5912 }
5913 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5914 if (result < 0)
5915 break;
5916
5917 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5918 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5919
5920 maybe_create_new_object(ctx);
5921 if (pool.info.require_rollback()) {
5922 t->truncate(soid, 0);
5923 } else if (obs.exists && op.extent.length < oi.size) {
5924 t->truncate(soid, op.extent.length);
5925 }
5926 if (op.extent.length) {
5927 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5928 }
5929 if (!skip_data_digest) {
5930 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5931 } else {
5932 obs.oi.clear_data_digest();
5933 }
5934
5935 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5936 0, op.extent.length, true);
5937 }
5938 break;
5939
5940 case CEPH_OSD_OP_WRITESAME:
5941 ++ctx->num_write;
5942 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5943 result = do_writesame(ctx, osd_op);
5944 break;
5945
5946 case CEPH_OSD_OP_ROLLBACK :
5947 ++ctx->num_write;
5948 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5949 result = _rollback_to(ctx, op);
5950 break;
5951
5952 case CEPH_OSD_OP_ZERO:
5953 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5954 if (pool.info.requires_aligned_append()) {
5955 result = -EOPNOTSUPP;
5956 break;
5957 }
5958 ++ctx->num_write;
5959 { // zero
5960 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5961 if (result < 0)
5962 break;
5963 assert(op.extent.length);
5964 if (obs.exists && !oi.is_whiteout()) {
5965 t->zero(soid, op.extent.offset, op.extent.length);
5966 interval_set<uint64_t> ch;
5967 ch.insert(op.extent.offset, op.extent.length);
5968 ctx->modified_ranges.union_of(ch);
5969 ctx->delta_stats.num_wr++;
5970 oi.clear_data_digest();
5971 } else {
5972 // no-op
5973 }
5974 }
5975 break;
5976 case CEPH_OSD_OP_CREATE:
5977 ++ctx->num_write;
5978 {
5979 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5980 int flags = le32_to_cpu(op.flags);
5981 if (obs.exists && !oi.is_whiteout() &&
5982 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5983 result = -EEXIST; /* this is an exclusive create */
5984 } else {
5985 if (osd_op.indata.length()) {
5986 bufferlist::iterator p = osd_op.indata.begin();
5987 string category;
5988 try {
5989 ::decode(category, p);
5990 }
5991 catch (buffer::error& e) {
5992 result = -EINVAL;
5993 goto fail;
5994 }
5995 // category is no longer implemented.
5996 }
5997 if (result >= 0) {
5998 maybe_create_new_object(ctx);
5999 t->nop(soid);
6000 }
6001 }
6002 }
6003 break;
6004
6005 case CEPH_OSD_OP_TRIMTRUNC:
6006 op.extent.offset = op.extent.truncate_size;
6007 // fall through
6008
6009 case CEPH_OSD_OP_TRUNCATE:
6010 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6011 if (pool.info.requires_aligned_append()) {
6012 result = -EOPNOTSUPP;
6013 break;
6014 }
6015 ++ctx->num_write;
6016 {
6017 // truncate
6018 if (!obs.exists || oi.is_whiteout()) {
6019 dout(10) << " object dne, truncate is a no-op" << dendl;
6020 break;
6021 }
6022
6023 if (op.extent.offset > cct->_conf->osd_max_object_size) {
6024 result = -EFBIG;
6025 break;
6026 }
6027
6028 if (op.extent.truncate_seq) {
6029 assert(op.extent.offset == op.extent.truncate_size);
6030 if (op.extent.truncate_seq <= oi.truncate_seq) {
6031 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
6032 << ", no-op" << dendl;
6033 break; // old
6034 }
6035 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
6036 << ", truncating" << dendl;
6037 oi.truncate_seq = op.extent.truncate_seq;
6038 oi.truncate_size = op.extent.truncate_size;
6039 }
6040
6041 maybe_create_new_object(ctx);
6042 t->truncate(soid, op.extent.offset);
6043 if (oi.size > op.extent.offset) {
6044 interval_set<uint64_t> trim;
6045 trim.insert(op.extent.offset, oi.size-op.extent.offset);
6046 ctx->modified_ranges.union_of(trim);
6047 }
6048 if (op.extent.offset != oi.size) {
6049 ctx->delta_stats.num_bytes -= oi.size;
6050 ctx->delta_stats.num_bytes += op.extent.offset;
6051 oi.size = op.extent.offset;
6052 }
6053 ctx->delta_stats.num_wr++;
6054 // do not set exists, or we will break the above DELETE -> TRUNCATE munging.
6055
6056 oi.clear_data_digest();
6057 }
6058 break;
6059
6060 case CEPH_OSD_OP_DELETE:
6061 ++ctx->num_write;
6062 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
6063 {
6064 result = _delete_oid(ctx, false, ctx->ignore_cache);
6065 }
6066 break;
6067
6068 case CEPH_OSD_OP_WATCH:
6069 ++ctx->num_write;
6070 {
6071 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6072 op.watch.cookie, op.watch.op);
6073 if (!obs.exists) {
6074 result = -ENOENT;
6075 break;
6076 }
6077 uint64_t cookie = op.watch.cookie;
6078 entity_name_t entity = ctx->reqid.name;
6079 ObjectContextRef obc = ctx->obc;
6080
6081 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6082 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6083 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6084 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6085 dout(10) << "watch: peer_addr="
6086 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6087
6088 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6089 if (op.watch.timeout != 0) {
6090 timeout = op.watch.timeout;
6091 }
6092
6093 watch_info_t w(cookie, timeout,
6094 ctx->op->get_req()->get_connection()->get_peer_addr());
6095 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6096 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6097 if (oi.watchers.count(make_pair(cookie, entity))) {
6098 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6099 } else {
6100 dout(10) << " registered new watch " << w << " by " << entity << dendl;
6101 oi.watchers[make_pair(cookie, entity)] = w;
6102 t->nop(soid); // make sure we update the object_info on disk!
6103 }
6104 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6105 ctx->watch_connects.push_back(make_pair(w, will_ping));
6106 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6107 if (!oi.watchers.count(make_pair(cookie, entity))) {
6108 result = -ENOTCONN;
6109 break;
6110 }
6111 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6112 ctx->watch_connects.push_back(make_pair(w, true));
6113 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6114 /* Note: WATCH with PING doesn't cause may_write() to return true,
6115 * so if there is nothing else in the transaction, this is going
6116 * to run do_osd_op_effects, but not write out a log entry */
6117 if (!oi.watchers.count(make_pair(cookie, entity))) {
6118 result = -ENOTCONN;
6119 break;
6120 }
6121 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6122 obc->watchers.find(make_pair(cookie, entity));
6123 if (p == obc->watchers.end() ||
6124 !p->second->is_connected()) {
6125 // client needs to reconnect
6126 result = -ETIMEDOUT;
6127 break;
6128 }
6129 dout(10) << " found existing watch " << w << " by " << entity << dendl;
6130 p->second->got_ping(ceph_clock_now());
6131 result = 0;
6132 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6133 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6134 oi.watchers.find(make_pair(cookie, entity));
6135 if (oi_iter != oi.watchers.end()) {
6136 dout(10) << " removed watch " << oi_iter->second << " by "
6137 << entity << dendl;
6138 oi.watchers.erase(oi_iter);
6139 t->nop(soid); // update oi on disk
6140 ctx->watch_disconnects.push_back(
6141 watch_disconnect_t(cookie, entity, false));
6142 } else {
6143 dout(10) << " can't remove: no watch by " << entity << dendl;
6144 }
6145 }
6146 }
6147 break;
6148
6149 case CEPH_OSD_OP_CACHE_PIN:
6150 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6151 if ((!pool.info.is_tier() ||
6152 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6153 result = -EINVAL;
6154 dout(10) << " pin object is only allowed on the cache tier " << dendl;
6155 break;
6156 }
6157 ++ctx->num_write;
6158 {
6159 if (!obs.exists || oi.is_whiteout()) {
6160 result = -ENOENT;
6161 break;
6162 }
6163
6164 if (!oi.is_cache_pinned()) {
6165 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6166 ctx->modify = true;
6167 ctx->delta_stats.num_objects_pinned++;
6168 ctx->delta_stats.num_wr++;
6169 }
6170 result = 0;
6171 }
6172 break;
6173
6174 case CEPH_OSD_OP_CACHE_UNPIN:
6175 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6176 if ((!pool.info.is_tier() ||
6177 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6178 result = -EINVAL;
6179 dout(10) << " unpin object is only allowed on the cache tier " << dendl;
6180 break;
6181 }
6182 ++ctx->num_write;
6183 {
6184 if (!obs.exists || oi.is_whiteout()) {
6185 result = -ENOENT;
6186 break;
6187 }
6188
6189 if (oi.is_cache_pinned()) {
6190 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6191 ctx->modify = true;
6192 ctx->delta_stats.num_objects_pinned--;
6193 ctx->delta_stats.num_wr++;
6194 }
6195 result = 0;
6196 }
6197 break;
6198
6199 case CEPH_OSD_OP_SET_REDIRECT:
6200 ++ctx->num_write;
6201 {
6202 if (pool.info.is_tier()) {
6203 result = -EINVAL;
6204 break;
6205 }
6206 if (!obs.exists) {
6207 result = -ENOENT;
6208 break;
6209 }
6210 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6211 result = -EOPNOTSUPP;
6212 break;
6213 }
6214
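// SET_REDIRECT converts this object into a zero-length manifest stub:
// the target (decoded below) is recorded in the object_info manifest,
// and the local data, omap, and xattrs are discarded.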
6215 object_t target_name;
6216 object_locator_t target_oloc;
6217 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6218 version_t target_version = op.copy_from.src_version;
6219 try {
6220 ::decode(target_name, bp);
6221 ::decode(target_oloc, bp);
6222 }
6223 catch (buffer::error& e) {
6224 result = -EINVAL;
6225 goto fail;
6226 }
6227 pg_t raw_pg;
6228 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6229 hobject_t target(target_name, target_oloc.key, target_snapid,
6230 raw_pg.ps(), raw_pg.pool(),
6231 target_oloc.nspace);
6232 if (target == soid) {
6233 dout(20) << " set-redirect self is invalid" << dendl;
6234 result = -EINVAL;
6235 break;
6236 }
6237 oi.set_flag(object_info_t::FLAG_MANIFEST);
6238 oi.manifest.redirect_target = target;
6239 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6240 t->truncate(soid, 0);
6241 if (oi.is_omap() && pool.info.supports_omap()) {
6242 t->omap_clear(soid);
6243 obs.oi.clear_omap_digest();
6244 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6245 }
6246 ctx->delta_stats.num_bytes -= oi.size;
6247 oi.size = 0;
6248 oi.new_object();
6249 oi.user_version = target_version;
6250 ctx->user_at_version = target_version;
6251 /* rm_attrs */
6252 map<string,bufferlist> rmattrs;
6253 result = getattrs_maybe_cache(ctx->obc,
6254 &rmattrs);
6255 if (result < 0) {
6256 return result;
6257 }
6258 map<string, bufferlist>::iterator iter;
6259 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6260 const string& name = iter->first;
6261 t->rmattr(soid, name);
6262 }
6263 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6264 }
6265
6266 break;
6267
6268 // -- object attrs --
6269
6270 case CEPH_OSD_OP_SETXATTR:
6271 ++ctx->num_write;
6272 {
6273 if (cct->_conf->osd_max_attr_size > 0 &&
6274 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6275 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6276 result = -EFBIG;
6277 break;
6278 }
6279 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6280 cct->_conf->osd_max_attr_name_len);
6281 if (op.xattr.name_len > max_name_len) {
6282 result = -ENAMETOOLONG;
6283 break;
6284 }
6285 maybe_create_new_object(ctx);
6286 string aname;
6287 bp.copy(op.xattr.name_len, aname);
6288 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6289 string name = "_" + aname;
6290 bufferlist bl;
6291 bp.copy(op.xattr.value_len, bl);
6292 t->setattr(soid, name, bl);
6293 ctx->delta_stats.num_wr++;
6294 }
6295 break;
6296
6297 case CEPH_OSD_OP_RMXATTR:
6298 ++ctx->num_write;
6299 {
6300 string aname;
6301 bp.copy(op.xattr.name_len, aname);
6302 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6303 if (!obs.exists || oi.is_whiteout()) {
6304 result = -ENOENT;
6305 break;
6306 }
6307 string name = "_" + aname;
6308 t->rmattr(soid, name);
6309 ctx->delta_stats.num_wr++;
6310 }
6311 break;
6312
6313
6314 // -- fancy writers --
6315 case CEPH_OSD_OP_APPEND:
6316 {
6317 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6318 // just do it inline; this works because we are happy to execute
6319 // fancy op on replicas as well.
6320 vector<OSDOp> nops(1);
6321 OSDOp& newop = nops[0];
6322 newop.op.op = CEPH_OSD_OP_WRITE;
6323 newop.op.extent.offset = oi.size;
6324 newop.op.extent.length = op.extent.length;
6325 newop.op.extent.truncate_seq = oi.truncate_seq;
6326 newop.indata = osd_op.indata;
6327 result = do_osd_ops(ctx, nops);
6328 osd_op.outdata.claim(newop.outdata);
6329 }
6330 break;
6331
6332 case CEPH_OSD_OP_STARTSYNC:
6333 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6334 t->nop(soid);
6335 break;
6336
6337
6338 // -- trivial map --
6339 case CEPH_OSD_OP_TMAPGET:
6340 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6341 if (pool.info.require_rollback()) {
6342 result = -EOPNOTSUPP;
6343 break;
6344 }
6345 {
6346 vector<OSDOp> nops(1);
6347 OSDOp& newop = nops[0];
6348 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6349 newop.op.extent.offset = 0;
6350 newop.op.extent.length = 0;
6351 do_osd_ops(ctx, nops);
6352 osd_op.outdata.claim(newop.outdata);
6353 }
6354 break;
6355
6356 case CEPH_OSD_OP_TMAPPUT:
6357 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6358 if (pool.info.require_rollback()) {
6359 result = -EOPNOTSUPP;
6360 break;
6361 }
6362 {
6363 //_dout_lock.Lock();
6364 //osd_op.data.hexdump(*_dout);
6365 //_dout_lock.Unlock();
6366
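// A tmap payload is a header bufferlist followed by a u32 count and that
// many key/value pairs, which must be sorted by key; if the client sent
// them unsorted we re-encode via std::map (sorted) below.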
6367 // verify sort order
6368 bool unsorted = false;
6369 if (true) {
6370 bufferlist header;
6371 ::decode(header, bp);
6372 uint32_t n;
6373 ::decode(n, bp);
6374 string last_key;
6375 while (n--) {
6376 string key;
6377 ::decode(key, bp);
6378 dout(10) << "tmapput key " << key << dendl;
6379 bufferlist val;
6380 ::decode(val, bp);
6381 if (key < last_key) {
6382 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6383 unsorted = true;
6384 break;
6385 }
6386 last_key = key;
6387 }
6388 }
6389
6390 // write it
6391 vector<OSDOp> nops(1);
6392 OSDOp& newop = nops[0];
6393 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6394 newop.op.extent.offset = 0;
6395 newop.op.extent.length = osd_op.indata.length();
6396 newop.indata = osd_op.indata;
6397
6398 if (unsorted) {
6399 bp = osd_op.indata.begin();
6400 bufferlist header;
6401 map<string, bufferlist> m;
6402 ::decode(header, bp);
6403 ::decode(m, bp);
6404 assert(bp.end());
6405 bufferlist newbl;
6406 ::encode(header, newbl);
6407 ::encode(m, newbl);
6408 newop.indata = newbl;
6409 }
6410 result = do_osd_ops(ctx, nops);
6411 assert(result == 0);
6412 }
6413 break;
6414
6415 case CEPH_OSD_OP_TMAPUP:
6416 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6417 if (pool.info.require_rollback()) {
6418 result = -EOPNOTSUPP;
6419 break;
6420 }
6421 ++ctx->num_write;
6422 result = do_tmapup(ctx, bp, osd_op);
6423 break;
6424
6425 case CEPH_OSD_OP_TMAP2OMAP:
6426 ++ctx->num_write;
6427 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6428 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6429 break;
6430
6431 // OMAP Read ops
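// The omap read ops below share a pagination convention: the client
// passes start_after and max_return, the OSD clamps max_return to
// osd_max_omap_entries_per_request (and bounds the payload by
// osd_max_omap_bytes_per_request), and the reply encodes the entry
// count, the entries, and a 'truncated' flag telling the caller to issue
// another request starting after the last key returned. Illustrative
// client-side sketch (hypothetical key/limit) via librados:
//   librados::ObjectReadOperation rd;
//   std::map<std::string, ceph::bufferlist> vals;
//   bool more;
//   rd.omap_get_vals2("start_after_key", 512, &vals, &more, nullptr);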
6432 case CEPH_OSD_OP_OMAPGETKEYS:
6433 ++ctx->num_read;
6434 {
6435 string start_after;
6436 uint64_t max_return;
6437 try {
6438 ::decode(start_after, bp);
6439 ::decode(max_return, bp);
6440 }
6441 catch (buffer::error& e) {
6442 result = -EINVAL;
6443 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6444 goto fail;
6445 }
6446 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6447 max_return = cct->_conf->osd_max_omap_entries_per_request;
6448 }
6449 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6450
6451 bufferlist bl;
6452 uint32_t num = 0;
6453 bool truncated = false;
6454 if (oi.is_omap()) {
6455 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6456 coll, ghobject_t(soid)
6457 );
6458 assert(iter);
6459 iter->upper_bound(start_after);
6460 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6461 if (num >= max_return ||
6462 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6463 truncated = true;
6464 break;
6465 }
6466 ::encode(iter->key(), bl);
6467 }
6468 } // else return empty out_set
6469 ::encode(num, osd_op.outdata);
6470 osd_op.outdata.claim_append(bl);
6471 ::encode(truncated, osd_op.outdata);
6472 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6473 ctx->delta_stats.num_rd++;
6474 }
6475 break;
6476
6477 case CEPH_OSD_OP_OMAPGETVALS:
6478 ++ctx->num_read;
6479 {
6480 string start_after;
6481 uint64_t max_return;
6482 string filter_prefix;
6483 try {
6484 ::decode(start_after, bp);
6485 ::decode(max_return, bp);
6486 ::decode(filter_prefix, bp);
6487 }
6488 catch (buffer::error& e) {
6489 result = -EINVAL;
6490 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6491 goto fail;
6492 }
6493 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6494 max_return = cct->_conf->osd_max_omap_entries_per_request;
6495 }
6496 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6497
6498 uint32_t num = 0;
6499 bool truncated = false;
6500 bufferlist bl;
6501 if (oi.is_omap()) {
6502 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6503 coll, ghobject_t(soid)
6504 );
6505 if (!iter) {
6506 result = -ENOENT;
6507 goto fail;
6508 }
6509 iter->upper_bound(start_after);
6510 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6511 for (num = 0;
6512 iter->valid() &&
6513 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6514 ++num, iter->next(false)) {
6515 dout(20) << "Found key " << iter->key() << dendl;
6516 if (num >= max_return ||
6517 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6518 truncated = true;
6519 break;
6520 }
6521 ::encode(iter->key(), bl);
6522 ::encode(iter->value(), bl);
6523 }
6524 } // else return empty out_set
6525 ::encode(num, osd_op.outdata);
6526 osd_op.outdata.claim_append(bl);
6527 ::encode(truncated, osd_op.outdata);
6528 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6529 ctx->delta_stats.num_rd++;
6530 }
6531 break;
6532
6533 case CEPH_OSD_OP_OMAPGETHEADER:
6534 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6535 if (!oi.is_omap()) {
6536 // return empty header
6537 break;
6538 }
6539 ++ctx->num_read;
6540 {
6541 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6542 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6543 ctx->delta_stats.num_rd++;
6544 }
6545 break;
6546
6547 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6548 ++ctx->num_read;
6549 {
6550 set<string> keys_to_get;
6551 try {
6552 ::decode(keys_to_get, bp);
6553 }
6554 catch (buffer::error& e) {
6555 result = -EINVAL;
6556 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6557 goto fail;
6558 }
6559 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6560 map<string, bufferlist> out;
6561 if (oi.is_omap()) {
6562 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6563 } // else return empty omap entries
6564 ::encode(out, osd_op.outdata);
6565 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6566 ctx->delta_stats.num_rd++;
6567 }
6568 break;
6569
6570 case CEPH_OSD_OP_OMAP_CMP:
6571 ++ctx->num_read;
6572 {
6573 if (!obs.exists || oi.is_whiteout()) {
6574 result = -ENOENT;
6575 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6576 break;
6577 }
6578 map<string, pair<bufferlist, int> > assertions;
6579 try {
6580 ::decode(assertions, bp);
6581 }
6582 catch (buffer::error& e) {
6583 result = -EINVAL;
6584 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6585 goto fail;
6586 }
6587 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6588
6589 map<string, bufferlist> out;
6590
6591 if (oi.is_omap()) {
6592 set<string> to_get;
6593 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6594 i != assertions.end();
6595 ++i)
6596 to_get.insert(i->first);
6597 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6598 to_get, &out);
6599 if (r < 0) {
6600 result = r;
6601 break;
6602 }
6603 } // else leave out empty
6604
6605 // Should set num_rd_kb based on the encoded length of the map
6606 ctx->delta_stats.num_rd++;
6607
6608 int r = 0;
6609 bufferlist empty;
6610 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6611 i != assertions.end();
6612 ++i) {
6613 auto out_entry = out.find(i->first);
6614 bufferlist &bl = (out_entry != out.end()) ?
6615 out_entry->second : empty;
6616 switch (i->second.second) {
6617 case CEPH_OSD_CMPXATTR_OP_EQ:
6618 if (!(bl == i->second.first)) {
6619 r = -ECANCELED;
6620 }
6621 break;
6622 case CEPH_OSD_CMPXATTR_OP_LT:
6623 if (!(bl < i->second.first)) {
6624 r = -ECANCELED;
6625 }
6626 break;
6627 case CEPH_OSD_CMPXATTR_OP_GT:
6628 if (!(bl > i->second.first)) {
6629 r = -ECANCELED;
6630 }
6631 break;
6632 default:
6633 r = -EINVAL;
6634 break;
6635 }
6636 if (r < 0)
6637 break;
6638 }
6639 if (r < 0) {
6640 result = r;
6641 }
6642 }
6643 break;
6644
6645 // OMAP Write ops
6646 case CEPH_OSD_OP_OMAPSETVALS:
6647 if (!pool.info.supports_omap()) {
6648 result = -EOPNOTSUPP;
6649 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6650 break;
6651 }
6652 ++ctx->num_write;
6653 {
6654 maybe_create_new_object(ctx);
6655 bufferlist to_set_bl;
6656 try {
6657 decode_str_str_map_to_bl(bp, &to_set_bl);
6658 }
6659 catch (buffer::error& e) {
6660 result = -EINVAL;
6661 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6662 goto fail;
6663 }
6664 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6665 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6666 dout(20) << "setting vals: " << dendl;
6667 map<string,bufferlist> to_set;
6668 bufferlist::iterator pt = to_set_bl.begin();
6669 ::decode(to_set, pt);
6670 for (map<string, bufferlist>::iterator i = to_set.begin();
6671 i != to_set.end();
6672 ++i) {
6673 dout(20) << "\t" << i->first << dendl;
6674 }
6675 }
6676 t->omap_setkeys(soid, to_set_bl);
6677 ctx->delta_stats.num_wr++;
6678 }
6679 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6680 obs.oi.clear_omap_digest();
6681 break;
6682
6683 case CEPH_OSD_OP_OMAPSETHEADER:
6684 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6685 if (!pool.info.supports_omap()) {
6686 result = -EOPNOTSUPP;
6687 break;
6688 }
6689 ++ctx->num_write;
6690 {
6691 maybe_create_new_object(ctx);
6692 t->omap_setheader(soid, osd_op.indata);
6693 ctx->delta_stats.num_wr++;
6694 }
6695 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6696 obs.oi.clear_omap_digest();
6697 break;
6698
6699 case CEPH_OSD_OP_OMAPCLEAR:
6700 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6701 if (!pool.info.supports_omap()) {
6702 result = -EOPNOTSUPP;
6703 break;
6704 }
6705 ++ctx->num_write;
6706 {
6707 if (!obs.exists || oi.is_whiteout()) {
6708 result = -ENOENT;
6709 break;
6710 }
6711 if (oi.is_omap()) {
6712 t->omap_clear(soid);
6713 ctx->delta_stats.num_wr++;
6714 obs.oi.clear_omap_digest();
6715 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6716 }
6717 }
6718 break;
6719
6720 case CEPH_OSD_OP_OMAPRMKEYS:
6721 if (!pool.info.supports_omap()) {
6722 result = -EOPNOTSUPP;
6723 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6724 break;
6725 }
6726 ++ctx->num_write;
6727 {
6728 if (!obs.exists || oi.is_whiteout()) {
6729 result = -ENOENT;
6730 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6731 break;
6732 }
6733 bufferlist to_rm_bl;
6734 try {
6735 decode_str_set_to_bl(bp, &to_rm_bl);
6736 }
6737 catch (buffer::error& e) {
6738 result = -EINVAL;
6739 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6740 goto fail;
6741 }
6742 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6743 t->omap_rmkeys(soid, to_rm_bl);
6744 ctx->delta_stats.num_wr++;
6745 }
6746 obs.oi.clear_omap_digest();
6747 break;
6748
6749 case CEPH_OSD_OP_COPY_GET:
6750 ++ctx->num_read;
6751 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6752 soid.snap.val);
6753 if (op_finisher == nullptr) {
6754 result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6755 } else {
6756 result = op_finisher->execute();
6757 }
6758 break;
6759
6760 case CEPH_OSD_OP_COPY_FROM:
6761 ++ctx->num_write;
6762 {
6763 object_t src_name;
6764 object_locator_t src_oloc;
6765 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6766 version_t src_version = op.copy_from.src_version;
6767 try {
6768 ::decode(src_name, bp);
6769 ::decode(src_oloc, bp);
6770 }
6771 catch (buffer::error& e) {
6772 result = -EINVAL;
6773 tracepoint(osd,
6774 do_osd_op_pre_copy_from,
6775 soid.oid.name.c_str(),
6776 soid.snap.val,
6777 "???",
6778 0,
6779 "???",
6780 "???",
6781 0,
6782 src_snapid,
6783 src_version);
6784 goto fail;
6785 }
6786 tracepoint(osd,
6787 do_osd_op_pre_copy_from,
6788 soid.oid.name.c_str(),
6789 soid.snap.val,
6790 src_name.name.c_str(),
6791 src_oloc.pool,
6792 src_oloc.key.c_str(),
6793 src_oloc.nspace.c_str(),
6794 src_oloc.hash,
6795 src_snapid,
6796 src_version);
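// COPY_FROM runs in two phases: the first pass through this op kicks off
// an async copy (registering a CopyFromFinisher) and returns
// -EINPROGRESS; once the copy completes, the op is re-executed and the
// finisher path below applies the copied data.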
6797 if (op_finisher == nullptr) {
6798 // start
6799 pg_t raw_pg;
6800 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6801 hobject_t src(src_name, src_oloc.key, src_snapid,
6802 raw_pg.ps(), raw_pg.pool(),
6803 src_oloc.nspace);
6804 if (src == soid) {
6805 dout(20) << " copy from self is invalid" << dendl;
6806 result = -EINVAL;
6807 break;
6808 }
6809 CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6810 ctx->op_finishers[ctx->current_osd_subop_num].reset(
6811 new CopyFromFinisher(cb));
6812 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6813 op.copy_from.flags,
6814 false,
6815 op.copy_from.src_fadvise_flags,
6816 op.flags);
6817 result = -EINPROGRESS;
6818 } else {
6819 // finish
6820 result = op_finisher->execute();
6821 assert(result == 0);
6822
6823 // COPY_FROM cannot be executed multiple times -- it must restart
6824 ctx->op_finishers.erase(ctx->current_osd_subop_num);
6825 }
6826 }
6827 break;
6828
6829 default:
6830 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6831 dout(1) << "unrecognized osd op " << op.op
6832 << " " << ceph_osd_op_name(op.op)
6833 << dendl;
6834 result = -EOPNOTSUPP;
6835 }
6836
6837 fail:
6838 osd_op.rval = result;
6839 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6840 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6841 result = 0;
6842
6843 if (result < 0)
6844 break;
6845 }
6846 return result;
6847 }
6848
6849 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6850 {
6851 if (ctx->new_obs.oi.size == 0) {
6852 dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6853 return -ENODATA;
6854 }
6855 vector<OSDOp> nops(1);
6856 OSDOp &newop = nops[0];
6857 newop.op.op = CEPH_OSD_OP_TMAPGET;
6858 do_osd_ops(ctx, nops);
6859 try {
6860 bufferlist::iterator i = newop.outdata.begin();
6861 ::decode(*header, i);
6862 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6863 } catch (...) {
6864 dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6865 << dendl;
6866 return -EINVAL;
6867 }
6868 dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6869 << dendl;
6870 return 0;
6871 }
6872
6873 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6874 const SnapSet& ss)
6875 {
6876 // verify that all clones have been evicted
6877 dout(20) << __func__ << " verifying clones are absent "
6878 << ss << dendl;
6879 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6880 p != ss.clones.end();
6881 ++p) {
6882 hobject_t clone_oid = soid;
6883 clone_oid.snap = *p;
6884 if (is_missing_object(clone_oid))
6885 return -EBUSY;
6886 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6887 if (clone_obc && clone_obc->obs.exists) {
6888 dout(10) << __func__ << " cannot evict head before clone "
6889 << clone_oid << dendl;
6890 return -EBUSY;
6891 }
6892 if (copy_ops.count(clone_oid)) {
6893 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6894 << clone_oid << dendl;
6895 return -EBUSY;
6896 }
6897 }
6898 return 0;
6899 }
6900
6901 inline int PrimaryLogPG::_delete_oid(
6902 OpContext *ctx,
6903 bool no_whiteout, // no whiteouts, no matter what.
6904 bool try_no_whiteout) // try not to whiteout
6905 {
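// Returns 0 on success and -ENOENT if there is nothing to delete.
// Depending on the cache-tier mode and the no_whiteout/try_no_whiteout
// hints, "delete" either removes the object outright or replaces it with
// a zero-length whiteout stub so the deletion can later be flushed to
// the base tier.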
6906 SnapSet& snapset = ctx->new_snapset;
6907 ObjectState& obs = ctx->new_obs;
6908 object_info_t& oi = obs.oi;
6909 const hobject_t& soid = oi.soid;
6910 PGTransaction* t = ctx->op_t.get();
6911
6912 // cache: set whiteout on delete?
6913 bool whiteout = false;
6914 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6915 && !no_whiteout
6916 && !try_no_whiteout) {
6917 whiteout = true;
6918 }
6919 bool legacy;
6920 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6921 legacy = false;
6922 // in luminous or later, we can't delete the head if there are
6923 // clones. we trust the caller passing no_whiteout has already
6924 // verified they don't exist.
6925 if (!snapset.clones.empty() ||
6926 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6927 if (no_whiteout) {
6928 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6929 << dendl;
6930 } else {
6931 dout(20) << __func__ << " has or will have clones; will whiteout"
6932 << dendl;
6933 whiteout = true;
6934 }
6935 }
6936 } else {
6937 legacy = true;
6938 }
6939 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6940 << " no_whiteout=" << (int)no_whiteout
6941 << " try_no_whiteout=" << (int)try_no_whiteout
6942 << dendl;
6943 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6944 return -ENOENT;
6945
6946 t->remove(soid);
6947
6948 if (oi.size > 0) {
6949 interval_set<uint64_t> ch;
6950 ch.insert(0, oi.size);
6951 ctx->modified_ranges.union_of(ch);
6952 }
6953
6954 ctx->delta_stats.num_wr++;
6955 if (soid.is_snap()) {
6956 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6957 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6958 } else {
6959 ctx->delta_stats.num_bytes -= oi.size;
6960 }
6961 oi.size = 0;
6962 oi.new_object();
6963
6964 // disconnect all watchers
6965 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6966 oi.watchers.begin();
6967 p != oi.watchers.end();
6968 ++p) {
6969 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6970 ctx->watch_disconnects.push_back(
6971 watch_disconnect_t(p->first.first, p->first.second, true));
6972 }
6973 oi.watchers.clear();
6974
6975 if (whiteout) {
6976 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6977 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6978 ctx->delta_stats.num_whiteouts++;
6979 t->create(soid);
6980 osd->logger->inc(l_osd_tier_whiteout);
6981 return 0;
6982 }
6983
6984 // delete the head
6985 ctx->delta_stats.num_objects--;
6986 if (soid.is_snap())
6987 ctx->delta_stats.num_object_clones--;
6988 if (oi.is_whiteout()) {
6989 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6990 ctx->delta_stats.num_whiteouts--;
6991 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6992 }
6993 if (oi.is_cache_pinned()) {
6994 ctx->delta_stats.num_objects_pinned--;
6995 }
6996 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6997 snapset.head_exists = false;
6998 }
6999 obs.exists = false;
7000 return 0;
7001 }
7002
7003 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
7004 {
7005 SnapSet& snapset = ctx->new_snapset;
7006 ObjectState& obs = ctx->new_obs;
7007 object_info_t& oi = obs.oi;
7008 const hobject_t& soid = oi.soid;
7009 PGTransaction* t = ctx->op_t.get();
7010 snapid_t snapid = (uint64_t)op.snap.snapid;
7011 hobject_t missing_oid;
7012
7013 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
7014
7015 ObjectContextRef rollback_to;
7016 int ret = find_object_context(
7017 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
7018 soid.get_namespace()),
7019 &rollback_to, false, false, &missing_oid);
7020 if (ret == -EAGAIN) {
7021 /* clone must be missing */
7022 assert(is_degraded_or_backfilling_object(missing_oid));
7023 dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
7024 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
7025 block_write_on_degraded_snap(missing_oid, ctx->op);
7026 return ret;
7027 }
7028 {
7029 ObjectContextRef promote_obc;
7030 cache_result_t tier_mode_result;
7031 if (obs.exists && obs.oi.has_manifest()) {
7032 tier_mode_result =
7033 maybe_handle_manifest_detail(
7034 ctx->op,
7035 true,
7036 rollback_to);
7037 } else {
7038 tier_mode_result =
7039 maybe_handle_cache_detail(
7040 ctx->op,
7041 true,
7042 rollback_to,
7043 ret,
7044 missing_oid,
7045 true,
7046 false,
7047 &promote_obc);
7048 }
7049 switch (tier_mode_result) {
7050 case cache_result_t::NOOP:
7051 break;
7052 case cache_result_t::BLOCKED_PROMOTE:
7053 assert(promote_obc);
7054 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
7055 return -EAGAIN;
7056 case cache_result_t::BLOCKED_FULL:
7057 block_write_on_full_cache(soid, ctx->op);
7058 return -EAGAIN;
7059 case cache_result_t::REPLIED_WITH_EAGAIN:
7060 assert(0 == "this can't happen, no rollback on replica");
7061 default:
7062 assert(0 == "must promote was set, other values are not valid");
7063 return -EAGAIN;
7064 }
7065 }
7066
7067 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
7068 // there's no snapshot here, or there's no object.
7069 // if there's no snapshot, we delete the object; otherwise, do nothing.
7070 dout(20) << "_rollback_to deleting head on " << soid.oid
7071 << " because got ENOENT|whiteout on find_object_context" << dendl;
7072 if (ctx->obc->obs.oi.watchers.size()) {
7073 // Cannot delete an object with watchers
7074 ret = -EBUSY;
7075 } else {
7076 _delete_oid(ctx, false, false);
7077 ret = 0;
7078 }
7079 } else if (ret) {
7080 // ummm....huh? It *can't* return anything else at time of writing.
7081 assert(0 == "unexpected error code in _rollback_to");
7082 } else { //we got our context, let's use it to do the rollback!
7083 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7084 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7085 dout(20) << "_rollback_to attempted to roll back to a degraded object "
7086 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
7087 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7088 ret = -EAGAIN;
7089 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7090 // rolling back to the head; we just need to clone it.
7091 ctx->modify = true;
7092 } else {
7093 /* 1) Delete current head
7094 * 2) Clone correct snapshot into head
7095 * 3) Calculate clone_overlaps by following overlaps
7096 * forward from rollback snapshot */
7097 dout(10) << "_rollback_to deleting " << soid.oid
7098 << " and rolling back to old snap" << dendl;
7099
7100 if (obs.exists) {
7101 t->remove(soid);
7102 }
7103 t->clone(soid, rollback_to_sobject);
7104 snapset.head_exists = true;
7105 t->add_obc(rollback_to);
7106
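// Compute which bytes changed between the rollback snap and head:
// intersecting the clone_overlap interval sets from the rollback clone
// forward yields the bytes untouched since that snap, so everything
// outside that intersection must be marked modified.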
7107 map<snapid_t, interval_set<uint64_t> >::iterator iter =
7108 snapset.clone_overlap.lower_bound(snapid);
7109 assert(iter != snapset.clone_overlap.end());
7110 interval_set<uint64_t> overlaps = iter->second;
7111 for ( ;
7112 iter != snapset.clone_overlap.end();
7113 ++iter)
7114 overlaps.intersection_of(iter->second);
7115
7116 if (obs.oi.size > 0) {
7117 interval_set<uint64_t> modified;
7118 modified.insert(0, obs.oi.size);
7119 overlaps.intersection_of(modified);
7120 modified.subtract(overlaps);
7121 ctx->modified_ranges.union_of(modified);
7122 }
7123
7124 // Adjust the cached objectcontext
7125 maybe_create_new_object(ctx, true);
7126 ctx->delta_stats.num_bytes -= obs.oi.size;
7127 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7128 obs.oi.size = rollback_to->obs.oi.size;
7129 if (rollback_to->obs.oi.is_data_digest())
7130 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7131 else
7132 obs.oi.clear_data_digest();
7133 if (rollback_to->obs.oi.is_omap_digest())
7134 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7135 else
7136 obs.oi.clear_omap_digest();
7137
7138 if (rollback_to->obs.oi.is_omap()) {
7139 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7140 obs.oi.set_flag(object_info_t::FLAG_OMAP);
7141 } else {
7142 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7143 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7144 }
7145
7146 snapset.head_exists = true;
7147 }
7148 }
7149 return ret;
7150 }
7151
7152 void PrimaryLogPG::_make_clone(
7153 OpContext *ctx,
7154 PGTransaction* t,
7155 ObjectContextRef obc,
7156 const hobject_t& head, const hobject_t& coid,
7157 object_info_t *poi)
7158 {
7159 bufferlist bv;
7160 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7161
7162 t->clone(coid, head);
7163 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7164 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7165 }
7166
7167 void PrimaryLogPG::make_writeable(OpContext *ctx)
7168 {
7169 const hobject_t& soid = ctx->obs->oi.soid;
7170 SnapContext& snapc = ctx->snapc;
7171
7172 // clone?
7173 assert(soid.snap == CEPH_NOSNAP);
7174 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7175 << " snapc=" << snapc << dendl;
7176
7177 bool was_dirty = ctx->obc->obs.oi.is_dirty();
7178 if (ctx->new_obs.exists) {
7179 // we will mark the object dirty
7180 if (ctx->undirty && was_dirty) {
7181 dout(20) << " clearing DIRTY flag" << dendl;
7182 assert(ctx->new_obs.oi.is_dirty());
7183 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7184 --ctx->delta_stats.num_objects_dirty;
7185 osd->logger->inc(l_osd_tier_clean);
7186 } else if (!was_dirty && !ctx->undirty) {
7187 dout(20) << " setting DIRTY flag" << dendl;
7188 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7189 ++ctx->delta_stats.num_objects_dirty;
7190 osd->logger->inc(l_osd_tier_dirty);
7191 }
7192 } else {
7193 if (was_dirty) {
7194 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7195 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7196 --ctx->delta_stats.num_objects_dirty;
7197 }
7198 }
7199
7200 if ((ctx->new_obs.exists &&
7201 ctx->new_obs.oi.is_omap()) &&
7202 (!ctx->obc->obs.exists ||
7203 !ctx->obc->obs.oi.is_omap())) {
7204 ++ctx->delta_stats.num_objects_omap;
7205 }
7206 if ((!ctx->new_obs.exists ||
7207 !ctx->new_obs.oi.is_omap()) &&
7208 (ctx->obc->obs.exists &&
7209 ctx->obc->obs.oi.is_omap())) {
7210 --ctx->delta_stats.num_objects_omap;
7211 }
7212
7213 // use newer snapc?
7214 if (ctx->new_snapset.seq > snapc.seq) {
7215 snapc.seq = ctx->new_snapset.seq;
7216 snapc.snaps = ctx->new_snapset.snaps;
7217 filter_snapc(snapc.snaps);
7218 dout(10) << " using newer snapc " << snapc << dendl;
7219 }
7220
7221 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7222 snapc.snaps.size() && // there are snaps
7223 !ctx->cache_evict &&
7224 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
7225 // clone
7226 hobject_t coid = soid;
7227 coid.snap = snapc.seq;
7228
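// snapc.snaps is ordered newest-first; count how many leading snaps are
// newer than the last snapset seq, since those are the snaps this new
// clone will represent.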
7229 unsigned l;
7230 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7231
7232 vector<snapid_t> snaps(l);
7233 for (unsigned i=0; i<l; i++)
7234 snaps[i] = snapc.snaps[i];
7235
7236 // prepare clone
7237 object_info_t static_snap_oi(coid);
7238 object_info_t *snap_oi;
7239 if (is_primary()) {
7240 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7241 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7242 ctx->clone_obc->obs.oi = static_snap_oi;
7243 ctx->clone_obc->obs.exists = true;
7244 ctx->clone_obc->ssc = ctx->obc->ssc;
7245 ctx->clone_obc->ssc->ref++;
7246 if (pool.info.require_rollback())
7247 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7248 snap_oi = &ctx->clone_obc->obs.oi;
7249 bool got = ctx->lock_manager.get_write_greedy(
7250 coid,
7251 ctx->clone_obc,
7252 ctx->op);
7253 assert(got);
7254 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7255 } else {
7256 snap_oi = &static_snap_oi;
7257 }
7258 snap_oi->version = ctx->at_version;
7259 snap_oi->prior_version = ctx->obs->oi.version;
7260 snap_oi->copy_user_bits(ctx->obs->oi);
7261
7262 bool legacy = ctx->new_snapset.is_legacy() ||
7263 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7264 if (legacy) {
7265 snap_oi->legacy_snaps = snaps;
7266 }
7267
7268 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7269
7270 ctx->delta_stats.num_objects++;
7271 if (snap_oi->is_dirty()) {
7272 ctx->delta_stats.num_objects_dirty++;
7273 osd->logger->inc(l_osd_tier_dirty);
7274 }
7275 if (snap_oi->is_omap())
7276 ctx->delta_stats.num_objects_omap++;
7277 if (snap_oi->is_cache_pinned())
7278 ctx->delta_stats.num_objects_pinned++;
7279 ctx->delta_stats.num_object_clones++;
7280 ctx->new_snapset.clones.push_back(coid.snap);
7281 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7282 if (!legacy) {
7283 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7284 }
7285
7286 // clone_overlap should contain an entry for each clone
7287 // (an empty interval_set if there is no overlap)
7288 ctx->new_snapset.clone_overlap[coid.snap];
7289 if (ctx->obs->oi.size)
7290 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7291
7292 // log clone
7293 dout(10) << " cloning v " << ctx->obs->oi.version
7294 << " to " << coid << " v " << ctx->at_version
7295 << " snaps=" << snaps
7296 << " snapset=" << ctx->new_snapset << dendl;
7297 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7298 ctx->obs->oi.version,
7299 ctx->obs->oi.user_version,
7300 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7301 ::encode(snaps, ctx->log.back().snaps);
7302
7303 ctx->at_version.version++;
7304 }
7305
7306 // update most recent clone_overlap and usage stats
7307 if (ctx->new_snapset.clones.size() > 0) {
7308 /* we need to check whether the most recent clone exists; if it has been
7309 * evicted, it is not included in the stats */
7310 hobject_t last_clone_oid = soid;
7311 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7312 if (is_present_clone(last_clone_oid)) {
7313 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7314 ctx->modified_ranges.intersection_of(newest_overlap);
7315 // modified_ranges is still in use by the clone
7316 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7317 newest_overlap.subtract(ctx->modified_ranges);
7318 }
7319 }
7320
7321 // update snapset with latest snap context
7322 ctx->new_snapset.seq = snapc.seq;
7323 ctx->new_snapset.snaps = snapc.snaps;
7324 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7325 // pessimistic assumption that this is a net-new legacy SnapSet
7326 ctx->delta_stats.num_legacy_snapsets++;
7327 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7328 } else if (ctx->new_snapset.is_legacy()) {
7329 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7330 }
7331 dout(20) << "make_writeable " << soid
7332 << " done, snapset=" << ctx->new_snapset << dendl;
7333 }
7334
7335
7336 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7337 interval_set<uint64_t>& modified, uint64_t offset,
7338 uint64_t length, bool write_full)
7339 {
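// Record the written extent in 'modified', grow the object (and
// num_bytes) if the write extends past the current size, and account one
// write plus the written kilobytes in the delta stats.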
7340 interval_set<uint64_t> ch;
7341 if (write_full) {
7342 if (oi.size)
7343 ch.insert(0, oi.size);
7344 } else if (length)
7345 ch.insert(offset, length);
7346 modified.union_of(ch);
7347 if (write_full || offset + length > oi.size) {
7348 uint64_t new_size = offset + length;
7349 delta_stats.num_bytes -= oi.size;
7350 delta_stats.num_bytes += new_size;
7351 oi.size = new_size;
7352 }
7353 delta_stats.num_wr++;
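  // SHIFT_ROUND_UP(length, 10) counts whole KiB, rounding up; e.g. a
  // hypothetical 4097-byte write adds 5 to num_wr_kb.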
7354 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7355 }
7356
7357 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7358 {
7359 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7360 delta_stats.num_bytes += p.get_len();
7361 }
7362 }
7363
7364 void PrimaryLogPG::complete_disconnect_watches(
7365 ObjectContextRef obc,
7366 const list<watch_disconnect_t> &to_disconnect)
7367 {
7368 for (list<watch_disconnect_t>::const_iterator i =
7369 to_disconnect.begin();
7370 i != to_disconnect.end();
7371 ++i) {
7372 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7373 auto watchers_entry = obc->watchers.find(watcher);
7374 if (watchers_entry != obc->watchers.end()) {
7375 WatchRef watch = watchers_entry->second;
7376 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7377 obc->watchers.erase(watcher);
7378 watch->remove(i->send_disconnect);
7379 } else {
7380 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7381 << watcher << dendl;
7382 }
7383 }
7384 }
7385
7386 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7387 {
7388 entity_name_t entity = ctx->reqid.name;
7389 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7390
7391 // disconnects first
7392 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7393
7394 assert(conn);
7395
7396 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7397 if (!session.get())
7398 return;
7399 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7400
7401 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7402 i != ctx->watch_connects.end();
7403 ++i) {
7404 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7405 dout(15) << "do_osd_op_effects applying watch connect on session "
7406 << session.get() << " watcher " << watcher << dendl;
7407 WatchRef watch;
7408 if (ctx->obc->watchers.count(watcher)) {
7409 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7410 << dendl;
7411 watch = ctx->obc->watchers[watcher];
7412 } else {
7413 dout(15) << "do_osd_op_effects new watcher " << watcher
7414 << dendl;
7415 watch = Watch::makeWatchRef(
7416 this, osd, ctx->obc, i->first.timeout_seconds,
7417 i->first.cookie, entity, conn->get_peer_addr());
7418 ctx->obc->watchers.insert(
7419 make_pair(
7420 watcher,
7421 watch));
7422 }
7423 watch->connect(conn, i->second);
7424 }
7425
7426 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7427 p != ctx->notifies.end();
7428 ++p) {
7429 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7430 ConnectionRef conn(ctx->op->get_req()->get_connection());
7431 NotifyRef notif(
7432 Notify::makeNotifyRef(
7433 conn,
7434 ctx->reqid.name.num(),
7435 p->bl,
7436 p->timeout,
7437 p->cookie,
7438 p->notify_id,
7439 ctx->obc->obs.oi.user_version,
7440 osd));
7441 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7442 ctx->obc->watchers.begin();
7443 i != ctx->obc->watchers.end();
7444 ++i) {
7445 dout(10) << "starting notify on watch " << i->first << dendl;
7446 i->second->start_notify(notif);
7447 }
7448 notif->init();
7449 }
7450
7451 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7452 p != ctx->notify_acks.end();
7453 ++p) {
7454 if (p->watch_cookie)
7455 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7456 else
7457 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7458 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7459 ctx->obc->watchers.begin();
7460 i != ctx->obc->watchers.end();
7461 ++i) {
7462 if (i->first.second != entity) continue;
7463 if (p->watch_cookie &&
7464 p->watch_cookie.get() != i->first.first) continue;
7465 dout(10) << "acking notify on watch " << i->first << dendl;
7466 i->second->notify_ack(p->notify_id, p->reply_bl);
7467 }
7468 }
7469 }
7470
7471 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7472 {
7473 ostringstream ss;
7474 ss << "temp_" << info.pgid << "_" << get_role()
7475 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7476 hobject_t hoid = target.make_temp_hobject(ss.str());
7477 dout(20) << __func__ << " " << hoid << dendl;
7478 return hoid;
7479 }
7480
7481 hobject_t PrimaryLogPG::get_temp_recovery_object(
7482 const hobject_t& target,
7483 eversion_t version)
7484 {
7485 ostringstream ss;
7486 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7487 << "_" << version
7488 << "_" << info.history.same_interval_since
7489 << "_" << target.snap;
7490 // pgid + version + interval + snapid is unique, and short
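  // e.g. (hypothetical): "temp_recovering_1.4s0_91'247_12_head"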
7491 hobject_t hoid = target.make_temp_hobject(ss.str());
7492 dout(20) << __func__ << " " << hoid << dendl;
7493 return hoid;
7494 }
7495
7496 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7497 {
7498 assert(!ctx->ops->empty());
7499
7500 const hobject_t& soid = ctx->obs->oi.soid;
7501
7502 // valid snap context?
7503 if (!ctx->snapc.is_valid()) {
7504 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7505 return -EINVAL;
7506 }
7507
7508 // prepare the actual mutation
7509 int result = do_osd_ops(ctx, *ctx->ops);
7510 if (result < 0) {
7511 if (ctx->op->may_write() &&
7512 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7513 // need to save the error code in the pg log, to detect dup ops,
7514 // but do nothing else
7515 ctx->update_log_only = true;
7516 }
7517 return result;
7518 }
7519
7520 // read-op? write-op noop? done?
7521 if (ctx->op_t->empty() && !ctx->modify) {
7522 unstable_stats.add(ctx->delta_stats);
7523 if (ctx->op->may_write() &&
7524 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7525 ctx->update_log_only = true;
7526 }
7527 return result;
7528 }
7529
7530 // check for full
7531 if ((ctx->delta_stats.num_bytes > 0 ||
7532 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7533 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7534 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7535 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7536 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7537 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7538 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7539 << dendl;
7540 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7541 // they tried, they failed.
7542 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7543 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7544 } else {
7545 // drop request
7546 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7547 return -EAGAIN;
7548 }
7549 }
7550
7551 // clone, if necessary
7552 if (soid.snap == CEPH_NOSNAP)
7553 make_writeable(ctx);
7554
7555 finish_ctx(ctx,
7556 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7557 pg_log_entry_t::DELETE);
7558
7559 return result;
7560 }
7561
7562 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7563 {
7564 const hobject_t& soid = ctx->obs->oi.soid;
7565 dout(20) << __func__ << " " << soid << " " << ctx
7566 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7567 << dendl;
7568 utime_t now = ceph_clock_now();
7569
7570 // snapset
7571 bufferlist bss;
7572
7573 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7574 ::encode(ctx->new_snapset, bss);
7575 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7576 !ctx->new_snapset.is_legacy());
7577
7578 if (ctx->new_obs.exists) {
7579 if (!ctx->obs->exists) {
7580 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7581 hobject_t snapoid = soid.get_snapdir();
7582 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7583 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7584 ctx->at_version,
7585 ctx->snapset_obc->obs.oi.version,
7586 0, osd_reqid_t(), ctx->mtime, 0));
7587 ctx->op_t->remove(snapoid);
7588
7589 ctx->at_version.version++;
7590
7591 ctx->snapset_obc->obs.exists = false;
7592 }
7593 }
7594 } else if (!ctx->new_snapset.clones.empty() &&
7595 !ctx->cache_evict &&
7596 !ctx->new_snapset.head_exists &&
7597 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7598 // save snapset on _snap
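      // (pre-luminous only, per the assert below: with the head gone but
      // clones remaining, the SnapSet must live on the snapdir object)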
7599 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7600 info.pgid.pool(), soid.get_namespace());
7601 dout(10) << " final snapset " << ctx->new_snapset
7602 << " in " << snapoid << dendl;
7603 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7604 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7605 ctx->at_version,
7606 eversion_t(),
7607 0, osd_reqid_t(), ctx->mtime, 0));
7608
7609 if (!ctx->snapset_obc)
7610 ctx->snapset_obc = get_object_context(snapoid, true);
7611 bool got = false;
7612 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7613 got = ctx->lock_manager.get_write_greedy(
7614 snapoid,
7615 ctx->snapset_obc,
7616 ctx->op);
7617 } else {
7618 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7619 got = ctx->lock_manager.get_lock_type(
7620 ObjectContext::RWState::RWEXCL,
7621 snapoid,
7622 ctx->snapset_obc,
7623 ctx->op);
7624 }
7625 assert(got);
7626 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7627 ctx->snapset_obc->obs.exists = true;
7628 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7629 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7630 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7631 ctx->snapset_obc->obs.oi.local_mtime = now;
7632
7633 map<string, bufferlist> attrs;
7634 bufferlist bv(sizeof(ctx->new_obs.oi));
7635 ::encode(ctx->snapset_obc->obs.oi, bv,
7636 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7637 ctx->op_t->create(snapoid);
7638 attrs[OI_ATTR].claim(bv);
7639 attrs[SS_ATTR].claim(bss);
7640 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7641 ctx->at_version.version++;
7642 }
7643 }
7644
7645 // finish and log the op.
7646 if (ctx->user_modify) {
7647 // update the user_version for any modify ops, except for the watch op
7648 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7649 /* In order for new clients and old clients to interoperate properly
7650 * when exchanging versions, we need to lower-bound the user_version
7651 * (which our new clients pay proper attention to)
7652 * by the at_version (which is all the old clients can ever see). */
7653 if (ctx->at_version.version > ctx->user_at_version)
7654 ctx->user_at_version = ctx->at_version.version;
7655 ctx->new_obs.oi.user_version = ctx->user_at_version;
7656 }
7657 ctx->bytes_written = ctx->op_t->get_bytes_written();
7658
7659 if (ctx->new_obs.exists) {
7660 // on the head object
7661 ctx->new_obs.oi.version = ctx->at_version;
7662 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7663 ctx->new_obs.oi.last_reqid = ctx->reqid;
7664 if (ctx->mtime != utime_t()) {
7665 ctx->new_obs.oi.mtime = ctx->mtime;
7666 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7667 ctx->new_obs.oi.local_mtime = now;
7668 } else {
7669 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7670 }
7671
7672 map <string, bufferlist> attrs;
7673 bufferlist bv(sizeof(ctx->new_obs.oi));
7674 ::encode(ctx->new_obs.oi, bv,
7675 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7676 attrs[OI_ATTR].claim(bv);
7677
7678 if (soid.snap == CEPH_NOSNAP) {
7679 dout(10) << " final snapset " << ctx->new_snapset
7680 << " in " << soid << dendl;
7681 attrs[SS_ATTR].claim(bss);
7682 } else {
7683 dout(10) << " no snapset (this is a clone)" << dendl;
7684 }
7685 ctx->op_t->setattrs(soid, attrs);
7686 } else {
7687 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7688 }
7689
7690 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7691 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7692
7693 // append to log
7694 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7695 ctx->obs->oi.version,
7696 ctx->user_at_version, ctx->reqid,
7697 ctx->mtime, 0));
7698 if (soid.snap < CEPH_NOSNAP) {
7699 switch (log_op_type) {
7700 case pg_log_entry_t::MODIFY:
7701 case pg_log_entry_t::PROMOTE:
7702 case pg_log_entry_t::CLEAN:
7703 if (legacy_snapset) {
7704 dout(20) << __func__ << " encoding legacy_snaps "
7705 << ctx->new_obs.oi.legacy_snaps
7706 << dendl;
7707 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7708 } else {
7709 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7710 << dendl;
7711 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7712 }
7713 break;
7714 default:
7715 break;
7716 }
7717 }
7718
7719 if (!ctx->extra_reqids.empty()) {
7720 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7721 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7722 }
7723
7724 // apply new object state.
7725 ctx->obc->obs = ctx->new_obs;
7726
7727 if (soid.is_head() && !ctx->obc->obs.exists &&
7728 (!maintain_ssc || ctx->cache_evict)) {
7729 ctx->obc->ssc->exists = false;
7730 ctx->obc->ssc->snapset = SnapSet();
7731 } else {
7732 ctx->obc->ssc->exists = true;
7733 ctx->obc->ssc->snapset = ctx->new_snapset;
7734 }
7735 }
7736
7737 void PrimaryLogPG::apply_stats(
7738 const hobject_t &soid,
7739 const object_stat_sum_t &delta_stats) {
7740
7741 info.stats.stats.add(delta_stats);
7742
7743 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7744 i != backfill_targets.end();
7745 ++i) {
7746 pg_shard_t bt = *i;
7747 pg_info_t& pinfo = peer_info[bt];
7748 if (soid <= pinfo.last_backfill)
7749 pinfo.stats.stats.add(delta_stats);
7750 else if (soid <= last_backfill_started)
7751 pending_backfill_updates[soid].stats.add(delta_stats);
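      // (deltas for an object inside the in-flight backfill window are
      // deferred here and folded into the peer's stats once that object's
      // backfill completes)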
7752 }
7753
7754 if (is_primary() && scrubber.active) {
7755 if (soid < scrubber.start) {
7756 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7757 << "," << scrubber.end << ")" << dendl;
7758 scrub_cstat.add(delta_stats);
7759 } else {
7760 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7761 << "," << scrubber.end << ")" << dendl;
7762 }
7763 }
7764 }
7765
7766 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7767 {
7768 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7769 assert(ctx->async_reads_complete());
7770
7771 for (vector<OSDOp>::iterator p = ctx->ops->begin();
7772 p != ctx->ops->end() && result >= 0; ++p) {
7773 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7774 result = p->rval;
7775 break;
7776 }
7777 ctx->bytes_read += p->outdata.length();
7778 }
7779 ctx->reply->claim_op_out_data(*ctx->ops);
7780 ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7781
7782 MOSDOpReply *reply = ctx->reply;
7783 ctx->reply = nullptr;
7784
7785 if (result >= 0) {
7786 if (!ctx->ignore_log_op_stats) {
7787 log_op_stats(ctx);
7788 publish_stats_to_osd();
7789 }
7790
7791 // on read, return the current object version
7792 if (ctx->obs) {
7793 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7794 } else {
7795 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7796 }
7797 } else if (result == -ENOENT) {
7798 // on ENOENT, set a floor for what the next user version will be.
7799 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7800 }
7801
7802 reply->set_result(result);
7803 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7804 osd->send_message_osd_client(reply, m->get_connection());
7805 close_op_ctx(ctx);
7806 }
7807
7808 // ========================================================================
7809 // copyfrom
7810
7811 struct C_Copyfrom : public Context {
7812 PrimaryLogPGRef pg;
7813 hobject_t oid;
7814 epoch_t last_peering_reset;
7815 ceph_tid_t tid;
7816 PrimaryLogPG::CopyOpRef cop;
7817 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7818 const PrimaryLogPG::CopyOpRef& c)
7819 : pg(p), oid(o), last_peering_reset(lpr),
7820 tid(0), cop(c)
7821 {}
7822 void finish(int r) override {
7823 if (r == -ECANCELED)
7824 return;
7825 pg->lock();
7826 if (last_peering_reset == pg->get_last_peering_reset()) {
7827 pg->process_copy_chunk(oid, tid, r);
7828 }
7829 pg->unlock();
7830 }
7831 };
7832
7833 struct C_CopyFrom_AsyncReadCb : public Context {
7834 OSDOp *osd_op;
7835 object_copy_data_t reply_obj;
7836 uint64_t features;
7837 size_t len;
7838 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7839 osd_op(osd_op), features(features), len(0) {}
7840 void finish(int r) override {
7841 osd_op->rval = r;
7842 if (r < 0) {
7843 return;
7844 }
7845
7846 assert(len > 0);
7847 assert(len <= reply_obj.data.length());
7848 bufferlist bl;
7849 bl.substr_of(reply_obj.data, 0, len);
7850 reply_obj.data.swap(bl);
7851 ::encode(reply_obj, osd_op->outdata, features);
7852 }
7853 };
7854
7855 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7856 OSDOp& osd_op, ObjectContextRef &obc)
7857 {
7858 object_info_t& oi = obc->obs.oi;
7859 hobject_t& soid = oi.soid;
7860 int result = 0;
7861 object_copy_cursor_t cursor;
7862 uint64_t out_max;
7863 bool skip_data_digest =
7864 (osd->store->has_builtin_csum() && g_conf->osd_skip_data_digest) ||
7865 g_conf->osd_distrust_data_digest;
7866
7867 try {
7868 ::decode(cursor, bp);
7869 ::decode(out_max, bp);
7870 }
7871 catch (buffer::error& e) {
7872 result = -EINVAL;
7873 return result;
7874 }
7875
7876 const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7877 uint64_t features = op->get_features();
7878
7879 bool async_read_started = false;
7880 object_copy_data_t _reply_obj;
7881 C_CopyFrom_AsyncReadCb *cb = NULL;
7882 if (pool.info.require_rollback()) {
7883 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7884 }
7885 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7886 // size, mtime
7887 reply_obj.size = oi.size;
7888 reply_obj.mtime = oi.mtime;
7889 assert(obc->ssc);
7890 if (soid.snap < CEPH_NOSNAP) {
7891 if (obc->ssc->snapset.is_legacy()) {
7892 reply_obj.snaps = oi.legacy_snaps;
7893 } else {
7894 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7895 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7896 reply_obj.snaps = p->second;
7897 }
7898 } else {
7899 reply_obj.snap_seq = obc->ssc->snapset.seq;
7900 }
7901 if (!skip_data_digest && oi.is_data_digest()) {
7902 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7903 reply_obj.data_digest = oi.data_digest;
7904 }
7905 if (oi.is_omap_digest()) {
7906 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7907 reply_obj.omap_digest = oi.omap_digest;
7908 }
7909 reply_obj.truncate_seq = oi.truncate_seq;
7910 reply_obj.truncate_size = oi.truncate_size;
7911
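  // The copy-get cursor walks the source in three phases (attrs, then data,
  // then omap), resuming where the previous chunk stopped and bounded by the
  // client-supplied out_max budget. A hypothetical three-chunk walk:
  //   chunk 1: all attrs + first out_max bytes of data
  //   chunk 2: remaining data + some omap keys
  //   chunk 3: remaining omap keys + reqids; cursor.is_complete() is now true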
7912 // attrs
7913 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7914 if (!cursor.attr_complete) {
7915 result = getattrs_maybe_cache(
7916 ctx->obc,
7917 &out_attrs);
7918 if (result < 0) {
7919 if (cb) {
7920 delete cb;
7921 }
7922 return result;
7923 }
7924 cursor.attr_complete = true;
7925 dout(20) << " got attrs" << dendl;
7926 }
7927
7928 int64_t left = out_max - osd_op.outdata.length();
7929
7930 // data
7931 bufferlist& bl = reply_obj.data;
7932 if (left > 0 && !cursor.data_complete) {
7933 if (cursor.data_offset < oi.size) {
7934 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7935 if (cb) {
7936 async_read_started = true;
7937 ctx->pending_async_reads.push_back(
7938 make_pair(
7939 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7940 make_pair(&bl, cb)));
7941 cb->len = max_read;
7942
7943 ctx->op_finishers[ctx->current_osd_subop_num].reset(
7944 new ReadFinisher(osd_op));
7945 result = -EINPROGRESS;
7946
7947 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7948 } else {
7949 result = pgbackend->objects_read_sync(
7950 oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7951 if (result < 0)
7952 return result;
7953 }
7954 left -= max_read;
7955 cursor.data_offset += max_read;
7956 }
7957 if (cursor.data_offset == oi.size) {
7958 cursor.data_complete = true;
7959 dout(20) << " got data" << dendl;
7960 }
7961 assert(cursor.data_offset <= oi.size);
7962 }
7963
7964 // omap
7965 uint32_t omap_keys = 0;
7966 if (!pool.info.supports_omap() || !oi.is_omap()) {
7967 cursor.omap_complete = true;
7968 } else {
7969 if (left > 0 && !cursor.omap_complete) {
7970 assert(cursor.data_complete);
7971 if (cursor.omap_offset.empty()) {
7972 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7973 &reply_obj.omap_header);
7974 }
7975 bufferlist omap_data;
7976 ObjectMap::ObjectMapIterator iter =
7977 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7978 assert(iter);
7979 iter->upper_bound(cursor.omap_offset);
7980 for (; iter->valid(); iter->next(false)) {
7981 ++omap_keys;
7982 ::encode(iter->key(), omap_data);
7983 ::encode(iter->value(), omap_data);
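  // (each ::encode above also prepends a 4-byte length, hence the two +4
  // terms in the budget below)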
7984 left -= iter->key().length() + 4 + iter->value().length() + 4;
7985 if (left <= 0)
7986 break;
7987 }
7988 if (omap_keys) {
7989 ::encode(omap_keys, reply_obj.omap_data);
7990 reply_obj.omap_data.claim_append(omap_data);
7991 }
7992 if (iter->valid()) {
7993 cursor.omap_offset = iter->key();
7994 } else {
7995 cursor.omap_complete = true;
7996 dout(20) << " got omap" << dendl;
7997 }
7998 }
7999 }
8000
8001 if (cursor.is_complete()) {
8002 // include reqids only in the final step. this is a bit fragile
8003 // but it works...
8004 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
8005 dout(20) << " got reqids" << dendl;
8006 }
8007
8008 dout(20) << " cursor.is_complete=" << cursor.is_complete()
8009 << " " << out_attrs.size() << " attrs"
8010 << " " << bl.length() << " bytes"
8011 << " " << reply_obj.omap_header.length() << " omap header bytes"
8012 << " " << reply_obj.omap_data.length() << " omap data bytes in "
8013 << omap_keys << " keys"
8014 << " " << reply_obj.reqids.size() << " reqids"
8015 << dendl;
8016 reply_obj.cursor = cursor;
8017 if (!async_read_started) {
8018 ::encode(reply_obj, osd_op.outdata, features);
8019 }
8020 if (cb && !async_read_started) {
8021 delete cb;
8022 }
8023
8024 if (result > 0) {
8025 result = 0;
8026 }
8027 return result;
8028 }
8029
8030 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
8031 OSDOp& osd_op)
8032 {
8033 // NOTE: we take non-const ref here for claim_op_out_data below; we must
8034 // be careful not to modify anything else that will upset a racing
8035 // operator<<
8036 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
8037 uint64_t features = m->get_features();
8038 object_copy_data_t reply_obj;
8039
8040 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
8041 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
8042 ::encode(reply_obj, osd_op.outdata, features);
8043 osd_op.rval = -ENOENT;
8044 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
8045 reply->claim_op_out_data(m->ops);
8046 reply->set_result(-ENOENT);
8047 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
8048 osd->send_message_osd_client(reply, m->get_connection());
8049 }
8050
8051 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
8052 hobject_t src, object_locator_t oloc,
8053 version_t version, unsigned flags,
8054 bool mirror_snapset,
8055 unsigned src_obj_fadvise_flags,
8056 unsigned dest_obj_fadvise_flags)
8057 {
8058 const hobject_t& dest = obc->obs.oi.soid;
8059 dout(10) << __func__ << " " << dest
8060 << " from " << src << " " << oloc << " v" << version
8061 << " flags " << flags
8062 << (mirror_snapset ? " mirror_snapset" : "")
8063 << dendl;
8064
8065 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
8066 src.snap == CEPH_SNAPDIR));
8067
8068 // cancel a previous in-progress copy?
8069 if (copy_ops.count(dest)) {
8070 // FIXME: if the src etc match, we could avoid restarting from the
8071 // beginning.
8072 CopyOpRef cop = copy_ops[dest];
8073 vector<ceph_tid_t> tids;
8074 cancel_copy(cop, false, &tids);
8075 osd->objecter->op_cancel(tids, -ECANCELED);
8076 }
8077
8078 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8079 mirror_snapset, src_obj_fadvise_flags,
8080 dest_obj_fadvise_flags));
8081 copy_ops[dest] = cop;
8082 obc->start_block();
8083
8084 _copy_some(obc, cop);
8085 }
8086
8087 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8088 {
8089 dout(10) << __func__ << " " << obc << " " << cop << dendl;
8090
8091 unsigned flags = 0;
8092 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8093 flags |= CEPH_OSD_FLAG_FLUSH;
8094 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8095 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8096 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8097 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8098 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8099 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8100 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8101 flags |= CEPH_OSD_FLAG_RWORDERED;
8102
8103 C_GatherBuilder gather(cct);
8104
8105 if (cop->cursor.is_initial() && cop->mirror_snapset) {
8106 // list snaps too.
8107 assert(cop->src.snap == CEPH_NOSNAP);
8108 ObjectOperation op;
8109 op.list_snaps(&cop->results.snapset, NULL);
8110 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8111 CEPH_SNAPDIR, NULL,
8112 flags, gather.new_sub(), NULL);
8113 cop->objecter_tid2 = tid;
8114 }
8115
8116 ObjectOperation op;
8117 if (cop->results.user_version) {
8118 op.assert_version(cop->results.user_version);
8119 } else {
8120 // we should learn the version after the first chunk, if we didn't know
8121 // it already!
8122 assert(cop->cursor.is_initial());
8123 }
8124 op.copy_get(&cop->cursor, get_copy_chunk_size(),
8125 &cop->results.object_size, &cop->results.mtime,
8126 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8127 &cop->results.snaps, &cop->results.snap_seq,
8128 &cop->results.flags,
8129 &cop->results.source_data_digest,
8130 &cop->results.source_omap_digest,
8131 &cop->results.reqids,
8132 &cop->results.truncate_seq,
8133 &cop->results.truncate_size,
8134 &cop->rval);
8135 op.set_last_op_flags(cop->src_obj_fadvise_flags);
8136
8137 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8138 get_last_peering_reset(), cop);
8139 gather.set_finisher(new C_OnFinisher(fin,
8140 &osd->objecter_finisher));
8141
8142 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8143 cop->src.snap, NULL,
8144 flags,
8145 gather.new_sub(),
8146 // discover the object version if we don't know it yet
8147 cop->results.user_version ? NULL : &cop->results.user_version);
8148 fin->tid = tid;
8149 cop->objecter_tid = tid;
8150 gather.activate();
8151 }
8152
8153 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8154 {
8155 vector<ceph_tid_t> tids;
8156 dout(10) << __func__ << " " << oid << " tid " << tid
8157 << " " << cpp_strerror(r) << dendl;
8158 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8159 if (p == copy_ops.end()) {
8160 dout(10) << __func__ << " no copy_op found" << dendl;
8161 return;
8162 }
8163 CopyOpRef cop = p->second;
8164 if (tid != cop->objecter_tid) {
8165 dout(10) << __func__ << " tid " << tid << " != cop " << cop
8166 << " tid " << cop->objecter_tid << dendl;
8167 return;
8168 }
8169
8170 if (cop->omap_data.length() || cop->omap_header.length())
8171 cop->results.has_omap = true;
8172
8173 if (r >= 0 && !pool.info.supports_omap() &&
8174 (cop->omap_data.length() || cop->omap_header.length())) {
8175 r = -EOPNOTSUPP;
8176 }
8177 cop->objecter_tid = 0;
8178 cop->objecter_tid2 = 0; // assume this ordered before us (if it happened)
8179 ObjectContextRef& cobc = cop->obc;
8180
8181 if (r < 0)
8182 goto out;
8183
8184 assert(cop->rval >= 0);
8185
8186 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8187 // verify snap hasn't been deleted
8188 vector<snapid_t>::iterator p = cop->results.snaps.begin();
8189 while (p != cop->results.snaps.end()) {
8190 if (pool.info.is_removed_snap(*p)) {
8191 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8192 << dendl;
8193 for (vector<snapid_t>::iterator q = p + 1;
8194 q != cop->results.snaps.end();
8195 ++q)
8196 *(q - 1) = *q;
8197 cop->results.snaps.resize(cop->results.snaps.size() - 1);
8198 } else {
8199 ++p;
8200 }
8201 }
8202 if (cop->results.snaps.empty()) {
8203 dout(10) << __func__ << " no more snaps for " << oid << dendl;
8204 r = -ENOENT;
8205 goto out;
8206 }
8207 }
8208
8209 assert(cop->rval >= 0);
8210
8211 if (!cop->temp_cursor.data_complete) {
8212 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8213 }
8214 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8215 if (cop->omap_header.length()) {
8216 cop->results.omap_digest =
8217 cop->omap_header.crc32c(cop->results.omap_digest);
8218 }
8219 if (cop->omap_data.length()) {
8220 bufferlist keys;
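  // skip the leading 4-byte key count so the digest covers only the
  // encoded keys and values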
8221 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8222 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8223 }
8224 }
8225
8226 if (!cop->temp_cursor.attr_complete) {
8227 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8228 p != cop->attrs.end();
8229 ++p) {
8230 cop->results.attrs[string("_") + p->first] = p->second;
8231 }
8232 cop->attrs.clear();
8233 }
8234
8235 if (!cop->cursor.is_complete()) {
8236 // write out what we have so far
8237 if (cop->temp_cursor.is_initial()) {
8238 assert(!cop->results.started_temp_obj);
8239 cop->results.started_temp_obj = true;
8240 cop->results.temp_oid = generate_temp_object(oid);
8241 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8242 }
8243 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8244 OpContextUPtr ctx = simple_opc_create(tempobc);
8245 if (cop->temp_cursor.is_initial()) {
8246 ctx->new_temp_oid = cop->results.temp_oid;
8247 }
8248 _write_copy_chunk(cop, ctx->op_t.get());
8249 simple_opc_submit(std::move(ctx));
8250 dout(10) << __func__ << " fetching more" << dendl;
8251 _copy_some(cobc, cop);
8252 return;
8253 }
8254
8255 // verify digests?
8256 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8257 dout(20) << __func__ << std::hex
8258 << " got digest: rx data 0x" << cop->results.data_digest
8259 << " omap 0x" << cop->results.omap_digest
8260 << ", source: data 0x" << cop->results.source_data_digest
8261 << " omap 0x" << cop->results.source_omap_digest
8262 << std::dec
8263 << " flags " << cop->results.flags
8264 << dendl;
8265 }
8266 if (cop->results.is_data_digest() &&
8267 cop->results.data_digest != cop->results.source_data_digest) {
8268 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8269 << " != source 0x" << cop->results.source_data_digest << std::dec
8270 << dendl;
8271 osd->clog->error() << info.pgid << " copy from " << cop->src
8272 << " to " << cop->obc->obs.oi.soid << std::hex
8273 << " data digest 0x" << cop->results.data_digest
8274 << " != source 0x" << cop->results.source_data_digest
8275 << std::dec;
8276 r = -EIO;
8277 goto out;
8278 }
8279 if (cop->results.is_omap_digest() &&
8280 cop->results.omap_digest != cop->results.source_omap_digest) {
8281 derr << __func__ << std::hex
8282 << " omap digest 0x" << cop->results.omap_digest
8283 << " != source 0x" << cop->results.source_omap_digest
8284 << std::dec << dendl;
8285 osd->clog->error() << info.pgid << " copy from " << cop->src
8286 << " to " << cop->obc->obs.oi.soid << std::hex
8287 << " omap digest 0x" << cop->results.omap_digest
8288 << " != source 0x" << cop->results.source_omap_digest
8289 << std::dec;
8290 r = -EIO;
8291 goto out;
8292 }
8293 if (cct->_conf->osd_debug_inject_copyfrom_error) {
8294 derr << __func__ << " injecting copyfrom failure" << dendl;
8295 r = -EIO;
8296 goto out;
8297 }
8298
8299 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8300 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8301 ObjectState& obs = cop->obc->obs;
8302 if (cop->temp_cursor.is_initial()) {
8303 dout(20) << "fill_in_final_tx: writing "
8304 << "directly to final object" << dendl;
8305 // write directly to final object
8306 cop->results.temp_oid = obs.oi.soid;
8307 _write_copy_chunk(cop, t);
8308 } else {
8309 // finish writing to temp object, then move into place
8310 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8311 _write_copy_chunk(cop, t);
8312 t->rename(obs.oi.soid, cop->results.temp_oid);
8313 }
8314 t->setattrs(obs.oi.soid, cop->results.attrs);
8315 });
8316
8317 dout(20) << __func__ << " success; committing" << dendl;
8318
8319 out:
8320 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8321 CopyCallbackResults results(r, &cop->results);
8322 cop->cb->complete(results);
8323
8324 copy_ops.erase(cobc->obs.oi.soid);
8325 cobc->stop_block();
8326
8327 if (r < 0 && cop->results.started_temp_obj) {
8328 dout(10) << __func__ << " deleting partial temp object "
8329 << cop->results.temp_oid << dendl;
8330 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8331 OpContextUPtr ctx = simple_opc_create(tempobc);
8332 ctx->op_t->remove(cop->results.temp_oid);
8333 ctx->discard_temp_oid = cop->results.temp_oid;
8334 simple_opc_submit(std::move(ctx));
8335 }
8336
8337 // cancel and requeue proxy ops on this object
8338 if (!r) {
8339 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8340 it != proxyread_ops.end();) {
8341 if (it->second->soid == cobc->obs.oi.soid) {
8342 cancel_proxy_read((it++)->second, &tids);
8343 } else {
8344 ++it;
8345 }
8346 }
8347 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8348 it != proxywrite_ops.end();) {
8349 if (it->second->soid == cobc->obs.oi.soid) {
8350 cancel_proxy_write((it++)->second, &tids);
8351 } else {
8352 ++it;
8353 }
8354 }
8355 osd->objecter->op_cancel(tids, -ECANCELED);
8356 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8357 }
8358
8359 kick_object_context_blocked(cobc);
8360 }
8361
8362 void PrimaryLogPG::cancel_and_requeue_proxy_ops(hobject_t oid) {
8363 vector<ceph_tid_t> tids;
8364 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8365 it != proxyread_ops.end();) {
8366 if (it->second->soid == oid) {
8367 cancel_proxy_read((it++)->second, &tids);
8368 } else {
8369 ++it;
8370 }
8371 }
8372 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8373 it != proxywrite_ops.end();) {
8374 if (it->second->soid == oid) {
8375 cancel_proxy_write((it++)->second, &tids);
8376 } else {
8377 ++it;
8378 }
8379 }
8380 osd->objecter->op_cancel(tids, -ECANCELED);
8381 kick_proxy_ops_blocked(oid);
8382 }
8383
8384 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8385 {
8386 dout(20) << __func__ << " " << cop
8387 << " " << cop->attrs.size() << " attrs"
8388 << " " << cop->data.length() << " bytes"
8389 << " " << cop->omap_header.length() << " omap header bytes"
8390 << " " << cop->omap_data.length() << " omap data bytes"
8391 << dendl;
8392 if (!cop->temp_cursor.attr_complete) {
8393 t->create(cop->results.temp_oid);
8394 }
8395 if (!cop->temp_cursor.data_complete) {
8396 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8397 cop->cursor.data_offset);
8398 if (pool.info.requires_aligned_append() &&
8399 !cop->cursor.data_complete) {
8400 /**
8401 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8402 * to pick it up on the next pass.
8403 */
8404 assert(cop->temp_cursor.data_offset %
8405 pool.info.required_alignment() == 0);
8406 if (cop->data.length() % pool.info.required_alignment() != 0) {
8407 uint64_t to_trim =
8408 cop->data.length() % pool.info.required_alignment();
8409 bufferlist bl;
8410 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8411 cop->data.swap(bl);
8412 cop->cursor.data_offset -= to_trim;
8413 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8414 cop->cursor.data_offset);
8415 }
8416 }
8417 if (cop->data.length()) {
8418 t->write(
8419 cop->results.temp_oid,
8420 cop->temp_cursor.data_offset,
8421 cop->data.length(),
8422 cop->data,
8423 cop->dest_obj_fadvise_flags);
8424 }
8425 cop->data.clear();
8426 }
8427 if (pool.info.supports_omap()) {
8428 if (!cop->temp_cursor.omap_complete) {
8429 if (cop->omap_header.length()) {
8430 t->omap_setheader(
8431 cop->results.temp_oid,
8432 cop->omap_header);
8433 cop->omap_header.clear();
8434 }
8435 if (cop->omap_data.length()) {
8436 map<string,bufferlist> omap;
8437 bufferlist::iterator p = cop->omap_data.begin();
8438 ::decode(omap, p);
8439 t->omap_setkeys(cop->results.temp_oid, omap);
8440 cop->omap_data.clear();
8441 }
8442 }
8443 } else {
8444 assert(cop->omap_header.length() == 0);
8445 assert(cop->omap_data.length() == 0);
8446 }
8447 cop->temp_cursor = cop->cursor;
8448 }
8449
8450 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8451 {
8452 OpContext *ctx = cb->ctx;
8453 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8454
8455 ObjectState& obs = ctx->new_obs;
8456 if (obs.exists) {
8457 dout(20) << __func__ << ": exists, removing" << dendl;
8458 ctx->op_t->remove(obs.oi.soid);
8459 } else {
8460 ctx->delta_stats.num_objects++;
8461 obs.exists = true;
8462 }
8463 if (cb->is_temp_obj_used()) {
8464 ctx->discard_temp_oid = cb->results->temp_oid;
8465 }
8466 cb->results->fill_in_final_tx(ctx->op_t.get());
8467
8468 // CopyFromCallback fills this in for us
8469 obs.oi.user_version = ctx->user_at_version;
8470
8471 if (cb->results->is_data_digest()) {
8472 obs.oi.set_data_digest(cb->results->data_digest);
8473 } else {
8474 obs.oi.clear_data_digest();
8475 }
8476 if (cb->results->is_omap_digest()) {
8477 obs.oi.set_omap_digest(cb->results->omap_digest);
8478 } else {
8479 obs.oi.clear_omap_digest();
8480 }
8481
8482 obs.oi.truncate_seq = cb->results->truncate_seq;
8483 obs.oi.truncate_size = cb->results->truncate_size;
8484
8485 ctx->extra_reqids = cb->results->reqids;
8486
8487 // cache: clear whiteout?
8488 if (obs.oi.is_whiteout()) {
8489 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8490 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8491 --ctx->delta_stats.num_whiteouts;
8492 }
8493
8494 if (cb->results->has_omap) {
8495 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8496 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8497 } else {
8498 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8499 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8500 }
8501
8502 interval_set<uint64_t> ch;
8503 if (obs.oi.size > 0)
8504 ch.insert(0, obs.oi.size);
8505 ctx->modified_ranges.union_of(ch);
8506
8507 if (cb->get_data_size() != obs.oi.size) {
8508 ctx->delta_stats.num_bytes -= obs.oi.size;
8509 obs.oi.size = cb->get_data_size();
8510 ctx->delta_stats.num_bytes += obs.oi.size;
8511 }
8512 ctx->delta_stats.num_wr++;
8513 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8514
8515 osd->logger->inc(l_osd_copyfrom);
8516 }
8517
8518 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8519 ObjectContextRef obc)
8520 {
8521 const hobject_t& soid = obc->obs.oi.soid;
8522 dout(10) << __func__ << " " << soid << " r=" << r
8523 << " uv" << results->user_version << dendl;
8524
8525 if (r == -ECANCELED) {
8526 return;
8527 }
8528
8529 if (r != -ENOENT && soid.is_snap()) {
8530 if (results->snaps.empty()) {
8531 // we must have read "snap" content from the head object in
8532 // the base pool. use snap_seq to construct what snaps should
8533 // be for this clone (what it was before we evicted the clean
8534 // clone from this pool, and what it will be when we flush and
8535 // the clone eventually happens in the base pool).
8536 SnapSet& snapset = obc->ssc->snapset;
8537 vector<snapid_t>::iterator p = snapset.snaps.begin();
8538 while (p != snapset.snaps.end() && *p > soid.snap)
8539 ++p;
8540 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8541 results->snaps.push_back(*p);
8542 ++p;
8543 }
8544 }
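    // e.g. (hypothetical): snapset.snaps [8,6,4] (newest first) with
    // soid.snap 5 and snap_seq 3 reconstructs snaps [4] for this clone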
8545
8546 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8547 filter_snapc(results->snaps);
8548
8549 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8550 if (results->snaps.empty()) {
8551 dout(20) << __func__
8552 << " snaps are empty, clone is invalid,"
8553 << " setting r to ENOENT" << dendl;
8554 r = -ENOENT;
8555 }
8556 }
8557
8558 if (r < 0 && results->started_temp_obj) {
8559 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8560 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8561 assert(tempobc);
8562 OpContextUPtr ctx = simple_opc_create(tempobc);
8563 ctx->op_t->remove(results->temp_oid);
8564 simple_opc_submit(std::move(ctx));
8565 results->started_temp_obj = false;
8566 }
8567
8568 if (r == -ENOENT && soid.is_snap()) {
8569 dout(10) << __func__
8570 << ": enoent while trying to promote clone, " << soid
8571 << " must have been trimmed, removing from snapset"
8572 << dendl;
8573 hobject_t head(soid.get_head());
8574 ObjectContextRef obc = get_object_context(head, false);
8575 assert(obc);
8576
8577 OpContextUPtr tctx = simple_opc_create(obc);
8578 tctx->at_version = get_next_version();
8579 filter_snapc(tctx->new_snapset.snaps);
8580 vector<snapid_t> new_clones;
8581 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8582 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8583 i != tctx->new_snapset.clones.end();
8584 ++i) {
8585 if (*i != soid.snap) {
8586 new_clones.push_back(*i);
8587 auto p = tctx->new_snapset.clone_snaps.find(*i);
8588 if (p != tctx->new_snapset.clone_snaps.end()) {
8589 new_clone_snaps[*i] = p->second;
8590 }
8591 }
8592 }
8593 tctx->new_snapset.clones.swap(new_clones);
8594 tctx->new_snapset.clone_overlap.erase(soid.snap);
8595 tctx->new_snapset.clone_size.erase(soid.snap);
8596 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8597
8598 // take RWWRITE lock for duration of our local write. ignore starvation.
8599 if (!tctx->lock_manager.take_write_lock(
8600 head,
8601 obc)) {
8602 assert(0 == "problem!");
8603 }
8604 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8605
8606 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8607
8608 simple_opc_submit(std::move(tctx));
8609 return;
8610 }
8611
8612 bool whiteout = false;
8613 if (r == -ENOENT) {
8614 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8615 dout(10) << __func__ << " whiteout " << soid << dendl;
8616 whiteout = true;
8617 }
8618
8619 if (r < 0 && !whiteout) {
8620 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8621 // pass error to everyone blocked on this object
8622 // FIXME: this is pretty sloppy, but at this point we got
8623 // something unexpected and don't have many other options.
8624 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8625 waiting_for_blocked_object.find(soid);
8626 if (blocked_iter != waiting_for_blocked_object.end()) {
8627 while (!blocked_iter->second.empty()) {
8628 osd->reply_op_error(blocked_iter->second.front(), r);
8629 blocked_iter->second.pop_front();
8630 }
8631 waiting_for_blocked_object.erase(blocked_iter);
8632 }
8633 return;
8634 }
8635
8636 osd->promote_finish(results->object_size);
8637
8638 OpContextUPtr tctx = simple_opc_create(obc);
8639 tctx->at_version = get_next_version();
8640
8641 ++tctx->delta_stats.num_objects;
8642 if (soid.snap < CEPH_NOSNAP)
8643 ++tctx->delta_stats.num_object_clones;
8644 tctx->new_obs.exists = true;
8645
8646 tctx->extra_reqids = results->reqids;
8647
8648 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8649 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8650
8651 if (whiteout) {
8652 // create a whiteout
8653 tctx->op_t->create(soid);
8654 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8655 ++tctx->delta_stats.num_whiteouts;
8656 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8657 osd->logger->inc(l_osd_tier_whiteout);
8658 } else {
8659 if (results->has_omap) {
8660 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8661 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8662 ++tctx->delta_stats.num_objects_omap;
8663 }
8664
8665 results->fill_in_final_tx(tctx->op_t.get());
8666 if (results->started_temp_obj) {
8667 tctx->discard_temp_oid = results->temp_oid;
8668 }
8669 tctx->new_obs.oi.size = results->object_size;
8670 tctx->new_obs.oi.user_version = results->user_version;
8671 if (results->is_data_digest()) {
8672 tctx->new_obs.oi.set_data_digest(results->data_digest);
8673 } else {
8674 tctx->new_obs.oi.clear_data_digest();
8675 }
8676 if (results->is_omap_digest()) {
8677 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8678 } else {
8679 tctx->new_obs.oi.clear_omap_digest();
8680 }
8681 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8682 tctx->new_obs.oi.truncate_size = results->truncate_size;
8683
8684 if (soid.snap != CEPH_NOSNAP) {
8685 if (legacy_snapset) {
8686 tctx->new_obs.oi.legacy_snaps = results->snaps;
8687 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8688 } else {
8689 // it's already in the snapset
8690 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8691 }
8692 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8693 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8694 results->object_size);
8695 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8696
8697 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8698 } else {
8699 tctx->delta_stats.num_bytes += results->object_size;
8700 }
8701 }
8702
8703 if (results->mirror_snapset) {
8704 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8705 tctx->new_snapset.from_snap_set(
8706 results->snapset,
8707 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8708 }
8709 tctx->new_snapset.head_exists = true;
8710 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8711
8712 // take RWWRITE lock for duration of our local write. ignore starvation.
8713 if (!tctx->lock_manager.take_write_lock(
8714 obc->obs.oi.soid,
8715 obc)) {
8716 assert(0 == "problem!");
8717 }
8718 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8719
8720 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8721
8722 simple_opc_submit(std::move(tctx));
8723
8724 osd->logger->inc(l_osd_tier_promote);
8725
8726 if (agent_state &&
8727 agent_state->is_idle())
8728 agent_choose_mode();
8729 }
8730
8731 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue,
8732 vector<ceph_tid_t> *tids)
8733 {
8734 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8735 << " from " << cop->src << " " << cop->oloc
8736 << " v" << cop->results.user_version << dendl;
8737
8738 // cancel objecter op, if we can
8739 if (cop->objecter_tid) {
8740 tids->push_back(cop->objecter_tid);
8741 cop->objecter_tid = 0;
8742 if (cop->objecter_tid2) {
8743 tids->push_back(cop->objecter_tid2);
8744 cop->objecter_tid2 = 0;
8745 }
8746 }
8747
8748 copy_ops.erase(cop->obc->obs.oi.soid);
8749 cop->obc->stop_block();
8750
8751 kick_object_context_blocked(cop->obc);
8752 cop->results.should_requeue = requeue;
8753 CopyCallbackResults result(-ECANCELED, &cop->results);
8754 cop->cb->complete(result);
8755
8756 // There may still be an objecter callback referencing this copy op.
8757 // That callback will not need the obc since it's been canceled, and
8758 // we need the obc reference to go away prior to flush.
8759 cop->obc = ObjectContextRef();
8760 }
8761
8762 void PrimaryLogPG::cancel_copy_ops(bool requeue, vector<ceph_tid_t> *tids)
8763 {
8764 dout(10) << __func__ << dendl;
8765 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8766 while (p != copy_ops.end()) {
8767 // requeue this op? can I queue up all of them?
8768 cancel_copy((p++)->second, requeue, tids);
8769 }
8770 }
8771
8772
8773 // ========================================================================
8774 // flush
8775 //
8776 // Flush a dirty object in the cache tier by writing it back to the
8777 // base tier. The sequence looks like:
8778 //
8779 // * send a copy-from operation to the base tier to copy the current
8780 // version of the object
8781 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8782 // * on completion, we check if the object has been modified. if so,
8783 // just reply with -EAGAIN.
8784 // * try to take a write lock so we can clear the dirty flag. if this
8785 // fails, wait and retry
8786 // * start a repop that clears the bit.
8787 //
8788 // If we have to wait, we will retry by coming back through the
8789 // start_flush method. We check if a flush is already in progress
8790 // and, if so, try to finish it by rechecking the version and trying
8791 // to clear the dirty bit.
8792 //
8793 // In order for the cache-flush (a write op) to not block the copy-get
8794 // from reading the object, the client *must* set the SKIPRWLOCKS
8795 // flag.
8796 //
8797 // NOTE: normally writes are strictly ordered for the client, but
8798 // flushes are special in that they can be reordered with respect to
8799 // other writes. In particular, we can't have a flush request block
8800 // an update to the cache pool object!
8801
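// A minimal client-side sketch (librados C++ API, in the style of the
// tiering tests) of a non-blocking cache-try-flush that sets the
// SKIPRWLOCKS flag required above; the "foo" object and the
// cluster/cache_ioctx handles are hypothetical:
//
//   librados::ObjectReadOperation op;
//   op.cache_try_flush();
//   librados::AioCompletion *c = cluster.aio_create_completion();
//   int r = cache_ioctx.aio_operate(
//     "foo", c, &op,
//     librados::OPERATION_IGNORE_OVERLAY | librados::OPERATION_SKIPRWLOCKS,
//     NULL);
//   c->wait_for_complete();
//   r = c->get_return_value();  // -EBUSY if a racing write re-dirtied it
//   c->release();
//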
8802 struct C_Flush : public Context {
8803 PrimaryLogPGRef pg;
8804 hobject_t oid;
8805 epoch_t last_peering_reset;
8806 ceph_tid_t tid;
8807 utime_t start;
8808 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8809 : pg(p), oid(o), last_peering_reset(lpr),
8810 tid(0), start(ceph_clock_now())
8811 {}
8812 void finish(int r) override {
8813 if (r == -ECANCELED)
8814 return;
8815 pg->lock();
8816 if (last_peering_reset == pg->get_last_peering_reset()) {
8817 pg->finish_flush(oid, tid, r);
8818 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8819 }
8820 pg->unlock();
8821 }
8822 };
8823
8824 int PrimaryLogPG::start_flush(
8825 OpRequestRef op, ObjectContextRef obc,
8826 bool blocking, hobject_t *pmissing,
8827 boost::optional<std::function<void()>> &&on_flush)
8828 {
8829 const object_info_t& oi = obc->obs.oi;
8830 const hobject_t& soid = oi.soid;
8831 dout(10) << __func__ << " " << soid
8832 << " v" << oi.version
8833 << " uv" << oi.user_version
8834 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8835 << dendl;
8836
8837 // get a filtered snapset; we need to drop any snaps that have been removed
8838 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8839
8840 // verify there are no older dirty clones
8841 {
8842 dout(20) << " snapset " << snapset << dendl;
8843 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8844 while (p != snapset.clones.rend() && *p >= soid.snap)
8845 ++p;
8846 if (p != snapset.clones.rend()) {
8847 hobject_t next = soid;
8848 next.snap = *p;
8849 assert(next.snap < soid.snap);
8850 if (pg_log.get_missing().is_missing(next)) {
8851 dout(10) << __func__ << " missing clone is " << next << dendl;
8852 if (pmissing)
8853 *pmissing = next;
8854 return -ENOENT;
8855 }
8856 ObjectContextRef older_obc = get_object_context(next, false);
8857 if (older_obc) {
8858 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8859 << dendl;
8860 if (older_obc->obs.oi.is_dirty()) {
8861 dout(10) << __func__ << " next oldest clone is dirty: "
8862 << older_obc->obs.oi << dendl;
8863 return -EBUSY;
8864 }
8865 } else {
8866 dout(20) << __func__ << " next oldest clone " << next
8867 << " is not present; implicitly clean" << dendl;
8868 }
8869 } else {
8870 dout(20) << __func__ << " no older clones" << dendl;
8871 }
8872 }
8873
8874 if (blocking)
8875 obc->start_block();
8876
8877 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8878 if (p != flush_ops.end()) {
8879 FlushOpRef fop = p->second;
8880 if (fop->op == op) {
8881 // we couldn't take the write lock on a cache-try-flush before;
8882 // now we are trying again for the lock.
8883 return try_flush_mark_clean(fop);
8884 }
8885 if (fop->flushed_version == obc->obs.oi.user_version &&
8886 (fop->blocking || !blocking)) {
8887 // nonblocking can join anything
8888 // blocking can only join a blocking flush
8889 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8890 if (op)
8891 fop->dup_ops.push_back(op);
8892 return -EAGAIN; // clean up this ctx; op will retry later
8893 }
8894
8895 // cancel current flush since it will fail anyway, or because we
8896 // are blocking and the existing flush is nonblocking.
8897 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8898 if (fop->op)
8899 osd->reply_op_error(fop->op, -EBUSY);
8900 while (!fop->dup_ops.empty()) {
8901 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8902 fop->dup_ops.pop_front();
8903 }
8904 vector<ceph_tid_t> tids;
8905 cancel_flush(fop, false, &tids);
8906 osd->objecter->op_cancel(tids, -ECANCELED);
8907 }
8908
8909 /**
8910 * In general, we need to send a delete and a copyfrom.
8911 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8912 * where 4 is marked as clean. To flush 10, we have to:
8913 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8914 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8915 *
8916 * There is a complicating case. Suppose there had been a clone 7
8917 * for snaps [7, 6] which has since been trimmed because those snaps no longer exist.
8918 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8919 * the delete, the snap will be promoted to 5, and the head will become
8920 * a snapdir. When the copy-from goes through, we'll end up with
8921 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8922 *
8923 * Another complication is the case where there is an interval change
8924 * after doing the delete and the flush but before marking the object
8925 * clean. We'll happily delete head and then recreate it at the same
8926 * sequence number, which works out ok.
8927 */
8928
8929 SnapContext snapc, dsnapc;
8930 if (snapset.seq != 0) {
8931 if (soid.snap == CEPH_NOSNAP) {
8932 snapc.seq = snapset.seq;
8933 snapc.snaps = snapset.snaps;
8934 } else {
8935 snapid_t min_included_snap;
8936 if (snapset.is_legacy()) {
8937 min_included_snap = oi.legacy_snaps.back();
8938 } else {
8939 auto p = snapset.clone_snaps.find(soid.snap);
8940 assert(p != snapset.clone_snaps.end());
8941 min_included_snap = p->second.back();
8942 }
8943 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8944 }
8945
8946 snapid_t prev_snapc = 0;
8947 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8948 citer != snapset.clones.rend();
8949 ++citer) {
8950 if (*citer < soid.snap) {
8951 prev_snapc = *citer;
8952 break;
8953 }
8954 }
8955
8956 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8957 }
8958
8959 object_locator_t base_oloc(soid);
8960 base_oloc.pool = pool.info.tier_of;
8961
8962 if (dsnapc.seq < snapc.seq) {
8963 ObjectOperation o;
8964 o.remove();
8965 osd->objecter->mutate(
8966 soid.oid,
8967 base_oloc,
8968 o,
8969 dsnapc,
8970 ceph::real_clock::from_ceph_timespec(oi.mtime),
8971 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8972 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8973 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8974 }
8975
8976 FlushOpRef fop(std::make_shared<FlushOp>());
8977 fop->obc = obc;
8978 fop->flushed_version = oi.user_version;
8979 fop->blocking = blocking;
8980 fop->on_flush = std::move(on_flush);
8981 fop->op = op;
8982
8983 ObjectOperation o;
8984 if (oi.is_whiteout()) {
8985 fop->removal = true;
8986 o.remove();
8987 } else {
8988 object_locator_t oloc(soid);
8989 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8990 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8991 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8992 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8993 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8994 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8995
8996 // hint that the base tier need not cache the data after this flush
8997 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8998 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8999 }
9000 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
9001
9002 ceph_tid_t tid = osd->objecter->mutate(
9003 soid.oid, base_oloc, o, snapc,
9004 ceph::real_clock::from_ceph_timespec(oi.mtime),
9005 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
9006 new C_OnFinisher(fin,
9007 &osd->objecter_finisher));
9008 /* we're under the pg lock and fin->finish() is grabbing that */
9009 fin->tid = tid;
9010 fop->objecter_tid = tid;
9011
9012 flush_ops[soid] = fop;
9013 info.stats.stats.sum.num_flush++;
9014 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
9015 return -EINPROGRESS;
9016 }
9017
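/*
 * Objecter completion for the flush started above. A tid mismatch means
 * this FlushOp was restarted or superseded, so the stale completion is
 * ignored; -ENOENT is tolerated when the flush was a removal.
 */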
9018 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
9019 {
9020 dout(10) << __func__ << " " << oid << " tid " << tid
9021 << " " << cpp_strerror(r) << dendl;
9022 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
9023 if (p == flush_ops.end()) {
9024 dout(10) << __func__ << " no flush_op found" << dendl;
9025 return;
9026 }
9027 FlushOpRef fop = p->second;
9028 if (tid != fop->objecter_tid) {
9029 dout(10) << __func__ << " tid " << tid << " != fop " << fop
9030 << " tid " << fop->objecter_tid << dendl;
9031 return;
9032 }
9033 ObjectContextRef obc = fop->obc;
9034 fop->objecter_tid = 0;
9035
9036 if (r < 0 && !(r == -ENOENT && fop->removal)) {
9037 if (fop->op)
9038 osd->reply_op_error(fop->op, -EBUSY);
9039 if (fop->blocking) {
9040 obc->stop_block();
9041 kick_object_context_blocked(obc);
9042 }
9043
9044 if (!fop->dup_ops.empty()) {
9045 dout(20) << __func__ << " requeueing dups" << dendl;
9046 requeue_ops(fop->dup_ops);
9047 }
9048 if (fop->on_flush) {
9049 (*(fop->on_flush))();
9050 fop->on_flush = boost::none;
9051 }
9052 flush_ops.erase(oid);
9053 return;
9054 }
9055
9056 r = try_flush_mark_clean(fop);
9057 if (r == -EBUSY && fop->op) {
9058 osd->reply_op_error(fop->op, r);
9059 }
9060 }
9061
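/*
 * Attempt to mark the object clean after a successful flush. Return
 * values, as implemented below: -EBUSY if the object changed while the
 * flush was in flight, -EAGAIN if we must wait for the write lock or for
 * scrub, -ECANCELED if the attempt is abandoned, 0 if the object was
 * flushed and immediately evicted, and -EINPROGRESS once the
 * clean-marking repop has been submitted.
 */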
9062 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
9063 {
9064 ObjectContextRef obc = fop->obc;
9065 const hobject_t& oid = obc->obs.oi.soid;
9066
9067 if (fop->blocking) {
9068 obc->stop_block();
9069 kick_object_context_blocked(obc);
9070 }
9071
9072 if (fop->flushed_version != obc->obs.oi.user_version ||
9073 !obc->obs.exists) {
9074 if (obc->obs.exists)
9075 dout(10) << __func__ << " flushed_version " << fop->flushed_version
9076 << " != current " << obc->obs.oi.user_version
9077 << dendl;
9078 else
9079 dout(10) << __func__ << " object no longer exists" << dendl;
9080
9081 if (!fop->dup_ops.empty()) {
9082 dout(20) << __func__ << " requeueing dups" << dendl;
9083 requeue_ops(fop->dup_ops);
9084 }
9085 if (fop->on_flush) {
9086 (*(fop->on_flush))();
9087 fop->on_flush = boost::none;
9088 }
9089 flush_ops.erase(oid);
9090 if (fop->blocking)
9091 osd->logger->inc(l_osd_tier_flush_fail);
9092 else
9093 osd->logger->inc(l_osd_tier_try_flush_fail);
9094 return -EBUSY;
9095 }
9096
9097 if (!fop->blocking &&
9098 write_blocked_by_scrub(oid)) {
9099 if (fop->op) {
9100 dout(10) << __func__ << " blocked by scrub" << dendl;
9101 requeue_op(fop->op);
9102 requeue_ops(fop->dup_ops);
9103 return -EAGAIN; // will retry
9104 } else {
9105 osd->logger->inc(l_osd_tier_try_flush_fail);
9106 vector<ceph_tid_t> tids;
9107 cancel_flush(fop, false, &tids);
9108 osd->objecter->op_cancel(tids, -ECANCELED);
9109 return -ECANCELED;
9110 }
9111 }
9112
9113 // successfully flushed, can we evict this object?
9114 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
9115 agent_maybe_evict(obc, true)) {
9116 osd->logger->inc(l_osd_tier_clean);
9117 if (fop->on_flush) {
9118 (*(fop->on_flush))();
9119 fop->on_flush = boost::none;
9120 }
9121 flush_ops.erase(oid);
9122 return 0;
9123 }
9124
9125 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9126 OpContextUPtr ctx = simple_opc_create(fop->obc);
9127
9128 // successfully flushed; can we clear the dirty bit?
9129 // take the write lock manually, since this ctx was not created
9130 // through the normal op pipeline that would have acquired it for us.
9131 if (ctx->lock_manager.get_lock_type(
9132 ObjectContext::RWState::RWWRITE,
9133 oid,
9134 obc,
9135 fop->op)) {
9136 dout(20) << __func__ << " took write lock" << dendl;
9137 } else if (fop->op) {
9138 dout(10) << __func__ << " waiting on write lock " << fop->op << " "
9139 << fop->dup_ops << dendl;
9140 // fop->op is now waiting on the lock; get fop->dup_ops to wait too.
9141 for (auto op : fop->dup_ops) {
9142 bool locked = ctx->lock_manager.get_lock_type(
9143 ObjectContext::RWState::RWWRITE,
9144 oid,
9145 obc,
9146 op);
9147 assert(!locked);
9148 }
9149 close_op_ctx(ctx.release()); // release ctx only after the dups are queued as waiters
9150 return -EAGAIN; // will retry
9151 } else {
9152 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9153 close_op_ctx(ctx.release());
9154 osd->logger->inc(l_osd_tier_try_flush_fail);
9155 vector<ceph_tid_t> tids;
9156 cancel_flush(fop, false, &tids);
9157 osd->objecter->op_cancel(tids, -ECANCELED);
9158 return -ECANCELED;
9159 }
9160
9161 if (fop->on_flush) {
9162 ctx->register_on_finish(*(fop->on_flush));
9163 fop->on_flush = boost::none;
9164 }
9165
9166 ctx->at_version = get_next_version();
9167
9168 ctx->new_obs = obc->obs;
9169 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9170 --ctx->delta_stats.num_objects_dirty;
9171
9172 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9173
9174 osd->logger->inc(l_osd_tier_clean);
9175
9176 if (!fop->dup_ops.empty() || fop->op) {
9177 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9178 list<OpRequestRef> ls;
9179 if (fop->op)
9180 ls.push_back(fop->op);
9181 ls.splice(ls.end(), fop->dup_ops);
9182 requeue_ops(ls);
9183 }
9184
9185 simple_opc_submit(std::move(ctx));
9186
9187 flush_ops.erase(oid);
9188
9189 if (fop->blocking)
9190 osd->logger->inc(l_osd_tier_flush);
9191 else
9192 osd->logger->inc(l_osd_tier_try_flush);
9193
9194 return -EINPROGRESS;
9195 }
9196
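/*
 * Abort an in-flight FlushOp: collect its objecter tids into *tids for
 * the caller to cancel, unblock the obc if we were blocking, and
 * optionally requeue the originating op and any dups.
 */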
9197 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue,
9198 vector<ceph_tid_t> *tids)
9199 {
9200 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9201 << fop->objecter_tid << dendl;
9202 if (fop->objecter_tid) {
9203 tids->push_back(fop->objecter_tid);
9204 fop->objecter_tid = 0;
9205 }
9206 if (fop->io_tids.size()) {
9207 for (auto &p : fop->io_tids) {
9208 tids->push_back(p.second);
9209 p.second = 0;
9210 }
9211 }
9212 if (fop->blocking && fop->obc->is_blocked()) {
9213 fop->obc->stop_block();
9214 kick_object_context_blocked(fop->obc);
9215 }
9216 if (requeue) {
9217 if (fop->op)
9218 requeue_op(fop->op);
9219 requeue_ops(fop->dup_ops);
9220 }
9221 if (fop->on_flush) {
9222 (*(fop->on_flush))();
9223 fop->on_flush = boost::none;
9224 }
9225 flush_ops.erase(fop->obc->obs.oi.soid);
9226 }
9227
9228 void PrimaryLogPG::cancel_flush_ops(bool requeue, vector<ceph_tid_t> *tids)
9229 {
9230 dout(10) << __func__ << dendl;
9231 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9232 while (p != flush_ops.end()) {
9233 cancel_flush((p++)->second, requeue, tids);
9234 }
9235 }
9236
9237 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9238 {
9239 if (!pool.info.allow_incomplete_clones())
9240 return true;
9241 if (is_missing_object(coid))
9242 return true;
9243 ObjectContextRef obc = get_object_context(coid, false);
9244 return obc && obc->obs.exists;
9245 }
9246
9247 // ========================================================================
9248 // rep op gather
9249
9250 class C_OSD_RepopApplied : public Context {
9251 PrimaryLogPGRef pg;
9252 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9253 public:
9254 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9255 : pg(pg), repop(repop) {}
9256 void finish(int) override {
9257 pg->repop_all_applied(repop.get());
9258 }
9259 };
9260
9261
9262 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9263 {
9264 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9265 << dendl;
9266 assert(!repop->applies_with_commit);
9267 repop->all_applied = true;
9268 if (!repop->rep_aborted) {
9269 eval_repop(repop);
9270 }
9271 }
9272
9273 class C_OSD_RepopCommit : public Context {
9274 PrimaryLogPGRef pg;
9275 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9276 public:
9277 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9278 : pg(pg), repop(repop) {}
9279 void finish(int) override {
9280 pg->repop_all_committed(repop.get());
9281 }
9282 };
9283
9284 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9285 {
9286 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9287 << dendl;
9288 repop->all_committed = true;
9289 if (repop->applies_with_commit) {
9290 assert(!repop->all_applied);
9291 repop->all_applied = true;
9292 }
9293
9294 if (!repop->rep_aborted) {
9295 if (repop->v != eversion_t()) {
9296 last_update_ondisk = repop->v;
9297 last_complete_ondisk = repop->pg_local_last_complete;
9298 }
9299 eval_repop(repop);
9300 }
9301 }
9302
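// Called once a local transaction has been applied; advances
// last_update_applied, which gates both primary-side scrub chunks and
// replica scrub replies below.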
9303 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9304 {
9305 dout(10) << "op_applied version " << applied_version << dendl;
9306 if (applied_version == eversion_t())
9307 return;
9308 assert(applied_version > last_update_applied);
9309 assert(applied_version <= info.last_update);
9310 last_update_applied = applied_version;
9311 if (is_primary()) {
9312 if (scrubber.active) {
9313 if (last_update_applied >= scrubber.subset_last_update) {
9314 if (ops_blocked_by_scrub()) {
9315 requeue_scrub(true);
9316 } else {
9317 requeue_scrub(false);
9318 }
9319
9320 }
9321 } else {
9322 assert(scrubber.start == scrubber.end);
9323 }
9324 } else {
9325 if (scrubber.active_rep_scrub) {
9326 if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9327 scrubber.active_rep_scrub->get_req())->scrub_to) {
9328 osd->enqueue_back(
9329 info.pgid,
9330 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9331 scrubber.active_rep_scrub = OpRequestRef();
9332 }
9333 }
9334 }
9335 }
9336
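/*
 * Drive a RepGather through its lifecycle: once all replicas have
 * committed, run the on_committed callbacks and reply to any queued dup
 * ops; once all have applied, run on_applied; and when both are done,
 * pop finished repops off the front of the (ordered) repop_queue.
 */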
9337 void PrimaryLogPG::eval_repop(RepGather *repop)
9338 {
9339 const MOSDOp *m = NULL;
9340 if (repop->op)
9341 m = static_cast<const MOSDOp *>(repop->op->get_req());
9342
9343 if (m)
9344 dout(10) << "eval_repop " << *repop
9345 << (repop->rep_done ? " DONE" : "")
9346 << dendl;
9347 else
9348 dout(10) << "eval_repop " << *repop << " (no op)"
9349 << (repop->rep_done ? " DONE" : "")
9350 << dendl;
9351
9352 if (repop->rep_done)
9353 return;
9354
9355 // ondisk?
9356 if (repop->all_committed) {
9357 dout(10) << " commit: " << *repop << dendl;
9358 for (auto p = repop->on_committed.begin();
9359 p != repop->on_committed.end();
9360 repop->on_committed.erase(p++)) {
9361 (*p)();
9362 }
9363 // send dup commits, in order
9364 if (waiting_for_ondisk.count(repop->v)) {
9365 assert(waiting_for_ondisk.begin()->first == repop->v);
9366 for (list<pair<OpRequestRef, version_t> >::iterator i =
9367 waiting_for_ondisk[repop->v].begin();
9368 i != waiting_for_ondisk[repop->v].end();
9369 ++i) {
9370 osd->reply_op_error(i->first, repop->r, repop->v,
9371 i->second);
9372 }
9373 waiting_for_ondisk.erase(repop->v);
9374 }
9375 }
9376
9377 // applied?
9378 if (repop->all_applied) {
9379 if (repop->applies_with_commit) {
9380 assert(repop->on_applied.empty());
9381 }
9382 dout(10) << " applied: " << *repop << " " << dendl;
9383 for (auto p = repop->on_applied.begin();
9384 p != repop->on_applied.end();
9385 repop->on_applied.erase(p++)) {
9386 (*p)();
9387 }
9388 }
9389
9390 // done.
9391 if (repop->all_applied && repop->all_committed) {
9392 repop->rep_done = true;
9393
9394 publish_stats_to_osd();
9395 calc_min_last_complete_ondisk();
9396
9397 dout(10) << " removing " << *repop << dendl;
9398 assert(!repop_queue.empty());
9399 dout(20) << " q front is " << *repop_queue.front() << dendl;
9400 if (repop_queue.front() != repop) {
9401 if (!repop->applies_with_commit) {
9402 dout(0) << " removing " << *repop << dendl;
9403 dout(0) << " q front is " << *repop_queue.front() << dendl;
9404 assert(repop_queue.front() == repop);
9405 }
9406 } else {
9407 RepGather *to_remove = nullptr;
9408 while (!repop_queue.empty() &&
9409 (to_remove = repop_queue.front())->rep_done) {
9410 repop_queue.pop_front();
9411 for (auto p = to_remove->on_success.begin();
9412 p != to_remove->on_success.end();
9413 to_remove->on_success.erase(p++)) {
9414 (*p)();
9415 }
9416 remove_repop(to_remove);
9417 }
9418 }
9419 }
9420 }
9421
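// Hand the prepared transaction to the backend. We optimistically bump
// each peer's last_update/last_complete here (the repop only completes
// once all of actingbackfill commit), take ondisk write locks that are
// released by onapplied_sync, and extend the projected log.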
9422 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9423 {
9424 FUNCTRACE();
9425 const hobject_t& soid = ctx->obs->oi.soid;
9426 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9427 << " o " << soid
9428 << dendl;
9429
9430 repop->v = ctx->at_version;
9431 if (ctx->at_version > eversion_t()) {
9432 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9433 i != actingbackfill.end();
9434 ++i) {
9435 if (*i == get_primary()) continue;
9436 pg_info_t &pinfo = peer_info[*i];
9437 // keep peer_info up to date
9438 if (pinfo.last_complete == pinfo.last_update)
9439 pinfo.last_complete = ctx->at_version;
9440 pinfo.last_update = ctx->at_version;
9441 }
9442 }
9443
9444 ctx->obc->ondisk_write_lock();
9445
9446 bool unlock_snapset_obc = false;
9447 ctx->op_t->add_obc(ctx->obc);
9448 if (ctx->clone_obc) {
9449 ctx->clone_obc->ondisk_write_lock();
9450 ctx->op_t->add_obc(ctx->clone_obc);
9451 }
9452 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9453 ctx->obc->obs.oi.soid) {
9454 ctx->snapset_obc->ondisk_write_lock();
9455 unlock_snapset_obc = true;
9456 ctx->op_t->add_obc(ctx->snapset_obc);
9457 }
9458
9459 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9460 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9461 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9462 ctx->obc,
9463 ctx->clone_obc,
9464 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9465 if (!(ctx->log.empty())) {
9466 assert(ctx->at_version >= projected_last_update);
9467 projected_last_update = ctx->at_version;
9468 }
9469 for (auto &&entry: ctx->log) {
9470 projected_log.add(entry);
9471 }
9472 pgbackend->submit_transaction(
9473 soid,
9474 ctx->delta_stats,
9475 ctx->at_version,
9476 std::move(ctx->op_t),
9477 pg_trim_to,
9478 min_last_complete_ondisk,
9479 ctx->log,
9480 ctx->updated_hset_history,
9481 onapplied_sync,
9482 on_all_applied,
9483 on_all_commit,
9484 repop->rep_tid,
9485 ctx->reqid,
9486 ctx->op);
9487 }
9488
9489 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9490 OpContext *ctx, ObjectContextRef obc,
9491 ceph_tid_t rep_tid)
9492 {
9493 if (ctx->op)
9494 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9495 else
9496 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9497
9498 RepGather *repop = new RepGather(
9499 ctx, rep_tid, info.last_complete, false);
9500
9501 repop->start = ceph_clock_now();
9502
9503 repop_queue.push_back(&repop->queue_item);
9504 repop->get();
9505
9506 osd->logger->inc(l_osd_op_wip);
9507
9508 dout(10) << __func__ << ": " << *repop << dendl;
9509 return repop;
9510 }
9511
9512 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9513 eversion_t version,
9514 int r,
9515 ObcLockManager &&manager,
9516 OpRequestRef &&op,
9517 boost::optional<std::function<void(void)> > &&on_complete)
9518 {
9519 RepGather *repop = new RepGather(
9520 std::move(manager),
9521 std::move(op),
9522 std::move(on_complete),
9523 osd->get_tid(),
9524 info.last_complete,
9525 true,
9526 r);
9527 repop->v = version;
9528
9529 repop->start = ceph_clock_now();
9530
9531 repop_queue.push_back(&repop->queue_item);
9532
9533 osd->logger->inc(l_osd_op_wip);
9534
9535 dout(10) << __func__ << ": " << *repop << dendl;
9536 return boost::intrusive_ptr<RepGather>(repop);
9537 }
9538
9539 void PrimaryLogPG::remove_repop(RepGather *repop)
9540 {
9541 dout(20) << __func__ << " " << *repop << dendl;
9542
9543 for (auto p = repop->on_finish.begin();
9544 p != repop->on_finish.end();
9545 repop->on_finish.erase(p++)) {
9546 (*p)();
9547 }
9548
9549 release_object_locks(
9550 repop->lock_manager);
9551 repop->put();
9552
9553 osd->logger->dec(l_osd_op_wip);
9554 }
9555
9556 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9557 {
9558 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9559 ceph_tid_t rep_tid = osd->get_tid();
9560 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9561 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9562 ctx->op_t.reset(new PGTransaction());
9563 ctx->mtime = ceph_clock_now();
9564 return ctx;
9565 }
9566
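// simple_opc_create() above and simple_opc_submit() below form the
// pattern used for internal mutations in this file (watch timeouts,
// clearing the DIRTY flag, etc.); a minimal sketch of the pattern:
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   /* ... adjust ctx->new_obs and/or ctx->op_t, append to ctx->log ... */
//   simple_opc_submit(std::move(ctx));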
9567 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9568 {
9569 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9570 dout(20) << __func__ << " " << repop << dendl;
9571 issue_repop(repop, ctx.get());
9572 eval_repop(repop);
9573 calc_trim_to();
9574 repop->put();
9575 }
9576
9577
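/*
 * Persist log entries that are not tied to an object write (e.g. marking
 * unfound objects lost). On clusters at or above jewel this sends
 * MOSDPGUpdateLogMissing and waits for every shard (including ourselves)
 * to commit; on older clusters it falls back to sending an MOSDPGLog to
 * each peer.
 */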
9578 void PrimaryLogPG::submit_log_entries(
9579 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9580 ObcLockManager &&manager,
9581 boost::optional<std::function<void(void)> > &&_on_complete,
9582 OpRequestRef op,
9583 int r)
9584 {
9585 dout(10) << __func__ << " " << entries << dendl;
9586 assert(is_primary());
9587
9588 eversion_t version;
9589 if (!entries.empty()) {
9590 assert(entries.rbegin()->version >= projected_last_update);
9591 version = projected_last_update = entries.rbegin()->version;
9592 }
9593
9594 boost::intrusive_ptr<RepGather> repop;
9595 boost::optional<std::function<void(void)> > on_complete;
9596 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9597 repop = new_repop(
9598 version,
9599 r,
9600 std::move(manager),
9601 std::move(op),
9602 std::move(_on_complete));
9603 } else {
9604 on_complete = std::move(_on_complete);
9605 }
9606
9607 pgbackend->call_write_ordered(
9608 [this, entries, repop, on_complete]() {
9609 ObjectStore::Transaction t;
9610 eversion_t old_last_update = info.last_update;
9611 merge_new_log_entries(entries, t, pg_trim_to, min_last_complete_ondisk);
9612
9613
9614 set<pg_shard_t> waiting_on;
9615 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9616 i != actingbackfill.end();
9617 ++i) {
9618 pg_shard_t peer(*i);
9619 if (peer == pg_whoami) continue;
9620 assert(peer_missing.count(peer));
9621 assert(peer_info.count(peer));
9622 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9623 assert(repop);
9624 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9625 entries,
9626 spg_t(info.pgid.pgid, i->shard),
9627 pg_whoami.shard,
9628 get_osdmap()->get_epoch(),
9629 last_peering_reset,
9630 repop->rep_tid,
9631 pg_trim_to,
9632 min_last_complete_ondisk);
9633 osd->send_message_osd_cluster(
9634 peer.osd, m, get_osdmap()->get_epoch());
9635 waiting_on.insert(peer);
9636 } else {
9637 MOSDPGLog *m = new MOSDPGLog(
9638 peer.shard, pg_whoami.shard,
9639 info.last_update.epoch,
9640 info);
9641 m->log.log = entries;
9642 m->log.tail = old_last_update;
9643 m->log.head = info.last_update;
9644 osd->send_message_osd_cluster(
9645 peer.osd, m, get_osdmap()->get_epoch());
9646 }
9647 }
9648 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9649 ceph_tid_t rep_tid = repop->rep_tid;
9650 waiting_on.insert(pg_whoami);
9651 log_entry_update_waiting_on.insert(
9652 make_pair(
9653 rep_tid,
9654 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9655 ));
9656 struct OnComplete : public Context {
9657 PrimaryLogPGRef pg;
9658 ceph_tid_t rep_tid;
9659 epoch_t epoch;
9660 OnComplete(
9661 PrimaryLogPGRef pg,
9662 ceph_tid_t rep_tid,
9663 epoch_t epoch)
9664 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9665 void finish(int) override {
9666 pg->lock();
9667 if (!pg->pg_has_reset_since(epoch)) {
9668 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9669 assert(it != pg->log_entry_update_waiting_on.end());
9670 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9671 assert(it2 != it->second.waiting_on.end());
9672 it->second.waiting_on.erase(it2);
9673 if (it->second.waiting_on.empty()) {
9674 pg->repop_all_committed(it->second.repop.get());
9675 pg->log_entry_update_waiting_on.erase(it);
9676 }
9677 }
9678 pg->unlock();
9679 }
9680 };
9681 t.register_on_commit(
9682 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9683 } else {
9684 if (on_complete) {
9685 struct OnComplete : public Context {
9686 PrimaryLogPGRef pg;
9687 std::function<void(void)> on_complete;
9688 epoch_t epoch;
9689 OnComplete(
9690 PrimaryLogPGRef pg,
9691 const std::function<void(void)> &on_complete,
9692 epoch_t epoch)
9693 : pg(pg),
9694 on_complete(std::move(on_complete)),
9695 epoch(epoch) {}
9696 void finish(int) override {
9697 pg->lock();
9698 if (!pg->pg_has_reset_since(epoch))
9699 on_complete();
9700 pg->unlock();
9701 }
9702 };
9703 t.register_on_complete(
9704 new OnComplete{
9705 this, *on_complete, get_osdmap()->get_epoch()
9706 });
9707 }
9708 }
9709 t.register_on_applied(
9710 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9711 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9712 assert(r == 0);
9713 });
9714
9715 calc_trim_to();
9716 }
9717
9718 void PrimaryLogPG::cancel_log_updates()
9719 {
9720 // get rid of all the LogUpdateCtx so their references to repops are
9721 // dropped
9722 log_entry_update_waiting_on.clear();
9723 }
9724
9725 // -------------------------------------------------------
9726
9727 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9728 {
9729 pair<hobject_t, ObjectContextRef> i;
9730 while (object_contexts.get_next(i.first, &i)) {
9731 ObjectContextRef obc(i.second);
9732 get_obc_watchers(obc, pg_watchers);
9733 }
9734 }
9735
9736 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9737 {
9738 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9739 obc->watchers.begin();
9740 j != obc->watchers.end();
9741 ++j) {
9742 obj_watch_item_t owi;
9743
9744 owi.obj = obc->obs.oi.soid;
9745 owi.wi.addr = j->second->get_peer_addr();
9746 owi.wi.name = j->second->get_entity();
9747 owi.wi.cookie = j->second->get_cookie();
9748 owi.wi.timeout_seconds = j->second->get_timeout();
9749
9750 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9751 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9752
9753 pg_watchers.push_back(owi);
9754 }
9755 }
9756
9757 void PrimaryLogPG::check_blacklisted_watchers()
9758 {
9759 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9760 pair<hobject_t, ObjectContextRef> i;
9761 while (object_contexts.get_next(i.first, &i))
9762 check_blacklisted_obc_watchers(i.second);
9763 }
9764
9765 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9766 {
9767 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9768 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9769 obc->watchers.begin();
9770 k != obc->watchers.end();
9771 ) {
9772 // Advance iterator now so handle_watch_timeout() can erase the element
9773 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9774 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9775 entity_addr_t ea = j->second->get_peer_addr();
9776 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9777 if (get_osdmap()->is_blacklisted(ea)) {
9778 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9779 assert(j->second->get_pg() == this);
9780 j->second->unregister_cb();
9781 handle_watch_timeout(j->second);
9782 }
9783 }
9784 }
9785
9786 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9787 {
9788 assert(is_active());
9789 assert((recovering.count(obc->obs.oi.soid) ||
9790 !is_missing_object(obc->obs.oi.soid)) ||
9791 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9792 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9793 pg_log_entry_t::LOST_REVERT &&
9794 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9795 obc->obs.oi.version));
9796
9797 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9798 assert(obc->watchers.empty());
9799 // populate unconnected_watchers
9800 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9801 obc->obs.oi.watchers.begin();
9802 p != obc->obs.oi.watchers.end();
9803 ++p) {
9804 utime_t expire = info.stats.last_became_active;
9805 expire += p->second.timeout_seconds;
9806 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9807 WatchRef watch(
9808 Watch::makeWatchRef(
9809 this, osd, obc, p->second.timeout_seconds, p->first.first,
9810 p->first.second, p->second.addr));
9811 watch->disconnect();
9812 obc->watchers.insert(
9813 make_pair(
9814 make_pair(p->first.first, p->first.second),
9815 watch));
9816 }
9817 // Look for watchers from blacklisted clients and drop
9818 check_blacklisted_obc_watchers(obc);
9819 }
9820
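// A watch lease expired. If we can write right now, submit a MODIFY log
// entry (via the simple_opc machinery) that drops the watcher from the
// object_info_t; otherwise park the retry behind degraded recovery or
// scrub via a delayed callback.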
9821 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9822 {
9823 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9824 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9825
9826 if (!is_active()) {
9827 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9828 return;
9829 }
9830 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9831 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9832 watch->get_delayed_cb()
9833 );
9834 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9835 << obc->obs.oi.soid
9836 << dendl;
9837 return;
9838 }
9839
9840 if (write_blocked_by_scrub(obc->obs.oi.soid)) {
9841 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9842 << obc->obs.oi.soid
9843 << dendl;
9844 scrubber.add_callback(
9845 watch->get_delayed_cb() // retry this timeout once the scrub unblocks
9846 );
9847 return;
9848 }
9849
9850 OpContextUPtr ctx = simple_opc_create(obc);
9851 ctx->at_version = get_next_version();
9852
9853 object_info_t& oi = ctx->new_obs.oi;
9854 oi.watchers.erase(make_pair(watch->get_cookie(),
9855 watch->get_entity()));
9856
9857 list<watch_disconnect_t> watch_disconnects = {
9858 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9859 };
9860 ctx->register_on_success(
9861 [this, obc, watch_disconnects]() {
9862 complete_disconnect_watches(obc, watch_disconnects);
9863 });
9864
9865
9866 PGTransaction *t = ctx->op_t.get();
9867 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9868 ctx->at_version,
9869 oi.version,
9870 0,
9871 osd_reqid_t(), ctx->mtime, 0));
9872
9873 oi.prior_version = obc->obs.oi.version;
9874 oi.version = ctx->at_version;
9875 bufferlist bl;
9876 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9877 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9878
9879 // apply new object state.
9880 ctx->obc->obs = ctx->new_obs;
9881
9882 // no ctx->delta_stats
9883 simple_opc_submit(std::move(ctx));
9884 }
9885
9886 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9887 SnapSetContext *ssc)
9888 {
9889 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9890 assert(obc->destructor_callback == NULL);
9891 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9892 obc->obs.oi = oi;
9893 obc->obs.exists = false;
9894 obc->ssc = ssc;
9895 if (ssc)
9896 register_snapset_context(ssc);
9897 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9898 if (is_active())
9899 populate_obc_watchers(obc);
9900 return obc;
9901 }
9902
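/*
 * Look up (or, if can_create, construct) the ObjectContext for soid:
 * first in the obc cache, then by decoding OI_ATTR from the provided
 * attrs or from disk. Returns a null ref (-ENOENT to callers) if the
 * object is absent and can_create is false, or if the object info or
 * snapset cannot be obtained.
 */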
9903 ObjectContextRef PrimaryLogPG::get_object_context(
9904 const hobject_t& soid,
9905 bool can_create,
9906 const map<string, bufferlist> *attrs)
9907 {
9908 assert(
9909 attrs || !pg_log.get_missing().is_missing(soid) ||
9910 // or this is a revert... see recover_primary()
9911 (pg_log.get_log().objects.count(soid) &&
9912 pg_log.get_log().objects.find(soid)->second->op ==
9913 pg_log_entry_t::LOST_REVERT));
9914 ObjectContextRef obc = object_contexts.lookup(soid);
9915 osd->logger->inc(l_osd_object_ctx_cache_total);
9916 if (obc) {
9917 osd->logger->inc(l_osd_object_ctx_cache_hit);
9918 dout(10) << __func__ << ": found obc in cache: " << obc
9919 << dendl;
9920 } else {
9921 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9922 // check disk
9923 bufferlist bv;
9924 if (attrs) {
9925 assert(attrs->count(OI_ATTR));
9926 bv = attrs->find(OI_ATTR)->second;
9927 } else {
9928 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9929 if (r < 0) {
9930 if (!can_create) {
9931 dout(10) << __func__ << ": no obc for soid "
9932 << soid << " and !can_create"
9933 << dendl;
9934 return ObjectContextRef(); // -ENOENT!
9935 }
9936
9937 dout(10) << __func__ << ": no obc for soid "
9938 << soid << " but can_create"
9939 << dendl;
9940 // new object.
9941 object_info_t oi(soid);
9942 SnapSetContext *ssc = get_snapset_context(
9943 soid, true, 0, false);
9944 assert(ssc);
9945 obc = create_object_context(oi, ssc);
9946 dout(10) << __func__ << ": " << obc << " " << soid
9947 << " " << obc->rwstate
9948 << " oi: " << obc->obs.oi
9949 << " ssc: " << obc->ssc
9950 << " snapset: " << obc->ssc->snapset << dendl;
9951 return obc;
9952 }
9953 }
9954
9955 object_info_t oi;
9956 try {
9957 bufferlist::iterator bliter = bv.begin();
9958 ::decode(oi, bliter);
9959 } catch (...) {
9960 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9961 return ObjectContextRef(); // -ENOENT!
9962 }
9963
9964 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9965
9966 obc = object_contexts.lookup_or_create(oi.soid);
9967 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9968 obc->obs.oi = oi;
9969 obc->obs.exists = true;
9970
9971 obc->ssc = get_snapset_context(
9972 soid, true,
9973 soid.has_snapset() ? attrs : 0);
9974
9975 if (is_active())
9976 populate_obc_watchers(obc);
9977
9978 if (pool.info.require_rollback()) {
9979 if (attrs) {
9980 obc->attr_cache = *attrs;
9981 } else {
9982 int r = pgbackend->objects_get_attrs(
9983 soid,
9984 &obc->attr_cache);
9985 assert(r == 0);
9986 }
9987 }
9988
9989 dout(10) << __func__ << ": creating obc from disk: " << obc
9990 << dendl;
9991 }
9992
9993 // XXX: Caller doesn't expect this
9994 if (obc->ssc == NULL) {
9995 derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9996 return ObjectContextRef(); // -ENOENT!
9997 }
9998
9999 dout(10) << __func__ << ": " << obc << " " << soid
10000 << " " << obc->rwstate
10001 << " oi: " << obc->obs.oi
10002 << " exists: " << (int)obc->obs.exists
10003 << " ssc: " << obc->ssc
10004 << " snapset: " << obc->ssc->snapset << dendl;
10005 return obc;
10006 }
10007
10008 void PrimaryLogPG::context_registry_on_change()
10009 {
10010 pair<hobject_t, ObjectContextRef> i;
10011 while (object_contexts.get_next(i.first, &i)) {
10012 ObjectContextRef obc(i.second);
10013 if (obc) {
10014 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
10015 obc->watchers.begin();
10016 j != obc->watchers.end();
10017 obc->watchers.erase(j++)) {
10018 j->second->discard();
10019 }
10020 }
10021 }
10022 }
10023
10024
10025 /*
10026 * If we return an error, and set *pmissing, then promoting that
10027 * object may help.
10028 *
10029 * If we return -EAGAIN, we will always set *pmissing to the missing
10030 * object to wait for.
10031 *
10032 * If we return an error but do not set *pmissing, then we know the
10033 * object does not exist.
10034 */
10035 int PrimaryLogPG::find_object_context(const hobject_t& oid,
10036 ObjectContextRef *pobc,
10037 bool can_create,
10038 bool map_snapid_to_clone,
10039 hobject_t *pmissing)
10040 {
10041 FUNCTRACE();
10042 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
10043 // want the head?
10044 if (oid.snap == CEPH_NOSNAP) {
10045 ObjectContextRef obc = get_object_context(oid, can_create);
10046 if (!obc) {
10047 if (pmissing)
10048 *pmissing = oid;
10049 return -ENOENT;
10050 }
10051 dout(10) << "find_object_context " << oid
10052 << " @" << oid.snap
10053 << " oi=" << obc->obs.oi
10054 << dendl;
10055 *pobc = obc;
10056
10057 return 0;
10058 }
10059
10060 hobject_t head = oid.get_head();
10061
10062 // want the snapdir?
10063 if (oid.snap == CEPH_SNAPDIR) {
10064 // return head or snapdir, whichever exists.
10065 ObjectContextRef headobc = get_object_context(head, can_create);
10066 ObjectContextRef obc = headobc;
10067 if (!obc || !obc->obs.exists)
10068 obc = get_object_context(oid, can_create);
10069 if (!obc || !obc->obs.exists) {
10070 // if we have neither, we would want to promote the head.
10071 if (pmissing)
10072 *pmissing = head;
10073 if (pobc)
10074 *pobc = headobc; // may be null
10075 return -ENOENT;
10076 }
10077 dout(10) << "find_object_context " << oid
10078 << " @" << oid.snap
10079 << " oi=" << obc->obs.oi
10080 << dendl;
10081 *pobc = obc;
10082
10083 // always populate ssc for SNAPDIR...
10084 if (!obc->ssc)
10085 obc->ssc = get_snapset_context(
10086 oid, true);
10087 return 0;
10088 }
10089
10090 // we want a snap
10091 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
10092 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
10093 return -ENOENT;
10094 }
10095
10096 SnapSetContext *ssc = get_snapset_context(oid, can_create);
10097 if (!ssc || !(ssc->exists || can_create)) {
10098 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
10099 if (pmissing)
10100 *pmissing = head; // start by getting the head
10101 if (ssc)
10102 put_snapset_context(ssc);
10103 return -ENOENT;
10104 }
10105
10106 if (map_snapid_to_clone) {
10107 dout(10) << "find_object_context " << oid << " @" << oid.snap
10108 << " snapset " << ssc->snapset
10109 << " map_snapid_to_clone=true" << dendl;
10110 if (oid.snap > ssc->snapset.seq) {
10111 // already must be readable
10112 ObjectContextRef obc = get_object_context(head, false);
10113 dout(10) << "find_object_context " << oid << " @" << oid.snap
10114 << " snapset " << ssc->snapset
10115 << " maps to head" << dendl;
10116 *pobc = obc;
10117 put_snapset_context(ssc);
10118 return (obc && obc->obs.exists) ? 0 : -ENOENT;
10119 } else {
10120 vector<snapid_t>::const_iterator citer = std::find(
10121 ssc->snapset.clones.begin(),
10122 ssc->snapset.clones.end(),
10123 oid.snap);
10124 if (citer == ssc->snapset.clones.end()) {
10125 dout(10) << "find_object_context " << oid << " @" << oid.snap
10126 << " snapset " << ssc->snapset
10127 << " maps to nothing" << dendl;
10128 put_snapset_context(ssc);
10129 return -ENOENT;
10130 }
10131
10132 dout(10) << "find_object_context " << oid << " @" << oid.snap
10133 << " snapset " << ssc->snapset
10134 << " maps to " << oid << dendl;
10135
10136 if (pg_log.get_missing().is_missing(oid)) {
10137 dout(10) << "find_object_context " << oid << " @" << oid.snap
10138 << " snapset " << ssc->snapset
10139 << " " << oid << " is missing" << dendl;
10140 if (pmissing)
10141 *pmissing = oid;
10142 put_snapset_context(ssc);
10143 return -EAGAIN;
10144 }
10145
10146 ObjectContextRef obc = get_object_context(oid, false);
10147 if (!obc || !obc->obs.exists) {
10148 dout(10) << "find_object_context " << oid << " @" << oid.snap
10149 << " snapset " << ssc->snapset
10150 << " " << oid << " is not present" << dendl;
10151 if (pmissing)
10152 *pmissing = oid;
10153 put_snapset_context(ssc);
10154 return -ENOENT;
10155 }
10156 dout(10) << "find_object_context " << oid << " @" << oid.snap
10157 << " snapset " << ssc->snapset
10158 << " " << oid << " HIT" << dendl;
10159 *pobc = obc;
10160 put_snapset_context(ssc);
10161 return 0;
10162 }
10163 ceph_abort(); //unreachable
10164 }
10165
10166 dout(10) << "find_object_context " << oid << " @" << oid.snap
10167 << " snapset " << ssc->snapset << dendl;
10168
10169 // head?
10170 if (oid.snap > ssc->snapset.seq) {
10171 if (ssc->snapset.head_exists) {
10172 ObjectContextRef obc = get_object_context(head, false);
10173 dout(10) << "find_object_context " << head
10174 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10175 << " -- HIT " << obc->obs
10176 << dendl;
10177 if (!obc->ssc)
10178 obc->ssc = ssc;
10179 else {
10180 assert(ssc == obc->ssc);
10181 put_snapset_context(ssc);
10182 }
10183 *pobc = obc;
10184 return 0;
10185 }
10186 dout(10) << "find_object_context " << head
10187 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10188 << " but head dne -- DNE"
10189 << dendl;
10190 put_snapset_context(ssc);
10191 return -ENOENT;
10192 }
10193
10194 // which clone would it be?
10195 unsigned k = 0;
10196 while (k < ssc->snapset.clones.size() &&
10197 ssc->snapset.clones[k] < oid.snap)
10198 k++;
10199 if (k == ssc->snapset.clones.size()) {
10200 dout(10) << "find_object_context no clones with last >= oid.snap "
10201 << oid.snap << " -- DNE" << dendl;
10202 put_snapset_context(ssc);
10203 return -ENOENT;
10204 }
10205 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10206 info.pgid.pool(), oid.get_namespace());
10207
10208 if (pg_log.get_missing().is_missing(soid)) {
10209 dout(20) << "find_object_context " << soid << " missing, try again later"
10210 << dendl;
10211 if (pmissing)
10212 *pmissing = soid;
10213 put_snapset_context(ssc);
10214 return -EAGAIN;
10215 }
10216
10217 ObjectContextRef obc = get_object_context(soid, false);
10218 if (!obc || !obc->obs.exists) {
10219 if (pmissing)
10220 *pmissing = soid;
10221 put_snapset_context(ssc);
10222 if (is_degraded_or_backfilling_object(soid)) {
10223 dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10224 return -EAGAIN;
10225 } else {
10226 dout(20) << __func__ << " missing clone " << soid << dendl;
10227 return -ENOENT;
10228 }
10229 }
10230
10231 if (!obc->ssc) {
10232 obc->ssc = ssc;
10233 } else {
10234 assert(obc->ssc == ssc);
10235 put_snapset_context(ssc);
10236 }
10237 ssc = 0;
10238
10239 // clone
10240 dout(20) << "find_object_context " << soid
10241 << " snapset " << obc->ssc->snapset
10242 << " legacy_snaps " << obc->obs.oi.legacy_snaps
10243 << dendl;
10244 snapid_t first, last;
10245 if (obc->ssc->snapset.is_legacy()) {
10246 first = obc->obs.oi.legacy_snaps.back();
10247 last = obc->obs.oi.legacy_snaps.front();
10248 } else {
10249 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10250 assert(p != obc->ssc->snapset.clone_snaps.end());
10251 if (p->second.empty()) {
10252 dout(1) << __func__ << " " << soid << " empty snapset -- DNE" << dendl;
10253 assert(!cct->_conf->osd_debug_verify_snaps);
10254 return -ENOENT;
10255 }
10256 first = p->second.back();
10257 last = p->second.front();
10258 }
10259 if (first <= oid.snap) {
10260 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10261 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10262 *pobc = obc;
10263 return 0;
10264 } else {
10265 dout(20) << "find_object_context " << soid << " [" << first << "," << last
10266 << "] does not contain " << oid.snap << " -- DNE" << dendl;
10267 return -ENOENT;
10268 }
10269 }
10270
10271 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10272 {
10273 if (obc->ssc)
10274 put_snapset_context(obc->ssc);
10275 }
10276
10277 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10278 {
10279 object_info_t& oi = obc->obs.oi;
10280
10281 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10282 object_stat_sum_t stat;
10283
10284 stat.num_bytes += oi.size;
10285
10286 if (oi.soid.snap != CEPH_SNAPDIR)
10287 stat.num_objects++;
10288 if (oi.is_dirty())
10289 stat.num_objects_dirty++;
10290 if (oi.is_whiteout())
10291 stat.num_whiteouts++;
10292 if (oi.is_omap())
10293 stat.num_objects_omap++;
10294 if (oi.is_cache_pinned())
10295 stat.num_objects_pinned++;
10296
10297 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10298 stat.num_object_clones++;
10299
10300 if (!obc->ssc)
10301 obc->ssc = get_snapset_context(oi.soid, false);
10302 assert(obc->ssc);
10303
10304 // subtract off clone overlap
10305 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10306 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10307 for (interval_set<uint64_t>::const_iterator r = o.begin();
10308 r != o.end();
10309 ++r) {
10310 stat.num_bytes -= r.get_len();
10311 }
10312 }
10313 }
10314
10315 // add it in
10316 pgstat->stats.sum.add(stat);
10317 }
10318
10319 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10320 {
10321 const hobject_t& soid = obc->obs.oi.soid;
10322 if (obc->is_blocked()) {
10323 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10324 return;
10325 }
10326
10327 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10328 if (p != waiting_for_blocked_object.end()) {
10329 list<OpRequestRef>& ls = p->second;
10330 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10331 requeue_ops(ls);
10332 waiting_for_blocked_object.erase(p);
10333 }
10334
10335 map<hobject_t, ObjectContextRef>::iterator i =
10336 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10337 if (i != objects_blocked_on_snap_promotion.end()) {
10338 assert(i->second == obc);
10339 objects_blocked_on_snap_promotion.erase(i);
10340 }
10341
10342 if (obc->requeue_scrub_on_unblock) {
10343 obc->requeue_scrub_on_unblock = false;
10344 requeue_scrub();
10345 }
10346 }
10347
10348 SnapSetContext *PrimaryLogPG::get_snapset_context(
10349 const hobject_t& oid,
10350 bool can_create,
10351 const map<string, bufferlist> *attrs,
10352 bool oid_existed)
10353 {
10354 Mutex::Locker l(snapset_contexts_lock);
10355 SnapSetContext *ssc;
10356 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10357 oid.get_snapdir());
10358 if (p != snapset_contexts.end()) {
10359 if (can_create || p->second->exists) {
10360 ssc = p->second;
10361 } else {
10362 return NULL;
10363 }
10364 } else {
10365 bufferlist bv;
10366 if (!attrs) {
10367 int r = -ENOENT;
10368 if (!(oid.is_head() && !oid_existed))
10369 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10370 if (r < 0) {
10371 // try _snapset
10372 if (!(oid.is_snapdir() && !oid_existed))
10373 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10374 if (r < 0 && !can_create)
10375 return NULL;
10376 }
10377 } else {
10378 assert(attrs->count(SS_ATTR));
10379 bv = attrs->find(SS_ATTR)->second;
10380 }
10381 ssc = new SnapSetContext(oid.get_snapdir());
10382 _register_snapset_context(ssc);
10383 if (bv.length()) {
10384 bufferlist::iterator bvp = bv.begin();
10385 try {
10386 ssc->snapset.decode(bvp);
10387 } catch (buffer::error& e) {
10388 dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10389 return NULL;
10390 }
10391 ssc->exists = true;
10392 } else {
10393 ssc->exists = false;
10394 }
10395 }
10396 assert(ssc);
10397 ssc->ref++;
10398 return ssc;
10399 }
10400
10401 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10402 {
10403 Mutex::Locker l(snapset_contexts_lock);
10404 --ssc->ref;
10405 if (ssc->ref == 0) {
10406 if (ssc->registered)
10407 snapset_contexts.erase(ssc->oid);
10408 delete ssc;
10409 }
10410 }
10411
10412 /** pull - request object from a peer
10413 */
10414
10415 /*
10416 * Return values:
10417 * NONE - didn't pull anything
10418 * YES - pulled what the caller wanted
10419 * OTHER - needed to pull something else first (_head or _snapdir)
10420 */
10421 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10422
10423 int PrimaryLogPG::recover_missing(
10424 const hobject_t &soid, eversion_t v,
10425 int priority,
10426 PGBackend::RecoveryHandle *h)
10427 {
10428 if (missing_loc.is_unfound(soid)) {
10429 dout(7) << "pull " << soid
10430 << " v " << v
10431 << " but it is unfound" << dendl;
10432 return PULL_NONE;
10433 }
10434
10435 if (missing_loc.is_deleted(soid)) {
10436 start_recovery_op(soid);
10437 assert(!recovering.count(soid));
10438 recovering.insert(make_pair(soid, ObjectContextRef()));
10439 epoch_t cur_epoch = get_osdmap()->get_epoch();
10440 remove_missing_object(soid, v, new FunctionContext(
10441 [=](int) {
10442 lock();
10443 if (!pg_has_reset_since(cur_epoch)) {
10444 bool object_missing = false;
10445 for (const auto& shard : actingbackfill) {
10446 if (shard == pg_whoami)
10447 continue;
10448 if (peer_missing[shard].is_missing(soid)) {
10449 dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10450 object_missing = true;
10451 break;
10452 }
10453 }
10454 if (!object_missing) {
10455 object_stat_sum_t stat_diff;
10456 stat_diff.num_objects_recovered = 1;
10457 on_global_recover(soid, stat_diff, true);
10458 } else {
10459 auto recovery_handle = pgbackend->open_recovery_op();
10460 pgbackend->recover_delete_object(soid, v, recovery_handle);
10461 pgbackend->run_recovery_op(recovery_handle, priority);
10462 }
10463 }
10464 unlock();
10465 }));
10466 return PULL_YES;
10467 }
10468
10469 // is this a snapped object? if so, consult the snapset... we may not need the entire object!
10470 ObjectContextRef obc;
10471 ObjectContextRef head_obc;
10472 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10473 // do we have the head and/or snapdir?
10474 hobject_t head = soid.get_head();
10475 if (pg_log.get_missing().is_missing(head)) {
10476 if (recovering.count(head)) {
10477 dout(10) << " missing but already recovering head " << head << dendl;
10478 return PULL_NONE;
10479 } else {
10480 int r = recover_missing(
10481 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10482 h);
10483 if (r != PULL_NONE)
10484 return PULL_OTHER;
10485 return PULL_NONE;
10486 }
10487 }
10488 head = soid.get_snapdir();
10489 if (pg_log.get_missing().is_missing(head)) {
10490 if (recovering.count(head)) {
10491 dout(10) << " missing but already recovering snapdir " << head << dendl;
10492 return PULL_NONE;
10493 } else {
10494 int r = recover_missing(
10495 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10496 h);
10497 if (r != PULL_NONE)
10498 return PULL_OTHER;
10499 return PULL_NONE;
10500 }
10501 }
10502
10503 // we must have one or the other
10504 head_obc = get_object_context(
10505 soid.get_head(),
10506 false,
10507 0);
10508 if (!head_obc)
10509 head_obc = get_object_context(
10510 soid.get_snapdir(),
10511 false,
10512 0);
10513 assert(head_obc);
10514 }
10515 start_recovery_op(soid);
10516 assert(!recovering.count(soid));
10517 recovering.insert(make_pair(soid, obc));
10518 int r = pgbackend->recover_object(
10519 soid,
10520 v,
10521 head_obc,
10522 obc,
10523 h);
10524 // This is only a pull which shouldn't return an error
10525 assert(r >= 0);
10526 return PULL_YES;
10527 }
10528
10529 void PrimaryLogPG::send_remove_op(
10530 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10531 {
10532 ceph_tid_t tid = osd->get_tid();
10533 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10534
10535 dout(10) << "send_remove_op " << oid << " from osd." << peer
10536 << " tid " << tid << dendl;
10537
10538 MOSDSubOp *subop = new MOSDSubOp(
10539 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10540 oid, CEPH_OSD_FLAG_ACK,
10541 get_osdmap()->get_epoch(), tid, v);
10542 subop->ops = vector<OSDOp>(1);
10543 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10544
10545 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10546 }
10547
10548 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10549 eversion_t v, Context *on_complete)
10550 {
10551 dout(20) << __func__ << " " << soid << " " << v << dendl;
10552 assert(on_complete != nullptr);
10553 // delete locally
10554 ObjectStore::Transaction t;
10555 remove_snap_mapped_object(t, soid);
10556
10557 ObjectRecoveryInfo recovery_info;
10558 recovery_info.soid = soid;
10559 recovery_info.version = v;
10560
10561 epoch_t cur_epoch = get_osdmap()->get_epoch();
10562 t.register_on_complete(new FunctionContext(
10563 [=](int) {
10564 lock();
10565 if (!pg_has_reset_since(cur_epoch)) {
10566 ObjectStore::Transaction t2;
10567 on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10568 t2.register_on_complete(on_complete);
10569 int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10570 assert(r == 0);
10571 unlock();
10572 } else {
10573 unlock();
10574 on_complete->complete(-EAGAIN);
10575 }
10576 }));
10577 int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10578 assert(r == 0);
10579 }
10580
10581 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10582 {
10583 dout(10) << "finish_degraded_object " << oid << dendl;
10584 if (callbacks_for_degraded_object.count(oid)) {
10585 list<Context*> contexts;
10586 contexts.swap(callbacks_for_degraded_object[oid]);
10587 callbacks_for_degraded_object.erase(oid);
10588 for (list<Context*>::iterator i = contexts.begin();
10589 i != contexts.end();
10590 ++i) {
10591 (*i)->complete(0);
10592 }
10593 }
10594 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10595 oid.get_head());
10596 if (i != objects_blocked_on_degraded_snap.end() &&
10597 i->second == oid.snap)
10598 objects_blocked_on_degraded_snap.erase(i);
10599 }
10600
10601 void PrimaryLogPG::_committed_pushed_object(
10602 epoch_t epoch, eversion_t last_complete)
10603 {
10604 lock();
10605 if (!pg_has_reset_since(epoch)) {
10606 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10607 last_complete_ondisk = last_complete;
10608
10609 if (last_complete_ondisk == info.last_update) {
10610 if (!is_primary()) {
10611 // We are either a replica or a backfill target,
10612 // and we are fully up to date; tell the primary!
10613 osd->send_message_osd_cluster(
10614 get_primary().osd,
10615 new MOSDPGTrim(
10616 get_osdmap()->get_epoch(),
10617 spg_t(info.pgid.pgid, get_primary().shard),
10618 last_complete_ondisk),
10619 get_osdmap()->get_epoch());
10620 } else {
10621 calc_min_last_complete_ondisk();
10622 }
10623 }
10624
10625 } else {
10626 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10627 }
10628
10629 unlock();
10630 }
10631
10632 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10633 {
10634 lock();
10635 dout(20) << __func__ << dendl;
10636 if (obc) {
10637 dout(20) << "obc = " << *obc << dendl;
10638 }
10639 assert(active_pushes >= 1);
10640 --active_pushes;
10641
10642 // requeue an active chunky scrub waiting on recovery ops
10643 if (!deleting && active_pushes == 0
10644 && scrubber.is_chunky_scrub_active()) {
10645 if (ops_blocked_by_scrub()) {
10646 requeue_scrub(true);
10647 } else {
10648 requeue_scrub(false);
10649 }
10650 }
10651 unlock();
10652 }
10653
10654 void PrimaryLogPG::_applied_recovered_object_replica()
10655 {
10656 lock();
10657 dout(20) << __func__ << dendl;
10658 assert(active_pushes >= 1);
10659 --active_pushes;
10660
10661 // requeue an active chunky scrub waiting on recovery ops
10662 if (!deleting && active_pushes == 0 &&
10663 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10664 scrubber.active_rep_scrub->get_req())->chunky) {
10665 osd->enqueue_back(
10666 info.pgid,
10667 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10668 scrubber.active_rep_scrub = OpRequestRef();
10669 }
10670 unlock();
10671 }
10672
10673 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10674 {
10675 dout(10) << "got missing " << oid << " v " << v << dendl;
10676 pg_log.recover_got(oid, v, info);
10677 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10678 dout(10) << "last_complete now " << info.last_complete
10679 << " log.complete_to " << pg_log.get_log().complete_to->version
10680 << dendl;
10681 } else {
10682 dout(10) << "last_complete now " << info.last_complete
10683 << " log.complete_to at end" << dendl;
10684 //below is not true in the repair case.
10685 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10686 assert(info.last_complete == info.last_update);
10687 }
10688 }
10689
10690 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10691 {
10692 list<pg_shard_t> fl = { pg_whoami };
10693 failed_push(fl, soid);
10694 }
10695
10696 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10697 {
10698 dout(20) << __func__ << ": " << soid << dendl;
10699 assert(recovering.count(soid));
10700 auto obc = recovering[soid];
10701 if (obc) {
10702 list<OpRequestRef> blocked_ops;
10703 obc->drop_recovery_read(&blocked_ops);
10704 requeue_ops(blocked_ops);
10705 }
10706 recovering.erase(soid);
10707 for (auto&& i : from)
10708 missing_loc.remove_location(soid, i);
10709 dout(0) << __func__ << " " << soid << " from shard " << from
10710 << ", reps on " << missing_loc.get_locations(soid)
10711 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10712 finish_recovery_op(soid); // close out this attempt
10713 }
10714
10715 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10716 {
10717 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10718 assert(m->get_type() == MSG_OSD_SUBOP);
10719 dout(7) << "sub_op_remove " << m->poid << dendl;
10720
10721 op->mark_started();
10722
10723 ObjectStore::Transaction t;
10724 remove_snap_mapped_object(t, m->poid);
10725 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10726 assert(r == 0);
10727 }
10728
10729 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10730 {
10731 eversion_t v;
10732 pg_missing_item pmi;
10733 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10734 assert(is_missing);
10735 v = pmi.have;
10736 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10737
10738 assert(!actingbackfill.empty());
10739 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10740 i != actingbackfill.end();
10741 ++i) {
10742 if (*i == get_primary()) continue;
10743 pg_shard_t peer = *i;
10744 if (!peer_missing[peer].is_missing(oid)) {
10745 continue;
10746 }
10747 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10748 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10749 if (h > v)
10750 v = h;
10751 }
10752
10753 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10754 return v;
10755 }
10756
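// Replica-side handler for MOSDPGUpdateLogMissing (the >= jewel path of
// submit_log_entries() above): apply the entries to our log and missing
// set, then reply with our last_complete once the local transaction
// commits (with a pre-kraken compatibility quirk handled below).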
10757 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10758 {
10759 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10760 op->get_req());
10761 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10762 ObjectStore::Transaction t;
10763 boost::optional<eversion_t> op_trim_to, op_roll_forward_to;
10764 if (m->pg_trim_to != eversion_t())
10765 op_trim_to = m->pg_trim_to;
10766 if (m->pg_roll_forward_to != eversion_t())
10767 op_roll_forward_to = m->pg_roll_forward_to;
10768
10769 dout(20) << __func__ << " op_trim_to = " << op_trim_to << " op_roll_forward_to = " << op_roll_forward_to << dendl;
10770
10771 append_log_entries_update_missing(m->entries, t, op_trim_to, op_roll_forward_to);
10772 eversion_t new_lcod = info.last_complete;
10773
10774 Context *complete = new FunctionContext(
10775 [=](int) {
10776 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10777 op->get_req());
10778 lock();
10779 if (!pg_has_reset_since(msg->get_epoch())) {
10780 update_last_complete_ondisk(new_lcod);
10781 MOSDPGUpdateLogMissingReply *reply =
10782 new MOSDPGUpdateLogMissingReply(
10783 spg_t(info.pgid.pgid, primary_shard().shard),
10784 pg_whoami.shard,
10785 msg->get_epoch(),
10786 msg->min_epoch,
10787 msg->get_tid(),
10788 new_lcod);
10789 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10790 msg->get_connection()->send_message(reply);
10791 }
10792 unlock();
10793 });
10794
10795 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10796 t.register_on_commit(complete);
10797 } else {
10798 /* Hack to work around the fact that ReplicatedBackend sends
10799 * ack+commit if commit happens first
10800 *
10801 * This behavior is no longer necessary, but we preserve it so old
10802 * primaries can keep their repops in order */
10803 if (pool.info.ec_pool()) {
10804 t.register_on_complete(complete);
10805 } else {
10806 t.register_on_commit(complete);
10807 }
10808 }
10809 t.register_on_applied(
10810 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10811 int tr = osd->store->queue_transaction(
10812 osr.get(),
10813 std::move(t),
10814 nullptr);
10815 assert(tr == 0);
10816 }
10817
10818 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10819 {
10820 const MOSDPGUpdateLogMissingReply *m =
10821 static_cast<const MOSDPGUpdateLogMissingReply*>(
10822 op->get_req());
10823 dout(20) << __func__ << " got reply from "
10824 << m->get_from() << dendl;
10825
10826 auto it = log_entry_update_waiting_on.find(m->get_tid());
10827 if (it != log_entry_update_waiting_on.end()) {
10828 if (it->second.waiting_on.count(m->get_from())) {
10829 it->second.waiting_on.erase(m->get_from());
10830 if (m->last_complete_ondisk != eversion_t()) {
10831 update_peer_last_complete_ondisk(m->get_from(), m->last_complete_ondisk);
10832 }
10833 } else {
10834 osd->clog->error()
10835 << info.pgid << " got reply "
10836 << *m << " from shard we are not waiting for "
10837 << m->get_from();
10838 }
10839
10840 if (it->second.waiting_on.empty()) {
10841 repop_all_committed(it->second.repop.get());
10842 log_entry_update_waiting_on.erase(it);
10843 }
10844 } else {
10845 osd->clog->error()
10846 << info.pgid << " got reply "
10847 << *m << " on unknown tid " << m->get_tid();
10848 }
10849 }
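
/* A sketch of the log-only update round trip implemented above (helpers
 * outside this section paraphrased):
 *
 *   primary:  submit_log_entries() sends MOSDPGUpdateLogMissing to each
 *             replica and records the tid in log_entry_update_waiting_on
 *   replica:  do_update_log_missing() applies the entries and answers with
 *             MOSDPGUpdateLogMissingReply once the transaction commits
 *   primary:  do_update_log_missing_reply() erases the sender from
 *             waiting_on; when the set drains, repop_all_committed() fires
 */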
10850
10851 /* Mark all unfound objects as lost.
10852 */
10853 void PrimaryLogPG::mark_all_unfound_lost(
10854 int what,
10855 ConnectionRef con,
10856 ceph_tid_t tid)
10857 {
10858 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10859 list<hobject_t> oids;
10860
10861 dout(30) << __func__ << ": log before:\n";
10862 pg_log.get_log().print(*_dout);
10863 *_dout << dendl;
10864
10865 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10866
10867 utime_t mtime = ceph_clock_now();
10868 map<hobject_t, pg_missing_item>::const_iterator m =
10869 missing_loc.get_needs_recovery().begin();
10870 map<hobject_t, pg_missing_item>::const_iterator mend =
10871 missing_loc.get_needs_recovery().end();
10872
10873 ObcLockManager manager;
10874 eversion_t v = get_next_version();
10875 v.epoch = get_osdmap()->get_epoch();
10876 uint64_t num_unfound = missing_loc.num_unfound();
10877 while (m != mend) {
10878 const hobject_t &oid(m->first);
10879 if (!missing_loc.is_unfound(oid)) {
10880 // We only care about unfound objects
10881 ++m;
10882 continue;
10883 }
10884
10885 ObjectContextRef obc;
10886 eversion_t prev;
10887
10888 switch (what) {
10889 case pg_log_entry_t::LOST_MARK:
10890 assert(0 == "actually, not implemented yet!");
10891 break;
10892
10893 case pg_log_entry_t::LOST_REVERT:
10894 prev = pick_newest_available(oid);
10895 if (prev > eversion_t()) {
10896 // log it
10897 pg_log_entry_t e(
10898 pg_log_entry_t::LOST_REVERT, oid, v,
10899 m->second.need, 0, osd_reqid_t(), mtime, 0);
10900 e.reverting_to = prev;
10901 e.mark_unrollbackable();
10902 log_entries.push_back(e);
10903 dout(10) << e << dendl;
10904
10905 // we are now missing the new version; recovery code will sort it out.
10906 ++v.version;
10907 ++m;
10908 break;
10909 }
10910
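      // fall through: pick_newest_available() found no prior version on
      // any shard, so handle the revert as a LOST_DELETE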
10911 case pg_log_entry_t::LOST_DELETE:
10912 {
10913 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10914 0, osd_reqid_t(), mtime, 0);
10915 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10916 if (pool.info.require_rollback()) {
10917 e.mod_desc.try_rmobject(v.version);
10918 } else {
10919 e.mark_unrollbackable();
10920 }
10921 } // otherwise, just do what we used to do
10922 dout(10) << e << dendl;
10923 log_entries.push_back(e);
10924 oids.push_back(oid);
10925
10926 // If a context is found, mark the object as deleted to guard
10927 // against racing with a new creation. This can happen if the
10928 // object was lost and the primary hit EIO.
10929 obc = object_contexts.lookup(oid);
10930 if (obc)
10931 obc->obs.exists = false;
10932
10933 ++v.version;
10934 ++m;
10935 }
10936 break;
10937
10938 default:
10939 ceph_abort();
10940 }
10941 }
10942
10943 info.stats.stats_invalid = true;
10944
10945 submit_log_entries(
10946 log_entries,
10947 std::move(manager),
10948 boost::optional<std::function<void(void)> >(
10949 [this, oids, con, num_unfound, tid]() {
10950 if (perform_deletes_during_peering()) {
10951 for (auto oid : oids) {
10952 // clear old locations - merge_new_log_entries will have
10953 // handled rebuilding missing_loc for each of these
10954 // objects if we have the RECOVERY_DELETES flag
10955 missing_loc.recovered(oid);
10956 }
10957 }
10958
10959 if (is_recovery_unfound()) {
10960 queue_peering_event(
10961 CephPeeringEvtRef(
10962 std::make_shared<CephPeeringEvt>(
10963 get_osdmap()->get_epoch(),
10964 get_osdmap()->get_epoch(),
10965 DoRecovery())));
10966 } else if (is_backfill_unfound()) {
10967 queue_peering_event(
10968 CephPeeringEvtRef(
10969 std::make_shared<CephPeeringEvt>(
10970 get_osdmap()->get_epoch(),
10971 get_osdmap()->get_epoch(),
10972 RequestBackfill())));
10973 } else {
10974 queue_recovery();
10975 }
10976
10977 stringstream ss;
10978 ss << "pg has " << num_unfound
10979 << " objects unfound and apparently lost marking";
10980 string rs = ss.str();
10981 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10982 osd->clog->info() << rs;
10983 if (con) {
10984 MCommandReply *reply = new MCommandReply(0, rs);
10985 reply->set_tid(tid);
10986 con->send_message(reply);
10987 }
10988 }),
10989 OpRequestRef());
10990 }
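
/* This path is normally reached from the admin command
 *   ceph pg <pgid> mark_unfound_lost revert|delete
 * (note the "do_command" dout above); the reply travels back over the
 * Connection captured in `con`.
 */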
10991
10992 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10993 {
10994 assert(repop_queue.empty());
10995 }
10996
10997 /*
10998 * pg status change notification
10999 */
11000
11001 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
11002 {
11003 list<OpRequestRef> rq;
11004
11005 // apply all repops
11006 while (!repop_queue.empty()) {
11007 RepGather *repop = repop_queue.front();
11008 repop_queue.pop_front();
11009 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
11010 repop->rep_aborted = true;
11011 repop->on_applied.clear();
11012 repop->on_committed.clear();
11013 repop->on_success.clear();
11014
11015 if (requeue) {
11016 if (repop->op) {
11017 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
11018 rq.push_back(repop->op);
11019 repop->op = OpRequestRef();
11020 }
11021
11022 // also requeue any dups, interleaved into position
11023 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
11024 waiting_for_ondisk.find(repop->v);
11025 if (p != waiting_for_ondisk.end()) {
11026 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
11027 for (list<pair<OpRequestRef, version_t> >::iterator i =
11028 p->second.begin();
11029 i != p->second.end();
11030 ++i) {
11031 rq.push_back(i->first);
11032 }
11033 waiting_for_ondisk.erase(p);
11034 }
11035 }
11036
11037 remove_repop(repop);
11038 }
11039
11040 assert(repop_queue.empty());
11041
11042 if (requeue) {
11043 requeue_ops(rq);
11044 if (!waiting_for_ondisk.empty()) {
11045 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
11046 waiting_for_ondisk.begin();
11047 i != waiting_for_ondisk.end();
11048 ++i) {
11049 for (list<pair<OpRequestRef, version_t> >::iterator j =
11050 i->second.begin();
11051 j != i->second.end();
11052 ++j) {
11053 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
11054 << i->first << dendl;
11055 }
11056 }
11057 assert(waiting_for_ondisk.empty());
11058 }
11059 }
11060
11061 waiting_for_ondisk.clear();
11062 }
11063
11064 void PrimaryLogPG::on_flushed()
11065 {
11066 assert(flushes_in_progress > 0);
11067 flushes_in_progress--;
11068 if (flushes_in_progress == 0) {
11069 requeue_ops(waiting_for_flush);
11070 }
11071 if (!is_peered() || !is_primary()) {
11072 pair<hobject_t, ObjectContextRef> i;
11073 while (object_contexts.get_next(i.first, &i)) {
11074 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
11075 }
11076 assert(object_contexts.empty());
11077 }
11078 pgbackend->on_flushed();
11079 }
11080
11081 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
11082 {
11083 dout(10) << "on_removal" << dendl;
11084
11085 // adjust info to backfill
11086 info.set_last_backfill(hobject_t());
11087 pg_log.reset_backfill();
11088 dirty_info = true;
11089
11090
11091 // clear log
11092 PGLogEntryHandler rollbacker{this, t};
11093 pg_log.roll_forward(&rollbacker);
11094
11095 write_if_dirty(*t);
11096
11097 if (!deleting)
11098 on_shutdown();
11099 }
11100
11101 void PrimaryLogPG::clear_async_reads()
11102 {
11103 dout(10) << __func__ << dendl;
11104 for(auto& i : in_progress_async_reads) {
11105 dout(10) << "clear ctx: "
11106 << "OpRequestRef " << i.first
11107 << " OpContext " << i.second
11108 << dendl;
11109 close_op_ctx(i.second);
11110 }
11111 }
11112
11113 void PrimaryLogPG::on_shutdown()
11114 {
11115 dout(10) << "on_shutdown" << dendl;
11116
11117 // remove from queues
11118 osd->pg_stat_queue_dequeue(this);
11119 osd->peering_wq.dequeue(this);
11120
11121 // handles queue races
11122 deleting = true;
11123
11124 if (recovery_queued) {
11125 recovery_queued = false;
11126 osd->clear_queued_recovery(this);
11127 }
11128
11129 clear_scrub_reserved();
11130 scrub_clear_state();
11131
11132 unreg_next_scrub();
11133
11134 vector<ceph_tid_t> tids;
11135 cancel_copy_ops(false, &tids);
11136 cancel_flush_ops(false, &tids);
11137 cancel_proxy_ops(false, &tids);
11138 osd->objecter->op_cancel(tids, -ECANCELED);
11139
11140 apply_and_flush_repops(false);
11141 cancel_log_updates();
11142 // we must remove PGRefs, so do this prior to the release_backoffs() callers
11143 clear_backoffs();
11144 // clean up snap trim references
11145 snap_trimmer_machine.process_event(Reset());
11146
11147 pgbackend->on_change();
11148
11149 context_registry_on_change();
11150 object_contexts.clear();
11151
11152 clear_async_reads();
11153
11154 osd->remote_reserver.cancel_reservation(info.pgid);
11155 osd->local_reserver.cancel_reservation(info.pgid);
11156
11157 clear_primary_state();
11158 cancel_recovery();
11159 }
11160
11161 void PrimaryLogPG::on_activate()
11162 {
11163 // all clean?
11164 if (needs_recovery()) {
11165 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
11166 queue_peering_event(
11167 CephPeeringEvtRef(
11168 std::make_shared<CephPeeringEvt>(
11169 get_osdmap()->get_epoch(),
11170 get_osdmap()->get_epoch(),
11171 DoRecovery())));
11172 } else if (needs_backfill()) {
11173 dout(10) << "activate queueing backfill" << dendl;
11174 queue_peering_event(
11175 CephPeeringEvtRef(
11176 std::make_shared<CephPeeringEvt>(
11177 get_osdmap()->get_epoch(),
11178 get_osdmap()->get_epoch(),
11179 RequestBackfill())));
11180 } else {
11181 dout(10) << "activate all replicas clean, no recovery" << dendl;
11182 eio_errors_to_process = false;
11183 queue_peering_event(
11184 CephPeeringEvtRef(
11185 std::make_shared<CephPeeringEvt>(
11186 get_osdmap()->get_epoch(),
11187 get_osdmap()->get_epoch(),
11188 AllReplicasRecovered())));
11189 }
11190
11191 publish_stats_to_osd();
11192
11193 if (!backfill_targets.empty()) {
11194 last_backfill_started = earliest_backfill();
11195 new_backfill = true;
11196 assert(!last_backfill_started.is_max());
11197 dout(5) << "on activate: bft=" << backfill_targets
11198 << " from " << last_backfill_started << dendl;
11199 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11200 i != backfill_targets.end();
11201 ++i) {
11202 dout(5) << "target shard " << *i
11203 << " from " << peer_info[*i].last_backfill
11204 << dendl;
11205 }
11206 }
11207
11208 hit_set_setup();
11209 agent_setup();
11210 }
11211
11212 void PrimaryLogPG::_on_new_interval()
11213 {
11214 dout(20) << __func__ << " checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11215 if (!pg_log.get_missing().may_include_deletes &&
11216 get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11217 pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11218 }
11219 assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11220 }
11221
11222 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11223 {
11224 dout(10) << "on_change" << dendl;
11225
11226 if (hit_set && hit_set->insert_count() == 0) {
11227 dout(20) << " discarding empty hit_set" << dendl;
11228 hit_set_clear();
11229 }
11230
11231 if (recovery_queued) {
11232 recovery_queued = false;
11233 osd->clear_queued_recovery(this);
11234 }
11235
11236 // requeue everything in the reverse of the order in which it
11237 // should be reexamined.
11238 requeue_ops(waiting_for_peered);
11239 requeue_ops(waiting_for_flush);
11240 requeue_ops(waiting_for_active);
11241
11242 clear_scrub_reserved();
11243
11244 vector<ceph_tid_t> tids;
11245 cancel_copy_ops(is_primary(), &tids);
11246 cancel_flush_ops(is_primary(), &tids);
11247 cancel_proxy_ops(is_primary(), &tids);
11248 osd->objecter->op_cancel(tids, -ECANCELED);
11249
11250 // requeue object waiters
11251 for (auto& p : waiting_for_unreadable_object) {
11252 release_backoffs(p.first);
11253 }
11254 if (is_primary()) {
11255 requeue_object_waiters(waiting_for_unreadable_object);
11256 } else {
11257 waiting_for_unreadable_object.clear();
11258 }
11259 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11260 p != waiting_for_degraded_object.end();
11261 waiting_for_degraded_object.erase(p++)) {
11262 release_backoffs(p->first);
11263 if (is_primary())
11264 requeue_ops(p->second);
11265 else
11266 p->second.clear();
11267 finish_degraded_object(p->first);
11268 }
11269
11270 // requeues waiting_for_scrub
11271 scrub_clear_state();
11272
11273 for (auto p = waiting_for_blocked_object.begin();
11274 p != waiting_for_blocked_object.end();
11275 waiting_for_blocked_object.erase(p++)) {
11276 if (is_primary())
11277 requeue_ops(p->second);
11278 else
11279 p->second.clear();
11280 }
11281 for (auto i = callbacks_for_degraded_object.begin();
11282 i != callbacks_for_degraded_object.end();
11283 ) {
11284 finish_degraded_object((i++)->first);
11285 }
11286 assert(callbacks_for_degraded_object.empty());
11287
11288 if (is_primary()) {
11289 requeue_ops(waiting_for_cache_not_full);
11290 } else {
11291 waiting_for_cache_not_full.clear();
11292 }
11293 objects_blocked_on_cache_full.clear();
11294
11295 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11296 in_progress_async_reads.begin();
11297 i != in_progress_async_reads.end();
11298 in_progress_async_reads.erase(i++)) {
11299 close_op_ctx(i->second);
11300 if (is_primary())
11301 requeue_op(i->first);
11302 }
11303
11304 // this will requeue ops we were working on but didn't finish, and
11305 // any dups
11306 apply_and_flush_repops(is_primary());
11307 cancel_log_updates();
11308
11309 // do this *after* apply_and_flush_repops so that we catch any newly
11310 // registered watches.
11311 context_registry_on_change();
11312
11313 pgbackend->on_change_cleanup(t);
11314 scrubber.cleanup_store(t);
11315 pgbackend->on_change();
11316
11317 // clear snap_trimmer state
11318 snap_trimmer_machine.process_event(Reset());
11319
11320 debug_op_order.clear();
11321 unstable_stats.clear();
11322
11323 // we don't want to cache object_contexts through the interval change
11324 // NOTE: we actually assert that all currently live references are dead
11325 // by the time the flush for the next interval completes.
11326 object_contexts.clear();
11327
11328 // should have been cleared above by finishing all of the degraded objects
11329 assert(objects_blocked_on_degraded_snap.empty());
11330 }
11331
11332 void PrimaryLogPG::on_role_change()
11333 {
11334 dout(10) << "on_role_change" << dendl;
11335 if (get_role() != 0 && hit_set) {
11336 dout(10) << " clearing hit set" << dendl;
11337 hit_set_clear();
11338 }
11339 }
11340
11341 void PrimaryLogPG::on_pool_change()
11342 {
11343 dout(10) << __func__ << dendl;
11344 // requeue cache full waiters just in case the cache_mode is
11345 // changing away from writeback mode. note that if we are not
11346 // active the normal requeuing machinery is sufficient (and properly
11347 // ordered).
11348 if (is_active() &&
11349 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11350 !waiting_for_cache_not_full.empty()) {
11351 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11352 << dendl;
11353 requeue_ops(waiting_for_cache_not_full);
11354 objects_blocked_on_cache_full.clear();
11355 }
11356 hit_set_setup();
11357 agent_setup();
11358 }
11359
11360 // clear state. called on recovery completion AND cancellation.
11361 void PrimaryLogPG::_clear_recovery_state()
11362 {
11363 missing_loc.clear();
11364 #ifdef DEBUG_RECOVERY_OIDS
11365 recovering_oids.clear();
11366 #endif
11367 last_backfill_started = hobject_t();
11368 set<hobject_t>::iterator i = backfills_in_flight.begin();
11369 while (i != backfills_in_flight.end()) {
11370 assert(recovering.count(*i));
11371 backfills_in_flight.erase(i++);
11372 }
11373
11374 list<OpRequestRef> blocked_ops;
11375 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11376 i != recovering.end();
11377 recovering.erase(i++)) {
11378 if (i->second) {
11379 i->second->drop_recovery_read(&blocked_ops);
11380 requeue_ops(blocked_ops);
11381 }
11382 }
11383 assert(backfills_in_flight.empty());
11384 pending_backfill_updates.clear();
11385 assert(recovering.empty());
11386 pgbackend->clear_recovery_state();
11387 }
11388
11389 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11390 {
11391 dout(20) << __func__ << ": " << soid << dendl;
11392 assert(recovering.count(soid));
11393 ObjectContextRef obc = recovering[soid];
11394 if (obc) {
11395 list<OpRequestRef> blocked_ops;
11396 obc->drop_recovery_read(&blocked_ops);
11397 requeue_ops(blocked_ops);
11398 }
11399 recovering.erase(soid);
11400 finish_recovery_op(soid);
11401 release_backoffs(soid);
11402 if (waiting_for_degraded_object.count(soid)) {
11403 dout(20) << " kicking degraded waiters on " << soid << dendl;
11404 requeue_ops(waiting_for_degraded_object[soid]);
11405 waiting_for_degraded_object.erase(soid);
11406 }
11407 if (waiting_for_unreadable_object.count(soid)) {
11408 dout(20) << " kicking unreadable waiters on " << soid << dendl;
11409 requeue_ops(waiting_for_unreadable_object[soid]);
11410 waiting_for_unreadable_object.erase(soid);
11411 }
11412 if (is_missing_object(soid))
11413 pg_log.set_last_requested(0); // get recover_primary to start over
11414 finish_degraded_object(soid);
11415 }
11416
11417 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11418 {
11419 /*
11420 * check that any peers we are planning to pull (or are currently
11421 * pulling) objects from are dealt with.
11422 */
11423 missing_loc.check_recovery_sources(osdmap);
11424 pgbackend->check_recovery_sources(osdmap);
11425
11426 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11427 i != peer_log_requested.end();
11428 ) {
11429 if (!osdmap->is_up(i->osd)) {
11430 dout(10) << "peer_log_requested removing " << *i << dendl;
11431 peer_log_requested.erase(i++);
11432 } else {
11433 ++i;
11434 }
11435 }
11436
11437 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11438 i != peer_missing_requested.end();
11439 ) {
11440 if (!osdmap->is_up(i->osd)) {
11441 dout(10) << "peer_missing_requested removing " << *i << dendl;
11442 peer_missing_requested.erase(i++);
11443 } else {
11444 ++i;
11445 }
11446 }
11447 }
11448
11449 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11450 {
11451 set<pg_shard_t> now_down;
11452 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11453 p != missing_loc_sources.end();
11454 ) {
11455 if (osdmap->is_up(p->osd)) {
11456 ++p;
11457 continue;
11458 }
11459 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11460 now_down.insert(*p);
11461 missing_loc_sources.erase(p++);
11462 }
11463
11464 if (now_down.empty()) {
11465 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11466 } else {
11467 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11468 << missing_loc_sources << dendl;
11469
11470 // filter missing_loc
11471 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11472 while (p != missing_loc.end()) {
11473 set<pg_shard_t>::iterator q = p->second.begin();
11474 while (q != p->second.end())
11475 if (now_down.count(*q)) {
11476 p->second.erase(q++);
11477 } else {
11478 ++q;
11479 }
11480 if (p->second.empty())
11481 missing_loc.erase(p++);
11482 else
11483 ++p;
11484 }
11485 }
11486 }
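
// Note: filtering missing_loc above can leave an object with no known
// location at all; such objects count as unfound until another source for
// them is discovered.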
11487
11488
11489 bool PrimaryLogPG::start_recovery_ops(
11490 uint64_t max,
11491 ThreadPool::TPHandle &handle,
11492 uint64_t *ops_started)
11493 {
11494 uint64_t& started = *ops_started;
11495 started = 0;
11496 bool work_in_progress = false;
11497 assert(is_primary());
11498
11499 if (!state_test(PG_STATE_RECOVERING) &&
11500 !state_test(PG_STATE_BACKFILLING)) {
11501 /* TODO: I think this case is broken and will make do_recovery()
11502 * unhappy since we're returning false */
11503 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11504 return false;
11505 }
11506
11507 const auto &missing = pg_log.get_missing();
11508
11509 unsigned int num_missing = missing.num_missing();
11510 uint64_t num_unfound = get_num_unfound();
11511
11512 if (num_missing == 0) {
11513 info.last_complete = info.last_update;
11514 }
11515
11516 if (num_missing == num_unfound) {
11517 // All of the missing objects we have are unfound.
11518 // Recover the replicas.
11519 started = recover_replicas(max, handle);
11520 }
11521 if (!started) {
11522 // We still have missing objects that we should grab from replicas.
11523 started += recover_primary(max, handle);
11524 }
11525 if (!started && num_unfound != get_num_unfound()) {
11526 // second chance to recover replicas
11527 started = recover_replicas(max, handle);
11528 }
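  // Ordering above: (1) if every locally-missing object is also unfound,
  // push to the replicas first; (2) if nothing was started, pull the
  // primary's own missing objects; (3) if that pass turned up locations
  // for previously unfound objects, retry the replica pushes in the same
  // call.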
11529
11530 if (started)
11531 work_in_progress = true;
11532
11533 bool deferred_backfill = false;
11534 if (recovering.empty() &&
11535 state_test(PG_STATE_BACKFILLING) &&
11536 !backfill_targets.empty() && started < max &&
11537 missing.num_missing() == 0 &&
11538 waiting_on_backfill.empty()) {
11539 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11540 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11541 deferred_backfill = true;
11542 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11543 !is_degraded()) {
11544 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11545 deferred_backfill = true;
11546 } else if (!backfill_reserved) {
11547 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11548 if (!backfill_reserving) {
11549 dout(10) << "queueing RequestBackfill" << dendl;
11550 backfill_reserving = true;
11551 queue_peering_event(
11552 CephPeeringEvtRef(
11553 std::make_shared<CephPeeringEvt>(
11554 get_osdmap()->get_epoch(),
11555 get_osdmap()->get_epoch(),
11556 RequestBackfill())));
11557 }
11558 deferred_backfill = true;
11559 } else {
11560 started += recover_backfill(max - started, handle, &work_in_progress);
11561 }
11562 }
11563
11564 dout(10) << " started " << started << dendl;
11565 osd->logger->inc(l_osd_rop, started);
11566
11567 if (!recovering.empty() ||
11568 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11569 return work_in_progress;
11570
11571 assert(recovering.empty());
11572 assert(recovery_ops_active == 0);
11573
11574 dout(10) << __func__ << " needs_recovery: "
11575 << missing_loc.get_needs_recovery()
11576 << dendl;
11577 dout(10) << __func__ << " missing_loc: "
11578 << missing_loc.get_missing_locs()
11579 << dendl;
11580 int unfound = get_num_unfound();
11581 if (unfound) {
11582 dout(10) << " still have " << unfound << " unfound" << dendl;
11583 return work_in_progress;
11584 }
11585
11586 if (missing.num_missing() > 0) {
11587 // this shouldn't happen!
11588 osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11589 << missing.num_missing() << ": " << missing.get_items();
11590 return work_in_progress;
11591 }
11592
11593 if (needs_recovery()) {
11594 // this shouldn't happen!
11595 // We already checked num_missing() so we must have missing replicas
11596 osd->clog->error() << info.pgid
11597 << " Unexpected Error: recovery ending with missing replicas";
11598 return work_in_progress;
11599 }
11600
11601 if (state_test(PG_STATE_RECOVERING)) {
11602 state_clear(PG_STATE_RECOVERING);
11603 state_clear(PG_STATE_FORCED_RECOVERY);
11604 if (needs_backfill()) {
11605 dout(10) << "recovery done, queuing backfill" << dendl;
11606 queue_peering_event(
11607 CephPeeringEvtRef(
11608 std::make_shared<CephPeeringEvt>(
11609 get_osdmap()->get_epoch(),
11610 get_osdmap()->get_epoch(),
11611 RequestBackfill())));
11612 } else {
11613 dout(10) << "recovery done, no backfill" << dendl;
11614 eio_errors_to_process = false;
11615 state_clear(PG_STATE_FORCED_BACKFILL);
11616 queue_peering_event(
11617 CephPeeringEvtRef(
11618 std::make_shared<CephPeeringEvt>(
11619 get_osdmap()->get_epoch(),
11620 get_osdmap()->get_epoch(),
11621 AllReplicasRecovered())));
11622 }
11623 } else { // backfilling
11624 state_clear(PG_STATE_BACKFILLING);
11625 state_clear(PG_STATE_FORCED_BACKFILL);
11626 state_clear(PG_STATE_FORCED_RECOVERY);
11627 dout(10) << "recovery done, backfill done" << dendl;
11628 eio_errors_to_process = false;
11629 queue_peering_event(
11630 CephPeeringEvtRef(
11631 std::make_shared<CephPeeringEvt>(
11632 get_osdmap()->get_epoch(),
11633 get_osdmap()->get_epoch(),
11634 Backfilled())));
11635 }
11636
11637 return false;
11638 }
11639
11640 /**
11641 * start up to max recovery ops on objects missing from the primary.
11642 * return the number of ops started.
11643 */
11644 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11645 {
11646 assert(is_primary());
11647
11648 const auto &missing = pg_log.get_missing();
11649
11650 dout(10) << "recover_primary recovering " << recovering.size()
11651 << " in pg" << dendl;
11652 dout(10) << "recover_primary " << missing << dendl;
11653 dout(25) << "recover_primary " << missing.get_items() << dendl;
11654
11655 // look at log!
11656 pg_log_entry_t *latest = 0;
11657 unsigned started = 0;
11658 int skipped = 0;
11659
11660 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11661 map<version_t, hobject_t>::const_iterator p =
11662 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11663 while (p != missing.get_rmissing().end()) {
11664 handle.reset_tp_timeout();
11665 hobject_t soid;
11666 version_t v = p->first;
11667
11668 if (pg_log.get_log().objects.count(p->second)) {
11669 latest = pg_log.get_log().objects.find(p->second)->second;
11670 assert(latest->is_update() || latest->is_delete());
11671 soid = latest->soid;
11672 } else {
11673 latest = 0;
11674 soid = p->second;
11675 }
11676 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11677 ++p;
11678
11679 hobject_t head = soid.get_head();
11680
11681 eversion_t need = item.need;
11682
11683 dout(10) << "recover_primary "
11684 << soid << " " << item.need
11685 << (missing.is_missing(soid) ? " (missing)":"")
11686 << (missing.is_missing(head) ? " (missing head)":"")
11687 << (recovering.count(soid) ? " (recovering)":"")
11688 << (recovering.count(head) ? " (recovering head)":"")
11689 << dendl;
11690
11691 if (latest) {
11692 switch (latest->op) {
11693 case pg_log_entry_t::CLONE:
11694 /*
11695 * Handling for this special case removed for now, until we
11696 * can correctly construct an accurate SnapSet from the old
11697 * one.
11698 */
11699 break;
11700
11701 case pg_log_entry_t::LOST_REVERT:
11702 {
11703 if (item.have == latest->reverting_to) {
11704 ObjectContextRef obc = get_object_context(soid, true);
11705
11706 if (obc->obs.oi.version == latest->version) {
11707 // I'm already reverting
11708 dout(10) << " already reverting " << soid << dendl;
11709 } else {
11710 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11711 obc->ondisk_write_lock();
11712 obc->obs.oi.version = latest->version;
11713
11714 ObjectStore::Transaction t;
11715 bufferlist b2;
11716 obc->obs.oi.encode(
11717 b2,
11718 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11719 assert(!pool.info.require_rollback());
11720 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11721
11722 recover_got(soid, latest->version);
11723 missing_loc.add_location(soid, pg_whoami);
11724
11725 ++active_pushes;
11726
11727 osd->store->queue_transaction(osr.get(), std::move(t),
11728 new C_OSD_AppliedRecoveredObject(this, obc),
11729 new C_OSD_CommittedPushedObject(
11730 this,
11731 get_osdmap()->get_epoch(),
11732 info.last_complete),
11733 new C_OSD_OndiskWriteUnlock(obc));
11734 continue;
11735 }
11736 } else {
11737 /*
11738 * Pull the old version of the object. Update missing_loc here to have the location
11739 * of the version we want.
11740 *
11741 * This doesn't use the usual missing_loc paths, but that's okay:
11742 * - if we have it locally, we hit the case above, and go from there.
11743 * - if we don't, we always pass through this case during recovery and set up the location
11744 * properly.
11745 * - this way we don't need to mangle the missing code to be general about needing an old
11746 * version...
11747 */
11748 eversion_t alternate_need = latest->reverting_to;
11749 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11750
11751 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11752 p != peer_missing.end();
11753 ++p)
11754 if (p->second.is_missing(soid, need) &&
11755 p->second.get_items().at(soid).have == alternate_need) {
11756 missing_loc.add_location(soid, p->first);
11757 }
11758 dout(10) << " will pull " << alternate_need << " or " << need
11759 << " from one of " << missing_loc.get_locations(soid)
11760 << dendl;
11761 }
11762 }
11763 break;
11764 }
11765 }
11766
11767 if (!recovering.count(soid)) {
11768 if (recovering.count(head)) {
11769 ++skipped;
11770 } else {
11771 int r = recover_missing(
11772 soid, need, get_recovery_op_priority(), h);
11773 switch (r) {
11774 case PULL_YES:
11775 ++started;
11776 break;
11777 case PULL_OTHER:
11778 ++started;
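      // fall through: recovery had to start with a different object
      // (e.g. the head or snapdir), so this one is also counted as
      // skipped and last_requested is not advanced past it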
11779 case PULL_NONE:
11780 ++skipped;
11781 break;
11782 default:
11783 ceph_abort();
11784 }
11785 if (started >= max)
11786 break;
11787 }
11788 }
11789
11790 // only advance last_requested if we haven't skipped anything
11791 if (!skipped)
11792 pg_log.set_last_requested(v);
11793 }
11794
11795 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11796 return started;
11797 }
11798
11799 bool PrimaryLogPG::primary_error(
11800 const hobject_t& soid, eversion_t v)
11801 {
11802 pg_log.missing_add(soid, v, eversion_t());
11803 pg_log.set_last_requested(0);
11804 missing_loc.remove_location(soid, pg_whoami);
11805 bool uhoh = true;
11806 assert(!actingbackfill.empty());
11807 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11808 i != actingbackfill.end();
11809 ++i) {
11810 if (*i == get_primary()) continue;
11811 pg_shard_t peer = *i;
11812 if (!peer_missing[peer].is_missing(soid, v)) {
11813 missing_loc.add_location(soid, peer);
11814 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11815 << ", there should be a copy on shard " << peer << dendl;
11816 uhoh = false;
11817 }
11818 }
11819 if (uhoh)
11820 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11821 else
11822 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11823 << ", will try copies on " << missing_loc.get_locations(soid);
11824 return uhoh;
11825 }
11826
11827 int PrimaryLogPG::prep_object_replica_deletes(
11828 const hobject_t& soid, eversion_t v,
11829 PGBackend::RecoveryHandle *h)
11830 {
11831 assert(is_primary());
11832 dout(10) << __func__ << ": on " << soid << dendl;
11833
11834 start_recovery_op(soid);
11835 assert(!recovering.count(soid));
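  // A null ObjectContextRef is deliberate: delete recovery needs no object
  // data, and the null ref distinguishes this `recovering` entry from a
  // push.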
11836 recovering.insert(make_pair(soid, ObjectContextRef()));
11837
11838 pgbackend->recover_delete_object(soid, v, h);
11839 return 1;
11840 }
11841
11842 int PrimaryLogPG::prep_object_replica_pushes(
11843 const hobject_t& soid, eversion_t v,
11844 PGBackend::RecoveryHandle *h)
11845 {
11846 assert(is_primary());
11847 dout(10) << __func__ << ": on " << soid << dendl;
11848
11849 // NOTE: we know we will get a valid oloc off of disk here.
11850 ObjectContextRef obc = get_object_context(soid, false);
11851 if (!obc) {
11852 primary_error(soid, v);
11853 return 0;
11854 }
11855
11856 if (!obc->get_recovery_read()) {
11857 dout(20) << "recovery delayed on " << soid
11858 << "; could not get rw_manager lock" << dendl;
11859 return 0;
11860 } else {
11861 dout(20) << "recovery got recovery read lock on " << soid
11862 << dendl;
11863 }
11864
11865 start_recovery_op(soid);
11866 assert(!recovering.count(soid));
11867 recovering.insert(make_pair(soid, obc));
11868
11869 /* We need this in case there is an in-progress write on the object. In fact,
11870 * the only possible write is an update to the xattr due to a lost_revert --
11871 * a client write would be blocked since the object is degraded.
11872 * In almost all cases, therefore, this lock should be uncontended.
11873 */
11874 obc->ondisk_read_lock();
11875 int r = pgbackend->recover_object(
11876 soid,
11877 v,
11878 ObjectContextRef(),
11879 obc, // has snapset context
11880 h);
11881 obc->ondisk_read_unlock();
11882 if (r < 0) {
11883 dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11884 primary_failed(soid);
11885 primary_error(soid, v);
11886 return 0;
11887 }
11888 return 1;
11889 }
11890
11891 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11892 {
11893 dout(10) << __func__ << "(" << max << ")" << dendl;
11894 uint64_t started = 0;
11895
11896 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11897
11898 // this is FAR from an optimal recovery order. pretty lame, really.
11899 assert(!actingbackfill.empty());
11900 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11901 i != actingbackfill.end();
11902 ++i) {
11903 if (*i == get_primary()) continue;
11904 pg_shard_t peer = *i;
11905 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11906 assert(pm != peer_missing.end());
11907 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11908 assert(pi != peer_info.end());
11909 size_t m_sz = pm->second.num_missing();
11910
11911 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11912 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11913
11914 // oldest first!
11915 const pg_missing_t &m(pm->second);
11916 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11917 p != m.get_rmissing().end() && started < max;
11918 ++p) {
11919 handle.reset_tp_timeout();
11920 const hobject_t soid(p->second);
11921
11922 if (missing_loc.is_unfound(soid)) {
11923 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11924 continue;
11925 }
11926
11927 if (soid > pi->second.last_backfill) {
11928 if (!recovering.count(soid)) {
11929 derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11930 derr << __func__ << ": object added to missing set for backfill, but "
11931 << "is not in recovering, error!" << dendl;
11932 ceph_abort();
11933 }
11934 continue;
11935 }
11936
11937 if (recovering.count(soid)) {
11938 dout(10) << __func__ << ": already recovering " << soid << dendl;
11939 continue;
11940 }
11941
11942 if (missing_loc.is_deleted(soid)) {
11943 dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11944 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11945 started += prep_object_replica_deletes(soid, r->second.need, h);
11946 continue;
11947 }
11948
11949 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11950 dout(10) << __func__ << ": " << soid.get_head()
11951 << " still missing on primary" << dendl;
11952 continue;
11953 }
11954
11955 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11956 dout(10) << __func__ << ": " << soid.get_snapdir()
11957 << " still missing on primary" << dendl;
11958 continue;
11959 }
11960
11961 if (pg_log.get_missing().is_missing(soid)) {
11962 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11963 continue;
11964 }
11965
11966 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11967 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11968 started += prep_object_replica_pushes(soid, r->second.need,
11969 h);
11970 }
11971 }
11972
11973 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11974 return started;
11975 }
11976
11977 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11978 {
11979 hobject_t e = hobject_t::get_max();
11980 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11981 i != backfill_targets.end();
11982 ++i) {
11983 pg_shard_t peer = *i;
11984 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11985 peer_backfill_info.find(peer);
11986 assert(iter != peer_backfill_info.end());
11987 if (iter->second.begin < e)
11988 e = iter->second.begin;
11989 }
11990 return e;
11991 }
11992
11993 bool PrimaryLogPG::all_peer_done() const
11994 {
11995 // Primary hasn't got any more objects
11996 assert(backfill_info.empty());
11997
11998 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11999 i != backfill_targets.end();
12000 ++i) {
12001 pg_shard_t bt = *i;
12002 map<pg_shard_t, BackfillInterval>::const_iterator piter =
12003 peer_backfill_info.find(bt);
12004 assert(piter != peer_backfill_info.end());
12005 const BackfillInterval& pbi = piter->second;
12006 // See if peer has more to process
12007 if (!pbi.extends_to_end() || !pbi.empty())
12008 return false;
12009 }
12010 return true;
12011 }
12012
12013 /**
12014 * recover_backfill
12015 *
12016 * Invariants:
12017 *
12018 * backfilled: fully pushed to replica or present in replica's missing set (both
12019 * our copy and theirs).
12020 *
12021 * All objects on a backfill_target in
12022 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
12023 * objects have been actually deleted and all logically-valid objects are replicated.
12024 * There may be PG objects in this interval yet to be backfilled.
12025 *
12026 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
12027 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
12028 *
12029 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
12030 * backfill_info.begin) in PG are backfilled. No deleted objects in this
12031 * interval remain on the backfill target.
12032 *
12033 * For a backfill target, all objects <= peer_info[target].last_backfill
12034 * have been backfilled to the target.
12035 *
12036 * There *MAY* be missing/outdated objects between last_backfill_started and
12037 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
12038 * io created objects since the last scan. For this reason, we call
12039 * update_range() again before continuing backfill.
12040 */
12041 uint64_t PrimaryLogPG::recover_backfill(
12042 uint64_t max,
12043 ThreadPool::TPHandle &handle, bool *work_started)
12044 {
12045 dout(10) << "recover_backfill (" << max << ")"
12046 << " bft=" << backfill_targets
12047 << " last_backfill_started " << last_backfill_started
12048 << (new_backfill ? " new_backfill":"")
12049 << dendl;
12050 assert(!backfill_targets.empty());
12051
12052 // Initialize from prior backfill state
12053 if (new_backfill) {
12054 // on_activate() was called prior to getting here
12055 assert(last_backfill_started == earliest_backfill());
12056 new_backfill = false;
12057
12058 // initialize BackfillIntervals
12059 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12060 i != backfill_targets.end();
12061 ++i) {
12062 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
12063 }
12064 backfill_info.reset(last_backfill_started);
12065
12066 backfills_in_flight.clear();
12067 pending_backfill_updates.clear();
12068 }
12069
12070 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12071 i != backfill_targets.end();
12072 ++i) {
12073 dout(10) << "peer osd." << *i
12074 << " info " << peer_info[*i]
12075 << " interval " << peer_backfill_info[*i].begin
12076 << "-" << peer_backfill_info[*i].end
12077 << " " << peer_backfill_info[*i].objects.size() << " objects"
12078 << dendl;
12079 }
12080
12081 // update our local interval to cope with recent changes
12082 backfill_info.begin = last_backfill_started;
12083 update_range(&backfill_info, handle);
12084
12085 unsigned ops = 0;
12086 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
12087 set<hobject_t> add_to_stat;
12088
12089 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12090 i != backfill_targets.end();
12091 ++i) {
12092 peer_backfill_info[*i].trim_to(
12093 std::max(peer_info[*i].last_backfill, last_backfill_started));
12094 }
12095 backfill_info.trim_to(last_backfill_started);
12096
12097 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
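  // Each iteration below: (1) extend the local interval if it is
  // exhausted, (2) ask any peer whose interval is exhausted to rescan,
  // (3) compare the earliest peer object against backfill_info.begin --
  // an object the peers hold below our begin no longer exists locally and
  // is queued in to_remove; anything else is pushed to (or replaced on)
  // the targets that need it.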
12098 while (ops < max) {
12099 if (backfill_info.begin <= earliest_peer_backfill() &&
12100 !backfill_info.extends_to_end() && backfill_info.empty()) {
12101 hobject_t next = backfill_info.end;
12102 backfill_info.reset(next);
12103 backfill_info.end = hobject_t::get_max();
12104 update_range(&backfill_info, handle);
12105 backfill_info.trim();
12106 }
12107
12108 dout(20) << " my backfill interval " << backfill_info << dendl;
12109
12110 bool sent_scan = false;
12111 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12112 i != backfill_targets.end();
12113 ++i) {
12114 pg_shard_t bt = *i;
12115 BackfillInterval& pbi = peer_backfill_info[bt];
12116
12117 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
12118 if (pbi.begin <= backfill_info.begin &&
12119 !pbi.extends_to_end() && pbi.empty()) {
12120 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
12121 epoch_t e = get_osdmap()->get_epoch();
12122 MOSDPGScan *m = new MOSDPGScan(
12123 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
12124 spg_t(info.pgid.pgid, bt.shard),
12125 pbi.end, hobject_t());
12126 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12127 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
12128 waiting_on_backfill.insert(bt);
12129 sent_scan = true;
12130 }
12131 }
12132
12133 // Count simultaneous scans as a single op and let those complete
12134 if (sent_scan) {
12135 ops++;
12136 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
12137 break;
12138 }
12139
12140 if (backfill_info.empty() && all_peer_done()) {
12141 dout(10) << " reached end for both local and all peers" << dendl;
12142 break;
12143 }
12144
12145 // Get the object within the set of peers to operate on and
12146 // the set of targets to which that object applies.
12147 hobject_t check = earliest_peer_backfill();
12148
12149 if (check < backfill_info.begin) {
12150
12151 set<pg_shard_t> check_targets;
12152 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12153 i != backfill_targets.end();
12154 ++i) {
12155 pg_shard_t bt = *i;
12156 BackfillInterval& pbi = peer_backfill_info[bt];
12157 if (pbi.begin == check)
12158 check_targets.insert(bt);
12159 }
12160 assert(!check_targets.empty());
12161
12162 dout(20) << " BACKFILL removing " << check
12163 << " from peers " << check_targets << dendl;
12164 for (set<pg_shard_t>::iterator i = check_targets.begin();
12165 i != check_targets.end();
12166 ++i) {
12167 pg_shard_t bt = *i;
12168 BackfillInterval& pbi = peer_backfill_info[bt];
12169 assert(pbi.begin == check);
12170
12171 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
12172 pbi.pop_front();
12173 }
12174
12175 /* This requires a bit of explanation. We compare head against
12176 * last_backfill to determine whether to send an operation
12177 * to the replica. A single write operation can touch up to three
12178 * objects: head, the snapdir, and a new clone which sorts closer to
12179 * head than any existing clone. If last_backfill points at a clone,
12180 * the transaction won't be sent and all 3 must lie on the right side
12181 * of the line (i.e., we'll backfill them later). If last_backfill
12182 * points at snapdir, it sorts greater than head, so we send the
12183 * transaction which is correct because all three must lie to the left
12184 * of the line.
12185 *
12186 * If it points at head, we have a bit of an issue. If head actually
12187 * exists, no problem, because any transaction which touches snapdir
12188 * must end up creating it (and deleting head), so sending the
12189 * operation won't pose a problem -- we'll end up having to scan it,
12190 * but it'll end up being the right version so we won't bother to
12191 * rebackfill it. However, if head doesn't exist, any write on head
12192 * will remove snapdir. For a replicated pool, this isn't a problem,
12193 * ENOENT on remove isn't an issue and it's in backfill future anyway.
12194 * It only poses a problem for EC pools, because we never just delete
12195 * an object, we rename it into a rollback object. That operation
12196 * will end up crashing the osd with ENOENT. Tolerating the failure
12197 * wouldn't work either, even if snapdir exists, we'd be creating a
12198 * rollback object past the last_backfill line which wouldn't get
12199 * cleaned up (no rollback objects past the last_backfill line is an
12200 * existing important invariant). Thus, let's avoid the whole issue
12201 * by just not updating last_backfill_started here if head doesn't
12202 * exist and snapdir does. We aren't using up a recovery count here,
12203 * so we're going to recover snapdir immediately anyway. We'll only
12204 * fail "backward" if we fail to get the rw lock and that just means
12205 * we'll re-process this section of the hash space again.
12206 *
12207 * I'm choosing this hack here because the really "correct" answer is
12208 * going to be to unify snapdir and head into a single object (a
12209 * snapdir is really just a confusing way to talk about head existing
12210 * as a whiteout), but doing that is going to be a somewhat larger
12211 * undertaking.
12212 *
12213 * @see http://tracker.ceph.com/issues/17668
12214 */
12215 if (!(check.is_head() &&
12216 backfill_info.begin.is_snapdir() &&
12217 check == backfill_info.begin.get_head()))
12218 last_backfill_started = check;
12219
12220 // Don't increment ops here: deletions are cheap and, unlike
12221 // real recovery_ops, are not replied to, and we can't
12222 // increment ops without requeueing ourselves
12223 // for recovery.
12224 } else {
12225 eversion_t& obj_v = backfill_info.objects.begin()->second;
12226
12227 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12228 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12229 i != backfill_targets.end();
12230 ++i) {
12231 pg_shard_t bt = *i;
12232 BackfillInterval& pbi = peer_backfill_info[bt];
12233 // Find all check peers that have the wrong version
12234 if (check == backfill_info.begin && check == pbi.begin) {
12235 if (pbi.objects.begin()->second != obj_v) {
12236 need_ver_targs.push_back(bt);
12237 } else {
12238 keep_ver_targs.push_back(bt);
12239 }
12240 } else {
12241 pg_info_t& pinfo = peer_info[bt];
12242
12243 // Only include peers whose backfill line we've caught up to;
12244 // otherwise, they only appear to be missing this object
12245 // because their pbi.begin > backfill_info.begin.
12246 if (backfill_info.begin > pinfo.last_backfill)
12247 missing_targs.push_back(bt);
12248 else
12249 skip_targs.push_back(bt);
12250 }
12251 }
12252
12253 if (!keep_ver_targs.empty()) {
12254 // These peers have version obj_v
12255 dout(20) << " BACKFILL keeping " << check
12256 << " with ver " << obj_v
12257 << " on peers " << keep_ver_targs << dendl;
12258 //assert(!waiting_for_degraded_object.count(check));
12259 }
12260 if (!need_ver_targs.empty() || !missing_targs.empty()) {
12261 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12262 assert(obc);
12263 if (obc->get_recovery_read()) {
12264 if (!need_ver_targs.empty()) {
12265 dout(20) << " BACKFILL replacing " << check
12266 << " with ver " << obj_v
12267 << " to peers " << need_ver_targs << dendl;
12268 }
12269 if (!missing_targs.empty()) {
12270 dout(20) << " BACKFILL pushing " << backfill_info.begin
12271 << " with ver " << obj_v
12272 << " to peers " << missing_targs << dendl;
12273 }
12274 vector<pg_shard_t> all_push = need_ver_targs;
12275 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12276
12277 handle.reset_tp_timeout();
12278 int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12279 if (r < 0) {
12280 *work_started = true;
12281 dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12282 break;
12283 }
12284 ops++;
12285 } else {
12286 *work_started = true;
12287 dout(20) << "backfill blocking on " << backfill_info.begin
12288 << "; could not get rw_manager lock" << dendl;
12289 break;
12290 }
12291 }
12292 dout(20) << "need_ver_targs=" << need_ver_targs
12293 << " keep_ver_targs=" << keep_ver_targs << dendl;
12294 dout(20) << "backfill_targets=" << backfill_targets
12295 << " missing_targs=" << missing_targs
12296 << " skip_targs=" << skip_targs << dendl;
12297
12298 last_backfill_started = backfill_info.begin;
12299 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12300 backfill_info.pop_front();
12301 vector<pg_shard_t> check_targets = need_ver_targs;
12302 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12303 for (vector<pg_shard_t>::iterator i = check_targets.begin();
12304 i != check_targets.end();
12305 ++i) {
12306 pg_shard_t bt = *i;
12307 BackfillInterval& pbi = peer_backfill_info[bt];
12308 pbi.pop_front();
12309 }
12310 }
12311 }
12312
12313 hobject_t backfill_pos =
12314 std::min(backfill_info.begin, earliest_peer_backfill());
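
  // backfill_pos is the low-water mark: every object strictly below it has
  // been examined on every target (pushes may still be in flight; those
  // are tracked in backfills_in_flight).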
12315
12316 for (set<hobject_t>::iterator i = add_to_stat.begin();
12317 i != add_to_stat.end();
12318 ++i) {
12319 ObjectContextRef obc = get_object_context(*i, false);
12320 assert(obc);
12321 pg_stat_t stat;
12322 add_object_context_to_pg_stat(obc, &stat);
12323 pending_backfill_updates[*i] = stat;
12324 }
12325 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12326 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12327 for (unsigned i = 0; i < to_remove.size(); ++i) {
12328 handle.reset_tp_timeout();
12329 const hobject_t& oid = to_remove[i].get<0>();
12330 eversion_t v = to_remove[i].get<1>();
12331 pg_shard_t peer = to_remove[i].get<2>();
12332 MOSDPGBackfillRemove *m;
12333 auto it = reqs.find(peer);
12334 if (it != reqs.end()) {
12335 m = it->second;
12336 } else {
12337 m = reqs[peer] = new MOSDPGBackfillRemove(
12338 spg_t(info.pgid.pgid, peer.shard),
12339 get_osdmap()->get_epoch());
12340 }
12341 m->ls.push_back(make_pair(oid, v));
12342
12343 if (oid <= last_backfill_started)
12344 pending_backfill_updates[oid]; // add an empty stat so the accounting loop below can advance past this removal
12345 }
12346 for (auto p : reqs) {
12347 osd->send_message_osd_cluster(p.first.osd, p.second,
12348 get_osdmap()->get_epoch());
12349 }
12350 } else {
12351 // for jewel targets
12352 for (unsigned i = 0; i < to_remove.size(); ++i) {
12353 handle.reset_tp_timeout();
12354
12355 // ordered before any subsequent updates
12356 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12357 to_remove[i].get<2>());
12358
12359 if (to_remove[i].get<0>() <= last_backfill_started)
12360 pending_backfill_updates[to_remove[i].get<0>()]; // add an empty stat so the accounting loop below can advance past this removal
12361 }
12362 }
12363
12364 pgbackend->run_recovery_op(h, get_recovery_op_priority());
12365
12366 dout(5) << "backfill_pos is " << backfill_pos << dendl;
12367 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12368 i != backfills_in_flight.end();
12369 ++i) {
12370 dout(20) << *i << " is still in flight" << dendl;
12371 }
12372
12373 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12374 backfill_pos : *(backfills_in_flight.begin());
12375 hobject_t new_last_backfill = earliest_backfill();
12376 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12377 for (map<hobject_t, pg_stat_t>::iterator i =
12378 pending_backfill_updates.begin();
12379 i != pending_backfill_updates.end() &&
12380 i->first < next_backfill_to_complete;
12381 pending_backfill_updates.erase(i++)) {
12382 dout(20) << " pending_backfill_update " << i->first << dendl;
12383 assert(i->first > new_last_backfill);
12384 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12385 j != backfill_targets.end();
12386 ++j) {
12387 pg_shard_t bt = *j;
12388 pg_info_t& pinfo = peer_info[bt];
12389 // Add stats to all peers that were missing the object
12390 if (i->first > pinfo.last_backfill)
12391 pinfo.stats.add(i->second);
12392 }
12393 new_last_backfill = i->first;
12394 }
12395 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12396
12397 assert(!pending_backfill_updates.empty() ||
12398 new_last_backfill == last_backfill_started);
12399 if (pending_backfill_updates.empty() &&
12400 backfill_pos.is_max()) {
12401 assert(backfills_in_flight.empty());
12402 new_last_backfill = backfill_pos;
12403 last_backfill_started = backfill_pos;
12404 }
12405 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12406
12407 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12408 // all the backfill targets. Otherwise, we will move last_backfill up on
12409 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
12410 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12411 i != backfill_targets.end();
12412 ++i) {
12413 pg_shard_t bt = *i;
12414 pg_info_t& pinfo = peer_info[bt];
12415
12416 if (new_last_backfill > pinfo.last_backfill) {
12417 pinfo.set_last_backfill(new_last_backfill);
12418 epoch_t e = get_osdmap()->get_epoch();
12419 MOSDPGBackfill *m = NULL;
12420 if (pinfo.last_backfill.is_max()) {
12421 m = new MOSDPGBackfill(
12422 MOSDPGBackfill::OP_BACKFILL_FINISH,
12423 e,
12424 last_peering_reset,
12425 spg_t(info.pgid.pgid, bt.shard));
12426 // Use default priority here, must match sub_op priority
12427 /* pinfo.stats might be wrong if we did log-based recovery on the
12428 * backfilled portion in addition to continuing backfill.
12429 */
12430 pinfo.stats = info.stats;
12431 start_recovery_op(hobject_t::get_max());
12432 } else {
12433 m = new MOSDPGBackfill(
12434 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12435 e,
12436 last_peering_reset,
12437 spg_t(info.pgid.pgid, bt.shard));
12438 // Use default priority here, must match sub_op priority
12439 }
12440 m->last_backfill = pinfo.last_backfill;
12441 m->stats = pinfo.stats;
12442 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12443 dout(10) << " peer " << bt
12444 << " num_objects now " << pinfo.stats.stats.sum.num_objects
12445 << " / " << info.stats.stats.sum.num_objects << dendl;
12446 }
12447 }
12448
12449 if (ops)
12450 *work_started = true;
12451 return ops;
12452 }
12453
12454 int PrimaryLogPG::prep_backfill_object_push(
12455 hobject_t oid, eversion_t v,
12456 ObjectContextRef obc,
12457 vector<pg_shard_t> peers,
12458 PGBackend::RecoveryHandle *h)
12459 {
12460 dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12461 assert(!peers.empty());
12462
12463 backfills_in_flight.insert(oid);
12464 for (unsigned int i = 0 ; i < peers.size(); ++i) {
12465 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12466 assert(bpm != peer_missing.end());
12467 bpm->second.add(oid, eversion_t(), eversion_t(), false);
12468 }
12469
12470 assert(!recovering.count(oid));
12471
12472 start_recovery_op(oid);
12473 recovering.insert(make_pair(oid, obc));
12474
12475 // We need to take the read_lock here in order to flush in-progress writes
12476 obc->ondisk_read_lock();
12477 int r = pgbackend->recover_object(
12478 oid,
12479 v,
12480 ObjectContextRef(),
12481 obc,
12482 h);
12483 obc->ondisk_read_unlock();
12484 if (r < 0) {
12485 dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12486 primary_failed(oid);
12487 primary_error(oid, v);
12488 backfills_in_flight.erase(oid);
12489 missing_loc.add_missing(oid, v, eversion_t());
12490 }
12491 return r;
12492 }
12493
12494 void PrimaryLogPG::update_range(
12495 BackfillInterval *bi,
12496 ThreadPool::TPHandle &handle)
12497 {
12498 int local_min = cct->_conf->osd_backfill_scan_min;
12499 int local_max = cct->_conf->osd_backfill_scan_max;
12500
12501 if (bi->version < info.log_tail) {
12502 dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12503 << dendl;
12504 osr->flush();
12505 if (last_update_applied >= info.log_tail) {
12506 bi->version = last_update_applied;
12507 } else {
12508 bi->version = info.last_update;
12509 }
12510 scan_range(local_min, local_max, bi, handle);
12511 }
12512
12513 if (bi->version >= projected_last_update) {
12514 dout(10) << __func__ << ": bi is current " << dendl;
12515 assert(bi->version == projected_last_update);
12516 } else if (bi->version >= info.log_tail) {
12517 if (pg_log.get_log().empty() && projected_log.empty()) {
12518 /* Because we don't move log_tail on split, the log might be
12519 * empty even if log_tail != last_update. However, the only
12520 * way to get here with an empty log is if log_tail is actually
12521 * eversion_t(), because otherwise the entry which changed
12522 * last_update since the last scan would have to be present.
12523 */
12524 assert(bi->version == eversion_t());
12525 return;
12526 }
12527
12528 dout(10) << __func__ << ": bi is old, (" << bi->version
12529 << ") can be updated with log to projected_last_update "
12530 << projected_last_update << dendl;
12531
12532 auto func = [&](const pg_log_entry_t &e) {
12533 dout(10) << __func__ << ": updating from version " << e.version
12534 << dendl;
12535 const hobject_t &soid = e.soid;
12536 if (soid >= bi->begin &&
12537 soid < bi->end) {
12538 if (e.is_update()) {
12539 dout(10) << __func__ << ": " << e.soid << " updated to version "
12540 << e.version << dendl;
12541 bi->objects.erase(e.soid);
12542 bi->objects.insert(
12543 make_pair(
12544 e.soid,
12545 e.version));
12546 } else if (e.is_delete()) {
12547 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12548 bi->objects.erase(e.soid);
12549 }
12550 }
12551 };
12552 dout(10) << "scanning pg log first" << dendl;
12553 pg_log.get_log().scan_log_after(bi->version, func);
12554 dout(10) << "scanning projected log" << dendl;
12555 projected_log.scan_log_after(bi->version, func);
12556 bi->version = projected_last_update;
12557 } else {
12558 assert(0 == "scan_range should have raised bi->version past log_tail");
12559 }
12560 }
12561
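/** scan_range
 *
 * (Re)populate *bi with between min and max objects starting at
 * bi->begin, recording each object's version.  Versions come from the
 * in-memory object context when available (primary only), otherwise
 * from the OI_ATTR attribute on disk.
 */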
12562 void PrimaryLogPG::scan_range(
12563 int min, int max, BackfillInterval *bi,
12564 ThreadPool::TPHandle &handle)
12565 {
12566 assert(is_locked());
12567 dout(10) << "scan_range from " << bi->begin << dendl;
12568 bi->clear_objects();
12569
12570 vector<hobject_t> ls;
12571 ls.reserve(max);
12572 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12573 assert(r >= 0);
12574 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12575 dout(20) << ls << dendl;
12576
12577 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12578 handle.reset_tp_timeout();
12579 ObjectContextRef obc;
12580 if (is_primary())
12581 obc = object_contexts.lookup(*p);
12582 if (obc) {
12583 bi->objects[*p] = obc->obs.oi.version;
12584 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12585 } else {
12586 bufferlist bl;
12587 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12588
12589 /* If the object does not exist here, it must have been removed
12590 * between the collection_list_partial and here. This can happen
12591 * for the first item in the range, which is usually last_backfill.
12592 */
12593 if (r == -ENOENT)
12594 continue;
12595
12596 assert(r >= 0);
12597 object_info_t oi(bl);
12598 bi->objects[*p] = oi.version;
12599 dout(20) << " " << *p << " " << oi.version << dendl;
12600 }
12601 }
12602 }
12603
12604
12605 /** check_local
12606 *
12607 * verifies that stray objects have been deleted
12608 */
12609 void PrimaryLogPG::check_local()
12610 {
12611 dout(10) << __func__ << dendl;
12612
12613 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12614
12615 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12616 return;
12617
12618 // just scan the log.
12619 set<hobject_t> did;
12620 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12621 p != pg_log.get_log().log.rend();
12622 ++p) {
12623 if (did.count(p->soid))
12624 continue;
12625 did.insert(p->soid);
12626
12627 if (p->is_delete() && !is_missing_object(p->soid)) {
12628 dout(10) << " checking " << p->soid
12629 << " at " << p->version << dendl;
12630 struct stat st;
12631 int r = osd->store->stat(
12632 ch,
12633 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12634 &st);
12635 if (r != -ENOENT) {
12636 derr << __func__ << " " << p->soid << " exists, but should have been "
12637 << "deleted" << dendl;
12638 assert(0 == "erroneously present object");
12639 }
12640 } else {
12641 // ignore old(+missing) objects
12642 }
12643 }
12644 }
12645
12646
12647
12648 // ===========================
12649 // hit sets
12650
12651 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12652 {
12653 ostringstream ss;
12654 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12655 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12656 info.pgid.ps(), info.pgid.pool(),
12657 cct->_conf->osd_hit_set_namespace);
12658 dout(20) << __func__ << " " << hoid << dendl;
12659 return hoid;
12660 }
12661
12662 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12663 utime_t end,
12664 bool using_gmt)
12665 {
12666 ostringstream ss;
12667 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12668 if (using_gmt) {
12669 start.gmtime(ss) << "_";
12670 end.gmtime(ss);
12671 } else {
12672 start.localtime(ss) << "_";
12673 end.localtime(ss);
12674 }
12675 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12676 info.pgid.ps(), info.pgid.pool(),
12677 cct->_conf->osd_hit_set_namespace);
12678 dout(20) << __func__ << " " << hoid << dendl;
12679 return hoid;
12680 }
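// The resulting name is roughly (timestamp format is illustrative):
//   hit_set_1.0_archive_2017-08-01 12:00:00.000000_2017-08-01 12:10:00.000000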
12681
12682 void PrimaryLogPG::hit_set_clear()
12683 {
12684 dout(20) << __func__ << dendl;
12685 hit_set.reset();
12686 hit_set_start_stamp = utime_t();
12687 }
12688
12689 void PrimaryLogPG::hit_set_setup()
12690 {
12691 if (!is_active() ||
12692 !is_primary()) {
12693 hit_set_clear();
12694 return;
12695 }
12696
12697 if (is_active() && is_primary() &&
12698 (!pool.info.hit_set_count ||
12699 !pool.info.hit_set_period ||
12700 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12701 hit_set_clear();
12702
12703 // only primary is allowed to remove all the hit set objects
12704 hit_set_remove_all();
12705 return;
12706 }
12707
12708 // FIXME: discard any previous data for now
12709 hit_set_create();
12710
12711 // include any writes we know about from the pg log. this doesn't
12712 // capture reads, but it is better than nothing!
12713 hit_set_apply_log();
12714 }
12715
12716 void PrimaryLogPG::hit_set_remove_all()
12717 {
12718 // If any archives are degraded we skip this
12719 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12720 p != info.hit_set.history.end();
12721 ++p) {
12722 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12723
12724 // Once we hit a degraded object just skip
12725 if (is_degraded_or_backfilling_object(aoid))
12726 return;
12727 if (write_blocked_by_scrub(aoid))
12728 return;
12729 }
12730
12731 if (!info.hit_set.history.empty()) {
12732 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12733 assert(p != info.hit_set.history.rend());
12734 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12735 assert(!is_degraded_or_backfilling_object(oid));
12736 ObjectContextRef obc = get_object_context(oid, false);
12737 assert(obc);
12738
12739 OpContextUPtr ctx = simple_opc_create(obc);
12740 ctx->at_version = get_next_version();
12741 ctx->updated_hset_history = info.hit_set;
12742 utime_t now = ceph_clock_now();
12743 ctx->mtime = now;
12744 hit_set_trim(ctx, 0);
12745 simple_opc_submit(std::move(ctx));
12746 }
12747
12748 info.hit_set = pg_hit_set_history_t();
12749 if (agent_state) {
12750 agent_state->discard_hit_sets();
12751 }
12752 }
12753
12754 void PrimaryLogPG::hit_set_create()
12755 {
12756 utime_t now = ceph_clock_now();
12757 // make a copy of the params to modify
12758 HitSet::Params params(pool.info.hit_set_params);
12759
12760 dout(20) << __func__ << " " << params << dendl;
12761 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12762 BloomHitSet::Params *p =
12763 static_cast<BloomHitSet::Params*>(params.impl.get());
12764
12765 // convert false positive rate so it holds up across the full period
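// (a union of N bloom filters, each with false positive rate p,
// has a combined rate of roughly 1 - (1-p)^N ~= N*p, so dividing
// by hit_set_count keeps the aggregate rate near the configured
// value)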
12766 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12767 if (p->get_fpp() <= 0.0)
12768 p->set_fpp(.01); // fpp cannot be zero!
12769
12770 // if we don't have a specified size, estimate the target size
12771 // based on the previous bin!
12772 if (p->target_size == 0 && hit_set) {
12773 utime_t dur = now - hit_set_start_stamp;
12774 unsigned unique = hit_set->approx_unique_insert_count();
12775 dout(20) << __func__ << " previous set had approx " << unique
12776 << " unique items over " << dur << " seconds" << dendl;
12777 p->target_size = (double)unique * (double)pool.info.hit_set_period
12778 / (double)dur;
12779 }
12780 if (p->target_size <
12781 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12782 p->target_size = cct->_conf->osd_hit_set_min_size;
12783
12784 if (p->target_size
12785 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12786 p->target_size = cct->_conf->osd_hit_set_max_size;
12787
12788 p->seed = now.sec();
12789
12790 dout(10) << __func__ << " target_size " << p->target_size
12791 << " fpp " << p->get_fpp() << dendl;
12792 }
12793 hit_set.reset(new HitSet(params));
12794 hit_set_start_stamp = now;
12795 }
12796
12797 /**
12798 * apply log entries to set
12799 *
12800 * this would only happen after peering, to at least capture writes
12801 * during an interval that was potentially lost.
12802 */
12803 bool PrimaryLogPG::hit_set_apply_log()
12804 {
12805 if (!hit_set)
12806 return false;
12807
12808 eversion_t to = info.last_update;
12809 eversion_t from = info.hit_set.current_last_update;
12810 if (to <= from) {
12811 dout(20) << __func__ << " no update" << dendl;
12812 return false;
12813 }
12814
12815 dout(20) << __func__ << " " << from << " .. " << to << dendl;
12816 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12817 while (p != pg_log.get_log().log.rend() && p->version > to)
12818 ++p;
12819 while (p != pg_log.get_log().log.rend() && p->version > from) {
12820 hit_set->insert(p->soid);
12821 ++p;
12822 }
12823
12824 return true;
12825 }
12826
12827 void PrimaryLogPG::hit_set_persist()
12828 {
12829 dout(10) << __func__ << dendl;
12830 bufferlist bl;
12831 unsigned max = pool.info.hit_set_count;
12832
12833 utime_t now = ceph_clock_now();
12834 hobject_t oid;
12835
12836 // If any archives are degraded we skip this persist request
12837 // account for the additional entry being added below
12838 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12839 p != info.hit_set.history.end();
12840 ++p) {
12841 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12842
12843 // Once we hit a degraded object just skip further trim
12844 if (is_degraded_or_backfilling_object(aoid))
12845 return;
12846 if (write_blocked_by_scrub(aoid))
12847 return;
12848 }
12849
12850 // If backfill is in progress and we could possibly overlap with the
12851 // hit_set_* objects, back off. Since these all have
12852 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12853 // look just at that. This is necessary because our transactions
12854 // may include a modify of the new hit_set *and* a delete of the
12855 // old one, and this may span the backfill boundary.
12856 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12857 p != backfill_targets.end();
12858 ++p) {
12859 assert(peer_info.count(*p));
12860 const pg_info_t& pi = peer_info[*p];
12861 if (pi.last_backfill == hobject_t() ||
12862 pi.last_backfill.get_hash() == info.pgid.ps()) {
12863 dout(10) << __func__ << " backfill target osd." << *p
12864 << " last_backfill has not progressed past pgid ps"
12865 << dendl;
12866 return;
12867 }
12868 }
12869
12870
12871 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12872 new_hset.begin = hit_set_start_stamp;
12873 new_hset.end = now;
12874 oid = get_hit_set_archive_object(
12875 new_hset.begin,
12876 new_hset.end,
12877 new_hset.using_gmt);
12878
12879 // If the current object is degraded we skip this persist request
12880 if (write_blocked_by_scrub(oid))
12881 return;
12882
12883 hit_set->seal();
12884 ::encode(*hit_set, bl);
12885 dout(20) << __func__ << " archive " << oid << dendl;
12886
12887 if (agent_state) {
12888 agent_state->add_hit_set(new_hset.begin, hit_set);
12889 uint32_t size = agent_state->hit_set_map.size();
12890 if (size >= pool.info.hit_set_count) {
12891 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1 : 0;
12892 }
12893 hit_set_in_memory_trim(size);
12894 }
12895
12896 ObjectContextRef obc = get_object_context(oid, true);
12897 OpContextUPtr ctx = simple_opc_create(obc);
12898
12899 ctx->at_version = get_next_version();
12900 ctx->updated_hset_history = info.hit_set;
12901 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12902
12903 updated_hit_set_hist.current_last_update = info.last_update;
12904 new_hset.version = ctx->at_version;
12905
12906 updated_hit_set_hist.history.push_back(new_hset);
12907 hit_set_create();
12908
12909 // fabricate an object_info_t and SnapSet
12910 obc->obs.oi.version = ctx->at_version;
12911 obc->obs.oi.mtime = now;
12912 obc->obs.oi.size = bl.length();
12913 obc->obs.exists = true;
12914 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12915
12916 ctx->new_obs = obc->obs;
12917
12918 obc->ssc->snapset.head_exists = true;
12919 ctx->new_snapset = obc->ssc->snapset;
12920
12921 ctx->delta_stats.num_objects++;
12922 ctx->delta_stats.num_objects_hit_set_archive++;
12923 ctx->delta_stats.num_bytes += bl.length();
12924 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12925
12926 bufferlist bss;
12927 ::encode(ctx->new_snapset, bss);
12928 bufferlist boi(sizeof(ctx->new_obs.oi));
12929 ::encode(ctx->new_obs.oi, boi,
12930 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12931
12932 ctx->op_t->create(oid);
12933 if (bl.length()) {
12934 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12935 }
12936 map <string, bufferlist> attrs;
12937 attrs[OI_ATTR].claim(boi);
12938 attrs[SS_ATTR].claim(bss);
12939 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12940 ctx->log.push_back(
12941 pg_log_entry_t(
12942 pg_log_entry_t::MODIFY,
12943 oid,
12944 ctx->at_version,
12945 eversion_t(),
12946 0,
12947 osd_reqid_t(),
12948 ctx->mtime,
12949 0)
12950 );
12951
12952 hit_set_trim(ctx, max);
12953
12954 simple_opc_submit(std::move(ctx));
12955 }
12956
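// Trim the hit set history down to at most max archive objects,
// removing the oldest archives and recording a DELETE log entry for
// each in the provided op context.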
12957 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12958 {
12959 assert(ctx->updated_hset_history);
12960 pg_hit_set_history_t &updated_hit_set_hist =
12961 *(ctx->updated_hset_history);
12962 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12963 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12964 assert(p != updated_hit_set_hist.history.end());
12965 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12966
12967 assert(!is_degraded_or_backfilling_object(oid));
12968
12969 dout(20) << __func__ << " removing " << oid << dendl;
12970 ++ctx->at_version.version;
12971 ctx->log.push_back(
12972 pg_log_entry_t(pg_log_entry_t::DELETE,
12973 oid,
12974 ctx->at_version,
12975 p->version,
12976 0,
12977 osd_reqid_t(),
12978 ctx->mtime,
12979 0));
12980
12981 ctx->op_t->remove(oid);
12982 updated_hit_set_hist.history.pop_front();
12983
12984 ObjectContextRef obc = get_object_context(oid, false);
12985 assert(obc);
12986 --ctx->delta_stats.num_objects;
12987 --ctx->delta_stats.num_objects_hit_set_archive;
12988 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12989 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12990 }
12991 }
12992
12993 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12994 {
12995 while (agent_state->hit_set_map.size() > max_in_memory) {
12996 agent_state->remove_oldest_hit_set();
12997 }
12998 }
12999
13000
13001 // =======================================
13002 // cache agent
13003
13004 void PrimaryLogPG::agent_setup()
13005 {
13006 assert(is_locked());
13007 if (!is_active() ||
13008 !is_primary() ||
13009 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
13010 pool.info.tier_of < 0 ||
13011 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
13012 agent_clear();
13013 return;
13014 }
13015 if (!agent_state) {
13016 agent_state.reset(new TierAgentState);
13017
13018 // choose random starting position
13019 agent_state->position = hobject_t();
13020 agent_state->position.pool = info.pgid.pool();
13021 agent_state->position.set_hash(pool.info.get_random_pg_position(
13022 info.pgid.pgid,
13023 rand()));
13024 agent_state->start = agent_state->position;
13025
13026 dout(10) << __func__ << " allocated new state, position "
13027 << agent_state->position << dendl;
13028 } else {
13029 dout(10) << __func__ << " keeping existing state" << dendl;
13030 }
13031
13032 if (info.stats.stats_invalid) {
13033 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
13034 }
13035
13036 agent_choose_mode();
13037 }
13038
13039 void PrimaryLogPG::agent_clear()
13040 {
13041 agent_stop();
13042 agent_state.reset(NULL);
13043 }
13044
13045 // Return false if no objects were operated on since the start of the object hash space
13046 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
13047 {
13048 lock();
13049 if (!agent_state) {
13050 dout(10) << __func__ << " no agent state, stopping" << dendl;
13051 unlock();
13052 return true;
13053 }
13054
13055 assert(!deleting);
13056
13057 if (agent_state->is_idle()) {
13058 dout(10) << __func__ << " idle, stopping" << dendl;
13059 unlock();
13060 return true;
13061 }
13062
13063 osd->logger->inc(l_osd_agent_wake);
13064
13065 dout(10) << __func__
13066 << " max " << start_max
13067 << ", flush " << agent_state->get_flush_mode_name()
13068 << ", evict " << agent_state->get_evict_mode_name()
13069 << ", pos " << agent_state->position
13070 << dendl;
13071 assert(is_primary());
13072 assert(is_active());
13073
13074 agent_load_hit_sets();
13075
13076 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13077 assert(base_pool);
13078
13079 int ls_min = 1;
13080 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
13081
13082 // list some objects. this conveniently lists clones (oldest to
13083 // newest) before heads... the same order we want to flush in.
13084 //
13085 // NOTE: do not flush the Sequencer. we will assume that the
13086 // listing we get back is imprecise.
13087 vector<hobject_t> ls;
13088 hobject_t next;
13089 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
13090 &ls, &next);
13091 assert(r >= 0);
13092 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
13093 int started = 0;
13094 for (vector<hobject_t>::iterator p = ls.begin();
13095 p != ls.end();
13096 ++p) {
13097 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
13098 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
13099 osd->logger->inc(l_osd_agent_skip);
13100 continue;
13101 }
13102 if (is_degraded_or_backfilling_object(*p)) {
13103 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
13104 osd->logger->inc(l_osd_agent_skip);
13105 continue;
13106 }
13107 if (is_missing_object(p->get_head())) {
13108 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
13109 osd->logger->inc(l_osd_agent_skip);
13110 continue;
13111 }
13112 ObjectContextRef obc = get_object_context(*p, false, NULL);
13113 if (!obc) {
13114 // we didn't flush; we may miss something here.
13115 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
13116 osd->logger->inc(l_osd_agent_skip);
13117 continue;
13118 }
13119 if (!obc->obs.exists) {
13120 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
13121 osd->logger->inc(l_osd_agent_skip);
13122 continue;
13123 }
13124 if (range_intersects_scrub(obc->obs.oi.soid,
13125 obc->obs.oi.soid.get_head())) {
13126 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
13127 osd->logger->inc(l_osd_agent_skip);
13128 continue;
13129 }
13130 if (obc->is_blocked()) {
13131 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13132 osd->logger->inc(l_osd_agent_skip);
13133 continue;
13134 }
13135 if (obc->is_request_pending()) {
13136 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
13137 osd->logger->inc(l_osd_agent_skip);
13138 continue;
13139 }
13140
13141 // be careful flushing omap to an EC pool.
13142 if (!base_pool->supports_omap() &&
13143 obc->obs.oi.is_omap()) {
13144 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
13145 osd->logger->inc(l_osd_agent_skip);
13146 continue;
13147 }
13148
13149 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
13150 agent_maybe_evict(obc, false))
13151 ++started;
13152 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
13153 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
13154 ++started;
13155 --agent_flush_quota;
13156 }
13157 if (started >= start_max) {
13158 // If finishing early, set "next" to the next object
13159 if (++p != ls.end())
13160 next = *p;
13161 break;
13162 }
13163 }
13164
13165 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
13166 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
13167 agent_state->hist_age = 0;
13168 agent_state->temp_hist.decay();
13169 }
13170
13171 // Total objects operated on so far
13172 int total_started = agent_state->started + started;
13173 bool need_delay = false;
13174
13175 dout(20) << __func__ << " start pos " << agent_state->position
13176 << " next start pos " << next
13177 << " started " << total_started << dendl;
13178
13179 // See if we've made a full pass over the object hash space
13180 // This might check at most ls_max objects a second time to notice that
13181 // we've checked every object at least once.
13182 if (agent_state->position < agent_state->start &&
13183 next >= agent_state->start) {
13184 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13185 if (total_started == 0)
13186 need_delay = true;
13187 else
13188 total_started = 0;
13189 agent_state->start = next;
13190 }
13191 agent_state->started = total_started;
13192
13193 // If we reached the end of the hash space, start over from the beginning
13194 if (next.is_max())
13195 agent_state->position = hobject_t();
13196 else
13197 agent_state->position = next;
13198
13199 // Discard old in memory HitSets
13200 hit_set_in_memory_trim(pool.info.hit_set_count);
13201
13202 if (need_delay) {
13203 assert(agent_state->delaying == false);
13204 agent_delay();
13205 unlock();
13206 return false;
13207 }
13208 agent_choose_mode();
13209 unlock();
13210 return true;
13211 }
13212
13213 void PrimaryLogPG::agent_load_hit_sets()
13214 {
13215 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13216 return;
13217 }
13218
13219 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13220 dout(10) << __func__ << dendl;
13221 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13222 p != info.hit_set.history.end(); ++p) {
13223 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13224 dout(10) << __func__ << " loading " << p->begin << "-"
13225 << p->end << dendl;
13226 if (!pool.info.is_replicated()) {
13227 // FIXME: EC not supported here yet
13228 derr << __func__ << " on non-replicated pool" << dendl;
13229 break;
13230 }
13231
13232 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13233 if (is_unreadable_object(oid)) {
13234 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13235 break;
13236 }
13237
13238 ObjectContextRef obc = get_object_context(oid, false);
13239 if (!obc) {
13240 derr << __func__ << ": could not load hitset " << oid << dendl;
13241 break;
13242 }
13243
13244 bufferlist bl;
13245 {
13246 obc->ondisk_read_lock();
13247 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13248 assert(r >= 0);
13249 obc->ondisk_read_unlock();
13250 }
13251 HitSetRef hs(new HitSet);
13252 bufferlist::iterator pbl = bl.begin();
13253 ::decode(*hs, pbl);
13254 agent_state->add_hit_set(p->begin.sec(), hs);
13255 }
13256 }
13257 }
13258 }
13259
13260 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13261 {
13262 if (!obc->obs.oi.is_dirty()) {
13263 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13264 osd->logger->inc(l_osd_agent_skip);
13265 return false;
13266 }
13267 if (obc->obs.oi.is_cache_pinned()) {
13268 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13269 osd->logger->inc(l_osd_agent_skip);
13270 return false;
13271 }
13272
13273 utime_t now = ceph_clock_now();
13274 utime_t ob_local_mtime;
13275 if (obc->obs.oi.local_mtime != utime_t()) {
13276 ob_local_mtime = obc->obs.oi.local_mtime;
13277 } else {
13278 ob_local_mtime = obc->obs.oi.mtime;
13279 }
13280 bool evict_mode_full =
13281 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13282 if (!evict_mode_full &&
13283 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
13284 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13285 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13286 osd->logger->inc(l_osd_agent_skip);
13287 return false;
13288 }
13289
13290 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13291 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13292 osd->logger->inc(l_osd_agent_skip);
13293 return false;
13294 }
13295
13296 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13297
13298 // FIXME: flush anything dirty, regardless of what distribution of
13299 // ages we expect.
13300
13301 hobject_t oid = obc->obs.oi.soid;
13302 osd->agent_start_op(oid);
13303 // no need to capture a pg ref, can't outlive fop or ctx
13304 std::function<void()> on_flush = [this, oid]() {
13305 osd->agent_finish_op(oid);
13306 };
13307
13308 int result = start_flush(
13309 OpRequestRef(), obc, false, NULL,
13310 on_flush);
13311 if (result != -EINPROGRESS) {
13312 on_flush();
13313 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13314 << " with " << result << dendl;
13315 osd->logger->inc(l_osd_agent_skip);
13316 return false;
13317 }
13318
13319 osd->logger->inc(l_osd_agent_flush);
13320 return true;
13321 }
13322
13323 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13324 {
13325 const hobject_t& soid = obc->obs.oi.soid;
13326 if (!after_flush && obc->obs.oi.is_dirty()) {
13327 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13328 return false;
13329 }
13330 if (!obc->obs.oi.watchers.empty()) {
13331 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13332 return false;
13333 }
13334 if (obc->is_blocked()) {
13335 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13336 return false;
13337 }
13338 if (obc->obs.oi.is_cache_pinned()) {
13339 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13340 return false;
13341 }
13342
13343 if (soid.snap == CEPH_NOSNAP) {
13344 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13345 if (result < 0) {
13346 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13347 return false;
13348 }
13349 }
13350
13351 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13352 // is this object older than cache_min_evict_age?
13353 utime_t now = ceph_clock_now();
13354 utime_t ob_local_mtime;
13355 if (obc->obs.oi.local_mtime != utime_t()) {
13356 ob_local_mtime = obc->obs.oi.local_mtime;
13357 } else {
13358 ob_local_mtime = obc->obs.oi.mtime;
13359 }
13360 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13361 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13362 osd->logger->inc(l_osd_agent_skip);
13363 return false;
13364 }
13365 // is this object old and/or cold enough?
13366 int temp = 0;
13367 uint64_t temp_upper = 0, temp_lower = 0;
13368 if (hit_set)
13369 agent_estimate_temp(soid, &temp);
13370 agent_state->temp_hist.add(temp);
13371 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13372
13373 dout(20) << __func__
13374 << " temp " << temp
13375 << " pos " << temp_lower << "-" << temp_upper
13376 << ", evict_effort " << agent_state->evict_effort
13377 << dendl;
13378 dout(30) << "agent_state:\n";
13379 Formatter *f = Formatter::create("");
13380 f->open_object_section("agent_state");
13381 agent_state->dump(f);
13382 f->close_section();
13383 f->flush(*_dout);
13384 delete f;
13385 *_dout << dendl;
13386
13387 if (1000000 - temp_upper >= agent_state->evict_effort)
13388 return false;
13389 }
13390
13391 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13392 OpContextUPtr ctx = simple_opc_create(obc);
13393
13394 if (!ctx->lock_manager.get_lock_type(
13395 ObjectContext::RWState::RWWRITE,
13396 obc->obs.oi.soid,
13397 obc,
13398 OpRequestRef())) {
13399 close_op_ctx(ctx.release());
13400 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13401 return false;
13402 }
13403
13404 osd->agent_start_evict_op();
13405 ctx->register_on_finish(
13406 [this]() {
13407 osd->agent_finish_evict_op();
13408 });
13409
13410 ctx->at_version = get_next_version();
13411 assert(ctx->new_obs.exists);
13412 int r = _delete_oid(ctx.get(), true, false);
13413 if (obc->obs.oi.is_omap())
13414 ctx->delta_stats.num_objects_omap--;
13415 ctx->delta_stats.num_evict++;
13416 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13417 if (obc->obs.oi.is_dirty())
13418 --ctx->delta_stats.num_objects_dirty;
13419 assert(r == 0);
13420 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13421 simple_opc_submit(std::move(ctx));
13422 osd->logger->inc(l_osd_tier_evict);
13423 osd->logger->inc(l_osd_agent_evict);
13424 return true;
13425 }
13426
13427 void PrimaryLogPG::agent_stop()
13428 {
13429 dout(20) << __func__ << dendl;
13430 if (agent_state && !agent_state->is_idle()) {
13431 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13432 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13433 osd->agent_disable_pg(this, agent_state->evict_effort);
13434 }
13435 }
13436
13437 void PrimaryLogPG::agent_delay()
13438 {
13439 dout(20) << __func__ << dendl;
13440 if (agent_state && !agent_state->is_idle()) {
13441 assert(agent_state->delaying == false);
13442 agent_state->delaying = true;
13443 osd->agent_disable_pg(this, agent_state->evict_effort);
13444 }
13445 }
13446
13447 void PrimaryLogPG::agent_choose_mode_restart()
13448 {
13449 dout(20) << __func__ << dendl;
13450 lock();
13451 if (agent_state && agent_state->delaying) {
13452 agent_state->delaying = false;
13453 agent_choose_mode(true);
13454 }
13455 unlock();
13456 }
13457
13458 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13459 {
13460 bool requeued = false;
13461 // Let delay play out
13462 if (agent_state->delaying) {
13463 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
13464 return requeued;
13465 }
13466
13467 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13468 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13469 unsigned evict_effort = 0;
13470
13471 if (info.stats.stats_invalid) {
13472 // idle; stats can't be trusted until we scrub.
13473 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13474 goto skip_calc;
13475 }
13476
13477 {
13478 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13479 assert(divisor > 0);
13480
13481 // adjust (effective) user objects down based on the number
13482 // of HitSet objects, which should not count toward our total since
13483 // they cannot be flushed.
13484 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13485
13486 // also exclude omap objects if ec backing pool
13487 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13488 assert(base_pool);
13489 if (!base_pool->supports_omap())
13490 unflushable += info.stats.stats.sum.num_objects_omap;
13491
13492 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13493 if (num_user_objects > unflushable)
13494 num_user_objects -= unflushable;
13495 else
13496 num_user_objects = 0;
13497
13498 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13499 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13500 num_user_bytes -= unflushable_bytes;
13501 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13502 num_user_bytes += num_overhead_bytes;
13503
13504 // also reduce the num_dirty by num_objects_omap
13505 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13506 if (!base_pool->supports_omap()) {
13507 if (num_dirty > info.stats.stats.sum.num_objects_omap)
13508 num_dirty -= info.stats.stats.sum.num_objects_omap;
13509 else
13510 num_dirty = 0;
13511 }
13512
13513 dout(10) << __func__
13514 << " flush_mode: "
13515 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13516 << " evict_mode: "
13517 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13518 << " num_objects: " << info.stats.stats.sum.num_objects
13519 << " num_bytes: " << info.stats.stats.sum.num_bytes
13520 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13521 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13522 << " num_dirty: " << num_dirty
13523 << " num_user_objects: " << num_user_objects
13524 << " num_user_bytes: " << num_user_bytes
13525 << " num_overhead_bytes: " << num_overhead_bytes
13526 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13527 << " pool.info.target_max_objects: " << pool.info.target_max_objects
13528 << dendl;
13529
13530 // get dirty, full ratios
13531 uint64_t dirty_micro = 0;
13532 uint64_t full_micro = 0;
13533 if (pool.info.target_max_bytes && num_user_objects > 0) {
13534 uint64_t avg_size = num_user_bytes / num_user_objects;
13535 dirty_micro =
13536 num_dirty * avg_size * 1000000 /
13537 MAX(pool.info.target_max_bytes / divisor, 1);
13538 full_micro =
13539 num_user_objects * avg_size * 1000000 /
13540 MAX(pool.info.target_max_bytes / divisor, 1);
13541 }
13542 if (pool.info.target_max_objects > 0) {
13543 uint64_t dirty_objects_micro =
13544 num_dirty * 1000000 /
13545 MAX(pool.info.target_max_objects / divisor, 1);
13546 if (dirty_objects_micro > dirty_micro)
13547 dirty_micro = dirty_objects_micro;
13548 uint64_t full_objects_micro =
13549 num_user_objects * 1000000 /
13550 MAX(pool.info.target_max_objects / divisor, 1);
13551 if (full_objects_micro > full_micro)
13552 full_micro = full_objects_micro;
13553 }
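// e.g., with hypothetical numbers: target_max_bytes=1TB over a
// divisor of 256 PGs gives ~4GB per PG; if this PG holds 3GB of
// user data of which 1GB is dirty, then full_micro ~= 750000
// (75%) and dirty_micro ~= 250000 (25%).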
13554 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13555 << " full " << ((float)full_micro / 1000000.0)
13556 << dendl;
13557
13558 // flush mode
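// (the slop term implements hysteresis: while idle we raise the
// thresholds so we don't start flushing on a slight excursion, and
// once active we lower them so we don't immediately stop again)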
13559 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13560 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13561 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13562 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13563 flush_target += flush_slop;
13564 flush_high_target += flush_slop;
13565 } else {
13566 flush_target -= MIN(flush_target, flush_slop);
13567 flush_high_target -= MIN(flush_high_target, flush_slop);
13568 }
13569
13570 if (dirty_micro > flush_high_target) {
13571 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13572 } else if (dirty_micro > flush_target) {
13573 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13574 }
13575
13576 // evict mode
13577 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13578 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13579 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13580 evict_target += evict_slop;
13581 else
13582 evict_target -= MIN(evict_target, evict_slop);
13583
13584 if (full_micro > 1000000) {
13585 // evict anything clean
13586 evict_mode = TierAgentState::EVICT_MODE_FULL;
13587 evict_effort = 1000000;
13588 } else if (full_micro > evict_target) {
13589 // set effort in [0..1] range based on where we are between evict_target and 100% full
13590 evict_mode = TierAgentState::EVICT_MODE_SOME;
13591 uint64_t over = full_micro - evict_target;
13592 uint64_t span = 1000000 - evict_target;
13593 evict_effort = MAX(over * 1000000 / span,
13594 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13595
13596 // quantize effort to avoid too much reordering in the agent_queue.
13597 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13598 assert(inc > 0);
13599 uint64_t was = evict_effort;
13600 evict_effort -= evict_effort % inc;
13601 if (evict_effort < inc)
13602 evict_effort = inc;
13603 assert(evict_effort >= inc && evict_effort <= 1000000);
13604 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13605 }
13606 }
13607
13608 skip_calc:
13609 bool old_idle = agent_state->is_idle();
13610 if (flush_mode != agent_state->flush_mode) {
13611 dout(5) << __func__ << " flush_mode "
13612 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13613 << " -> "
13614 << TierAgentState::get_flush_mode_name(flush_mode)
13615 << dendl;
13616 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13617 osd->agent_inc_high_count();
13618 info.stats.stats.sum.num_flush_mode_high = 1;
13619 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13620 info.stats.stats.sum.num_flush_mode_low = 1;
13621 }
13622 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13623 osd->agent_dec_high_count();
13624 info.stats.stats.sum.num_flush_mode_high = 0;
13625 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13626 info.stats.stats.sum.num_flush_mode_low = 0;
13627 }
13628 agent_state->flush_mode = flush_mode;
13629 }
13630 if (evict_mode != agent_state->evict_mode) {
13631 dout(5) << __func__ << " evict_mode "
13632 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13633 << " -> "
13634 << TierAgentState::get_evict_mode_name(evict_mode)
13635 << dendl;
13636 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13637 is_active()) {
13638 if (op)
13639 requeue_op(op);
13640 requeue_ops(waiting_for_flush);
13641 requeue_ops(waiting_for_active);
13642 requeue_ops(waiting_for_scrub);
13643 requeue_ops(waiting_for_cache_not_full);
13644 objects_blocked_on_cache_full.clear();
13645 requeued = true;
13646 }
13647 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13648 info.stats.stats.sum.num_evict_mode_some = 1;
13649 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13650 info.stats.stats.sum.num_evict_mode_full = 1;
13651 }
13652 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13653 info.stats.stats.sum.num_evict_mode_some = 0;
13654 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13655 info.stats.stats.sum.num_evict_mode_full = 0;
13656 }
13657 agent_state->evict_mode = evict_mode;
13658 }
13659 uint64_t old_effort = agent_state->evict_effort;
13660 if (evict_effort != agent_state->evict_effort) {
13661 dout(5) << __func__ << " evict_effort "
13662 << ((float)agent_state->evict_effort / 1000000.0)
13663 << " -> "
13664 << ((float)evict_effort / 1000000.0)
13665 << dendl;
13666 agent_state->evict_effort = evict_effort;
13667 }
13668
13669 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13670 // (including flush). This is probably fine (they should be
13671 // correlated) but it is not precisely correct.
13672 if (agent_state->is_idle()) {
13673 if (!restart && !old_idle) {
13674 osd->agent_disable_pg(this, old_effort);
13675 }
13676 } else {
13677 if (restart || old_idle) {
13678 osd->agent_enable_pg(this, agent_state->evict_effort);
13679 } else if (old_effort != agent_state->evict_effort) {
13680 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13681 }
13682 }
13683 return requeued;
13684 }
13685
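// Estimate an object's "temperature": 1000000 if it is in the current
// (open) hit set, plus a configurable per-age grade for each archived
// hit set (newest first) that contains it, consulting at most
// hit_set_search_last_n hits.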
13686 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13687 {
13688 assert(hit_set);
13689 assert(temp);
13690 *temp = 0;
13691 if (hit_set->contains(oid))
13692 *temp = 1000000;
13693 unsigned i = 0;
13694 int last_n = pool.info.hit_set_search_last_n;
13695 for (map<time_t,HitSetRef>::reverse_iterator p =
13696 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13697 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13698 if (p->second->contains(oid)) {
13699 *temp += pool.info.get_grade(i);
13700 --last_n;
13701 }
13702 }
13703 }
13704
13705 // Dup op detection
13706
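// Both checks below rely on repop_queue being in increasing version
// order: we can stop scanning at the first in-flight repop whose
// version is past v, since anything affecting v must sort before it.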
13707 bool PrimaryLogPG::already_complete(eversion_t v)
13708 {
13709 dout(20) << __func__ << ": " << v << dendl;
13710 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13711 !i.end();
13712 ++i) {
13713 dout(20) << __func__ << ": " << **i << dendl;
13714 // skip copy from temp object ops
13715 if ((*i)->v == eversion_t()) {
13716 dout(20) << __func__ << ": " << **i
13717 << " version is empty" << dendl;
13718 continue;
13719 }
13720 if ((*i)->v > v) {
13721 dout(20) << __func__ << ": " << **i
13722 << " (*i)->v past v" << dendl;
13723 break;
13724 }
13725 if (!(*i)->all_committed) {
13726 dout(20) << __func__ << ": " << **i
13727 << " not committed, returning false"
13728 << dendl;
13729 return false;
13730 }
13731 }
13732 dout(20) << __func__ << ": returning true" << dendl;
13733 return true;
13734 }
13735
13736 bool PrimaryLogPG::already_ack(eversion_t v)
13737 {
13738 dout(20) << __func__ << ": " << v << dendl;
13739 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13740 !i.end();
13741 ++i) {
13742 // skip copy from temp object ops
13743 if ((*i)->v == eversion_t()) {
13744 dout(20) << __func__ << ": " << **i
13745 << " version is empty" << dendl;
13746 continue;
13747 }
13748 if ((*i)->v > v) {
13749 dout(20) << __func__ << ": " << **i
13750 << " (*i)->v past v" << dendl;
13751 break;
13752 }
13753 if (!(*i)->all_applied) {
13754 dout(20) << __func__ << ": " << **i
13755 << " not applied, returning false"
13756 << dendl;
13757 return false;
13758 }
13759 }
13760 dout(20) << __func__ << ": returning true" << dendl;
13761 return true;
13762 }
13763
13764
13765 // ==========================================================================================
13766 // SCRUB
13767
13768
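// Walk the cached object contexts in [begin, end); if any object in
// the range is blocked (e.g. by an in-flight flush or promote), ask
// to have the scrub requeued when it unblocks and report the range
// as unavailable.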
13769 bool PrimaryLogPG::_range_available_for_scrub(
13770 const hobject_t &begin, const hobject_t &end)
13771 {
13772 pair<hobject_t, ObjectContextRef> next;
13773 next.second = object_contexts.lookup(begin);
13774 next.first = begin;
13775 bool more = true;
13776 while (more && next.first < end) {
13777 if (next.second && next.second->is_blocked()) {
13778 next.second->requeue_scrub_on_unblock = true;
13779 dout(10) << __func__ << ": scrub delayed, "
13780 << next.first << " is blocked"
13781 << dendl;
13782 return false;
13783 }
13784 more = object_contexts.get_next(next.first, &next);
13785 }
13786 return true;
13787 }
13788
13789 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13790 const vector<snapid_t>::reverse_iterator &curclone) {
13791 return snapset && curclone != snapset.get().clones.rend();
13792 }
13793
13794 void PrimaryLogPG::log_missing(unsigned missing,
13795 const boost::optional<hobject_t> &head,
13796 LogChannelRef clog,
13797 const spg_t &pgid,
13798 const char *func,
13799 const char *mode,
13800 bool allow_incomplete_clones)
13801 {
13802 assert(head);
13803 if (allow_incomplete_clones) {
13804 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13805 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13806 } else {
13807 clog->info() << mode << " " << pgid << " " << head.get()
13808 << " " << missing << " missing clone(s)";
13809 }
13810 }
13811
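/* Advance *curclone past any clones we expected to encounter before
 * `target` (or past all clones if target is unset), counting each as
 * missing and, unless incomplete clones are allowed (cache tiers),
 * logging a scrub error for it.  Returns the number of missing
 * clones.
 */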
13812 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13813 const boost::optional<SnapSet> &snapset,
13814 LogChannelRef clog,
13815 const spg_t &pgid,
13816 const char *mode,
13817 bool allow_incomplete_clones,
13818 boost::optional<snapid_t> target,
13819 vector<snapid_t>::reverse_iterator *curclone,
13820 inconsistent_snapset_wrapper &e)
13821 {
13822 assert(head);
13823 assert(snapset);
13824 unsigned missing = 0;
13825
13826 // NOTE: clones are in descending order, hence the **curclone > *target test here
13827 hobject_t next_clone(head.get());
13828 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13829 ++missing;
13830 // it is okay to be missing one or more clones in a cache tier.
13831 // skip higher-numbered clones in the list.
13832 if (!allow_incomplete_clones) {
13833 next_clone.snap = **curclone;
13834 clog->error() << mode << " " << pgid << " " << head.get()
13835 << " expected clone " << next_clone << " " << missing
13836 << " missing";
13837 ++scrubber.shallow_errors;
13838 e.set_clone_missing(next_clone.snap);
13839 }
13840 // Clones are descending
13841 ++(*curclone);
13842 }
13843 return missing;
13844 }
13845
13846 /*
13847 * Validate consistency of the object info and snap sets.
13848 *
13849 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13850 * the comparison of the objects is against multiple snapset.clones. There are
13851 * multiple clone lists and in between lists we expect head or snapdir.
13852 *
13853 * Example
13854 *
13855 * objects expected
13856 * ======= =======
13857 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13858 * obj2 head head/snapdir, head ok
13859 * [SnapSet clones 6 4 2 1]
13860 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13861 * obj2 snap 6 obj2 snap 6, match
13862 * obj2 snap 4 obj2 snap 4, match
13863 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13864 * [Snapset clones 3 1]
13865 * obj3 snap 3 obj3 snap 3 match
13866 * obj3 snap 1 obj3 snap 1 match
13867 * obj4 snapdir head/snapdir, snapdir ok
13868 * [Snapset clones 4]
13869 * EOL obj4 snap 4, (expected)
13870 */
13871 void PrimaryLogPG::scrub_snapshot_metadata(
13872 ScrubMap &scrubmap,
13873 const map<hobject_t,
13874 pair<boost::optional<uint32_t>,
13875 boost::optional<uint32_t>>> &missing_digest)
13876 {
13877 dout(10) << __func__ << dendl;
13878
13879 coll_t c(info.pgid);
13880 bool repair = state_test(PG_STATE_REPAIR);
13881 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13882 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13883 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13884
13885 /// snapsets to repair
13886 map<hobject_t,SnapSet> snapset_to_repair;
13887
13888 // traverse in reverse order.
13889 boost::optional<hobject_t> head;
13890 boost::optional<SnapSet> snapset; // If snapset is initialized, head (above) is too
13891 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13892 unsigned missing = 0;
13893 inconsistent_snapset_wrapper soid_error, head_error;
13894 unsigned soid_error_count = 0;
13895
13896 bufferlist last_data;
13897
13898 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13899 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13900 const hobject_t& soid = p->first;
13901 soid_error = inconsistent_snapset_wrapper{soid};
13902 object_stat_sum_t stat;
13903 boost::optional<object_info_t> oi;
13904
13905 if (!soid.is_snapdir())
13906 stat.num_objects++;
13907
13908 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13909 stat.num_objects_hit_set_archive++;
13910
13911 if (soid.is_snap()) {
13912 // it's a clone
13913 stat.num_object_clones++;
13914 }
13915
13916 // basic checks.
13917 if (p->second.attrs.count(OI_ATTR) == 0) {
13918 oi = boost::none;
13919 osd->clog->error() << mode << " " << info.pgid << " " << soid
13920 << " no '" << OI_ATTR << "' attr";
13921 ++scrubber.shallow_errors;
13922 soid_error.set_info_missing();
13923 } else {
13924 bufferlist bv;
13925 bv.push_back(p->second.attrs[OI_ATTR]);
13926 try {
13927 oi = object_info_t(); // Initialize optional<> before decode into it
13928 oi.get().decode(bv);
13929 } catch (buffer::error& e) {
13930 oi = boost::none;
13931 osd->clog->error() << mode << " " << info.pgid << " " << soid
13932 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13933 ++scrubber.shallow_errors;
13934 soid_error.set_info_corrupted();
13935 soid_error.set_info_missing(); // Not available too
13936 }
13937 }
13938
13939 if (oi) {
13940 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13941 osd->clog->error() << mode << " " << info.pgid << " " << soid
13942 << " on disk size (" << p->second.size
13943 << ") does not match object info size ("
13944 << oi->size << ") adjusted for ondisk to ("
13945 << pgbackend->be_get_ondisk_size(oi->size)
13946 << ")";
13947 soid_error.set_size_mismatch();
13948 ++scrubber.shallow_errors;
13949 }
13950
13951 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13952
13953 // A clone num_bytes will be added later when we have snapset
13954 if (!soid.is_snap()) {
13955 stat.num_bytes += oi->size;
13956 }
13957 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13958 stat.num_bytes_hit_set_archive += oi->size;
13959
13960 if (!soid.is_snapdir()) {
13961 if (oi->is_dirty())
13962 ++stat.num_objects_dirty;
13963 if (oi->is_whiteout())
13964 ++stat.num_whiteouts;
13965 if (oi->is_omap())
13966 ++stat.num_objects_omap;
13967 if (oi->is_cache_pinned())
13968 ++stat.num_objects_pinned;
13969 }
13970 } else {
13971 // pessimistic assumption that this object might contain a
13972 // legacy SnapSet
13973 stat.num_legacy_snapsets++;
13974 }
13975
13976 // Check for any problems while processing clones
13977 if (doing_clones(snapset, curclone)) {
13978 boost::optional<snapid_t> target;
13979 // Expecting an object with snap for current head
13980 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13981
13982 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13983 << soid << " while processing " << head.get() << dendl;
13984
13985 target = all_clones;
13986 } else {
13987 assert(soid.is_snap());
13988 target = soid.snap;
13989 }
13990
13991 // Log any clones we were expecting to be there up to target
13992 // This will set missing, but will be a no-op if snap.soid == *curclone.
13993 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13994 pool.info.allow_incomplete_clones(), target, &curclone,
13995 head_error);
13996 }
13997 bool expected;
13998 // Check doing_clones() again in case we ran process_clones_to()
13999 if (doing_clones(snapset, curclone)) {
14000 // A head/snapdir would have processed all clones above
14001 // or all greater than *curclone.
14002 assert(soid.is_snap() && *curclone <= soid.snap);
14003
14004 // After processing above clone snap should match the expected curclone
14005 expected = (*curclone == soid.snap);
14006 } else {
14007 // If we aren't doing clones any longer, then expecting head/snapdir
14008 expected = soid.has_snapset();
14009 }
14010 if (!expected) {
14011 // If we couldn't read the head's snapset, just ignore clones
14012 if (head && !snapset) {
14013 osd->clog->error() << mode << " " << info.pgid << " " << soid
14014 << " clone ignored due to missing snapset";
14015 } else {
14016 osd->clog->error() << mode << " " << info.pgid << " " << soid
14017 << " is an unexpected clone";
14018 }
14019 ++scrubber.shallow_errors;
14020 soid_error.set_headless();
14021 scrubber.store->add_snap_error(pool.id, soid_error);
14022 ++soid_error_count;
14023 if (head && soid.get_head() == head->get_head())
14024 head_error.set_clone(soid.snap);
14025 continue;
14026 }
14027
14028 // new snapset?
14029 if (soid.has_snapset()) {
14030
14031 if (missing) {
14032 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
14033 pool.info.allow_incomplete_clones());
14034 }
14035
14036 // Save previous head error information
14037 if (head && (head_error.errors || soid_error_count))
14038 scrubber.store->add_snap_error(pool.id, head_error);
14039 // Set this as a new head object
14040 head = soid;
14041 missing = 0;
14042 head_error = soid_error;
14043 soid_error_count = 0;
14044
14045 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
14046
14047 if (p->second.attrs.count(SS_ATTR) == 0) {
14048 osd->clog->error() << mode << " " << info.pgid << " " << soid
14049 << " no '" << SS_ATTR << "' attr";
14050 ++scrubber.shallow_errors;
14051 snapset = boost::none;
14052 head_error.set_snapset_missing();
14053 } else {
14054 bufferlist bl;
14055 bl.push_back(p->second.attrs[SS_ATTR]);
14056 bufferlist::iterator blp = bl.begin();
14057 try {
14058 snapset = SnapSet(); // Initialize optional<> before decoding into it
14059 ::decode(snapset.get(), blp);
14060 head_error.ss_bl.push_back(p->second.attrs[SS_ATTR]);
14061 } catch (buffer::error& e) {
14062 snapset = boost::none;
14063 osd->clog->error() << mode << " " << info.pgid << " " << soid
14064 << " can't decode '" << SS_ATTR << "' attr " << e.what();
14065 ++scrubber.shallow_errors;
14066 head_error.set_snapset_corrupted();
14067 }
14068 }
14069
14070 if (snapset) {
14071 // what will be next?
14072 curclone = snapset->clones.rbegin();
14073
14074 if (!snapset->clones.empty()) {
14075 dout(20) << " snapset " << snapset.get() << dendl;
14076 if (snapset->seq == 0) {
14077 osd->clog->error() << mode << " " << info.pgid << " " << soid
14078 << " snaps.seq not set";
14079 ++scrubber.shallow_errors;
14080 head_error.set_snapset_error();
14081 }
14082 }
14083
14084 if (soid.is_head() && !snapset->head_exists) {
14085 osd->clog->error() << mode << " " << info.pgid << " " << soid
14086 << " snapset.head_exists=false, but head exists";
14087 ++scrubber.shallow_errors;
14088 head_error.set_head_mismatch();
14089 // Fix head_exists locally so is_legacy() returns correctly
14090 snapset->head_exists = true;
14091 }
14092 if (soid.is_snapdir() && snapset->head_exists) {
14093 osd->clog->error() << mode << " " << info.pgid << " " << soid
14094 << " snapset.head_exists=true, but snapdir exists";
14095 ++scrubber.shallow_errors;
14096 head_error.set_head_mismatch();
14097 // For symmetry fix this too, but probably doesn't matter
14098 snapset->head_exists = false;
14099 }
14100
14101 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
14102 if (soid.is_snapdir()) {
14103 dout(10) << " will move snapset to head from " << soid << dendl;
14104 snapset_to_repair[soid.get_head()] = *snapset;
14105 } else if (snapset->is_legacy()) {
14106 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
14107 << dendl;
14108 snapset_to_repair[soid.get_head()] = *snapset;
14109 }
14110 } else {
14111 stat.num_legacy_snapsets++;
14112 }
14113 } else {
14114 // pessimistic assumption that this object might contain a
14115 // legacy SnapSet
14116 stat.num_legacy_snapsets++;
14117 }
14118 } else {
14119 assert(soid.is_snap());
14120 assert(head);
14121 assert(snapset);
14122 assert(soid.snap == *curclone);
14123
14124 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
14125
14126 if (snapset->clone_size.count(soid.snap) == 0) {
14127 osd->clog->error() << mode << " " << info.pgid << " " << soid
14128 << " is missing in clone_size";
14129 ++scrubber.shallow_errors;
14130 soid_error.set_size_mismatch();
14131 } else {
14132 if (oi && oi->size != snapset->clone_size[soid.snap]) {
14133 osd->clog->error() << mode << " " << info.pgid << " " << soid
14134 << " size " << oi->size << " != clone_size "
14135 << snapset->clone_size[*curclone];
14136 ++scrubber.shallow_errors;
14137 soid_error.set_size_mismatch();
14138 }
14139
14140 if (snapset->clone_overlap.count(soid.snap) == 0) {
14141 osd->clog->error() << mode << " " << info.pgid << " " << soid
14142 << " is missing in clone_overlap";
14143 ++scrubber.shallow_errors;
14144 soid_error.set_size_mismatch();
14145 } else {
14146 // This checking is based on get_clone_bytes(). The first 2 asserts
14147 // can't happen because we know we have a clone_size and
14148 // a clone_overlap. Now we check that the interval_set won't
14149 // cause the last assert.
14150 uint64_t size = snapset->clone_size.find(soid.snap)->second;
14151 const interval_set<uint64_t> &overlap =
14152 snapset->clone_overlap.find(soid.snap)->second;
14153 bool bad_interval_set = false;
14154 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
14155 i != overlap.end(); ++i) {
14156 if (size < i.get_len()) {
14157 bad_interval_set = true;
14158 break;
14159 }
14160 size -= i.get_len();
14161 }
14162
14163 if (bad_interval_set) {
14164 osd->clog->error() << mode << " " << info.pgid << " " << soid
14165 << " bad interval_set in clone_overlap";
14166 ++scrubber.shallow_errors;
14167 soid_error.set_size_mismatch();
14168 } else {
14169 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
14170 }
14171 }
14172 }
14173
14174 // migrate legacy_snaps to snapset?
14175 auto p = snapset_to_repair.find(soid.get_head());
14176 if (p != snapset_to_repair.end()) {
14177 if (!oi || oi->legacy_snaps.empty()) {
14178 osd->clog->error() << mode << " " << info.pgid << " " << soid
14179 << " has no oi or legacy_snaps; cannot convert "
14180 << *snapset;
14181 ++scrubber.shallow_errors;
14182 } else {
14183 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
14184 << " to snapset " << p->second << dendl;
14185 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
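// e.g. a clone with snap id 4 whose legacy oi.snaps is [4,3] becomes
// clone_snaps[4] = [4,3] in the head's new-style SnapSet (hypothetical
// values for illustration).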
14186 }
14187 }
14188
14189 // advance to the next expected clone
14190 ++curclone;
14191 if (soid_error.errors) {
14192 scrubber.store->add_snap_error(pool.id, soid_error);
14193 ++soid_error_count;
14194 }
14195 }
14196
14197 scrub_cstat.add(stat);
14198 }
14199
14200 if (doing_clones(snapset, curclone)) {
14201 dout(10) << __func__ << " " << mode << " " << info.pgid
14202 << " No more objects while processing " << head.get() << dendl;
14203
14204 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14205 pool.info.allow_incomplete_clones(), all_clones, &curclone,
14206 head_error);
14207 }
14208 // Missing clones may have been found by the check above, or earlier,
14209 // before we dropped out of the loop for the last head.
14210 if (missing) {
14211 log_missing(missing, head, osd->clog, info.pgid, __func__,
14212 mode, pool.info.allow_incomplete_clones());
14213 }
14214 if (head && (head_error.errors || soid_error_count))
14215 scrubber.store->add_snap_error(pool.id, head_error);
14216
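// Scrub computed digests for objects whose stored object_info_t lacked
// them (collected in missing_digest during the scrub above); persist
// them via ordinary repops so replicas are updated too. Snapdir objects
// are skipped; their metadata is rewritten by the snapset conversion
// pass below.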
14217 for (auto p = missing_digest.begin(); p != missing_digest.end(); ++p) {
14218 if (p->first.is_snapdir())
14219 continue;
14220 dout(10) << __func__ << " recording digests for " << p->first << dendl;
14221 ObjectContextRef obc = get_object_context(p->first, false);
14222 if (!obc) {
14223 osd->clog->error() << info.pgid << " " << mode
14224 << " cannot get object context for object "
14225 << p->first;
14226 continue;
14227 } else if (obc->obs.oi.soid != p->first) {
14228 osd->clog->error() << info.pgid << " " << mode
14229 << " object " << p->first
14230 << " has a valid oi attr with a mismatched name, "
14231 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14232 continue;
14233 }
14234 OpContextUPtr ctx = simple_opc_create(obc);
14235 ctx->at_version = get_next_version();
14236 ctx->mtime = utime_t(); // do not update mtime
14237 if (p->second.first) {
14238 ctx->new_obs.oi.set_data_digest(*p->second.first);
14239 } else {
14240 ctx->new_obs.oi.clear_data_digest();
14241 }
14242 if (p->second.second) {
14243 ctx->new_obs.oi.set_omap_digest(*p->second.second);
14244 } else {
14245 ctx->new_obs.oi.clear_omap_digest();
14246 }
14247 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14248
14249 ctx->register_on_success(
14250 [this]() {
14251 dout(20) << "updating scrub digest" << dendl;
14252 if (--scrubber.num_digest_updates_pending == 0) {
14253 requeue_scrub();
14254 }
14255 });
14256
14257 simple_opc_submit(std::move(ctx));
14258 ++scrubber.num_digest_updates_pending;
14259 }
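// Each repop submitted above bumps num_digest_updates_pending; the
// register_on_success callback requeues the scrub only when the count
// drops back to zero, i.e. once every digest update has been applied.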
14260 for (auto& p : snapset_to_repair) {
14261 // cache pools may not have the clones, which means we won't know
14262 // what snaps they have. fake out the clone_snaps entries anyway (with
14263 // blank snap lists).
14264 p.second.head_exists = true;
14265 if (pool.info.allow_incomplete_clones()) {
14266 for (auto s : p.second.clones) {
14267 if (p.second.clone_snaps.count(s) == 0) {
14268 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14269 << s << dendl;
14270 p.second.clone_snaps[s];
14271 }
14272 }
14273 }
14274 if (p.second.clones.size() != p.second.clone_snaps.size() ||
14275 p.second.is_legacy()) {
14276 // this happens if we encounter other errors above, like a missing
14277 // or extra clone.
14278 dout(10) << __func__ << " not writing snapset to " << p.first
14279 << " snapset " << p.second << " clones " << p.second.clones
14280 << "; didn't convert fully" << dendl;
14281 scrub_cstat.sum.num_legacy_snapsets++;
14282 continue;
14283 }
14284 dout(10) << __func__ << " writing snapset to " << p.first
14285 << " " << p.second << dendl;
14286 ObjectContextRef obc = get_object_context(p.first, true);
14287 if (!obc) {
14288 osd->clog->error() << info.pgid << " " << mode
14289 << " cannot get object context for object "
14290 << p.first;
14291 continue;
14292 } else if (obc->obs.oi.soid != p.first) {
14293 osd->clog->error() << info.pgid << " " << mode
14294 << " object " << p.first
14295 << " has a valid oi attr with a mismatched name, "
14296 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14297 continue;
14298 }
14299 ObjectContextRef snapset_obc;
14300 if (!obc->obs.exists) {
14301 snapset_obc = get_object_context(p.first.get_snapdir(), false);
14302 if (!snapset_obc) {
14303 osd->clog->error() << info.pgid << " " << mode
14304 << " cannot get object context for "
14305 << p.first.get_snapdir();
14306 continue;
14307 }
14308 }
14309 OpContextUPtr ctx = simple_opc_create(obc);
14310 PGTransaction *t = ctx->op_t.get();
14311 ctx->snapset_obc = snapset_obc;
14312 ctx->at_version = get_next_version();
14313 ctx->mtime = utime_t(); // do not update mtime
14314 ctx->new_snapset = p.second;
14315 if (!ctx->new_obs.exists) {
14316 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
14317 ctx->new_obs.exists = true;
14318 ctx->new_snapset.head_exists = true;
14319 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14320 ++ctx->delta_stats.num_whiteouts;
14321 ++ctx->delta_stats.num_objects;
14322 t->create(p.first);
14323 if (p.first < scrubber.start) {
14324 dout(20) << __func__ << " kludging around update outside of scrub range"
14325 << dendl;
14326 } else {
14327 scrub_cstat.add(ctx->delta_stats);
14328 }
14329 }
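// The head created above exists only as a whiteout carrying the repaired
// SnapSet; FLAG_WHITEOUT keeps it logically nonexistent for ordinary
// client access (a summary of the flag's intent here, not new behavior).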
14330 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
14331 assert(!ctx->new_snapset.is_legacy());
14332 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14333 ctx->register_on_success(
14334 [this]() {
14335 dout(20) << "updating snapset" << dendl;
14336 if (--scrubber.num_digest_updates_pending == 0) {
14337 requeue_scrub();
14338 }
14339 });
14340
14341 simple_opc_submit(std::move(ctx));
14342 ++scrubber.num_digest_updates_pending;
14343 }
14344
14345 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14346 }
14347
14348 void PrimaryLogPG::_scrub_clear_state()
14349 {
14350 scrub_cstat = object_stat_collection_t();
14351 }
14352
14353 void PrimaryLogPG::_scrub_finish()
14354 {
14355 bool repair = state_test(PG_STATE_REPAIR);
14356 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14357 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14358
14359 if (info.stats.stats_invalid) {
14360 info.stats.stats = scrub_cstat;
14361 info.stats.stats_invalid = false;
14362
14363 if (agent_state)
14364 agent_choose_mode();
14365 }
14366
14367 dout(10) << mode << " got "
14368 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14369 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14370 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14371 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14372 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14373 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14374 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14375 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14376 << dendl;
14377
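// Compare what this scrub observed against the PG's accumulated stats.
// Categories whose stored values are flagged *_stats_invalid (e.g. after
// an upgrade from a release that predates the counter) are skipped to
// avoid reporting false mismatches.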
14378 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14379 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14380 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14381 !info.stats.dirty_stats_invalid) ||
14382 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14383 !info.stats.omap_stats_invalid) ||
14384 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14385 !info.stats.pin_stats_invalid) ||
14386 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14387 !info.stats.hitset_stats_invalid) ||
14388 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14389 !info.stats.hitset_bytes_stats_invalid) ||
14390 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14391 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14392 osd->clog->error() << info.pgid << " " << mode
14393 << " stat mismatch, got "
14394 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14395 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14396 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14397 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14398 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14399 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14400 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14401 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14402 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14403 ++scrubber.shallow_errors;
14404
14405 if (repair) {
14406 ++scrubber.fixed;
14407 info.stats.stats = scrub_cstat;
14408 info.stats.dirty_stats_invalid = false;
14409 info.stats.omap_stats_invalid = false;
14410 info.stats.hitset_stats_invalid = false;
14411 info.stats.hitset_bytes_stats_invalid = false;
14412 publish_stats_to_osd();
14413 share_pg_info();
14414 }
14415 } else if (scrub_cstat.sum.num_legacy_snapsets !=
14416 info.stats.stats.sum.num_legacy_snapsets) {
14417 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14418 << " from " << info.stats.stats.sum.num_legacy_snapsets
14419 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14420 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14421 publish_stats_to_osd();
14422 share_pg_info();
14423 }
14424 // Clear the object context cache so subsequent ops see repaired state, not stale cached contexts
14425 if (repair)
14426 object_contexts.clear();
14427 }
14428
14429 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14430 {
14431 return osd->check_osdmap_full(missing_on);
14432 }
14433
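// Invoked when a read on the primary hits an unrecoverable error such as
// EIO: mark the object missing so normal recovery can repair it from a
// replica, and park the op until the object becomes readable again.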
14434 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14435 {
14436 // Only supports replicated pools
14437 assert(!pool.info.require_rollback());
14438 assert(is_primary());
14439
14440 dout(10) << __func__ << " " << soid
14441 << " peers osd.{" << actingbackfill << "}" << dendl;
14442
14443 if (!is_clean()) {
14444 block_for_clean(soid, op);
14445 return -EAGAIN;
14446 }
14447
14448 assert(!pg_log.get_missing().is_missing(soid));
14449 bufferlist bv;
14450 object_info_t oi;
14451 eversion_t v;
14452 int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14453 if (r < 0) {
14454 // getattr failed; leave v default-constructed and try to repair without a version
14455 dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14456 << soid << " error=" << r << dendl;
14457 } else try {
14458 bufferlist::iterator bliter = bv.begin();
14459 ::decode(oi, bliter);
14460 v = oi.version;
14461 } catch (...) {
14462 // Leave v default-constructed. This will fail when sent to older OSDs,
14463 // but that is not much worse than failing here.
14464 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14465 }
14466
14467 missing_loc.add_missing(soid, v, eversion_t());
14468 if (primary_error(soid, v)) {
14469 dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14470 // XXX: If we knew that no down osd could contain this object,
14471 // it would be nice to return EIO here.
14472 // If a "never fail" flag were available, rbd could use it to
14473 // avoid returning EIO until the object is marked lost.
14474
14475 // Drop through to save this op in case an osd comes up with the object.
14476 }
14477
14478 // Restart the op after object becomes readable again
14479 waiting_for_unreadable_object[soid].push_back(op);
14480 op->mark_delayed("waiting for missing object");
14481
14482 if (!eio_errors_to_process) {
14483 eio_errors_to_process = true;
14484 assert(is_clean());
14485 queue_peering_event(
14486 CephPeeringEvtRef(
14487 std::make_shared<CephPeeringEvt>(
14488 get_osdmap()->get_epoch(),
14489 get_osdmap()->get_epoch(),
14490 DoRecovery())));
14491 } else {
14492 // A prior error must have already cleared clean state and queued recovery
14493 // or a map change has triggered re-peering.
14494 // Not inlining the recovery by calling maybe_kick_recovery(soid);
14495 dout(5) << __func__ << ": Read error on " << soid << ", but already seen errors" << dendl;
14496 }
14497
14498 return -EAGAIN;
14499 }
14500
14501 /*---SnapTrimmer Logging---*/
14502 #undef dout_prefix
14503 #define dout_prefix *_dout << pg->gen_prefix()
14504
14505 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14506 {
14507 ldout(pg->cct, 20) << "enter " << state_name << dendl;
14508 }
14509
14510 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14511 {
14512 ldout(pg->cct, 20) << "exit " << state_name << dendl;
14513 }
14514
14515 /*---SnapTrimmer states---*/
14516 #undef dout_prefix
14517 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
14518 << "SnapTrimmer state<" << get_state_name() << ">: ")
14519
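/*
 * Rough shape of the trim statechart as exercised by the reactions
 * below (states whose definitions live elsewhere in this file are
 * included for orientation):
 *
 *   NotTrimming --KickTrim--> Trimming/WaitReservation
 *                 (or WaitScrub while a scrub is active)
 *   WaitReservation --SnapTrimReserved--> AwaitAsyncWork
 *   AwaitAsyncWork --DoSnapWork--> WaitRepops   (work submitted)
 *                                | WaitRWLock   (clone write-locked)
 *                                | NotTrimming  (snap done, or error)
 */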
14520 /* NotTrimming */
14521 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14522 : my_base(ctx),
14523 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14524 {
14525 context< SnapTrimmer >().log_enter(state_name);
14526 }
14527
14528 void PrimaryLogPG::NotTrimming::exit()
14529 {
14530 context< SnapTrimmer >().log_exit(state_name, enter_time);
14531 }
14532
14533 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14534 {
14535 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14536 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14537
14538 if (!(pg->is_primary() && pg->is_active())) {
14539 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14540 return discard_event();
14541 }
14542 if (!pg->is_clean() ||
14543 pg->snap_trimq.empty()) {
14544 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14545 return discard_event();
14546 }
14547 if (pg->scrubber.active) {
14548 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14549 return transit< WaitScrub >();
14550 } else {
14551 return transit< Trimming >();
14552 }
14553 }
14554
14555 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14556 {
14557 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14558 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14559
14560 pending = nullptr;
14561 if (!context< SnapTrimmer >().can_trim()) {
14562 post_event(KickTrim());
14563 return transit< NotTrimming >();
14564 }
14565
14566 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14567 ldout(pg->cct, 10) << "NotTrimming: trimming "
14568 << pg->snap_trimq.range_start()
14569 << dendl;
14570 return transit< AwaitAsyncWork >();
14571 }
14572
14573 /* AwaitAsyncWork */
14574 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14575 : my_base(ctx),
14576 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14577 {
14578 auto *pg = context< SnapTrimmer >().pg;
14579 context< SnapTrimmer >().log_enter(state_name);
14580 pg->osd->queue_for_snap_trim(pg);
14581 pg->state_set(PG_STATE_SNAPTRIM);
14582 pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14583 pg->publish_stats_to_osd();
14584 }
14585
14586 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14587 {
14588 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14589 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14590 auto &in_flight = context<Trimming>().in_flight;
14591 assert(in_flight.empty());
14592
14593 assert(pg->is_primary() && pg->is_active());
14594 if (!context< SnapTrimmer >().can_trim()) {
14595 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14596 post_event(KickTrim());
14597 return transit< NotTrimming >();
14598 }
14599
14600 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14601
14602 vector<hobject_t> to_trim;
14603 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14604 to_trim.reserve(max);
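// osd_pg_max_concurrent_snap_trims bounds how many clones this PG trims
// per DoSnapWork pass, keeping each burst of trim repops small.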
14605 int r = pg->snap_mapper.get_next_objects_to_trim(
14606 snap_to_trim,
14607 max,
14608 &to_trim);
14609 if (r != 0 && r != -ENOENT) {
14610 lderr(pg->cct) << "get_next_objects_to_trim returned "
14611 << cpp_strerror(r) << dendl;
14612 assert(0 == "get_next_objects_to_trim returned an invalid code");
14613 } else if (r == -ENOENT) {
14614 // Done!
14615 ldout(pg->cct, 10) << "got ENOENT" << dendl;
14616
14617 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14618 << " to purged_snaps"
14619 << dendl;
14620 pg->info.purged_snaps.insert(snap_to_trim);
14621 pg->snap_trimq.erase(snap_to_trim);
14622 ldout(pg->cct, 10) << "purged_snaps now "
14623 << pg->info.purged_snaps << ", snap_trimq now "
14624 << pg->snap_trimq << dendl;
14625
14626 ObjectStore::Transaction t;
14627 pg->dirty_big_info = true;
14628 pg->write_if_dirty(t);
14629 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14630 assert(tr == 0);
14631
14632 pg->share_pg_info();
14633 post_event(KickTrim());
14634 return transit< NotTrimming >();
14635 }
14636 assert(!to_trim.empty());
14637
14638 for (auto &&object: to_trim) {
14639 // start trimming this clone
14640 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14641 OpContextUPtr ctx;
14642 int error = pg->trim_object(in_flight.empty(), object, &ctx);
14643 if (error) {
14644 if (error == -ENOLCK) {
14645 ldout(pg->cct, 10) << "could not get write lock on obj "
14646 << object << dendl;
14647 } else {
14648 pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14649 ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14650 }
14651 if (!in_flight.empty()) {
14652 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14653 return transit< WaitRepops >();
14654 }
14655 if (error == -ENOLCK) {
14656 ldout(pg->cct, 10) << "waiting for it to clear"
14657 << dendl;
14658 return transit< WaitRWLock >();
14659 } else {
14660 return transit< NotTrimming >();
14661 }
14662 }
14663
14664 in_flight.insert(object);
14665 ctx->register_on_success(
14666 [pg, object, &in_flight]() {
14667 assert(in_flight.find(object) != in_flight.end());
14668 in_flight.erase(object);
14669 if (in_flight.empty()) {
14670 if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14671 pg->snap_trimmer_machine.process_event(Reset());
14672 } else {
14673 pg->snap_trimmer_machine.process_event(RepopsComplete());
14674 }
14675 }
14676 });
14677
14678 pg->simple_opc_submit(std::move(ctx));
14679 }
14680
14681 return transit< WaitRepops >();
14682 }
14683
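// The *_maybe_cache helpers let erasure-coded (rollback) pools answer
// attr reads from obc->attr_cache rather than the backend; as the bodies
// below show, the write-side variants currently just delegate to the
// transaction.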
14684 void PrimaryLogPG::setattr_maybe_cache(
14685 ObjectContextRef obc,
14686 OpContext *op,
14687 PGTransaction *t,
14688 const string &key,
14689 bufferlist &val)
14690 {
14691 t->setattr(obc->obs.oi.soid, key, val);
14692 }
14693
14694 void PrimaryLogPG::setattrs_maybe_cache(
14695 ObjectContextRef obc,
14696 OpContext *op,
14697 PGTransaction *t,
14698 map<string, bufferlist> &attrs)
14699 {
14700 t->setattrs(obc->obs.oi.soid, attrs);
14701 }
14702
14703 void PrimaryLogPG::rmattr_maybe_cache(
14704 ObjectContextRef obc,
14705 OpContext *op,
14706 PGTransaction *t,
14707 const string &key)
14708 {
14709 t->rmattr(obc->obs.oi.soid, key);
14710 }
14711
14712 int PrimaryLogPG::getattr_maybe_cache(
14713 ObjectContextRef obc,
14714 const string &key,
14715 bufferlist *val)
14716 {
14717 if (pool.info.require_rollback()) {
14718 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14719 if (i != obc->attr_cache.end()) {
14720 if (val)
14721 *val = i->second;
14722 return 0;
14723 } else {
14724 return -ENODATA;
14725 }
14726 }
14727 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14728 }
14729
14730 int PrimaryLogPG::getattrs_maybe_cache(
14731 ObjectContextRef obc,
14732 map<string, bufferlist> *out)
14733 {
14734 int r = 0;
14735 assert(out);
14736 if (pool.info.require_rollback()) {
14737 *out = obc->attr_cache;
14738 } else {
14739 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14740 }
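// Keep only user xattrs, which are stored with a leading '_' on disk,
// and strip that prefix; the size check excludes OI_ATTR (a bare "_"),
// and SS_ATTR lacks the prefix, so internal attrs drop out.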
14741 map<string, bufferlist> tmp;
14742 for (map<string, bufferlist>::iterator i = out->begin();
14743 i != out->end();
14744 ++i) {
14745 if (i->first.size() > 1 && i->first[0] == '_')
14746 tmp[i->first.substr(1)].claim(i->second);
14747 }
14748 tmp.swap(*out);
14749 return r;
14750 }
14751
14752 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14753 return osd->check_failsafe_full(ss);
14754 }
14755
14756 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14757 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14758
14759 #ifdef PG_DEBUG_REFS
14760 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14761 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14762 #endif
14763
14764 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14765 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }