ceph/src/osd/PrimaryLogPG.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
   8  *
   9  * Author: Loic Dachary <loic@dachary.org>
  10  *
  11  * This is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License version 2.1, as published by the Free Software
  14  * Foundation.  See file COPYING.
  15  *
  16  */
  17
  18 #include "boost/tuple/tuple.hpp"
  19 #include "boost/intrusive_ptr.hpp"
  20 #include "PG.h"
  21 #include "PrimaryLogPG.h"
  22 #include "OSD.h"
  23 #include "OpRequest.h"
  24 #include "ScrubStore.h"
  25 #include "Session.h"
  26 #include "objclass/objclass.h"
  27
  28 #include "common/errno.h"
  29 #include "common/scrub_types.h"
  30 #include "common/perf_counters.h"
  31
  32 #include "messages/MOSDOp.h"
  33 #include "messages/MOSDBackoff.h"
  34 #include "messages/MOSDSubOp.h"
  35 #include "messages/MOSDSubOpReply.h"
  36 #include "messages/MOSDPGTrim.h"
  37 #include "messages/MOSDPGScan.h"
  38 #include "messages/MOSDRepScrub.h"
  39 #include "messages/MOSDPGBackfill.h"
  40 #include "messages/MOSDPGBackfillRemove.h"
  41 #include "messages/MOSDPGUpdateLogMissing.h"
  42 #include "messages/MOSDPGUpdateLogMissingReply.h"
  43 #include "messages/MCommandReply.h"
  44 #include "messages/MOSDScrubReserve.h"
  45 #include "mds/inode_backtrace.h" // Ugh
  46 #include "common/EventTrace.h"
  47
  48 #include "common/config.h"
  49 #include "include/compat.h"
  50 #include "mon/MonClient.h"
  51 #include "osdc/Objecter.h"
  52 #include "json_spirit/json_spirit_value.h"
  53 #include "json_spirit/json_spirit_reader.h"
  54 #include "include/assert.h"  // json_spirit clobbers it
  55 #include "include/rados/rados_types.hpp"
  56
  57 #ifdef WITH_LTTNG
  58 #include "tracing/osd.h"
  59 #else
  60 #define tracepoint(...)
  61 #endif
  62
  63 #define dout_context cct
  64 #define dout_subsys ceph_subsys_osd
  65 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
  66 #undef dout_prefix
  67 #define dout_prefix _prefix(_dout, this)
  68 template <typename T>
  69 static ostream& _prefix(std::ostream *_dout, T *pg) {
  70   return *_dout << pg->gen_prefix();
  71 }
  72
  73
  74 #include <sstream>
  75 #include <utility>
  76
  77 #include <errno.h>
  78
  79 MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
  80
  81 PGLSFilter::PGLSFilter() : cct(nullptr)
  82 {
  83 }
  84
  85 PGLSFilter::~PGLSFilter()
  86 {
  87 }
  88
  89 struct PrimaryLogPG::C_OSD_OnApplied : Context {
  90   PrimaryLogPGRef pg;
  91   epoch_t epoch;
  92   eversion_t v;
  93   C_OSD_OnApplied(
  94     PrimaryLogPGRef pg,
  95     epoch_t epoch,
  96     eversion_t v)
  97     : pg(pg), epoch(epoch), v(v) {}
  98   void finish(int) override {
  99     pg->lock();
 100     if (!pg->pg_has_reset_since(epoch))
 101       pg->op_applied(v);
 102     pg->unlock();
 103   }
 104 };
 105
 106 /**
 107  * The CopyCallback class defines an interface for completions to the
 108  * copy_start code. Users of the copy infrastructure must implement
 109  * one and give an instance of the class to start_copy.
 110  *
 111  * The implementer is responsible for making sure that the CopyCallback
 112  * can associate itself with the correct copy operation.
 113  */
 114 class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
 115 protected:
 116   CopyCallback() {}
 117   /**
 118    * results.get<0>() is the return code: 0 for success; -ECANCELED if
 119    * the operation was cancelled by the local OSD; -errno for other issues.
 120    * results.get<1>() is a pointer to a CopyResults object, which you are
 121    * responsible for deleting.
 122    */
 123   void finish(CopyCallbackResults results_) override = 0;
 124
 125 public:
 126   /// Provide the final size of the copied object to the CopyCallback
 127   ~CopyCallback() override {}
 128 };
 129
 130 template <typename T>
 131 class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
 132   PrimaryLogPGRef pg;
 133   unique_ptr<GenContext<T>> c;
 134   epoch_t e;
 135 public:
 136   BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
 137     : pg(pg), c(c), e(e) {}
 138   void finish(T t) override {
 139     pg->lock();
 140     if (pg->pg_has_reset_since(e))
 141       c.reset();
 142     else
 143       c.release()->complete(t);
 144     pg->unlock();
 145   }
 146 };
 147
 148 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
 149   GenContext<ThreadPool::TPHandle&> *c) {
 150   return new BlessedGenContext<ThreadPool::TPHandle&>(
 151     this, c, get_osdmap()->get_epoch());
 152 }
 153
 154 class PrimaryLogPG::BlessedContext : public Context {
 155   PrimaryLogPGRef pg;
 156   unique_ptr<Context> c;
 157   epoch_t e;
 158 public:
 159   BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
 160     : pg(pg), c(c), e(e) {}
 161   void finish(int r) override {
 162     pg->lock();
 163     if (pg->pg_has_reset_since(e))
 164       c.reset();
 165     else
 166       c.release()->complete(r);
 167     pg->unlock();
 168   }
 169 };
 170
 171
 172 Context *PrimaryLogPG::bless_context(Context *c) {
 173   return new BlessedContext(this, c, get_osdmap()->get_epoch());
 174 }
 175
 176 class PrimaryLogPG::C_PG_ObjectContext : public Context {
 177   PrimaryLogPGRef pg;
 178   ObjectContext *obc;
 179   public:
 180   C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
 181     pg(p), obc(o) {}
 182   void finish(int r) override {
 183     pg->object_context_destructor_callback(obc);
 184   }
 185 };
 186
 187 class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
 188   ObjectContextRef obc, obc2, obc3;
 189   public:
 190   C_OSD_OndiskWriteUnlock(
 191     ObjectContextRef o,
 192     ObjectContextRef o2 = ObjectContextRef(),
 193     ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
 194   void finish(int r) override {
 195     obc->ondisk_write_unlock();
 196     if (obc2)
 197       obc2->ondisk_write_unlock();
 198     if (obc3)
 199       obc3->ondisk_write_unlock();
 200   }
 201 };
 202
 203 struct OnReadComplete : public Context {
 204   PrimaryLogPG *pg;
 205   PrimaryLogPG::OpContext *opcontext;
 206   OnReadComplete(
 207     PrimaryLogPG *pg,
 208     PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
 209   void finish(int r) override {
 210     opcontext->finish_read(pg);
 211   }
 212   ~OnReadComplete() override {}
 213 };
 214
 215 class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
 216   PrimaryLogPGRef pg;
 217   ObjectContextRef obc;
 218   public:
 219   C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
 220     pg(p), obc(o) {}
 221   void finish(int r) override {
 222     pg->_applied_recovered_object(obc);
 223   }
 224 };
 225
 226 class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
 227   PrimaryLogPGRef pg;
 228   epoch_t epoch;
 229   eversion_t last_complete;
 230   public:
 231   C_OSD_CommittedPushedObject(
 232     PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
 233     pg(p), epoch(epoch), last_complete(lc) {
 234   }
 235   void finish(int r) override {
 236     pg->_committed_pushed_object(epoch, last_complete);
 237   }
 238 };
 239
 240 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
 241   PrimaryLogPGRef pg;
 242   public:
 243   explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
 244     pg(p) {}
 245   void finish(int r) override {
 246     pg->_applied_recovered_object_replica();
 247   }
 248 };
 249
 250 // OpContext
 251 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
 252 {
 253   inflightreads = 1;
 254   list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
 255             pair<bufferlist*, Context*> > > in;
 256   in.swap(pending_async_reads);
 257   pg->pgbackend->objects_read_async(
 258     obc->obs.oi.soid,
 259     in,
 260     new OnReadComplete(pg, this), pg->get_pool().fast_read);
 261 }
 262 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
 263 {
 264   assert(inflightreads > 0);
 265   --inflightreads;
 266   if (async_reads_complete()) {
 267     assert(pg->in_progress_async_reads.size());
 268     assert(pg->in_progress_async_reads.front().second == this);
 269     pg->in_progress_async_reads.pop_front();
 270
 271     // Restart the op context now that all reads have been
 272     // completed. Read failures will be handled by the op finisher
 273     pg->execute_ctx(this);
 274   }
 275 }
 276
 277 class CopyFromCallback : public PrimaryLogPG::CopyCallback {
 278 public:
 279   PrimaryLogPG::CopyResults *results = nullptr;
 280   PrimaryLogPG::OpContext *ctx;
 281   OSDOp &osd_op;
 282
 283   CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
 284     : ctx(ctx), osd_op(osd_op) {
 285   }
 286   ~CopyFromCallback() override {}
 287
 288   void finish(PrimaryLogPG::CopyCallbackResults results_) override {
 289     results = results_.get<1>();
 290     int r = results_.get<0>();
 291
 292     // for finish_copyfrom
 293     ctx->user_at_version = results->user_version;
 294
 295     if (r >= 0) {
 296       ctx->pg->execute_ctx(ctx);
 297     } else {
 298       if (r != -ECANCELED) { // on cancel just toss it out; client resends
 299         if (ctx->op)
 300           ctx->pg->osd->reply_op_error(ctx->op, r);
 301       } else if (results->should_requeue) {
 302         if (ctx->op)
 303           ctx->pg->requeue_op(ctx->op);
 304       }
 305       ctx->pg->close_op_ctx(ctx);
 306     }
 307   }
 308
 309   bool is_temp_obj_used() {
 310     return results->started_temp_obj;
 311   }
 312   uint64_t get_data_size() {
 313     return results->object_size;
 314   }
 315 };
 316
 317 struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
 318   CopyFromCallback *copy_from_callback;
 319
 320   CopyFromFinisher(CopyFromCallback *copy_from_callback)
 321     : copy_from_callback(copy_from_callback) {
 322   }
 323
 324   int execute() override {
 325     // instance will be destructed after this method completes
 326     copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
 327     return 0;
 328   }
 329 };
 330
 331 // ======================
 332 // PGBackend::Listener
 333
 334 void PrimaryLogPG::on_local_recover(
 335   const hobject_t &hoid,
 336   const ObjectRecoveryInfo &_recovery_info,
 337   ObjectContextRef obc,
 338   bool is_delete,
 339   ObjectStore::Transaction *t
 340   )
 341 {
 342   dout(10) << __func__ << ": " << hoid << dendl;
 343
 344   ObjectRecoveryInfo recovery_info(_recovery_info);
 345   clear_object_snap_mapping(t, hoid);
 346   if (!is_delete && recovery_info.soid.is_snap()) {
 347     OSDriver::OSTransaction _t(osdriver.get_transaction(t));
 348     set<snapid_t> snaps;
 349     dout(20) << " snapset " << recovery_info.ss
 350              << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
 351     if (recovery_info.ss.is_legacy() ||
 352         recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
 353       assert(recovery_info.oi.legacy_snaps.size());
 354       snaps.insert(recovery_info.oi.legacy_snaps.begin(),
 355                    recovery_info.oi.legacy_snaps.end());
 356     } else {
 357       auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
 358       assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
 359       snaps.insert(p->second.begin(), p->second.end());
 360     }
 361     dout(20) << " snaps " << snaps << dendl;
 362     snap_mapper.add_oid(
 363       recovery_info.soid,
 364       snaps,
 365       &_t);
 366   }
 367   if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
 368       pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
 369     assert(is_primary());
 370     const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
 371     if (latest->op == pg_log_entry_t::LOST_REVERT &&
 372         latest->reverting_to == recovery_info.version) {
 373       dout(10) << " got old revert version " << recovery_info.version
 374                << " for " << *latest << dendl;
 375       recovery_info.version = latest->version;
 376       // update the attr to the revert event version
 377       recovery_info.oi.prior_version = recovery_info.oi.version;
 378       recovery_info.oi.version = latest->version;
 379       bufferlist bl;
 380       ::encode(recovery_info.oi, bl,
 381                get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
 382       assert(!pool.info.require_rollback());
 383       t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
 384       if (obc)
 385         obc->attr_cache[OI_ATTR] = bl;
 386     }
 387   }
 388
 389   // keep track of active pushes for scrub
 390   ++active_pushes;
 391
 392   if (recovery_info.version > pg_log.get_can_rollback_to()) {
 393     /* This can only happen during a repair, and even then, it would
 394      * be one heck of a race.  If we are repairing the object, the
 395      * write in question must be fully committed, so it's not valid
 396      * to roll it back anyway (and we'll be rolled forward shortly
 397      * anyway) */
 398     PGLogEntryHandler h{this, t};
 399     pg_log.roll_forward_to(recovery_info.version, &h);
 400   }
 401   recover_got(recovery_info.soid, recovery_info.version);
 402
 403   if (is_primary()) {
 404     if (!is_delete) {
 405       obc->obs.exists = true;
 406       obc->ondisk_write_lock();
 407
 408       bool got = obc->get_recovery_read();
 409       assert(got);
 410
 411       assert(recovering.count(obc->obs.oi.soid));
 412       recovering[obc->obs.oi.soid] = obc;
 413       obc->obs.oi = recovery_info.oi;  // may have been updated above
 414       t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
 415     }
 416
 417     t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
 418
 419     publish_stats_to_osd();
 420     assert(missing_loc.needs_recovery(hoid));
 421     if (!is_delete)
 422       missing_loc.add_location(hoid, pg_whoami);
 423     release_backoffs(hoid);
 424     if (!is_unreadable_object(hoid)) {
 425       auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
 426       if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
 427         dout(20) << " kicking unreadable waiters on " << hoid << dendl;
 428         requeue_ops(unreadable_object_entry->second);
 429         waiting_for_unreadable_object.erase(unreadable_object_entry);
 430       }
 431     }
 432   } else {
 433     t->register_on_applied(
 434       new C_OSD_AppliedRecoveredObjectReplica(this));
 435
 436   }
 437
 438   t->register_on_commit(
 439     new C_OSD_CommittedPushedObject(
 440       this,
 441       get_osdmap()->get_epoch(),
 442       info.last_complete));
 443
 444   // update pg
 445   dirty_info = true;
 446   write_if_dirty(*t);
 447 }
 448
 449 void PrimaryLogPG::on_global_recover(
 450   const hobject_t &soid,
 451   const object_stat_sum_t &stat_diff,
 452   bool is_delete)
 453 {
 454   info.stats.stats.sum.add(stat_diff);
 455   missing_loc.recovered(soid);
 456   publish_stats_to_osd();
 457   dout(10) << "pushed " << soid << " to all replicas" << dendl;
 458   map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
 459   assert(i != recovering.end());
 460
 461   if (!is_delete) {
 462     // recover missing won't have had an obc, but it gets filled in
 463     // during on_local_recover
 464     assert(i->second);
 465     list<OpRequestRef> requeue_list;
 466     i->second->drop_recovery_read(&requeue_list);
 467     requeue_ops(requeue_list);
 468   }
 469
 470   backfills_in_flight.erase(soid);
 471
 472   recovering.erase(i);
 473   finish_recovery_op(soid);
 474   release_backoffs(soid);
 475   auto degraded_object_entry = waiting_for_degraded_object.find(soid);
 476   if (degraded_object_entry != waiting_for_degraded_object.end()) {
 477     dout(20) << " kicking degraded waiters on " << soid << dendl;
 478     requeue_ops(degraded_object_entry->second);
 479     waiting_for_degraded_object.erase(degraded_object_entry);
 480   }
 481   auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
 482   if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
 483     dout(20) << " kicking unreadable waiters on " << soid << dendl;
 484     requeue_ops(unreadable_object_entry->second);
 485     waiting_for_unreadable_object.erase(unreadable_object_entry);
 486   }
 487   finish_degraded_object(soid);
 488 }
 489
 490 void PrimaryLogPG::on_peer_recover(
 491   pg_shard_t peer,
 492   const hobject_t &soid,
 493   const ObjectRecoveryInfo &recovery_info)
 494 {
 495   publish_stats_to_osd();
 496   // done!
 497   peer_missing[peer].got(soid, recovery_info.version);
 498 }
 499
 500 void PrimaryLogPG::begin_peer_recover(
 501   pg_shard_t peer,
 502   const hobject_t soid)
 503 {
 504   peer_missing[peer].revise_have(soid, eversion_t());
 505 }
 506
 507 void PrimaryLogPG::schedule_recovery_work(
 508   GenContext<ThreadPool::TPHandle&> *c)
 509 {
 510   osd->recovery_gen_wq.queue(c);
 511 }
 512
 513 void PrimaryLogPG::send_message_osd_cluster(
 514   int peer, Message *m, epoch_t from_epoch)
 515 {
 516   osd->send_message_osd_cluster(peer, m, from_epoch);
 517 }
 518
 519 void PrimaryLogPG::send_message_osd_cluster(
 520   Message *m, Connection *con)
 521 {
 522   osd->send_message_osd_cluster(m, con);
 523 }
 524
 525 void PrimaryLogPG::send_message_osd_cluster(
 526   Message *m, const ConnectionRef& con)
 527 {
 528   osd->send_message_osd_cluster(m, con);
 529 }
 530
 531 void PrimaryLogPG::on_primary_error(
 532   const hobject_t &oid,
 533   eversion_t v)
 534 {
 535   dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
 536   primary_failed(oid);
 537   primary_error(oid, v);
 538   backfills_in_flight.erase(oid);
 539   missing_loc.add_missing(oid, v, eversion_t());
 540 }
 541
 542 ConnectionRef PrimaryLogPG::get_con_osd_cluster(
 543   int peer, epoch_t from_epoch)
 544 {
 545   return osd->get_con_osd_cluster(peer, from_epoch);
 546 }
 547
 548 PerfCounters *PrimaryLogPG::get_logger()
 549 {
 550   return osd->logger;
 551 }
 552
 553
 554 // ====================
 555 // missing objects
 556
 557 bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
 558 {
 559   return pg_log.get_missing().get_items().count(soid);
 560 }
 561
 562 void PrimaryLogPG::maybe_kick_recovery(
 563   const hobject_t &soid)
 564 {
 565   eversion_t v;
 566   if (!missing_loc.needs_recovery(soid, &v))
 567     return;
 568
 569   map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
 570   if (p != recovering.end()) {
 571     dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
 572   } else if (missing_loc.is_unfound(soid)) {
 573     dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
 574   } else {
 575     dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
 576     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
 577     if (is_missing_object(soid)) {
 578       recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
 579     } else if (missing_loc.is_deleted(soid)) {
 580       prep_object_replica_deletes(soid, v, h);
 581     } else {
 582       prep_object_replica_pushes(soid, v, h);
 583     }
 584     pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
 585   }
 586 }
 587
 588 void PrimaryLogPG::wait_for_unreadable_object(
 589   const hobject_t& soid, OpRequestRef op)
 590 {
 591   assert(is_unreadable_object(soid));
 592   maybe_kick_recovery(soid);
 593   waiting_for_unreadable_object[soid].push_back(op);
 594   op->mark_delayed("waiting for missing object");
 595 }
 596
 597 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
 598 {
 599   /* The conditions below may clear (on_local_recover, before we queue
 600    * the transaction) before we actually requeue the degraded waiters
 601    * in on_global_recover after the transaction completes.
 602    */
 603   if (waiting_for_degraded_object.count(soid))
 604     return true;
 605   if (pg_log.get_missing().get_items().count(soid))
 606     return true;
 607   assert(!actingbackfill.empty());
 608   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
 609        i != actingbackfill.end();
 610        ++i) {
 611     if (*i == get_primary()) continue;
 612     pg_shard_t peer = *i;
 613     auto peer_missing_entry = peer_missing.find(peer);
 614     if (peer_missing_entry != peer_missing.end() &&
 615         peer_missing_entry->second.get_items().count(soid))
 616       return true;
 617
 618     // Object is degraded if after last_backfill AND
 619     // we are backfilling it
 620     if (is_backfill_targets(peer) &&
 621         peer_info[peer].last_backfill <= soid &&
 622         last_backfill_started >= soid &&
 623         backfills_in_flight.count(soid))
 624       return true;
 625   }
 626   return false;
 627 }
 628
 629 void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
 630 {
 631   assert(is_degraded_or_backfilling_object(soid));
 632
 633   maybe_kick_recovery(soid);
 634   waiting_for_degraded_object[soid].push_back(op);
 635   op->mark_delayed("waiting for degraded object");
 636 }
 637
 638 void PrimaryLogPG::block_write_on_full_cache(
 639   const hobject_t& _oid, OpRequestRef op)
 640 {
 641   const hobject_t oid = _oid.get_head();
 642   dout(20) << __func__ << ": blocking object " << oid
 643            << " on full cache" << dendl;
 644   objects_blocked_on_cache_full.insert(oid);
 645   waiting_for_cache_not_full.push_back(op);
 646   op->mark_delayed("waiting for cache not full");
 647 }
 648
 649 void PrimaryLogPG::block_for_clean(
 650   const hobject_t& oid, OpRequestRef op)
 651 {
 652   dout(20) << __func__ << ": blocking object " << oid
 653            << " on primary repair" << dendl;
 654   waiting_for_clean_to_primary_repair.push_back(op);
 655   op->mark_delayed("waiting for clean to repair");
 656 }
 657
 658 void PrimaryLogPG::block_write_on_snap_rollback(
 659   const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
 660 {
 661   dout(20) << __func__ << ": blocking object " << oid.get_head()
 662            << " on snap promotion " << obc->obs.oi.soid << dendl;
 663   // otherwise, we'd have blocked in do_op
 664   assert(oid.is_head());
 665   assert(objects_blocked_on_snap_promotion.count(oid) == 0);
 666   objects_blocked_on_snap_promotion[oid] = obc;
 667   wait_for_blocked_object(obc->obs.oi.soid, op);
 668 }
 669
 670 void PrimaryLogPG::block_write_on_degraded_snap(
 671   const hobject_t& snap, OpRequestRef op)
 672 {
 673   dout(20) << __func__ << ": blocking object " << snap.get_head()
 674            << " on degraded snap " << snap << dendl;
 675   // otherwise, we'd have blocked in do_op
 676   assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
 677   objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
 678   wait_for_degraded_object(snap, op);
 679 }
 680
 681 bool PrimaryLogPG::maybe_await_blocked_snapset(
 682   const hobject_t &hoid,
 683   OpRequestRef op)
 684 {
 685   ObjectContextRef obc;
 686   obc = object_contexts.lookup(hoid.get_head());
 687   if (obc) {
 688     if (obc->is_blocked()) {
 689       wait_for_blocked_object(obc->obs.oi.soid, op);
 690       return true;
 691     } else {
 692       return false;
 693     }
 694   }
 695   obc = object_contexts.lookup(hoid.get_snapdir());
 696   if (obc) {
 697     if (obc->is_blocked()) {
 698       wait_for_blocked_object(obc->obs.oi.soid, op);
 699       return true;
 700     } else {
 701       return false;
 702     }
 703   }
 704   return false;
 705 }
 706
 707 void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
 708 {
 709   dout(10) << __func__ << " " << soid << " " << op << dendl;
 710   waiting_for_blocked_object[soid].push_back(op);
 711   op->mark_delayed("waiting for blocked object");
 712 }
 713
 714 void PrimaryLogPG::maybe_force_recovery()
 715 {
 716   // no force if not in degraded/recovery/backfill stats
 717   if (!is_degraded() &&
 718       !state_test(PG_STATE_RECOVERING |
 719                   PG_STATE_RECOVERY_WAIT |
 720                   PG_STATE_BACKFILL |
 721                   PG_STATE_BACKFILL_WAIT |
 722                   PG_STATE_BACKFILL_TOOFULL))
 723     return;
 724
 725   if (pg_log.get_log().approx_size() <
 726       cct->_conf->osd_max_pg_log_entries *
 727         cct->_conf->osd_force_recovery_pg_log_entries_factor)
 728     return;
 729
 730   // find the oldest missing object
 731   version_t min_version = 0;
 732   hobject_t soid;
 733   if (!pg_log.get_missing().get_items().empty()) {
 734     min_version = pg_log.get_missing().get_rmissing().begin()->first;
 735     soid = pg_log.get_missing().get_rmissing().begin()->second;
 736   }
 737   assert(!actingbackfill.empty());
 738   for (set<pg_shard_t>::iterator it = actingbackfill.begin();
 739        it != actingbackfill.end();
 740        ++it) {
 741     if (*it == get_primary()) continue;
 742     pg_shard_t peer = *it;
 743     if (peer_missing.count(peer) &&
 744         !peer_missing[peer].get_items().empty() &&
 745         min_version > peer_missing[peer].get_rmissing().begin()->first) {
 746       min_version = peer_missing[peer].get_rmissing().begin()->first;
 747       soid = peer_missing[peer].get_rmissing().begin()->second;
 748     }
 749   }
 750
 751   // recover it
 752   if (soid != hobject_t())
 753     maybe_kick_recovery(soid);
 754 }
 755
 756 class PGLSPlainFilter : public PGLSFilter {
 757   string val;
 758 public:
 759   int init(bufferlist::iterator &params) override
 760   {
 761     try {
 762       ::decode(xattr, params);
 763       ::decode(val, params);
 764     } catch (buffer::error &e) {
 765       return -EINVAL;
 766     }
 767
 768     return 0;
 769   }
 770   ~PGLSPlainFilter() override {}
 771   bool filter(const hobject_t &obj, bufferlist& xattr_data,
 772                       bufferlist& outdata) override;
 773 };
 774
 775 class PGLSParentFilter : public PGLSFilter {
 776   inodeno_t parent_ino;
 777 public:
 778   CephContext* cct;
 779   PGLSParentFilter(CephContext* cct) : cct(cct) {
 780     xattr = "_parent";
 781   }
 782   int init(bufferlist::iterator &params) override
 783   {
 784     try {
 785       ::decode(parent_ino, params);
 786     } catch (buffer::error &e) {
 787       return -EINVAL;
 788     }
 789     generic_dout(0) << "parent_ino=" << parent_ino << dendl;
 790
 791     return 0;
 792   }
 793   ~PGLSParentFilter() override {}
 794   bool filter(const hobject_t &obj, bufferlist& xattr_data,
 795                       bufferlist& outdata) override;
 796 };
 797
 798 bool PGLSParentFilter::filter(const hobject_t &obj,
 799                               bufferlist& xattr_data, bufferlist& outdata)
 800 {
 801   bufferlist::iterator iter = xattr_data.begin();
 802   inode_backtrace_t bt;
 803
 804   generic_dout(0) << "PGLSParentFilter::filter" << dendl;
 805
 806   ::decode(bt, iter);
 807
 808   vector<inode_backpointer_t>::iterator vi;
 809   for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
 810     generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
 811     if (vi->dirino == parent_ino) {
 812       ::encode(*vi, outdata);
 813       return true;
 814     }
 815   }
 816
 817   return false;
 818 }
 819
 820 bool PGLSPlainFilter::filter(const hobject_t &obj,
 821                              bufferlist& xattr_data, bufferlist& outdata)
 822 {
 823   if (val.size() != xattr_data.length())
 824     return false;
 825
 826   if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
 827     return false;
 828
 829   return true;
 830 }
 831
 832 bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
 833 {
 834   bufferlist bl;
 835
 836   // If filter has expressed an interest in an xattr, load it.
 837   if (!filter->get_xattr().empty()) {
 838     int ret = pgbackend->objects_get_attr(
 839       sobj,
 840       filter->get_xattr(),
 841       &bl);
 842     dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
 843     if (ret < 0) {
 844       if (ret != -ENODATA || filter->reject_empty_xattr()) {
 845         return false;
 846       }
 847     }
 848   }
 849
 850   return filter->filter(sobj, bl, outdata);
 851 }
 852
 853 int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
 854 {
 855   string type;
 856   PGLSFilter *filter;
 857
 858   try {
 859     ::decode(type, iter);
 860   }
 861   catch (buffer::error& e) {
 862     return -EINVAL;
 863   }
 864
 865   if (type.compare("parent") == 0) {
 866     filter = new PGLSParentFilter(cct);
 867   } else if (type.compare("plain") == 0) {
 868     filter = new PGLSPlainFilter();
 869   } else {
 870     std::size_t dot = type.find(".");
 871     if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
 872       return -EINVAL;
 873     }
 874
 875     const std::string class_name = type.substr(0, dot);
 876     const std::string filter_name = type.substr(dot + 1);
 877     ClassHandler::ClassData *cls = NULL;
 878     int r = osd->class_handler->open_class(class_name, &cls);
 879     if (r != 0) {
 880       derr << "Error opening class '" << class_name << "': "
 881            << cpp_strerror(r) << dendl;
 882       if (r != -EPERM) // propogate permission error
 883         r = -EINVAL;
 884       return r;
 885     } else {
 886       assert(cls);
 887     }
 888
 889     ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
 890     if (class_filter == NULL) {
 891       derr << "Error finding filter '" << filter_name << "' in class "
 892            << class_name << dendl;
 893       return -EINVAL;
 894     }
 895     filter = class_filter->fn();
 896     if (!filter) {
 897       // Object classes are obliged to return us something, but let's
 898       // give an error rather than asserting out.
 899       derr << "Buggy class " << class_name << " failed to construct "
 900               "filter " << filter_name << dendl;
 901       return -EINVAL;
 902     }
 903   }
 904
 905   assert(filter);
 906   int r = filter->init(iter);
 907   if (r < 0) {
 908     derr << "Error initializing filter " << type << ": "
 909          << cpp_strerror(r) << dendl;
 910     delete filter;
 911     return -EINVAL;
 912   } else {
 913     // Successfully constructed and initialized, return it.
 914     *pfilter = filter;
 915     return 0;
 916   }
 917 }
 918
 919
 920 // ==========================================================
 921
 922 int PrimaryLogPG::do_command(
 923   cmdmap_t cmdmap,
 924   ostream& ss,
 925   bufferlist& idata,
 926   bufferlist& odata,
 927   ConnectionRef con,
 928   ceph_tid_t tid)
 929 {
 930   const auto &missing = pg_log.get_missing();
 931   string prefix;
 932   string format;
 933
 934   cmd_getval(cct, cmdmap, "format", format);
 935   boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));
 936
 937   string command;
 938   cmd_getval(cct, cmdmap, "cmd", command);
 939   if (command == "query") {
 940     f->open_object_section("pg");
 941     f->dump_string("state", pg_state_string(get_state()));
 942     f->dump_stream("snap_trimq") << snap_trimq;
 943     f->dump_unsigned("epoch", get_osdmap()->get_epoch());
 944     f->open_array_section("up");
 945     for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
 946       f->dump_unsigned("osd", *p);
 947     f->close_section();
 948     f->open_array_section("acting");
 949     for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
 950       f->dump_unsigned("osd", *p);
 951     f->close_section();
 952     if (!backfill_targets.empty()) {
 953       f->open_array_section("backfill_targets");
 954       for (set<pg_shard_t>::iterator p = backfill_targets.begin();
 955            p != backfill_targets.end();
 956            ++p)
 957         f->dump_stream("shard") << *p;
 958       f->close_section();
 959     }
 960     if (!actingbackfill.empty()) {
 961       f->open_array_section("actingbackfill");
 962       for (set<pg_shard_t>::iterator p = actingbackfill.begin();
 963            p != actingbackfill.end();
 964            ++p)
 965         f->dump_stream("shard") << *p;
 966       f->close_section();
 967     }
 968     f->open_object_section("info");
 969     _update_calc_stats();
 970     info.dump(f.get());
 971     f->close_section();
 972
 973     f->open_array_section("peer_info");
 974     for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
 975          p != peer_info.end();
 976          ++p) {
 977       f->open_object_section("info");
 978       f->dump_stream("peer") << p->first;
 979       p->second.dump(f.get());
 980       f->close_section();
 981     }
 982     f->close_section();
 983
 984     f->open_array_section("recovery_state");
 985     handle_query_state(f.get());
 986     f->close_section();
 987
 988     f->open_object_section("agent_state");
 989     if (agent_state)
 990       agent_state->dump(f.get());
 991     f->close_section();
 992
 993     f->close_section();
 994     f->flush(odata);
 995     return 0;
 996   }
 997   else if (command == "mark_unfound_lost") {
 998     string mulcmd;
 999     cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
1000     int mode = -1;
1001     if (mulcmd == "revert") {
1002       if (pool.info.ec_pool()) {
1003         ss << "mode must be 'delete' for ec pool";
1004         return -EINVAL;
1005       }
1006       mode = pg_log_entry_t::LOST_REVERT;
1007     } else if (mulcmd == "delete") {
1008       mode = pg_log_entry_t::LOST_DELETE;
1009     } else {
1010       ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
1011       return -EINVAL;
1012     }
1013     assert(mode == pg_log_entry_t::LOST_REVERT ||
1014            mode == pg_log_entry_t::LOST_DELETE);
1015
1016     if (!is_primary()) {
1017       ss << "not primary";
1018       return -EROFS;
1019     }
1020
1021     uint64_t unfound = missing_loc.num_unfound();
1022     if (!unfound) {
1023       ss << "pg has no unfound objects";
1024       return 0;  // make command idempotent
1025     }
1026
1027     if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1028       ss << "pg has " << unfound
1029          << " unfound objects but we haven't probed all sources, not marking lost";
1030       return -EINVAL;
1031     }
1032
1033     mark_all_unfound_lost(mode, con, tid);
1034     return -EAGAIN;
1035   }
1036   else if (command == "list_missing") {
1037     hobject_t offset;
1038     string offset_json;
1039     if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
1040       json_spirit::Value v;
1041       try {
1042         if (!json_spirit::read(offset_json, v))
1043           throw std::runtime_error("bad json");
1044         offset.decode(v);
1045       } catch (std::runtime_error& e) {
1046         ss << "error parsing offset: " << e.what();
1047         return -EINVAL;
1048       }
1049     }
1050     f->open_object_section("missing");
1051     {
1052       f->open_object_section("offset");
1053       offset.dump(f.get());
1054       f->close_section();
1055     }
1056     f->dump_int("num_missing", missing.num_missing());
1057     f->dump_int("num_unfound", get_num_unfound());
1058     const map<hobject_t, pg_missing_item> &needs_recovery_map =
1059       missing_loc.get_needs_recovery();
1060     map<hobject_t, pg_missing_item>::const_iterator p =
1061       needs_recovery_map.upper_bound(offset);
1062     {
1063       f->open_array_section("objects");
1064       int32_t num = 0;
1065       for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
1066         if (missing_loc.is_unfound(p->first)) {
1067           f->open_object_section("object");
1068           {
1069             f->open_object_section("oid");
1070             p->first.dump(f.get());
1071             f->close_section();
1072           }
1073           p->second.dump(f.get()); // have, need keys
1074           {
1075             f->open_array_section("locations");
1076             for (set<pg_shard_t>::iterator r =
1077                 missing_loc.get_locations(p->first).begin();
1078                 r != missing_loc.get_locations(p->first).end();
1079                 ++r)
1080               f->dump_stream("shard") << *r;
1081             f->close_section();
1082           }
1083           f->close_section();
1084           num++;
1085         }
1086       }
1087       f->close_section();
1088     }
1089     f->dump_bool("more", p != needs_recovery_map.end());
1090     f->close_section();
1091     f->flush(odata);
1092     return 0;
1093   }
1094
1095   ss << "unknown pg command " << prefix;
1096   return -EINVAL;
1097 }
1098
1099 // ==========================================================
1100
/**
 * do_pg_op - execute the PG-level ops carried by an MOSDOp and reply
 *
 * Iterates over a copy of the message's op vector and dispatches on each
 * op code: PGNLS / PGLS object listings (optionally preceded by a filter
 * description), hit set listing and retrieval, and scrub error listing.
 * Any other op code yields -EINVAL.  Processing stops at the first op
 * that leaves `result` negative.  Afterwards a single MOSDOpReply
 * carrying the ops' outdata and the final `result` is sent on the
 * message's connection.
 *
 * The one path that does not reply is PG_HITSET_GET on an archive object
 * that is currently unreadable: the op is handed to
 * wait_for_unreadable_object() and the function returns.
 *
 * @param op  request wrapping a CEPH_MSG_OSD_OP message (asserted)
 */
void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  // filter is owned by this function: it is deleted before being
  // replaced and on every exit path below
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  // work on a copy of the ops; the copy's outdata is what the reply claims
  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      // indata starts with two strings, followed by whatever
      // get_pgls_filter() consumes, followed by the listing handle
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      // drop any filter left over from an earlier op in this message
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      // listing is only allowed at the head (no snapshot context)
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        // request maps to a different pg: log it and return nothing
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        // cap the client's requested count at osd_max_pgls
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        // the handle sent by the client is the position to resume from
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        // reject a handle outside [pg_start, pg_end); a max handle and a
        // default-constructed handle are exempt from the respective check
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        // NOTE(review): presumably flushes the sequencer so the listing
        // below sees previously queued transactions -- confirm
        osr->flush();
        // fills sentries with the listed objects and next with the
        // position at which the listing stopped
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        // Merge the listed objects with the pg log's missing set, both
        // walked in order from `current`; an exhausted side is
        // represented by the max hobject.
        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          // take the smaller of the two heads; when they are equal the
          // object is in both sequences and both iterators advance
          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
            << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          // reached the position where the backend listing stopped
          if (candidate >= next) {
            break;
          }

          // response is full: resume from this candidate next time
          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          // skip anything that is not a head object
          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
               candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
            << candidate.get_hash()
            << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
            << std::dec << " "
            << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        // nothing left in either sequence and the backend reported max:
        // result 1 marks the listing as having reached the end
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        // filter output, if any, follows the response in outdata
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      // same preamble as PGNLS_FILTER: two strings, then the filter data
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      // drop any filter left over from an earlier op in this message
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      // listing is only allowed at the head (no snapshot context)
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        // request maps to a different pg: log it and return nothing
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        // cap the client's requested count at osd_max_pgls
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        // the handle sent by the client is the position to resume from;
        // unlike PGNLS it is not checked against the PG bounds here
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        // NOTE(review): presumably flushes the sequencer so the listing
        // below sees previously queued transactions -- confirm
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        // always holds here: snapid != CEPH_NOSNAP was rejected above
        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        // Merge the listed objects with the pg log's missing set, both
        // walked in order from `current`; an exhausted side is
        // represented by the max hobject.
        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          // take the smaller of the two heads; when they are equal the
          // object is in both sequences and both iterators advance
          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          // reached the position where the backend listing stopped
          if (candidate >= next) {
            break;
          }

          // response is full: resume from this candidate next time
          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          // skip anything that is not a head object
          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        // nothing left in either sequence and the backend reported max:
        // result 1 marks the listing as having reached the end
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        // filter output, if any, follows the response in outdata
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        // return the (begin, end) interval of every archived hit set,
        // plus an entry with an empty end stamp for the current one
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result= -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          // on success the result is the encoded length
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          // find the archived interval that contains the requested stamp
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            // park the op on the object; no reply is sent from here, so
            // release the filter before leaving
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

   case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    // stop processing further ops after the first failure
    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}
1514
1515 int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
1516 {
1517   if (m->get_pg() != info.pgid.pgid) {
1518     dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1519     return -EINVAL; // hmm?
1520   }
1521   auto bp = osd_op->indata.begin();
1522   scrub_ls_arg_t arg;
1523   try {
1524     arg.decode(bp);
1525   } catch (buffer::error&) {
1526     dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1527     return -EINVAL;
1528   }
1529   int r = 0;
1530   scrub_ls_result_t result = {.interval = info.history.same_interval_since};
1531   if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1532     r = -EAGAIN;
1533   } else if (!scrubber.store) {
1534     r = -ENOENT;
1535   } else if (arg.get_snapsets) {
1536     result.vals = scrubber.store->get_snap_errors(osd->store,
1537                                                   get_pgid().pool(),
1538                                                   arg.start_after,
1539                                                   arg.max_return);
1540   } else {
1541     result.vals = scrubber.store->get_object_errors(osd->store,
1542                                                     get_pgid().pool(),
1543                                                     arg.start_after,
1544                                                     arg.max_return);
1545   }
1546   ::encode(result, osd_op->outdata);
1547   return r;
1548 }
1549
1550 void PrimaryLogPG::calc_trim_to()
1551 {
1552   size_t target = cct->_conf->osd_min_pg_log_entries;
1553   if (is_degraded() ||
1554       state_test(PG_STATE_RECOVERING |
1555                  PG_STATE_RECOVERY_WAIT |
1556                  PG_STATE_BACKFILL |
1557                  PG_STATE_BACKFILL_WAIT |
1558                  PG_STATE_BACKFILL_TOOFULL)) {
1559     target = cct->_conf->osd_max_pg_log_entries;
1560   }
1561
1562   eversion_t limit = MIN(
1563     min_last_complete_ondisk,
1564     pg_log.get_can_rollback_to());
1565   if (limit != eversion_t() &&
1566       limit != pg_trim_to &&
1567       pg_log.get_log().approx_size() > target) {
1568     size_t num_to_trim = pg_log.get_log().approx_size() - target;
1569     if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
1570       return;
1571     }
1572     list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
1573     eversion_t new_trim_to;
1574     for (size_t i = 0; i < num_to_trim; ++i) {
1575       new_trim_to = it->version;
1576       ++it;
1577       if (new_trim_to > limit) {
1578         new_trim_to = limit;
1579         dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
1580         break;
1581       }
1582     }
1583     dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
1584     pg_trim_to = new_trim_to;
1585     assert(pg_trim_to <= pg_log.get_head());
1586     assert(pg_trim_to <= min_last_complete_ondisk);
1587   }
1588 }
1589
/**
 * Construct a PrimaryLogPG.
 *
 * Forwards all four arguments to the PG base class, builds the PG
 * backend via PGBackend::build_pg_backend() from the pool info, map and
 * collection, and sizes the object context cache from
 * osd_pg_object_context_cache_count.
 *
 * @param o       owning OSD service; supplies the object store and config
 * @param curmap  current OSD map
 * @param _pool   pool this PG belongs to
 * @param p       id of this PG (also used to name its collection)
 */
PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  // missing_loc takes its readable/recoverable predicates from the
  // backend that was just built
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  // start the snap trimmer state machine
  snap_trimmer_machine.initiate();
}
1607
1608 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1609 {
1610   src_oloc = oloc;
1611   if (oloc.key.empty())
1612     src_oloc.key = oid.name;
1613 }
1614
1615 void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1616 {
1617   const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
1618   SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
1619   if (!session)
1620     return;  // drop it.
1621   session->put();  // get_priv takes a ref, and so does the SessionRef
1622   hobject_t begin = info.pgid.pgid.get_hobj_start();
1623   hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1624   if (begin < m->begin) {
1625     begin = m->begin;
1626   }
1627   if (end > m->end) {
1628     end = m->end;
1629   }
1630   dout(10) << __func__ << " backoff ack id " << m->id
1631            << " [" << begin << "," << end << ")" << dendl;
1632   session->ack_backoff(cct, m->pgid, m->id, begin, end);
1633 }
1634
/**
 * do_request - entry point for a request queued to this PG
 *
 * The request passes through a series of gates, in this order, before
 * it is dispatched on its message type:
 *
 *  1. map ordering: if requests from the same source are already parked
 *     in waiting_for_map, or this PG's map is older than op->min_epoch,
 *     the request is parked there too (a map update is requested in the
 *     latter case);
 *  2. can_discard_request(): discardable requests are dropped;
 *  3. backoffs (only for connections with CEPH_FEATURE_RADOS_BACKOFF):
 *     an OSD_OP may be dropped by an existing backoff or trigger a new
 *     pg-wide one; a BACKOFF message with begin != end is acked here;
 *  4. flushes in progress or PG not peered: the request waits in
 *     waiting_for_peered, unless the backend can handle it while
 *     inactive;
 *  5. pgbackend->handle_message(): the backend gets first refusal.
 *
 * Whatever remains is dispatched by message type; an unknown type is a
 * fatal assertion.
 *
 * @param op      the request
 * @param handle  thread pool handle, passed on to do_scan() and
 *                replica_scrub()
 */
void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    // ask the OSD for the map epoch this request needs
    osd->request_osdmap_update(op->min_epoch);
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      // drop the op if the session's backoff check says so
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      // back the client off while the PG is down, incomplete, or peered
      // but not active; optionally also while it is still peering
      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      // a backoff with begin == end is not handled here; it continues
      // to the type dispatch at the bottom of this function
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for active on " << op << dendl;
    waiting_for_peered.push_back(op);
    op->mark_delayed("waiting for peered");
    return;
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  assert(is_peered() && flushes_in_progress == 0);
  // the backend consumes the messages it recognises
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    // both types require an active PG
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      // dispatch on the reservation message's sub-type
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}
1817
1818 hobject_t PrimaryLogPG::earliest_backfill() const
1819 {
1820   hobject_t e = hobject_t::get_max();
1821   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
1822        i != backfill_targets.end();
1823        ++i) {
1824     pg_shard_t bt = *i;
1825     map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
1826     assert(iter != peer_info.end());
1827     if (iter->second.last_backfill < e)
1828       e = iter->second.last_backfill;
1829   }
1830   return e;
1831 }
1832
1833 /** do_op - do an op
1834  * pg lock will be held (if multithreaded)
1835  * osd_lock NOT held.
1836  */
1837 void PrimaryLogPG::do_op(OpRequestRef& op)
1838 {
1839   FUNCTRACE();
1840   // NOTE: take a non-const pointer here; we must be careful not to
1841   // change anything that will break other reads on m (operator<<).
1842   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1843   assert(m->get_type() == CEPH_MSG_OSD_OP);
1844   if (m->finish_decode()) {
1845     op->reset_desc();   // for TrackedOp
1846     m->clear_payload();
1847   }
1848
1849   dout(20) << __func__ << ": op " << *m << dendl;
1850
1851   hobject_t head = m->get_hobj();
1852   head.snap = CEPH_NOSNAP;
1853
1854   if (!info.pgid.pgid.contains(
1855         info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1856     derr << __func__ << " " << info.pgid.pgid << " does not contain "
1857          << head << " pg_num " << pool.info.get_pg_num() << " hash "
1858          << std::hex << head.get_hash() << std::dec << dendl;
1859     osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1860                       << " op " << *m;
1861     assert(!cct->_conf->osd_debug_misdirected_ops);
1862     return;
1863   }
1864
1865   bool can_backoff =
1866     m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1867   SessionRef session;
1868   if (can_backoff) {
1869     session = static_cast<Session*>(m->get_connection()->get_priv());
1870     if (!session.get()) {
1871       dout(10) << __func__ << " no session" << dendl;
1872       return;
1873     }
1874     session->put();  // get_priv() takes a ref, and so does the intrusive_ptr
1875
1876     if (session->check_backoff(cct, info.pgid, head, m)) {
1877       return;
1878     }
1879   }
1880
1881   if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1882     // not implemented.
1883     dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1884     osd->reply_op_error(op, -EINVAL);
1885     return;
1886   }
1887
1888   if (op->rmw_flags == 0) {
1889     int r = osd->osd->init_op_flags(op);
1890     if (r) {
1891       osd->reply_op_error(op, r);
1892       return;
1893     }
1894   }
1895
1896   if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1897                          CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1898       op->may_read() &&
1899       !(op->may_write() || op->may_cache())) {
1900     // balanced reads; any replica will do
1901     if (!(is_primary() || is_replica())) {
1902       osd->handle_misdirected_op(this, op);
1903       return;
1904     }
1905   } else {
1906     // normal case; must be primary
1907     if (!is_primary()) {
1908       osd->handle_misdirected_op(this, op);
1909       return;
1910     }
1911   }
1912
1913   if (!op_has_sufficient_caps(op)) {
1914     osd->reply_op_error(op, -EPERM);
1915     return;
1916   }
1917
1918   if (op->includes_pg_op()) {
1919     return do_pg_op(op);
1920   }
1921
1922   // object name too long?
1923   if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1924     dout(4) << "do_op name is longer than "
1925             << cct->_conf->osd_max_object_name_len
1926             << " bytes" << dendl;
1927     osd->reply_op_error(op, -ENAMETOOLONG);
1928     return;
1929   }
1930   if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1931     dout(4) << "do_op locator is longer than "
1932             << cct->_conf->osd_max_object_name_len
1933             << " bytes" << dendl;
1934     osd->reply_op_error(op, -ENAMETOOLONG);
1935     return;
1936   }
1937   if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1938     dout(4) << "do_op namespace is longer than "
1939             << cct->_conf->osd_max_object_namespace_len
1940             << " bytes" << dendl;
1941     osd->reply_op_error(op, -ENAMETOOLONG);
1942     return;
1943   }
1944
1945   if (int r = osd->store->validate_hobject_key(head)) {
1946     dout(4) << "do_op object " << head << " invalid for backing store: "
1947             << r << dendl;
1948     osd->reply_op_error(op, r);
1949     return;
1950   }
1951
1952   // blacklisted?
1953   if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1954     dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1955     osd->reply_op_error(op, -EBLACKLISTED);
1956     return;
1957   }
1958
1959   // order this op as a write?
1960   bool write_ordered = op->rwordered();
1961
1962   // discard due to cluster full transition?  (we discard any op that
1963   // originates before the cluster or pool is marked full; the client
1964   // will resend after the full flag is removed or if they expect the
1965   // op to succeed despite being full).  The except is FULL_FORCE and
1966   // FULL_TRY ops, which there is no reason to discard because they
1967   // bypass all full checks anyway.  If this op isn't write or
1968   // read-ordered, we skip.
1969   // FIXME: we exclude mds writes for now.
1970   if (write_ordered && !(m->get_source().is_mds() ||
1971                          m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1972                          m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1973       info.history.last_epoch_marked_full > m->get_map_epoch()) {
1974     dout(10) << __func__ << " discarding op sent before full " << m << " "
1975              << *m << dendl;
1976     return;
1977   }
1978   // mds should have stopped writing before this point.
1979   // We can't allow OSD to become non-startable even if mds
1980   // could be writing as part of file removals.
1981   ostringstream ss;
1982   if (write_ordered && osd->check_failsafe_full(ss)) {
1983     dout(10) << __func__ << " fail-safe full check failed, dropping request"
1984              << ss.str()
1985              << dendl;
1986     return;
1987   }
1988   int64_t poolid = get_pgid().pool();
1989   if (op->may_write()) {
1990
1991     const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1992     if (!pi) {
1993       return;
1994     }
1995
1996     // invalid?
1997     if (m->get_snapid() != CEPH_NOSNAP) {
1998       dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1999       osd->reply_op_error(op, -EINVAL);
2000       return;
2001     }
2002
2003     // too big?
2004     if (cct->_conf->osd_max_write_size &&
2005         m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2006       // journal can't hold commit!
2007       derr << "do_op msg data len " << m->get_data_len()
2008            << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2009            << " on " << *m << dendl;
2010       osd->reply_op_error(op, -OSD_WRITETOOBIG);
2011       return;
2012     }
2013   }
2014
2015   dout(10) << "do_op " << *m
2016            << (op->may_write() ? " may_write" : "")
2017            << (op->may_read() ? " may_read" : "")
2018            << (op->may_cache() ? " may_cache" : "")
2019            << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2020            << " flags " << ceph_osd_flag_string(m->get_flags())
2021            << dendl;
2022
2023   // missing object?
2024   if (is_unreadable_object(head)) {
2025     if (!is_primary()) {
2026       osd->reply_op_error(op, -EAGAIN);
2027       return;
2028     }
2029     if (can_backoff &&
2030         (g_conf->osd_backoff_on_degraded ||
2031          (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2032       add_backoff(session, head, head);
2033       maybe_kick_recovery(head);
2034     } else {
2035       wait_for_unreadable_object(head, op);
2036     }
2037     return;
2038   }
2039
2040   // degraded object?
2041   if (write_ordered && is_degraded_or_backfilling_object(head)) {
2042     if (can_backoff && g_conf->osd_backoff_on_degraded) {
2043       add_backoff(session, head, head);
2044     } else {
2045       wait_for_degraded_object(head, op);
2046     }
2047     return;
2048   }
2049
2050   if (write_ordered &&
2051       scrubber.write_blocked_by_scrub(head)) {
2052     dout(20) << __func__ << ": waiting for scrub" << dendl;
2053     waiting_for_scrub.push_back(op);
2054     op->mark_delayed("waiting for scrub");
2055     return;
2056   }
2057
2058   // blocked on snap?
2059   map<hobject_t, snapid_t>::iterator blocked_iter =
2060     objects_blocked_on_degraded_snap.find(head);
2061   if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2062     hobject_t to_wait_on(head);
2063     to_wait_on.snap = blocked_iter->second;
2064     wait_for_degraded_object(to_wait_on, op);
2065     return;
2066   }
2067   map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2068     objects_blocked_on_snap_promotion.find(head);
2069   if (write_ordered &&
2070       blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2071     wait_for_blocked_object(
2072       blocked_snap_promote_iter->second->obs.oi.soid,
2073       op);
2074     return;
2075   }
2076   if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2077     block_write_on_full_cache(head, op);
2078     return;
2079   }
2080
2081   // missing snapdir?
2082   hobject_t snapdir = head.get_snapdir();
2083
2084   if (is_unreadable_object(snapdir)) {
2085     wait_for_unreadable_object(snapdir, op);
2086     return;
2087   }
2088
2089   // degraded object?
2090   if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2091     wait_for_degraded_object(snapdir, op);
2092     return;
2093   }
2094
2095   // dup/resent?
2096   if (op->may_write() || op->may_cache()) {
2097     // warning: we will get back *a* request for this reqid, but not
2098     // necessarily the most recent.  this happens with flush and
2099     // promote ops, but we can't possible have both in our log where
2100     // the original request is still not stable on disk, so for our
2101     // purposes here it doesn't matter which one we get.
2102     eversion_t version;
2103     version_t user_version;
2104     int return_code = 0;
2105     bool got = check_in_progress_op(
2106       m->get_reqid(), &version, &user_version, &return_code);
2107     if (got) {
2108       dout(3) << __func__ << " dup " << m->get_reqid()
2109               << " version " << version << dendl;
2110       if (already_complete(version)) {
2111         osd->reply_op_error(op, return_code, version, user_version);
2112       } else {
2113         dout(10) << " waiting for " << version << " to commit" << dendl;
2114         // always queue ondisk waiters, so that we can requeue if needed
2115         waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2116         op->mark_delayed("waiting for ondisk");
2117       }
2118       return;
2119     }
2120   }
2121
2122   ObjectContextRef obc;
2123   bool can_create = op->may_write() || op->may_cache();
2124   hobject_t missing_oid;
2125   const hobject_t& oid = m->get_hobj();
2126
2127   // io blocked on obc?
2128   if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2129       maybe_await_blocked_snapset(oid, op)) {
2130     return;
2131   }
2132
2133   int r = find_object_context(
2134     oid, &obc, can_create,
2135     m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2136     &missing_oid);
2137
2138   if (r == -EAGAIN) {
2139     // If we're not the primary of this OSD, we just return -EAGAIN. Otherwise,
2140     // we have to wait for the object.
2141     if (is_primary()) {
2142       // missing the specific snap we need; requeue and wait.
2143       assert(!op->may_write()); // only happens on a read/cache
2144       wait_for_unreadable_object(missing_oid, op);
2145       return;
2146     }
2147   } else if (r == 0) {
2148     if (is_unreadable_object(obc->obs.oi.soid)) {
2149       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2150                << " is unreadable, waiting" << dendl;
2151       wait_for_unreadable_object(obc->obs.oi.soid, op);
2152       return;
2153     }
2154
2155     // degraded object?  (the check above was for head; this could be a clone)
2156     if (write_ordered &&
2157         obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2158         is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2159       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2160                << " is degraded, waiting" << dendl;
2161       wait_for_degraded_object(obc->obs.oi.soid, op);
2162       return;
2163     }
2164   }
2165
2166   bool in_hit_set = false;
2167   if (hit_set) {
2168     if (obc.get()) {
2169       if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2170         in_hit_set = true;
2171     } else {
2172       if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2173         in_hit_set = true;
2174     }
2175     if (!op->hitset_inserted) {
2176       hit_set->insert(oid);
2177       op->hitset_inserted = true;
2178       if (hit_set->is_full() ||
2179           hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2180         hit_set_persist();
2181       }
2182     }
2183   }
2184
2185   if (agent_state) {
2186     if (agent_choose_mode(false, op))
2187       return;
2188   }
2189
2190   if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2191     if (maybe_handle_manifest(op,
2192                                write_ordered,
2193                                obc))
2194     return;
2195   }
2196
2197   if (maybe_handle_cache(op,
2198                          write_ordered,
2199                          obc,
2200                          r,
2201                          missing_oid,
2202                          false,
2203                          in_hit_set))
2204     return;
2205
2206   if (r && (r != -ENOENT || !obc)) {
2207     // copy the reqids for copy get on ENOENT
2208     if (r == -ENOENT &&
2209         (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2210       fill_in_copy_get_noent(op, oid, m->ops[0]);
2211       return;
2212     }
2213     dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2214     if (op->may_write() &&
2215         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2216       record_write_error(op, oid, nullptr, r);
2217     } else {
2218       osd->reply_op_error(op, r);
2219     }
2220     return;
2221   }
2222
2223   // make sure locator is consistent
2224   object_locator_t oloc(obc->obs.oi.soid);
2225   if (m->get_object_locator() != oloc) {
2226     dout(10) << " provided locator " << m->get_object_locator()
2227              << " != object's " << obc->obs.oi.soid << dendl;
2228     osd->clog->warn() << "bad locator " << m->get_object_locator()
2229                      << " on object " << oloc
2230                       << " op " << *m;
2231   }
2232
2233   // io blocked on obc?
2234   if (obc->is_blocked() &&
2235       !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2236     wait_for_blocked_object(obc->obs.oi.soid, op);
2237     return;
2238   }
2239
2240   dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2241
2242   for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2243     OSDOp& osd_op = *p;
2244
2245     // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2246     if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2247         m->get_snapid() != CEPH_SNAPDIR) {
2248       dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2249       osd->reply_op_error(op, -EINVAL);
2250       return;
2251     }
2252   }
2253
2254   OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2255
2256   if (!obc->obs.exists)
2257     ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2258
2259   /* Due to obc caching, we might have a cached non-existent snapset_obc
2260    * for the snapdir.  If so, we can ignore it.  Subsequent parts of the
2261    * do_op pipeline make decisions based on whether snapset_obc is
2262    * populated.
2263    */
2264   if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2265     ctx->snapset_obc = ObjectContextRef();
2266
2267   if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2268     dout(20) << __func__ << ": skipping rw locks" << dendl;
2269   } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2270     dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2271
2272     // verify there is in fact a flush in progress
2273     // FIXME: we could make this a stronger test.
2274     map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2275     if (p == flush_ops.end()) {
2276       dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2277       reply_ctx(ctx, -EINVAL);
2278       return;
2279     }
2280   } else if (!get_rw_locks(write_ordered, ctx)) {
2281     dout(20) << __func__ << " waiting for rw locks " << dendl;
2282     op->mark_delayed("waiting for rw locks");
2283     close_op_ctx(ctx);
2284     return;
2285   }
2286   dout(20) << __func__ << " obc " << *obc << dendl;
2287
2288   if (r) {
2289     dout(20) << __func__ << " returned an error: " << r << dendl;
2290     close_op_ctx(ctx);
2291     if (op->may_write() &&
2292         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2293       record_write_error(op, oid, nullptr, r);
2294     } else {
2295       osd->reply_op_error(op, r);
2296     }
2297     return;
2298   }
2299
2300   if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2301     ctx->ignore_cache = true;
2302   }
2303
2304   if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2305     // This object is lost. Reading from it returns an error.
2306     dout(20) << __func__ << ": object " << obc->obs.oi.soid
2307              << " is lost" << dendl;
2308     reply_ctx(ctx, -ENFILE);
2309     return;
2310   }
2311   if (!op->may_write() &&
2312       !op->may_cache() &&
2313       (!obc->obs.exists ||
2314        ((m->get_snapid() != CEPH_SNAPDIR) &&
2315         obc->obs.oi.is_whiteout()))) {
2316     // copy the reqids for copy get on ENOENT
2317     if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2318       fill_in_copy_get_noent(op, oid, m->ops[0]);
2319       close_op_ctx(ctx);
2320       return;
2321     }
2322     reply_ctx(ctx, -ENOENT);
2323     return;
2324   }
2325
2326   op->mark_started();
2327
2328   execute_ctx(ctx);
2329   utime_t prepare_latency = ceph_clock_now();
2330   prepare_latency -= op->get_dequeued_time();
2331   osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2332   if (op->may_read() && op->may_write()) {
2333     osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2334   } else if (op->may_read()) {
2335     osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2336   } else if (op->may_write() || op->may_cache()) {
2337     osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2338   }
2339
2340   // force recovery of the oldest missing object if too many logs
2341   maybe_force_recovery();
2342 }
2343 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2344   OpRequestRef op,
2345   bool write_ordered,
2346   ObjectContextRef obc)
2347 {
2348   if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2349       CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2350     dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2351     return cache_result_t::NOOP;
2352   }
2353
2354   if (obc)
2355     dout(10) << __func__ << " " << obc->obs.oi << " "
2356        << (obc->obs.exists ? "exists" : "DNE")
2357        << dendl;
2358
2359   // if it is write-ordered and blocked, stop now
2360   if (obc.get() && obc->is_blocked() && write_ordered) {
2361     // we're already doing something with this object
2362     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2363     return cache_result_t::NOOP;
2364   }
2365
2366   vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2367   for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2368     OSDOp& osd_op = *p;
2369     ceph_osd_op& op = osd_op.op;
2370     if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2371       return cache_result_t::NOOP;
2372     }
2373   }
2374
2375   switch (obc->obs.oi.manifest.type) {
2376   case object_manifest_t::TYPE_REDIRECT:
2377     if (op->may_write() || write_ordered) {
2378       do_proxy_write(op, obc->obs.oi.soid, obc);
2379     } else {
2380       do_proxy_read(op, obc);
2381     }
2382     return cache_result_t::HANDLED_PROXY;
2383   case object_manifest_t::TYPE_CHUNKED:
2384   default:
2385     assert(0 == "unrecognized manifest type");
2386   }
2387
2388   return cache_result_t::NOOP;
2389 }
2390
2391 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2392                                       MOSDOpReply *orig_reply, int r)
2393 {
2394   dout(20) << __func__ << " r=" << r << dendl;
2395   assert(op->may_write());
2396   const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2397   ObjectContextRef obc;
2398   mempool::osd_pglog::list<pg_log_entry_t> entries;
2399   entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2400                                    get_next_version(), eversion_t(), 0,
2401                                    reqid, utime_t(), r));
2402
2403   struct OnComplete {
2404     PrimaryLogPG *pg;
2405     OpRequestRef op;
2406     boost::intrusive_ptr<MOSDOpReply> orig_reply;
2407     int r;
2408     OnComplete(
2409       PrimaryLogPG *pg,
2410       OpRequestRef op,
2411       MOSDOpReply *orig_reply,
2412       int r)
2413       : pg(pg), op(op),
2414         orig_reply(orig_reply, false /* take over ref */), r(r)
2415       {}
2416     void operator()() {
2417       ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2418       const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2419       int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2420       MOSDOpReply *reply = orig_reply.detach();
2421       if (reply == nullptr) {
2422         reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2423                                 flags, true);
2424       }
2425       ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2426       pg->osd->send_message_osd_client(reply, m->get_connection());
2427     }
2428   };
2429
2430   ObcLockManager lock_manager;
2431   submit_log_entries(
2432     entries,
2433     std::move(lock_manager),
2434     boost::optional<std::function<void(void)> >(
2435       OnComplete(this, op, orig_reply, r)),
2436     op,
2437     r);
2438 }
2439
2440 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2441   OpRequestRef op,
2442   bool write_ordered,
2443   ObjectContextRef obc,
2444   int r, hobject_t missing_oid,
2445   bool must_promote,
2446   bool in_hit_set,
2447   ObjectContextRef *promote_obc)
2448 {
2449   if (op &&
2450       op->get_req() &&
2451       op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2452       (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2453        CEPH_OSD_FLAG_IGNORE_CACHE)) {
2454     dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2455     return cache_result_t::NOOP;
2456   }
2457   // return quickly if caching is not enabled
2458   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2459     return cache_result_t::NOOP;
2460
2461   must_promote = must_promote || op->need_promote();
2462
2463   if (obc)
2464     dout(25) << __func__ << " " << obc->obs.oi << " "
2465              << (obc->obs.exists ? "exists" : "DNE")
2466              << " missing_oid " << missing_oid
2467              << " must_promote " << (int)must_promote
2468              << " in_hit_set " << (int)in_hit_set
2469              << dendl;
2470   else
2471     dout(25) << __func__ << " (no obc)"
2472              << " missing_oid " << missing_oid
2473              << " must_promote " << (int)must_promote
2474              << " in_hit_set " << (int)in_hit_set
2475              << dendl;
2476
2477   // if it is write-ordered and blocked, stop now
2478   if (obc.get() && obc->is_blocked() && write_ordered) {
2479     // we're already doing something with this object
2480     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2481     return cache_result_t::NOOP;
2482   }
2483
2484   if (r == -ENOENT && missing_oid == hobject_t()) {
2485     // we know this object is logically absent (e.g., an undefined clone)
2486     return cache_result_t::NOOP;
2487   }
2488
2489   if (obc.get() && obc->obs.exists) {
2490     osd->logger->inc(l_osd_op_cache_hit);
2491     return cache_result_t::NOOP;
2492   }
2493
2494   if (missing_oid == hobject_t() && obc.get()) {
2495     missing_oid = obc->obs.oi.soid;
2496   }
2497
2498   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2499   const object_locator_t oloc = m->get_object_locator();
2500
2501   if (op->need_skip_handle_cache()) {
2502     return cache_result_t::NOOP;
2503   }
2504
2505   // older versions do not proxy the feature bits.
2506   bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2507     CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2508   OpRequestRef promote_op;
2509
2510   switch (pool.info.cache_mode) {
2511   case pg_pool_t::CACHEMODE_WRITEBACK:
2512     if (agent_state &&
2513         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2514       if (!op->may_write() && !op->may_cache() &&
2515           !write_ordered && !must_promote) {
2516         dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2517         do_proxy_read(op);
2518         return cache_result_t::HANDLED_PROXY;
2519       }
2520       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2521       block_write_on_full_cache(missing_oid, op);
2522       return cache_result_t::BLOCKED_FULL;
2523     }
2524
2525     if (must_promote || (!hit_set && !op->need_skip_promote())) {
2526       promote_object(obc, missing_oid, oloc, op, promote_obc);
2527       return cache_result_t::BLOCKED_PROMOTE;
2528     }
2529
2530     if (op->may_write() || op->may_cache()) {
2531       if (can_proxy_write) {
2532         do_proxy_write(op, missing_oid);
2533       } else {
2534         // promote if can't proxy the write
2535         promote_object(obc, missing_oid, oloc, op, promote_obc);
2536         return cache_result_t::BLOCKED_PROMOTE;
2537       }
2538
2539       // Promote too?
2540       if (!op->need_skip_promote() &&
2541           maybe_promote(obc, missing_oid, oloc, in_hit_set,
2542                       pool.info.min_write_recency_for_promote,
2543                       OpRequestRef(),
2544                       promote_obc)) {
2545         return cache_result_t::BLOCKED_PROMOTE;
2546       }
2547       return cache_result_t::HANDLED_PROXY;
2548     } else {
2549       do_proxy_read(op);
2550
2551       // Avoid duplicate promotion
2552       if (obc.get() && obc->is_blocked()) {
2553         if (promote_obc)
2554           *promote_obc = obc;
2555         return cache_result_t::BLOCKED_PROMOTE;
2556       }
2557
2558       // Promote too?
2559       if (!op->need_skip_promote()) {
2560         (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2561                             pool.info.min_read_recency_for_promote,
2562                             promote_op, promote_obc);
2563       }
2564
2565       return cache_result_t::HANDLED_PROXY;
2566     }
2567     assert(0 == "unreachable");
2568     return cache_result_t::NOOP;
2569
2570   case pg_pool_t::CACHEMODE_FORWARD:
2571     // FIXME: this mode allows requests to be reordered.
2572     do_cache_redirect(op);
2573     return cache_result_t::HANDLED_REDIRECT;
2574
2575   case pg_pool_t::CACHEMODE_READONLY:
2576     // TODO: clean this case up
2577     if (!obc.get() && r == -ENOENT) {
2578       // we don't have the object and op's a read
2579       promote_object(obc, missing_oid, oloc, op, promote_obc);
2580       return cache_result_t::BLOCKED_PROMOTE;
2581     }
2582     if (!r) { // it must be a write
2583       do_cache_redirect(op);
2584       return cache_result_t::HANDLED_REDIRECT;
2585     }
2586     // crap, there was a failure of some kind
2587     return cache_result_t::NOOP;
2588
2589   case pg_pool_t::CACHEMODE_READFORWARD:
2590     // Do writeback to the cache tier for writes
2591     if (op->may_write() || write_ordered || must_promote) {
2592       if (agent_state &&
2593           agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2594         dout(20) << __func__ << " cache pool full, waiting" << dendl;
2595         block_write_on_full_cache(missing_oid, op);
2596         return cache_result_t::BLOCKED_FULL;
2597       }
2598       promote_object(obc, missing_oid, oloc, op, promote_obc);
2599       return cache_result_t::BLOCKED_PROMOTE;
2600     }
2601
2602     // If it is a read, we can read, we need to forward it
2603     do_cache_redirect(op);
2604     return cache_result_t::HANDLED_REDIRECT;
2605
2606   case pg_pool_t::CACHEMODE_PROXY:
2607     if (!must_promote) {
2608       if (op->may_write() || op->may_cache() || write_ordered) {
2609         if (can_proxy_write) {
2610           do_proxy_write(op, missing_oid);
2611           return cache_result_t::HANDLED_PROXY;
2612         }
2613       } else {
2614         do_proxy_read(op);
2615         return cache_result_t::HANDLED_PROXY;
2616       }
2617     }
2618     // ugh, we're forced to promote.
2619     if (agent_state &&
2620         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2621       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2622       block_write_on_full_cache(missing_oid, op);
2623       return cache_result_t::BLOCKED_FULL;
2624     }
2625     promote_object(obc, missing_oid, oloc, op, promote_obc);
2626     return cache_result_t::BLOCKED_PROMOTE;
2627
2628   case pg_pool_t::CACHEMODE_READPROXY:
2629     // Do writeback to the cache tier for writes
2630     if (op->may_write() || write_ordered || must_promote) {
2631       if (agent_state &&
2632           agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2633         dout(20) << __func__ << " cache pool full, waiting" << dendl;
2634         block_write_on_full_cache(missing_oid, op);
2635         return cache_result_t::BLOCKED_FULL;
2636       }
2637       promote_object(obc, missing_oid, oloc, op, promote_obc);
2638       return cache_result_t::BLOCKED_PROMOTE;
2639     }
2640
2641     // If it is a read, we can read, we need to proxy it
2642     do_proxy_read(op);
2643     return cache_result_t::HANDLED_PROXY;
2644
2645   default:
2646     assert(0 == "unrecognized cache_mode");
2647   }
2648   return cache_result_t::NOOP;
2649 }
2650
2651 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2652                                  const hobject_t& missing_oid,
2653                                  const object_locator_t& oloc,
2654                                  bool in_hit_set,
2655                                  uint32_t recency,
2656                                  OpRequestRef promote_op,
2657                                  ObjectContextRef *promote_obc)
2658 {
2659   dout(20) << __func__ << " missing_oid " << missing_oid
2660            << "  in_hit_set " << in_hit_set << dendl;
2661
2662   switch (recency) {
2663   case 0:
2664     break;
2665   case 1:
2666     // Check if in the current hit set
2667     if (in_hit_set) {
2668       break;
2669     } else {
2670       // not promoting
2671       return false;
2672     }
2673     break;
2674   default:
2675     {
2676       unsigned count = (int)in_hit_set;
2677       if (count) {
2678         // Check if in other hit sets
2679         const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2680         for (map<time_t,HitSetRef>::reverse_iterator itor =
2681                agent_state->hit_set_map.rbegin();
2682              itor != agent_state->hit_set_map.rend();
2683              ++itor) {
2684           if (!itor->second->contains(oid)) {
2685             break;
2686           }
2687           ++count;
2688           if (count >= recency) {
2689             break;
2690           }
2691         }
2692       }
2693       if (count >= recency) {
2694         break;
2695       }
2696       return false;     // not promoting
2697     }
2698     break;
2699   }
2700
2701   if (osd->promote_throttle()) {
2702     dout(10) << __func__ << " promote throttled" << dendl;
2703     return false;
2704   }
2705   promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2706   return true;
2707 }
2708
2709 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2710 {
2711   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2712   int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2713   MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2714                                        get_osdmap()->get_epoch(), flags, false);
2715   request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2716   reply->set_redirect(redir);
2717   dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2718            << op << dendl;
2719   m->get_connection()->send_message(reply);
2720   return;
2721 }
2722
2723 struct C_ProxyRead : public Context {
2724   PrimaryLogPGRef pg;
2725   hobject_t oid;
2726   epoch_t last_peering_reset;
2727   ceph_tid_t tid;
2728   PrimaryLogPG::ProxyReadOpRef prdop;
2729   utime_t start;
2730   C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2731              const PrimaryLogPG::ProxyReadOpRef& prd)
2732     : pg(p), oid(o), last_peering_reset(lpr),
2733       tid(0), prdop(prd), start(ceph_clock_now())
2734   {}
2735   void finish(int r) override {
2736     if (prdop->canceled)
2737       return;
2738     pg->lock();
2739     if (prdop->canceled) {
2740       pg->unlock();
2741       return;
2742     }
2743     if (last_peering_reset == pg->get_last_peering_reset()) {
2744       pg->finish_proxy_read(oid, tid, r);
2745       pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2746     }
2747     pg->unlock();
2748   }
2749 };
2750
2751 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2752 {
2753   // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2754   // stash the result in the request's OSDOp vector
2755   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2756   object_locator_t oloc;
2757   hobject_t soid;
2758   /* extensible tier */
2759   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2760     switch (obc->obs.oi.manifest.type) {
2761       case object_manifest_t::TYPE_REDIRECT:
2762           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2763           soid = obc->obs.oi.manifest.redirect_target;
2764           break;
2765       case object_manifest_t::TYPE_CHUNKED:
2766       default:
2767         assert(0 == "unrecognized manifest type");
2768     }
2769   } else {
2770   /* proxy */
2771     soid = m->get_hobj();
2772     oloc = object_locator_t(m->get_object_locator());
2773     oloc.pool = pool.info.tier_of;
2774   }
2775   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2776
2777   // pass through some original flags that make sense.
2778   //  - leave out redirection and balancing flags since we are
2779   //    already proxying through the primary
2780   //  - leave off read/write/exec flags that are derived from the op
2781   flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2782                              CEPH_OSD_FLAG_ORDERSNAP |
2783                              CEPH_OSD_FLAG_ENFORCE_SNAPC |
2784                              CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2785
2786   dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2787
2788   ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2789
2790   ObjectOperation obj_op;
2791   obj_op.dup(prdop->ops);
2792
2793   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2794       (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2795     for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2796       ceph_osd_op op = obj_op.ops[i].op;
2797       switch (op.op) {
2798         case CEPH_OSD_OP_READ:
2799         case CEPH_OSD_OP_SYNC_READ:
2800         case CEPH_OSD_OP_SPARSE_READ:
2801         case CEPH_OSD_OP_CHECKSUM:
2802         case CEPH_OSD_OP_CMPEXT:
2803           op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2804                        ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2805       }
2806     }
2807   }
2808
2809   C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2810                                      prdop);
2811   ceph_tid_t tid = osd->objecter->read(
2812     soid.oid, oloc, obj_op,
2813     m->get_snapid(), NULL,
2814     flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2815     &prdop->user_version,
2816     &prdop->data_offset,
2817     m->get_features());
2818   fin->tid = tid;
2819   prdop->objecter_tid = tid;
2820   proxyread_ops[tid] = prdop;
2821   in_progress_proxy_ops[soid].push_back(op);
2822 }
2823
2824 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2825 {
2826   dout(10) << __func__ << " " << oid << " tid " << tid
2827            << " " << cpp_strerror(r) << dendl;
2828
2829   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2830   if (p == proxyread_ops.end()) {
2831     dout(10) << __func__ << " no proxyread_op found" << dendl;
2832     return;
2833   }
2834   ProxyReadOpRef prdop = p->second;
2835   if (tid != prdop->objecter_tid) {
2836     dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2837              << " tid " << prdop->objecter_tid << dendl;
2838     return;
2839   }
2840   if (oid != prdop->soid) {
2841     dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2842              << " soid " << prdop->soid << dendl;
2843     return;
2844   }
2845   proxyread_ops.erase(tid);
2846
2847   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2848   if (q == in_progress_proxy_ops.end()) {
2849     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2850     return;
2851   }
2852   assert(q->second.size());
2853   list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2854                                               q->second.end(),
2855                                               prdop->op);
2856   assert(it != q->second.end());
2857   OpRequestRef op = *it;
2858   q->second.erase(it);
2859   if (q->second.size() == 0) {
2860     in_progress_proxy_ops.erase(oid);
2861   }
2862
2863   osd->logger->inc(l_osd_tier_proxy_read);
2864
2865   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2866   OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2867   ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2868   ctx->user_at_version = prdop->user_version;
2869   ctx->data_off = prdop->data_offset;
2870   ctx->ignore_log_op_stats = true;
2871   complete_read_ctx(r, ctx);
2872 }
2873
2874 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2875 {
2876   map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2877   if (p == in_progress_proxy_ops.end())
2878     return;
2879
2880   list<OpRequestRef>& ls = p->second;
2881   dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2882   requeue_ops(ls);
2883   in_progress_proxy_ops.erase(p);
2884 }
2885
2886 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2887 {
2888   dout(10) << __func__ << " " << prdop->soid << dendl;
2889   prdop->canceled = true;
2890
2891   // cancel objecter op, if we can
2892   if (prdop->objecter_tid) {
2893     osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2894     for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2895       prdop->ops[i].outdata.clear();
2896     }
2897     proxyread_ops.erase(prdop->objecter_tid);
2898     prdop->objecter_tid = 0;
2899   }
2900 }
2901
2902 void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2903 {
2904   dout(10) << __func__ << dendl;
2905
2906   // cancel proxy reads
2907   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2908   while (p != proxyread_ops.end()) {
2909     cancel_proxy_read((p++)->second);
2910   }
2911
2912   // cancel proxy writes
2913   map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2914   while (q != proxywrite_ops.end()) {
2915     cancel_proxy_write((q++)->second);
2916   }
2917
2918   if (requeue) {
2919     map<hobject_t, list<OpRequestRef>>::iterator p =
2920       in_progress_proxy_ops.begin();
2921     while (p != in_progress_proxy_ops.end()) {
2922       list<OpRequestRef>& ls = p->second;
2923       dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2924                << " requests" << dendl;
2925       requeue_ops(ls);
2926       in_progress_proxy_ops.erase(p++);
2927     }
2928   } else {
2929     in_progress_proxy_ops.clear();
2930   }
2931 }
2932
2933 struct C_ProxyWrite_Commit : public Context {
2934   PrimaryLogPGRef pg;
2935   hobject_t oid;
2936   epoch_t last_peering_reset;
2937   ceph_tid_t tid;
2938   PrimaryLogPG::ProxyWriteOpRef pwop;
2939   C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2940                       const PrimaryLogPG::ProxyWriteOpRef& pw)
2941     : pg(p), oid(o), last_peering_reset(lpr),
2942       tid(0), pwop(pw)
2943   {}
2944   void finish(int r) override {
2945     if (pwop->canceled)
2946       return;
2947     pg->lock();
2948     if (pwop->canceled) {
2949       pg->unlock();
2950       return;
2951     }
2952     if (last_peering_reset == pg->get_last_peering_reset()) {
2953       pg->finish_proxy_write(oid, tid, r);
2954     }
2955     pg->unlock();
2956   }
2957 };
2958
2959 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2960 {
2961   // NOTE: non-const because ProxyWriteOp takes a mutable ref
2962   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2963   object_locator_t oloc;
2964   SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2965   hobject_t soid;
2966   /* extensible tier */
2967   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2968     switch (obc->obs.oi.manifest.type) {
2969       case object_manifest_t::TYPE_REDIRECT:
2970           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2971           soid = obc->obs.oi.manifest.redirect_target;
2972           break;
2973       case object_manifest_t::TYPE_CHUNKED:
2974       default:
2975         assert(0 == "unrecognized manifest type");
2976     }
2977   } else {
2978   /* proxy */
2979     soid = m->get_hobj();
2980     oloc = object_locator_t(m->get_object_locator());
2981     oloc.pool = pool.info.tier_of;
2982   }
2983
2984   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2985   if (!(op->may_write() || op->may_cache())) {
2986     flags |= CEPH_OSD_FLAG_RWORDERED;
2987   }
2988   dout(10) << __func__ << " Start proxy write for " << *m << dendl;
2989
2990   ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
2991   pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
2992   pwop->mtime = m->get_mtime();
2993
2994   ObjectOperation obj_op;
2995   obj_op.dup(pwop->ops);
2996
2997   C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
2998       this, soid, get_last_peering_reset(), pwop);
2999   ceph_tid_t tid = osd->objecter->mutate(
3000     soid.oid, oloc, obj_op, snapc,
3001     ceph::real_clock::from_ceph_timespec(pwop->mtime),
3002     flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3003     &pwop->user_version, pwop->reqid);
3004   fin->tid = tid;
3005   pwop->objecter_tid = tid;
3006   proxywrite_ops[tid] = pwop;
3007   in_progress_proxy_ops[soid].push_back(op);
3008 }
3009
3010 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3011 {
3012   dout(10) << __func__ << " " << oid << " tid " << tid
3013            << " " << cpp_strerror(r) << dendl;
3014
3015   map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3016   if (p == proxywrite_ops.end()) {
3017     dout(10) << __func__ << " no proxywrite_op found" << dendl;
3018     return;
3019   }
3020   ProxyWriteOpRef pwop = p->second;
3021   assert(tid == pwop->objecter_tid);
3022   assert(oid == pwop->soid);
3023
3024   proxywrite_ops.erase(tid);
3025
3026   map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3027   if (q == in_progress_proxy_ops.end()) {
3028     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3029     delete pwop->ctx;
3030     pwop->ctx = NULL;
3031     return;
3032   }
3033   list<OpRequestRef>& in_progress_op = q->second;
3034   assert(in_progress_op.size());
3035   list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3036                                               in_progress_op.end(),
3037                                               pwop->op);
3038   assert(it != in_progress_op.end());
3039   in_progress_op.erase(it);
3040   if (in_progress_op.size() == 0) {
3041     in_progress_proxy_ops.erase(oid);
3042   }
3043
3044   osd->logger->inc(l_osd_tier_proxy_write);
3045
3046   const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3047   assert(m != NULL);
3048
3049   if (!pwop->sent_reply) {
3050     // send commit.
3051     MOSDOpReply *reply = pwop->ctx->reply;
3052     if (reply)
3053       pwop->ctx->reply = NULL;
3054     else {
3055       reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3056       reply->set_reply_versions(eversion_t(), pwop->user_version);
3057     }
3058     reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3059     dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3060     osd->send_message_osd_client(reply, m->get_connection());
3061     pwop->sent_reply = true;
3062     pwop->ctx->op->mark_commit_sent();
3063   }
3064
3065   delete pwop->ctx;
3066   pwop->ctx = NULL;
3067 }
3068
3069 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3070 {
3071   dout(10) << __func__ << " " << pwop->soid << dendl;
3072   pwop->canceled = true;
3073
3074   // cancel objecter op, if we can
3075   if (pwop->objecter_tid) {
3076     osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3077     delete pwop->ctx;
3078     pwop->ctx = NULL;
3079     proxywrite_ops.erase(pwop->objecter_tid);
3080     pwop->objecter_tid = 0;
3081   }
3082 }
3083
3084 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3085   ObjectContextRef obc;
3086   PrimaryLogPG *pg;
3087   utime_t start;
3088 public:
3089   PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3090     : obc(obc_),
3091       pg(pg_),
3092       start(ceph_clock_now()) {}
3093
3094   void finish(PrimaryLogPG::CopyCallbackResults results) override {
3095     PrimaryLogPG::CopyResults *results_data = results.get<1>();
3096     int r = results.get<0>();
3097     pg->finish_promote(r, results_data, obc);
3098     pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3099   }
3100 };
3101
3102 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3103                                   const hobject_t& missing_oid,
3104                                   const object_locator_t& oloc,
3105                                   OpRequestRef op,
3106                                   ObjectContextRef *promote_obc)
3107 {
3108   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3109   assert(hoid != hobject_t());
3110   if (scrubber.write_blocked_by_scrub(hoid)) {
3111     dout(10) << __func__ << " " << hoid
3112              << " blocked by scrub" << dendl;
3113     if (op) {
3114       waiting_for_scrub.push_back(op);
3115       op->mark_delayed("waiting for scrub");
3116       dout(10) << __func__ << " " << hoid
3117                << " placing op in waiting_for_scrub" << dendl;
3118     } else {
3119       dout(10) << __func__ << " " << hoid
3120                << " no op, dropping on the floor" << dendl;
3121     }
3122     return;
3123   }
3124   if (!obc) { // we need to create an ObjectContext
3125     assert(missing_oid != hobject_t());
3126     obc = get_object_context(missing_oid, true);
3127   }
3128   if (promote_obc)
3129     *promote_obc = obc;
3130
3131   /*
3132    * Before promote complete, if there are  proxy-reads for the object,
3133    * for this case we don't use DONTNEED.
3134    */
3135   unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3136   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3137   if (q == in_progress_proxy_ops.end()) {
3138     src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3139   }
3140
3141   PromoteCallback *cb = new PromoteCallback(obc, this);
3142   object_locator_t my_oloc = oloc;
3143   my_oloc.pool = pool.info.tier_of;
3144
3145   unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3146                    CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3147                    CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3148                    CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3149   start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3150              obc->obs.oi.soid.snap == CEPH_NOSNAP,
3151              src_fadvise_flags, 0);
3152
3153   assert(obc->is_blocked());
3154
3155   if (op)
3156     wait_for_blocked_object(obc->obs.oi.soid, op);
3157   info.stats.stats.sum.num_promote++;
3158 }
3159
/*
 * Run the ops of an OpContext and dispatch on the outcome.
 *
 * The function prepares the transaction via prepare_transaction() and then
 * takes one of these paths:
 *   - result is -EINPROGRESS or async reads are pending: return and leave
 *     ctx alive (async reads are started if there are any);
 *   - result is -EAGAIN: close the ctx;
 *   - the transaction is empty or the result is negative, and
 *     update_log_only is not set: finish through complete_read_ctx();
 *   - update_log_only is set: hand the reply to record_write_error();
 *   - otherwise: register commit/success/finish callbacks on ctx, build a
 *     repop and issue it.
 *
 * As noted below, this may be called several times for the same ctx before
 * the transaction is finally applied.
 */
void PrimaryLogPG::execute_ctx(OpContext *ctx)
{
  FUNCTRACE();
  dout(10) << __func__ << " " << ctx << dendl;
  ctx->reset_obs(ctx->obc);
  ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
  OpRequestRef op = ctx->op;
  const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
  ObjectContextRef obc = ctx->obc;
  const hobject_t& soid = obc->obs.oi.soid;

  // this method must be idempotent since we may call it several times
  // before we finally apply the resulting transaction.
  ctx->op_t.reset(new PGTransaction);

  if (op->may_write() || op->may_cache()) {
    // snap
    if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
        pool.info.is_pool_snaps_mode()) {
      // use pool's snapc
      ctx->snapc = pool.snapc;
    } else {
      // client specified snapc
      ctx->snapc.seq = m->get_snap_seq();
      ctx->snapc.snaps = m->get_snaps();
      filter_snapc(ctx->snapc.snaps);
    }
    // with ORDERSNAP, a snapc older than the object's snapset is rejected
    if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
        ctx->snapc.seq < obc->ssc->snapset.seq) {
      dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
               << " < snapset seq " << obc->ssc->snapset.seq
               << " on " << obc->obs.oi.soid << dendl;
      reply_ctx(ctx, -EOLDSNAPC);
      return;
    }

    // version
    ctx->at_version = get_next_version();
    ctx->mtime = m->get_mtime();

    dout(10) << __func__ << " " << soid << " " << *ctx->ops
             << " ov " << obc->obs.oi.version << " av " << ctx->at_version
             << " snapc " << ctx->snapc
             << " snapset " << obc->ssc->snapset
             << dendl;
  } else {
    dout(10) << __func__ << " " << soid << " " << *ctx->ops
             << " ov " << obc->obs.oi.version
             << dendl;
  }

  // default the user-visible version to the object's current one unless
  // it has already been set on the ctx
  if (!ctx->user_at_version)
    ctx->user_at_version = obc->obs.oi.user_version;
  dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;

  // the ondisk read lock is held only around prepare_transaction()
  if (op->may_read()) {
    dout(10) << " taking ondisk_read_lock" << dendl;
    obc->ondisk_read_lock();
  }

  {
    // reqid is only declared when LTTng support is compiled in
#ifdef WITH_LTTNG
    osd_reqid_t reqid = ctx->op->get_reqid();
#endif
    tracepoint(osd, prepare_tx_enter, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  int result = prepare_transaction(ctx);

  {
#ifdef WITH_LTTNG
    osd_reqid_t reqid = ctx->op->get_reqid();
#endif
    tracepoint(osd, prepare_tx_exit, reqid.name._type,
        reqid.name._num, reqid.tid, reqid.inc);
  }

  if (op->may_read()) {
    dout(10) << " dropping ondisk_read_lock" << dendl;
    obc->ondisk_read_unlock();
  }

  bool pending_async_reads = !ctx->pending_async_reads.empty();
  if (result == -EINPROGRESS || pending_async_reads) {
    // come back later.
    if (pending_async_reads) {
      in_progress_async_reads.push_back(make_pair(op, ctx));
      ctx->start_async_reads(this);
    }
    return;
  }

  if (result == -EAGAIN) {
    // clean up after the ctx
    close_op_ctx(ctx);
    return;
  }

  bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
  // prepare the reply
  ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
                               successful_write);

  // Write operations aren't allowed to return a data payload because
  // we can't do so reliably. If the client has to resend the request
  // and it has already been applied, we will return 0 with no
  // payload.  Non-deterministic behavior is no good.  However, it is
  // possible to construct an operation that does a read, does a guard
  // check (e.g., CMPXATTR), and then a write.  Then we either succeed
  // with the write, or return a CMPXATTR and the read value.
  if (successful_write) {
    // write.  normalize the result code.
    dout(20) << " zeroing write result code " << result << dendl;
    result = 0;
  }
  ctx->reply->set_result(result);

  // read or error?
  if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
    // finish side-effects
    if (result >= 0)
      do_osd_op_effects(ctx, m->get_connection());

    complete_read_ctx(result, ctx);
    return;
  }

  ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);

  assert(op->may_write() || op->may_cache());

  // trim log?
  calc_trim_to();

  // verify that we are doing this in order?
  // (debug option; only for client ops on pools that are neither a tier nor
  // have tiers)  Tracks the last tid seen per client for this object and
  // asserts if a smaller tid shows up.
  if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
      !pool.info.is_tier() && !pool.info.has_tiers()) {
    map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
    ceph_tid_t t = m->get_tid();
    client_t n = m->get_source().num();
    map<client_t,ceph_tid_t>::iterator p = cm.find(n);
    if (p == cm.end()) {
      dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
      cm[n] = t;
    } else {
      dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
      if (p->second > t) {
        derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
        assert(0 == "out of order op");
      }
      p->second = t;
    }
  }

  if (ctx->update_log_only) {
    if (result >= 0)
      do_osd_op_effects(ctx, m->get_connection());

    dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
    // save just what we need from ctx
    // (the reply is detached from ctx before ctx is closed below)
    MOSDOpReply *reply = ctx->reply;
    ctx->reply = nullptr;
    reply->claim_op_out_data(*ctx->ops);
    reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
    close_op_ctx(ctx);

    if (result == -ENOENT) {
      reply->set_enoent_reply_versions(info.last_update,
                                       info.last_user_version);
    }
    reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
    // append to pg log for dup detection - don't save buffers for now
    record_write_error(op, soid, reply, result);
    return;
  }

  // no need to capture PG ref, repop cancel will handle that
  // Can capture the ctx by pointer, it's owned by the repop
  // on commit: log op stats and, if no reply went out yet, send the reply
  // (the one stored on ctx, or a freshly built one)
  ctx->register_on_commit(
    [m, ctx, this](){
      if (ctx->op)
        log_op_stats(
          ctx);

      if (m && !ctx->sent_reply) {
        MOSDOpReply *reply = ctx->reply;
        if (reply)
          ctx->reply = nullptr;
        else {
          reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
          reply->set_reply_versions(ctx->at_version,
                                    ctx->user_at_version);
        }
        reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
        dout(10) << " sending reply on " << *m << " " << reply << dendl;
        osd->send_message_osd_client(reply, m->get_connection());
        ctx->sent_reply = true;
        ctx->op->mark_commit_sent();
      }
    });
  // on success: run the op side-effects, with the request's connection if
  // the ctx has an op and an empty ConnectionRef otherwise
  ctx->register_on_success(
    [ctx, this]() {
      do_osd_op_effects(
        ctx,
        ctx->op ? ctx->op->get_req()->get_connection() :
        ConnectionRef());
    });
  // on finish: the ctx is deleted by its own callback
  ctx->register_on_finish(
    [ctx, this]() {
      delete ctx;
    });

  // issue replica writes
  ceph_tid_t rep_tid = osd->get_tid();

  RepGather *repop = new_repop(ctx, obc, rep_tid);

  issue_repop(repop, ctx);
  eval_repop(repop);
  // release the reference taken on the repop in this function
  // NOTE(review): presumably the one new_repop() returned — confirm
  repop->put();
}
3382
3383 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3384   release_object_locks(ctx->lock_manager);
3385
3386   ctx->op_t.reset();
3387
3388   for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3389        ctx->on_finish.erase(p++)) {
3390     (*p)();
3391   }
3392   delete ctx;
3393 }
3394
3395 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3396 {
3397   if (ctx->op)
3398     osd->reply_op_error(ctx->op, r);
3399   close_op_ctx(ctx);
3400 }
3401
3402 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3403 {
3404   if (ctx->op)
3405     osd->reply_op_error(ctx->op, r, v, uv);
3406   close_op_ctx(ctx);
3407 }
3408
3409 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3410 {
3411   OpRequestRef op = ctx->op;
3412   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3413
3414   utime_t now = ceph_clock_now();
3415   utime_t latency = now;
3416   latency -= ctx->op->get_req()->get_recv_stamp();
3417   utime_t process_latency = now;
3418   process_latency -= ctx->op->get_dequeued_time();
3419
3420   uint64_t inb = ctx->bytes_written;
3421   uint64_t outb = ctx->bytes_read;
3422
3423   osd->logger->inc(l_osd_op);
3424
3425   osd->logger->inc(l_osd_op_outb, outb);
3426   osd->logger->inc(l_osd_op_inb, inb);
3427   osd->logger->tinc(l_osd_op_lat, latency);
3428   osd->logger->tinc(l_osd_op_process_lat, process_latency);
3429
3430   if (op->may_read() && op->may_write()) {
3431     osd->logger->inc(l_osd_op_rw);
3432     osd->logger->inc(l_osd_op_rw_inb, inb);
3433     osd->logger->inc(l_osd_op_rw_outb, outb);
3434     osd->logger->tinc(l_osd_op_rw_lat, latency);
3435     osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3436     osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3437     osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3438   } else if (op->may_read()) {
3439     osd->logger->inc(l_osd_op_r);
3440     osd->logger->inc(l_osd_op_r_outb, outb);
3441     osd->logger->tinc(l_osd_op_r_lat, latency);
3442     osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3443     osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3444   } else if (op->may_write() || op->may_cache()) {
3445     osd->logger->inc(l_osd_op_w);
3446     osd->logger->inc(l_osd_op_w_inb, inb);
3447     osd->logger->tinc(l_osd_op_w_lat, latency);
3448     osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3449     osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3450   } else
3451     ceph_abort();
3452
3453   dout(15) << "log_op_stats " << *m
3454            << " inb " << inb
3455            << " outb " << outb
3456            << " lat " << latency << dendl;
3457 }
3458
3459 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3460 {
3461   const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3462   assert(have_same_or_newer_map(m->map_epoch));
3463   assert(m->get_type() == MSG_OSD_SUBOP);
3464   dout(15) << "do_sub_op " << *op->get_req() << dendl;
3465
3466   if (!is_peered()) {
3467     waiting_for_peered.push_back(op);
3468     op->mark_delayed("waiting for active");
3469     return;
3470   }
3471
3472   const OSDOp *first = NULL;
3473   if (m->ops.size() >= 1) {
3474     first = &m->ops[0];
3475   }
3476
3477   if (first) {
3478     switch (first->op.op) {
3479     case CEPH_OSD_OP_DELETE:
3480       sub_op_remove(op);
3481       return;
3482     case CEPH_OSD_OP_SCRUB_RESERVE:
3483       handle_scrub_reserve_request(op);
3484       return;
3485     case CEPH_OSD_OP_SCRUB_UNRESERVE:
3486       handle_scrub_reserve_release(op);
3487       return;
3488     case CEPH_OSD_OP_SCRUB_MAP:
3489       sub_op_scrub_map(op);
3490       return;
3491     }
3492   }
3493 }
3494
3495 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3496 {
3497   const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3498   assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3499   if (r->ops.size() >= 1) {
3500     const OSDOp& first = r->ops[0];
3501     switch (first.op.op) {
3502     case CEPH_OSD_OP_SCRUB_RESERVE:
3503       {
3504         pg_shard_t from = r->from;
3505         bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3506         bool reserved;
3507         ::decode(reserved, p);
3508         if (reserved) {
3509           handle_scrub_reserve_grant(op, from);
3510         } else {
3511           handle_scrub_reserve_reject(op, from);
3512         }
3513       }
3514       return;
3515     }
3516   }
3517 }
3518
3519 void PrimaryLogPG::do_scan(
3520   OpRequestRef op,
3521   ThreadPool::TPHandle &handle)
3522 {
3523   const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3524   assert(m->get_type() == MSG_OSD_PG_SCAN);
3525   dout(10) << "do_scan " << *m << dendl;
3526
3527   op->mark_started();
3528
3529   switch (m->op) {
3530   case MOSDPGScan::OP_SCAN_GET_DIGEST:
3531     {
3532       ostringstream ss;
3533       if (osd->check_backfill_full(ss)) {
3534         dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3535         queue_peering_event(
3536           CephPeeringEvtRef(
3537             std::make_shared<CephPeeringEvt>(
3538               get_osdmap()->get_epoch(),
3539               get_osdmap()->get_epoch(),
3540               BackfillTooFull())));
3541         return;
3542       }
3543
3544       BackfillInterval bi;
3545       bi.begin = m->begin;
3546       // No need to flush, there won't be any in progress writes occuring
3547       // past m->begin
3548       scan_range(
3549         cct->_conf->osd_backfill_scan_min,
3550         cct->_conf->osd_backfill_scan_max,
3551         &bi,
3552         handle);
3553       MOSDPGScan *reply = new MOSDPGScan(
3554         MOSDPGScan::OP_SCAN_DIGEST,
3555         pg_whoami,
3556         get_osdmap()->get_epoch(), m->query_epoch,
3557         spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3558       ::encode(bi.objects, reply->get_data());
3559       osd->send_message_osd_cluster(reply, m->get_connection());
3560     }
3561     break;
3562
3563   case MOSDPGScan::OP_SCAN_DIGEST:
3564     {
3565       pg_shard_t from = m->from;
3566
3567       // Check that from is in backfill_targets vector
3568       assert(is_backfill_targets(from));
3569
3570       BackfillInterval& bi = peer_backfill_info[from];
3571       bi.begin = m->begin;
3572       bi.end = m->end;
3573       bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3574
3575       // take care to preserve ordering!
3576       bi.clear_objects();
3577       ::decode_noclear(bi.objects, p);
3578
3579       if (waiting_on_backfill.erase(from)) {
3580         if (waiting_on_backfill.empty()) {
3581           assert(peer_backfill_info.size() == backfill_targets.size());
3582           finish_recovery_op(hobject_t::get_max());
3583         }
3584       } else {
3585         // we canceled backfill for a while due to a too full, and this
3586         // is an extra response from a non-too-full peer
3587       }
3588     }
3589     break;
3590   }
3591 }
3592
3593 void PrimaryLogPG::do_backfill(OpRequestRef op)
3594 {
3595   const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3596   assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3597   dout(10) << "do_backfill " << *m << dendl;
3598
3599   op->mark_started();
3600
3601   switch (m->op) {
3602   case MOSDPGBackfill::OP_BACKFILL_FINISH:
3603     {
3604       assert(cct->_conf->osd_kill_backfill_at != 1);
3605
3606       MOSDPGBackfill *reply = new MOSDPGBackfill(
3607         MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3608         get_osdmap()->get_epoch(),
3609         m->query_epoch,
3610         spg_t(info.pgid.pgid, get_primary().shard));
3611       reply->set_priority(get_recovery_op_priority());
3612       osd->send_message_osd_cluster(reply, m->get_connection());
3613       queue_peering_event(
3614         CephPeeringEvtRef(
3615           std::make_shared<CephPeeringEvt>(
3616             get_osdmap()->get_epoch(),
3617             get_osdmap()->get_epoch(),
3618             RecoveryDone())));
3619     }
3620     // fall-thru
3621
3622   case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3623     {
3624       assert(cct->_conf->osd_kill_backfill_at != 2);
3625
3626       info.set_last_backfill(m->last_backfill);
3627       info.stats = m->stats;
3628
3629       ObjectStore::Transaction t;
3630       dirty_info = true;
3631       write_if_dirty(t);
3632       int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3633       assert(tr == 0);
3634     }
3635     break;
3636
3637   case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3638     {
3639       assert(is_primary());
3640       assert(cct->_conf->osd_kill_backfill_at != 3);
3641       finish_recovery_op(hobject_t::get_max());
3642     }
3643     break;
3644   }
3645 }
3646
3647 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3648 {
3649   const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3650     op->get_req());
3651   assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3652   dout(7) << __func__ << " " << m->ls << dendl;
3653
3654   op->mark_started();
3655
3656   ObjectStore::Transaction t;
3657   for (auto& p : m->ls) {
3658     remove_snap_mapped_object(t, p.first);
3659   }
3660   int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3661   assert(r == 0);
3662 }
3663
3664 int PrimaryLogPG::trim_object(
3665   bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3666 {
3667   *ctxp = NULL;
3668   // load clone info
3669   bufferlist bl;
3670   ObjectContextRef obc = get_object_context(coid, false, NULL);
3671   if (!obc || !obc->ssc || !obc->ssc->exists) {
3672     osd->clog->error() << __func__ << ": Can not trim " << coid
3673       << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3674     return -ENOENT;
3675   }
3676
3677   hobject_t snapoid(
3678     coid.oid, coid.get_key(),
3679     obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3680     info.pgid.pool(), coid.get_namespace());
3681   ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3682   if (!snapset_obc) {
3683     osd->clog->error() << __func__ << ": Can not trim " << coid
3684       << " repair needed, no snapset obc for " << snapoid;
3685     return -ENOENT;
3686   }
3687
3688   SnapSet& snapset = obc->ssc->snapset;
3689
3690   bool legacy = snapset.is_legacy() ||
3691     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3692
3693   object_info_t &coi = obc->obs.oi;
3694   set<snapid_t> old_snaps;
3695   if (legacy) {
3696     old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3697   } else {
3698     auto p = snapset.clone_snaps.find(coid.snap);
3699     if (p == snapset.clone_snaps.end()) {
3700       osd->clog->error() << "No clone_snaps in snapset " << snapset
3701                          << " for object " << coid << "\n";
3702       return -ENOENT;
3703     }
3704     old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3705                      snapset.clone_snaps[coid.snap].end());
3706   }
3707   if (old_snaps.empty()) {
3708     osd->clog->error() << "No object info snaps for object " << coid;
3709     return -ENOENT;
3710   }
3711
3712   dout(10) << coid << " old_snaps " << old_snaps
3713            << " old snapset " << snapset << dendl;
3714   if (snapset.seq == 0) {
3715     osd->clog->error() << "No snapset.seq for object " << coid;
3716     return -ENOENT;
3717   }
3718
3719   set<snapid_t> new_snaps;
3720   for (set<snapid_t>::iterator i = old_snaps.begin();
3721        i != old_snaps.end();
3722        ++i) {
3723     if (!pool.info.is_removed_snap(*i))
3724       new_snaps.insert(*i);
3725   }
3726
3727   vector<snapid_t>::iterator p = snapset.clones.end();
3728
3729   if (new_snaps.empty()) {
3730     p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3731     if (p == snapset.clones.end()) {
3732       osd->clog->error() << "Snap " << coid.snap << " not in clones";
3733       return -ENOENT;
3734     }
3735   }
3736
3737   OpContextUPtr ctx = simple_opc_create(obc);
3738   ctx->snapset_obc = snapset_obc;
3739
3740   if (!ctx->lock_manager.get_snaptrimmer_write(
3741         coid,
3742         obc,
3743         first)) {
3744     close_op_ctx(ctx.release());
3745     dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3746     return -ENOLCK;
3747   }
3748
3749   if (!ctx->lock_manager.get_snaptrimmer_write(
3750         snapoid,
3751         snapset_obc,
3752         first)) {
3753     close_op_ctx(ctx.release());
3754     dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3755     return -ENOLCK;
3756   }
3757
3758   ctx->at_version = get_next_version();
3759
3760   PGTransaction *t = ctx->op_t.get();
3761
3762   if (new_snaps.empty()) {
3763     // remove clone
3764     dout(10) << coid << " snaps " << old_snaps << " -> "
3765              << new_snaps << " ... deleting" << dendl;
3766
3767     // ...from snapset
3768     assert(p != snapset.clones.end());
3769
3770     snapid_t last = coid.snap;
3771     ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3772
3773     if (p != snapset.clones.begin()) {
3774       // not the oldest... merge overlap into next older clone
3775       vector<snapid_t>::iterator n = p - 1;
3776       hobject_t prev_coid = coid;
3777       prev_coid.snap = *n;
3778       bool adjust_prev_bytes = is_present_clone(prev_coid);
3779
3780       if (adjust_prev_bytes)
3781         ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3782
3783       snapset.clone_overlap[*n].intersection_of(
3784         snapset.clone_overlap[*p]);
3785
3786       if (adjust_prev_bytes)
3787         ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3788     }
3789     ctx->delta_stats.num_objects--;
3790     if (coi.is_dirty())
3791       ctx->delta_stats.num_objects_dirty--;
3792     if (coi.is_omap())
3793       ctx->delta_stats.num_objects_omap--;
3794     if (coi.is_whiteout()) {
3795       dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3796       ctx->delta_stats.num_whiteouts--;
3797     }
3798     ctx->delta_stats.num_object_clones--;
3799     if (coi.is_cache_pinned())
3800       ctx->delta_stats.num_objects_pinned--;
3801     obc->obs.exists = false;
3802
3803     snapset.clones.erase(p);
3804     snapset.clone_overlap.erase(last);
3805     snapset.clone_size.erase(last);
3806     snapset.clone_snaps.erase(last);
3807
3808     ctx->log.push_back(
3809       pg_log_entry_t(
3810         pg_log_entry_t::DELETE,
3811         coid,
3812         ctx->at_version,
3813         ctx->obs->oi.version,
3814         0,
3815         osd_reqid_t(),
3816         ctx->mtime,
3817         0)
3818       );
3819     t->remove(coid);
3820     t->update_snaps(
3821       coid,
3822       old_snaps,
3823       new_snaps);
3824
3825     coi = object_info_t(coid);
3826
3827     ctx->at_version.version++;
3828   } else {
3829     // save adjusted snaps for this object
3830     dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3831     if (legacy) {
3832       coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3833     } else {
3834       snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3835                                                         new_snaps.rend());
3836       // we still do a 'modify' event on this object just to trigger a
3837       // snapmapper.update ... :(
3838     }
3839
3840     coi.prior_version = coi.version;
3841     coi.version = ctx->at_version;
3842     bl.clear();
3843     ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3844     t->setattr(coid, OI_ATTR, bl);
3845
3846     ctx->log.push_back(
3847       pg_log_entry_t(
3848         pg_log_entry_t::MODIFY,
3849         coid,
3850         coi.version,
3851         coi.prior_version,
3852         0,
3853         osd_reqid_t(),
3854         ctx->mtime,
3855         0)
3856       );
3857     ctx->at_version.version++;
3858
3859     t->update_snaps(
3860       coid,
3861       old_snaps,
3862       new_snaps);
3863   }
3864
3865   // save head snapset
3866   dout(10) << coid << " new snapset " << snapset << " on "
3867            << snapset_obc->obs.oi << dendl;
3868   if (snapset.clones.empty() &&
3869       (!snapset.head_exists ||
3870        (snapset_obc->obs.oi.is_whiteout() &&
3871         !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3872         !snapset_obc->obs.oi.is_cache_pinned()))) {
3873     // NOTE: this arguably constitutes minor interference with the
3874     // tiering agent if this is a cache tier since a snap trim event
3875     // is effectively evicting a whiteout we might otherwise want to
3876     // keep around.
3877     dout(10) << coid << " removing " << snapoid << dendl;
3878     ctx->log.push_back(
3879       pg_log_entry_t(
3880         pg_log_entry_t::DELETE,
3881         snapoid,
3882         ctx->at_version,
3883         ctx->snapset_obc->obs.oi.version,
3884         0,
3885         osd_reqid_t(),
3886         ctx->mtime,
3887         0)
3888       );
3889     if (snapoid.is_head()) {
3890       derr << "removing snap head" << dendl;
3891       object_info_t& oi = ctx->snapset_obc->obs.oi;
3892       ctx->delta_stats.num_objects--;
3893       if (oi.is_dirty()) {
3894         ctx->delta_stats.num_objects_dirty--;
3895       }
3896       if (oi.is_omap())
3897         ctx->delta_stats.num_objects_omap--;
3898       if (oi.is_whiteout()) {
3899         dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3900         ctx->delta_stats.num_whiteouts--;
3901       }
3902       if (oi.is_cache_pinned()) {
3903         ctx->delta_stats.num_objects_pinned--;
3904       }
3905     }
3906     ctx->snapset_obc->obs.exists = false;
3907     ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3908     t->remove(snapoid);
3909   } else {
3910     dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3911     snapset.filter(pool.info);
3912     dout(10) << coid << " writing updated snapset on " << snapoid
3913              << ", snapset is " << snapset << dendl;
3914     ctx->log.push_back(
3915       pg_log_entry_t(
3916         pg_log_entry_t::MODIFY,
3917         snapoid,
3918         ctx->at_version,
3919         ctx->snapset_obc->obs.oi.version,
3920         0,
3921         osd_reqid_t(),
3922         ctx->mtime,
3923         0)
3924       );
3925
3926     ctx->snapset_obc->obs.oi.prior_version =
3927       ctx->snapset_obc->obs.oi.version;
3928     ctx->snapset_obc->obs.oi.version = ctx->at_version;
3929
3930     map <string, bufferlist> attrs;
3931     bl.clear();
3932     ::encode(snapset, bl);
3933     attrs[SS_ATTR].claim(bl);
3934
3935     bl.clear();
3936     ::encode(ctx->snapset_obc->obs.oi, bl,
3937              get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3938     attrs[OI_ATTR].claim(bl);
3939     t->setattrs(snapoid, attrs);
3940   }
3941
3942   *ctxp = std::move(ctx);
3943   return 0;
3944 }
3945
3946 void PrimaryLogPG::kick_snap_trim()
3947 {
3948   assert(is_active());
3949   assert(is_primary());
3950   if (is_clean() && !snap_trimq.empty()) {
3951     dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3952     snap_trimmer_machine.process_event(KickTrim());
3953   }
3954 }
3955
3956 void PrimaryLogPG::snap_trimmer_scrub_complete()
3957 {
3958   if (is_primary() && is_active() && is_clean()) {
3959     assert(!snap_trimq.empty());
3960     snap_trimmer_machine.process_event(ScrubComplete());
3961   }
3962 }
3963
3964 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3965 {
3966   if (deleting || pg_has_reset_since(queued)) {
3967     return;
3968   }
3969
3970   assert(is_primary());
3971
3972   dout(10) << "snap_trimmer posting" << dendl;
3973   snap_trimmer_machine.process_event(DoSnapWork());
3974   dout(10) << "snap_trimmer complete" << dendl;
3975   return;
3976 }
3977
3978 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3979 {
3980   __u64 v2;
3981
3982   string v2s(xattr.c_str(), xattr.length());
3983   if (v2s.length())
3984     v2 = strtoull(v2s.c_str(), NULL, 10);
3985   else
3986     v2 = 0;
3987
3988   dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
3989
3990   switch (op) {
3991   case CEPH_OSD_CMPXATTR_OP_EQ:
3992     return (v1 == v2);
3993   case CEPH_OSD_CMPXATTR_OP_NE:
3994     return (v1 != v2);
3995   case CEPH_OSD_CMPXATTR_OP_GT:
3996     return (v1 > v2);
3997   case CEPH_OSD_CMPXATTR_OP_GTE:
3998     return (v1 >= v2);
3999   case CEPH_OSD_CMPXATTR_OP_LT:
4000     return (v1 < v2);
4001   case CEPH_OSD_CMPXATTR_OP_LTE:
4002     return (v1 <= v2);
4003   default:
4004     return -EINVAL;
4005   }
4006 }
4007
4008 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4009 {
4010   string v2s(xattr.c_str(), xattr.length());
4011
4012   dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4013
4014   switch (op) {
4015   case CEPH_OSD_CMPXATTR_OP_EQ:
4016     return (v1s.compare(v2s) == 0);
4017   case CEPH_OSD_CMPXATTR_OP_NE:
4018     return (v1s.compare(v2s) != 0);
4019   case CEPH_OSD_CMPXATTR_OP_GT:
4020     return (v1s.compare(v2s) > 0);
4021   case CEPH_OSD_CMPXATTR_OP_GTE:
4022     return (v1s.compare(v2s) >= 0);
4023   case CEPH_OSD_CMPXATTR_OP_LT:
4024     return (v1s.compare(v2s) < 0);
4025   case CEPH_OSD_CMPXATTR_OP_LTE:
4026     return (v1s.compare(v2s) <= 0);
4027   default:
4028     return -EINVAL;
4029   }
4030 }
4031
4032 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4033 {
4034   ceph_osd_op& op = osd_op.op;
4035   vector<OSDOp> write_ops(1);
4036   OSDOp& write_op = write_ops[0];
4037   uint64_t write_length = op.writesame.length;
4038   int result = 0;
4039
4040   if (!write_length)
4041     return 0;
4042
4043   if (!op.writesame.data_length || write_length % op.writesame.data_length)
4044     return -EINVAL;
4045
4046   if (op.writesame.data_length != osd_op.indata.length()) {
4047     derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4048     return -EINVAL;
4049   }
4050
4051   while (write_length) {
4052     write_op.indata.append(osd_op.indata);
4053     write_length -= op.writesame.data_length;
4054   }
4055
4056   write_op.op.op = CEPH_OSD_OP_WRITE;
4057   write_op.op.extent.offset = op.writesame.offset;
4058   write_op.op.extent.length = op.writesame.length;
4059   result = do_osd_ops(ctx, write_ops);
4060   if (result < 0)
4061     derr << "do_writesame do_osd_ops failed " << result << dendl;
4062
4063   return result;
4064 }
4065
4066 // ========================================================================
4067 // low level osd ops
4068
4069 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4070 {
4071   dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4072   bufferlist header, vals;
4073   int r = _get_tmap(ctx, &header, &vals);
4074   if (r < 0) {
4075     if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4076       r = 0;
4077     return r;
4078   }
4079
4080   vector<OSDOp> ops(3);
4081
4082   ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4083   ops[0].op.extent.offset = 0;
4084   ops[0].op.extent.length = 0;
4085
4086   ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4087   ops[1].indata.claim(header);
4088
4089   ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4090   ops[2].indata.claim(vals);
4091
4092   return do_osd_ops(ctx, ops);
4093 }
4094
4095 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4096                                     bufferlist& bl)
4097 {
4098   // decode
4099   bufferlist header;
4100   map<string, bufferlist> m;
4101   if (bl.length()) {
4102     bufferlist::iterator p = bl.begin();
4103     ::decode(header, p);
4104     ::decode(m, p);
4105     assert(p.end());
4106   }
4107
4108   // do the update(s)
4109   while (!bp.end()) {
4110     __u8 op;
4111     string key;
4112     ::decode(op, bp);
4113
4114     switch (op) {
4115     case CEPH_OSD_TMAP_SET: // insert key
4116       {
4117         ::decode(key, bp);
4118         bufferlist data;
4119         ::decode(data, bp);
4120         m[key] = data;
4121       }
4122       break;
4123     case CEPH_OSD_TMAP_RM: // remove key
4124       ::decode(key, bp);
4125       if (!m.count(key)) {
4126         return -ENOENT;
4127       }
4128       m.erase(key);
4129       break;
4130     case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4131       ::decode(key, bp);
4132       m.erase(key);
4133       break;
4134     case CEPH_OSD_TMAP_HDR: // update header
4135       {
4136         ::decode(header, bp);
4137       }
4138       break;
4139     default:
4140       return -EINVAL;
4141     }
4142   }
4143
4144   // reencode
4145   bufferlist obl;
4146   ::encode(header, obl);
4147   ::encode(m, obl);
4148
4149   // write it out
4150   vector<OSDOp> nops(1);
4151   OSDOp& newop = nops[0];
4152   newop.op.op = CEPH_OSD_OP_WRITEFULL;
4153   newop.op.extent.offset = 0;
4154   newop.op.extent.length = obl.length();
4155   newop.indata = obl;
4156   do_osd_ops(ctx, nops);
4157   osd_op.outdata.claim(newop.outdata);
4158   return 0;
4159 }
4160
4161 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4162 {
4163   bufferlist::iterator orig_bp = bp;
4164   int result = 0;
4165   if (bp.end()) {
4166     dout(10) << "tmapup is a no-op" << dendl;
4167   } else {
4168     // read the whole object
4169     vector<OSDOp> nops(1);
4170     OSDOp& newop = nops[0];
4171     newop.op.op = CEPH_OSD_OP_READ;
4172     newop.op.extent.offset = 0;
4173     newop.op.extent.length = 0;
4174     result = do_osd_ops(ctx, nops);
4175
4176     dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4177
4178     dout(30) << " starting is \n";
4179     newop.outdata.hexdump(*_dout);
4180     *_dout << dendl;
4181
4182     bufferlist::iterator ip = newop.outdata.begin();
4183     bufferlist obl;
4184
4185     dout(30) << "the update command is: \n";
4186     osd_op.indata.hexdump(*_dout);
4187     *_dout << dendl;
4188
4189     // header
4190     bufferlist header;
4191     __u32 nkeys = 0;
4192     if (newop.outdata.length()) {
4193       ::decode(header, ip);
4194       ::decode(nkeys, ip);
4195     }
4196     dout(10) << "tmapup header " << header.length() << dendl;
4197
4198     if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4199       ++bp;
4200       ::decode(header, bp);
4201       dout(10) << "tmapup new header " << header.length() << dendl;
4202     }
4203
4204     ::encode(header, obl);
4205
4206     dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4207
4208     // update keys
4209     bufferlist newkeydata;
4210     string nextkey, last_in_key;
4211     bufferlist nextval;
4212     bool have_next = false;
4213     if (!ip.end()) {
4214       have_next = true;
4215       ::decode(nextkey, ip);
4216       ::decode(nextval, ip);
4217     }
4218     while (!bp.end() && !result) {
4219       __u8 op;
4220       string key;
4221       try {
4222         ::decode(op, bp);
4223         ::decode(key, bp);
4224       }
4225       catch (buffer::error& e) {
4226         return -EINVAL;
4227       }
4228       if (key < last_in_key) {
4229         dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4230                 << "', falling back to an inefficient (unsorted) update" << dendl;
4231         bp = orig_bp;
4232         return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4233       }
4234       last_in_key = key;
4235
4236       dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4237
4238       // skip existing intervening keys
4239       bool key_exists = false;
4240       while (have_next && !key_exists) {
4241         dout(20) << "  (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4242         if (nextkey > key)
4243           break;
4244         if (nextkey < key) {
4245           // copy untouched.
4246           ::encode(nextkey, newkeydata);
4247           ::encode(nextval, newkeydata);
4248           dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
4249         } else {
4250           // don't copy; discard old value.  and stop.
4251           dout(20) << "  drop " << nextkey << " " << nextval.length() << dendl;
4252           key_exists = true;
4253           nkeys--;
4254         }
4255         if (!ip.end()) {
4256           ::decode(nextkey, ip);
4257           ::decode(nextval, ip);
4258         } else {
4259           have_next = false;
4260         }
4261       }
4262
4263       if (op == CEPH_OSD_TMAP_SET) {
4264         bufferlist val;
4265         try {
4266           ::decode(val, bp);
4267         }
4268         catch (buffer::error& e) {
4269           return -EINVAL;
4270         }
4271         ::encode(key, newkeydata);
4272         ::encode(val, newkeydata);
4273         dout(20) << "   set " << key << " " << val.length() << dendl;
4274         nkeys++;
4275       } else if (op == CEPH_OSD_TMAP_CREATE) {
4276         if (key_exists) {
4277           return -EEXIST;
4278         }
4279         bufferlist val;
4280         try {
4281           ::decode(val, bp);
4282         }
4283         catch (buffer::error& e) {
4284           return -EINVAL;
4285         }
4286         ::encode(key, newkeydata);
4287         ::encode(val, newkeydata);
4288         dout(20) << "   create " << key << " " << val.length() << dendl;
4289         nkeys++;
4290       } else if (op == CEPH_OSD_TMAP_RM) {
4291         // do nothing.
4292         if (!key_exists) {
4293           return -ENOENT;
4294         }
4295       } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4296         // do nothing
4297       } else {
4298         dout(10) << "  invalid tmap op " << (int)op << dendl;
4299         return -EINVAL;
4300       }
4301     }
4302
4303     // copy remaining
4304     if (have_next) {
4305       ::encode(nextkey, newkeydata);
4306       ::encode(nextval, newkeydata);
4307       dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
4308     }
4309     if (!ip.end()) {
4310       bufferlist rest;
4311       rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4312       dout(20) << "  keep trailing " << rest.length()
4313                << " at " << newkeydata.length() << dendl;
4314       newkeydata.claim_append(rest);
4315     }
4316
4317     // encode final key count + key data
4318     dout(20) << "tmapup final nkeys " << nkeys << dendl;
4319     ::encode(nkeys, obl);
4320     obl.claim_append(newkeydata);
4321
4322     if (0) {
4323       dout(30) << " final is \n";
4324       obl.hexdump(*_dout);
4325       *_dout << dendl;
4326
4327       // sanity check
4328       bufferlist::iterator tp = obl.begin();
4329       bufferlist h;
4330       ::decode(h, tp);
4331       map<string,bufferlist> d;
4332       ::decode(d, tp);
4333       assert(tp.end());
4334       dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4335     }
4336
4337     // write it out
4338     if (!result) {
4339       dout(20) << "tmapput write " << obl.length() << dendl;
4340       newop.op.op = CEPH_OSD_OP_WRITEFULL;
4341       newop.op.extent.offset = 0;
4342       newop.op.extent.length = obl.length();
4343       newop.indata = obl;
4344       do_osd_ops(ctx, nops);
4345       osd_op.outdata.claim(newop.outdata);
4346     }
4347   }
4348   return result;
4349 }
4350
/**
 * Validate that the extent [offset, offset + length) fits within @p max.
 *
 * @param offset  start of the extent; must be strictly less than @p max
 * @param length  length of the extent
 * @param max     upper bound on offset + length
 * @return 0 if the extent fits, -EFBIG otherwise.
 */
static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
{
  // Compare against the remaining room instead of computing
  // offset + length, which could wrap around in 64 bits.  Since
  // offset < max here, max - offset cannot underflow, and it also
  // covers the plain "length > max" case.
  if (offset >= max ||
      length > max - offset)
    return -EFBIG;

  return 0;
}
4360
4361 struct FillInVerifyExtent : public Context {
4362   ceph_le64 *r;
4363   int32_t *rval;
4364   bufferlist *outdatap;
4365   boost::optional<uint32_t> maybe_crc;
4366   uint64_t size;
4367   OSDService *osd;
4368   hobject_t soid;
4369   __le32 flags;
4370   FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4371                      boost::optional<uint32_t> mc, uint64_t size,
4372                      OSDService *osd, hobject_t soid, __le32 flags) :
4373     r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4374     size(size), osd(osd), soid(soid), flags(flags) {}
4375   void finish(int len) override {
4376     *r = len;
4377     if (len < 0) {
4378       *rval = len;
4379       return;
4380     }
4381     *rval = 0;
4382
4383     // whole object?  can we verify the checksum?
4384     if (maybe_crc && *r == size) {
4385       uint32_t crc = outdatap->crc32c(-1);
4386       if (maybe_crc != crc) {
4387         osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4388                            << " != expected 0x" << *maybe_crc
4389                            << std::dec << " on " << soid;
4390         if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4391           *rval = -EIO;
4392           *r = 0;
4393         }
4394       }
4395     }
4396   }
4397 };
4398
4399 struct ToSparseReadResult : public Context {
4400   int* result;
4401   bufferlist* data_bl;
4402   uint64_t data_offset;
4403   ceph_le64* len;
4404   ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4405                      ceph_le64* len)
4406     : result(result), data_bl(bl), data_offset(offset),len(len) {}
4407   void finish(int r) override {
4408     if (r < 0) {
4409       *result = r;
4410       return;
4411     }
4412     *result = 0;
4413     *len = r;
4414     bufferlist outdata;
4415     map<uint64_t, uint64_t> extents = {{data_offset, r}};
4416     ::encode(extents, outdata);
4417     ::encode_destructively(*data_bl, outdata);
4418     data_bl->swap(outdata);
4419   }
4420 };
4421
/**
 * Join the keys of a string-keyed map into a comma-separated list, in
 * map iteration order.
 *
 * A separator goes between every pair of adjacent keys, including empty
 * ones, so the number of commas is always (number of keys - 1).
 */
template<typename V>
static string list_keys(const map<string, V>& m) {
  string s;
  bool first = true;
  for (const auto& kv : m) {
    // track position explicitly: testing s.empty() would drop the
    // separator after a leading empty key
    if (!first) {
      s.push_back(',');
    }
    first = false;
    s.append(kv.first);
  }
  return s;
}
4433
/**
 * Join the elements of a container of strings into a comma-separated
 * list, in iteration order.
 *
 * A separator goes between every pair of adjacent elements, including
 * empty ones, so the number of commas is always (number of elements - 1).
 */
template<typename T>
static string list_entries(const T& m) {
  string s;
  bool first = true;
  for (const auto& entry : m) {
    // track position explicitly: testing s.empty() would drop the
    // separator after leading empty elements
    if (!first) {
      s.push_back(',');
    }
    first = false;
    s.append(entry);
  }
  return s;
}
4445
4446 void PrimaryLogPG::maybe_create_new_object(
4447   OpContext *ctx,
4448   bool ignore_transaction)
4449 {
4450   ObjectState& obs = ctx->new_obs;
4451   if (!obs.exists) {
4452     ctx->delta_stats.num_objects++;
4453     obs.exists = true;
4454     assert(!obs.oi.is_whiteout());
4455     obs.oi.new_object();
4456     if (!ignore_transaction)
4457       ctx->op_t->create(obs.oi.soid);
4458   } else if (obs.oi.is_whiteout()) {
4459     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4460     ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4461     --ctx->delta_stats.num_whiteouts;
4462   }
4463 }
4464
4465 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4466   OSDOp& osd_op;
4467
4468   ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4469   }
4470
4471   int execute() override {
4472     return osd_op.rval;
4473   }
4474 };
4475
4476 struct C_ChecksumRead : public Context {
4477   PrimaryLogPG *primary_log_pg;
4478   OSDOp &osd_op;
4479   Checksummer::CSumType csum_type;
4480   bufferlist init_value_bl;
4481   ceph_le64 read_length;
4482   bufferlist read_bl;
4483   Context *fill_extent_ctx;
4484
4485   C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4486                  Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4487                  boost::optional<uint32_t> maybe_crc, uint64_t size,
4488                  OSDService *osd, hobject_t soid, __le32 flags)
4489     : primary_log_pg(primary_log_pg), osd_op(osd_op),
4490       csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4491       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4492                                              &read_bl, maybe_crc, size,
4493                                              osd, soid, flags)) {
4494   }
4495   ~C_ChecksumRead() override {
4496     delete fill_extent_ctx;
4497   }
4498
4499   void finish(int r) override {
4500     fill_extent_ctx->complete(r);
4501     fill_extent_ctx = nullptr;
4502
4503     if (osd_op.rval >= 0) {
4504       bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4505       osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4506                                                     &init_value_bl_it, read_bl);
4507     }
4508   }
4509 };
4510
4511 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4512                               bufferlist::iterator *bl_it)
4513 {
4514   dout(20) << __func__ << dendl;
4515
4516   auto& op = osd_op.op;
4517   if (op.checksum.chunk_size > 0) {
4518     if (op.checksum.length == 0) {
4519       dout(10) << __func__ << ": length required when chunk size provided"
4520                << dendl;
4521       return -EINVAL;
4522     }
4523     if (op.checksum.length % op.checksum.chunk_size != 0) {
4524       dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4525       return -EINVAL;
4526     }
4527   }
4528
4529   auto& oi = ctx->new_obs.oi;
4530   if (op.checksum.offset == 0 && op.checksum.length == 0) {
4531     // zeroed offset+length implies checksum whole object
4532     op.checksum.length = oi.size;
4533   } else if (op.checksum.offset + op.checksum.length > oi.size) {
4534     return -EOVERFLOW;
4535   }
4536
4537   Checksummer::CSumType csum_type;
4538   switch (op.checksum.type) {
4539   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4540     csum_type = Checksummer::CSUM_XXHASH32;
4541     break;
4542   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4543     csum_type = Checksummer::CSUM_XXHASH64;
4544     break;
4545   case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4546     csum_type = Checksummer::CSUM_CRC32C;
4547     break;
4548   default:
4549     dout(10) << __func__ << ": unknown crc type ("
4550              << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4551     return -EINVAL;
4552   }
4553
4554   size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4555   if (bl_it->get_remaining() < csum_init_value_size) {
4556     dout(10) << __func__ << ": init value not provided" << dendl;
4557     return -EINVAL;
4558   }
4559
4560   bufferlist init_value_bl;
4561   init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4562                           csum_init_value_size);
4563   bl_it->advance(csum_init_value_size);
4564
4565   if (pool.info.require_rollback() && op.checksum.length > 0) {
4566     // If there is a data digest and it is possible we are reading
4567     // entire object, pass the digest.
4568     boost::optional<uint32_t> maybe_crc;
4569     if (oi.is_data_digest() && op.checksum.offset == 0 &&
4570         op.checksum.length >= oi.size) {
4571       maybe_crc = oi.data_digest;
4572     }
4573
4574     // async read
4575     auto& soid = oi.soid;
4576     auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4577                                            std::move(init_value_bl), maybe_crc,
4578                                            oi.size, osd, soid, op.flags);
4579
4580     ctx->pending_async_reads.push_back({
4581       {op.checksum.offset, op.checksum.length, op.flags},
4582       {&checksum_ctx->read_bl, checksum_ctx}});
4583
4584     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4585     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4586       new ReadFinisher(osd_op));
4587     return -EINPROGRESS;
4588   }
4589
4590   // sync read
4591   std::vector<OSDOp> read_ops(1);
4592   auto& read_op = read_ops[0];
4593   if (op.checksum.length > 0) {
4594     read_op.op.op = CEPH_OSD_OP_READ;
4595     read_op.op.flags = op.flags;
4596     read_op.op.extent.offset = op.checksum.offset;
4597     read_op.op.extent.length = op.checksum.length;
4598     read_op.op.extent.truncate_size = 0;
4599     read_op.op.extent.truncate_seq = 0;
4600
4601     int r = do_osd_ops(ctx, read_ops);
4602     if (r < 0) {
4603       derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4604       return r;
4605     }
4606   }
4607
4608   bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4609   return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4610                          read_op.outdata);
4611 }
4612
4613 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4614                                   Checksummer::CSumType csum_type,
4615                                   bufferlist::iterator *init_value_bl_it,
4616                                   const bufferlist &read_bl) {
4617   dout(20) << __func__ << dendl;
4618
4619   auto& op = osd_op.op;
4620
4621   if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4622     derr << __func__ << ": bytes read " << read_bl.length() << " != "
4623          << op.checksum.length << dendl;
4624     return -EINVAL;
4625   }
4626
4627   size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4628                               op.checksum.chunk_size : read_bl.length());
4629   uint32_t csum_count = (csum_chunk_size > 0 ?
4630                            read_bl.length() / csum_chunk_size : 0);
4631
4632   bufferlist csum;
4633   bufferptr csum_data;
4634   if (csum_count > 0) {
4635     size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4636     csum_data = buffer::create(csum_value_size * csum_count);
4637     csum_data.zero();
4638     csum.append(csum_data);
4639
4640     switch (csum_type) {
4641     case Checksummer::CSUM_XXHASH32:
4642       {
4643         Checksummer::xxhash32::init_value_t init_value;
4644         ::decode(init_value, *init_value_bl_it);
4645         Checksummer::calculate<Checksummer::xxhash32>(
4646           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4647           &csum_data);
4648       }
4649       break;
4650     case Checksummer::CSUM_XXHASH64:
4651       {
4652         Checksummer::xxhash64::init_value_t init_value;
4653         ::decode(init_value, *init_value_bl_it);
4654         Checksummer::calculate<Checksummer::xxhash64>(
4655           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4656           &csum_data);
4657       }
4658       break;
4659     case Checksummer::CSUM_CRC32C:
4660       {
4661         Checksummer::crc32c::init_value_t init_value;
4662         ::decode(init_value, *init_value_bl_it);
4663         Checksummer::calculate<Checksummer::crc32c>(
4664           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4665           &csum_data);
4666       }
4667       break;
4668     default:
4669       break;
4670     }
4671   }
4672
4673   ::encode(csum_count, osd_op.outdata);
4674   osd_op.outdata.claim_append(csum);
4675   return 0;
4676 }
4677
4678 struct C_ExtentCmpRead : public Context {
4679   PrimaryLogPG *primary_log_pg;
4680   OSDOp &osd_op;
4681   ceph_le64 read_length;
4682   bufferlist read_bl;
4683   Context *fill_extent_ctx;
4684
4685   C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4686                   boost::optional<uint32_t> maybe_crc, uint64_t size,
4687                   OSDService *osd, hobject_t soid, __le32 flags)
4688     : primary_log_pg(primary_log_pg), osd_op(osd_op),
4689       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4690                                              &read_bl, maybe_crc, size,
4691                                              osd, soid, flags)) {
4692   }
4693   ~C_ExtentCmpRead() override {
4694     delete fill_extent_ctx;
4695   }
4696
4697   void finish(int r) override {
4698     if (r == -ENOENT) {
4699       osd_op.rval = 0;
4700       read_bl.clear();
4701       delete fill_extent_ctx;
4702     } else {
4703       fill_extent_ctx->complete(r);
4704     }
4705     fill_extent_ctx = nullptr;
4706
4707     if (osd_op.rval >= 0) {
4708       osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4709     }
4710   }
4711 };
4712
4713 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4714 {
4715   dout(20) << __func__ << dendl;
4716   ceph_osd_op& op = osd_op.op;
4717
4718   if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4719     dout(20) << __func__ << " object DNE" << dendl;
4720     return finish_extent_cmp(osd_op, {});
4721   } else if (pool.info.require_rollback()) {
4722     // If there is a data digest and it is possible we are reading
4723     // entire object, pass the digest.
4724     auto& oi = ctx->new_obs.oi;
4725     boost::optional<uint32_t> maybe_crc;
4726     if (oi.is_data_digest() && op.checksum.offset == 0 &&
4727         op.checksum.length >= oi.size) {
4728       maybe_crc = oi.data_digest;
4729     }
4730
4731     // async read
4732     auto& soid = oi.soid;
4733     auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4734                                               osd, soid, op.flags);
4735     ctx->pending_async_reads.push_back({
4736       {op.extent.offset, op.extent.length, op.flags},
4737       {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4738
4739     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4740
4741     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4742       new ReadFinisher(osd_op));
4743     return -EINPROGRESS;
4744   }
4745
4746   // sync read
4747   vector<OSDOp> read_ops(1);
4748   OSDOp& read_op = read_ops[0];
4749
4750   read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4751   read_op.op.extent.offset = op.extent.offset;
4752   read_op.op.extent.length = op.extent.length;
4753   read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4754   read_op.op.extent.truncate_size = op.extent.truncate_size;
4755
4756   int result = do_osd_ops(ctx, read_ops);
4757   if (result < 0) {
4758     derr << __func__ << " failed " << result << dendl;
4759     return result;
4760   }
4761   return finish_extent_cmp(osd_op, read_op.outdata);
4762 }
4763
4764 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4765 {
4766   for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4767     char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4768     if (osd_op.indata[idx] != read_byte) {
4769         return (-MAX_ERRNO - idx);
4770     }
4771   }
4772
4773   return 0;
4774 }
4775
4776 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4777   dout(20) << __func__ << dendl;
4778   auto& op = osd_op.op;
4779   auto& oi = ctx->new_obs.oi;
4780   auto& soid = oi.soid;
4781   __u32 seq = oi.truncate_seq;
4782   uint64_t size = oi.size;
4783   bool trimmed_read = false;
4784
4785   // are we beyond truncate_size?
4786   if ( (seq < op.extent.truncate_seq) &&
4787        (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4788     size = op.extent.truncate_size;
4789
4790   if (op.extent.length == 0) //length is zero mean read the whole object
4791     op.extent.length = size;
4792
4793   if (op.extent.offset >= size) {
4794     op.extent.length = 0;
4795     trimmed_read = true;
4796   } else if (op.extent.offset + op.extent.length > size) {
4797     op.extent.length = size - op.extent.offset;
4798     trimmed_read = true;
4799   }
4800
4801   // read into a buffer
4802   int result = 0;
4803   if (trimmed_read && op.extent.length == 0) {
4804     // read size was trimmed to zero and it is expected to do nothing
4805     // a read operation of 0 bytes does *not* do nothing, this is why
4806     // the trimmed_read boolean is needed
4807   } else if (pool.info.require_rollback()) {
4808     boost::optional<uint32_t> maybe_crc;
4809     // If there is a data digest and it is possible we are reading
4810     // entire object, pass the digest.  FillInVerifyExtent will
4811     // will check the oi.size again.
4812     if (oi.is_data_digest() && op.extent.offset == 0 &&
4813         op.extent.length >= oi.size)
4814       maybe_crc = oi.data_digest;
4815     ctx->pending_async_reads.push_back(
4816       make_pair(
4817         boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4818         make_pair(&osd_op.outdata,
4819                   new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4820                                          &osd_op.outdata, maybe_crc, oi.size,
4821                                          osd, soid, op.flags))));
4822     dout(10) << " async_read noted for " << soid << dendl;
4823
4824     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4825       new ReadFinisher(osd_op));
4826   } else {
4827     int r = pgbackend->objects_read_sync(
4828       soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4829     if (r == -EIO) {
4830       r = rep_repair_primary_object(soid, ctx->op);
4831     }
4832     if (r >= 0)
4833       op.extent.length = r;
4834     else {
4835       result = r;
4836       op.extent.length = 0;
4837     }
4838     dout(10) << " read got " << r << " / " << op.extent.length
4839              << " bytes from obj " << soid << dendl;
4840
4841     // whole object?  can we verify the checksum?
4842     if (op.extent.length == oi.size && oi.is_data_digest()) {
4843       uint32_t crc = osd_op.outdata.crc32c(-1);
4844       if (oi.data_digest != crc) {
4845         osd->clog->error() << info.pgid << std::hex
4846                            << " full-object read crc 0x" << crc
4847                            << " != expected 0x" << oi.data_digest
4848                            << std::dec << " on " << soid;
4849         // FIXME fall back to replica or something?
4850         result = -EIO;
4851       }
4852     }
4853   }
4854
4855   // XXX the op.extent.length is the requested length for async read
4856   // On error this length is changed to 0 after the error comes back.
4857   ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4858   ctx->delta_stats.num_rd++;
4859   return result;
4860 }
4861
4862 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4863   dout(20) << __func__ << dendl;
4864   auto& op = osd_op.op;
4865   auto& oi = ctx->new_obs.oi;
4866   auto& soid = oi.soid;
4867
4868   if (op.extent.truncate_seq) {
4869     dout(0) << "sparse_read does not support truncation sequence " << dendl;
4870     return -EINVAL;
4871   }
4872
4873   ++ctx->num_read;
4874   if (pool.info.ec_pool()) {
4875     // translate sparse read to a normal one if not supported
4876     uint64_t offset = op.extent.offset;
4877     uint64_t length = op.extent.length;
4878     if (offset > oi.size) {
4879       length = 0;
4880     } else if (offset + length > oi.size) {
4881       length = oi.size - offset;
4882     }
4883
4884     if (length > 0) {
4885       ctx->pending_async_reads.push_back(
4886         make_pair(
4887           boost::make_tuple(offset, length, op.flags),
4888           make_pair(
4889             &osd_op.outdata,
4890             new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4891                                    &op.extent.length))));
4892       dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4893
4894       ctx->op_finishers[ctx->current_osd_subop_num].reset(
4895         new ReadFinisher(osd_op));
4896     } else {
4897       dout(10) << " sparse read ended up empty for " << soid << dendl;
4898       map<uint64_t, uint64_t> extents;
4899       ::encode(extents, osd_op.outdata);
4900     }
4901   } else {
4902     // read into a buffer
4903     map<uint64_t, uint64_t> m;
4904     uint32_t total_read = 0;
4905     int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4906                                               info.pgid.shard),
4907                                op.extent.offset, op.extent.length, m);
4908     if (r < 0)  {
4909       return r;
4910     }
4911
4912     map<uint64_t, uint64_t>::iterator miter;
4913     bufferlist data_bl;
4914     uint64_t last = op.extent.offset;
4915     for (miter = m.begin(); miter != m.end(); ++miter) {
4916       // verify hole?
4917       if (cct->_conf->osd_verify_sparse_read_holes &&
4918           last < miter->first) {
4919         bufferlist t;
4920         uint64_t len = miter->first - last;
4921         r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4922         if (r < 0) {
4923           osd->clog->error() << coll << " " << soid
4924                              << " sparse-read failed to read: "
4925                              << r;
4926         } else if (!t.is_zero()) {
4927           osd->clog->error() << coll << " " << soid
4928                              << " sparse-read found data in hole "
4929                              << last << "~" << len;
4930         }
4931       }
4932
4933       bufferlist tmpbl;
4934       r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4935                                        op.flags, &tmpbl);
4936       if (r == -EIO) {
4937         r = rep_repair_primary_object(soid, ctx->op);
4938       }
4939       if (r < 0) {
4940         return r;
4941       }
4942
4943       // this is usually happen when we get extent that exceeds the actual file
4944       // size
4945       if (r < (int)miter->second)
4946         miter->second = r;
4947       total_read += r;
4948       dout(10) << "sparse-read " << miter->first << "@" << miter->second
4949                << dendl;
4950       data_bl.claim_append(tmpbl);
4951       last = miter->first + r;
4952     }
4953
4954     if (r < 0) {
4955       return r;
4956     }
4957
4958     // verify trailing hole?
4959     if (cct->_conf->osd_verify_sparse_read_holes) {
4960       uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4961       if (last < end) {
4962         bufferlist t;
4963         uint64_t len = end - last;
4964         r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4965         if (r < 0) {
4966           osd->clog->error() << coll << " " << soid
4967                              << " sparse-read failed to read: " << r;
4968         } else if (!t.is_zero()) {
4969           osd->clog->error() << coll << " " << soid
4970                              << " sparse-read found data in hole "
4971                              << last << "~" << len;
4972         }
4973       }
4974     }
4975
4976     // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
4977     // Maybe at first, there is no much whole objects. With continued use, more
4978     // and more whole object exist. So from this point, for spare-read add
4979     // checksum make sense.
4980     if (total_read == oi.size && oi.is_data_digest()) {
4981       uint32_t crc = data_bl.crc32c(-1);
4982       if (oi.data_digest != crc) {
4983         osd->clog->error() << info.pgid << std::hex
4984           << " full-object read crc 0x" << crc
4985           << " != expected 0x" << oi.data_digest
4986           << std::dec << " on " << soid;
4987         // FIXME fall back to replica or something?
4988         return -EIO;
4989       }
4990     }
4991
4992     op.extent.length = total_read;
4993
4994     ::encode(m, osd_op.outdata); // re-encode since it might be modified
4995     ::encode_destructively(data_bl, osd_op.outdata);
4996
4997     dout(10) << " sparse_read got " << total_read << " bytes from object "
4998              << soid << dendl;
4999   }
5000
5001   ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5002   ctx->delta_stats.num_rd++;
5003   return 0;
5004 }
5005
5006 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5007 {
5008   int result = 0;
5009   SnapSetContext *ssc = ctx->obc->ssc;
5010   ObjectState& obs = ctx->new_obs;
5011   object_info_t& oi = obs.oi;
5012   const hobject_t& soid = oi.soid;
5013
5014   PGTransaction* t = ctx->op_t.get();
5015
5016   dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5017
5018   ctx->current_osd_subop_num = 0;
5019   for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
5020     OSDOp& osd_op = *p;
5021     ceph_osd_op& op = osd_op.op;
5022
5023     OpFinisher* op_finisher = nullptr;
5024     {
5025       auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5026       if (op_finisher_it != ctx->op_finishers.end()) {
5027         op_finisher = op_finisher_it->second.get();
5028       }
5029     }
5030
5031     // TODO: check endianness (__le32 vs uint32_t, etc.)
5032     // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5033     // but the code in this function seems to treat them as native-endian.  What should the
5034     // tracepoints do?
5035     tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5036
5037     dout(10) << "do_osd_op  " << osd_op << dendl;
5038
5039     bufferlist::iterator bp = osd_op.indata.begin();
5040
5041     // user-visible modifcation?
5042     switch (op.op) {
5043       // non user-visible modifications
5044     case CEPH_OSD_OP_WATCH:
5045     case CEPH_OSD_OP_CACHE_EVICT:
5046     case CEPH_OSD_OP_CACHE_FLUSH:
5047     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5048     case CEPH_OSD_OP_UNDIRTY:
5049     case CEPH_OSD_OP_COPY_FROM:  // we handle user_version update explicitly
5050     case CEPH_OSD_OP_CACHE_PIN:
5051     case CEPH_OSD_OP_CACHE_UNPIN:
5052     case CEPH_OSD_OP_SET_REDIRECT:
5053       break;
5054     default:
5055       if (op.op & CEPH_OSD_OP_MODE_WR)
5056         ctx->user_modify = true;
5057     }
5058
5059     // munge -1 truncate to 0 truncate
5060     if (ceph_osd_op_uses_extent(op.op) &&
5061         op.extent.truncate_seq == 1 &&
5062         op.extent.truncate_size == (-1ULL)) {
5063       op.extent.truncate_size = 0;
5064       op.extent.truncate_seq = 0;
5065     }
5066
5067     // munge ZERO -> TRUNCATE?  (don't munge to DELETE or we risk hosing attributes)
5068     if (op.op == CEPH_OSD_OP_ZERO &&
5069         obs.exists &&
5070         op.extent.offset < cct->_conf->osd_max_object_size &&
5071         op.extent.length >= 1 &&
5072         op.extent.length <= cct->_conf->osd_max_object_size &&
5073         op.extent.offset + op.extent.length >= oi.size) {
5074       if (op.extent.offset >= oi.size) {
5075         // no-op
5076         goto fail;
5077       }
5078       dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5079                << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5080       op.op = CEPH_OSD_OP_TRUNCATE;
5081     }
5082
5083     switch (op.op) {
5084
5085       // --- READS ---
5086
5087     case CEPH_OSD_OP_CMPEXT:
5088       ++ctx->num_read;
5089       tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5090                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5091                  op.extent.length, op.extent.truncate_size,
5092                  op.extent.truncate_seq);
5093
5094       if (op_finisher == nullptr) {
5095         result = do_extent_cmp(ctx, osd_op);
5096       } else {
5097         result = op_finisher->execute();
5098       }
5099       break;
5100
5101     case CEPH_OSD_OP_SYNC_READ:
5102       if (pool.info.require_rollback()) {
5103         result = -EOPNOTSUPP;
5104         break;
5105       }
5106       // fall through
5107     case CEPH_OSD_OP_READ:
5108       ++ctx->num_read;
5109       tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5110                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5111                  op.extent.length, op.extent.truncate_size,
5112                  op.extent.truncate_seq);
5113       if (op_finisher == nullptr) {
5114         if (!ctx->data_off) {
5115           ctx->data_off = op.extent.offset;
5116         }
5117         result = do_read(ctx, osd_op);
5118       } else {
5119         result = op_finisher->execute();
5120       }
5121       break;
5122
5123     case CEPH_OSD_OP_CHECKSUM:
5124       ++ctx->num_read;
5125       {
5126         tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5127                    soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5128                    op.checksum.offset, op.checksum.length,
5129                    op.checksum.chunk_size);
5130
5131         if (op_finisher == nullptr) {
5132           result = do_checksum(ctx, osd_op, &bp);
5133         } else {
5134           result = op_finisher->execute();
5135         }
5136       }
5137       break;
5138
5139     /* map extents */
5140     case CEPH_OSD_OP_MAPEXT:
5141       tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5142       if (pool.info.require_rollback()) {
5143         result = -EOPNOTSUPP;
5144         break;
5145       }
5146       ++ctx->num_read;
5147       {
5148         // read into a buffer
5149         bufferlist bl;
5150         int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5151                                                   info.pgid.shard),
5152                                    op.extent.offset, op.extent.length, bl);
5153         osd_op.outdata.claim(bl);
5154         if (r < 0)
5155           result = r;
5156         else
5157           ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5158         ctx->delta_stats.num_rd++;
5159         dout(10) << " map_extents done on object " << soid << dendl;
5160       }
5161       break;
5162
5163     /* map extents */
5164     case CEPH_OSD_OP_SPARSE_READ:
5165       tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5166                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5167                  op.extent.length, op.extent.truncate_size,
5168                  op.extent.truncate_seq);
5169       if (op_finisher == nullptr) {
5170         result = do_sparse_read(ctx, osd_op);
5171       } else {
5172         result = op_finisher->execute();
5173       }
5174       break;
5175
5176     case CEPH_OSD_OP_CALL:
5177       {
5178         string cname, mname;
5179         bufferlist indata;
5180         try {
5181           bp.copy(op.cls.class_len, cname);
5182           bp.copy(op.cls.method_len, mname);
5183           bp.copy(op.cls.indata_len, indata);
5184         } catch (buffer::error& e) {
5185           dout(10) << "call unable to decode class + method + indata" << dendl;
5186           dout(30) << "in dump: ";
5187           osd_op.indata.hexdump(*_dout);
5188           *_dout << dendl;
5189           result = -EINVAL;
5190           tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5191           break;
5192         }
5193         tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5194
5195         ClassHandler::ClassData *cls;
5196         result = osd->class_handler->open_class(cname, &cls);
5197         assert(result == 0);   // init_op_flags() already verified this works.
5198
5199         ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5200         if (!method) {
5201           dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5202           result = -EOPNOTSUPP;
5203           break;
5204         }
5205
5206         int flags = method->get_flags();
5207         if (flags & CLS_METHOD_WR)
5208           ctx->user_modify = true;
5209
5210         bufferlist outdata;
5211         dout(10) << "call method " << cname << "." << mname << dendl;
5212         int prev_rd = ctx->num_read;
5213         int prev_wr = ctx->num_write;
5214         result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5215
5216         if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5217           derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5218           result = -EIO;
5219           break;
5220         }
5221         if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5222           derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5223           result = -EIO;
5224           break;
5225         }
5226
5227         dout(10) << "method called response length=" << outdata.length() << dendl;
5228         op.extent.length = outdata.length();
5229         osd_op.outdata.claim_append(outdata);
5230         dout(30) << "out dump: ";
5231         osd_op.outdata.hexdump(*_dout);
5232         *_dout << dendl;
5233       }
5234       break;
5235
5236     case CEPH_OSD_OP_STAT:
5237       // note: stat does not require RD
5238       {
5239         tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5240
5241         if (obs.exists && !oi.is_whiteout()) {
5242           ::encode(oi.size, osd_op.outdata);
5243           ::encode(oi.mtime, osd_op.outdata);
5244           dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5245         } else {
5246           result = -ENOENT;
5247           dout(10) << "stat oi object does not exist" << dendl;
5248         }
5249
5250         ctx->delta_stats.num_rd++;
5251       }
5252       break;
5253
5254     case CEPH_OSD_OP_ISDIRTY:
5255       ++ctx->num_read;
5256       {
5257         tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5258         bool is_dirty = obs.oi.is_dirty();
5259         ::encode(is_dirty, osd_op.outdata);
5260         ctx->delta_stats.num_rd++;
5261         result = 0;
5262       }
5263       break;
5264
5265     case CEPH_OSD_OP_UNDIRTY:
5266       ++ctx->num_write;
5267       {
5268         tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5269         if (oi.is_dirty()) {
5270           ctx->undirty = true;  // see make_writeable()
5271           ctx->modify = true;
5272           ctx->delta_stats.num_wr++;
5273         }
5274         result = 0;
5275       }
5276       break;
5277
5278     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5279       ++ctx->num_write;
5280       {
5281         tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5282         if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5283           dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5284           result = -EINVAL;
5285           break;
5286         }
5287         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5288           result = -EINVAL;
5289           break;
5290         }
5291         if (!obs.exists) {
5292           result = 0;
5293           break;
5294         }
5295         if (oi.is_cache_pinned()) {
5296           dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5297           result = -EPERM;
5298           break;
5299         }
5300         if (oi.is_dirty()) {
5301           result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5302           if (result == -EINPROGRESS)
5303             result = -EAGAIN;
5304         } else {
5305           result = 0;
5306         }
5307       }
5308       break;
5309
5310     case CEPH_OSD_OP_CACHE_FLUSH:
5311       ++ctx->num_write;
5312       {
5313         tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5314         if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5315           dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5316           result = -EINVAL;
5317           break;
5318         }
5319         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5320           result = -EINVAL;
5321           break;
5322         }
5323         if (!obs.exists) {
5324           result = 0;
5325           break;
5326         }
5327         if (oi.is_cache_pinned()) {
5328           dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5329           result = -EPERM;
5330           break;
5331         }
5332         hobject_t missing;
5333         if (oi.is_dirty()) {
5334           result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5335           if (result == -EINPROGRESS)
5336             result = -EAGAIN;
5337         } else {
5338           result = 0;
5339         }
5340         // Check special return value which has set missing_return
5341         if (result == -ENOENT) {
5342           dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5343           assert(!missing.is_min());
5344           wait_for_unreadable_object(missing, ctx->op);
5345           // Error code which is used elsewhere when wait_for_unreadable_object() is used
5346           result = -EAGAIN;
5347         }
5348       }
5349       break;
5350
5351     case CEPH_OSD_OP_CACHE_EVICT:
5352       ++ctx->num_write;
5353       {
5354         tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5355         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5356           result = -EINVAL;
5357           break;
5358         }
5359         if (!obs.exists) {
5360           result = 0;
5361           break;
5362         }
5363         if (oi.is_cache_pinned()) {
5364           dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5365           result = -EPERM;
5366           break;
5367         }
5368         if (oi.is_dirty()) {
5369           result = -EBUSY;
5370           break;
5371         }
5372         if (!oi.watchers.empty()) {
5373           result = -EBUSY;
5374           break;
5375         }
5376         if (soid.snap == CEPH_NOSNAP) {
5377           result = _verify_no_head_clones(soid, ssc->snapset);
5378           if (result < 0)
5379             break;
5380         }
5381         result = _delete_oid(ctx, true, false);
5382         if (result >= 0) {
5383           // mark that this is a cache eviction to avoid triggering normal
5384           // make_writeable() clone or snapdir object creation in finish_ctx()
5385           ctx->cache_evict = true;
5386         }
5387         osd->logger->inc(l_osd_tier_evict);
5388       }
5389       break;
5390
5391     case CEPH_OSD_OP_GETXATTR:
5392       ++ctx->num_read;
5393       {
5394         string aname;
5395         bp.copy(op.xattr.name_len, aname);
5396         tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5397         string name = "_" + aname;
5398         int r = getattr_maybe_cache(
5399           ctx->obc,
5400           name,
5401           &(osd_op.outdata));
5402         if (r >= 0) {
5403           op.xattr.value_len = osd_op.outdata.length();
5404           result = 0;
5405           ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5406         } else
5407           result = r;
5408
5409         ctx->delta_stats.num_rd++;
5410       }
5411       break;
5412
5413    case CEPH_OSD_OP_GETXATTRS:
5414       ++ctx->num_read;
5415       {
5416         tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5417         map<string, bufferlist> out;
5418         result = getattrs_maybe_cache(
5419           ctx->obc,
5420           &out,
5421           true);
5422
5423         bufferlist bl;
5424         ::encode(out, bl);
5425         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5426         ctx->delta_stats.num_rd++;
5427         osd_op.outdata.claim_append(bl);
5428       }
5429       break;
5430
5431     case CEPH_OSD_OP_CMPXATTR:
5432       ++ctx->num_read;
5433       {
5434         string aname;
5435         bp.copy(op.xattr.name_len, aname);
5436         tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5437         string name = "_" + aname;
5438         name[op.xattr.name_len + 1] = 0;
5439
5440         bufferlist xattr;
5441         result = getattr_maybe_cache(
5442           ctx->obc,
5443           name,
5444           &xattr);
5445         if (result < 0 && result != -EEXIST && result != -ENODATA)
5446           break;
5447
5448         ctx->delta_stats.num_rd++;
5449         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5450
5451         switch (op.xattr.cmp_mode) {
5452         case CEPH_OSD_CMPXATTR_MODE_STRING:
5453           {
5454             string val;
5455             bp.copy(op.xattr.value_len, val);
5456             val[op.xattr.value_len] = 0;
5457             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5458                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5459             result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5460           }
5461           break;
5462
5463         case CEPH_OSD_CMPXATTR_MODE_U64:
5464           {
5465             uint64_t u64val;
5466             try {
5467               ::decode(u64val, bp);
5468             }
5469             catch (buffer::error& e) {
5470               result = -EINVAL;
5471               goto fail;
5472             }
5473             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5474                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5475             result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5476           }
5477           break;
5478
5479         default:
5480           dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5481           result = -EINVAL;
5482         }
5483
5484         if (!result) {
5485           dout(10) << "comparison returned false" << dendl;
5486           result = -ECANCELED;
5487           break;
5488         }
5489         if (result < 0) {
5490           dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5491           break;
5492         }
5493
5494         dout(10) << "comparison returned true" << dendl;
5495       }
5496       break;
5497
5498     case CEPH_OSD_OP_ASSERT_VER:
5499       ++ctx->num_read;
5500       {
5501         uint64_t ver = op.assert_ver.ver;
5502         tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5503         if (!ver)
5504           result = -EINVAL;
5505         else if (ver < oi.user_version)
5506           result = -ERANGE;
5507         else if (ver > oi.user_version)
5508           result = -EOVERFLOW;
5509       }
5510       break;
5511
5512     case CEPH_OSD_OP_LIST_WATCHERS:
5513       ++ctx->num_read;
5514       {
5515         tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5516         obj_list_watch_response_t resp;
5517
5518         map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5519         for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5520                                        ++oi_iter) {
5521           dout(20) << "key cookie=" << oi_iter->first.first
5522                << " entity=" << oi_iter->first.second << " "
5523                << oi_iter->second << dendl;
5524           assert(oi_iter->first.first == oi_iter->second.cookie);
5525           assert(oi_iter->first.second.is_client());
5526
5527           watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5528                  oi_iter->second.timeout_seconds, oi_iter->second.addr);
5529           resp.entries.push_back(wi);
5530         }
5531
5532         resp.encode(osd_op.outdata, ctx->get_features());
5533         result = 0;
5534
5535         ctx->delta_stats.num_rd++;
5536         break;
5537       }
5538
5539     case CEPH_OSD_OP_LIST_SNAPS:
5540       ++ctx->num_read;
5541       {
5542         tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5543         obj_list_snap_response_t resp;
5544
5545         if (!ssc) {
5546           ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5547         }
5548         assert(ssc);
5549
5550         int clonecount = ssc->snapset.clones.size();
5551         if (ssc->snapset.head_exists)
5552           clonecount++;
5553         resp.clones.reserve(clonecount);
5554         for (auto clone_iter = ssc->snapset.clones.begin();
5555              clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5556           clone_info ci;
5557           ci.cloneid = *clone_iter;
5558
5559           hobject_t clone_oid = soid;
5560           clone_oid.snap = *clone_iter;
5561
5562           if (!ssc->snapset.is_legacy()) {
5563             auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5564             if (p == ssc->snapset.clone_snaps.end()) {
5565               osd->clog->error() << "osd." << osd->whoami
5566                                  << ": inconsistent clone_snaps found for oid "
5567                                  << soid << " clone " << *clone_iter
5568                                  << " snapset " << ssc->snapset;
5569               result = -EINVAL;
5570               break;
5571             }
5572             for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5573               ci.snaps.push_back(*q);
5574             }
5575           } else {
5576             /* No need to take a lock here.  We are only inspecting state cached
5577              * in the ObjectContext, so we aren't performing an actual read unless
5578              * the clone obc is not already loaded (in which case, it cannot have
5579              * an in progress write).  We also do not risk exposing uncommitted
5580              * state since we do have a read lock on the head object or snapdir,
5581              * which we would have to write lock in order to make user visible
5582              * modifications to the snapshot state (snap trim related mutations
5583              * are not user visible).
5584              */
5585             if (is_missing_object(clone_oid)) {
5586               dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5587               wait_for_unreadable_object(clone_oid, ctx->op);
5588               result = -EAGAIN;
5589               break;
5590             }
5591
5592             ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5593             if (!clone_obc) {
5594               if (maybe_handle_cache(
5595                     ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5596                 // promoting the clone
5597                 result = -EAGAIN;
5598               } else {
5599                 osd->clog->error() << "osd." << osd->whoami
5600                                    << ": missing clone " << clone_oid
5601                                    << " for oid "
5602                                    << soid;
5603                 // should not happen
5604                 result = -ENOENT;
5605               }
5606               break;
5607             }
5608             for (vector<snapid_t>::reverse_iterator p =
5609                    clone_obc->obs.oi.legacy_snaps.rbegin();
5610                  p != clone_obc->obs.oi.legacy_snaps.rend();
5611                  ++p) {
5612               ci.snaps.push_back(*p);
5613             }
5614           }
5615
5616           dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5617
5618           map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5619           coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5620           if (coi == ssc->snapset.clone_overlap.end()) {
5621             osd->clog->error() << "osd." << osd->whoami
5622                                << ": inconsistent clone_overlap found for oid "
5623                               << soid << " clone " << *clone_iter;
5624             result = -EINVAL;
5625             break;
5626           }
5627           const interval_set<uint64_t> &o = coi->second;
5628           ci.overlap.reserve(o.num_intervals());
5629           for (interval_set<uint64_t>::const_iterator r = o.begin();
5630                r != o.end(); ++r) {
5631             ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5632                                                          r.get_len()));
5633           }
5634
5635           map<snapid_t, uint64_t>::const_iterator si;
5636           si = ssc->snapset.clone_size.find(ci.cloneid);
5637           if (si == ssc->snapset.clone_size.end()) {
5638             osd->clog->error() << "osd." << osd->whoami
5639                                << ": inconsistent clone_size found for oid "
5640                                << soid << " clone " << *clone_iter;
5641             result = -EINVAL;
5642             break;
5643           }
5644           ci.size = si->second;
5645
5646           resp.clones.push_back(ci);
5647         }
5648         if (result < 0) {
5649           break;
5650         }
5651         if (ssc->snapset.head_exists &&
5652             !ctx->obc->obs.oi.is_whiteout()) {
5653           assert(obs.exists);
5654           clone_info ci;
5655           ci.cloneid = CEPH_NOSNAP;
5656
5657           //Size for HEAD is oi.size
5658           ci.size = oi.size;
5659
5660           resp.clones.push_back(ci);
5661         }
5662         resp.seq = ssc->snapset.seq;
5663
5664         resp.encode(osd_op.outdata);
5665         result = 0;
5666
5667         ctx->delta_stats.num_rd++;
5668         break;
5669       }
5670
5671    case CEPH_OSD_OP_NOTIFY:
5672       ++ctx->num_read;
5673       {
5674         uint32_t timeout;
5675         bufferlist bl;
5676
5677         try {
5678           uint32_t ver; // obsolete
5679           ::decode(ver, bp);
5680           ::decode(timeout, bp);
5681           ::decode(bl, bp);
5682         } catch (const buffer::error &e) {
5683           timeout = 0;
5684         }
5685         tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5686         if (!timeout)
5687           timeout = cct->_conf->osd_default_notify_timeout;
5688
5689         notify_info_t n;
5690         n.timeout = timeout;
5691         n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5692         n.cookie = op.watch.cookie;
5693         n.bl = bl;
5694         ctx->notifies.push_back(n);
5695
5696         // return our unique notify id to the client
5697         ::encode(n.notify_id, osd_op.outdata);
5698       }
5699       break;
5700
5701     case CEPH_OSD_OP_NOTIFY_ACK:
5702       ++ctx->num_read;
5703       {
5704         try {
5705           uint64_t notify_id = 0;
5706           uint64_t watch_cookie = 0;
5707           ::decode(notify_id, bp);
5708           ::decode(watch_cookie, bp);
5709           bufferlist reply_bl;
5710           if (!bp.end()) {
5711             ::decode(reply_bl, bp);
5712           }
5713           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5714           OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5715           ctx->notify_acks.push_back(ack);
5716         } catch (const buffer::error &e) {
5717           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5718           OpContext::NotifyAck ack(
5719             // op.watch.cookie is actually the notify_id for historical reasons
5720             op.watch.cookie
5721             );
5722           ctx->notify_acks.push_back(ack);
5723         }
5724       }
5725       break;
5726
5727     case CEPH_OSD_OP_SETALLOCHINT:
5728       ++ctx->num_write;
5729       {
5730         tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5731         maybe_create_new_object(ctx);
5732         oi.expected_object_size = op.alloc_hint.expected_object_size;
5733         oi.expected_write_size = op.alloc_hint.expected_write_size;
5734         oi.alloc_hint_flags = op.alloc_hint.flags;
5735         t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5736                           op.alloc_hint.expected_write_size,
5737                           op.alloc_hint.flags);
5738         ctx->delta_stats.num_wr++;
5739         result = 0;
5740       }
5741       break;
5742
5743
5744       // --- WRITES ---
5745
5746       // -- object data --
5747
5748     case CEPH_OSD_OP_WRITE:
5749       ++ctx->num_write;
5750       { // write
5751         __u32 seq = oi.truncate_seq;
5752         tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5753         if (op.extent.length != osd_op.indata.length()) {
5754           result = -EINVAL;
5755           break;
5756         }
5757
5758         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5759           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5760
5761         if (pool.info.requires_aligned_append() &&
5762             (op.extent.offset % pool.info.required_alignment() != 0)) {
5763           result = -EOPNOTSUPP;
5764           break;
5765         }
5766
5767         if (!obs.exists) {
5768           if (pool.info.requires_aligned_append() && op.extent.offset) {
5769             result = -EOPNOTSUPP;
5770             break;
5771           }
5772         } else if (op.extent.offset != oi.size &&
5773                    pool.info.requires_aligned_append()) {
5774           result = -EOPNOTSUPP;
5775           break;
5776         }
5777
5778         if (seq && (seq > op.extent.truncate_seq) &&
5779             (op.extent.offset + op.extent.length > oi.size)) {
5780           // old write, arrived after trimtrunc
5781           op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5782           dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5783                    << ", adjusting write length to " << op.extent.length << dendl;
5784           bufferlist t;
5785           t.substr_of(osd_op.indata, 0, op.extent.length);
5786           osd_op.indata.swap(t);
5787         }
5788         if (op.extent.truncate_seq > seq) {
5789           // write arrives before trimtrunc
5790           if (obs.exists && !oi.is_whiteout()) {
5791             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5792                      << ", truncating to " << op.extent.truncate_size << dendl;
5793             t->truncate(soid, op.extent.truncate_size);
5794             oi.truncate_seq = op.extent.truncate_seq;
5795             oi.truncate_size = op.extent.truncate_size;
5796             if (op.extent.truncate_size != oi.size) {
5797               ctx->delta_stats.num_bytes -= oi.size;
5798               ctx->delta_stats.num_bytes += op.extent.truncate_size;
5799               oi.size = op.extent.truncate_size;
5800             }
5801           } else {
5802             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5803                      << ", but object is new" << dendl;
5804             oi.truncate_seq = op.extent.truncate_seq;
5805             oi.truncate_size = op.extent.truncate_size;
5806           }
5807         }
5808         result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5809         if (result < 0)
5810           break;
5811
5812         maybe_create_new_object(ctx);
5813
5814         if (op.extent.length == 0) {
5815           if (op.extent.offset > oi.size) {
5816             t->truncate(
5817               soid, op.extent.offset);
5818           } else {
5819             t->nop(soid);
5820           }
5821         } else {
5822           t->write(
5823             soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5824         }
5825
5826         if (op.extent.offset == 0 && op.extent.length >= oi.size)
5827           obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5828         else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5829           obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5830         else
5831           obs.oi.clear_data_digest();
5832         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5833                                     op.extent.offset, op.extent.length);
5834
5835       }
5836       break;
5837
5838     case CEPH_OSD_OP_WRITEFULL:
5839       ++ctx->num_write;
5840       { // write full object
5841         tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5842
5843         if (op.extent.length != osd_op.indata.length()) {
5844           result = -EINVAL;
5845           break;
5846         }
5847         result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5848         if (result < 0)
5849           break;
5850
5851         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5852           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5853
5854         maybe_create_new_object(ctx);
5855         if (pool.info.require_rollback()) {
5856           t->truncate(soid, 0);
5857         } else if (obs.exists && op.extent.length < oi.size) {
5858           t->truncate(soid, op.extent.length);
5859         }
5860         if (op.extent.length) {
5861           t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5862         }
5863         obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5864
5865         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5866             0, op.extent.length, true);
5867       }
5868       break;
5869
5870     case CEPH_OSD_OP_WRITESAME:
5871       ++ctx->num_write;
5872       tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5873       result = do_writesame(ctx, osd_op);
5874       break;
5875
5876     case CEPH_OSD_OP_ROLLBACK :
5877       ++ctx->num_write;
5878       tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5879       result = _rollback_to(ctx, op);
5880       break;
5881
5882     case CEPH_OSD_OP_ZERO:
5883       tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5884       if (pool.info.requires_aligned_append()) {
5885         result = -EOPNOTSUPP;
5886         break;
5887       }
5888       ++ctx->num_write;
5889       { // zero
5890         result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5891         if (result < 0)
5892           break;
5893         assert(op.extent.length);
5894         if (obs.exists && !oi.is_whiteout()) {
5895           t->zero(soid, op.extent.offset, op.extent.length);
5896           interval_set<uint64_t> ch;
5897           ch.insert(op.extent.offset, op.extent.length);
5898           ctx->modified_ranges.union_of(ch);
5899           ctx->delta_stats.num_wr++;
5900           oi.clear_data_digest();
5901         } else {
5902           // no-op
5903         }
5904       }
5905       break;
5906     case CEPH_OSD_OP_CREATE:
5907       ++ctx->num_write;
5908       {
5909         tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5910         int flags = le32_to_cpu(op.flags);
5911         if (obs.exists && !oi.is_whiteout() &&
5912             (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5913           result = -EEXIST; /* this is an exclusive create */
5914         } else {
5915           if (osd_op.indata.length()) {
5916             bufferlist::iterator p = osd_op.indata.begin();
5917             string category;
5918             try {
5919               ::decode(category, p);
5920             }
5921             catch (buffer::error& e) {
5922               result = -EINVAL;
5923               goto fail;
5924             }
5925             // category is no longer implemented.
5926           }
5927           if (result >= 0) {
5928             maybe_create_new_object(ctx);
5929             t->nop(soid);
5930           }
5931         }
5932       }
5933       break;
5934
5935     case CEPH_OSD_OP_TRIMTRUNC:
5936       op.extent.offset = op.extent.truncate_size;
5937       // falling through
5938
5939     case CEPH_OSD_OP_TRUNCATE:
5940       tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5941       if (pool.info.requires_aligned_append()) {
5942         result = -EOPNOTSUPP;
5943         break;
5944       }
5945       ++ctx->num_write;
5946       {
5947         // truncate
5948         if (!obs.exists || oi.is_whiteout()) {
5949           dout(10) << " object dne, truncate is a no-op" << dendl;
5950           break;
5951         }
5952
5953         if (op.extent.offset > cct->_conf->osd_max_object_size) {
5954           result = -EFBIG;
5955           break;
5956         }
5957
5958         if (op.extent.truncate_seq) {
5959           assert(op.extent.offset == op.extent.truncate_size);
5960           if (op.extent.truncate_seq <= oi.truncate_seq) {
5961             dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5962                      << ", no-op" << dendl;
5963             break; // old
5964           }
5965           dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5966                    << ", truncating" << dendl;
5967           oi.truncate_seq = op.extent.truncate_seq;
5968           oi.truncate_size = op.extent.truncate_size;
5969         }
5970
5971         maybe_create_new_object(ctx);
5972         t->truncate(soid, op.extent.offset);
5973         if (oi.size > op.extent.offset) {
5974           interval_set<uint64_t> trim;
5975           trim.insert(op.extent.offset, oi.size-op.extent.offset);
5976           ctx->modified_ranges.union_of(trim);
5977         }
5978         if (op.extent.offset != oi.size) {
5979           ctx->delta_stats.num_bytes -= oi.size;
5980           ctx->delta_stats.num_bytes += op.extent.offset;
5981           oi.size = op.extent.offset;
5982         }
5983         ctx->delta_stats.num_wr++;
5984         // do not set exists, or we will break above DELETE -> TRUNCATE munging.
5985
5986         oi.clear_data_digest();
5987       }
5988       break;
5989
5990     case CEPH_OSD_OP_DELETE:
5991       ++ctx->num_write;
5992       tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
5993       {
5994         result = _delete_oid(ctx, false, ctx->ignore_cache);
5995       }
5996       break;
5997
5998     case CEPH_OSD_OP_WATCH:
5999       ++ctx->num_write;
6000       {
6001         tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6002                    op.watch.cookie, op.watch.op);
6003         if (!obs.exists) {
6004           result = -ENOENT;
6005           break;
6006         }
6007         uint64_t cookie = op.watch.cookie;
6008         entity_name_t entity = ctx->reqid.name;
6009         ObjectContextRef obc = ctx->obc;
6010
6011         dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6012                  << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6013                  << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6014         dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6015         dout(10) << "watch: peer_addr="
6016           << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6017
6018         uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6019         if (op.watch.timeout != 0) {
6020           timeout = op.watch.timeout;
6021         }
6022
6023         watch_info_t w(cookie, timeout,
6024           ctx->op->get_req()->get_connection()->get_peer_addr());
6025         if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6026             op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6027           if (oi.watchers.count(make_pair(cookie, entity))) {
6028             dout(10) << " found existing watch " << w << " by " << entity << dendl;
6029           } else {
6030             dout(10) << " registered new watch " << w << " by " << entity << dendl;
6031             oi.watchers[make_pair(cookie, entity)] = w;
6032             t->nop(soid);  // make sure update the object_info on disk!
6033           }
6034           bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6035           ctx->watch_connects.push_back(make_pair(w, will_ping));
6036         } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6037           if (!oi.watchers.count(make_pair(cookie, entity))) {
6038             result = -ENOTCONN;
6039             break;
6040           }
6041           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6042           ctx->watch_connects.push_back(make_pair(w, true));
6043         } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6044           /* Note: WATCH with PING doesn't cause may_write() to return true,
6045            * so if there is nothing else in the transaction, this is going
6046            * to run do_osd_op_effects, but not write out a log entry */
6047           if (!oi.watchers.count(make_pair(cookie, entity))) {
6048             result = -ENOTCONN;
6049             break;
6050           }
6051           map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6052             obc->watchers.find(make_pair(cookie, entity));
6053           if (p == obc->watchers.end() ||
6054               !p->second->is_connected()) {
6055             // client needs to reconnect
6056             result = -ETIMEDOUT;
6057             break;
6058           }
6059           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6060           p->second->got_ping(ceph_clock_now());
6061           result = 0;
6062         } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6063           map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6064             oi.watchers.find(make_pair(cookie, entity));
6065           if (oi_iter != oi.watchers.end()) {
6066             dout(10) << " removed watch " << oi_iter->second << " by "
6067                      << entity << dendl;
6068             oi.watchers.erase(oi_iter);
6069             t->nop(soid);  // update oi on disk
6070             ctx->watch_disconnects.push_back(
6071               watch_disconnect_t(cookie, entity, false));
6072           } else {
6073             dout(10) << " can't remove: no watch by " << entity << dendl;
6074           }
6075         }
6076       }
6077       break;
6078
6079     case CEPH_OSD_OP_CACHE_PIN:
6080       tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6081       if ((!pool.info.is_tier() ||
6082           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6083         result = -EINVAL;
6084         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6085         break;
6086       }
6087       ++ctx->num_write;
6088       {
6089         if (!obs.exists || oi.is_whiteout()) {
6090           result = -ENOENT;
6091           break;
6092         }
6093
6094         if (!oi.is_cache_pinned()) {
6095           oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6096           ctx->modify = true;
6097           ctx->delta_stats.num_objects_pinned++;
6098           ctx->delta_stats.num_wr++;
6099         }
6100         result = 0;
6101       }
6102       break;
6103
6104     case CEPH_OSD_OP_CACHE_UNPIN:
6105       tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6106       if ((!pool.info.is_tier() ||
6107           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6108         result = -EINVAL;
6109         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6110         break;
6111       }
6112       ++ctx->num_write;
6113       {
6114         if (!obs.exists || oi.is_whiteout()) {
6115           result = -ENOENT;
6116           break;
6117         }
6118
6119         if (oi.is_cache_pinned()) {
6120           oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6121           ctx->modify = true;
6122           ctx->delta_stats.num_objects_pinned--;
6123           ctx->delta_stats.num_wr++;
6124         }
6125         result = 0;
6126       }
6127       break;
6128
6129     case CEPH_OSD_OP_SET_REDIRECT:
6130       ++ctx->num_write;
6131       {
6132         if (pool.info.is_tier()) {
6133           result = -EINVAL;
6134           break;
6135         }
6136         if (!obs.exists) {
6137           result = -ENOENT;
6138           break;
6139         }
6140         if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6141           result = -EOPNOTSUPP;
6142           break;
6143         }
6144
6145         object_t target_name;
6146         object_locator_t target_oloc;
6147         snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6148         version_t target_version = op.copy_from.src_version;
6149         try {
6150           ::decode(target_name, bp);
6151           ::decode(target_oloc, bp);
6152         }
6153         catch (buffer::error& e) {
6154           result = -EINVAL;
6155           goto fail;
6156         }
6157         pg_t raw_pg;
6158         get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6159         hobject_t target(target_name, target_oloc.key, target_snapid,
6160                 raw_pg.ps(), raw_pg.pool(),
6161                 target_oloc.nspace);
6162         if (target == soid) {
6163           dout(20) << " set-redirect self is invalid" << dendl;
6164           result = -EINVAL;
6165           break;
6166         }
6167         oi.set_flag(object_info_t::FLAG_MANIFEST);
6168         oi.manifest.redirect_target = target;
6169         oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6170         t->truncate(soid, 0);
6171         if (oi.is_omap() && pool.info.supports_omap()) {
6172           t->omap_clear(soid);
6173           obs.oi.clear_omap_digest();
6174           obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6175         }
6176         ctx->delta_stats.num_bytes -= oi.size;
6177         oi.size = 0;
6178         oi.new_object();
6179         oi.user_version = target_version;
6180         ctx->user_at_version = target_version;
6181         /* rm_attrs */
6182         map<string,bufferlist> rmattrs;
6183         result = getattrs_maybe_cache(ctx->obc,
6184                     &rmattrs,
6185                     true);
6186         if (result < 0) {
6187           return result;
6188         }
6189         map<string, bufferlist>::iterator iter;
6190         for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6191           const string& name = iter->first;
6192           t->rmattr(soid, name);
6193         }
6194         dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6195       }
6196
6197       break;
6198
6199       // -- object attrs --
6200
6201     case CEPH_OSD_OP_SETXATTR:
6202       ++ctx->num_write;
6203       {
6204         if (cct->_conf->osd_max_attr_size > 0 &&
6205             op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6206           tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6207           result = -EFBIG;
6208           break;
6209         }
6210         unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6211                                     cct->_conf->osd_max_attr_name_len);
6212         if (op.xattr.name_len > max_name_len) {
6213           result = -ENAMETOOLONG;
6214           break;
6215         }
6216         maybe_create_new_object(ctx);
6217         string aname;
6218         bp.copy(op.xattr.name_len, aname);
6219         tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6220         string name = "_" + aname;
6221         bufferlist bl;
6222         bp.copy(op.xattr.value_len, bl);
6223         t->setattr(soid, name, bl);
6224         ctx->delta_stats.num_wr++;
6225       }
6226       break;
6227
6228     case CEPH_OSD_OP_RMXATTR:
6229       ++ctx->num_write;
6230       {
6231         string aname;
6232         bp.copy(op.xattr.name_len, aname);
6233         tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6234         if (!obs.exists || oi.is_whiteout()) {
6235           result = -ENOENT;
6236           break;
6237         }
6238         string name = "_" + aname;
6239         t->rmattr(soid, name);
6240         ctx->delta_stats.num_wr++;
6241       }
6242       break;
6243
6244
6245       // -- fancy writers --
6246     case CEPH_OSD_OP_APPEND:
6247       {
6248         tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6249         // just do it inline; this works because we are happy to execute
6250         // fancy op on replicas as well.
6251         vector<OSDOp> nops(1);
6252         OSDOp& newop = nops[0];
6253         newop.op.op = CEPH_OSD_OP_WRITE;
6254         newop.op.extent.offset = oi.size;
6255         newop.op.extent.length = op.extent.length;
6256         newop.op.extent.truncate_seq = oi.truncate_seq;
6257         newop.indata = osd_op.indata;
6258         result = do_osd_ops(ctx, nops);
6259         osd_op.outdata.claim(newop.outdata);
6260       }
6261       break;
6262
6263     case CEPH_OSD_OP_STARTSYNC:
6264       tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6265       t->nop(soid);
6266       break;
6267
6268
6269       // -- trivial map --
6270     case CEPH_OSD_OP_TMAPGET:
6271       tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6272       if (pool.info.require_rollback()) {
6273         result = -EOPNOTSUPP;
6274         break;
6275       }
6276       {
6277         vector<OSDOp> nops(1);
6278         OSDOp& newop = nops[0];
6279         newop.op.op = CEPH_OSD_OP_SYNC_READ;
6280         newop.op.extent.offset = 0;
6281         newop.op.extent.length = 0;
6282         do_osd_ops(ctx, nops);
6283         osd_op.outdata.claim(newop.outdata);
6284       }
6285       break;
6286
6287     case CEPH_OSD_OP_TMAPPUT:
6288       tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6289       if (pool.info.require_rollback()) {
6290         result = -EOPNOTSUPP;
6291         break;
6292       }
6293       {
6294         //_dout_lock.Lock();
6295         //osd_op.data.hexdump(*_dout);
6296         //_dout_lock.Unlock();
6297
6298         // verify sort order
6299         bool unsorted = false;
6300         if (true) {
6301           bufferlist header;
6302           ::decode(header, bp);
6303           uint32_t n;
6304           ::decode(n, bp);
6305           string last_key;
6306           while (n--) {
6307             string key;
6308             ::decode(key, bp);
6309             dout(10) << "tmapput key " << key << dendl;
6310             bufferlist val;
6311             ::decode(val, bp);
6312             if (key < last_key) {
6313               dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6314               unsorted = true;
6315               break;
6316             }
6317             last_key = key;
6318           }
6319         }
6320
6321         // write it
6322         vector<OSDOp> nops(1);
6323         OSDOp& newop = nops[0];
6324         newop.op.op = CEPH_OSD_OP_WRITEFULL;
6325         newop.op.extent.offset = 0;
6326         newop.op.extent.length = osd_op.indata.length();
6327         newop.indata = osd_op.indata;
6328
6329         if (unsorted) {
6330           bp = osd_op.indata.begin();
6331           bufferlist header;
6332           map<string, bufferlist> m;
6333           ::decode(header, bp);
6334           ::decode(m, bp);
6335           assert(bp.end());
6336           bufferlist newbl;
6337           ::encode(header, newbl);
6338           ::encode(m, newbl);
6339           newop.indata = newbl;
6340         }
6341         result = do_osd_ops(ctx, nops);
6342         assert(result == 0);
6343       }
6344       break;
6345
6346     case CEPH_OSD_OP_TMAPUP:
6347       tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6348       if (pool.info.require_rollback()) {
6349         result = -EOPNOTSUPP;
6350         break;
6351       }
6352       ++ctx->num_write;
6353       result = do_tmapup(ctx, bp, osd_op);
6354       break;
6355
6356     case CEPH_OSD_OP_TMAP2OMAP:
6357       ++ctx->num_write;
6358       tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6359       result = do_tmap2omap(ctx, op.tmap2omap.flags);
6360       break;
6361
6362       // OMAP Read ops
6363     case CEPH_OSD_OP_OMAPGETKEYS:
6364       ++ctx->num_read;
6365       {
6366         string start_after;
6367         uint64_t max_return;
6368         try {
6369           ::decode(start_after, bp);
6370           ::decode(max_return, bp);
6371         }
6372         catch (buffer::error& e) {
6373           result = -EINVAL;
6374           tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6375           goto fail;
6376         }
6377         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6378           max_return = cct->_conf->osd_max_omap_entries_per_request;
6379         }
6380         tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6381
6382         bufferlist bl;
6383         uint32_t num = 0;
6384         bool truncated = false;
6385         if (oi.is_omap()) {
6386           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6387             coll, ghobject_t(soid)
6388             );
6389           assert(iter);
6390           iter->upper_bound(start_after);
6391           for (num = 0; iter->valid(); ++num, iter->next(false)) {
6392             if (num >= max_return ||
6393                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6394               truncated = true;
6395               break;
6396             }
6397             ::encode(iter->key(), bl);
6398           }
6399         } // else return empty out_set
6400         ::encode(num, osd_op.outdata);
6401         osd_op.outdata.claim_append(bl);
6402         ::encode(truncated, osd_op.outdata);
6403         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6404         ctx->delta_stats.num_rd++;
6405       }
6406       break;
6407
6408     case CEPH_OSD_OP_OMAPGETVALS:
6409       ++ctx->num_read;
6410       {
6411         string start_after;
6412         uint64_t max_return;
6413         string filter_prefix;
6414         try {
6415           ::decode(start_after, bp);
6416           ::decode(max_return, bp);
6417           ::decode(filter_prefix, bp);
6418         }
6419         catch (buffer::error& e) {
6420           result = -EINVAL;
6421           tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6422           goto fail;
6423         }
6424         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6425           max_return = cct->_conf->osd_max_omap_entries_per_request;
6426         }
6427         tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6428
6429         uint32_t num = 0;
6430         bool truncated = false;
6431         bufferlist bl;
6432         if (oi.is_omap()) {
6433           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6434             coll, ghobject_t(soid)
6435             );
6436           if (!iter) {
6437             result = -ENOENT;
6438             goto fail;
6439           }
6440           iter->upper_bound(start_after);
6441           if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6442           for (num = 0;
6443                iter->valid() &&
6444                  iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6445                ++num, iter->next(false)) {
6446             dout(20) << "Found key " << iter->key() << dendl;
6447             if (num >= max_return ||
6448                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6449               truncated = true;
6450               break;
6451             }
6452             ::encode(iter->key(), bl);
6453             ::encode(iter->value(), bl);
6454           }
6455         } // else return empty out_set
6456         ::encode(num, osd_op.outdata);
6457         osd_op.outdata.claim_append(bl);
6458         ::encode(truncated, osd_op.outdata);
6459         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6460         ctx->delta_stats.num_rd++;
6461       }
6462       break;
6463
6464     case CEPH_OSD_OP_OMAPGETHEADER:
6465       tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6466       if (!oi.is_omap()) {
6467         // return empty header
6468         break;
6469       }
6470       ++ctx->num_read;
6471       {
6472         osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6473         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6474         ctx->delta_stats.num_rd++;
6475       }
6476       break;
6477
6478     case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6479       ++ctx->num_read;
6480       {
6481         set<string> keys_to_get;
6482         try {
6483           ::decode(keys_to_get, bp);
6484         }
6485         catch (buffer::error& e) {
6486           result = -EINVAL;
6487           tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6488           goto fail;
6489         }
6490         tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6491         map<string, bufferlist> out;
6492         if (oi.is_omap()) {
6493           osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6494         } // else return empty omap entries
6495         ::encode(out, osd_op.outdata);
6496         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6497         ctx->delta_stats.num_rd++;
6498       }
6499       break;
6500
6501     case CEPH_OSD_OP_OMAP_CMP:
6502       ++ctx->num_read;
6503       {
6504         if (!obs.exists || oi.is_whiteout()) {
6505           result = -ENOENT;
6506           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6507           break;
6508         }
6509         map<string, pair<bufferlist, int> > assertions;
6510         try {
6511           ::decode(assertions, bp);
6512         }
6513         catch (buffer::error& e) {
6514           result = -EINVAL;
6515           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6516           goto fail;
6517         }
6518         tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6519
6520         map<string, bufferlist> out;
6521
6522         if (oi.is_omap()) {
6523           set<string> to_get;
6524           for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6525                i != assertions.end();
6526                ++i)
6527             to_get.insert(i->first);
6528           int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6529                                               to_get, &out);
6530           if (r < 0) {
6531             result = r;
6532             break;
6533           }
6534         } // else leave out empty
6535
6536         //Should set num_rd_kb based on encode length of map
6537         ctx->delta_stats.num_rd++;
6538
6539         int r = 0;
6540         bufferlist empty;
6541         for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6542              i != assertions.end();
6543              ++i) {
6544           auto out_entry = out.find(i->first);
6545           bufferlist &bl = (out_entry != out.end()) ?
6546             out_entry->second : empty;
6547           switch (i->second.second) {
6548           case CEPH_OSD_CMPXATTR_OP_EQ:
6549             if (!(bl == i->second.first)) {
6550               r = -ECANCELED;
6551             }
6552             break;
6553           case CEPH_OSD_CMPXATTR_OP_LT:
6554             if (!(bl < i->second.first)) {
6555               r = -ECANCELED;
6556             }
6557             break;
6558           case CEPH_OSD_CMPXATTR_OP_GT:
6559             if (!(bl > i->second.first)) {
6560               r = -ECANCELED;
6561             }
6562             break;
6563           default:
6564             r = -EINVAL;
6565             break;
6566           }
6567           if (r < 0)
6568             break;
6569         }
6570         if (r < 0) {
6571           result = r;
6572         }
6573       }
6574       break;
6575
6576       // OMAP Write ops
6577     case CEPH_OSD_OP_OMAPSETVALS:
6578       if (!pool.info.supports_omap()) {
6579         result = -EOPNOTSUPP;
6580         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6581         break;
6582       }
6583       ++ctx->num_write;
6584       {
6585         maybe_create_new_object(ctx);
6586         bufferlist to_set_bl;
6587         try {
6588           decode_str_str_map_to_bl(bp, &to_set_bl);
6589         }
6590         catch (buffer::error& e) {
6591           result = -EINVAL;
6592           tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6593           goto fail;
6594         }
6595         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6596         if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6597           dout(20) << "setting vals: " << dendl;
6598           map<string,bufferlist> to_set;
6599           bufferlist::iterator pt = to_set_bl.begin();
6600           ::decode(to_set, pt);
6601           for (map<string, bufferlist>::iterator i = to_set.begin();
6602                i != to_set.end();
6603                ++i) {
6604             dout(20) << "\t" << i->first << dendl;
6605           }
6606         }
6607         t->omap_setkeys(soid, to_set_bl);
6608         ctx->delta_stats.num_wr++;
6609       }
6610       obs.oi.set_flag(object_info_t::FLAG_OMAP);
6611       obs.oi.clear_omap_digest();
6612       break;
6613
6614     case CEPH_OSD_OP_OMAPSETHEADER:
6615       tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6616       if (!pool.info.supports_omap()) {
6617         result = -EOPNOTSUPP;
6618         break;
6619       }
6620       ++ctx->num_write;
6621       {
6622         maybe_create_new_object(ctx);
6623         t->omap_setheader(soid, osd_op.indata);
6624         ctx->delta_stats.num_wr++;
6625       }
6626       obs.oi.set_flag(object_info_t::FLAG_OMAP);
6627       obs.oi.clear_omap_digest();
6628       break;
6629
6630     case CEPH_OSD_OP_OMAPCLEAR:
6631       tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6632       if (!pool.info.supports_omap()) {
6633         result = -EOPNOTSUPP;
6634         break;
6635       }
6636       ++ctx->num_write;
6637       {
6638         if (!obs.exists || oi.is_whiteout()) {
6639           result = -ENOENT;
6640           break;
6641         }
6642         if (oi.is_omap()) {
6643           t->omap_clear(soid);
6644           ctx->delta_stats.num_wr++;
6645           obs.oi.clear_omap_digest();
6646           obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6647         }
6648       }
6649       break;
6650
6651     case CEPH_OSD_OP_OMAPRMKEYS:
6652       if (!pool.info.supports_omap()) {
6653         result = -EOPNOTSUPP;
6654         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6655         break;
6656       }
6657       ++ctx->num_write;
6658       {
6659         if (!obs.exists || oi.is_whiteout()) {
6660           result = -ENOENT;
6661           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6662           break;
6663         }
6664         bufferlist to_rm_bl;
6665         try {
6666           decode_str_set_to_bl(bp, &to_rm_bl);
6667         }
6668         catch (buffer::error& e) {
6669           result = -EINVAL;
6670           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6671           goto fail;
6672         }
6673         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6674         t->omap_rmkeys(soid, to_rm_bl);
6675         ctx->delta_stats.num_wr++;
6676       }
6677       obs.oi.clear_omap_digest();
6678       break;
6679
6680     case CEPH_OSD_OP_COPY_GET:
6681       ++ctx->num_read;
6682       tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6683                  soid.snap.val);
6684       if (op_finisher == nullptr) {
6685         result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6686       } else {
6687         result = op_finisher->execute();
6688       }
6689       break;
6690
6691     case CEPH_OSD_OP_COPY_FROM:
6692       ++ctx->num_write;
6693       {
6694         object_t src_name;
6695         object_locator_t src_oloc;
6696         snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6697         version_t src_version = op.copy_from.src_version;
6698         try {
6699           ::decode(src_name, bp);
6700           ::decode(src_oloc, bp);
6701         }
6702         catch (buffer::error& e) {
6703           result = -EINVAL;
6704           tracepoint(osd,
6705                      do_osd_op_pre_copy_from,
6706                      soid.oid.name.c_str(),
6707                      soid.snap.val,
6708                      "???",
6709                      0,
6710                      "???",
6711                      "???",
6712                      0,
6713                      src_snapid,
6714                      src_version);
6715           goto fail;
6716         }
6717         tracepoint(osd,
6718                    do_osd_op_pre_copy_from,
6719                    soid.oid.name.c_str(),
6720                    soid.snap.val,
6721                    src_name.name.c_str(),
6722                    src_oloc.pool,
6723                    src_oloc.key.c_str(),
6724                    src_oloc.nspace.c_str(),
6725                    src_oloc.hash,
6726                    src_snapid,
6727                    src_version);
6728         if (op_finisher == nullptr) {
6729           // start
6730           pg_t raw_pg;
6731           get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6732           hobject_t src(src_name, src_oloc.key, src_snapid,
6733                         raw_pg.ps(), raw_pg.pool(),
6734                         src_oloc.nspace);
6735           if (src == soid) {
6736             dout(20) << " copy from self is invalid" << dendl;
6737             result = -EINVAL;
6738             break;
6739           }
6740           CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6741           ctx->op_finishers[ctx->current_osd_subop_num].reset(
6742             new CopyFromFinisher(cb));
6743           start_copy(cb, ctx->obc, src, src_oloc, src_version,
6744                      op.copy_from.flags,
6745                      false,
6746                      op.copy_from.src_fadvise_flags,
6747                      op.flags);
6748           result = -EINPROGRESS;
6749         } else {
6750           // finish
6751           result = op_finisher->execute();
6752           assert(result == 0);
6753
6754           // COPY_FROM cannot be executed multiple times -- it must restart
6755           ctx->op_finishers.erase(ctx->current_osd_subop_num);
6756         }
6757       }
6758       break;
6759
6760     default:
6761       tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6762       dout(1) << "unrecognized osd op " << op.op
6763               << " " << ceph_osd_op_name(op.op)
6764               << dendl;
6765       result = -EOPNOTSUPP;
6766     }
6767
6768   fail:
6769     osd_op.rval = result;
6770     tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6771     if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6772       result = 0;
6773
6774     if (result < 0)
6775       break;
6776   }
6777   return result;
6778 }
6779
6780 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6781 {
6782   if (ctx->new_obs.oi.size == 0) {
6783     dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6784     return -ENODATA;
6785   }
6786   vector<OSDOp> nops(1);
6787   OSDOp &newop = nops[0];
6788   newop.op.op = CEPH_OSD_OP_TMAPGET;
6789   do_osd_ops(ctx, nops);
6790   try {
6791     bufferlist::iterator i = newop.outdata.begin();
6792     ::decode(*header, i);
6793     (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6794   } catch (...) {
6795     dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6796              << dendl;
6797     return -EINVAL;
6798   }
6799   dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6800            << dendl;
6801   return 0;
6802 }
6803
6804 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6805                                         const SnapSet& ss)
6806 {
6807   // verify that all clones have been evicted
6808   dout(20) << __func__ << " verifying clones are absent "
6809            << ss << dendl;
6810   for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6811        p != ss.clones.end();
6812        ++p) {
6813     hobject_t clone_oid = soid;
6814     clone_oid.snap = *p;
6815     if (is_missing_object(clone_oid))
6816       return -EBUSY;
6817     ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6818     if (clone_obc && clone_obc->obs.exists) {
6819       dout(10) << __func__ << " cannot evict head before clone "
6820                << clone_oid << dendl;
6821       return -EBUSY;
6822     }
6823     if (copy_ops.count(clone_oid)) {
6824       dout(10) << __func__ << " cannot evict head, pending promote on clone "
6825                << clone_oid << dendl;
6826       return -EBUSY;
6827     }
6828   }
6829   return 0;
6830 }
6831
6832 inline int PrimaryLogPG::_delete_oid(
6833   OpContext *ctx,
6834   bool no_whiteout,     // no whiteouts, no matter what.
6835   bool try_no_whiteout) // try not to whiteout
6836 {
6837   SnapSet& snapset = ctx->new_snapset;
6838   ObjectState& obs = ctx->new_obs;
6839   object_info_t& oi = obs.oi;
6840   const hobject_t& soid = oi.soid;
6841   PGTransaction* t = ctx->op_t.get();
6842
6843   // cache: cache: set whiteout on delete?
6844   bool whiteout = false;
6845   if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6846       && !no_whiteout
6847       && !try_no_whiteout) {
6848     whiteout = true;
6849   }
6850   bool legacy;
6851   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6852     legacy = false;
6853     // in luminous or later, we can't delete the head if there are
6854     // clones. we trust the caller passing no_whiteout has already
6855     // verified they don't exist.
6856     if (!snapset.clones.empty() ||
6857         (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6858       if (no_whiteout) {
6859         dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6860                  << dendl;
6861       } else {
6862         dout(20) << __func__ << " has or will have clones; will whiteout"
6863                  << dendl;
6864         whiteout = true;
6865       }
6866     }
6867   } else {
6868     legacy = false;
6869   }
6870   dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6871            << " no_whiteout=" << (int)no_whiteout
6872            << " try_no_whiteout=" << (int)try_no_whiteout
6873            << dendl;
6874   if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6875     return -ENOENT;
6876
6877   t->remove(soid);
6878
6879   if (oi.size > 0) {
6880     interval_set<uint64_t> ch;
6881     ch.insert(0, oi.size);
6882     ctx->modified_ranges.union_of(ch);
6883   }
6884
6885   ctx->delta_stats.num_wr++;
6886   if (soid.is_snap()) {
6887     assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6888     ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6889   } else {
6890     ctx->delta_stats.num_bytes -= oi.size;
6891   }
6892   oi.size = 0;
6893   oi.new_object();
6894
6895   // disconnect all watchers
6896   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6897          oi.watchers.begin();
6898        p != oi.watchers.end();
6899        ++p) {
6900     dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6901     ctx->watch_disconnects.push_back(
6902       watch_disconnect_t(p->first.first, p->first.second, true));
6903   }
6904   oi.watchers.clear();
6905
6906   if (whiteout) {
6907     dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6908     oi.set_flag(object_info_t::FLAG_WHITEOUT);
6909     ctx->delta_stats.num_whiteouts++;
6910     t->create(soid);
6911     osd->logger->inc(l_osd_tier_whiteout);
6912     return 0;
6913   }
6914
6915   // delete the head
6916   ctx->delta_stats.num_objects--;
6917   if (soid.is_snap())
6918     ctx->delta_stats.num_object_clones--;
6919   if (oi.is_whiteout()) {
6920     dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6921     ctx->delta_stats.num_whiteouts--;
6922     oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6923   }
6924   if (oi.is_cache_pinned()) {
6925     ctx->delta_stats.num_objects_pinned--;
6926   }
6927   if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6928     snapset.head_exists = false;
6929   }
6930   obs.exists = false;
6931   return 0;
6932 }
6933
6934 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6935 {
6936   SnapSet& snapset = ctx->new_snapset;
6937   ObjectState& obs = ctx->new_obs;
6938   object_info_t& oi = obs.oi;
6939   const hobject_t& soid = oi.soid;
6940   PGTransaction* t = ctx->op_t.get();
6941   snapid_t snapid = (uint64_t)op.snap.snapid;
6942   hobject_t missing_oid;
6943
6944   dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6945
6946   ObjectContextRef rollback_to;
6947   int ret = find_object_context(
6948     hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6949               soid.get_namespace()),
6950     &rollback_to, false, false, &missing_oid);
6951   if (ret == -EAGAIN) {
6952     /* clone must be missing */
6953     assert(is_degraded_or_backfilling_object(missing_oid));
6954     dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
6955              << missing_oid << " (requested snapid: ) " << snapid << dendl;
6956     block_write_on_degraded_snap(missing_oid, ctx->op);
6957     return ret;
6958   }
6959   {
6960     ObjectContextRef promote_obc;
6961     cache_result_t tier_mode_result;
6962     if (obs.exists && obs.oi.has_manifest()) {
6963       tier_mode_result =
6964         maybe_handle_manifest_detail(
6965           ctx->op,
6966           true,
6967           rollback_to);
6968     } else {
6969       tier_mode_result =
6970         maybe_handle_cache_detail(
6971           ctx->op,
6972           true,
6973           rollback_to,
6974           ret,
6975           missing_oid,
6976           true,
6977           false,
6978           &promote_obc);
6979     }
6980     switch (tier_mode_result) {
6981     case cache_result_t::NOOP:
6982       break;
6983     case cache_result_t::BLOCKED_PROMOTE:
6984       assert(promote_obc);
6985       block_write_on_snap_rollback(soid, promote_obc, ctx->op);
6986       return -EAGAIN;
6987     case cache_result_t::BLOCKED_FULL:
6988       block_write_on_full_cache(soid, ctx->op);
6989       return -EAGAIN;
6990     default:
6991       assert(0 == "must promote was set, other values are not valid");
6992       return -EAGAIN;
6993     }
6994   }
6995
6996   if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
6997     // there's no snapshot here, or there's no object.
6998     // if there's no snapshot, we delete the object; otherwise, do nothing.
6999     dout(20) << "_rollback_to deleting head on " << soid.oid
7000              << " because got ENOENT|whiteout on find_object_context" << dendl;
7001     if (ctx->obc->obs.oi.watchers.size()) {
7002       // Cannot delete an object with watchers
7003       ret = -EBUSY;
7004     } else {
7005       _delete_oid(ctx, false, false);
7006       ret = 0;
7007     }
7008   } else if (ret) {
7009     // ummm....huh? It *can't* return anything else at time of writing.
7010     assert(0 == "unexpected error code in _rollback_to");
7011   } else { //we got our context, let's use it to do the rollback!
7012     hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7013     if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7014       dout(20) << "_rollback_to attempted to roll back to a degraded object "
7015                << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
7016       block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7017       ret = -EAGAIN;
7018     } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7019       // rolling back to the head; we just need to clone it.
7020       ctx->modify = true;
7021     } else {
7022       /* 1) Delete current head
7023        * 2) Clone correct snapshot into head
7024        * 3) Calculate clone_overlaps by following overlaps
7025        *    forward from rollback snapshot */
7026       dout(10) << "_rollback_to deleting " << soid.oid
7027                << " and rolling back to old snap" << dendl;
7028
7029       if (obs.exists) {
7030         t->remove(soid);
7031       }
7032       t->clone(soid, rollback_to_sobject);
7033       snapset.head_exists = true;
7034       t->add_obc(rollback_to);
7035
7036       map<snapid_t, interval_set<uint64_t> >::iterator iter =
7037         snapset.clone_overlap.lower_bound(snapid);
7038       interval_set<uint64_t> overlaps = iter->second;
7039       assert(iter != snapset.clone_overlap.end());
7040       for ( ;
7041             iter != snapset.clone_overlap.end();
7042             ++iter)
7043         overlaps.intersection_of(iter->second);
7044
7045       if (obs.oi.size > 0) {
7046         interval_set<uint64_t> modified;
7047         modified.insert(0, obs.oi.size);
7048         overlaps.intersection_of(modified);
7049         modified.subtract(overlaps);
7050         ctx->modified_ranges.union_of(modified);
7051       }
7052
7053       // Adjust the cached objectcontext
7054       maybe_create_new_object(ctx, true);
7055       ctx->delta_stats.num_bytes -= obs.oi.size;
7056       ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7057       obs.oi.size = rollback_to->obs.oi.size;
7058       if (rollback_to->obs.oi.is_data_digest())
7059         obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7060       else
7061         obs.oi.clear_data_digest();
7062       if (rollback_to->obs.oi.is_omap_digest())
7063         obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7064       else
7065         obs.oi.clear_omap_digest();
7066
7067       if (rollback_to->obs.oi.is_omap()) {
7068         dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7069         obs.oi.set_flag(object_info_t::FLAG_OMAP);
7070       } else {
7071         dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7072         obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7073       }
7074
7075       snapset.head_exists = true;
7076     }
7077   }
7078   return ret;
7079 }
7080
7081 void PrimaryLogPG::_make_clone(
7082   OpContext *ctx,
7083   PGTransaction* t,
7084   ObjectContextRef obc,
7085   const hobject_t& head, const hobject_t& coid,
7086   object_info_t *poi)
7087 {
7088   bufferlist bv;
7089   ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7090
7091   t->clone(coid, head);
7092   setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7093   rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7094 }
7095
/**
 * Prepare the head object for the write described by @p ctx.
 *
 * Must only be called on a head object (asserted: soid.snap == CEPH_NOSNAP).
 * In order, this function:
 *  - adjusts the DIRTY flag on ctx->new_obs and the dirty/omap counters in
 *    ctx->delta_stats,
 *  - adopts the snapset's seq/snaps when they are newer than the op's snapc,
 *  - clones the head (via _make_clone) when the head existed, is not a
 *    whiteout, this is not a cache eviction, and the newest snap in snapc is
 *    newer than new_snapset.seq; a CLONE log entry is appended and
 *    ctx->at_version is advanced,
 *  - trims the newest clone's overlap by ctx->modified_ranges and accounts
 *    the bytes that clone still references,
 *  - stores snapc's seq/snaps into ctx->new_snapset.
 */
void PrimaryLogPG::make_writeable(OpContext *ctx)
{
  const hobject_t& soid = ctx->obs->oi.soid;
  SnapContext& snapc = ctx->snapc;

  // clone?
  assert(soid.snap == CEPH_NOSNAP);
  dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
           << "  snapc=" << snapc << dendl;

  // Reconcile the DIRTY flag and the num_objects_dirty counter with the
  // state the object had before this op (ctx->obc->obs).
  bool was_dirty = ctx->obc->obs.oi.is_dirty();
  if (ctx->new_obs.exists) {
    // we will mark the object dirty
    if (ctx->undirty && was_dirty) {
      dout(20) << " clearing DIRTY flag" << dendl;
      assert(ctx->new_obs.oi.is_dirty());
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
      --ctx->delta_stats.num_objects_dirty;
      osd->logger->inc(l_osd_tier_clean);
    } else if (!was_dirty && !ctx->undirty) {
      dout(20) << " setting DIRTY flag" << dendl;
      ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
      ++ctx->delta_stats.num_objects_dirty;
      osd->logger->inc(l_osd_tier_dirty);
    }
  } else {
    if (was_dirty) {
      dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
      ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
      --ctx->delta_stats.num_objects_dirty;
    }
  }

  // The object gains omap: count it.
  if ((ctx->new_obs.exists &&
       ctx->new_obs.oi.is_omap()) &&
      (!ctx->obc->obs.exists ||
       !ctx->obc->obs.oi.is_omap())) {
    ++ctx->delta_stats.num_objects_omap;
  }
  // The object loses omap (or is deleted while having omap): uncount it.
  if ((!ctx->new_obs.exists ||
       !ctx->new_obs.oi.is_omap()) &&
      (ctx->obc->obs.exists &&
       ctx->obc->obs.oi.is_omap())) {
    --ctx->delta_stats.num_objects_omap;
  }

  // use newer snapc?
  if (ctx->new_snapset.seq > snapc.seq) {
    snapc.seq = ctx->new_snapset.seq;
    snapc.snaps = ctx->new_snapset.snaps;
    filter_snapc(snapc.snaps);
    dout(10) << " using newer snapc " << snapc << dendl;
  }

  if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
      snapc.snaps.size() &&                 // there are snaps
      !ctx->cache_evict &&
      snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
    // clone
    hobject_t coid = soid;
    coid.snap = snapc.seq;

    // Count the leading entries of snapc.snaps that are newer than the
    // snapset seq; snaps[0] was already checked by the condition above.
    unsigned l;
    for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;

    // Snaps the new clone will belong to.
    vector<snapid_t> snaps(l);
    for (unsigned i=0; i<l; i++)
      snaps[i] = snapc.snaps[i];

    // prepare clone
    object_info_t static_snap_oi(coid);
    object_info_t *snap_oi;
    if (is_primary()) {
      // On the primary, create an object context for the clone, share the
      // head's snapset context with it, and take a write lock on it.
      ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
      ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
      ctx->clone_obc->obs.oi = static_snap_oi;
      ctx->clone_obc->obs.exists = true;
      ctx->clone_obc->ssc = ctx->obc->ssc;
      ctx->clone_obc->ssc->ref++;
      if (pool.info.require_rollback())
        ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
      snap_oi = &ctx->clone_obc->obs.oi;
      bool got = ctx->lock_manager.get_write_greedy(
        coid,
        ctx->clone_obc,
        ctx->op);
      assert(got);
      dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
    } else {
      // Otherwise only the stack-local object_info is filled in.
      snap_oi = &static_snap_oi;
    }
    snap_oi->version = ctx->at_version;
    snap_oi->prior_version = ctx->obs->oi.version;
    snap_oi->copy_user_bits(ctx->obs->oi);

    // Legacy mode records the snaps in the clone's object_info; otherwise
    // they are recorded in new_snapset.clone_snaps further below.
    bool legacy = ctx->new_snapset.is_legacy() ||
      get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
    if (legacy) {
      snap_oi->legacy_snaps = snaps;
    }

    _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);

    // Account for the new clone object in the stats delta.
    ctx->delta_stats.num_objects++;
    if (snap_oi->is_dirty()) {
      ctx->delta_stats.num_objects_dirty++;
      osd->logger->inc(l_osd_tier_dirty);
    }
    if (snap_oi->is_omap())
      ctx->delta_stats.num_objects_omap++;
    if (snap_oi->is_cache_pinned())
      ctx->delta_stats.num_objects_pinned++;
    ctx->delta_stats.num_object_clones++;
    ctx->new_snapset.clones.push_back(coid.snap);
    ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
    if (!legacy) {
      ctx->new_snapset.clone_snaps[coid.snap] = snaps;
    }

    // clone_overlap should contain an entry for each clone
    // (an empty interval_set if there is no overlap)
    ctx->new_snapset.clone_overlap[coid.snap];
    // A fresh clone initially overlaps the whole (pre-write) head.
    if (ctx->obs->oi.size)
      ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);

    // log clone
    dout(10) << " cloning v " << ctx->obs->oi.version
             << " to " << coid << " v " << ctx->at_version
             << " snaps=" << snaps
             << " snapset=" << ctx->new_snapset << dendl;
    ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
                                      ctx->obs->oi.version,
                                      ctx->obs->oi.user_version,
                                      osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
    ::encode(snaps, ctx->log.back().snaps);

    // The clone consumed at_version; advance it for the next log entry.
    ctx->at_version.version++;
  }

  // update most recent clone_overlap and usage stats
  if (ctx->new_snapset.clones.size() > 0) {
    /* we need to check whether the most recent clone exists, if it's been evicted,
     * it's not included in the stats */
    hobject_t last_clone_oid = soid;
    last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
    if (is_present_clone(last_clone_oid)) {
      interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
      // Restrict modified_ranges to what the newest clone shared with head.
      ctx->modified_ranges.intersection_of(newest_overlap);
      // modified_ranges is still in use by the clone
      add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
      // Those ranges are no longer shared between head and the clone.
      newest_overlap.subtract(ctx->modified_ranges);
    }
  }

  // update snapset with latest snap context
  ctx->new_snapset.seq = snapc.seq;
  ctx->new_snapset.snaps = snapc.snaps;
  if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
    // pessimistic assumption that this is a net-new legacy SnapSet
    ctx->delta_stats.num_legacy_snapsets++;
    ctx->new_snapset.head_exists = ctx->new_obs.exists;
  } else if (ctx->new_snapset.is_legacy()) {
    ctx->new_snapset.head_exists = ctx->new_obs.exists;
  }
  dout(20) << "make_writeable " << soid
           << " done, snapset=" << ctx->new_snapset << dendl;
}
7263
7264
7265 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7266                                                interval_set<uint64_t>& modified, uint64_t offset,
7267                                                uint64_t length, bool write_full)
7268 {
7269   interval_set<uint64_t> ch;
7270   if (write_full) {
7271     if (oi.size)
7272       ch.insert(0, oi.size);
7273   } else if (length)
7274     ch.insert(offset, length);
7275   modified.union_of(ch);
7276   if (write_full || offset + length > oi.size) {
7277     uint64_t new_size = offset + length;
7278     delta_stats.num_bytes -= oi.size;
7279     delta_stats.num_bytes += new_size;
7280     oi.size = new_size;
7281   }
7282   delta_stats.num_wr++;
7283   delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7284 }
7285
7286 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7287 {
7288   for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7289     delta_stats.num_bytes += p.get_len();
7290   }
7291 }
7292
7293 void PrimaryLogPG::complete_disconnect_watches(
7294   ObjectContextRef obc,
7295   const list<watch_disconnect_t> &to_disconnect)
7296 {
7297   for (list<watch_disconnect_t>::const_iterator i =
7298          to_disconnect.begin();
7299        i != to_disconnect.end();
7300        ++i) {
7301     pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7302     auto watchers_entry = obc->watchers.find(watcher);
7303     if (watchers_entry != obc->watchers.end()) {
7304       WatchRef watch = watchers_entry->second;
7305       dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7306       obc->watchers.erase(watcher);
7307       watch->remove(i->send_disconnect);
7308     } else {
7309       dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7310                << watcher << dendl;
7311     }
7312   }
7313 }
7314
7315 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7316 {
7317   entity_name_t entity = ctx->reqid.name;
7318   dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7319
7320   // disconnects first
7321   complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7322
7323   assert(conn);
7324
7325   boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7326   if (!session.get())
7327     return;
7328   session->put();  // get_priv() takes a ref, and so does the intrusive_ptr
7329
7330   for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7331        i != ctx->watch_connects.end();
7332        ++i) {
7333     pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7334     dout(15) << "do_osd_op_effects applying watch connect on session "
7335              << session.get() << " watcher " << watcher << dendl;
7336     WatchRef watch;
7337     if (ctx->obc->watchers.count(watcher)) {
7338       dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7339                << dendl;
7340       watch = ctx->obc->watchers[watcher];
7341     } else {
7342       dout(15) << "do_osd_op_effects new watcher " << watcher
7343                << dendl;
7344       watch = Watch::makeWatchRef(
7345         this, osd, ctx->obc, i->first.timeout_seconds,
7346         i->first.cookie, entity, conn->get_peer_addr());
7347       ctx->obc->watchers.insert(
7348         make_pair(
7349           watcher,
7350           watch));
7351     }
7352     watch->connect(conn, i->second);
7353   }
7354
7355   for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7356        p != ctx->notifies.end();
7357        ++p) {
7358     dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7359     ConnectionRef conn(ctx->op->get_req()->get_connection());
7360     NotifyRef notif(
7361       Notify::makeNotifyRef(
7362         conn,
7363         ctx->reqid.name.num(),
7364         p->bl,
7365         p->timeout,
7366         p->cookie,
7367         p->notify_id,
7368         ctx->obc->obs.oi.user_version,
7369         osd));
7370     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7371            ctx->obc->watchers.begin();
7372          i != ctx->obc->watchers.end();
7373          ++i) {
7374       dout(10) << "starting notify on watch " << i->first << dendl;
7375       i->second->start_notify(notif);
7376     }
7377     notif->init();
7378   }
7379
7380   for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7381        p != ctx->notify_acks.end();
7382        ++p) {
7383     if (p->watch_cookie)
7384       dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7385     else
7386       dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7387     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7388            ctx->obc->watchers.begin();
7389          i != ctx->obc->watchers.end();
7390          ++i) {
7391       if (i->first.second != entity) continue;
7392       if (p->watch_cookie &&
7393           p->watch_cookie.get() != i->first.first) continue;
7394       dout(10) << "acking notify on watch " << i->first << dendl;
7395       i->second->notify_ack(p->notify_id, p->reply_bl);
7396     }
7397   }
7398 }
7399
7400 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7401 {
7402   ostringstream ss;
7403   ss << "temp_" << info.pgid << "_" << get_role()
7404      << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7405   hobject_t hoid = target.make_temp_hobject(ss.str());
7406   dout(20) << __func__ << " " << hoid << dendl;
7407   return hoid;
7408 }
7409
7410 hobject_t PrimaryLogPG::get_temp_recovery_object(
7411   const hobject_t& target,
7412   eversion_t version)
7413 {
7414   ostringstream ss;
7415   ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
7416      << "_" << version
7417      << "_" << info.history.same_interval_since
7418      << "_" << target.snap;
7419   // pgid + version + interval + snapid is unique, and short
7420   hobject_t hoid = target.make_temp_hobject(ss.str());
7421   dout(20) << __func__ << " " << hoid << dendl;
7422   return hoid;
7423 }
7424
7425 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7426 {
7427   assert(!ctx->ops->empty());
7428
7429   const hobject_t& soid = ctx->obs->oi.soid;
7430
7431   // valid snap context?
7432   if (!ctx->snapc.is_valid()) {
7433     dout(10) << " invalid snapc " << ctx->snapc << dendl;
7434     return -EINVAL;
7435   }
7436
7437   // prepare the actual mutation
7438   int result = do_osd_ops(ctx, *ctx->ops);
7439   if (result < 0) {
7440     if (ctx->op->may_write() &&
7441         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7442       // need to save the error code in the pg log, to detect dup ops,
7443       // but do nothing else
7444       ctx->update_log_only = true;
7445     }
7446     return result;
7447   }
7448
7449   // read-op?  write-op noop? done?
7450   if (ctx->op_t->empty() && !ctx->modify) {
7451     unstable_stats.add(ctx->delta_stats);
7452     if (ctx->op->may_write() &&
7453         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7454       ctx->update_log_only = true;
7455     }
7456     return result;
7457   }
7458
7459   // check for full
7460   if ((ctx->delta_stats.num_bytes > 0 ||
7461        ctx->delta_stats.num_objects > 0) &&  // FIXME: keys?
7462       (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7463        get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7464     const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7465     if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
7466         m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7467       dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7468                << dendl;
7469     } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7470       // they tried, they failed.
7471       dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7472       return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7473     } else {
7474       // drop request
7475       dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7476       return -EAGAIN;
7477     }
7478   }
7479
7480   // clone, if necessary
7481   if (soid.snap == CEPH_NOSNAP)
7482     make_writeable(ctx);
7483
7484   finish_ctx(ctx,
7485              ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7486              pg_log_entry_t::DELETE);
7487
7488   return result;
7489 }
7490
/**
 * Finalize the mutation described by @p ctx.
 *
 * In order, this function:
 *  - for a head object with @p maintain_ssc set, encodes the new snapset and
 *    either removes a snapdir object that is no longer needed or creates one
 *    to hold the snapset when the head is going away but clones remain,
 *  - assigns the user version for user-visible modifications,
 *  - stamps version/reqid/mtime on the new object info and queues the
 *    OI_ATTR (and, for a head, SS_ATTR) attributes,
 *  - appends the log entry for the op itself,
 *  - copies the new object state and snapset into the cached contexts.
 *
 * @param ctx           op context being finished
 * @param log_op_type   pg_log_entry_t op code to log for the object
 * @param maintain_ssc  when false, the snapset is not encoded here and a
 *                      deleted head resets the cached snapset context
 */
void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
{
  const hobject_t& soid = ctx->obs->oi.soid;
  dout(20) << __func__ << " " << soid << " " << ctx
           << " op " << pg_log_entry_t::get_op_name(log_op_type)
           << dendl;
  utime_t now = ceph_clock_now();

  // snapset
  bufferlist bss;

  if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
    ::encode(ctx->new_snapset, bss);
    // For a legacy snapset, head_exists must agree with the new object state.
    assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
           !ctx->new_snapset.is_legacy());

    if (ctx->new_obs.exists) {
      if (!ctx->obs->exists) {
        // The head is being (re)created: an existing snapdir object is
        // removed, with its own DELETE log entry.
        if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
          hobject_t snapoid = soid.get_snapdir();
          dout(10) << " removing unneeded snapdir " << snapoid << dendl;
          ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
              ctx->at_version,
              ctx->snapset_obc->obs.oi.version,
              0, osd_reqid_t(), ctx->mtime, 0));
          ctx->op_t->remove(snapoid);

          // The snapdir entry consumed at_version; advance it.
          ctx->at_version.version++;

          ctx->snapset_obc->obs.exists = false;
        }
      }
    } else if (!ctx->new_snapset.clones.empty() &&
               !ctx->cache_evict &&
               !ctx->new_snapset.head_exists &&
               (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
      // The head is gone but clones remain and no snapdir exists yet.
      // save snapset on _snap
      hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
                        info.pgid.pool(), soid.get_namespace());
      dout(10) << " final snapset " << ctx->new_snapset
               << " in " << snapoid << dendl;
      // This path is only expected before the luminous release requirement.
      assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
      ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
                                        ctx->at_version,
                                        eversion_t(),
                                        0, osd_reqid_t(), ctx->mtime, 0));

      if (!ctx->snapset_obc)
        ctx->snapset_obc = get_object_context(snapoid, true);
      // Lock the snapdir context with the same lock type the op holds.
      bool got = false;
      if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
        got = ctx->lock_manager.get_write_greedy(
          snapoid,
          ctx->snapset_obc,
          ctx->op);
      } else {
        assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
        got = ctx->lock_manager.get_lock_type(
          ObjectContext::RWState::RWEXCL,
          snapoid,
          ctx->snapset_obc,
          ctx->op);
      }
      assert(got);
      dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
      ctx->snapset_obc->obs.exists = true;
      ctx->snapset_obc->obs.oi.version = ctx->at_version;
      ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
      ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
      ctx->snapset_obc->obs.oi.local_mtime = now;

      // Create the snapdir object carrying its object info and the snapset.
      map<string, bufferlist> attrs;
      bufferlist bv(sizeof(ctx->new_obs.oi));
      ::encode(ctx->snapset_obc->obs.oi, bv,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      ctx->op_t->create(snapoid);
      attrs[OI_ATTR].claim(bv);
      attrs[SS_ATTR].claim(bss);
      setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
      // The snapdir entry consumed at_version; advance it.
      ctx->at_version.version++;
    }
  }

  // finish and log the op.
  if (ctx->user_modify) {
    // update the user_version for any modify ops, except for the watch op
    ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
    /* In order for new clients and old clients to interoperate properly
     * when exchanging versions, we need to lower bound the user_version
     * (which our new clients pay proper attention to)
     * by the at_version (which is all the old clients can ever see). */
    if (ctx->at_version.version > ctx->user_at_version)
      ctx->user_at_version = ctx->at_version.version;
    ctx->new_obs.oi.user_version = ctx->user_at_version;
  }
  ctx->bytes_written = ctx->op_t->get_bytes_written();

  if (ctx->new_obs.exists) {
    // on the head object
    ctx->new_obs.oi.version = ctx->at_version;
    ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
    ctx->new_obs.oi.last_reqid = ctx->reqid;
    // A default-constructed ctx->mtime leaves the object's mtime untouched.
    if (ctx->mtime != utime_t()) {
      ctx->new_obs.oi.mtime = ctx->mtime;
      dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
      ctx->new_obs.oi.local_mtime = now;
    } else {
      dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
    }

    map <string, bufferlist> attrs;
    bufferlist bv(sizeof(ctx->new_obs.oi));
    ::encode(ctx->new_obs.oi, bv,
             get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
    attrs[OI_ATTR].claim(bv);

    if (soid.snap == CEPH_NOSNAP) {
      dout(10) << " final snapset " << ctx->new_snapset
               << " in " << soid << dendl;
      attrs[SS_ATTR].claim(bss);
    } else {
      dout(10) << " no snapset (this is a clone)" << dendl;
    }
    ctx->op_t->setattrs(soid, attrs);
  } else {
    // The object no longer exists: reset its object info to a fresh one.
    ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
  }

  bool legacy_snapset = ctx->new_snapset.is_legacy() ||
    get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;

  // append to log
  ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
                                    ctx->obs->oi.version,
                                    ctx->user_at_version, ctx->reqid,
                                    ctx->mtime, 0));
  // For a clone, these entry types also carry the clone's snaps.
  if (soid.snap < CEPH_NOSNAP) {
    switch (log_op_type) {
    case pg_log_entry_t::MODIFY:
    case pg_log_entry_t::PROMOTE:
    case pg_log_entry_t::CLEAN:
      if (legacy_snapset) {
        dout(20) << __func__ << " encoding legacy_snaps "
                 << ctx->new_obs.oi.legacy_snaps
                 << dendl;
        ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
      } else {
        dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
                 << dendl;
        ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
      }
      break;
    default:
      break;
    }
  }

  // Hand any extra request ids over to the op's log entry.
  if (!ctx->extra_reqids.empty()) {
    dout(20) << __func__ << "  extra_reqids " << ctx->extra_reqids << dendl;
    ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
  }

  // apply new object state.
  ctx->obc->obs = ctx->new_obs;

  // A deleted head whose snapset is not maintained here (or a cache evict)
  // resets the cached snapset context; otherwise it takes the new snapset.
  if (soid.is_head() && !ctx->obc->obs.exists &&
      (!maintain_ssc || ctx->cache_evict)) {
    ctx->obc->ssc->exists = false;
    ctx->obc->ssc->snapset = SnapSet();
  } else {
    ctx->obc->ssc->exists = true;
    ctx->obc->ssc->snapset = ctx->new_snapset;
  }
}
7665
7666 void PrimaryLogPG::apply_stats(
7667   const hobject_t &soid,
7668   const object_stat_sum_t &delta_stats) {
7669
7670   info.stats.stats.add(delta_stats);
7671
7672   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7673        i != backfill_targets.end();
7674        ++i) {
7675     pg_shard_t bt = *i;
7676     pg_info_t& pinfo = peer_info[bt];
7677     if (soid <= pinfo.last_backfill)
7678       pinfo.stats.stats.add(delta_stats);
7679     else if (soid <= last_backfill_started)
7680       pending_backfill_updates[soid].stats.add(delta_stats);
7681   }
7682
7683   if (is_primary() && scrubber.active) {
7684     if (soid < scrubber.start) {
7685       dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7686                << "," << scrubber.end << ")" << dendl;
7687       scrub_cstat.add(delta_stats);
7688     } else {
7689       dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7690                << "," << scrubber.end << ")" << dendl;
7691     }
7692   }
7693 }
7694
7695 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7696 {
7697   const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7698   assert(ctx->async_reads_complete());
7699
7700   for (vector<OSDOp>::iterator p = ctx->ops->begin();
7701     p != ctx->ops->end() && result >= 0; ++p) {
7702     if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7703       result = p->rval;
7704       break;
7705     }
7706     ctx->bytes_read += p->outdata.length();
7707   }
7708   ctx->reply->claim_op_out_data(*ctx->ops);
7709   ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7710
7711   MOSDOpReply *reply = ctx->reply;
7712   ctx->reply = nullptr;
7713
7714   if (result >= 0) {
7715     if (!ctx->ignore_log_op_stats) {
7716       log_op_stats(ctx);
7717       publish_stats_to_osd();
7718     }
7719
7720     // on read, return the current object version
7721     if (ctx->obs) {
7722       reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7723     } else {
7724       reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7725     }
7726   } else if (result == -ENOENT) {
7727     // on ENOENT, set a floor for what the next user version will be.
7728     reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7729   }
7730
7731   reply->set_result(result);
7732   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7733   osd->send_message_osd_client(reply, m->get_connection());
7734   close_op_ctx(ctx);
7735 }
7736
7737 // ========================================================================
7738 // copyfrom
7739
7740 struct C_Copyfrom : public Context {
7741   PrimaryLogPGRef pg;
7742   hobject_t oid;
7743   epoch_t last_peering_reset;
7744   ceph_tid_t tid;
7745   PrimaryLogPG::CopyOpRef cop;
7746   C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7747              const PrimaryLogPG::CopyOpRef& c)
7748     : pg(p), oid(o), last_peering_reset(lpr),
7749       tid(0), cop(c)
7750   {}
7751   void finish(int r) override {
7752     if (r == -ECANCELED)
7753       return;
7754     pg->lock();
7755     if (last_peering_reset == pg->get_last_peering_reset()) {
7756       pg->process_copy_chunk(oid, tid, r);
7757     }
7758     pg->unlock();
7759   }
7760 };
7761
7762 struct C_CopyFrom_AsyncReadCb : public Context {
7763   OSDOp *osd_op;
7764   object_copy_data_t reply_obj;
7765   uint64_t features;
7766   size_t len;
7767   C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7768     osd_op(osd_op), features(features), len(0) {}
7769   void finish(int r) override {
7770     osd_op->rval = r;
7771     if (r < 0) {
7772       return;
7773     }
7774
7775     assert(len > 0);
7776     assert(len <= reply_obj.data.length());
7777     bufferlist bl;
7778     bl.substr_of(reply_obj.data, 0, len);
7779     reply_obj.data.swap(bl);
7780     ::encode(reply_obj, osd_op->outdata, features);
7781   }
7782 };
7783
7784 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7785                               OSDOp& osd_op, ObjectContextRef &obc)
7786 {
7787   object_info_t& oi = obc->obs.oi;
7788   hobject_t& soid = oi.soid;
7789   int result = 0;
7790   object_copy_cursor_t cursor;
7791   uint64_t out_max;
7792   try {
7793     ::decode(cursor, bp);
7794     ::decode(out_max, bp);
7795   }
7796   catch (buffer::error& e) {
7797     result = -EINVAL;
7798     return result;
7799   }
7800
7801   const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7802   uint64_t features = op->get_features();
7803
7804   bool async_read_started = false;
7805   object_copy_data_t _reply_obj;
7806   C_CopyFrom_AsyncReadCb *cb = NULL;
7807   if (pool.info.require_rollback()) {
7808     cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7809   }
7810   object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7811   // size, mtime
7812   reply_obj.size = oi.size;
7813   reply_obj.mtime = oi.mtime;
7814   assert(obc->ssc);
7815   if (soid.snap < CEPH_NOSNAP) {
7816     if (obc->ssc->snapset.is_legacy()) {
7817       reply_obj.snaps = oi.legacy_snaps;
7818     } else {
7819       auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7820       assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7821       reply_obj.snaps = p->second;
7822     }
7823   } else {
7824     reply_obj.snap_seq = obc->ssc->snapset.seq;
7825   }
7826   if (oi.is_data_digest()) {
7827     reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7828     reply_obj.data_digest = oi.data_digest;
7829   }
7830   if (oi.is_omap_digest()) {
7831     reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7832     reply_obj.omap_digest = oi.omap_digest;
7833   }
7834   reply_obj.truncate_seq = oi.truncate_seq;
7835   reply_obj.truncate_size = oi.truncate_size;
7836
7837   // attrs
7838   map<string,bufferlist>& out_attrs = reply_obj.attrs;
7839   if (!cursor.attr_complete) {
7840     result = getattrs_maybe_cache(
7841       ctx->obc,
7842       &out_attrs,
7843       true);
7844     if (result < 0) {
7845       if (cb) {
7846         delete cb;
7847       }
7848       return result;
7849     }
7850     cursor.attr_complete = true;
7851     dout(20) << " got attrs" << dendl;
7852   }
7853
7854   int64_t left = out_max - osd_op.outdata.length();
7855
7856   // data
7857   bufferlist& bl = reply_obj.data;
7858   if (left > 0 && !cursor.data_complete) {
7859     if (cursor.data_offset < oi.size) {
7860       uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7861       if (cb) {
7862         async_read_started = true;
7863         ctx->pending_async_reads.push_back(
7864           make_pair(
7865             boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7866             make_pair(&bl, cb)));
7867         cb->len = max_read;
7868
7869         ctx->op_finishers[ctx->current_osd_subop_num].reset(
7870           new ReadFinisher(osd_op));
7871         result = -EINPROGRESS;
7872
7873         dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7874       } else {
7875         result = pgbackend->objects_read_sync(
7876           oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7877         if (result < 0)
7878           return result;
7879       }
7880       left -= max_read;
7881       cursor.data_offset += max_read;
7882     }
7883     if (cursor.data_offset == oi.size) {
7884       cursor.data_complete = true;
7885       dout(20) << " got data" << dendl;
7886     }
7887     assert(cursor.data_offset <= oi.size);
7888   }
7889
7890   // omap
7891   uint32_t omap_keys = 0;
7892   if (!pool.info.supports_omap() || !oi.is_omap()) {
7893     cursor.omap_complete = true;
7894   } else {
7895     if (left > 0 && !cursor.omap_complete) {
7896       assert(cursor.data_complete);
7897       if (cursor.omap_offset.empty()) {
7898         osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7899                                     &reply_obj.omap_header);
7900       }
7901       bufferlist omap_data;
7902       ObjectMap::ObjectMapIterator iter =
7903         osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7904       assert(iter);
7905       iter->upper_bound(cursor.omap_offset);
7906       for (; iter->valid(); iter->next(false)) {
7907         ++omap_keys;
7908         ::encode(iter->key(), omap_data);
7909         ::encode(iter->value(), omap_data);
7910         left -= iter->key().length() + 4 + iter->value().length() + 4;
7911         if (left <= 0)
7912           break;
7913       }
7914       if (omap_keys) {
7915         ::encode(omap_keys, reply_obj.omap_data);
7916         reply_obj.omap_data.claim_append(omap_data);
7917       }
7918       if (iter->valid()) {
7919         cursor.omap_offset = iter->key();
7920       } else {
7921         cursor.omap_complete = true;
7922         dout(20) << " got omap" << dendl;
7923       }
7924     }
7925   }
7926
7927   if (cursor.is_complete()) {
7928     // include reqids only in the final step.  this is a bit fragile
7929     // but it works...
7930     pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7931     dout(20) << " got reqids" << dendl;
7932   }
7933
7934   dout(20) << " cursor.is_complete=" << cursor.is_complete()
7935            << " " << out_attrs.size() << " attrs"
7936            << " " << bl.length() << " bytes"
7937            << " " << reply_obj.omap_header.length() << " omap header bytes"
7938            << " " << reply_obj.omap_data.length() << " omap data bytes in "
7939            << omap_keys << " keys"
7940            << " " << reply_obj.reqids.size() << " reqids"
7941            << dendl;
7942   reply_obj.cursor = cursor;
7943   if (!async_read_started) {
7944     ::encode(reply_obj, osd_op.outdata, features);
7945   }
7946   if (cb && !async_read_started) {
7947     delete cb;
7948   }
7949
7950   if (result > 0) {
7951     result = 0;
7952   }
7953   return result;
7954 }
7955
7956 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7957                                           OSDOp& osd_op)
7958 {
7959   // NOTE: we take non-const ref here for claim_op_out_data below; we must
7960   // be careful not to modify anything else that will upset a racing
7961   // operator<<
7962   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7963   uint64_t features = m->get_features();
7964   object_copy_data_t reply_obj;
7965
7966   pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7967   dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7968   ::encode(reply_obj, osd_op.outdata, features);
7969   osd_op.rval = -ENOENT;
7970   MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
7971   reply->claim_op_out_data(m->ops);
7972   reply->set_result(-ENOENT);
7973   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7974   osd->send_message_osd_client(reply, m->get_connection());
7975 }
7976
7977 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
7978                               hobject_t src, object_locator_t oloc,
7979                               version_t version, unsigned flags,
7980                               bool mirror_snapset,
7981                               unsigned src_obj_fadvise_flags,
7982                               unsigned dest_obj_fadvise_flags)
7983 {
7984   const hobject_t& dest = obc->obs.oi.soid;
7985   dout(10) << __func__ << " " << dest
7986            << " from " << src << " " << oloc << " v" << version
7987            << " flags " << flags
7988            << (mirror_snapset ? " mirror_snapset" : "")
7989            << dendl;
7990
7991   assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
7992                              src.snap == CEPH_SNAPDIR));
7993
7994   // cancel a previous in-progress copy?
7995   if (copy_ops.count(dest)) {
7996     // FIXME: if the src etc match, we could avoid restarting from the
7997     // beginning.
7998     CopyOpRef cop = copy_ops[dest];
7999     cancel_copy(cop, false);
8000   }
8001
8002   CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8003                            mirror_snapset, src_obj_fadvise_flags,
8004                            dest_obj_fadvise_flags));
8005   copy_ops[dest] = cop;
8006   obc->start_block();
8007
8008   _copy_some(obc, cop);
8009 }
8010
8011 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8012 {
8013   dout(10) << __func__ << " " << obc << " " << cop << dendl;
8014
8015   unsigned flags = 0;
8016   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8017     flags |= CEPH_OSD_FLAG_FLUSH;
8018   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8019     flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8020   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8021     flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8022   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8023     flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8024   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8025     flags |= CEPH_OSD_FLAG_RWORDERED;
8026
8027   C_GatherBuilder gather(cct);
8028
8029   if (cop->cursor.is_initial() && cop->mirror_snapset) {
8030     // list snaps too.
8031     assert(cop->src.snap == CEPH_NOSNAP);
8032     ObjectOperation op;
8033     op.list_snaps(&cop->results.snapset, NULL);
8034     ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8035                                     CEPH_SNAPDIR, NULL,
8036                                     flags, gather.new_sub(), NULL);
8037     cop->objecter_tid2 = tid;
8038   }
8039
8040   ObjectOperation op;
8041   if (cop->results.user_version) {
8042     op.assert_version(cop->results.user_version);
8043   } else {
8044     // we should learn the version after the first chunk, if we didn't know
8045     // it already!
8046     assert(cop->cursor.is_initial());
8047   }
8048   op.copy_get(&cop->cursor, get_copy_chunk_size(),
8049               &cop->results.object_size, &cop->results.mtime,
8050               &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8051               &cop->results.snaps, &cop->results.snap_seq,
8052               &cop->results.flags,
8053               &cop->results.source_data_digest,
8054               &cop->results.source_omap_digest,
8055               &cop->results.reqids,
8056               &cop->results.truncate_seq,
8057               &cop->results.truncate_size,
8058               &cop->rval);
8059   op.set_last_op_flags(cop->src_obj_fadvise_flags);
8060
8061   C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8062                                    get_last_peering_reset(), cop);
8063   gather.set_finisher(new C_OnFinisher(fin,
8064                                        &osd->objecter_finisher));
8065
8066   ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8067                                   cop->src.snap, NULL,
8068                                   flags,
8069                                   gather.new_sub(),
8070                                   // discover the object version if we don't know it yet
8071                                   cop->results.user_version ? NULL : &cop->results.user_version);
8072   fin->tid = tid;
8073   cop->objecter_tid = tid;
8074   gather.activate();
8075 }
8076
8077 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8078 {
8079   dout(10) << __func__ << " " << oid << " tid " << tid
8080            << " " << cpp_strerror(r) << dendl;
8081   map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8082   if (p == copy_ops.end()) {
8083     dout(10) << __func__ << " no copy_op found" << dendl;
8084     return;
8085   }
8086   CopyOpRef cop = p->second;
8087   if (tid != cop->objecter_tid) {
8088     dout(10) << __func__ << " tid " << tid << " != cop " << cop
8089              << " tid " << cop->objecter_tid << dendl;
8090     return;
8091   }
8092
8093   if (cop->omap_data.length() || cop->omap_header.length())
8094     cop->results.has_omap = true;
8095
8096   if (r >= 0 && !pool.info.supports_omap() &&
8097       (cop->omap_data.length() || cop->omap_header.length())) {
8098     r = -EOPNOTSUPP;
8099   }
8100   cop->objecter_tid = 0;
8101   cop->objecter_tid2 = 0;  // assume this ordered before us (if it happened)
8102   ObjectContextRef& cobc = cop->obc;
8103
8104   if (r < 0)
8105     goto out;
8106
8107   assert(cop->rval >= 0);
8108
8109   if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8110     // verify snap hasn't been deleted
8111     vector<snapid_t>::iterator p = cop->results.snaps.begin();
8112     while (p != cop->results.snaps.end()) {
8113       if (pool.info.is_removed_snap(*p)) {
8114         dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8115                  << dendl;
8116         for (vector<snapid_t>::iterator q = p + 1;
8117              q != cop->results.snaps.end();
8118              ++q)
8119           *(q - 1) = *q;
8120         cop->results.snaps.resize(cop->results.snaps.size() - 1);
8121       } else {
8122         ++p;
8123       }
8124     }
8125     if (cop->results.snaps.empty()) {
8126       dout(10) << __func__ << " no more snaps for " << oid << dendl;
8127       r = -ENOENT;
8128       goto out;
8129     }
8130   }
8131
8132   assert(cop->rval >= 0);
8133
8134   if (!cop->temp_cursor.data_complete) {
8135     cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8136   }
8137   if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8138     if (cop->omap_header.length()) {
8139       cop->results.omap_digest =
8140         cop->omap_header.crc32c(cop->results.omap_digest);
8141     }
8142     if (cop->omap_data.length()) {
8143       bufferlist keys;
8144       keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8145       cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8146     }
8147   }
8148
8149   if (!cop->temp_cursor.attr_complete) {
8150     for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8151          p != cop->attrs.end();
8152          ++p) {
8153       cop->results.attrs[string("_") + p->first] = p->second;
8154     }
8155     cop->attrs.clear();
8156   }
8157
8158   if (!cop->cursor.is_complete()) {
8159     // write out what we have so far
8160     if (cop->temp_cursor.is_initial()) {
8161       assert(!cop->results.started_temp_obj);
8162       cop->results.started_temp_obj = true;
8163       cop->results.temp_oid = generate_temp_object(oid);
8164       dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8165     }
8166     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8167     OpContextUPtr ctx = simple_opc_create(tempobc);
8168     if (cop->temp_cursor.is_initial()) {
8169       ctx->new_temp_oid = cop->results.temp_oid;
8170     }
8171     _write_copy_chunk(cop, ctx->op_t.get());
8172     simple_opc_submit(std::move(ctx));
8173     dout(10) << __func__ << " fetching more" << dendl;
8174     _copy_some(cobc, cop);
8175     return;
8176   }
8177
8178   // verify digests?
8179   if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8180     dout(20) << __func__ << std::hex
8181       << " got digest: rx data 0x" << cop->results.data_digest
8182       << " omap 0x" << cop->results.omap_digest
8183       << ", source: data 0x" << cop->results.source_data_digest
8184       << " omap 0x" <<  cop->results.source_omap_digest
8185       << std::dec
8186       << " flags " << cop->results.flags
8187       << dendl;
8188   }
8189   if (cop->results.is_data_digest() &&
8190       cop->results.data_digest != cop->results.source_data_digest) {
8191     derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8192          << " != source 0x" << cop->results.source_data_digest << std::dec
8193          << dendl;
8194     osd->clog->error() << info.pgid << " copy from " << cop->src
8195                        << " to " << cop->obc->obs.oi.soid << std::hex
8196                        << " data digest 0x" << cop->results.data_digest
8197                        << " != source 0x" << cop->results.source_data_digest
8198                        << std::dec;
8199     r = -EIO;
8200     goto out;
8201   }
8202   if (cop->results.is_omap_digest() &&
8203       cop->results.omap_digest != cop->results.source_omap_digest) {
8204     derr << __func__ << std::hex
8205          << " omap digest 0x" << cop->results.omap_digest
8206          << " != source 0x" << cop->results.source_omap_digest
8207          << std::dec << dendl;
8208     osd->clog->error() << info.pgid << " copy from " << cop->src
8209                        << " to " << cop->obc->obs.oi.soid << std::hex
8210                        << " omap digest 0x" << cop->results.omap_digest
8211                        << " != source 0x" << cop->results.source_omap_digest
8212                        << std::dec;
8213     r = -EIO;
8214     goto out;
8215   }
8216   if (cct->_conf->osd_debug_inject_copyfrom_error) {
8217     derr << __func__ << " injecting copyfrom failure" << dendl;
8218     r = -EIO;
8219     goto out;
8220   }
8221
8222   cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8223     [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8224       ObjectState& obs = cop->obc->obs;
8225       if (cop->temp_cursor.is_initial()) {
8226         dout(20) << "fill_in_final_tx: writing "
8227                  << "directly to final object" << dendl;
8228         // write directly to final object
8229         cop->results.temp_oid = obs.oi.soid;
8230         _write_copy_chunk(cop, t);
8231       } else {
8232         // finish writing to temp object, then move into place
8233         dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8234         _write_copy_chunk(cop, t);
8235         t->rename(obs.oi.soid, cop->results.temp_oid);
8236       }
8237       t->setattrs(obs.oi.soid, cop->results.attrs);
8238     });
8239
8240   dout(20) << __func__ << " success; committing" << dendl;
8241
8242  out:
8243   dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8244   CopyCallbackResults results(r, &cop->results);
8245   cop->cb->complete(results);
8246
8247   copy_ops.erase(cobc->obs.oi.soid);
8248   cobc->stop_block();
8249
8250   if (r < 0 && cop->results.started_temp_obj) {
8251     dout(10) << __func__ << " deleting partial temp object "
8252              << cop->results.temp_oid << dendl;
8253     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8254     OpContextUPtr ctx = simple_opc_create(tempobc);
8255     ctx->op_t->remove(cop->results.temp_oid);
8256     ctx->discard_temp_oid = cop->results.temp_oid;
8257     simple_opc_submit(std::move(ctx));
8258   }
8259
8260   // cancel and requeue proxy ops on this object
8261   if (!r) {
8262     for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8263         it != proxyread_ops.end();) {
8264       if (it->second->soid == cobc->obs.oi.soid) {
8265         cancel_proxy_read((it++)->second);
8266       } else {
8267         ++it;
8268       }
8269     }
8270     for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8271          it != proxywrite_ops.end();) {
8272       if (it->second->soid == cobc->obs.oi.soid) {
8273         cancel_proxy_write((it++)->second);
8274       } else {
8275         ++it;
8276       }
8277     }
8278     kick_proxy_ops_blocked(cobc->obs.oi.soid);
8279   }
8280
8281   kick_object_context_blocked(cobc);
8282 }
8283
8284 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8285 {
8286   dout(20) << __func__ << " " << cop
8287            << " " << cop->attrs.size() << " attrs"
8288            << " " << cop->data.length() << " bytes"
8289            << " " << cop->omap_header.length() << " omap header bytes"
8290            << " " << cop->omap_data.length() << " omap data bytes"
8291            << dendl;
8292   if (!cop->temp_cursor.attr_complete) {
8293     t->create(cop->results.temp_oid);
8294   }
8295   if (!cop->temp_cursor.data_complete) {
8296     assert(cop->data.length() + cop->temp_cursor.data_offset ==
8297            cop->cursor.data_offset);
8298     if (pool.info.requires_aligned_append() &&
8299         !cop->cursor.data_complete) {
8300       /**
8301        * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8302        * to pick it up on the next pass.
8303        */
8304       assert(cop->temp_cursor.data_offset %
8305              pool.info.required_alignment() == 0);
8306       if (cop->data.length() % pool.info.required_alignment() != 0) {
8307         uint64_t to_trim =
8308           cop->data.length() % pool.info.required_alignment();
8309         bufferlist bl;
8310         bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8311         cop->data.swap(bl);
8312         cop->cursor.data_offset -= to_trim;
8313         assert(cop->data.length() + cop->temp_cursor.data_offset ==
8314                cop->cursor.data_offset);
8315       }
8316     }
8317     if (cop->data.length()) {
8318       t->write(
8319         cop->results.temp_oid,
8320         cop->temp_cursor.data_offset,
8321         cop->data.length(),
8322         cop->data,
8323         cop->dest_obj_fadvise_flags);
8324     }
8325     cop->data.clear();
8326   }
8327   if (pool.info.supports_omap()) {
8328     if (!cop->temp_cursor.omap_complete) {
8329       if (cop->omap_header.length()) {
8330         t->omap_setheader(
8331           cop->results.temp_oid,
8332           cop->omap_header);
8333         cop->omap_header.clear();
8334       }
8335       if (cop->omap_data.length()) {
8336         map<string,bufferlist> omap;
8337         bufferlist::iterator p = cop->omap_data.begin();
8338         ::decode(omap, p);
8339         t->omap_setkeys(cop->results.temp_oid, omap);
8340         cop->omap_data.clear();
8341       }
8342     }
8343   } else {
8344     assert(cop->omap_header.length() == 0);
8345     assert(cop->omap_data.length() == 0);
8346   }
8347   cop->temp_cursor = cop->cursor;
8348 }
8349
8350 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8351 {
8352   OpContext *ctx = cb->ctx;
8353   dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8354
8355   ObjectState& obs = ctx->new_obs;
8356   if (obs.exists) {
8357     dout(20) << __func__ << ": exists, removing" << dendl;
8358     ctx->op_t->remove(obs.oi.soid);
8359   } else {
8360     ctx->delta_stats.num_objects++;
8361     obs.exists = true;
8362   }
8363   if (cb->is_temp_obj_used()) {
8364     ctx->discard_temp_oid = cb->results->temp_oid;
8365   }
8366   cb->results->fill_in_final_tx(ctx->op_t.get());
8367
8368   // CopyFromCallback fills this in for us
8369   obs.oi.user_version = ctx->user_at_version;
8370
8371   obs.oi.set_data_digest(cb->results->data_digest);
8372   obs.oi.set_omap_digest(cb->results->omap_digest);
8373
8374   obs.oi.truncate_seq = cb->results->truncate_seq;
8375   obs.oi.truncate_size = cb->results->truncate_size;
8376
8377   ctx->extra_reqids = cb->results->reqids;
8378
8379   // cache: clear whiteout?
8380   if (obs.oi.is_whiteout()) {
8381     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8382     obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8383     --ctx->delta_stats.num_whiteouts;
8384   }
8385
8386   if (cb->results->has_omap) {
8387     dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8388     obs.oi.set_flag(object_info_t::FLAG_OMAP);
8389   } else {
8390     dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8391     obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8392   }
8393
8394   interval_set<uint64_t> ch;
8395   if (obs.oi.size > 0)
8396     ch.insert(0, obs.oi.size);
8397   ctx->modified_ranges.union_of(ch);
8398
8399   if (cb->get_data_size() != obs.oi.size) {
8400     ctx->delta_stats.num_bytes -= obs.oi.size;
8401     obs.oi.size = cb->get_data_size();
8402     ctx->delta_stats.num_bytes += obs.oi.size;
8403   }
8404   ctx->delta_stats.num_wr++;
8405   ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8406
8407   osd->logger->inc(l_osd_copyfrom);
8408 }
8409
8410 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8411                                   ObjectContextRef obc)
8412 {
8413   const hobject_t& soid = obc->obs.oi.soid;
8414   dout(10) << __func__ << " " << soid << " r=" << r
8415            << " uv" << results->user_version << dendl;
8416
8417   if (r == -ECANCELED) {
8418     return;
8419   }
8420
8421   if (r != -ENOENT && soid.is_snap()) {
8422     if (results->snaps.empty()) {
8423       // we must have read "snap" content from the head object in
8424       // the base pool.  use snap_seq to construct what snaps should
8425       // be for this clone (what is was before we evicted the clean
8426       // clone from this pool, and what it will be when we flush and
8427       // the clone eventually happens in the base pool).
8428       SnapSet& snapset = obc->ssc->snapset;
8429       vector<snapid_t>::iterator p = snapset.snaps.begin();
8430       while (p != snapset.snaps.end() && *p > soid.snap)
8431         ++p;
8432       while (p != snapset.snaps.end() && *p > results->snap_seq) {
8433         results->snaps.push_back(*p);
8434         ++p;
8435       }
8436     }
8437
8438     dout(20) << __func__ << " snaps " << results->snaps << dendl;
8439     filter_snapc(results->snaps);
8440
8441     dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8442     if (results->snaps.empty()) {
8443       dout(20) << __func__
8444                << " snaps are empty, clone is invalid,"
8445                << " setting r to ENOENT" << dendl;
8446       r = -ENOENT;
8447     }
8448   }
8449
8450   if (r < 0 && results->started_temp_obj) {
8451     dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8452     ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8453     assert(tempobc);
8454     OpContextUPtr ctx = simple_opc_create(tempobc);
8455     ctx->op_t->remove(results->temp_oid);
8456     simple_opc_submit(std::move(ctx));
8457     results->started_temp_obj = false;
8458   }
8459
8460   if (r == -ENOENT && soid.is_snap()) {
8461     dout(10) << __func__
8462              << ": enoent while trying to promote clone, " << soid
8463              << " must have been trimmed, removing from snapset"
8464              << dendl;
8465     hobject_t head(soid.get_head());
8466     ObjectContextRef obc = get_object_context(head, false);
8467     assert(obc);
8468
8469     OpContextUPtr tctx = simple_opc_create(obc);
8470     tctx->at_version = get_next_version();
8471     filter_snapc(tctx->new_snapset.snaps);
8472     vector<snapid_t> new_clones;
8473     map<snapid_t, vector<snapid_t>> new_clone_snaps;
8474     for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8475          i != tctx->new_snapset.clones.end();
8476          ++i) {
8477       if (*i != soid.snap) {
8478         new_clones.push_back(*i);
8479         auto p = tctx->new_snapset.clone_snaps.find(*i);
8480         if (p != tctx->new_snapset.clone_snaps.end()) {
8481           new_clone_snaps[*i] = p->second;
8482         }
8483       }
8484     }
8485     tctx->new_snapset.clones.swap(new_clones);
8486     tctx->new_snapset.clone_overlap.erase(soid.snap);
8487     tctx->new_snapset.clone_size.erase(soid.snap);
8488     tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8489
8490     // take RWWRITE lock for duration of our local write.  ignore starvation.
8491     if (!tctx->lock_manager.take_write_lock(
8492           head,
8493           obc)) {
8494       assert(0 == "problem!");
8495     }
8496     dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8497
8498     finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8499
8500     simple_opc_submit(std::move(tctx));
8501     return;
8502   }
8503
8504   bool whiteout = false;
8505   if (r == -ENOENT) {
8506     assert(soid.snap == CEPH_NOSNAP); // snap case is above
8507     dout(10) << __func__ << " whiteout " << soid << dendl;
8508     whiteout = true;
8509   }
8510
8511   if (r < 0 && !whiteout) {
8512     derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8513     // pass error to everyone blocked on this object
8514     // FIXME: this is pretty sloppy, but at this point we got
8515     // something unexpected and don't have many other options.
8516     map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8517       waiting_for_blocked_object.find(soid);
8518     if (blocked_iter != waiting_for_blocked_object.end()) {
8519       while (!blocked_iter->second.empty()) {
8520         osd->reply_op_error(blocked_iter->second.front(), r);
8521         blocked_iter->second.pop_front();
8522       }
8523       waiting_for_blocked_object.erase(blocked_iter);
8524     }
8525     return;
8526   }
8527
8528   osd->promote_finish(results->object_size);
8529
8530   OpContextUPtr tctx =  simple_opc_create(obc);
8531   tctx->at_version = get_next_version();
8532
8533   ++tctx->delta_stats.num_objects;
8534   if (soid.snap < CEPH_NOSNAP)
8535     ++tctx->delta_stats.num_object_clones;
8536   tctx->new_obs.exists = true;
8537
8538   tctx->extra_reqids = results->reqids;
8539
8540   bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8541     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8542
8543   if (whiteout) {
8544     // create a whiteout
8545     tctx->op_t->create(soid);
8546     tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8547     ++tctx->delta_stats.num_whiteouts;
8548     dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8549     osd->logger->inc(l_osd_tier_whiteout);
8550   } else {
8551     if (results->has_omap) {
8552       dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8553       tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8554       ++tctx->delta_stats.num_objects_omap;
8555     }
8556
8557     results->fill_in_final_tx(tctx->op_t.get());
8558     if (results->started_temp_obj) {
8559       tctx->discard_temp_oid = results->temp_oid;
8560     }
8561     tctx->new_obs.oi.size = results->object_size;
8562     tctx->new_obs.oi.user_version = results->user_version;
8563     // Don't care src object whether have data or omap digest
8564     if (results->object_size)
8565       tctx->new_obs.oi.set_data_digest(results->data_digest);
8566     if (results->has_omap)
8567       tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8568     tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8569     tctx->new_obs.oi.truncate_size = results->truncate_size;
8570
8571     if (soid.snap != CEPH_NOSNAP) {
8572       if (legacy_snapset) {
8573         tctx->new_obs.oi.legacy_snaps = results->snaps;
8574         assert(!tctx->new_obs.oi.legacy_snaps.empty());
8575       } else {
8576         // it's already in the snapset
8577         assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8578       }
8579       assert(obc->ssc->snapset.clone_size.count(soid.snap));
8580       assert(obc->ssc->snapset.clone_size[soid.snap] ==
8581              results->object_size);
8582       assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8583
8584       tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8585     } else {
8586       tctx->delta_stats.num_bytes += results->object_size;
8587     }
8588   }
8589
8590   if (results->mirror_snapset) {
8591     assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8592     tctx->new_snapset.from_snap_set(
8593       results->snapset,
8594       get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8595   }
8596   tctx->new_snapset.head_exists = true;
8597   dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8598
8599   // take RWWRITE lock for duration of our local write.  ignore starvation.
8600   if (!tctx->lock_manager.take_write_lock(
8601         obc->obs.oi.soid,
8602         obc)) {
8603     assert(0 == "problem!");
8604   }
8605   dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8606
8607   finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8608
8609   simple_opc_submit(std::move(tctx));
8610
8611   osd->logger->inc(l_osd_tier_promote);
8612
8613   if (agent_state &&
8614       agent_state->is_idle())
8615     agent_choose_mode();
8616 }
8617
8618 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8619 {
8620   dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8621            << " from " << cop->src << " " << cop->oloc
8622            << " v" << cop->results.user_version << dendl;
8623
8624   // cancel objecter op, if we can
8625   if (cop->objecter_tid) {
8626     osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8627     cop->objecter_tid = 0;
8628     if (cop->objecter_tid2) {
8629       osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8630       cop->objecter_tid2 = 0;
8631     }
8632   }
8633
8634   copy_ops.erase(cop->obc->obs.oi.soid);
8635   cop->obc->stop_block();
8636
8637   kick_object_context_blocked(cop->obc);
8638   cop->results.should_requeue = requeue;
8639   CopyCallbackResults result(-ECANCELED, &cop->results);
8640   cop->cb->complete(result);
8641
8642   // There may still be an objecter callback referencing this copy op.
8643   // That callback will not need the obc since it's been canceled, and
8644   // we need the obc reference to go away prior to flush.
8645   cop->obc = ObjectContextRef();
8646 }
8647
8648 void PrimaryLogPG::cancel_copy_ops(bool requeue)
8649 {
8650   dout(10) << __func__ << dendl;
8651   map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8652   while (p != copy_ops.end()) {
8653     // requeue this op? can I queue up all of them?
8654     cancel_copy((p++)->second, requeue);
8655   }
8656 }
8657
8658
8659 // ========================================================================
8660 // flush
8661 //
8662 // Flush a dirty object in the cache tier by writing it back to the
8663 // base tier.  The sequence looks like:
8664 //
8665 //  * send a copy-from operation to the base tier to copy the current
8666 //    version of the object
8667 //  * base tier will pull the object via (perhaps multiple) copy-get(s)
8668 //  * on completion, we check if the object has been modified.  if so,
8669 //    just reply with -EAGAIN.
8670 //  * try to take a write lock so we can clear the dirty flag.  if this
8671 //    fails, wait and retry
8672 //  * start a repop that clears the bit.
8673 //
8674 // If we have to wait, we will retry by coming back through the
8675 // start_flush method.  We check if a flush is already in progress
8676 // and, if so, try to finish it by rechecking the version and trying
8677 // to clear the dirty bit.
8678 //
8679 // In order for the cache-flush (a write op) to not block the copy-get
8680 // from reading the object, the client *must* set the SKIPRWLOCKS
8681 // flag.
8682 //
8683 // NOTE: normally writes are strictly ordered for the client, but
8684 // flushes are special in that they can be reordered with respect to
8685 // other writes.  In particular, we can't have a flush request block
8686 // an update to the cache pool object!
8687
8688 struct C_Flush : public Context {
8689   PrimaryLogPGRef pg;
8690   hobject_t oid;
8691   epoch_t last_peering_reset;
8692   ceph_tid_t tid;
8693   utime_t start;
8694   C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8695     : pg(p), oid(o), last_peering_reset(lpr),
8696       tid(0), start(ceph_clock_now())
8697   {}
8698   void finish(int r) override {
8699     if (r == -ECANCELED)
8700       return;
8701     pg->lock();
8702     if (last_peering_reset == pg->get_last_peering_reset()) {
8703       pg->finish_flush(oid, tid, r);
8704       pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8705     }
8706     pg->unlock();
8707   }
8708 };
8709
/**
 * Begin flushing a cache-tier object to the base tier.
 *
 * @param op        client op that triggered the flush; may be null
 * @param obc       object context of the object to flush
 * @param blocking  if true, the object is blocked (obc->start_block()) for
 *                  the duration of the flush
 * @param pmissing  optional out-parameter; set to the next older clone when
 *                  that clone is missing
 * @param on_flush  optional callback stored on the flush op
 *
 * @return -ENOENT if the next older clone is missing,
 *         -EBUSY if the next older clone is dirty,
 *         the result of try_flush_mark_clean() when @p op is retrying an
 *         existing flush,
 *         -EAGAIN when joining an existing compatible flush,
 *         -EINPROGRESS once a new flush has been submitted.
 */
int PrimaryLogPG::start_flush(
  OpRequestRef op, ObjectContextRef obc,
  bool blocking, hobject_t *pmissing,
  boost::optional<std::function<void()>> &&on_flush)
{
  const object_info_t& oi = obc->obs.oi;
  const hobject_t& soid = oi.soid;
  dout(10) << __func__ << " " << soid
           << " v" << oi.version
           << " uv" << oi.user_version
           << " " << (blocking ? "blocking" : "non-blocking/best-effort")
           << dendl;

  // work on a filtered copy of the snapset so that removed snaps are
  // not considered
  SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);

  // verify that the next older clone (if any) is neither missing nor dirty
  {
    dout(20) << " snapset " << snapset << dendl;
    // walk the clones from the back, skipping those >= our snap, to find
    // the closest clone strictly older than soid
    vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
    while (p != snapset.clones.rend() && *p >= soid.snap)
      ++p;
    if (p != snapset.clones.rend()) {
      hobject_t next = soid;
      next.snap = *p;
      assert(next.snap < soid.snap);
      if (pg_log.get_missing().is_missing(next)) {
        dout(10) << __func__ << " missing clone is " << next << dendl;
        if (pmissing)
          *pmissing = next;
        return -ENOENT;
      }
      ObjectContextRef older_obc = get_object_context(next, false);
      if (older_obc) {
        dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
                 << dendl;
        if (older_obc->obs.oi.is_dirty()) {
          dout(10) << __func__ << " next oldest clone is dirty: "
                   << older_obc->obs.oi << dendl;
          return -EBUSY;
        }
      } else {
        dout(20) << __func__ << " next oldest clone " << next
                 << " is not present; implicitly clean" << dendl;
      }
    } else {
      dout(20) << __func__ << " no older clones" << dendl;
    }
  }

  if (blocking)
    obc->start_block();

  // is a flush of this object already in flight?
  map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
  if (p != flush_ops.end()) {
    FlushOpRef fop = p->second;
    if (fop->op == op) {
      // we couldn't take the write lock on a cache-try-flush before;
      // now we are trying again for the lock.
      return try_flush_mark_clean(fop);
    }
    if (fop->flushed_version == obc->obs.oi.user_version &&
        (fop->blocking || !blocking)) {
      // nonblocking can join anything
      // blocking can only join a blocking flush
      dout(20) << __func__ << " piggybacking on existing flush " << dendl;
      if (op)
        fop->dup_ops.push_back(op);
      return -EAGAIN;   // clean up this ctx; op will retry later
    }

    // cancel current flush since it will fail anyway, or because we
    // are blocking and the existing flush is nonblocking.
    dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
    // everyone attached to the old flush gets -EBUSY
    if (fop->op)
      osd->reply_op_error(fop->op, -EBUSY);
    while (!fop->dup_ops.empty()) {
      osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
      fop->dup_ops.pop_front();
    }
    cancel_flush(fop, false);
  }

  /**
   * In general, we need to send a delete and a copyfrom.
   * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
   * where 4 is marked as clean.  To flush 10, we have to:
   * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
   * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
   *
   * There is a complicating case.  Suppose there had been a clone 7
   * for snaps [7, 6] which has been trimmed since they no longer exist.
   * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head.  When we submit
   * the delete, the snap will be promoted to 5, and the head will become
   * a snapdir.  When the copy-from goes through, we'll end up with
   * 8:[8,4,3,2]:[4(4,3,2)]+head.
   *
   * Another complication is the case where there is an interval change
   * after doing the delete and the flush but before marking the object
   * clean.  We'll happily delete head and then recreate it at the same
   * sequence number, which works out ok.
   */

  // snapc:  snap context for the copy-from (or remove) of this object
  // dsnapc: snap context for the preliminary delete
  SnapContext snapc, dsnapc;
  if (snapset.seq != 0) {
    if (soid.snap == CEPH_NOSNAP) {
      // flushing head: use the snapset's own seq and snaps
      snapc.seq = snapset.seq;
      snapc.snaps = snapset.snaps;
    } else {
      // flushing a clone: use the snap context as of just before the
      // last entry of the clone's snap list
      snapid_t min_included_snap;
      if (snapset.is_legacy()) {
        min_included_snap = oi.legacy_snaps.back();
      } else {
        auto p = snapset.clone_snaps.find(soid.snap);
        assert(p != snapset.clone_snaps.end());
        min_included_snap = p->second.back();
      }
      snapc = snapset.get_ssc_as_of(min_included_snap - 1);
    }

    // find the newest clone strictly older than soid (0 if there is none)
    snapid_t prev_snapc = 0;
    for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
         citer != snapset.clones.rend();
         ++citer) {
      if (*citer < soid.snap) {
        prev_snapc = *citer;
        break;
      }
    }

    dsnapc = snapset.get_ssc_as_of(prev_snapc);
  }

  // both requests target the same object name in the base pool
  object_locator_t base_oloc(soid);
  base_oloc.pool = pool.info.tier_of;

  // send the delete first, but only when its snap context is older than
  // the one used for the flush itself
  if (dsnapc.seq < snapc.seq) {
    ObjectOperation o;
    o.remove();
    osd->objecter->mutate(
      soid.oid,
      base_oloc,
      o,
      dsnapc,
      ceph::real_clock::from_ceph_timespec(oi.mtime),
      (CEPH_OSD_FLAG_IGNORE_OVERLAY |
       CEPH_OSD_FLAG_ENFORCE_SNAPC),
      NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
  }

  FlushOpRef fop(std::make_shared<FlushOp>());
  fop->obc = obc;
  fop->flushed_version = oi.user_version;
  fop->blocking = blocking;
  fop->on_flush = std::move(on_flush);
  fop->op = op;

  ObjectOperation o;
  if (oi.is_whiteout()) {
    // a whiteout is flushed as a removal in the base tier
    fop->removal = true;
    o.remove();
  } else {
    object_locator_t oloc(soid);
    o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
                CEPH_OSD_COPY_FROM_FLAG_FLUSH |
                CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
                CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
                CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
                LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);

    // hint that the base tier need not cache the data after this op
    if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
      o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
  }
  C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());

  ceph_tid_t tid = osd->objecter->mutate(
    soid.oid, base_oloc, o, snapc,
    ceph::real_clock::from_ceph_timespec(oi.mtime),
    CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
    new C_OnFinisher(fin,
                     &osd->objecter_finisher));
  /* we're under the pg lock and fin->finish() is grabbing that */
  fin->tid = tid;
  fop->objecter_tid = tid;

  // track the flush and account for it in the PG stats
  flush_ops[soid] = fop;
  info.stats.stats.sum.num_flush++;
  info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
  return -EINPROGRESS;
}
8901
8902 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8903 {
8904   dout(10) << __func__ << " " << oid << " tid " << tid
8905            << " " << cpp_strerror(r) << dendl;
8906   map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8907   if (p == flush_ops.end()) {
8908     dout(10) << __func__ << " no flush_op found" << dendl;
8909     return;
8910   }
8911   FlushOpRef fop = p->second;
8912   if (tid != fop->objecter_tid) {
8913     dout(10) << __func__ << " tid " << tid << " != fop " << fop
8914              << " tid " << fop->objecter_tid << dendl;
8915     return;
8916   }
8917   ObjectContextRef obc = fop->obc;
8918   fop->objecter_tid = 0;
8919
8920   if (r < 0 && !(r == -ENOENT && fop->removal)) {
8921     if (fop->op)
8922       osd->reply_op_error(fop->op, -EBUSY);
8923     if (fop->blocking) {
8924       obc->stop_block();
8925       kick_object_context_blocked(obc);
8926     }
8927
8928     if (!fop->dup_ops.empty()) {
8929       dout(20) << __func__ << " requeueing dups" << dendl;
8930       requeue_ops(fop->dup_ops);
8931     }
8932     if (fop->on_flush) {
8933       (*(fop->on_flush))();
8934       fop->on_flush = boost::none;
8935     }
8936     flush_ops.erase(oid);
8937     return;
8938   }
8939
8940   r = try_flush_mark_clean(fop);
8941   if (r == -EBUSY && fop->op) {
8942     osd->reply_op_error(fop->op, r);
8943   }
8944 }
8945
8946 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8947 {
8948   ObjectContextRef obc = fop->obc;
8949   const hobject_t& oid = obc->obs.oi.soid;
8950
8951   if (fop->blocking) {
8952     obc->stop_block();
8953     kick_object_context_blocked(obc);
8954   }
8955
8956   if (fop->flushed_version != obc->obs.oi.user_version ||
8957       !obc->obs.exists) {
8958     if (obc->obs.exists)
8959       dout(10) << __func__ << " flushed_version " << fop->flushed_version
8960                << " != current " << obc->obs.oi.user_version
8961                << dendl;
8962     else
8963       dout(10) << __func__ << " object no longer exists" << dendl;
8964
8965     if (!fop->dup_ops.empty()) {
8966       dout(20) << __func__ << " requeueing dups" << dendl;
8967       requeue_ops(fop->dup_ops);
8968     }
8969     if (fop->on_flush) {
8970       (*(fop->on_flush))();
8971       fop->on_flush = boost::none;
8972     }
8973     flush_ops.erase(oid);
8974     if (fop->blocking)
8975       osd->logger->inc(l_osd_tier_flush_fail);
8976     else
8977       osd->logger->inc(l_osd_tier_try_flush_fail);
8978     return -EBUSY;
8979   }
8980
8981   if (!fop->blocking &&
8982       scrubber.write_blocked_by_scrub(oid)) {
8983     if (fop->op) {
8984       dout(10) << __func__ << " blocked by scrub" << dendl;
8985       requeue_op(fop->op);
8986       requeue_ops(fop->dup_ops);
8987       return -EAGAIN;    // will retry
8988     } else {
8989       osd->logger->inc(l_osd_tier_try_flush_fail);
8990       cancel_flush(fop, false);
8991       return -ECANCELED;
8992     }
8993   }
8994
8995   // successfully flushed, can we evict this object?
8996   if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
8997       agent_maybe_evict(obc, true)) {
8998     osd->logger->inc(l_osd_tier_clean);
8999     if (fop->on_flush) {
9000       (*(fop->on_flush))();
9001       fop->on_flush = boost::none;
9002     }
9003     flush_ops.erase(oid);
9004     return 0;
9005   }
9006
9007   dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9008   OpContextUPtr ctx = simple_opc_create(fop->obc);
9009
9010   // successfully flushed; can we clear the dirty bit?
9011   // try to take the lock manually, since we don't
9012   // have a ctx yet.
9013   if (ctx->lock_manager.get_lock_type(
9014         ObjectContext::RWState::RWWRITE,
9015         oid,
9016         obc,
9017         fop->op)) {
9018     dout(20) << __func__ << " took write lock" << dendl;
9019   } else if (fop->op) {
9020     dout(10) << __func__ << " waiting on write lock" << dendl;
9021     close_op_ctx(ctx.release());
9022     requeue_op(fop->op);
9023     requeue_ops(fop->dup_ops);
9024     return -EAGAIN;    // will retry
9025   } else {
9026     dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9027     close_op_ctx(ctx.release());
9028     osd->logger->inc(l_osd_tier_try_flush_fail);
9029     cancel_flush(fop, false);
9030     return -ECANCELED;
9031   }
9032
9033   if (fop->on_flush) {
9034     ctx->register_on_finish(*(fop->on_flush));
9035     fop->on_flush = boost::none;
9036   }
9037
9038   ctx->at_version = get_next_version();
9039
9040   ctx->new_obs = obc->obs;
9041   ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9042   --ctx->delta_stats.num_objects_dirty;
9043
9044   finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9045
9046   osd->logger->inc(l_osd_tier_clean);
9047
9048   if (!fop->dup_ops.empty() || fop->op) {
9049     dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9050     list<OpRequestRef> ls;
9051     if (fop->op)
9052       ls.push_back(fop->op);
9053     ls.splice(ls.end(), fop->dup_ops);
9054     requeue_ops(ls);
9055   }
9056
9057   simple_opc_submit(std::move(ctx));
9058
9059   flush_ops.erase(oid);
9060
9061   if (fop->blocking)
9062     osd->logger->inc(l_osd_tier_flush);
9063   else
9064     osd->logger->inc(l_osd_tier_try_flush);
9065
9066   return -EINPROGRESS;
9067 }
9068
9069 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
9070 {
9071   dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9072            << fop->objecter_tid << dendl;
9073   if (fop->objecter_tid) {
9074     osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
9075     fop->objecter_tid = 0;
9076   }
9077   if (fop->blocking) {
9078     fop->obc->stop_block();
9079     kick_object_context_blocked(fop->obc);
9080   }
9081   if (requeue) {
9082     if (fop->op)
9083       requeue_op(fop->op);
9084     requeue_ops(fop->dup_ops);
9085   }
9086   if (fop->on_flush) {
9087     (*(fop->on_flush))();
9088     fop->on_flush = boost::none;
9089   }
9090   flush_ops.erase(fop->obc->obs.oi.soid);
9091 }
9092
9093 void PrimaryLogPG::cancel_flush_ops(bool requeue)
9094 {
9095   dout(10) << __func__ << dendl;
9096   map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9097   while (p != flush_ops.end()) {
9098     cancel_flush((p++)->second, requeue);
9099   }
9100 }
9101
9102 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9103 {
9104   if (!pool.info.allow_incomplete_clones())
9105     return true;
9106   if (is_missing_object(coid))
9107     return true;
9108   ObjectContextRef obc = get_object_context(coid, false);
9109   return obc && obc->obs.exists;
9110 }
9111
9112 // ========================================================================
9113 // rep op gather
9114
9115 class C_OSD_RepopApplied : public Context {
9116   PrimaryLogPGRef pg;
9117   boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9118 public:
9119   C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9120   : pg(pg), repop(repop) {}
9121   void finish(int) override {
9122     pg->repop_all_applied(repop.get());
9123   }
9124 };
9125
9126
9127 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9128 {
9129   dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9130            << dendl;
9131   assert(!repop->applies_with_commit);
9132   repop->all_applied = true;
9133   if (!repop->rep_aborted) {
9134     eval_repop(repop);
9135   }
9136 }
9137
9138 class C_OSD_RepopCommit : public Context {
9139   PrimaryLogPGRef pg;
9140   boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9141 public:
9142   C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9143     : pg(pg), repop(repop) {}
9144   void finish(int) override {
9145     pg->repop_all_committed(repop.get());
9146   }
9147 };
9148
9149 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9150 {
9151   dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9152            << dendl;
9153   repop->all_committed = true;
9154   if (repop->applies_with_commit) {
9155     assert(!repop->all_applied);
9156     repop->all_applied = true;
9157   }
9158
9159   if (!repop->rep_aborted) {
9160     if (repop->v != eversion_t()) {
9161       last_update_ondisk = repop->v;
9162       last_complete_ondisk = repop->pg_local_last_complete;
9163     }
9164     eval_repop(repop);
9165   }
9166 }
9167
9168 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9169 {
9170   dout(10) << "op_applied version " << applied_version << dendl;
9171   if (applied_version == eversion_t())
9172     return;
9173   assert(applied_version > last_update_applied);
9174   assert(applied_version <= info.last_update);
9175   last_update_applied = applied_version;
9176   if (is_primary()) {
9177     if (scrubber.active) {
9178       if (last_update_applied >= scrubber.subset_last_update) {
9179         if (ops_blocked_by_scrub()) {
9180           requeue_scrub(true);
9181         } else {
9182           requeue_scrub(false);
9183         }
9184
9185       }
9186     } else {
9187       assert(scrubber.start == scrubber.end);
9188     }
9189   } else {
9190     if (scrubber.active_rep_scrub) {
9191       if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9192             scrubber.active_rep_scrub->get_req())->scrub_to) {
9193         osd->enqueue_back(
9194           info.pgid,
9195           PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9196         scrubber.active_rep_scrub = OpRequestRef();
9197       }
9198     }
9199   }
9200 }
9201
/**
 * Evaluate the state of a repop and run whatever has become runnable.
 *
 * - once all_committed: run the on_committed callbacks and answer any
 *   duplicate requests waiting in waiting_for_ondisk for this version
 * - once all_applied: run the on_applied callbacks
 * - once both: mark the repop done and retire finished repops from the
 *   front of repop_queue
 *
 * Does nothing beyond logging if the repop is already done.
 */
void PrimaryLogPG::eval_repop(RepGather *repop)
{
  // m is only used to pick which log line to emit below
  const MOSDOp *m = NULL;
  if (repop->op)
    m = static_cast<const MOSDOp *>(repop->op->get_req());

  if (m)
    dout(10) << "eval_repop " << *repop
             << (repop->rep_done ? " DONE" : "")
             << dendl;
  else
    dout(10) << "eval_repop " << *repop << " (no op)"
             << (repop->rep_done ? " DONE" : "")
             << dendl;

  if (repop->rep_done)
    return;

  // ondisk?
  if (repop->all_committed) {
    dout(10) << " commit: " << *repop << dendl;
    // run each callback, then erase it so that it runs only once
    for (auto p = repop->on_committed.begin();
         p != repop->on_committed.end();
         repop->on_committed.erase(p++)) {
      (*p)();
    }
    // send dup commits, in order
    if (waiting_for_ondisk.count(repop->v)) {
      assert(waiting_for_ondisk.begin()->first == repop->v);
      for (list<pair<OpRequestRef, version_t> >::iterator i =
             waiting_for_ondisk[repop->v].begin();
           i != waiting_for_ondisk[repop->v].end();
           ++i) {
        osd->reply_op_error(i->first, repop->r, repop->v,
                            i->second);
      }
      waiting_for_ondisk.erase(repop->v);
    }
  }

  // applied?
  if (repop->all_applied) {
    if (repop->applies_with_commit) {
      assert(repop->on_applied.empty());
    }
    dout(10) << " applied: " << *repop << " " << dendl;
    // run each callback, then erase it so that it runs only once
    for (auto p = repop->on_applied.begin();
         p != repop->on_applied.end();
         repop->on_applied.erase(p++)) {
      (*p)();
    }
  }

  // done.
  if (repop->all_applied && repop->all_committed) {
    repop->rep_done = true;

    publish_stats_to_osd();
    calc_min_last_complete_ondisk();

    dout(10) << " removing " << *repop << dendl;
    assert(!repop_queue.empty());
    dout(20) << "   q front is " << *repop_queue.front() << dendl;
    if (repop_queue.front() != repop) {
      // Only repops that apply with commit may finish while not at the
      // front of the queue; such a repop stays queued here.  For any other
      // repop this is logged and the assert below fails.
      if (!repop->applies_with_commit) {
        dout(0) << " removing " << *repop << dendl;
        dout(0) << "   q front is " << *repop_queue.front() << dendl;
        assert(repop_queue.front() == repop);
      }
    } else {
      // retire this repop and any finished repops queued right behind it
      RepGather *to_remove = nullptr;
      while (!repop_queue.empty() &&
             (to_remove = repop_queue.front())->rep_done) {
        repop_queue.pop_front();
        for (auto p = to_remove->on_success.begin();
             p != to_remove->on_success.end();
             to_remove->on_success.erase(p++)) {
          (*p)();
        }
        remove_repop(to_remove);
      }
    }
  }
}
9286
/**
 * Submit the transaction held by an op context to the PG backend.
 *
 * Records the op's version on the repop, advances the cached peer_info of
 * the other actingbackfill shards, takes the ondisk write locks on the
 * involved object contexts, updates the projected log and hands everything
 * to pgbackend->submit_transaction().
 *
 * @param repop  the repop tracking this operation
 * @param ctx    the op context whose transaction (op_t) is submitted
 */
void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
{
  FUNCTRACE();
  const hobject_t& soid = ctx->obs->oi.soid;
  dout(7) << "issue_repop rep_tid " << repop->rep_tid
          << " o " << soid
          << dendl;

  repop->v = ctx->at_version;
  if (ctx->at_version > eversion_t()) {
    for (set<pg_shard_t>::iterator i = actingbackfill.begin();
         i != actingbackfill.end();
         ++i) {
      if (*i == get_primary()) continue;
      pg_info_t &pinfo = peer_info[*i];
      // keep peer_info up to date
      // last_complete only advances for peers that were fully complete
      if (pinfo.last_complete == pinfo.last_update)
        pinfo.last_complete = ctx->at_version;
      pinfo.last_update = ctx->at_version;
    }
  }

  ctx->obc->ondisk_write_lock();

  // lock the clone and snapset obcs too, and register all of them with
  // the transaction
  bool unlock_snapset_obc = false;
  ctx->op_t->add_obc(ctx->obc);
  if (ctx->clone_obc) {
    ctx->clone_obc->ondisk_write_lock();
    ctx->op_t->add_obc(ctx->clone_obc);
  }
  // the snapset obc is handled separately only when it refers to a
  // different object than ctx->obc
  if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
      ctx->obc->obs.oi.soid) {
    ctx->snapset_obc->ondisk_write_lock();
    unlock_snapset_obc = true;
    ctx->op_t->add_obc(ctx->snapset_obc);
  }

  Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
  Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
  // receives the same obcs that were write-locked above
  Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
    ctx->obc,
    ctx->clone_obc,
    unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
  if (!(ctx->log.empty())) {
    assert(ctx->at_version >= projected_last_update);
    projected_last_update = ctx->at_version;
  }
  for (auto &&entry: ctx->log) {
    projected_log.add(entry);
  }
  // note: ctx->op_t is moved from by this call
  pgbackend->submit_transaction(
    soid,
    ctx->delta_stats,
    ctx->at_version,
    std::move(ctx->op_t),
    pg_trim_to,
    min_last_complete_ondisk,
    ctx->log,
    ctx->updated_hset_history,
    onapplied_sync,
    on_all_applied,
    on_all_commit,
    repop->rep_tid,
    ctx->reqid,
    ctx->op);
}
9353
9354 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9355   OpContext *ctx, ObjectContextRef obc,
9356   ceph_tid_t rep_tid)
9357 {
9358   if (ctx->op)
9359     dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9360   else
9361     dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9362
9363   RepGather *repop = new RepGather(
9364     ctx, rep_tid, info.last_complete, false);
9365
9366   repop->start = ceph_clock_now();
9367
9368   repop_queue.push_back(&repop->queue_item);
9369   repop->get();
9370
9371   osd->logger->inc(l_osd_op_wip);
9372
9373   dout(10) << __func__ << ": " << *repop << dendl;
9374   return repop;
9375 }
9376
9377 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9378   eversion_t version,
9379   int r,
9380   ObcLockManager &&manager,
9381   OpRequestRef &&op,
9382   boost::optional<std::function<void(void)> > &&on_complete)
9383 {
9384   RepGather *repop = new RepGather(
9385     std::move(manager),
9386     std::move(op),
9387     std::move(on_complete),
9388     osd->get_tid(),
9389     info.last_complete,
9390     true,
9391     r);
9392   repop->v = version;
9393
9394   repop->start = ceph_clock_now();
9395
9396   repop_queue.push_back(&repop->queue_item);
9397
9398   osd->logger->inc(l_osd_op_wip);
9399
9400   dout(10) << __func__ << ": " << *repop << dendl;
9401   return boost::intrusive_ptr<RepGather>(repop);
9402 }
9403
9404 void PrimaryLogPG::remove_repop(RepGather *repop)
9405 {
9406   dout(20) << __func__ << " " << *repop << dendl;
9407
9408   for (auto p = repop->on_finish.begin();
9409        p != repop->on_finish.end();
9410        repop->on_finish.erase(p++)) {
9411     (*p)();
9412   }
9413
9414   release_object_locks(
9415     repop->lock_manager);
9416   repop->put();
9417
9418   osd->logger->dec(l_osd_op_wip);
9419 }
9420
9421 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9422 {
9423   dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9424   ceph_tid_t rep_tid = osd->get_tid();
9425   osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9426   OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9427   ctx->op_t.reset(new PGTransaction());
9428   ctx->mtime = ceph_clock_now();
9429   return ctx;
9430 }
9431
9432 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9433 {
9434   RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9435   dout(20) << __func__ << " " << repop << dendl;
9436   issue_repop(repop, ctx.get());
9437   eval_repop(repop);
9438   calc_trim_to();
9439   repop->put();
9440 }
9441
9442
/**
 * Merge a set of log entries into the local log and distribute them to the
 * other actingbackfill shards.  Primary only.
 *
 * When the osdmap requires at least the Jewel release, a repop is created
 * to track the update, peers receive MOSDPGUpdateLogMissing, and the repop
 * is marked committed once every shard in waiting_on (including this one)
 * has been removed from it.  Otherwise peers receive MOSDPGLog and
 * @p _on_complete, if set, is registered on the local transaction.
 *
 * @param entries       log entries to submit
 * @param manager       lock manager, handed to the repop on the Jewel path
 * @param _on_complete  optional callback to run on completion
 * @param op            originating request, handed to the repop on the
 *                      Jewel path
 * @param r             result code, handed to the repop on the Jewel path
 */
void PrimaryLogPG::submit_log_entries(
  const mempool::osd_pglog::list<pg_log_entry_t> &entries,
  ObcLockManager &&manager,
  boost::optional<std::function<void(void)> > &&_on_complete,
  OpRequestRef op,
  int r)
{
  dout(10) << __func__ << " " << entries << dendl;
  assert(is_primary());

  // the version of the last entry becomes the new projected_last_update
  eversion_t version;
  if (!entries.empty()) {
    assert(entries.rbegin()->version >= projected_last_update);
    version = projected_last_update = entries.rbegin()->version;
  }

  // exactly one of repop / on_complete is populated, depending on the
  // required release
  boost::intrusive_ptr<RepGather> repop;
  boost::optional<std::function<void(void)> > on_complete;
  if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
    repop = new_repop(
      version,
      r,
      std::move(manager),
      std::move(op),
      std::move(_on_complete));
  } else {
    on_complete = std::move(_on_complete);
  }

  // the lambda captures its own copies of entries, repop and on_complete
  pgbackend->call_write_ordered(
    [this, entries, repop, on_complete]() {
      ObjectStore::Transaction t;
      eversion_t old_last_update = info.last_update;
      merge_new_log_entries(entries, t);


      // notify every other actingbackfill shard
      set<pg_shard_t> waiting_on;
      for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
           i != actingbackfill.end();
           ++i) {
        pg_shard_t peer(*i);
        if (peer == pg_whoami) continue;
        assert(peer_missing.count(peer));
        assert(peer_info.count(peer));
        if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
          assert(repop);
          MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
            entries,
            spg_t(info.pgid.pgid, i->shard),
            pg_whoami.shard,
            get_osdmap()->get_epoch(),
            last_peering_reset,
            repop->rep_tid);
          osd->send_message_osd_cluster(
            peer.osd, m, get_osdmap()->get_epoch());
          waiting_on.insert(peer);
        } else {
          // legacy path: ship the entries as a log covering
          // (old_last_update, info.last_update]
          MOSDPGLog *m = new MOSDPGLog(
            peer.shard, pg_whoami.shard,
            info.last_update.epoch,
            info);
          m->log.log = entries;
          m->log.tail = old_last_update;
          m->log.head = info.last_update;
          osd->send_message_osd_cluster(
            peer.osd, m, get_osdmap()->get_epoch());
        }
      }
      if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
        ceph_tid_t rep_tid = repop->rep_tid;
        // we wait on ourselves too; the local entry is removed by
        // OnComplete below when the transaction commits
        waiting_on.insert(pg_whoami);
        // NOTE(review): the lambda is not mutable, so std::move(repop)
        // presumably copies rather than moves -- confirm if it matters
        log_entry_update_waiting_on.insert(
          make_pair(
            rep_tid,
            LogUpdateCtx{std::move(repop), std::move(waiting_on)}
            ));
        // commit callback: drop pg_whoami from waiting_on and, if nothing
        // is left, mark the repop committed
        struct OnComplete : public Context {
          PrimaryLogPGRef pg;
          ceph_tid_t rep_tid;
          epoch_t epoch;
          OnComplete(
            PrimaryLogPGRef pg,
            ceph_tid_t rep_tid,
            epoch_t epoch)
            : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
          void finish(int) override {
            pg->lock();
            // skipped entirely if the PG has reset since it was queued
            if (!pg->pg_has_reset_since(epoch)) {
              auto it = pg->log_entry_update_waiting_on.find(rep_tid);
              assert(it != pg->log_entry_update_waiting_on.end());
              auto it2 = it->second.waiting_on.find(pg->pg_whoami);
              assert(it2 != it->second.waiting_on.end());
              it->second.waiting_on.erase(it2);
              if (it->second.waiting_on.empty()) {
                pg->repop_all_committed(it->second.repop.get());
                pg->log_entry_update_waiting_on.erase(it);
              }
            }
            pg->unlock();
          }
        };
        t.register_on_commit(
          new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
      } else {
        if (on_complete) {
          // legacy path: run the caller's callback under the PG lock,
          // unless the PG has reset since it was queued
          struct OnComplete : public Context {
            PrimaryLogPGRef pg;
            std::function<void(void)> on_complete;
            epoch_t epoch;
            OnComplete(
              PrimaryLogPGRef pg,
              const std::function<void(void)> &on_complete,
              epoch_t epoch)
              : pg(pg),
                on_complete(std::move(on_complete)),
                epoch(epoch) {}
            void finish(int) override {
              pg->lock();
              if (!pg->pg_has_reset_since(epoch))
                on_complete();
              pg->unlock();
            }
          };
          t.register_on_complete(
            new OnComplete{
              this, *on_complete, get_osdmap()->get_epoch()
                });
        }
      }
      t.register_on_applied(
        new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
      int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
      assert(r == 0);
    });
}
9578
9579 void PrimaryLogPG::cancel_log_updates()
9580 {
9581   // get rid of all the LogUpdateCtx so their references to repops are
9582   // dropped
9583   log_entry_update_waiting_on.clear();
9584 }
9585
9586 // -------------------------------------------------------
9587
9588 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9589 {
9590   pair<hobject_t, ObjectContextRef> i;
9591   while (object_contexts.get_next(i.first, &i)) {
9592     ObjectContextRef obc(i.second);
9593     get_obc_watchers(obc, pg_watchers);
9594   }
9595 }
9596
9597 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9598 {
9599   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9600          obc->watchers.begin();
9601         j != obc->watchers.end();
9602         ++j) {
9603     obj_watch_item_t owi;
9604
9605     owi.obj = obc->obs.oi.soid;
9606     owi.wi.addr = j->second->get_peer_addr();
9607     owi.wi.name = j->second->get_entity();
9608     owi.wi.cookie = j->second->get_cookie();
9609     owi.wi.timeout_seconds = j->second->get_timeout();
9610
9611     dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9612       << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9613
9614     pg_watchers.push_back(owi);
9615   }
9616 }
9617
9618 void PrimaryLogPG::check_blacklisted_watchers()
9619 {
9620   dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9621   pair<hobject_t, ObjectContextRef> i;
9622   while (object_contexts.get_next(i.first, &i))
9623     check_blacklisted_obc_watchers(i.second);
9624 }
9625
9626 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9627 {
9628   dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9629   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9630          obc->watchers.begin();
9631         k != obc->watchers.end();
9632         ) {
9633     //Advance iterator now so handle_watch_timeout() can erase element
9634     map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9635     dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9636     entity_addr_t ea = j->second->get_peer_addr();
9637     dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9638     if (get_osdmap()->is_blacklisted(ea)) {
9639       dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9640       assert(j->second->get_pg() == this);
9641       j->second->unregister_cb();
9642       handle_watch_timeout(j->second);
9643     }
9644   }
9645 }
9646
9647 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9648 {
9649   assert(is_active());
9650   assert((recovering.count(obc->obs.oi.soid) ||
9651           !is_missing_object(obc->obs.oi.soid)) ||
9652          (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9653           pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9654             pg_log_entry_t::LOST_REVERT &&
9655           pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9656             obc->obs.oi.version));
9657
9658   dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9659   assert(obc->watchers.empty());
9660   // populate unconnected_watchers
9661   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9662         obc->obs.oi.watchers.begin();
9663        p != obc->obs.oi.watchers.end();
9664        ++p) {
9665     utime_t expire = info.stats.last_became_active;
9666     expire += p->second.timeout_seconds;
9667     dout(10) << "  unconnected watcher " << p->first << " will expire " << expire << dendl;
9668     WatchRef watch(
9669       Watch::makeWatchRef(
9670         this, osd, obc, p->second.timeout_seconds, p->first.first,
9671         p->first.second, p->second.addr));
9672     watch->disconnect();
9673     obc->watchers.insert(
9674       make_pair(
9675         make_pair(p->first.first, p->first.second),
9676         watch));
9677   }
9678   // Look for watchers from blacklisted clients and drop
9679   check_blacklisted_obc_watchers(obc);
9680 }
9681
9682 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9683 {
9684   ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9685   dout(10) << "handle_watch_timeout obc " << obc << dendl;
9686
9687   if (!is_active()) {
9688     dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9689     return;
9690   }
9691   if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9692     callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9693       watch->get_delayed_cb()
9694       );
9695     dout(10) << "handle_watch_timeout waiting for degraded on obj "
9696              << obc->obs.oi.soid
9697              << dendl;
9698     return;
9699   }
9700
9701   if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9702     dout(10) << "handle_watch_timeout waiting for scrub on obj "
9703              << obc->obs.oi.soid
9704              << dendl;
9705     scrubber.add_callback(
9706       watch->get_delayed_cb() // This callback!
9707       );
9708     return;
9709   }
9710
9711   OpContextUPtr ctx = simple_opc_create(obc);
9712   ctx->at_version = get_next_version();
9713
9714   object_info_t& oi = ctx->new_obs.oi;
9715   oi.watchers.erase(make_pair(watch->get_cookie(),
9716                               watch->get_entity()));
9717
9718   list<watch_disconnect_t> watch_disconnects = {
9719     watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9720   };
9721   ctx->register_on_success(
9722     [this, obc, watch_disconnects]() {
9723       complete_disconnect_watches(obc, watch_disconnects);
9724     });
9725
9726
9727   PGTransaction *t = ctx->op_t.get();
9728   ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9729                                     ctx->at_version,
9730                                     oi.version,
9731                                     0,
9732                                     osd_reqid_t(), ctx->mtime, 0));
9733
9734   oi.prior_version = obc->obs.oi.version;
9735   oi.version = ctx->at_version;
9736   bufferlist bl;
9737   ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9738   t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9739
9740   // apply new object state.
9741   ctx->obc->obs = ctx->new_obs;
9742
9743   // no ctx->delta_stats
9744   simple_opc_submit(std::move(ctx));
9745 }
9746
9747 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9748                                                      SnapSetContext *ssc)
9749 {
9750   ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9751   assert(obc->destructor_callback == NULL);
9752   obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9753   obc->obs.oi = oi;
9754   obc->obs.exists = false;
9755   obc->ssc = ssc;
9756   if (ssc)
9757     register_snapset_context(ssc);
9758   dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9759   if (is_active())
9760     populate_obc_watchers(obc);
9761   return obc;
9762 }
9763
9764 ObjectContextRef PrimaryLogPG::get_object_context(
9765   const hobject_t& soid,
9766   bool can_create,
9767   const map<string, bufferlist> *attrs)
9768 {
9769   assert(
9770     attrs || !pg_log.get_missing().is_missing(soid) ||
9771     // or this is a revert... see recover_primary()
9772     (pg_log.get_log().objects.count(soid) &&
9773       pg_log.get_log().objects.find(soid)->second->op ==
9774       pg_log_entry_t::LOST_REVERT));
9775   ObjectContextRef obc = object_contexts.lookup(soid);
9776   osd->logger->inc(l_osd_object_ctx_cache_total);
9777   if (obc) {
9778     osd->logger->inc(l_osd_object_ctx_cache_hit);
9779     dout(10) << __func__ << ": found obc in cache: " << obc
9780              << dendl;
9781   } else {
9782     dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9783     // check disk
9784     bufferlist bv;
9785     if (attrs) {
9786       assert(attrs->count(OI_ATTR));
9787       bv = attrs->find(OI_ATTR)->second;
9788     } else {
9789       int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9790       if (r < 0) {
9791         if (!can_create) {
9792           dout(10) << __func__ << ": no obc for soid "
9793                    << soid << " and !can_create"
9794                    << dendl;
9795           return ObjectContextRef();   // -ENOENT!
9796         }
9797
9798         dout(10) << __func__ << ": no obc for soid "
9799                  << soid << " but can_create"
9800                  << dendl;
9801         // new object.
9802         object_info_t oi(soid);
9803         SnapSetContext *ssc = get_snapset_context(
9804           soid, true, 0, false);
9805         assert(ssc);
9806         obc = create_object_context(oi, ssc);
9807         dout(10) << __func__ << ": " << obc << " " << soid
9808                  << " " << obc->rwstate
9809                  << " oi: " << obc->obs.oi
9810                  << " ssc: " << obc->ssc
9811                  << " snapset: " << obc->ssc->snapset << dendl;
9812         return obc;
9813       }
9814     }
9815
9816     object_info_t oi;
9817     try {
9818       bufferlist::iterator bliter = bv.begin();
9819       ::decode(oi, bliter);
9820     } catch (...) {
9821       dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9822       return ObjectContextRef();   // -ENOENT!
9823     }
9824
9825     assert(oi.soid.pool == (int64_t)info.pgid.pool());
9826
9827     obc = object_contexts.lookup_or_create(oi.soid);
9828     obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9829     obc->obs.oi = oi;
9830     obc->obs.exists = true;
9831
9832     obc->ssc = get_snapset_context(
9833       soid, true,
9834       soid.has_snapset() ? attrs : 0);
9835
9836     if (is_active())
9837       populate_obc_watchers(obc);
9838
9839     if (pool.info.require_rollback()) {
9840       if (attrs) {
9841         obc->attr_cache = *attrs;
9842       } else {
9843         int r = pgbackend->objects_get_attrs(
9844           soid,
9845           &obc->attr_cache);
9846         assert(r == 0);
9847       }
9848     }
9849
9850     dout(10) << __func__ << ": creating obc from disk: " << obc
9851              << dendl;
9852   }
9853
9854   // XXX: Caller doesn't expect this
9855   if (obc->ssc == NULL) {
9856     derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
9857     return ObjectContextRef();   // -ENOENT!
9858   }
9859
9860   dout(10) << __func__ << ": " << obc << " " << soid
9861            << " " << obc->rwstate
9862            << " oi: " << obc->obs.oi
9863            << " exists: " << (int)obc->obs.exists
9864            << " ssc: " << obc->ssc
9865            << " snapset: " << obc->ssc->snapset << dendl;
9866   return obc;
9867 }
9868
9869 void PrimaryLogPG::context_registry_on_change()
9870 {
9871   pair<hobject_t, ObjectContextRef> i;
9872   while (object_contexts.get_next(i.first, &i)) {
9873     ObjectContextRef obc(i.second);
9874     if (obc) {
9875       for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9876              obc->watchers.begin();
9877            j != obc->watchers.end();
9878            obc->watchers.erase(j++)) {
9879         j->second->discard();
9880       }
9881     }
9882   }
9883 }
9884
9885
9886 /*
9887  * If we return an error, and set *pmissing, then promoting that
9888  * object may help.
9889  *
9890  * If we return -EAGAIN, we will always set *pmissing to the missing
9891  * object to wait for.
9892  *
9893  * If we return an error but do not set *pmissing, then we know the
9894  * object does not exist.
9895  */
9896 int PrimaryLogPG::find_object_context(const hobject_t& oid,
9897                                       ObjectContextRef *pobc,
9898                                       bool can_create,
9899                                       bool map_snapid_to_clone,
9900                                       hobject_t *pmissing)
9901 {
9902   FUNCTRACE();
9903   assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9904   // want the head?
9905   if (oid.snap == CEPH_NOSNAP) {
9906     ObjectContextRef obc = get_object_context(oid, can_create);
9907     if (!obc) {
9908       if (pmissing)
9909         *pmissing = oid;
9910       return -ENOENT;
9911     }
9912     dout(10) << "find_object_context " << oid
9913        << " @" << oid.snap
9914        << " oi=" << obc->obs.oi
9915        << dendl;
9916     *pobc = obc;
9917
9918     return 0;
9919   }
9920
9921   hobject_t head = oid.get_head();
9922
9923   // want the snapdir?
9924   if (oid.snap == CEPH_SNAPDIR) {
9925     // return head or snapdir, whichever exists.
9926     ObjectContextRef headobc = get_object_context(head, can_create);
9927     ObjectContextRef obc = headobc;
9928     if (!obc || !obc->obs.exists)
9929       obc = get_object_context(oid, can_create);
9930     if (!obc || !obc->obs.exists) {
9931       // if we have neither, we would want to promote the head.
9932       if (pmissing)
9933         *pmissing = head;
9934       if (pobc)
9935         *pobc = headobc; // may be null
9936       return -ENOENT;
9937     }
9938     dout(10) << "find_object_context " << oid
9939              << " @" << oid.snap
9940              << " oi=" << obc->obs.oi
9941              << dendl;
9942     *pobc = obc;
9943
9944     // always populate ssc for SNAPDIR...
9945     if (!obc->ssc)
9946       obc->ssc = get_snapset_context(
9947         oid, true);
9948     return 0;
9949   }
9950
9951   // we want a snap
9952   if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9953     dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9954     return -ENOENT;
9955   }
9956
9957   SnapSetContext *ssc = get_snapset_context(oid, can_create);
9958   if (!ssc || !(ssc->exists || can_create)) {
9959     dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9960     if (pmissing)
9961       *pmissing = head;  // start by getting the head
9962     if (ssc)
9963       put_snapset_context(ssc);
9964     return -ENOENT;
9965   }
9966
9967   if (map_snapid_to_clone) {
9968     dout(10) << "find_object_context " << oid << " @" << oid.snap
9969              << " snapset " << ssc->snapset
9970              << " map_snapid_to_clone=true" << dendl;
9971     if (oid.snap > ssc->snapset.seq) {
9972       // already must be readable
9973       ObjectContextRef obc = get_object_context(head, false);
9974       dout(10) << "find_object_context " << oid << " @" << oid.snap
9975                << " snapset " << ssc->snapset
9976                << " maps to head" << dendl;
9977       *pobc = obc;
9978       put_snapset_context(ssc);
9979       return (obc && obc->obs.exists) ? 0 : -ENOENT;
9980     } else {
9981       vector<snapid_t>::const_iterator citer = std::find(
9982         ssc->snapset.clones.begin(),
9983         ssc->snapset.clones.end(),
9984         oid.snap);
9985       if (citer == ssc->snapset.clones.end()) {
9986         dout(10) << "find_object_context " << oid << " @" << oid.snap
9987                  << " snapset " << ssc->snapset
9988                  << " maps to nothing" << dendl;
9989         put_snapset_context(ssc);
9990         return -ENOENT;
9991       }
9992
9993       dout(10) << "find_object_context " << oid << " @" << oid.snap
9994                << " snapset " << ssc->snapset
9995                << " maps to " << oid << dendl;
9996
9997       if (pg_log.get_missing().is_missing(oid)) {
9998         dout(10) << "find_object_context " << oid << " @" << oid.snap
9999                  << " snapset " << ssc->snapset
10000                  << " " << oid << " is missing" << dendl;
10001         if (pmissing)
10002           *pmissing = oid;
10003         put_snapset_context(ssc);
10004         return -EAGAIN;
10005       }
10006
10007       ObjectContextRef obc = get_object_context(oid, false);
10008       if (!obc || !obc->obs.exists) {
10009         dout(10) << "find_object_context " << oid << " @" << oid.snap
10010                  << " snapset " << ssc->snapset
10011                  << " " << oid << " is not present" << dendl;
10012         if (pmissing)
10013           *pmissing = oid;
10014         put_snapset_context(ssc);
10015         return -ENOENT;
10016       }
10017       dout(10) << "find_object_context " << oid << " @" << oid.snap
10018                << " snapset " << ssc->snapset
10019                << " " << oid << " HIT" << dendl;
10020       *pobc = obc;
10021       put_snapset_context(ssc);
10022       return 0;
10023     }
10024     ceph_abort(); //unreachable
10025   }
10026
10027   dout(10) << "find_object_context " << oid << " @" << oid.snap
10028            << " snapset " << ssc->snapset << dendl;
10029
10030   // head?
10031   if (oid.snap > ssc->snapset.seq) {
10032     if (ssc->snapset.head_exists) {
10033       ObjectContextRef obc = get_object_context(head, false);
10034       dout(10) << "find_object_context  " << head
10035                << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10036                << " -- HIT " << obc->obs
10037                << dendl;
10038       if (!obc->ssc)
10039         obc->ssc = ssc;
10040       else {
10041         assert(ssc == obc->ssc);
10042         put_snapset_context(ssc);
10043       }
10044       *pobc = obc;
10045       return 0;
10046     }
10047     dout(10) << "find_object_context  " << head
10048              << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
10049              << " but head dne -- DNE"
10050              << dendl;
10051     put_snapset_context(ssc);
10052     return -ENOENT;
10053   }
10054
10055   // which clone would it be?
10056   unsigned k = 0;
10057   while (k < ssc->snapset.clones.size() &&
10058          ssc->snapset.clones[k] < oid.snap)
10059     k++;
10060   if (k == ssc->snapset.clones.size()) {
10061     dout(10) << "find_object_context  no clones with last >= oid.snap "
10062              << oid.snap << " -- DNE" << dendl;
10063     put_snapset_context(ssc);
10064     return -ENOENT;
10065   }
10066   hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
10067                  info.pgid.pool(), oid.get_namespace());
10068
10069   if (pg_log.get_missing().is_missing(soid)) {
10070     dout(20) << "find_object_context  " << soid << " missing, try again later"
10071              << dendl;
10072     if (pmissing)
10073       *pmissing = soid;
10074     put_snapset_context(ssc);
10075     return -EAGAIN;
10076   }
10077
10078   ObjectContextRef obc = get_object_context(soid, false);
10079   if (!obc || !obc->obs.exists) {
10080     if (pmissing)
10081       *pmissing = soid;
10082     put_snapset_context(ssc);
10083     if (is_degraded_or_backfilling_object(soid)) {
10084       dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
10085       return -EAGAIN;
10086     } else {
10087       dout(20) << __func__ << " missing clone " << soid << dendl;
10088       return -ENOENT;
10089     }
10090   }
10091
10092   if (!obc->ssc) {
10093     obc->ssc = ssc;
10094   } else {
10095     assert(obc->ssc == ssc);
10096     put_snapset_context(ssc);
10097   }
10098   ssc = 0;
10099
10100   // clone
10101   dout(20) << "find_object_context  " << soid
10102            << " snapset " << obc->ssc->snapset
10103            << " legacy_snaps " << obc->obs.oi.legacy_snaps
10104            << dendl;
10105   snapid_t first, last;
10106   if (obc->ssc->snapset.is_legacy()) {
10107     first = obc->obs.oi.legacy_snaps.back();
10108     last = obc->obs.oi.legacy_snaps.front();
10109   } else {
10110     auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
10111     assert(p != obc->ssc->snapset.clone_snaps.end());
10112     first = p->second.back();
10113     last = p->second.front();
10114   }
10115   if (first <= oid.snap) {
10116     dout(20) << "find_object_context  " << soid << " [" << first << "," << last
10117              << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
10118     *pobc = obc;
10119     return 0;
10120   } else {
10121     dout(20) << "find_object_context  " << soid << " [" << first << "," << last
10122              << "] does not contain " << oid.snap << " -- DNE" << dendl;
10123     return -ENOENT;
10124   }
10125 }
10126
10127 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10128 {
10129   if (obc->ssc)
10130     put_snapset_context(obc->ssc);
10131 }
10132
10133 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10134 {
10135   object_info_t& oi = obc->obs.oi;
10136
10137   dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10138   object_stat_sum_t stat;
10139
10140   stat.num_bytes += oi.size;
10141
10142   if (oi.soid.snap != CEPH_SNAPDIR)
10143     stat.num_objects++;
10144   if (oi.is_dirty())
10145     stat.num_objects_dirty++;
10146   if (oi.is_whiteout())
10147     stat.num_whiteouts++;
10148   if (oi.is_omap())
10149     stat.num_objects_omap++;
10150   if (oi.is_cache_pinned())
10151     stat.num_objects_pinned++;
10152
10153   if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10154     stat.num_object_clones++;
10155
10156     if (!obc->ssc)
10157       obc->ssc = get_snapset_context(oi.soid, false);
10158     assert(obc->ssc);
10159
10160     // subtract off clone overlap
10161     if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10162       interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10163       for (interval_set<uint64_t>::const_iterator r = o.begin();
10164            r != o.end();
10165            ++r) {
10166         stat.num_bytes -= r.get_len();
10167       }
10168     }
10169   }
10170
10171   // add it in
10172   pgstat->stats.sum.add(stat);
10173 }
10174
10175 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10176 {
10177   const hobject_t& soid = obc->obs.oi.soid;
10178   if (obc->is_blocked()) {
10179     dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10180     return;
10181   }
10182
10183   map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10184   if (p != waiting_for_blocked_object.end()) {
10185     list<OpRequestRef>& ls = p->second;
10186     dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10187     requeue_ops(ls);
10188     waiting_for_blocked_object.erase(p);
10189   }
10190
10191   map<hobject_t, ObjectContextRef>::iterator i =
10192     objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10193   if (i != objects_blocked_on_snap_promotion.end()) {
10194     assert(i->second == obc);
10195     objects_blocked_on_snap_promotion.erase(i);
10196   }
10197
10198   if (obc->requeue_scrub_on_unblock) {
10199     obc->requeue_scrub_on_unblock = false;
10200     requeue_scrub();
10201   }
10202 }
10203
10204 SnapSetContext *PrimaryLogPG::get_snapset_context(
10205   const hobject_t& oid,
10206   bool can_create,
10207   const map<string, bufferlist> *attrs,
10208   bool oid_existed)
10209 {
10210   Mutex::Locker l(snapset_contexts_lock);
10211   SnapSetContext *ssc;
10212   map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10213     oid.get_snapdir());
10214   if (p != snapset_contexts.end()) {
10215     if (can_create || p->second->exists) {
10216       ssc = p->second;
10217     } else {
10218       return NULL;
10219     }
10220   } else {
10221     bufferlist bv;
10222     if (!attrs) {
10223       int r = -ENOENT;
10224       if (!(oid.is_head() && !oid_existed))
10225         r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10226       if (r < 0) {
10227         // try _snapset
10228         if (!(oid.is_snapdir() && !oid_existed))
10229           r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10230         if (r < 0 && !can_create)
10231           return NULL;
10232       }
10233     } else {
10234       assert(attrs->count(SS_ATTR));
10235       bv = attrs->find(SS_ATTR)->second;
10236     }
10237     ssc = new SnapSetContext(oid.get_snapdir());
10238     _register_snapset_context(ssc);
10239     if (bv.length()) {
10240       bufferlist::iterator bvp = bv.begin();
10241       try {
10242         ssc->snapset.decode(bvp);
10243       } catch (buffer::error& e) {
10244         dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10245         return NULL;
10246       }
10247       ssc->exists = true;
10248     } else {
10249       ssc->exists = false;
10250     }
10251   }
10252   assert(ssc);
10253   ssc->ref++;
10254   return ssc;
10255 }
10256
10257 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10258 {
10259   Mutex::Locker l(snapset_contexts_lock);
10260   --ssc->ref;
10261   if (ssc->ref == 0) {
10262     if (ssc->registered)
10263       snapset_contexts.erase(ssc->oid);
10264     delete ssc;
10265   }
10266 }
10267
/** pull - request object from a peer
 */

/*
 * Result codes used when pulling a missing object:
 *  PULL_NONE  - didn't pull anything
 *  PULL_OTHER - needed to pull something else first (_head or _snapdir)
 *  PULL_YES   - pulled what the caller wanted
 */
enum {
  PULL_NONE,
  PULL_OTHER,
  PULL_YES
};
10278
10279 int PrimaryLogPG::recover_missing(
10280   const hobject_t &soid, eversion_t v,
10281   int priority,
10282   PGBackend::RecoveryHandle *h)
10283 {
10284   if (missing_loc.is_unfound(soid)) {
10285     dout(7) << "pull " << soid
10286             << " v " << v
10287             << " but it is unfound" << dendl;
10288     return PULL_NONE;
10289   }
10290
10291   if (missing_loc.is_deleted(soid)) {
10292     start_recovery_op(soid);
10293     assert(!recovering.count(soid));
10294     recovering.insert(make_pair(soid, ObjectContextRef()));
10295     epoch_t cur_epoch = get_osdmap()->get_epoch();
10296     remove_missing_object(soid, v, new FunctionContext(
10297      [=](int) {
10298        lock();
10299        if (!pg_has_reset_since(cur_epoch)) {
10300          bool object_missing = false;
10301          for (const auto& shard : actingbackfill) {
10302            if (shard == pg_whoami)
10303              continue;
10304            if (peer_missing[shard].is_missing(soid)) {
10305              dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10306              object_missing = true;
10307              break;
10308            }
10309          }
10310          if (!object_missing) {
10311            object_stat_sum_t stat_diff;
10312            stat_diff.num_objects_recovered = 1;
10313            on_global_recover(soid, stat_diff, true);
10314          } else {
10315            auto recovery_handle = pgbackend->open_recovery_op();
10316            pgbackend->recover_delete_object(soid, v, recovery_handle);
10317            pgbackend->run_recovery_op(recovery_handle, priority);
10318          }
10319        }
10320        unlock();
10321      }));
10322     return PULL_YES;
10323   }
10324
10325   // is this a snapped object?  if so, consult the snapset.. we may not need the entire object!
10326   ObjectContextRef obc;
10327   ObjectContextRef head_obc;
10328   if (soid.snap && soid.snap < CEPH_NOSNAP) {
10329     // do we have the head and/or snapdir?
10330     hobject_t head = soid.get_head();
10331     if (pg_log.get_missing().is_missing(head)) {
10332       if (recovering.count(head)) {
10333         dout(10) << " missing but already recovering head " << head << dendl;
10334         return PULL_NONE;
10335       } else {
10336         int r = recover_missing(
10337           head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10338           h);
10339         if (r != PULL_NONE)
10340           return PULL_OTHER;
10341         return PULL_NONE;
10342       }
10343     }
10344     head = soid.get_snapdir();
10345     if (pg_log.get_missing().is_missing(head)) {
10346       if (recovering.count(head)) {
10347         dout(10) << " missing but already recovering snapdir " << head << dendl;
10348         return PULL_NONE;
10349       } else {
10350         int r = recover_missing(
10351           head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10352           h);
10353         if (r != PULL_NONE)
10354           return PULL_OTHER;
10355         return PULL_NONE;
10356       }
10357     }
10358
10359     // we must have one or the other
10360     head_obc = get_object_context(
10361       soid.get_head(),
10362       false,
10363       0);
10364     if (!head_obc)
10365       head_obc = get_object_context(
10366         soid.get_snapdir(),
10367         false,
10368         0);
10369     assert(head_obc);
10370   }
10371   start_recovery_op(soid);
10372   assert(!recovering.count(soid));
10373   recovering.insert(make_pair(soid, obc));
10374   int r = pgbackend->recover_object(
10375     soid,
10376     v,
10377     head_obc,
10378     obc,
10379     h);
10380   // This is only a pull which shouldn't return an error
10381   assert(r >= 0);
10382   return PULL_YES;
10383 }
10384
10385 void PrimaryLogPG::send_remove_op(
10386   const hobject_t& oid, eversion_t v, pg_shard_t peer)
10387 {
10388   ceph_tid_t tid = osd->get_tid();
10389   osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10390
10391   dout(10) << "send_remove_op " << oid << " from osd." << peer
10392            << " tid " << tid << dendl;
10393
10394   MOSDSubOp *subop = new MOSDSubOp(
10395     rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10396     oid, CEPH_OSD_FLAG_ACK,
10397     get_osdmap()->get_epoch(), tid, v);
10398   subop->ops = vector<OSDOp>(1);
10399   subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10400
10401   osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10402 }
10403
10404 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10405                                          eversion_t v, Context *on_complete)
10406 {
10407   dout(20) << __func__ << " " << soid << " " << v << dendl;
10408   assert(on_complete != nullptr);
10409   // delete locally
10410   ObjectStore::Transaction t;
10411   remove_snap_mapped_object(t, soid);
10412
10413   ObjectRecoveryInfo recovery_info;
10414   recovery_info.soid = soid;
10415   recovery_info.version = v;
10416
10417   epoch_t cur_epoch = get_osdmap()->get_epoch();
10418   t.register_on_complete(new FunctionContext(
10419      [=](int) {
10420        lock();
10421        if (!pg_has_reset_since(cur_epoch)) {
10422          ObjectStore::Transaction t2;
10423          on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10424          t2.register_on_complete(on_complete);
10425          int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10426          assert(r == 0);
10427          unlock();
10428        } else {
10429          unlock();
10430          on_complete->complete(-EAGAIN);
10431        }
10432      }));
10433   int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10434   assert(r == 0);
10435 }
10436
10437 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10438 {
10439   dout(10) << "finish_degraded_object " << oid << dendl;
10440   if (callbacks_for_degraded_object.count(oid)) {
10441     list<Context*> contexts;
10442     contexts.swap(callbacks_for_degraded_object[oid]);
10443     callbacks_for_degraded_object.erase(oid);
10444     for (list<Context*>::iterator i = contexts.begin();
10445          i != contexts.end();
10446          ++i) {
10447       (*i)->complete(0);
10448     }
10449   }
10450   map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10451     oid.get_head());
10452   if (i != objects_blocked_on_degraded_snap.end() &&
10453       i->second == oid.snap)
10454     objects_blocked_on_degraded_snap.erase(i);
10455 }
10456
10457 void PrimaryLogPG::_committed_pushed_object(
10458   epoch_t epoch, eversion_t last_complete)
10459 {
10460   lock();
10461   if (!pg_has_reset_since(epoch)) {
10462     dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10463     last_complete_ondisk = last_complete;
10464
10465     if (last_complete_ondisk == info.last_update) {
10466       if (!is_primary()) {
10467         // Either we are a replica or backfill target.
10468         // we are fully up to date.  tell the primary!
10469         osd->send_message_osd_cluster(
10470           get_primary().osd,
10471           new MOSDPGTrim(
10472             get_osdmap()->get_epoch(),
10473             spg_t(info.pgid.pgid, get_primary().shard),
10474             last_complete_ondisk),
10475           get_osdmap()->get_epoch());
10476       } else {
10477         calc_min_last_complete_ondisk();
10478       }
10479     }
10480
10481   } else {
10482     dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10483   }
10484
10485   unlock();
10486 }
10487
10488 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10489 {
10490   lock();
10491   dout(20) << __func__ << dendl;
10492   if (obc) {
10493     dout(20) << "obc = " << *obc << dendl;
10494   }
10495   assert(active_pushes >= 1);
10496   --active_pushes;
10497
10498   // requeue an active chunky scrub waiting on recovery ops
10499   if (!deleting && active_pushes == 0
10500       && scrubber.is_chunky_scrub_active()) {
10501     if (ops_blocked_by_scrub()) {
10502       requeue_scrub(true);
10503     } else {
10504       requeue_scrub(false);
10505     }
10506   }
10507   unlock();
10508 }
10509
10510 void PrimaryLogPG::_applied_recovered_object_replica()
10511 {
10512   lock();
10513   dout(20) << __func__ << dendl;
10514   assert(active_pushes >= 1);
10515   --active_pushes;
10516
10517   // requeue an active chunky scrub waiting on recovery ops
10518   if (!deleting && active_pushes == 0 &&
10519       scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10520         scrubber.active_rep_scrub->get_req())->chunky) {
10521     osd->enqueue_back(
10522       info.pgid,
10523       PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10524     scrubber.active_rep_scrub = OpRequestRef();
10525   }
10526   unlock();
10527 }
10528
10529 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10530 {
10531   dout(10) << "got missing " << oid << " v " << v << dendl;
10532   pg_log.recover_got(oid, v, info);
10533   if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10534     dout(10) << "last_complete now " << info.last_complete
10535              << " log.complete_to " << pg_log.get_log().complete_to->version
10536              << dendl;
10537   } else {
10538     dout(10) << "last_complete now " << info.last_complete
10539              << " log.complete_to at end" << dendl;
10540     //below is not true in the repair case.
10541     //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
10542     assert(info.last_complete == info.last_update);
10543   }
10544 }
10545
10546 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10547 {
10548   list<pg_shard_t> fl = { pg_whoami };
10549   failed_push(fl, soid);
10550 }
10551
10552 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10553 {
10554   dout(20) << __func__ << ": " << soid << dendl;
10555   assert(recovering.count(soid));
10556   auto obc = recovering[soid];
10557   if (obc) {
10558     list<OpRequestRef> blocked_ops;
10559     obc->drop_recovery_read(&blocked_ops);
10560     requeue_ops(blocked_ops);
10561   }
10562   recovering.erase(soid);
10563   for (auto&& i : from)
10564     missing_loc.remove_location(soid, i);
10565   dout(0) << __func__ << " " << soid << " from shard " << from
10566           << ", reps on " << missing_loc.get_locations(soid)
10567           << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10568   finish_recovery_op(soid);  // close out this attempt,
10569 }
10570
10571 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10572 {
10573   const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10574   assert(m->get_type() == MSG_OSD_SUBOP);
10575   dout(7) << "sub_op_remove " << m->poid << dendl;
10576
10577   op->mark_started();
10578
10579   ObjectStore::Transaction t;
10580   remove_snap_mapped_object(t, m->poid);
10581   int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10582   assert(r == 0);
10583 }
10584
10585 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10586 {
10587   eversion_t v;
10588   pg_missing_item pmi;
10589   bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10590   assert(is_missing);
10591   v = pmi.have;
10592   dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10593
10594   assert(!actingbackfill.empty());
10595   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10596        i != actingbackfill.end();
10597        ++i) {
10598     if (*i == get_primary()) continue;
10599     pg_shard_t peer = *i;
10600     if (!peer_missing[peer].is_missing(oid)) {
10601       continue;
10602     }
10603     eversion_t h = peer_missing[peer].get_items().at(oid).have;
10604     dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10605     if (h > v)
10606       v = h;
10607   }
10608
10609   dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10610   return v;
10611 }
10612
10613 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10614 {
10615   const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10616     op->get_req());
10617   assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10618   ObjectStore::Transaction t;
10619   append_log_entries_update_missing(m->entries, t);
10620
10621   Context *complete = new FunctionContext(
10622     [=](int) {
10623       const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10624         op->get_req());
10625       lock();
10626       if (!pg_has_reset_since(msg->get_epoch())) {
10627         MOSDPGUpdateLogMissingReply *reply =
10628           new MOSDPGUpdateLogMissingReply(
10629             spg_t(info.pgid.pgid, primary_shard().shard),
10630             pg_whoami.shard,
10631             msg->get_epoch(),
10632             msg->min_epoch,
10633             msg->get_tid());
10634         reply->set_priority(CEPH_MSG_PRIO_HIGH);
10635         msg->get_connection()->send_message(reply);
10636       }
10637       unlock();
10638     });
10639
10640   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10641     t.register_on_commit(complete);
10642   } else {
10643     /* Hack to work around the fact that ReplicatedBackend sends
10644      * ack+commit if commit happens first
10645      *
10646      * This behavior is no longer necessary, but we preserve it so old
10647      * primaries can keep their repops in order */
10648     if (pool.info.ec_pool()) {
10649       t.register_on_complete(complete);
10650     } else {
10651       t.register_on_commit(complete);
10652     }
10653   }
10654   t.register_on_applied(
10655     new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10656   int tr = osd->store->queue_transaction(
10657     osr.get(),
10658     std::move(t),
10659     nullptr);
10660   assert(tr == 0);
10661 }
10662
10663 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10664 {
10665   const MOSDPGUpdateLogMissingReply *m =
10666     static_cast<const MOSDPGUpdateLogMissingReply*>(
10667     op->get_req());
10668   dout(20) << __func__ << " got reply from "
10669            << m->get_from() << dendl;
10670
10671   auto it = log_entry_update_waiting_on.find(m->get_tid());
10672   if (it != log_entry_update_waiting_on.end()) {
10673     if (it->second.waiting_on.count(m->get_from())) {
10674       it->second.waiting_on.erase(m->get_from());
10675     } else {
10676       osd->clog->error()
10677         << info.pgid << " got reply "
10678         << *m << " from shard we are not waiting for "
10679         << m->get_from();
10680     }
10681
10682     if (it->second.waiting_on.empty()) {
10683       repop_all_committed(it->second.repop.get());
10684       log_entry_update_waiting_on.erase(it);
10685     }
10686   } else {
10687     osd->clog->error()
10688       << info.pgid << " got reply "
10689       << *m << " on unknown tid " << m->get_tid();
10690   }
10691 }
10692
10693 /* Mark all unfound objects as lost.
10694  */
10695 void PrimaryLogPG::mark_all_unfound_lost(
10696   int what,
10697   ConnectionRef con,
10698   ceph_tid_t tid)
10699 {
10700   dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10701   list<hobject_t> oids;
10702
10703   dout(30) << __func__ << ": log before:\n";
10704   pg_log.get_log().print(*_dout);
10705   *_dout << dendl;
10706
10707   mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10708
10709   utime_t mtime = ceph_clock_now();
10710   map<hobject_t, pg_missing_item>::const_iterator m =
10711     missing_loc.get_needs_recovery().begin();
10712   map<hobject_t, pg_missing_item>::const_iterator mend =
10713     missing_loc.get_needs_recovery().end();
10714
10715   ObcLockManager manager;
10716   eversion_t v = get_next_version();
10717   v.epoch = get_osdmap()->get_epoch();
10718   uint64_t num_unfound = missing_loc.num_unfound();
10719   while (m != mend) {
10720     const hobject_t &oid(m->first);
10721     if (!missing_loc.is_unfound(oid)) {
10722       // We only care about unfound objects
10723       ++m;
10724       continue;
10725     }
10726
10727     ObjectContextRef obc;
10728     eversion_t prev;
10729
10730     switch (what) {
10731     case pg_log_entry_t::LOST_MARK:
10732       assert(0 == "actually, not implemented yet!");
10733       break;
10734
10735     case pg_log_entry_t::LOST_REVERT:
10736       prev = pick_newest_available(oid);
10737       if (prev > eversion_t()) {
10738         // log it
10739         pg_log_entry_t e(
10740           pg_log_entry_t::LOST_REVERT, oid, v,
10741           m->second.need, 0, osd_reqid_t(), mtime, 0);
10742         e.reverting_to = prev;
10743         e.mark_unrollbackable();
10744         log_entries.push_back(e);
10745         dout(10) << e << dendl;
10746
10747         // we are now missing the new version; recovery code will sort it out.
10748         ++v.version;
10749         ++m;
10750         break;
10751       }
10752
10753     case pg_log_entry_t::LOST_DELETE:
10754       {
10755         pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10756                          0, osd_reqid_t(), mtime, 0);
10757         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10758           if (pool.info.require_rollback()) {
10759             e.mod_desc.try_rmobject(v.version);
10760           } else {
10761             e.mark_unrollbackable();
10762           }
10763         } // otherwise, just do what we used to do
10764         dout(10) << e << dendl;
10765         log_entries.push_back(e);
10766         oids.push_back(oid);
10767
10768         ++v.version;
10769         ++m;
10770       }
10771       break;
10772
10773     default:
10774       ceph_abort();
10775     }
10776   }
10777
10778   info.stats.stats_invalid = true;
10779
10780   submit_log_entries(
10781     log_entries,
10782     std::move(manager),
10783     boost::optional<std::function<void(void)> >(
10784       [this, oids, con, num_unfound, tid]() {
10785         if (perform_deletes_during_peering()) {
10786           for (auto oid : oids) {
10787             // clear old locations - merge_new_log_entries will have
10788             // handled rebuilding missing_loc for each of these
10789             // objects if we have the RECOVERY_DELETES flag
10790             missing_loc.recovered(oid);
10791           }
10792         }
10793
10794         for (auto& p : waiting_for_unreadable_object) {
10795           release_backoffs(p.first);
10796         }
10797         requeue_object_waiters(waiting_for_unreadable_object);
10798         queue_recovery();
10799
10800         stringstream ss;
10801         ss << "pg has " << num_unfound
10802            << " objects unfound and apparently lost marking";
10803         string rs = ss.str();
10804         dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10805         osd->clog->info() << rs;
10806         if (con) {
10807           MCommandReply *reply = new MCommandReply(0, rs);
10808           reply->set_tid(tid);
10809           con->send_message(reply);
10810         }
10811       }),
10812     OpRequestRef());
10813 }
10814
/*
 * PrimaryLogPG-specific split step (presumably invoked while splitting
 * this PG into 'child' -- confirm against the caller).  None of the
 * arguments are used here; the only work is to check that no repops are
 * outstanding on this PG.
 */
void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
{
  // there must be no queued repops at this point
  assert(repop_queue.empty());
}
10819
10820 /*
10821  * pg status change notification
10822  */
10823
10824 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10825 {
10826   list<OpRequestRef> rq;
10827
10828   // apply all repops
10829   while (!repop_queue.empty()) {
10830     RepGather *repop = repop_queue.front();
10831     repop_queue.pop_front();
10832     dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10833     repop->rep_aborted = true;
10834     repop->on_applied.clear();
10835     repop->on_committed.clear();
10836     repop->on_success.clear();
10837
10838     if (requeue) {
10839       if (repop->op) {
10840         dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10841         rq.push_back(repop->op);
10842         repop->op = OpRequestRef();
10843       }
10844
10845       // also requeue any dups, interleaved into position
10846       map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10847         waiting_for_ondisk.find(repop->v);
10848       if (p != waiting_for_ondisk.end()) {
10849         dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10850         for (list<pair<OpRequestRef, version_t> >::iterator i =
10851                p->second.begin();
10852              i != p->second.end();
10853              ++i) {
10854           rq.push_back(i->first);
10855         }
10856         waiting_for_ondisk.erase(p);
10857       }
10858     }
10859
10860     remove_repop(repop);
10861   }
10862
10863   assert(repop_queue.empty());
10864
10865   if (requeue) {
10866     requeue_ops(rq);
10867     if (!waiting_for_ondisk.empty()) {
10868       for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10869              waiting_for_ondisk.begin();
10870            i != waiting_for_ondisk.end();
10871            ++i) {
10872         for (list<pair<OpRequestRef, version_t> >::iterator j =
10873                i->second.begin();
10874              j != i->second.end();
10875              ++j) {
10876           derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10877                << i->first << dendl;
10878         }
10879       }
10880       assert(waiting_for_ondisk.empty());
10881     }
10882   }
10883
10884   waiting_for_ondisk.clear();
10885 }
10886
10887 void PrimaryLogPG::on_flushed()
10888 {
10889   assert(flushes_in_progress > 0);
10890   flushes_in_progress--;
10891   if (flushes_in_progress == 0) {
10892     requeue_ops(waiting_for_peered);
10893   }
10894   if (!is_peered() || !is_primary()) {
10895     pair<hobject_t, ObjectContextRef> i;
10896     while (object_contexts.get_next(i.first, &i)) {
10897       derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10898     }
10899     assert(object_contexts.empty());
10900   }
10901   pgbackend->on_flushed();
10902 }
10903
10904 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10905 {
10906   dout(10) << "on_removal" << dendl;
10907
10908   // adjust info to backfill
10909   info.set_last_backfill(hobject_t());
10910   pg_log.reset_backfill();
10911   dirty_info = true;
10912
10913
10914   // clear log
10915   PGLogEntryHandler rollbacker{this, t};
10916   pg_log.roll_forward(&rollbacker);
10917
10918   write_if_dirty(*t);
10919
10920   if (!deleting)
10921     on_shutdown();
10922 }
10923
10924 void PrimaryLogPG::clear_async_reads()
10925 {
10926   dout(10) << __func__ << dendl;
10927   for(auto& i : in_progress_async_reads) {
10928     dout(10) << "clear ctx: "
10929              << "OpRequestRef " << i.first
10930              << " OpContext " << i.second
10931              << dendl;
10932     close_op_ctx(i.second);
10933   }
10934 }
10935
/*
 * Tear down this PG's in-memory state.
 *
 * Removes the PG from the OSD's stat and peering queues, flags it as
 * deleting, drops any queued recovery, clears scrub state, cancels
 * copy/flush/proxy ops and repops (all invoked with 'false'; for
 * apply_and_flush_repops() that means nothing is requeued), cancels
 * pending log updates and backoffs, resets the snap trimmer, notifies the
 * backend, drops cached object contexts and async reads, cancels both
 * reservations, and finally clears primary and recovery state.
 *
 * The statements are order-sensitive (see the inline notes); do not
 * reorder without care.
 */
void PrimaryLogPG::on_shutdown()
{
  dout(10) << "on_shutdown" << dendl;

  // remove from queues
  osd->pg_stat_queue_dequeue(this);
  osd->peering_wq.dequeue(this);

  // handles queue races
  deleting = true;

  // forget a recovery that was queued but has not run yet
  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  clear_scrub_reserved();
  scrub_clear_state();

  unreg_next_scrub();
  // NOTE(review): 'false' presumably means "do not requeue" for the three
  // cancel_*_ops() calls as it does for apply_and_flush_repops() -- confirm
  cancel_copy_ops(false);
  cancel_flush_ops(false);
  cancel_proxy_ops(false);
  apply_and_flush_repops(false);
  cancel_log_updates();
  // we must remove PGRefs, so do this this prior to release_backoffs() callers
  clear_backoffs();
  // clean up snap trim references
  snap_trimmer_machine.process_event(Reset());

  pgbackend->on_change();

  context_registry_on_change();
  object_contexts.clear();

  clear_async_reads();

  // give up both reservations held under this pgid
  osd->remote_reserver.cancel_reservation(info.pgid);
  osd->local_reserver.cancel_reservation(info.pgid);

  clear_primary_state();
  cancel_recovery();
}
10979
10980 void PrimaryLogPG::on_activate()
10981 {
10982   // all clean?
10983   if (needs_recovery()) {
10984     dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
10985     queue_peering_event(
10986       CephPeeringEvtRef(
10987         std::make_shared<CephPeeringEvt>(
10988           get_osdmap()->get_epoch(),
10989           get_osdmap()->get_epoch(),
10990           DoRecovery())));
10991   } else if (needs_backfill()) {
10992     dout(10) << "activate queueing backfill" << dendl;
10993     queue_peering_event(
10994       CephPeeringEvtRef(
10995         std::make_shared<CephPeeringEvt>(
10996           get_osdmap()->get_epoch(),
10997           get_osdmap()->get_epoch(),
10998           RequestBackfill())));
10999   } else {
11000     dout(10) << "activate all replicas clean, no recovery" << dendl;
11001     eio_errors_to_process = false;
11002     queue_peering_event(
11003       CephPeeringEvtRef(
11004         std::make_shared<CephPeeringEvt>(
11005           get_osdmap()->get_epoch(),
11006           get_osdmap()->get_epoch(),
11007           AllReplicasRecovered())));
11008   }
11009
11010   publish_stats_to_osd();
11011
11012   if (!backfill_targets.empty()) {
11013     last_backfill_started = earliest_backfill();
11014     new_backfill = true;
11015     assert(!last_backfill_started.is_max());
11016     dout(5) << "on activate: bft=" << backfill_targets
11017            << " from " << last_backfill_started << dendl;
11018     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11019          i != backfill_targets.end();
11020          ++i) {
11021       dout(5) << "target shard " << *i
11022              << " from " << peer_info[*i].last_backfill
11023              << dendl;
11024     }
11025   }
11026
11027   hit_set_setup();
11028   agent_setup();
11029 }
11030
11031 void PrimaryLogPG::_on_new_interval()
11032 {
11033   dout(20) << __func__ << "checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11034   if (!pg_log.get_missing().may_include_deletes &&
11035       get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11036     pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11037   }
11038   assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11039 }
11040
/*
 * Reset per-interval state for this PG.
 *
 * Every queue of waiting ops is emptied: when this OSD is primary the ops
 * are requeued, otherwise they are dropped.  In-flight copy/flush/proxy
 * ops, async reads and repops are cancelled, backend and scrub cleanup is
 * recorded in *t, and cached object contexts are discarded.
 *
 * The sequence is order-sensitive (see the inline notes).
 */
void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
{
  dout(10) << "on_change" << dendl;

  if (hit_set && hit_set->insert_count() == 0) {
    dout(20) << " discarding empty hit_set" << dendl;
    hit_set_clear();
  }

  // forget a recovery that was queued but has not run yet
  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  // requeue everything in the reverse order they should be
  // reexamined.
  requeue_ops(waiting_for_peered);
  requeue_ops(waiting_for_active);

  clear_scrub_reserved();

  cancel_copy_ops(is_primary());
  cancel_flush_ops(is_primary());
  cancel_proxy_ops(is_primary());

  // requeue object waiters
  for (auto& p : waiting_for_unreadable_object) {
    release_backoffs(p.first);
  }
  if (is_primary()) {
    requeue_object_waiters(waiting_for_unreadable_object);
  } else {
    waiting_for_unreadable_object.clear();
  }
  // each entry is erased by the loop's increment expression, after the
  // body has finished using p->first
  for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
       p != waiting_for_degraded_object.end();
       waiting_for_degraded_object.erase(p++)) {
    release_backoffs(p->first);
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
    finish_degraded_object(p->first);
  }

  // requeues waiting_for_scrub
  scrub_clear_state();

  for (auto p = waiting_for_blocked_object.begin();
       p != waiting_for_blocked_object.end();
       waiting_for_blocked_object.erase(p++)) {
    if (is_primary())
      requeue_ops(p->second);
    else
      p->second.clear();
  }
  // finish_degraded_object() erases the entry for the key it is given, so
  // the iterator is advanced before the call.  Note the argument is a
  // reference to a key stored in this very map.
  for (auto i = callbacks_for_degraded_object.begin();
       i != callbacks_for_degraded_object.end();
    ) {
    finish_degraded_object((i++)->first);
  }
  assert(callbacks_for_degraded_object.empty());

  if (is_primary()) {
    requeue_ops(waiting_for_cache_not_full);
  } else {
    waiting_for_cache_not_full.clear();
  }
  objects_blocked_on_cache_full.clear();

  // close every pending async read; entries are erased as the loop advances
  for (list<pair<OpRequestRef, OpContext*> >::iterator i =
         in_progress_async_reads.begin();
       i != in_progress_async_reads.end();
       in_progress_async_reads.erase(i++)) {
    close_op_ctx(i->second);
    if (is_primary())
      requeue_op(i->first);
  }

  // this will requeue ops we were working on but didn't finish, and
  // any dups
  apply_and_flush_repops(is_primary());
  cancel_log_updates();

  // do this *after* apply_and_flush_repops so that we catch any newly
  // registered watches.
  context_registry_on_change();

  pgbackend->on_change_cleanup(t);
  scrubber.cleanup_store(t);
  pgbackend->on_change();

  // clear snap_trimmer state
  snap_trimmer_machine.process_event(Reset());

  debug_op_order.clear();
  unstable_stats.clear();

  // we don't want to cache object_contexts through the interval change
  // NOTE: we actually assert that all currently live references are dead
  // by the time the flush for the next interval completes.
  object_contexts.clear();

  // should have been cleared above by finishing all of the degraded objects
  assert(objects_blocked_on_degraded_snap.empty());
}
11147
11148 void PrimaryLogPG::on_role_change()
11149 {
11150   dout(10) << "on_role_change" << dendl;
11151   if (get_role() != 0 && hit_set) {
11152     dout(10) << " clearing hit set" << dendl;
11153     hit_set_clear();
11154   }
11155 }
11156
11157 void PrimaryLogPG::on_pool_change()
11158 {
11159   dout(10) << __func__ << dendl;
11160   // requeue cache full waiters just in case the cache_mode is
11161   // changing away from writeback mode.  note that if we are not
11162   // active the normal requeuing machinery is sufficient (and properly
11163   // ordered).
11164   if (is_active() &&
11165       pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11166       !waiting_for_cache_not_full.empty()) {
11167     dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11168              << dendl;
11169     requeue_ops(waiting_for_cache_not_full);
11170     objects_blocked_on_cache_full.clear();
11171   }
11172   hit_set_setup();
11173   agent_setup();
11174 }
11175
11176 // clear state.  called on recovery completion AND cancellation.
11177 void PrimaryLogPG::_clear_recovery_state()
11178 {
11179   missing_loc.clear();
11180 #ifdef DEBUG_RECOVERY_OIDS
11181   recovering_oids.clear();
11182 #endif
11183   last_backfill_started = hobject_t();
11184   set<hobject_t>::iterator i = backfills_in_flight.begin();
11185   while (i != backfills_in_flight.end()) {
11186     assert(recovering.count(*i));
11187     backfills_in_flight.erase(i++);
11188   }
11189
11190   list<OpRequestRef> blocked_ops;
11191   for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11192        i != recovering.end();
11193        recovering.erase(i++)) {
11194     if (i->second) {
11195       i->second->drop_recovery_read(&blocked_ops);
11196       requeue_ops(blocked_ops);
11197     }
11198   }
11199   assert(backfills_in_flight.empty());
11200   pending_backfill_updates.clear();
11201   assert(recovering.empty());
11202   pgbackend->clear_recovery_state();
11203 }
11204
11205 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11206 {
11207   dout(20) << __func__ << ": " << soid << dendl;
11208   assert(recovering.count(soid));
11209   ObjectContextRef obc = recovering[soid];
11210   if (obc) {
11211     list<OpRequestRef> blocked_ops;
11212     obc->drop_recovery_read(&blocked_ops);
11213     requeue_ops(blocked_ops);
11214   }
11215   recovering.erase(soid);
11216   finish_recovery_op(soid);
11217   release_backoffs(soid);
11218   if (waiting_for_degraded_object.count(soid)) {
11219     dout(20) << " kicking degraded waiters on " << soid << dendl;
11220     requeue_ops(waiting_for_degraded_object[soid]);
11221     waiting_for_degraded_object.erase(soid);
11222   }
11223   if (waiting_for_unreadable_object.count(soid)) {
11224     dout(20) << " kicking unreadable waiters on " << soid << dendl;
11225     requeue_ops(waiting_for_unreadable_object[soid]);
11226     waiting_for_unreadable_object.erase(soid);
11227   }
11228   if (is_missing_object(soid))
11229     pg_log.set_last_requested(0); // get recover_primary to start over
11230   finish_degraded_object(soid);
11231 }
11232
11233 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11234 {
11235   /*
11236    * check that any peers we are planning to (or currently) pulling
11237    * objects from are dealt with.
11238    */
11239   missing_loc.check_recovery_sources(osdmap);
11240   pgbackend->check_recovery_sources(osdmap);
11241
11242   for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11243        i != peer_log_requested.end();
11244        ) {
11245     if (!osdmap->is_up(i->osd)) {
11246       dout(10) << "peer_log_requested removing " << *i << dendl;
11247       peer_log_requested.erase(i++);
11248     } else {
11249       ++i;
11250     }
11251   }
11252
11253   for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11254        i != peer_missing_requested.end();
11255        ) {
11256     if (!osdmap->is_up(i->osd)) {
11257       dout(10) << "peer_missing_requested removing " << *i << dendl;
11258       peer_missing_requested.erase(i++);
11259     } else {
11260       ++i;
11261     }
11262   }
11263 }
11264
11265 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11266 {
11267   set<pg_shard_t> now_down;
11268   for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11269        p != missing_loc_sources.end();
11270        ) {
11271     if (osdmap->is_up(p->osd)) {
11272       ++p;
11273       continue;
11274     }
11275     ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11276     now_down.insert(*p);
11277     missing_loc_sources.erase(p++);
11278   }
11279
11280   if (now_down.empty()) {
11281     ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11282   } else {
11283     ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11284                        << missing_loc_sources << dendl;
11285
11286     // filter missing_loc
11287     map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11288     while (p != missing_loc.end()) {
11289       set<pg_shard_t>::iterator q = p->second.begin();
11290       while (q != p->second.end())
11291         if (now_down.count(*q)) {
11292           p->second.erase(q++);
11293         } else {
11294           ++q;
11295         }
11296       if (p->second.empty())
11297         missing_loc.erase(p++);
11298       else
11299         ++p;
11300     }
11301   }
11302 }
11303
11304
11305 bool PrimaryLogPG::start_recovery_ops(
11306   uint64_t max,
11307   ThreadPool::TPHandle &handle,
11308   uint64_t *ops_started)
11309 {
11310   uint64_t& started = *ops_started;
11311   started = 0;
11312   bool work_in_progress = false;
11313   assert(is_primary());
11314
11315   if (!state_test(PG_STATE_RECOVERING) &&
11316       !state_test(PG_STATE_BACKFILL)) {
11317     /* TODO: I think this case is broken and will make do_recovery()
11318      * unhappy since we're returning false */
11319     dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11320     return false;
11321   }
11322
11323   const auto &missing = pg_log.get_missing();
11324
11325   unsigned int num_missing = missing.num_missing();
11326   uint64_t num_unfound = get_num_unfound();
11327
11328   if (num_missing == 0) {
11329     info.last_complete = info.last_update;
11330   }
11331
11332   if (num_missing == num_unfound) {
11333     // All of the missing objects we have are unfound.
11334     // Recover the replicas.
11335     started = recover_replicas(max, handle);
11336   }
11337   if (!started) {
11338     // We still have missing objects that we should grab from replicas.
11339     started += recover_primary(max, handle);
11340   }
11341   if (!started && num_unfound != get_num_unfound()) {
11342     // second chance to recovery replicas
11343     started = recover_replicas(max, handle);
11344   }
11345
11346   if (started)
11347     work_in_progress = true;
11348
11349   bool deferred_backfill = false;
11350   if (recovering.empty() &&
11351       state_test(PG_STATE_BACKFILL) &&
11352       !backfill_targets.empty() && started < max &&
11353       missing.num_missing() == 0 &&
11354       waiting_on_backfill.empty()) {
11355     if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11356       dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11357       deferred_backfill = true;
11358     } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11359                !is_degraded())  {
11360       dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11361       deferred_backfill = true;
11362     } else if (!backfill_reserved) {
11363       dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11364       if (!backfill_reserving) {
11365         dout(10) << "queueing RequestBackfill" << dendl;
11366         backfill_reserving = true;
11367         queue_peering_event(
11368           CephPeeringEvtRef(
11369             std::make_shared<CephPeeringEvt>(
11370               get_osdmap()->get_epoch(),
11371               get_osdmap()->get_epoch(),
11372               RequestBackfill())));
11373       }
11374       deferred_backfill = true;
11375     } else {
11376       started += recover_backfill(max - started, handle, &work_in_progress);
11377     }
11378   }
11379
11380   dout(10) << " started " << started << dendl;
11381   osd->logger->inc(l_osd_rop, started);
11382
11383   if (!recovering.empty() ||
11384       work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11385     return work_in_progress;
11386
11387   assert(recovering.empty());
11388   assert(recovery_ops_active == 0);
11389
11390   dout(10) << __func__ << " needs_recovery: "
11391            << missing_loc.get_needs_recovery()
11392            << dendl;
11393   dout(10) << __func__ << " missing_loc: "
11394            << missing_loc.get_missing_locs()
11395            << dendl;
11396   int unfound = get_num_unfound();
11397   if (unfound) {
11398     dout(10) << " still have " << unfound << " unfound" << dendl;
11399     return work_in_progress;
11400   }
11401
11402   if (missing.num_missing() > 0) {
11403     // this shouldn't happen!
11404     osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11405                        << missing.num_missing() << ": " << missing.get_items();
11406     return work_in_progress;
11407   }
11408
11409   if (needs_recovery()) {
11410     // this shouldn't happen!
11411     // We already checked num_missing() so we must have missing replicas
11412     osd->clog->error() << info.pgid
11413                        << " Unexpected Error: recovery ending with missing replicas";
11414     return work_in_progress;
11415   }
11416
11417   if (state_test(PG_STATE_RECOVERING)) {
11418     state_clear(PG_STATE_RECOVERING);
11419     state_clear(PG_STATE_FORCED_RECOVERY);
11420     if (needs_backfill()) {
11421       dout(10) << "recovery done, queuing backfill" << dendl;
11422       queue_peering_event(
11423         CephPeeringEvtRef(
11424           std::make_shared<CephPeeringEvt>(
11425             get_osdmap()->get_epoch(),
11426             get_osdmap()->get_epoch(),
11427             RequestBackfill())));
11428     } else {
11429       dout(10) << "recovery done, no backfill" << dendl;
11430       eio_errors_to_process = false;
11431       state_clear(PG_STATE_FORCED_BACKFILL);
11432       queue_peering_event(
11433         CephPeeringEvtRef(
11434           std::make_shared<CephPeeringEvt>(
11435             get_osdmap()->get_epoch(),
11436             get_osdmap()->get_epoch(),
11437             AllReplicasRecovered())));
11438     }
11439   } else { // backfilling
11440     state_clear(PG_STATE_BACKFILL);
11441     state_clear(PG_STATE_FORCED_BACKFILL);
11442     state_clear(PG_STATE_FORCED_RECOVERY);
11443     dout(10) << "recovery done, backfill done" << dendl;
11444     eio_errors_to_process = false;
11445     queue_peering_event(
11446       CephPeeringEvtRef(
11447         std::make_shared<CephPeeringEvt>(
11448           get_osdmap()->get_epoch(),
11449           get_osdmap()->get_epoch(),
11450           Backfilled())));
11451   }
11452
11453   return false;
11454 }
11455
11456 /**
11457  * do one recovery op.
11458  * return true if done, false if nothing left to do.
11459  */
11460 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11461 {
11462   assert(is_primary());
11463
11464   const auto &missing = pg_log.get_missing();
11465
11466   dout(10) << "recover_primary recovering " << recovering.size()
11467            << " in pg" << dendl;
11468   dout(10) << "recover_primary " << missing << dendl;
11469   dout(25) << "recover_primary " << missing.get_items() << dendl;
11470
11471   // look at log!
11472   pg_log_entry_t *latest = 0;
11473   unsigned started = 0;
11474   int skipped = 0;
11475
11476   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11477   map<version_t, hobject_t>::const_iterator p =
11478     missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11479   while (p != missing.get_rmissing().end()) {
11480     handle.reset_tp_timeout();
11481     hobject_t soid;
11482     version_t v = p->first;
11483
11484     if (pg_log.get_log().objects.count(p->second)) {
11485       latest = pg_log.get_log().objects.find(p->second)->second;
11486       assert(latest->is_update() || latest->is_delete());
11487       soid = latest->soid;
11488     } else {
11489       latest = 0;
11490       soid = p->second;
11491     }
11492     const pg_missing_item& item = missing.get_items().find(p->second)->second;
11493     ++p;
11494
11495     hobject_t head = soid.get_head();
11496
11497     eversion_t need = item.need;
11498
11499     dout(10) << "recover_primary "
11500              << soid << " " << item.need
11501              << (missing.is_missing(soid) ? " (missing)":"")
11502              << (missing.is_missing(head) ? " (missing head)":"")
11503              << (recovering.count(soid) ? " (recovering)":"")
11504              << (recovering.count(head) ? " (recovering head)":"")
11505              << dendl;
11506
11507     if (latest) {
11508       switch (latest->op) {
11509       case pg_log_entry_t::CLONE:
11510         /*
11511          * Handling for this special case removed for now, until we
11512          * can correctly construct an accurate SnapSet from the old
11513          * one.
11514          */
11515         break;
11516
11517       case pg_log_entry_t::LOST_REVERT:
11518         {
11519           if (item.have == latest->reverting_to) {
11520             ObjectContextRef obc = get_object_context(soid, true);
11521
11522             if (obc->obs.oi.version == latest->version) {
11523               // I'm already reverting
11524               dout(10) << " already reverting " << soid << dendl;
11525             } else {
11526               dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11527               obc->ondisk_write_lock();
11528               obc->obs.oi.version = latest->version;
11529
11530               ObjectStore::Transaction t;
11531               bufferlist b2;
11532               obc->obs.oi.encode(
11533                 b2,
11534                 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11535               assert(!pool.info.require_rollback());
11536               t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11537
11538               recover_got(soid, latest->version);
11539               missing_loc.add_location(soid, pg_whoami);
11540
11541               ++active_pushes;
11542
11543               osd->store->queue_transaction(osr.get(), std::move(t),
11544                                             new C_OSD_AppliedRecoveredObject(this, obc),
11545                                             new C_OSD_CommittedPushedObject(
11546                                               this,
11547                                               get_osdmap()->get_epoch(),
11548                                               info.last_complete),
11549                                             new C_OSD_OndiskWriteUnlock(obc));
11550               continue;
11551             }
11552           } else {
11553             /*
11554              * Pull the old version of the object.  Update missing_loc here to have the location
11555              * of the version we want.
11556              *
11557              * This doesn't use the usual missing_loc paths, but that's okay:
11558              *  - if we have it locally, we hit the case above, and go from there.
11559              *  - if we don't, we always pass through this case during recovery and set up the location
11560              *    properly.
11561              *  - this way we don't need to mangle the missing code to be general about needing an old
11562              *    version...
11563              */
11564             eversion_t alternate_need = latest->reverting_to;
11565             dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11566
11567             for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11568                  p != peer_missing.end();
11569                  ++p)
11570               if (p->second.is_missing(soid, need) &&
11571                   p->second.get_items().at(soid).have == alternate_need) {
11572                 missing_loc.add_location(soid, p->first);
11573               }
11574             dout(10) << " will pull " << alternate_need << " or " << need
11575                      << " from one of " << missing_loc.get_locations(soid)
11576                      << dendl;
11577           }
11578         }
11579         break;
11580       }
11581     }
11582
11583     if (!recovering.count(soid)) {
11584       if (recovering.count(head)) {
11585         ++skipped;
11586       } else {
11587         int r = recover_missing(
11588           soid, need, get_recovery_op_priority(), h);
11589         switch (r) {
11590         case PULL_YES:
11591           ++started;
11592           break;
11593         case PULL_OTHER:
11594           ++started;
11595         case PULL_NONE:
11596           ++skipped;
11597           break;
11598         default:
11599           ceph_abort();
11600         }
11601         if (started >= max)
11602           break;
11603       }
11604     }
11605
11606     // only advance last_requested if we haven't skipped anything
11607     if (!skipped)
11608       pg_log.set_last_requested(v);
11609   }
11610
11611   pgbackend->run_recovery_op(h, get_recovery_op_priority());
11612   return started;
11613 }
11614
11615 bool PrimaryLogPG::primary_error(
11616   const hobject_t& soid, eversion_t v)
11617 {
11618   pg_log.missing_add(soid, v, eversion_t());
11619   pg_log.set_last_requested(0);
11620   missing_loc.remove_location(soid, pg_whoami);
11621   bool uhoh = true;
11622   assert(!actingbackfill.empty());
11623   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11624        i != actingbackfill.end();
11625        ++i) {
11626     if (*i == get_primary()) continue;
11627     pg_shard_t peer = *i;
11628     if (!peer_missing[peer].is_missing(soid, v)) {
11629       missing_loc.add_location(soid, peer);
11630       dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11631                << ", there should be a copy on shard " << peer << dendl;
11632       uhoh = false;
11633     }
11634   }
11635   if (uhoh)
11636     osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11637   else
11638     osd->clog->error() << info.pgid << " missing primary copy of " << soid
11639                          << ", will try copies on " << missing_loc.get_locations(soid);
11640   return uhoh;
11641 }
11642
11643 int PrimaryLogPG::prep_object_replica_deletes(
11644   const hobject_t& soid, eversion_t v,
11645   PGBackend::RecoveryHandle *h)
11646 {
11647   assert(is_primary());
11648   dout(10) << __func__ << ": on " << soid << dendl;
11649
11650   start_recovery_op(soid);
11651   assert(!recovering.count(soid));
11652   recovering.insert(make_pair(soid, ObjectContextRef()));
11653
11654   pgbackend->recover_delete_object(soid, v, h);
11655   return 1;
11656 }
11657
11658 int PrimaryLogPG::prep_object_replica_pushes(
11659   const hobject_t& soid, eversion_t v,
11660   PGBackend::RecoveryHandle *h)
11661 {
11662   assert(is_primary());
11663   dout(10) << __func__ << ": on " << soid << dendl;
11664
11665   // NOTE: we know we will get a valid oloc off of disk here.
11666   ObjectContextRef obc = get_object_context(soid, false);
11667   if (!obc) {
11668     primary_error(soid, v);
11669     return 0;
11670   }
11671
11672   if (!obc->get_recovery_read()) {
11673     dout(20) << "recovery delayed on " << soid
11674              << "; could not get rw_manager lock" << dendl;
11675     return 0;
11676   } else {
11677     dout(20) << "recovery got recovery read lock on " << soid
11678              << dendl;
11679   }
11680
11681   start_recovery_op(soid);
11682   assert(!recovering.count(soid));
11683   recovering.insert(make_pair(soid, obc));
11684
11685   /* We need this in case there is an in progress write on the object.  In fact,
11686    * the only possible write is an update to the xattr due to a lost_revert --
11687    * a client write would be blocked since the object is degraded.
11688    * In almost all cases, therefore, this lock should be uncontended.
11689    */
11690   obc->ondisk_read_lock();
11691   int r = pgbackend->recover_object(
11692     soid,
11693     v,
11694     ObjectContextRef(),
11695     obc, // has snapset context
11696     h);
11697   obc->ondisk_read_unlock();
11698   if (r < 0) {
11699     dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11700     primary_failed(soid);
11701     primary_error(soid, v);
11702     return 0;
11703   }
11704   return 1;
11705 }
11706
11707 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11708 {
11709   dout(10) << __func__ << "(" << max << ")" << dendl;
11710   uint64_t started = 0;
11711
11712   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11713
11714   // this is FAR from an optimal recovery order.  pretty lame, really.
11715   assert(!actingbackfill.empty());
11716   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11717        i != actingbackfill.end();
11718        ++i) {
11719     if (*i == get_primary()) continue;
11720     pg_shard_t peer = *i;
11721     map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11722     assert(pm != peer_missing.end());
11723     map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11724     assert(pi != peer_info.end());
11725     size_t m_sz = pm->second.num_missing();
11726
11727     dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11728     dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11729
11730     // oldest first!
11731     const pg_missing_t &m(pm->second);
11732     for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11733          p != m.get_rmissing().end() && started < max;
11734            ++p) {
11735       handle.reset_tp_timeout();
11736       const hobject_t soid(p->second);
11737
11738       if (missing_loc.is_unfound(soid)) {
11739         dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11740         continue;
11741       }
11742
11743       if (soid > pi->second.last_backfill) {
11744         if (!recovering.count(soid)) {
11745           derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11746           derr << __func__ << ": object added to missing set for backfill, but "
11747                << "is not in recovering, error!" << dendl;
11748           ceph_abort();
11749         }
11750         continue;
11751       }
11752
11753       if (recovering.count(soid)) {
11754         dout(10) << __func__ << ": already recovering " << soid << dendl;
11755         continue;
11756       }
11757
11758       if (missing_loc.is_deleted(soid)) {
11759         dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11760         map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11761         started += prep_object_replica_deletes(soid, r->second.need, h);
11762         continue;
11763       }
11764
11765       if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11766         dout(10) << __func__ << ": " << soid.get_head()
11767                  << " still missing on primary" << dendl;
11768         continue;
11769       }
11770
11771       if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11772         dout(10) << __func__ << ": " << soid.get_snapdir()
11773                  << " still missing on primary" << dendl;
11774         continue;
11775       }
11776
11777       if (pg_log.get_missing().is_missing(soid)) {
11778         dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11779         continue;
11780       }
11781
11782       dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11783       map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11784       started += prep_object_replica_pushes(soid, r->second.need,
11785                                             h);
11786     }
11787   }
11788
11789   pgbackend->run_recovery_op(h, get_recovery_op_priority());
11790   return started;
11791 }
11792
11793 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11794 {
11795   hobject_t e = hobject_t::get_max();
11796   for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11797        i != backfill_targets.end();
11798        ++i) {
11799     pg_shard_t peer = *i;
11800     map<pg_shard_t, BackfillInterval>::const_iterator iter =
11801       peer_backfill_info.find(peer);
11802     assert(iter != peer_backfill_info.end());
11803     if (iter->second.begin < e)
11804       e = iter->second.begin;
11805   }
11806   return e;
11807 }
11808
11809 bool PrimaryLogPG::all_peer_done() const
11810 {
11811   // Primary hasn't got any more objects
11812   assert(backfill_info.empty());
11813
11814   for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11815        i != backfill_targets.end();
11816        ++i) {
11817     pg_shard_t bt = *i;
11818     map<pg_shard_t, BackfillInterval>::const_iterator piter =
11819       peer_backfill_info.find(bt);
11820     assert(piter != peer_backfill_info.end());
11821     const BackfillInterval& pbi = piter->second;
11822     // See if peer has more to process
11823     if (!pbi.extends_to_end() || !pbi.empty())
11824         return false;
11825   }
11826   return true;
11827 }
11828
11829 /**
11830  * recover_backfill
11831  *
11832  * Invariants:
11833  *
11834  * backfilled: fully pushed to replica or present in replica's missing set (both
11835  * our copy and theirs).
11836  *
11837  * All objects on a backfill_target in
11838  * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11839  * objects have been actually deleted and all logically-valid objects are replicated.
11840  * There may be PG objects in this interval yet to be backfilled.
11841  *
11842  * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11843  * backfill_targets.  There may be objects on backfill_target(s) yet to be deleted.
11844  *
11845  * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11846  *     backfill_info.begin) in PG are backfilled.  No deleted objects in this
11847  * interval remain on the backfill target.
11848  *
11849  * For a backfill target, all objects <= peer_info[target].last_backfill
11850  * have been backfilled to target
11851  *
11852  * There *MAY* be missing/outdated objects between last_backfill_started and
11853  * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11854  * io created objects since the last scan.  For this reason, we call
11855  * update_range() again before continuing backfill.
11856  */
11857 uint64_t PrimaryLogPG::recover_backfill(
11858   uint64_t max,
11859   ThreadPool::TPHandle &handle, bool *work_started)
11860 {
11861   dout(10) << "recover_backfill (" << max << ")"
11862            << " bft=" << backfill_targets
11863            << " last_backfill_started " << last_backfill_started
11864            << (new_backfill ? " new_backfill":"")
11865            << dendl;
11866   assert(!backfill_targets.empty());
11867
11868   // Initialize from prior backfill state
11869   if (new_backfill) {
11870     // on_activate() was called prior to getting here
11871     assert(last_backfill_started == earliest_backfill());
11872     new_backfill = false;
11873
11874     // initialize BackfillIntervals
11875     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11876          i != backfill_targets.end();
11877          ++i) {
11878       peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11879     }
11880     backfill_info.reset(last_backfill_started);
11881
11882     backfills_in_flight.clear();
11883     pending_backfill_updates.clear();
11884   }
11885
11886   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11887        i != backfill_targets.end();
11888        ++i) {
11889     dout(10) << "peer osd." << *i
11890            << " info " << peer_info[*i]
11891            << " interval " << peer_backfill_info[*i].begin
11892            << "-" << peer_backfill_info[*i].end
11893            << " " << peer_backfill_info[*i].objects.size() << " objects"
11894            << dendl;
11895   }
11896
11897   // update our local interval to cope with recent changes
11898   backfill_info.begin = last_backfill_started;
11899   update_range(&backfill_info, handle);
11900
11901   unsigned ops = 0;
11902   vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11903   set<hobject_t> add_to_stat;
11904
11905   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11906        i != backfill_targets.end();
11907        ++i) {
11908     peer_backfill_info[*i].trim_to(
11909       std::max(peer_info[*i].last_backfill, last_backfill_started));
11910   }
11911   backfill_info.trim_to(last_backfill_started);
11912
11913   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11914   while (ops < max) {
11915     if (backfill_info.begin <= earliest_peer_backfill() &&
11916         !backfill_info.extends_to_end() && backfill_info.empty()) {
11917       hobject_t next = backfill_info.end;
11918       backfill_info.reset(next);
11919       backfill_info.end = hobject_t::get_max();
11920       update_range(&backfill_info, handle);
11921       backfill_info.trim();
11922     }
11923
11924     dout(20) << "   my backfill interval " << backfill_info << dendl;
11925
11926     bool sent_scan = false;
11927     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11928          i != backfill_targets.end();
11929          ++i) {
11930       pg_shard_t bt = *i;
11931       BackfillInterval& pbi = peer_backfill_info[bt];
11932
11933       dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11934       if (pbi.begin <= backfill_info.begin &&
11935           !pbi.extends_to_end() && pbi.empty()) {
11936         dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11937         epoch_t e = get_osdmap()->get_epoch();
11938         MOSDPGScan *m = new MOSDPGScan(
11939           MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11940           spg_t(info.pgid.pgid, bt.shard),
11941           pbi.end, hobject_t());
11942         osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11943         assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11944         waiting_on_backfill.insert(bt);
11945         sent_scan = true;
11946       }
11947     }
11948
11949     // Count simultaneous scans as a single op and let those complete
11950     if (sent_scan) {
11951       ops++;
11952       start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11953       break;
11954     }
11955
11956     if (backfill_info.empty() && all_peer_done()) {
11957       dout(10) << " reached end for both local and all peers" << dendl;
11958       break;
11959     }
11960
11961     // Get object within set of peers to operate on and
11962     // the set of targets for which that object applies.
11963     hobject_t check = earliest_peer_backfill();
11964
11965     if (check < backfill_info.begin) {
11966
11967       set<pg_shard_t> check_targets;
11968       for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11969            i != backfill_targets.end();
11970            ++i) {
11971         pg_shard_t bt = *i;
11972         BackfillInterval& pbi = peer_backfill_info[bt];
11973         if (pbi.begin == check)
11974           check_targets.insert(bt);
11975       }
11976       assert(!check_targets.empty());
11977
11978       dout(20) << " BACKFILL removing " << check
11979                << " from peers " << check_targets << dendl;
11980       for (set<pg_shard_t>::iterator i = check_targets.begin();
11981            i != check_targets.end();
11982            ++i) {
11983         pg_shard_t bt = *i;
11984         BackfillInterval& pbi = peer_backfill_info[bt];
11985         assert(pbi.begin == check);
11986
11987         to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
11988         pbi.pop_front();
11989       }
11990
11991       /* This requires a bit of explanation.  We compare head against
11992        * last_backfill to determine whether to send an operation
11993        * to the replica.  A single write operation can touch up to three
11994        * objects: head, the snapdir, and a new clone which sorts closer to
11995        * head than any existing clone.  If last_backfill points at a clone,
11996        * the transaction won't be sent and all 3 must lie on the right side
11997        * of the line (i.e., we'll backfill them later).  If last_backfill
11998        * points at snapdir, it sorts greater than head, so we send the
11999        * transaction which is correct because all three must lie to the left
12000        * of the line.
12001        *
12002        * If it points at head, we have a bit of an issue.  If head actually
12003        * exists, no problem, because any transaction which touches snapdir
12004        * must end up creating it (and deleting head), so sending the
12005        * operation won't pose a problem -- we'll end up having to scan it,
12006        * but it'll end up being the right version so we won't bother to
12007        * rebackfill it.  However, if head doesn't exist, any write on head
12008        * will remove snapdir.  For a replicated pool, this isn't a problem,
12009        * ENOENT on remove isn't an issue and it's in backfill future anyway.
12010        * It only poses a problem for EC pools, because we never just delete
12011        * an object, we rename it into a rollback object.  That operation
12012        * will end up crashing the osd with ENOENT.  Tolerating the failure
12013        * wouldn't work either, even if snapdir exists, we'd be creating a
12014        * rollback object past the last_backfill line which wouldn't get
12015        * cleaned up (no rollback objects past the last_backfill line is an
12016        * existing important invariant).  Thus, let's avoid the whole issue
12017        * by just not updating last_backfill_started here if head doesn't
12018        * exist and snapdir does.  We aren't using up a recovery count here,
12019        * so we're going to recover snapdir immediately anyway.  We'll only
12020        * fail "backward" if we fail to get the rw lock and that just means
12021        * we'll re-process this section of the hash space again.
12022        *
12023        * I'm choosing this hack here because the really "correct" answer is
12024        * going to be to unify snapdir and head into a single object (a
12025        * snapdir is really just a confusing way to talk about head existing
12026        * as a whiteout), but doing that is going to be a somewhat larger
12027        * undertaking.
12028        *
12029        * @see http://tracker.ceph.com/issues/17668
12030        */
12031       if (!(check.is_head() &&
12032             backfill_info.begin.is_snapdir() &&
12033             check == backfill_info.begin.get_head()))
12034         last_backfill_started = check;
12035
12036       // Don't increment ops here because deletions
12037       // are cheap and not replied to unlike real recovery_ops,
12038       // and we can't increment ops without requeueing ourself
12039       // for recovery.
12040     } else {
12041       eversion_t& obj_v = backfill_info.objects.begin()->second;
12042
12043       vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12044       for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12045            i != backfill_targets.end();
12046            ++i) {
12047         pg_shard_t bt = *i;
12048         BackfillInterval& pbi = peer_backfill_info[bt];
12049         // Find all check peers that have the wrong version
12050         if (check == backfill_info.begin && check == pbi.begin) {
12051           if (pbi.objects.begin()->second != obj_v) {
12052             need_ver_targs.push_back(bt);
12053           } else {
12054             keep_ver_targs.push_back(bt);
12055           }
12056         } else {
12057           pg_info_t& pinfo = peer_info[bt];
12058
12059           // Only include peers that we've caught up to their backfill line
12060           // otherwise, they only appear to be missing this object
12061           // because their pbi.begin > backfill_info.begin.
12062           if (backfill_info.begin > pinfo.last_backfill)
12063             missing_targs.push_back(bt);
12064           else
12065             skip_targs.push_back(bt);
12066         }
12067       }
12068
12069       if (!keep_ver_targs.empty()) {
12070         // These peers have version obj_v
12071         dout(20) << " BACKFILL keeping " << check
12072                  << " with ver " << obj_v
12073                  << " on peers " << keep_ver_targs << dendl;
12074         //assert(!waiting_for_degraded_object.count(check));
12075       }
12076       if (!need_ver_targs.empty() || !missing_targs.empty()) {
12077         ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12078         assert(obc);
12079         if (obc->get_recovery_read()) {
12080           if (!need_ver_targs.empty()) {
12081             dout(20) << " BACKFILL replacing " << check
12082                    << " with ver " << obj_v
12083                    << " to peers " << need_ver_targs << dendl;
12084           }
12085           if (!missing_targs.empty()) {
12086             dout(20) << " BACKFILL pushing " << backfill_info.begin
12087                  << " with ver " << obj_v
12088                  << " to peers " << missing_targs << dendl;
12089           }
12090           vector<pg_shard_t> all_push = need_ver_targs;
12091           all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12092
12093           handle.reset_tp_timeout();
12094           int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12095           if (r < 0) {
12096             *work_started = true;
12097             dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12098             break;
12099           }
12100           ops++;
12101         } else {
12102           *work_started = true;
12103           dout(20) << "backfill blocking on " << backfill_info.begin
12104                    << "; could not get rw_manager lock" << dendl;
12105           break;
12106         }
12107       }
12108       dout(20) << "need_ver_targs=" << need_ver_targs
12109                << " keep_ver_targs=" << keep_ver_targs << dendl;
12110       dout(20) << "backfill_targets=" << backfill_targets
12111                << " missing_targs=" << missing_targs
12112                << " skip_targs=" << skip_targs << dendl;
12113
12114       last_backfill_started = backfill_info.begin;
12115       add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12116       backfill_info.pop_front();
12117       vector<pg_shard_t> check_targets = need_ver_targs;
12118       check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12119       for (vector<pg_shard_t>::iterator i = check_targets.begin();
12120            i != check_targets.end();
12121            ++i) {
12122         pg_shard_t bt = *i;
12123         BackfillInterval& pbi = peer_backfill_info[bt];
12124         pbi.pop_front();
12125       }
12126     }
12127   }
12128
12129   hobject_t backfill_pos =
12130     std::min(backfill_info.begin, earliest_peer_backfill());
12131
12132   for (set<hobject_t>::iterator i = add_to_stat.begin();
12133        i != add_to_stat.end();
12134        ++i) {
12135     ObjectContextRef obc = get_object_context(*i, false);
12136     assert(obc);
12137     pg_stat_t stat;
12138     add_object_context_to_pg_stat(obc, &stat);
12139     pending_backfill_updates[*i] = stat;
12140   }
12141   if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12142     map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12143     for (unsigned i = 0; i < to_remove.size(); ++i) {
12144       handle.reset_tp_timeout();
12145       const hobject_t& oid = to_remove[i].get<0>();
12146       eversion_t v = to_remove[i].get<1>();
12147       pg_shard_t peer = to_remove[i].get<2>();
12148       MOSDPGBackfillRemove *m;
12149       auto it = reqs.find(peer);
12150       if (it != reqs.end()) {
12151         m = it->second;
12152       } else {
12153         m = reqs[peer] = new MOSDPGBackfillRemove(
12154           spg_t(info.pgid.pgid, peer.shard),
12155           get_osdmap()->get_epoch());
12156       }
12157       m->ls.push_back(make_pair(oid, v));
12158
12159       if (oid <= last_backfill_started)
12160         pending_backfill_updates[oid]; // add empty stat!
12161     }
12162     for (auto p : reqs) {
12163       osd->send_message_osd_cluster(p.first.osd, p.second,
12164                                     get_osdmap()->get_epoch());
12165     }
12166   } else {
12167     // for jewel targets
12168     for (unsigned i = 0; i < to_remove.size(); ++i) {
12169       handle.reset_tp_timeout();
12170
12171       // ordered before any subsequent updates
12172       send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12173                      to_remove[i].get<2>());
12174
12175       if (to_remove[i].get<0>() <= last_backfill_started)
12176         pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12177     }
12178   }
12179
12180   pgbackend->run_recovery_op(h, get_recovery_op_priority());
12181
12182   dout(5) << "backfill_pos is " << backfill_pos << dendl;
12183   for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12184        i != backfills_in_flight.end();
12185        ++i) {
12186     dout(20) << *i << " is still in flight" << dendl;
12187   }
12188
12189   hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12190     backfill_pos : *(backfills_in_flight.begin());
12191   hobject_t new_last_backfill = earliest_backfill();
12192   dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12193   for (map<hobject_t, pg_stat_t>::iterator i =
12194          pending_backfill_updates.begin();
12195        i != pending_backfill_updates.end() &&
12196          i->first < next_backfill_to_complete;
12197        pending_backfill_updates.erase(i++)) {
12198     dout(20) << " pending_backfill_update " << i->first << dendl;
12199     assert(i->first > new_last_backfill);
12200     for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12201          j != backfill_targets.end();
12202          ++j) {
12203       pg_shard_t bt = *j;
12204       pg_info_t& pinfo = peer_info[bt];
12205       //Add stats to all peers that were missing object
12206       if (i->first > pinfo.last_backfill)
12207         pinfo.stats.add(i->second);
12208     }
12209     new_last_backfill = i->first;
12210   }
12211   dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12212
12213   assert(!pending_backfill_updates.empty() ||
12214          new_last_backfill == last_backfill_started);
12215   if (pending_backfill_updates.empty() &&
12216       backfill_pos.is_max()) {
12217     assert(backfills_in_flight.empty());
12218     new_last_backfill = backfill_pos;
12219     last_backfill_started = backfill_pos;
12220   }
12221   dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12222
12223   // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12224   // all the backfill targets.  Otherwise, we will move last_backfill up on
12225   // those targets need it and send OP_BACKFILL_PROGRESS to them.
12226   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12227        i != backfill_targets.end();
12228        ++i) {
12229     pg_shard_t bt = *i;
12230     pg_info_t& pinfo = peer_info[bt];
12231
12232     if (new_last_backfill > pinfo.last_backfill) {
12233       pinfo.set_last_backfill(new_last_backfill);
12234       epoch_t e = get_osdmap()->get_epoch();
12235       MOSDPGBackfill *m = NULL;
12236       if (pinfo.last_backfill.is_max()) {
12237         m = new MOSDPGBackfill(
12238           MOSDPGBackfill::OP_BACKFILL_FINISH,
12239           e,
12240           last_peering_reset,
12241           spg_t(info.pgid.pgid, bt.shard));
12242         // Use default priority here, must match sub_op priority
12243         /* pinfo.stats might be wrong if we did log-based recovery on the
12244          * backfilled portion in addition to continuing backfill.
12245          */
12246         pinfo.stats = info.stats;
12247         start_recovery_op(hobject_t::get_max());
12248       } else {
12249         m = new MOSDPGBackfill(
12250           MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12251           e,
12252           last_peering_reset,
12253           spg_t(info.pgid.pgid, bt.shard));
12254         // Use default priority here, must match sub_op priority
12255       }
12256       m->last_backfill = pinfo.last_backfill;
12257       m->stats = pinfo.stats;
12258       osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12259       dout(10) << " peer " << bt
12260                << " num_objects now " << pinfo.stats.stats.sum.num_objects
12261                << " / " << info.stats.stats.sum.num_objects << dendl;
12262     }
12263   }
12264
12265   if (ops)
12266     *work_started = true;
12267   return ops;
12268 }
12269
12270 int PrimaryLogPG::prep_backfill_object_push(
12271   hobject_t oid, eversion_t v,
12272   ObjectContextRef obc,
12273   vector<pg_shard_t> peers,
12274   PGBackend::RecoveryHandle *h)
12275 {
12276   dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12277   assert(!peers.empty());
12278
12279   backfills_in_flight.insert(oid);
12280   for (unsigned int i = 0 ; i < peers.size(); ++i) {
12281     map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12282     assert(bpm != peer_missing.end());
12283     bpm->second.add(oid, eversion_t(), eversion_t(), false);
12284   }
12285
12286   assert(!recovering.count(oid));
12287
12288   start_recovery_op(oid);
12289   recovering.insert(make_pair(oid, obc));
12290
12291   // We need to take the read_lock here in order to flush in-progress writes
12292   obc->ondisk_read_lock();
12293   int r = pgbackend->recover_object(
12294     oid,
12295     v,
12296     ObjectContextRef(),
12297     obc,
12298     h);
12299   obc->ondisk_read_unlock();
12300   if (r < 0) {
12301     dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12302     primary_failed(oid);
12303     primary_error(oid, v);
12304     backfills_in_flight.erase(oid);
12305     missing_loc.add_missing(oid, v, eversion_t());
12306   }
12307   return r;
12308 }
12309
12310 void PrimaryLogPG::update_range(
12311   BackfillInterval *bi,
12312   ThreadPool::TPHandle &handle)
12313 {
12314   int local_min = cct->_conf->osd_backfill_scan_min;
12315   int local_max = cct->_conf->osd_backfill_scan_max;
12316
12317   if (bi->version < info.log_tail) {
12318     dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12319              << dendl;
12320     if (last_update_applied >= info.log_tail) {
12321       bi->version = last_update_applied;
12322     } else {
12323       osr->flush();
12324       bi->version = info.last_update;
12325     }
12326     scan_range(local_min, local_max, bi, handle);
12327   }
12328
12329   if (bi->version >= projected_last_update) {
12330     dout(10) << __func__<< ": bi is current " << dendl;
12331     assert(bi->version == projected_last_update);
12332   } else if (bi->version >= info.log_tail) {
12333     if (pg_log.get_log().empty() && projected_log.empty()) {
12334       /* Because we don't move log_tail on split, the log might be
12335        * empty even if log_tail != last_update.  However, the only
12336        * way to get here with an empty log is if log_tail is actually
12337        * eversion_t(), because otherwise the entry which changed
12338        * last_update since the last scan would have to be present.
12339        */
12340       assert(bi->version == eversion_t());
12341       return;
12342     }
12343
12344     dout(10) << __func__<< ": bi is old, (" << bi->version
12345              << ") can be updated with log to projected_last_update "
12346              << projected_last_update << dendl;
12347
12348     auto func = [&](const pg_log_entry_t &e) {
12349       dout(10) << __func__ << ": updating from version " << e.version
12350                << dendl;
12351       const hobject_t &soid = e.soid;
12352       if (soid >= bi->begin &&
12353           soid < bi->end) {
12354         if (e.is_update()) {
12355           dout(10) << __func__ << ": " << e.soid << " updated to version "
12356                    << e.version << dendl;
12357           bi->objects.erase(e.soid);
12358           bi->objects.insert(
12359             make_pair(
12360               e.soid,
12361               e.version));
12362         } else if (e.is_delete()) {
12363           dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12364           bi->objects.erase(e.soid);
12365         }
12366       }
12367     };
12368     dout(10) << "scanning pg log first" << dendl;
12369     pg_log.get_log().scan_log_after(bi->version, func);
12370     dout(10) << "scanning projected log" << dendl;
12371     projected_log.scan_log_after(bi->version, func);
12372     bi->version = projected_last_update;
12373   } else {
12374     assert(0 == "scan_range should have raised bi->version past log_tail");
12375   }
12376 }
12377
12378 void PrimaryLogPG::scan_range(
12379   int min, int max, BackfillInterval *bi,
12380   ThreadPool::TPHandle &handle)
12381 {
12382   assert(is_locked());
12383   dout(10) << "scan_range from " << bi->begin << dendl;
12384   bi->clear_objects();
12385
12386   vector<hobject_t> ls;
12387   ls.reserve(max);
12388   int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12389   assert(r >= 0);
12390   dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12391   dout(20) << ls << dendl;
12392
12393   for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12394     handle.reset_tp_timeout();
12395     ObjectContextRef obc;
12396     if (is_primary())
12397       obc = object_contexts.lookup(*p);
12398     if (obc) {
12399       bi->objects[*p] = obc->obs.oi.version;
12400       dout(20) << "  " << *p << " " << obc->obs.oi.version << dendl;
12401     } else {
12402       bufferlist bl;
12403       int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12404
12405       /* If the object does not exist here, it must have been removed
12406          * between the collection_list_partial and here.  This can happen
12407          * for the first item in the range, which is usually last_backfill.
12408          */
12409       if (r == -ENOENT)
12410         continue;
12411
12412       assert(r >= 0);
12413       object_info_t oi(bl);
12414       bi->objects[*p] = oi.version;
12415       dout(20) << "  " << *p << " " << oi.version << dendl;
12416     }
12417   }
12418 }
12419
12420
12421 /** check_local
12422  *
12423  * verifies that stray objects have been deleted
12424  */
12425 void PrimaryLogPG::check_local()
12426 {
12427   dout(10) << __func__ << dendl;
12428
12429   assert(info.last_update >= pg_log.get_tail());  // otherwise we need some help!
12430
12431   if (!cct->_conf->osd_debug_verify_stray_on_activate)
12432     return;
12433
12434   // just scan the log.
12435   set<hobject_t> did;
12436   for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12437        p != pg_log.get_log().log.rend();
12438        ++p) {
12439     if (did.count(p->soid))
12440       continue;
12441     did.insert(p->soid);
12442
12443     if (p->is_delete() && !is_missing_object(p->soid)) {
12444       dout(10) << " checking " << p->soid
12445                << " at " << p->version << dendl;
12446       struct stat st;
12447       int r = osd->store->stat(
12448         ch,
12449         ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12450         &st);
12451       if (r != -ENOENT) {
12452         derr << __func__ << " " << p->soid << " exists, but should have been "
12453              << "deleted" << dendl;
12454         assert(0 == "erroneously present object");
12455       }
12456     } else {
12457       // ignore old(+missing) objects
12458     }
12459   }
12460 }
12461
12462
12463
12464 // ===========================
12465 // hit sets
12466
12467 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12468 {
12469   ostringstream ss;
12470   ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12471   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12472                  info.pgid.ps(), info.pgid.pool(),
12473                  cct->_conf->osd_hit_set_namespace);
12474   dout(20) << __func__ << " " << hoid << dendl;
12475   return hoid;
12476 }
12477
12478 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12479                                                    utime_t end,
12480                                                    bool using_gmt)
12481 {
12482   ostringstream ss;
12483   ss << "hit_set_" << info.pgid.pgid << "_archive_";
12484   if (using_gmt) {
12485     start.gmtime(ss) << "_";
12486     end.gmtime(ss);
12487   } else {
12488     start.localtime(ss) << "_";
12489     end.localtime(ss);
12490   }
12491   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12492                  info.pgid.ps(), info.pgid.pool(),
12493                  cct->_conf->osd_hit_set_namespace);
12494   dout(20) << __func__ << " " << hoid << dendl;
12495   return hoid;
12496 }
12497
12498 void PrimaryLogPG::hit_set_clear()
12499 {
12500   dout(20) << __func__ << dendl;
12501   hit_set.reset();
12502   hit_set_start_stamp = utime_t();
12503 }
12504
12505 void PrimaryLogPG::hit_set_setup()
12506 {
12507   if (!is_active() ||
12508       !is_primary()) {
12509     hit_set_clear();
12510     return;
12511   }
12512
12513   if (is_active() && is_primary() &&
12514       (!pool.info.hit_set_count ||
12515        !pool.info.hit_set_period ||
12516        pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12517     hit_set_clear();
12518
12519     // only primary is allowed to remove all the hit set objects
12520     hit_set_remove_all();
12521     return;
12522   }
12523
12524   // FIXME: discard any previous data for now
12525   hit_set_create();
12526
12527   // include any writes we know about from the pg log.  this doesn't
12528   // capture reads, but it is better than nothing!
12529   hit_set_apply_log();
12530 }
12531
12532 void PrimaryLogPG::hit_set_remove_all()
12533 {
12534   // If any archives are degraded we skip this
12535   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12536        p != info.hit_set.history.end();
12537        ++p) {
12538     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12539
12540     // Once we hit a degraded object just skip
12541     if (is_degraded_or_backfilling_object(aoid))
12542       return;
12543     if (scrubber.write_blocked_by_scrub(aoid))
12544       return;
12545   }
12546
12547   if (!info.hit_set.history.empty()) {
12548     list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12549     assert(p != info.hit_set.history.rend());
12550     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12551     assert(!is_degraded_or_backfilling_object(oid));
12552     ObjectContextRef obc = get_object_context(oid, false);
12553     assert(obc);
12554
12555     OpContextUPtr ctx = simple_opc_create(obc);
12556     ctx->at_version = get_next_version();
12557     ctx->updated_hset_history = info.hit_set;
12558     utime_t now = ceph_clock_now();
12559     ctx->mtime = now;
12560     hit_set_trim(ctx, 0);
12561     simple_opc_submit(std::move(ctx));
12562   }
12563
12564   info.hit_set = pg_hit_set_history_t();
12565   if (agent_state) {
12566     agent_state->discard_hit_sets();
12567   }
12568 }
12569
12570 void PrimaryLogPG::hit_set_create()
12571 {
12572   utime_t now = ceph_clock_now();
12573   // make a copy of the params to modify
12574   HitSet::Params params(pool.info.hit_set_params);
12575
12576   dout(20) << __func__ << " " << params << dendl;
12577   if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12578     BloomHitSet::Params *p =
12579       static_cast<BloomHitSet::Params*>(params.impl.get());
12580
12581     // convert false positive rate so it holds up across the full period
12582     p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12583     if (p->get_fpp() <= 0.0)
12584       p->set_fpp(.01);  // fpp cannot be zero!
12585
12586     // if we don't have specified size, estimate target size based on the
12587     // previous bin!
12588     if (p->target_size == 0 && hit_set) {
12589       utime_t dur = now - hit_set_start_stamp;
12590       unsigned unique = hit_set->approx_unique_insert_count();
12591       dout(20) << __func__ << " previous set had approx " << unique
12592                << " unique items over " << dur << " seconds" << dendl;
12593       p->target_size = (double)unique * (double)pool.info.hit_set_period
12594                      / (double)dur;
12595     }
12596     if (p->target_size <
12597         static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12598       p->target_size = cct->_conf->osd_hit_set_min_size;
12599
12600     if (p->target_size
12601         > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12602       p->target_size = cct->_conf->osd_hit_set_max_size;
12603
12604     p->seed = now.sec();
12605
12606     dout(10) << __func__ << " target_size " << p->target_size
12607              << " fpp " << p->get_fpp() << dendl;
12608   }
12609   hit_set.reset(new HitSet(params));
12610   hit_set_start_stamp = now;
12611 }
12612
12613 /**
12614  * apply log entries to set
12615  *
12616  * this would only happen after peering, to at least capture writes
12617  * during an interval that was potentially lost.
12618  */
12619 bool PrimaryLogPG::hit_set_apply_log()
12620 {
12621   if (!hit_set)
12622     return false;
12623
12624   eversion_t to = info.last_update;
12625   eversion_t from = info.hit_set.current_last_update;
12626   if (to <= from) {
12627     dout(20) << __func__ << " no update" << dendl;
12628     return false;
12629   }
12630
12631   dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
12632   list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12633   while (p != pg_log.get_log().log.rend() && p->version > to)
12634     ++p;
12635   while (p != pg_log.get_log().log.rend() && p->version > from) {
12636     hit_set->insert(p->soid);
12637     ++p;
12638   }
12639
12640   return true;
12641 }
12642
12643 void PrimaryLogPG::hit_set_persist()
12644 {
12645   dout(10) << __func__  << dendl;
12646   bufferlist bl;
12647   unsigned max = pool.info.hit_set_count;
12648
12649   utime_t now = ceph_clock_now();
12650   hobject_t oid;
12651
12652   // If any archives are degraded we skip this persist request
12653   // account for the additional entry being added below
12654   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12655        p != info.hit_set.history.end();
12656        ++p) {
12657     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12658
12659     // Once we hit a degraded object just skip further trim
12660     if (is_degraded_or_backfilling_object(aoid))
12661       return;
12662     if (scrubber.write_blocked_by_scrub(aoid))
12663       return;
12664   }
12665
12666   // If backfill is in progress and we could possibly overlap with the
12667   // hit_set_* objects, back off.  Since these all have
12668   // hobject_t::hash set to pgid.ps(), and those sort first, we can
12669   // look just at that.  This is necessary because our transactions
12670   // may include a modify of the new hit_set *and* a delete of the
12671   // old one, and this may span the backfill boundary.
12672   for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12673        p != backfill_targets.end();
12674        ++p) {
12675     assert(peer_info.count(*p));
12676     const pg_info_t& pi = peer_info[*p];
12677     if (pi.last_backfill == hobject_t() ||
12678         pi.last_backfill.get_hash() == info.pgid.ps()) {
12679       dout(10) << __func__ << " backfill target osd." << *p
12680                << " last_backfill has not progressed past pgid ps"
12681                << dendl;
12682       return;
12683     }
12684   }
12685
12686
12687   pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12688   new_hset.begin = hit_set_start_stamp;
12689   new_hset.end = now;
12690   oid = get_hit_set_archive_object(
12691     new_hset.begin,
12692     new_hset.end,
12693     new_hset.using_gmt);
12694
12695   // If the current object is degraded we skip this persist request
12696   if (scrubber.write_blocked_by_scrub(oid))
12697     return;
12698
12699   hit_set->seal();
12700   ::encode(*hit_set, bl);
12701   dout(20) << __func__ << " archive " << oid << dendl;
12702
12703   if (agent_state) {
12704     agent_state->add_hit_set(new_hset.begin, hit_set);
12705     uint32_t size = agent_state->hit_set_map.size();
12706     if (size >= pool.info.hit_set_count) {
12707       size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12708     }
12709     hit_set_in_memory_trim(size);
12710   }
12711
12712   ObjectContextRef obc = get_object_context(oid, true);
12713   OpContextUPtr ctx = simple_opc_create(obc);
12714
12715   ctx->at_version = get_next_version();
12716   ctx->updated_hset_history = info.hit_set;
12717   pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12718
12719   updated_hit_set_hist.current_last_update = info.last_update;
12720   new_hset.version = ctx->at_version;
12721
12722   updated_hit_set_hist.history.push_back(new_hset);
12723   hit_set_create();
12724
12725   // fabricate an object_info_t and SnapSet
12726   obc->obs.oi.version = ctx->at_version;
12727   obc->obs.oi.mtime = now;
12728   obc->obs.oi.size = bl.length();
12729   obc->obs.exists = true;
12730   obc->obs.oi.set_data_digest(bl.crc32c(-1));
12731
12732   ctx->new_obs = obc->obs;
12733
12734   obc->ssc->snapset.head_exists = true;
12735   ctx->new_snapset = obc->ssc->snapset;
12736
12737   ctx->delta_stats.num_objects++;
12738   ctx->delta_stats.num_objects_hit_set_archive++;
12739   ctx->delta_stats.num_bytes += bl.length();
12740   ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12741
12742   bufferlist bss;
12743   ::encode(ctx->new_snapset, bss);
12744   bufferlist boi(sizeof(ctx->new_obs.oi));
12745   ::encode(ctx->new_obs.oi, boi,
12746            get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12747
12748   ctx->op_t->create(oid);
12749   if (bl.length()) {
12750     ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12751   }
12752   map <string, bufferlist> attrs;
12753   attrs[OI_ATTR].claim(boi);
12754   attrs[SS_ATTR].claim(bss);
12755   setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12756   ctx->log.push_back(
12757     pg_log_entry_t(
12758       pg_log_entry_t::MODIFY,
12759       oid,
12760       ctx->at_version,
12761       eversion_t(),
12762       0,
12763       osd_reqid_t(),
12764       ctx->mtime,
12765       0)
12766     );
12767
12768   hit_set_trim(ctx, max);
12769
12770   simple_opc_submit(std::move(ctx));
12771 }
12772
12773 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12774 {
12775   assert(ctx->updated_hset_history);
12776   pg_hit_set_history_t &updated_hit_set_hist =
12777     *(ctx->updated_hset_history);
12778   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12779     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12780     assert(p != updated_hit_set_hist.history.end());
12781     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12782
12783     assert(!is_degraded_or_backfilling_object(oid));
12784
12785     dout(20) << __func__ << " removing " << oid << dendl;
12786     ++ctx->at_version.version;
12787     ctx->log.push_back(
12788         pg_log_entry_t(pg_log_entry_t::DELETE,
12789                        oid,
12790                        ctx->at_version,
12791                        p->version,
12792                        0,
12793                        osd_reqid_t(),
12794                        ctx->mtime,
12795                        0));
12796
12797     ctx->op_t->remove(oid);
12798     updated_hit_set_hist.history.pop_front();
12799
12800     ObjectContextRef obc = get_object_context(oid, false);
12801     assert(obc);
12802     --ctx->delta_stats.num_objects;
12803     --ctx->delta_stats.num_objects_hit_set_archive;
12804     ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12805     ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12806   }
12807 }
12808
12809 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12810 {
12811   while (agent_state->hit_set_map.size() > max_in_memory) {
12812     agent_state->remove_oldest_hit_set();
12813   }
12814 }
12815
12816
12817 // =======================================
12818 // cache agent
12819
12820 void PrimaryLogPG::agent_setup()
12821 {
12822   assert(is_locked());
12823   if (!is_active() ||
12824       !is_primary() ||
12825       pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12826       pool.info.tier_of < 0 ||
12827       !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12828     agent_clear();
12829     return;
12830   }
12831   if (!agent_state) {
12832     agent_state.reset(new TierAgentState);
12833
12834     // choose random starting position
12835     agent_state->position = hobject_t();
12836     agent_state->position.pool = info.pgid.pool();
12837     agent_state->position.set_hash(pool.info.get_random_pg_position(
12838       info.pgid.pgid,
12839       rand()));
12840     agent_state->start = agent_state->position;
12841
12842     dout(10) << __func__ << " allocated new state, position "
12843              << agent_state->position << dendl;
12844   } else {
12845     dout(10) << __func__ << " keeping existing state" << dendl;
12846   }
12847
12848   if (info.stats.stats_invalid) {
12849     osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12850   }
12851
12852   agent_choose_mode();
12853 }
12854
12855 void PrimaryLogPG::agent_clear()
12856 {
12857   agent_stop();
12858   agent_state.reset(NULL);
12859 }
12860
12861 // Return false if no objects operated on since start of object hash space
12862 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12863 {
12864   lock();
12865   if (!agent_state) {
12866     dout(10) << __func__ << " no agent state, stopping" << dendl;
12867     unlock();
12868     return true;
12869   }
12870
12871   assert(!deleting);
12872
12873   if (agent_state->is_idle()) {
12874     dout(10) << __func__ << " idle, stopping" << dendl;
12875     unlock();
12876     return true;
12877   }
12878
12879   osd->logger->inc(l_osd_agent_wake);
12880
12881   dout(10) << __func__
12882            << " max " << start_max
12883            << ", flush " << agent_state->get_flush_mode_name()
12884            << ", evict " << agent_state->get_evict_mode_name()
12885            << ", pos " << agent_state->position
12886            << dendl;
12887   assert(is_primary());
12888   assert(is_active());
12889
12890   agent_load_hit_sets();
12891
12892   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12893   assert(base_pool);
12894
12895   int ls_min = 1;
12896   int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12897
12898   // list some objects.  this conveniently lists clones (oldest to
12899   // newest) before heads... the same order we want to flush in.
12900   //
12901   // NOTE: do not flush the Sequencer.  we will assume that the
12902   // listing we get back is imprecise.
12903   vector<hobject_t> ls;
12904   hobject_t next;
12905   int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12906                                           &ls, &next);
12907   assert(r >= 0);
12908   dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12909   int started = 0;
12910   for (vector<hobject_t>::iterator p = ls.begin();
12911        p != ls.end();
12912        ++p) {
12913     if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12914       dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12915       osd->logger->inc(l_osd_agent_skip);
12916       continue;
12917     }
12918     if (is_degraded_or_backfilling_object(*p)) {
12919       dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12920       osd->logger->inc(l_osd_agent_skip);
12921       continue;
12922     }
12923     if (is_missing_object(p->get_head())) {
12924       dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12925       osd->logger->inc(l_osd_agent_skip);
12926       continue;
12927     }
12928     ObjectContextRef obc = get_object_context(*p, false, NULL);
12929     if (!obc) {
12930       // we didn't flush; we may miss something here.
12931       dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12932       osd->logger->inc(l_osd_agent_skip);
12933       continue;
12934     }
12935     if (!obc->obs.exists) {
12936       dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12937       osd->logger->inc(l_osd_agent_skip);
12938       continue;
12939     }
12940     if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12941       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12942       osd->logger->inc(l_osd_agent_skip);
12943       continue;
12944     }
12945     if (obc->is_blocked()) {
12946       dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12947       osd->logger->inc(l_osd_agent_skip);
12948       continue;
12949     }
12950     if (obc->is_request_pending()) {
12951       dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
12952       osd->logger->inc(l_osd_agent_skip);
12953       continue;
12954     }
12955
12956     // be careful flushing omap to an EC pool.
12957     if (!base_pool->supports_omap() &&
12958         obc->obs.oi.is_omap()) {
12959       dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
12960       osd->logger->inc(l_osd_agent_skip);
12961       continue;
12962     }
12963
12964     if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
12965         agent_maybe_evict(obc, false))
12966       ++started;
12967     else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
12968              agent_flush_quota > 0 && agent_maybe_flush(obc)) {
12969       ++started;
12970       --agent_flush_quota;
12971     }
12972     if (started >= start_max) {
12973       // If finishing early, set "next" to the next object
12974       if (++p != ls.end())
12975         next = *p;
12976       break;
12977     }
12978   }
12979
12980   if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
12981     dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
12982     agent_state->hist_age = 0;
12983     agent_state->temp_hist.decay();
12984   }
12985
12986   // Total objects operated on so far
12987   int total_started = agent_state->started + started;
12988   bool need_delay = false;
12989
12990   dout(20) << __func__ << " start pos " << agent_state->position
12991     << " next start pos " << next
12992     << " started " << total_started << dendl;
12993
  // See if we've made a full pass over the object hash space.
  // This might check at most ls_max objects a second time to notice that
  // we've checked every object at least once.
12997   if (agent_state->position < agent_state->start &&
12998       next >= agent_state->start) {
12999     dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
13000     if (total_started == 0)
13001       need_delay = true;
13002     else
13003       total_started = 0;
13004     agent_state->start = next;
13005   }
13006   agent_state->started = total_started;
13007
13008   // See if we are starting from beginning
13009   if (next.is_max())
13010     agent_state->position = hobject_t();
13011   else
13012     agent_state->position = next;
13013
13014   // Discard old in memory HitSets
13015   hit_set_in_memory_trim(pool.info.hit_set_count);
13016
13017   if (need_delay) {
13018     assert(agent_state->delaying == false);
13019     agent_delay();
13020     unlock();
13021     return false;
13022   }
13023   agent_choose_mode();
13024   unlock();
13025   return true;
13026 }
13027
13028 void PrimaryLogPG::agent_load_hit_sets()
13029 {
13030   if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13031     return;
13032   }
13033
13034   if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13035     dout(10) << __func__ << dendl;
13036     for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13037          p != info.hit_set.history.end(); ++p) {
13038       if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13039         dout(10) << __func__ << " loading " << p->begin << "-"
13040                  << p->end << dendl;
13041         if (!pool.info.is_replicated()) {
13042           // FIXME: EC not supported here yet
13043           derr << __func__ << " on non-replicated pool" << dendl;
13044           break;
13045         }
13046
13047         hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13048         if (is_unreadable_object(oid)) {
13049           dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13050           break;
13051         }
13052
13053         ObjectContextRef obc = get_object_context(oid, false);
13054         if (!obc) {
13055           derr << __func__ << ": could not load hitset " << oid << dendl;
13056           break;
13057         }
13058
13059         bufferlist bl;
13060         {
13061           obc->ondisk_read_lock();
13062           int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13063           assert(r >= 0);
13064           obc->ondisk_read_unlock();
13065         }
13066         HitSetRef hs(new HitSet);
13067         bufferlist::iterator pbl = bl.begin();
13068         ::decode(*hs, pbl);
13069         agent_state->add_hit_set(p->begin.sec(), hs);
13070       }
13071     }
13072   }
13073 }
13074
13075 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13076 {
13077   if (!obc->obs.oi.is_dirty()) {
13078     dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13079     osd->logger->inc(l_osd_agent_skip);
13080     return false;
13081   }
13082   if (obc->obs.oi.is_cache_pinned()) {
13083     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13084     osd->logger->inc(l_osd_agent_skip);
13085     return false;
13086   }
13087
13088   utime_t now = ceph_clock_now();
13089   utime_t ob_local_mtime;
13090   if (obc->obs.oi.local_mtime != utime_t()) {
13091     ob_local_mtime = obc->obs.oi.local_mtime;
13092   } else {
13093     ob_local_mtime = obc->obs.oi.mtime;
13094   }
13095   bool evict_mode_full =
13096     (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13097   if (!evict_mode_full &&
13098       obc->obs.oi.soid.snap == CEPH_NOSNAP &&  // snaps immutable; don't delay
13099       (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13100     dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13101     osd->logger->inc(l_osd_agent_skip);
13102     return false;
13103   }
13104
13105   if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13106     dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13107     osd->logger->inc(l_osd_agent_skip);
13108     return false;
13109   }
13110
13111   dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13112
13113   // FIXME: flush anything dirty, regardless of what distribution of
13114   // ages we expect.
13115
13116   hobject_t oid = obc->obs.oi.soid;
13117   osd->agent_start_op(oid);
13118   // no need to capture a pg ref, can't outlive fop or ctx
13119   std::function<void()> on_flush = [this, oid]() {
13120     osd->agent_finish_op(oid);
13121   };
13122
13123   int result = start_flush(
13124     OpRequestRef(), obc, false, NULL,
13125     on_flush);
13126   if (result != -EINPROGRESS) {
13127     on_flush();
13128     dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13129       << " with " << result << dendl;
13130     osd->logger->inc(l_osd_agent_skip);
13131     return false;
13132   }
13133
13134   osd->logger->inc(l_osd_agent_flush);
13135   return true;
13136 }
13137
13138 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13139 {
13140   const hobject_t& soid = obc->obs.oi.soid;
13141   if (!after_flush && obc->obs.oi.is_dirty()) {
13142     dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13143     return false;
13144   }
13145   if (!obc->obs.oi.watchers.empty()) {
13146     dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13147     return false;
13148   }
13149   if (obc->is_blocked()) {
13150     dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13151     return false;
13152   }
13153   if (obc->obs.oi.is_cache_pinned()) {
13154     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13155     return false;
13156   }
13157
13158   if (soid.snap == CEPH_NOSNAP) {
13159     int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13160     if (result < 0) {
13161       dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13162       return false;
13163     }
13164   }
13165
13166   if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13167     // is this object old than cache_min_evict_age?
13168     utime_t now = ceph_clock_now();
13169     utime_t ob_local_mtime;
13170     if (obc->obs.oi.local_mtime != utime_t()) {
13171       ob_local_mtime = obc->obs.oi.local_mtime;
13172     } else {
13173       ob_local_mtime = obc->obs.oi.mtime;
13174     }
13175     if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13176       dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13177       osd->logger->inc(l_osd_agent_skip);
13178       return false;
13179     }
13180     // is this object old and/or cold enough?
13181     int temp = 0;
13182     uint64_t temp_upper = 0, temp_lower = 0;
13183     if (hit_set)
13184       agent_estimate_temp(soid, &temp);
13185     agent_state->temp_hist.add(temp);
13186     agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13187
13188     dout(20) << __func__
13189              << " temp " << temp
13190              << " pos " << temp_lower << "-" << temp_upper
13191              << ", evict_effort " << agent_state->evict_effort
13192              << dendl;
13193     dout(30) << "agent_state:\n";
13194     Formatter *f = Formatter::create("");
13195     f->open_object_section("agent_state");
13196     agent_state->dump(f);
13197     f->close_section();
13198     f->flush(*_dout);
13199     delete f;
13200     *_dout << dendl;
13201
13202     if (1000000 - temp_upper >= agent_state->evict_effort)
13203       return false;
13204   }
13205
13206   dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13207   OpContextUPtr ctx = simple_opc_create(obc);
13208
13209   if (!ctx->lock_manager.get_lock_type(
13210         ObjectContext::RWState::RWWRITE,
13211         obc->obs.oi.soid,
13212         obc,
13213         OpRequestRef())) {
13214     close_op_ctx(ctx.release());
13215     dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13216     return false;
13217   }
13218
13219   osd->agent_start_evict_op();
13220   ctx->register_on_finish(
13221     [this]() {
13222       osd->agent_finish_evict_op();
13223     });
13224
13225   ctx->at_version = get_next_version();
13226   assert(ctx->new_obs.exists);
13227   int r = _delete_oid(ctx.get(), true, false);
13228   if (obc->obs.oi.is_omap())
13229     ctx->delta_stats.num_objects_omap--;
13230   ctx->delta_stats.num_evict++;
13231   ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13232   if (obc->obs.oi.is_dirty())
13233     --ctx->delta_stats.num_objects_dirty;
13234   assert(r == 0);
13235   finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13236   simple_opc_submit(std::move(ctx));
13237   osd->logger->inc(l_osd_tier_evict);
13238   osd->logger->inc(l_osd_agent_evict);
13239   return true;
13240 }
13241
13242 void PrimaryLogPG::agent_stop()
13243 {
13244   dout(20) << __func__ << dendl;
13245   if (agent_state && !agent_state->is_idle()) {
13246     agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13247     agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13248     osd->agent_disable_pg(this, agent_state->evict_effort);
13249   }
13250 }
13251
13252 void PrimaryLogPG::agent_delay()
13253 {
13254   dout(20) << __func__ << dendl;
13255   if (agent_state && !agent_state->is_idle()) {
13256     assert(agent_state->delaying == false);
13257     agent_state->delaying = true;
13258     osd->agent_disable_pg(this, agent_state->evict_effort);
13259   }
13260 }
13261
13262 void PrimaryLogPG::agent_choose_mode_restart()
13263 {
13264   dout(20) << __func__ << dendl;
13265   lock();
13266   if (agent_state && agent_state->delaying) {
13267     agent_state->delaying = false;
13268     agent_choose_mode(true);
13269   }
13270   unlock();
13271 }
13272
13273 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13274 {
13275   bool requeued = false;
13276   // Let delay play out
13277   if (agent_state->delaying) {
13278     dout(20) << __func__ << this << " delaying, ignored" << dendl;
13279     return requeued;
13280   }
13281
13282   TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13283   TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13284   unsigned evict_effort = 0;
13285
13286   if (info.stats.stats_invalid) {
13287     // idle; stats can't be trusted until we scrub.
13288     dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13289     goto skip_calc;
13290   }
13291
13292   {
13293   uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13294   assert(divisor > 0);
13295
13296   // adjust (effective) user objects down based on the number
13297   // of HitSet objects, which should not count toward our total since
13298   // they cannot be flushed.
13299   uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13300
13301   // also exclude omap objects if ec backing pool
13302   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13303   assert(base_pool);
13304   if (!base_pool->supports_omap())
13305     unflushable += info.stats.stats.sum.num_objects_omap;
13306
13307   uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13308   if (num_user_objects > unflushable)
13309     num_user_objects -= unflushable;
13310   else
13311     num_user_objects = 0;
13312
13313   uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13314   uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13315   num_user_bytes -= unflushable_bytes;
13316   uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13317   num_user_bytes += num_overhead_bytes;
13318
13319   // also reduce the num_dirty by num_objects_omap
13320   int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13321   if (!base_pool->supports_omap()) {
13322     if (num_dirty > info.stats.stats.sum.num_objects_omap)
13323       num_dirty -= info.stats.stats.sum.num_objects_omap;
13324     else
13325       num_dirty = 0;
13326   }
13327
13328   dout(10) << __func__
13329            << " flush_mode: "
13330            << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13331            << " evict_mode: "
13332            << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13333            << " num_objects: " << info.stats.stats.sum.num_objects
13334            << " num_bytes: " << info.stats.stats.sum.num_bytes
13335            << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13336            << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13337            << " num_dirty: " << num_dirty
13338            << " num_user_objects: " << num_user_objects
13339            << " num_user_bytes: " << num_user_bytes
13340            << " num_overhead_bytes: " << num_overhead_bytes
13341            << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13342            << " pool.info.target_max_objects: " << pool.info.target_max_objects
13343            << dendl;
13344
13345   // get dirty, full ratios
13346   uint64_t dirty_micro = 0;
13347   uint64_t full_micro = 0;
13348   if (pool.info.target_max_bytes && num_user_objects > 0) {
13349     uint64_t avg_size = num_user_bytes / num_user_objects;
13350     dirty_micro =
13351       num_dirty * avg_size * 1000000 /
13352       MAX(pool.info.target_max_bytes / divisor, 1);
13353     full_micro =
13354       num_user_objects * avg_size * 1000000 /
13355       MAX(pool.info.target_max_bytes / divisor, 1);
13356   }
13357   if (pool.info.target_max_objects > 0) {
13358     uint64_t dirty_objects_micro =
13359       num_dirty * 1000000 /
13360       MAX(pool.info.target_max_objects / divisor, 1);
13361     if (dirty_objects_micro > dirty_micro)
13362       dirty_micro = dirty_objects_micro;
13363     uint64_t full_objects_micro =
13364       num_user_objects * 1000000 /
13365       MAX(pool.info.target_max_objects / divisor, 1);
13366     if (full_objects_micro > full_micro)
13367       full_micro = full_objects_micro;
13368   }
13369   dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13370            << " full " << ((float)full_micro / 1000000.0)
13371            << dendl;
13372
13373   // flush mode
13374   uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13375   uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13376   uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13377   if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13378     flush_target += flush_slop;
13379     flush_high_target += flush_slop;
13380   } else {
13381     flush_target -= MIN(flush_target, flush_slop);
13382     flush_high_target -= MIN(flush_high_target, flush_slop);
13383   }
13384
13385   if (dirty_micro > flush_high_target) {
13386     flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13387   } else if (dirty_micro > flush_target) {
13388     flush_mode = TierAgentState::FLUSH_MODE_LOW;
13389   }
13390
13391   // evict mode
13392   uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13393   uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13394   if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13395     evict_target += evict_slop;
13396   else
13397     evict_target -= MIN(evict_target, evict_slop);
13398
13399   if (full_micro > 1000000) {
13400     // evict anything clean
13401     evict_mode = TierAgentState::EVICT_MODE_FULL;
13402     evict_effort = 1000000;
13403   } else if (full_micro > evict_target) {
13404     // set effort in [0..1] range based on where we are between
13405     evict_mode = TierAgentState::EVICT_MODE_SOME;
13406     uint64_t over = full_micro - evict_target;
13407     uint64_t span  = 1000000 - evict_target;
13408     evict_effort = MAX(over * 1000000 / span,
13409                        (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13410
13411     // quantize effort to avoid too much reordering in the agent_queue.
13412     uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13413     assert(inc > 0);
13414     uint64_t was = evict_effort;
13415     evict_effort -= evict_effort % inc;
13416     if (evict_effort < inc)
13417       evict_effort = inc;
13418     assert(evict_effort >= inc && evict_effort <= 1000000);
13419     dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13420   }
13421   }
13422
13423   skip_calc:
13424   bool old_idle = agent_state->is_idle();
13425   if (flush_mode != agent_state->flush_mode) {
13426     dout(5) << __func__ << " flush_mode "
13427             << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13428             << " -> "
13429             << TierAgentState::get_flush_mode_name(flush_mode)
13430             << dendl;
13431     if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13432       osd->agent_inc_high_count();
13433       info.stats.stats.sum.num_flush_mode_high = 1;
13434     } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13435       info.stats.stats.sum.num_flush_mode_low = 1;
13436     }
13437     if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13438       osd->agent_dec_high_count();
13439       info.stats.stats.sum.num_flush_mode_high = 0;
13440     } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13441       info.stats.stats.sum.num_flush_mode_low = 0;
13442     }
13443     agent_state->flush_mode = flush_mode;
13444   }
13445   if (evict_mode != agent_state->evict_mode) {
13446     dout(5) << __func__ << " evict_mode "
13447             << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13448             << " -> "
13449             << TierAgentState::get_evict_mode_name(evict_mode)
13450             << dendl;
13451     if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13452         is_active()) {
13453       if (op)
13454         requeue_op(op);
13455       requeue_ops(waiting_for_active);
13456       requeue_ops(waiting_for_scrub);
13457       requeue_ops(waiting_for_cache_not_full);
13458       objects_blocked_on_cache_full.clear();
13459       requeued = true;
13460     }
13461     if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13462       info.stats.stats.sum.num_evict_mode_some = 1;
13463     } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13464       info.stats.stats.sum.num_evict_mode_full = 1;
13465     }
13466     if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13467       info.stats.stats.sum.num_evict_mode_some = 0;
13468     } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13469       info.stats.stats.sum.num_evict_mode_full = 0;
13470     }
13471     agent_state->evict_mode = evict_mode;
13472   }
13473   uint64_t old_effort = agent_state->evict_effort;
13474   if (evict_effort != agent_state->evict_effort) {
13475     dout(5) << __func__ << " evict_effort "
13476             << ((float)agent_state->evict_effort / 1000000.0)
13477             << " -> "
13478             << ((float)evict_effort / 1000000.0)
13479             << dendl;
13480     agent_state->evict_effort = evict_effort;
13481   }
13482
13483   // NOTE: we are using evict_effort as a proxy for *all* agent effort
13484   // (including flush).  This is probably fine (they should be
13485   // correlated) but it is not precisely correct.
13486   if (agent_state->is_idle()) {
13487     if (!restart && !old_idle) {
13488       osd->agent_disable_pg(this, old_effort);
13489     }
13490   } else {
13491     if (restart || old_idle) {
13492       osd->agent_enable_pg(this, agent_state->evict_effort);
13493     } else if (old_effort != agent_state->evict_effort) {
13494       osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13495     }
13496   }
13497   return requeued;
13498 }
13499
13500 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13501 {
13502   assert(hit_set);
13503   assert(temp);
13504   *temp = 0;
13505   if (hit_set->contains(oid))
13506     *temp = 1000000;
13507   unsigned i = 0;
13508   int last_n = pool.info.hit_set_search_last_n;
13509   for (map<time_t,HitSetRef>::reverse_iterator p =
13510        agent_state->hit_set_map.rbegin(); last_n > 0 &&
13511        p != agent_state->hit_set_map.rend(); ++p, ++i) {
13512     if (p->second->contains(oid)) {
13513       *temp += pool.info.get_grade(i);
13514       --last_n;
13515     }
13516   }
13517 }
13518
13519 // Dup op detection
13520
13521 bool PrimaryLogPG::already_complete(eversion_t v)
13522 {
13523   dout(20) << __func__ << ": " << v << dendl;
13524   for (xlist<RepGather*>::iterator i = repop_queue.begin();
13525        !i.end();
13526        ++i) {
13527     dout(20) << __func__ << ": " << **i << dendl;
13528     // skip copy from temp object ops
13529     if ((*i)->v == eversion_t()) {
13530       dout(20) << __func__ << ": " << **i
13531                << " version is empty" << dendl;
13532       continue;
13533     }
13534     if ((*i)->v > v) {
13535       dout(20) << __func__ << ": " << **i
13536                << " (*i)->v past v" << dendl;
13537       break;
13538     }
13539     if (!(*i)->all_committed) {
13540       dout(20) << __func__ << ": " << **i
13541                << " not committed, returning false"
13542                << dendl;
13543       return false;
13544     }
13545   }
13546   dout(20) << __func__ << ": returning true" << dendl;
13547   return true;
13548 }
13549
13550 bool PrimaryLogPG::already_ack(eversion_t v)
13551 {
13552   dout(20) << __func__ << ": " << v << dendl;
13553   for (xlist<RepGather*>::iterator i = repop_queue.begin();
13554        !i.end();
13555        ++i) {
13556     // skip copy from temp object ops
13557     if ((*i)->v == eversion_t()) {
13558       dout(20) << __func__ << ": " << **i
13559                << " version is empty" << dendl;
13560       continue;
13561     }
13562     if ((*i)->v > v) {
13563       dout(20) << __func__ << ": " << **i
13564                << " (*i)->v past v" << dendl;
13565       break;
13566     }
13567     if (!(*i)->all_applied) {
13568       dout(20) << __func__ << ": " << **i
13569                << " not applied, returning false"
13570                << dendl;
13571       return false;
13572     }
13573   }
13574   dout(20) << __func__ << ": returning true" << dendl;
13575   return true;
13576 }
13577
13578
13579 // ==========================================================================================
13580 // SCRUB
13581
13582
13583 bool PrimaryLogPG::_range_available_for_scrub(
13584   const hobject_t &begin, const hobject_t &end)
13585 {
13586   pair<hobject_t, ObjectContextRef> next;
13587   next.second = object_contexts.lookup(begin);
13588   next.first = begin;
13589   bool more = true;
13590   while (more && next.first < end) {
13591     if (next.second && next.second->is_blocked()) {
13592       next.second->requeue_scrub_on_unblock = true;
13593       dout(10) << __func__ << ": scrub delayed, "
13594                << next.first << " is blocked"
13595                << dendl;
13596       return false;
13597     }
13598     more = object_contexts.get_next(next.first, &next);
13599   }
13600   return true;
13601 }
13602
13603 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13604                          const vector<snapid_t>::reverse_iterator &curclone) {
13605     return snapset && curclone != snapset.get().clones.rend();
13606 }
13607
13608 void PrimaryLogPG::log_missing(unsigned missing,
13609                         const boost::optional<hobject_t> &head,
13610                         LogChannelRef clog,
13611                         const spg_t &pgid,
13612                         const char *func,
13613                         const char *mode,
13614                         bool allow_incomplete_clones)
13615 {
13616   assert(head);
13617   if (allow_incomplete_clones) {
13618     dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13619                << " skipped " << missing << " clone(s) in cache tier" << dendl;
13620   } else {
13621     clog->info() << mode << " " << pgid << " " << head.get()
13622                        << " " << missing << " missing clone(s)";
13623   }
13624 }
13625
13626 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13627   const boost::optional<SnapSet> &snapset,
13628   LogChannelRef clog,
13629   const spg_t &pgid,
13630   const char *mode,
13631   bool allow_incomplete_clones,
13632   boost::optional<snapid_t> target,
13633   vector<snapid_t>::reverse_iterator *curclone,
13634   inconsistent_snapset_wrapper &e)
13635 {
13636   assert(head);
13637   assert(snapset);
13638   unsigned missing = 0;
13639
13640   // NOTE: clones are in descending order, thus **curclone > target test here
13641   hobject_t next_clone(head.get());
13642   while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13643     ++missing;
13644     // it is okay to be missing one or more clones in a cache tier.
13645     // skip higher-numbered clones in the list.
13646     if (!allow_incomplete_clones) {
13647       next_clone.snap = **curclone;
13648       clog->error() << mode << " " << pgid << " " << head.get()
13649                          << " expected clone " << next_clone << " " << missing
13650                          << " missing";
13651       ++scrubber.shallow_errors;
13652       e.set_clone_missing(next_clone.snap);
13653     }
13654     // Clones are descending
13655     ++(*curclone);
13656   }
13657   return missing;
13658 }
13659
13660 /*
13661  * Validate consistency of the object info and snap sets.
13662  *
13663  * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13664  * the comparison of the objects is against multiple snapset.clones. There are
13665  * multiple clone lists and in between lists we expect head or snapdir.
13666  *
13667  * Example
13668  *
13669  * objects              expected
13670  * =======              =======
13671  * obj1 snap 1          head/snapdir, unexpected obj1 snap 1
13672  * obj2 head            head/snapdir, head ok
13673  *              [SnapSet clones 6 4 2 1]
13674  * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
13675  * obj2 snap 6          obj2 snap 6, match
13676  * obj2 snap 4          obj2 snap 4, match
13677  * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13678  *              [Snapset clones 3 1]
13679  * obj3 snap 3          obj3 snap 3 match
13680  * obj3 snap 1          obj3 snap 1 match
13681  * obj4 snapdir         head/snapdir, snapdir ok
13682  *              [Snapset clones 4]
13683  * EOL                  obj4 snap 4, (expected)
13684  */
13685 void PrimaryLogPG::scrub_snapshot_metadata(
13686   ScrubMap &scrubmap,
13687   const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13688 {
13689   dout(10) << __func__ << dendl;
13690
13691   coll_t c(info.pgid);
13692   bool repair = state_test(PG_STATE_REPAIR);
13693   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13694   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13695   boost::optional<snapid_t> all_clones;   // Unspecified snapid_t or boost::none
13696
13697   /// snapsets to repair
13698   map<hobject_t,SnapSet> snapset_to_repair;
13699
13700   // traverse in reverse order.
13701   boost::optional<hobject_t> head;
13702   boost::optional<SnapSet> snapset; // If initialized so will head (above)
13703   vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13704   unsigned missing = 0;
13705   inconsistent_snapset_wrapper soid_error, head_error;
13706
13707   bufferlist last_data;
13708
13709   for (map<hobject_t,ScrubMap::object>::reverse_iterator
13710        p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13711     const hobject_t& soid = p->first;
13712     soid_error = inconsistent_snapset_wrapper{soid};
13713     object_stat_sum_t stat;
13714     boost::optional<object_info_t> oi;
13715
13716     if (!soid.is_snapdir())
13717       stat.num_objects++;
13718
13719     if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13720       stat.num_objects_hit_set_archive++;
13721
13722     if (soid.is_snap()) {
13723       // it's a clone
13724       stat.num_object_clones++;
13725     }
13726
13727     // basic checks.
13728     if (p->second.attrs.count(OI_ATTR) == 0) {
13729       oi = boost::none;
13730       osd->clog->error() << mode << " " << info.pgid << " " << soid
13731                         << " no '" << OI_ATTR << "' attr";
13732       ++scrubber.shallow_errors;
13733       soid_error.set_oi_attr_missing();
13734     } else {
13735       bufferlist bv;
13736       bv.push_back(p->second.attrs[OI_ATTR]);
13737       try {
13738         oi = object_info_t(); // Initialize optional<> before decode into it
13739         oi.get().decode(bv);
13740       } catch (buffer::error& e) {
13741         oi = boost::none;
13742         osd->clog->error() << mode << " " << info.pgid << " " << soid
13743                 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13744         ++scrubber.shallow_errors;
13745         soid_error.set_oi_attr_corrupted();
13746         soid_error.set_oi_attr_missing(); // Not available too
13747       }
13748     }
13749
13750     if (oi) {
13751       if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13752         osd->clog->error() << mode << " " << info.pgid << " " << soid
13753                            << " on disk size (" << p->second.size
13754                            << ") does not match object info size ("
13755                            << oi->size << ") adjusted for ondisk to ("
13756                            << pgbackend->be_get_ondisk_size(oi->size)
13757                            << ")";
13758         soid_error.set_size_mismatch();
13759         ++scrubber.shallow_errors;
13760       }
13761
13762       dout(20) << mode << "  " << soid << " " << oi.get() << dendl;
13763
13764       // A clone num_bytes will be added later when we have snapset
13765       if (!soid.is_snap()) {
13766         stat.num_bytes += oi->size;
13767       }
13768       if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13769         stat.num_bytes_hit_set_archive += oi->size;
13770
13771       if (!soid.is_snapdir()) {
13772         if (oi->is_dirty())
13773           ++stat.num_objects_dirty;
13774         if (oi->is_whiteout())
13775           ++stat.num_whiteouts;
13776         if (oi->is_omap())
13777           ++stat.num_objects_omap;
13778         if (oi->is_cache_pinned())
13779           ++stat.num_objects_pinned;
13780       }
13781     } else {
13782       // pessimistic assumption that this object might contain a
13783       // legacy SnapSet
13784       stat.num_legacy_snapsets++;
13785     }
13786
13787     // Check for any problems while processing clones
13788     if (doing_clones(snapset, curclone)) {
13789       boost::optional<snapid_t> target;
13790       // Expecting an object with snap for current head
13791       if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13792
13793         dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13794                  << soid << " while processing " << head.get() << dendl;
13795
13796         target = all_clones;
13797       } else {
13798         assert(soid.is_snap());
13799         target = soid.snap;
13800       }
13801
13802       // Log any clones we were expecting to be there up to target
13803       // This will set missing, but will be a no-op if snap.soid == *curclone.
13804       missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13805                         pool.info.allow_incomplete_clones(), target, &curclone,
13806                         head_error);
13807     }
13808     bool expected;
13809     // Check doing_clones() again in case we ran process_clones_to()
13810     if (doing_clones(snapset, curclone)) {
13811       // A head/snapdir would have processed all clones above
13812       // or all greater than *curclone.
13813       assert(soid.is_snap() && *curclone <= soid.snap);
13814
13815       // After processing above clone snap should match the expected curclone
13816       expected = (*curclone == soid.snap);
13817     } else {
13818       // If we aren't doing clones any longer, then expecting head/snapdir
13819       expected = soid.has_snapset();
13820     }
13821     if (!expected) {
13822       // If we couldn't read the head's snapset, just ignore clones
13823       if (head && !snapset) {
13824         osd->clog->error() << mode << " " << info.pgid << " " << soid
13825                           << " clone ignored due to missing snapset";
13826       } else {
13827         osd->clog->error() << mode << " " << info.pgid << " " << soid
13828                            << " is an unexpected clone";
13829       }
13830       ++scrubber.shallow_errors;
13831       soid_error.set_headless();
13832       scrubber.store->add_snap_error(pool.id, soid_error);
13833       if (head && soid.get_head() == head->get_head())
13834         head_error.set_clone(soid.snap);
13835       continue;
13836     }
13837
13838     // new snapset?
13839     if (soid.has_snapset()) {
13840
13841       if (missing) {
13842         log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13843                     pool.info.allow_incomplete_clones());
13844       }
13845
13846       // Save previous head error information
13847       if (head && head_error.errors)
13848         scrubber.store->add_snap_error(pool.id, head_error);
13849       // Set this as a new head object
13850       head = soid;
13851       missing = 0;
13852       head_error = soid_error;
13853
13854       dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13855
13856       if (p->second.attrs.count(SS_ATTR) == 0) {
13857         osd->clog->error() << mode << " " << info.pgid << " " << soid
13858                           << " no '" << SS_ATTR << "' attr";
13859         ++scrubber.shallow_errors;
13860         snapset = boost::none;
13861         head_error.set_ss_attr_missing();
13862       } else {
13863         bufferlist bl;
13864         bl.push_back(p->second.attrs[SS_ATTR]);
13865         bufferlist::iterator blp = bl.begin();
13866         try {
13867           snapset = SnapSet(); // Initialize optional<> before decoding into it
13868           ::decode(snapset.get(), blp);
13869         } catch (buffer::error& e) {
13870           snapset = boost::none;
13871           osd->clog->error() << mode << " " << info.pgid << " " << soid
13872                 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13873           ++scrubber.shallow_errors;
13874           head_error.set_ss_attr_corrupted();
13875         }
13876       }
13877
13878       if (snapset) {
13879         // what will be next?
13880         curclone = snapset->clones.rbegin();
13881
13882         if (!snapset->clones.empty()) {
13883           dout(20) << "  snapset " << snapset.get() << dendl;
13884           if (snapset->seq == 0) {
13885             osd->clog->error() << mode << " " << info.pgid << " " << soid
13886                                << " snaps.seq not set";
13887             ++scrubber.shallow_errors;
13888             head_error.set_snapset_mismatch();
13889           }
13890         }
13891
13892         if (soid.is_head() && !snapset->head_exists) {
13893           osd->clog->error() << mode << " " << info.pgid << " " << soid
13894                           << " snapset.head_exists=false, but head exists";
13895           ++scrubber.shallow_errors;
13896           head_error.set_head_mismatch();
13897           // Fix head_exists locally so is_legacy() returns correctly
13898           snapset->head_exists = true;
13899         }
13900         if (soid.is_snapdir() && snapset->head_exists) {
13901           osd->clog->error() << mode << " " << info.pgid << " " << soid
13902                           << " snapset.head_exists=true, but snapdir exists";
13903           ++scrubber.shallow_errors;
13904           head_error.set_head_mismatch();
13905           // For symmetry fix this too, but probably doesn't matter
13906           snapset->head_exists = false;
13907         }
13908
13909         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13910           if (soid.is_snapdir()) {
13911             dout(10) << " will move snapset to head from " << soid << dendl;
13912             snapset_to_repair[soid.get_head()] = *snapset;
13913           } else if (snapset->is_legacy()) {
13914             dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13915                      << dendl;
13916             snapset_to_repair[soid.get_head()] = *snapset;
13917           }
13918         } else {
13919           stat.num_legacy_snapsets++;
13920         }
13921       } else {
13922         // pessimistic assumption that this object might contain a
13923         // legacy SnapSet
13924         stat.num_legacy_snapsets++;
13925       }
13926     } else {
13927       assert(soid.is_snap());
13928       assert(head);
13929       assert(snapset);
13930       assert(soid.snap == *curclone);
13931
13932       dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13933
13934       if (snapset->clone_size.count(soid.snap) == 0) {
13935         osd->clog->error() << mode << " " << info.pgid << " " << soid
13936                            << " is missing in clone_size";
13937         ++scrubber.shallow_errors;
13938         soid_error.set_size_mismatch();
13939       } else {
13940         if (oi && oi->size != snapset->clone_size[soid.snap]) {
13941           osd->clog->error() << mode << " " << info.pgid << " " << soid
13942                              << " size " << oi->size << " != clone_size "
13943                              << snapset->clone_size[*curclone];
13944           ++scrubber.shallow_errors;
13945           soid_error.set_size_mismatch();
13946         }
13947
13948         if (snapset->clone_overlap.count(soid.snap) == 0) {
13949           osd->clog->error() << mode << " " << info.pgid << " " << soid
13950                              << " is missing in clone_overlap";
13951           ++scrubber.shallow_errors;
13952           soid_error.set_size_mismatch();
13953         } else {
13954           // This checking is based on get_clone_bytes().  The first 2 asserts
13955           // can't happen because we know we have a clone_size and
13956           // a clone_overlap.  Now we check that the interval_set won't
13957           // cause the last assert.
13958           uint64_t size = snapset->clone_size.find(soid.snap)->second;
13959           const interval_set<uint64_t> &overlap =
13960                 snapset->clone_overlap.find(soid.snap)->second;
13961           bool bad_interval_set = false;
13962           for (interval_set<uint64_t>::const_iterator i = overlap.begin();
13963                i != overlap.end(); ++i) {
13964             if (size < i.get_len()) {
13965               bad_interval_set = true;
13966               break;
13967             }
13968             size -= i.get_len();
13969           }
13970
13971           if (bad_interval_set) {
13972             osd->clog->error() << mode << " " << info.pgid << " " << soid
13973                                << " bad interval_set in clone_overlap";
13974             ++scrubber.shallow_errors;
13975             soid_error.set_size_mismatch();
13976           } else {
13977             stat.num_bytes += snapset->get_clone_bytes(soid.snap);
13978           }
13979         }
13980       }
13981
13982       // migrate legacy_snaps to snapset?
13983       auto p = snapset_to_repair.find(soid.get_head());
13984       if (p != snapset_to_repair.end()) {
13985         if (!oi || oi->legacy_snaps.empty()) {
13986           osd->clog->error() << mode << " " << info.pgid << " " << soid
13987                              << " has no oi or legacy_snaps; cannot convert "
13988                              << *snapset;
13989           ++scrubber.shallow_errors;
13990         } else {
13991           dout(20) << __func__ << "   copying legacy_snaps " << oi->legacy_snaps
13992                    << " to snapset " << p->second << dendl;
13993           p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
13994         }
13995       }
13996
13997       // what's next?
13998       ++curclone;
13999       if (soid_error.errors)
14000         scrubber.store->add_snap_error(pool.id, soid_error);
14001     }
14002
14003     scrub_cstat.add(stat);
14004   }
14005
14006   if (doing_clones(snapset, curclone)) {
14007     dout(10) << __func__ << " " << mode << " " << info.pgid
14008              << " No more objects while processing " << head.get() << dendl;
14009
14010     missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14011                       pool.info.allow_incomplete_clones(), all_clones, &curclone,
14012                       head_error);
14013   }
14014   // There could be missing found by the test above or even
14015   // before dropping out of the loop for the last head.
14016   if (missing) {
14017     log_missing(missing, head, osd->clog, info.pgid, __func__,
14018                 mode, pool.info.allow_incomplete_clones());
14019   }
14020   if (head && head_error.errors)
14021     scrubber.store->add_snap_error(pool.id, head_error);
14022
14023   for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
14024          missing_digest.begin();
14025        p != missing_digest.end();
14026        ++p) {
14027     if (p->first.is_snapdir())
14028       continue;
14029     dout(10) << __func__ << " recording digests for " << p->first << dendl;
14030     ObjectContextRef obc = get_object_context(p->first, false);
14031     if (!obc) {
14032       osd->clog->error() << info.pgid << " " << mode
14033                          << " cannot get object context for object "
14034                          << p->first;
14035       continue;
14036     } else if (obc->obs.oi.soid != p->first) {
14037       osd->clog->error() << info.pgid << " " << mode
14038                          << " object " << p->first
14039                          << " has a valid oi attr with a mismatched name, "
14040                          << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14041       continue;
14042     }
14043     OpContextUPtr ctx = simple_opc_create(obc);
14044     ctx->at_version = get_next_version();
14045     ctx->mtime = utime_t();      // do not update mtime
14046     ctx->new_obs.oi.set_data_digest(p->second.first);
14047     ctx->new_obs.oi.set_omap_digest(p->second.second);
14048     finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14049
14050     ctx->register_on_success(
14051       [this]() {
14052         dout(20) << "updating scrub digest" << dendl;
14053         if (--scrubber.num_digest_updates_pending == 0) {
14054           requeue_scrub();
14055         }
14056       });
14057
14058     simple_opc_submit(std::move(ctx));
14059     ++scrubber.num_digest_updates_pending;
14060   }
14061   for (auto& p : snapset_to_repair) {
14062     // cache pools may not have the clones, which means we won't know
14063     // what snaps they have.  fake out the clone_snaps entries anyway (with
14064     // blank snap lists).
14065     p.second.head_exists = true;
14066     if (pool.info.allow_incomplete_clones()) {
14067       for (auto s : p.second.clones) {
14068         if (p.second.clone_snaps.count(s) == 0) {
14069           dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14070                    << s << dendl;
14071           p.second.clone_snaps[s];
14072         }
14073       }
14074     }
14075     if (p.second.clones.size() != p.second.clone_snaps.size() ||
14076         p.second.is_legacy()) {
14077       // this happens if we encounter other errors above, like a missing
14078       // or extra clone.
14079       dout(10) << __func__ << " not writing snapset to " << p.first
14080                << " snapset " << p.second << " clones " << p.second.clones
14081                << "; didn't convert fully" << dendl;
14082       scrub_cstat.sum.num_legacy_snapsets++;
14083       continue;
14084     }
14085     dout(10) << __func__ << " writing snapset to " << p.first
14086              << " " << p.second << dendl;
14087     ObjectContextRef obc = get_object_context(p.first, true);
14088     if (!obc) {
14089       osd->clog->error() << info.pgid << " " << mode
14090                          << " cannot get object context for object "
14091                          << p.first;
14092       continue;
14093     } else if (obc->obs.oi.soid != p.first) {
14094       osd->clog->error() << info.pgid << " " << mode
14095                          << " object " << p.first
14096                          << " has a valid oi attr with a mismatched name, "
14097                          << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14098       continue;
14099     }
14100     ObjectContextRef snapset_obc;
14101     if (!obc->obs.exists) {
14102       snapset_obc = get_object_context(p.first.get_snapdir(), false);
14103       if (!snapset_obc) {
14104         osd->clog->error() << info.pgid << " " << mode
14105                            << " cannot get object context for "
14106                            << p.first.get_snapdir();
14107         continue;
14108       }
14109     }
14110     OpContextUPtr ctx = simple_opc_create(obc);
14111     PGTransaction *t = ctx->op_t.get();
14112     ctx->snapset_obc = snapset_obc;
14113     ctx->at_version = get_next_version();
14114     ctx->mtime = utime_t();      // do not update mtime
14115     ctx->new_snapset = p.second;
14116     if (!ctx->new_obs.exists) {
14117       dout(20) << __func__ << "   making " << p.first << " a whiteout" << dendl;
14118       ctx->new_obs.exists = true;
14119       ctx->new_snapset.head_exists = true;
14120       ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14121       ++ctx->delta_stats.num_whiteouts;
14122       ++ctx->delta_stats.num_objects;
14123       t->create(p.first);
14124       if (p.first < scrubber.start) {
14125         dout(20) << __func__ << " kludging around update outside of scrub range"
14126                  << dendl;
14127       } else {
14128         scrub_cstat.add(ctx->delta_stats);
14129       }
14130     }
14131     dout(20) << __func__ << "   final snapset " << ctx->new_snapset << dendl;
14132     assert(!ctx->new_snapset.is_legacy());
14133     finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14134     ctx->register_on_success(
14135       [this]() {
14136         dout(20) << "updating snapset" << dendl;
14137         if (--scrubber.num_digest_updates_pending == 0) {
14138           requeue_scrub();
14139         }
14140       });
14141
14142     simple_opc_submit(std::move(ctx));
14143     ++scrubber.num_digest_updates_pending;
14144   }
14145
14146   dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14147 }
14148
14149 void PrimaryLogPG::_scrub_clear_state()
14150 {
14151   scrub_cstat = object_stat_collection_t();
14152 }
14153
14154 void PrimaryLogPG::_scrub_finish()
14155 {
14156   bool repair = state_test(PG_STATE_REPAIR);
14157   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14158   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14159
14160   if (info.stats.stats_invalid) {
14161     info.stats.stats = scrub_cstat;
14162     info.stats.stats_invalid = false;
14163
14164     if (agent_state)
14165       agent_choose_mode();
14166   }
14167
14168   dout(10) << mode << " got "
14169            << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14170            << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14171            << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14172            << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14173            << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14174            << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14175            << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14176            << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14177            << dendl;
14178
14179   if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14180       scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14181       (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14182        !info.stats.dirty_stats_invalid) ||
14183       (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14184        !info.stats.omap_stats_invalid) ||
14185       (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14186        !info.stats.pin_stats_invalid) ||
14187       (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14188        !info.stats.hitset_stats_invalid) ||
14189       (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14190        !info.stats.hitset_bytes_stats_invalid) ||
14191       scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14192       scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14193     osd->clog->error() << info.pgid << " " << mode
14194                       << " stat mismatch, got "
14195                       << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14196                       << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14197                       << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14198                       << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14199                       << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14200                       << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14201                       << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14202                       << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14203                       << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14204     ++scrubber.shallow_errors;
14205
14206     if (repair) {
14207       ++scrubber.fixed;
14208       info.stats.stats = scrub_cstat;
14209       info.stats.dirty_stats_invalid = false;
14210       info.stats.omap_stats_invalid = false;
14211       info.stats.hitset_stats_invalid = false;
14212       info.stats.hitset_bytes_stats_invalid = false;
14213       publish_stats_to_osd();
14214       share_pg_info();
14215     }
14216   } else if (scrub_cstat.sum.num_legacy_snapsets !=
14217              info.stats.stats.sum.num_legacy_snapsets) {
14218     osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14219                       << " from " << info.stats.stats.sum.num_legacy_snapsets
14220                       << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14221     info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14222     publish_stats_to_osd();
14223     share_pg_info();
14224   }
14225   // Clear object context cache to get repair information
14226   if (repair)
14227     object_contexts.clear();
14228 }
14229
14230 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14231 {
14232     return osd->check_osdmap_full(missing_on);
14233 }
14234
14235 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14236 {
14237   // Only supports replicated pools
14238   assert(!pool.info.require_rollback());
14239   assert(is_primary());
14240
14241   dout(10) << __func__ << " " << soid
14242            << " peers osd.{" << actingbackfill << "}" << dendl;
14243
14244   if (!is_clean()) {
14245     block_for_clean(soid, op);
14246     return -EAGAIN;
14247   }
14248
14249   assert(!pg_log.get_missing().is_missing(soid));
14250   bufferlist bv;
14251   object_info_t oi;
14252   eversion_t v;
14253   int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14254   if (r < 0) {
14255     // Leave v and try to repair without a version, getting attr failed
14256     dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14257             << soid << " error=" << r << dendl;
14258   } else try {
14259     bufferlist::iterator bliter = bv.begin();
14260     ::decode(oi, bliter);
14261     v = oi.version;
14262   } catch (...) {
14263     // Leave v as default constructed. This will fail when sent to older OSDs, but
14264     // not much worse than failing here.
14265     dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14266   }
14267
14268   missing_loc.add_missing(soid, v, eversion_t());
14269   if (primary_error(soid, v)) {
14270     dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14271     // XXX: If we knew that there is no down osd which could include this
14272     // object, it would be nice if we could return EIO here.
14273     // If a "never fail" flag was available, that could be used
14274     // for rbd to NOT return EIO until object marked lost.
14275
14276     // Drop through to save this op in case an osd comes up with the object.
14277   }
14278
14279   // Restart the op after object becomes readable again
14280   waiting_for_unreadable_object[soid].push_back(op);
14281   op->mark_delayed("waiting for missing object");
14282
14283   if (!eio_errors_to_process) {
14284     eio_errors_to_process = true;
14285     assert(is_clean());
14286     queue_peering_event(
14287         CephPeeringEvtRef(
14288           std::make_shared<CephPeeringEvt>(
14289           get_osdmap()->get_epoch(),
14290           get_osdmap()->get_epoch(),
14291           DoRecovery())));
14292   } else {
14293     // A prior error must have already cleared clean state and queued recovery
14294     // or a map change has triggered re-peering.
14295     // Not inlining the recovery by calling maybe_kick_recovery(soid);
14296     dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
14297   }
14298
14299   return -EAGAIN;
14300 }
14301
/*---SnapTrimmer Logging---*/
// Redefine the log prefix for the code that follows: each log line starts
// with pg->gen_prefix(), so an identifier named `pg` must be in scope wherever
// this definition of dout_prefix is expanded.
#undef dout_prefix
#define dout_prefix *_dout << pg->gen_prefix()
14305
14306 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14307 {
14308   ldout(pg->cct, 20) << "enter " << state_name << dendl;
14309 }
14310
14311 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14312 {
14313   ldout(pg->cct, 20) << "exit " << state_name << dendl;
14314 }
14315
/*---SnapTrimmer states---*/
// Redefine the log prefix for the state classes that follow: the PG's
// gen_prefix() (reached through the SnapTrimmer context) followed by
// "SnapTrimmer state<NAME>: ", where NAME comes from get_state_name().
// Both context<>() and get_state_name() must be callable at the expansion
// site.
#undef dout_prefix
#define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
                     << "SnapTrimmer state<" << get_state_name() << ">: ")
14320
14321 /* NotTrimming */
14322 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14323   : my_base(ctx),
14324     NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14325 {
14326   context< SnapTrimmer >().log_enter(state_name);
14327 }
14328
14329 void PrimaryLogPG::NotTrimming::exit()
14330 {
14331   context< SnapTrimmer >().log_exit(state_name, enter_time);
14332 }
14333
14334 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14335 {
14336   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14337   ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14338
14339   if (!(pg->is_primary() && pg->is_active())) {
14340     ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14341     return discard_event();
14342   }
14343   if (!pg->is_clean() ||
14344       pg->snap_trimq.empty()) {
14345     ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14346     return discard_event();
14347   }
14348   if (pg->scrubber.active) {
14349     ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14350     return transit< WaitScrub >();
14351   } else {
14352     return transit< Trimming >();
14353   }
14354 }
14355
14356 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14357 {
14358   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14359   ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14360
14361   pending = nullptr;
14362   if (!context< SnapTrimmer >().can_trim()) {
14363     post_event(KickTrim());
14364     return transit< NotTrimming >();
14365   }
14366
14367   context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14368   ldout(pg->cct, 10) << "NotTrimming: trimming "
14369                      << pg->snap_trimq.range_start()
14370                      << dendl;
14371   return transit< AwaitAsyncWork >();
14372 }
14373
14374 /* AwaitAsyncWork */
14375 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14376   : my_base(ctx),
14377     NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14378 {
14379   auto *pg = context< SnapTrimmer >().pg;
14380   context< SnapTrimmer >().log_enter(state_name);
14381   context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
14382   pg->state_set(PG_STATE_SNAPTRIM);
14383   pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14384   pg->publish_stats_to_osd();
14385 }
14386
14387 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14388 {
14389   PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14390   snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14391   auto &in_flight = context<Trimming>().in_flight;
14392   assert(in_flight.empty());
14393
14394   assert(pg->is_primary() && pg->is_active());
14395   if (!context< SnapTrimmer >().can_trim()) {
14396     ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14397     post_event(KickTrim());
14398     return transit< NotTrimming >();
14399   }
14400
14401   ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14402
14403   vector<hobject_t> to_trim;
14404   unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14405   to_trim.reserve(max);
14406   int r = pg->snap_mapper.get_next_objects_to_trim(
14407     snap_to_trim,
14408     max,
14409     &to_trim);
14410   if (r != 0 && r != -ENOENT) {
14411     lderr(pg->cct) << "get_next_objects_to_trim returned "
14412                    << cpp_strerror(r) << dendl;
14413     assert(0 == "get_next_objects_to_trim returned an invalid code");
14414   } else if (r == -ENOENT) {
14415     // Done!
14416     ldout(pg->cct, 10) << "got ENOENT" << dendl;
14417
14418     ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14419                        << " to purged_snaps"
14420                        << dendl;
14421     pg->info.purged_snaps.insert(snap_to_trim);
14422     pg->snap_trimq.erase(snap_to_trim);
14423     ldout(pg->cct, 10) << "purged_snaps now "
14424                        << pg->info.purged_snaps << ", snap_trimq now "
14425                        << pg->snap_trimq << dendl;
14426
14427     ObjectStore::Transaction t;
14428     pg->dirty_big_info = true;
14429     pg->write_if_dirty(t);
14430     int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14431     assert(tr == 0);
14432
14433     pg->share_pg_info();
14434     post_event(KickTrim());
14435     return transit< NotTrimming >();
14436   }
14437   assert(!to_trim.empty());
14438
14439   for (auto &&object: to_trim) {
14440     // Get next
14441     ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14442     OpContextUPtr ctx;
14443     int error = pg->trim_object(in_flight.empty(), object, &ctx);
14444     if (error) {
14445       if (error == -ENOLCK) {
14446         ldout(pg->cct, 10) << "could not get write lock on obj "
14447                            << object << dendl;
14448       } else {
14449         pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14450         ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14451       }
14452       if (!in_flight.empty()) {
14453         ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14454         return transit< WaitRepops >();
14455       }
14456       if (error == -ENOLCK) {
14457         ldout(pg->cct, 10) << "waiting for it to clear"
14458                            << dendl;
14459         return transit< WaitRWLock >();
14460       } else {
14461         return transit< NotTrimming >();
14462       }
14463     }
14464
14465     in_flight.insert(object);
14466     ctx->register_on_success(
14467       [pg, object, &in_flight]() {
14468         assert(in_flight.find(object) != in_flight.end());
14469         in_flight.erase(object);
14470         if (in_flight.empty()) {
14471           if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14472             pg->snap_trimmer_machine.process_event(Reset());
14473           } else {
14474             pg->snap_trimmer_machine.process_event(RepopsComplete());
14475           }
14476         }
14477       });
14478
14479     pg->simple_opc_submit(std::move(ctx));
14480   }
14481
14482   return transit< WaitRepops >();
14483 }
14484
14485 void PrimaryLogPG::setattr_maybe_cache(
14486   ObjectContextRef obc,
14487   OpContext *op,
14488   PGTransaction *t,
14489   const string &key,
14490   bufferlist &val)
14491 {
14492   t->setattr(obc->obs.oi.soid, key, val);
14493 }
14494
14495 void PrimaryLogPG::setattrs_maybe_cache(
14496   ObjectContextRef obc,
14497   OpContext *op,
14498   PGTransaction *t,
14499   map<string, bufferlist> &attrs)
14500 {
14501   t->setattrs(obc->obs.oi.soid, attrs);
14502 }
14503
14504 void PrimaryLogPG::rmattr_maybe_cache(
14505   ObjectContextRef obc,
14506   OpContext *op,
14507   PGTransaction *t,
14508   const string &key)
14509 {
14510   t->rmattr(obc->obs.oi.soid, key);
14511 }
14512
14513 int PrimaryLogPG::getattr_maybe_cache(
14514   ObjectContextRef obc,
14515   const string &key,
14516   bufferlist *val)
14517 {
14518   if (pool.info.require_rollback()) {
14519     map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14520     if (i != obc->attr_cache.end()) {
14521       if (val)
14522         *val = i->second;
14523       return 0;
14524     } else {
14525       return -ENODATA;
14526     }
14527   }
14528   return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14529 }
14530
14531 int PrimaryLogPG::getattrs_maybe_cache(
14532   ObjectContextRef obc,
14533   map<string, bufferlist> *out,
14534   bool user_only)
14535 {
14536   int r = 0;
14537   if (pool.info.require_rollback()) {
14538     if (out)
14539       *out = obc->attr_cache;
14540   } else {
14541     r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14542   }
14543   if (out && user_only) {
14544     map<string, bufferlist> tmp;
14545     for (map<string, bufferlist>::iterator i = out->begin();
14546          i != out->end();
14547          ++i) {
14548       if (i->first.size() > 1 && i->first[0] == '_')
14549         tmp[i->first.substr(1, i->first.size())].claim(i->second);
14550     }
14551     tmp.swap(*out);
14552   }
14553   return r;
14554 }
14555
14556 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14557     return osd->check_failsafe_full(ss);
14558 }
14559
14560 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14561 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14562
#ifdef PG_DEBUG_REFS
// Free-function forwarders to the PG's id-tracked get/put, compiled only
// when PG_DEBUG_REFS is defined.
uint64_t get_with_id(PrimaryLogPG *pg)
{
  return pg->get_with_id();
}

void put_with_id(PrimaryLogPG *pg, uint64_t id)
{
  pg->put_with_id(id);
}
#endif
14567
14568 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14569 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }