ceph/src/osd/PrimaryLogPG.cc

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
   8  *
   9  * Author: Loic Dachary <loic@dachary.org>
  10  *
  11  * This is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License version 2.1, as published by the Free Software
  14  * Foundation.  See file COPYING.
  15  *
  16  */
  17
  18 #include "boost/tuple/tuple.hpp"
  19 #include "boost/intrusive_ptr.hpp"
  20 #include "PG.h"
  21 #include "PrimaryLogPG.h"
  22 #include "OSD.h"
  23 #include "OpRequest.h"
  24 #include "ScrubStore.h"
  25 #include "Session.h"
  26 #include "objclass/objclass.h"
  27
  28 #include "common/errno.h"
  29 #include "common/scrub_types.h"
  30 #include "common/perf_counters.h"
  31
  32 #include "messages/MOSDOp.h"
  33 #include "messages/MOSDBackoff.h"
  34 #include "messages/MOSDSubOp.h"
  35 #include "messages/MOSDSubOpReply.h"
  36 #include "messages/MOSDPGTrim.h"
  37 #include "messages/MOSDPGScan.h"
  38 #include "messages/MOSDRepScrub.h"
  39 #include "messages/MOSDPGBackfill.h"
  40 #include "messages/MOSDPGBackfillRemove.h"
  41 #include "messages/MOSDPGUpdateLogMissing.h"
  42 #include "messages/MOSDPGUpdateLogMissingReply.h"
  43 #include "messages/MCommandReply.h"
  44 #include "messages/MOSDScrubReserve.h"
  45 #include "mds/inode_backtrace.h" // Ugh
  46 #include "common/EventTrace.h"
  47
  48 #include "common/config.h"
  49 #include "include/compat.h"
  50 #include "mon/MonClient.h"
  51 #include "osdc/Objecter.h"
  52 #include "json_spirit/json_spirit_value.h"
  53 #include "json_spirit/json_spirit_reader.h"
  54 #include "include/assert.h"  // json_spirit clobbers it
  55 #include "include/rados/rados_types.hpp"
  56
  57 #ifdef WITH_LTTNG
  58 #include "tracing/osd.h"
  59 #else
  60 #define tracepoint(...)
  61 #endif
  62
  63 #define dout_context cct
  64 #define dout_subsys ceph_subsys_osd
  65 #define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
  66 #undef dout_prefix
  67 #define dout_prefix _prefix(_dout, this)
  68 template <typename T>
  69 static ostream& _prefix(std::ostream *_dout, T *pg) {
  70   return *_dout << pg->gen_prefix();
  71 }
  72
  73
  74 #include <sstream>
  75 #include <utility>
  76
  77 #include <errno.h>
  78
// Mempool object-factory definition for PrimaryLogPG.
// NOTE(review): the second argument still reads "replicatedpg" -- presumably
// a label kept from an earlier class name; confirm before changing it.
MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);
  80
  81 PGLSFilter::PGLSFilter() : cct(nullptr)
  82 {
  83 }
  84
  85 PGLSFilter::~PGLSFilter()
  86 {
  87 }
  88
  89 struct PrimaryLogPG::C_OSD_OnApplied : Context {
  90   PrimaryLogPGRef pg;
  91   epoch_t epoch;
  92   eversion_t v;
  93   C_OSD_OnApplied(
  94     PrimaryLogPGRef pg,
  95     epoch_t epoch,
  96     eversion_t v)
  97     : pg(pg), epoch(epoch), v(v) {}
  98   void finish(int) override {
  99     pg->lock();
 100     if (!pg->pg_has_reset_since(epoch))
 101       pg->op_applied(v);
 102     pg->unlock();
 103   }
 104 };
 105
 106 /**
 107  * The CopyCallback class defines an interface for completions to the
 108  * copy_start code. Users of the copy infrastructure must implement
 109  * one and give an instance of the class to start_copy.
 110  *
 111  * The implementer is responsible for making sure that the CopyCallback
 112  * can associate itself with the correct copy operation.
 113  */
 114 class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
 115 protected:
 116   CopyCallback() {}
 117   /**
 118    * results.get<0>() is the return code: 0 for success; -ECANCELED if
 119    * the operation was cancelled by the local OSD; -errno for other issues.
 120    * results.get<1>() is a pointer to a CopyResults object, which you are
 121    * responsible for deleting.
 122    */
 123   void finish(CopyCallbackResults results_) override = 0;
 124
 125 public:
 126   /// Provide the final size of the copied object to the CopyCallback
 127   ~CopyCallback() override {}
 128 };
 129
 130 template <typename T>
 131 class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
 132   PrimaryLogPGRef pg;
 133   unique_ptr<GenContext<T>> c;
 134   epoch_t e;
 135 public:
 136   BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
 137     : pg(pg), c(c), e(e) {}
 138   void finish(T t) override {
 139     pg->lock();
 140     if (pg->pg_has_reset_since(e))
 141       c.reset();
 142     else
 143       c.release()->complete(t);
 144     pg->unlock();
 145   }
 146 };
 147
 148 GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
 149   GenContext<ThreadPool::TPHandle&> *c) {
 150   return new BlessedGenContext<ThreadPool::TPHandle&>(
 151     this, c, get_osdmap()->get_epoch());
 152 }
 153
 154 class PrimaryLogPG::BlessedContext : public Context {
 155   PrimaryLogPGRef pg;
 156   unique_ptr<Context> c;
 157   epoch_t e;
 158 public:
 159   BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
 160     : pg(pg), c(c), e(e) {}
 161   void finish(int r) override {
 162     pg->lock();
 163     if (pg->pg_has_reset_since(e))
 164       c.reset();
 165     else
 166       c.release()->complete(r);
 167     pg->unlock();
 168   }
 169 };
 170
 171
 172 Context *PrimaryLogPG::bless_context(Context *c) {
 173   return new BlessedContext(this, c, get_osdmap()->get_epoch());
 174 }
 175
 176 class PrimaryLogPG::C_PG_ObjectContext : public Context {
 177   PrimaryLogPGRef pg;
 178   ObjectContext *obc;
 179   public:
 180   C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
 181     pg(p), obc(o) {}
 182   void finish(int r) override {
 183     pg->object_context_destructor_callback(obc);
 184   }
 185 };
 186
 187 class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
 188   ObjectContextRef obc, obc2, obc3;
 189   public:
 190   C_OSD_OndiskWriteUnlock(
 191     ObjectContextRef o,
 192     ObjectContextRef o2 = ObjectContextRef(),
 193     ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
 194   void finish(int r) override {
 195     obc->ondisk_write_unlock();
 196     if (obc2)
 197       obc2->ondisk_write_unlock();
 198     if (obc3)
 199       obc3->ondisk_write_unlock();
 200   }
 201 };
 202
 203 struct OnReadComplete : public Context {
 204   PrimaryLogPG *pg;
 205   PrimaryLogPG::OpContext *opcontext;
 206   OnReadComplete(
 207     PrimaryLogPG *pg,
 208     PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
 209   void finish(int r) override {
 210     opcontext->finish_read(pg);
 211   }
 212   ~OnReadComplete() override {}
 213 };
 214
 215 class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
 216   PrimaryLogPGRef pg;
 217   ObjectContextRef obc;
 218   public:
 219   C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
 220     pg(p), obc(o) {}
 221   void finish(int r) override {
 222     pg->_applied_recovered_object(obc);
 223   }
 224 };
 225
 226 class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
 227   PrimaryLogPGRef pg;
 228   epoch_t epoch;
 229   eversion_t last_complete;
 230   public:
 231   C_OSD_CommittedPushedObject(
 232     PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
 233     pg(p), epoch(epoch), last_complete(lc) {
 234   }
 235   void finish(int r) override {
 236     pg->_committed_pushed_object(epoch, last_complete);
 237   }
 238 };
 239
 240 class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
 241   PrimaryLogPGRef pg;
 242   public:
 243   explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
 244     pg(p) {}
 245   void finish(int r) override {
 246     pg->_applied_recovered_object_replica();
 247   }
 248 };
 249
 250 // OpContext
 251 void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
 252 {
 253   inflightreads = 1;
 254   list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
 255             pair<bufferlist*, Context*> > > in;
 256   in.swap(pending_async_reads);
 257   pg->pgbackend->objects_read_async(
 258     obc->obs.oi.soid,
 259     in,
 260     new OnReadComplete(pg, this), pg->get_pool().fast_read);
 261 }
 262 void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
 263 {
 264   assert(inflightreads > 0);
 265   --inflightreads;
 266   if (async_reads_complete()) {
 267     assert(pg->in_progress_async_reads.size());
 268     assert(pg->in_progress_async_reads.front().second == this);
 269     pg->in_progress_async_reads.pop_front();
 270
 271     // Restart the op context now that all reads have been
 272     // completed. Read failures will be handled by the op finisher
 273     pg->execute_ctx(this);
 274   }
 275 }
 276
 277 class CopyFromCallback : public PrimaryLogPG::CopyCallback {
 278 public:
 279   PrimaryLogPG::CopyResults *results = nullptr;
 280   PrimaryLogPG::OpContext *ctx;
 281   OSDOp &osd_op;
 282
 283   CopyFromCallback(PrimaryLogPG::OpContext *ctx, OSDOp &osd_op)
 284     : ctx(ctx), osd_op(osd_op) {
 285   }
 286   ~CopyFromCallback() override {}
 287
 288   void finish(PrimaryLogPG::CopyCallbackResults results_) override {
 289     results = results_.get<1>();
 290     int r = results_.get<0>();
 291
 292     // for finish_copyfrom
 293     ctx->user_at_version = results->user_version;
 294
 295     if (r >= 0) {
 296       ctx->pg->execute_ctx(ctx);
 297     } else {
 298       if (r != -ECANCELED) { // on cancel just toss it out; client resends
 299         if (ctx->op)
 300           ctx->pg->osd->reply_op_error(ctx->op, r);
 301       } else if (results->should_requeue) {
 302         if (ctx->op)
 303           ctx->pg->requeue_op(ctx->op);
 304       }
 305       ctx->pg->close_op_ctx(ctx);
 306     }
 307   }
 308
 309   bool is_temp_obj_used() {
 310     return results->started_temp_obj;
 311   }
 312   uint64_t get_data_size() {
 313     return results->object_size;
 314   }
 315 };
 316
 317 struct CopyFromFinisher : public PrimaryLogPG::OpFinisher {
 318   CopyFromCallback *copy_from_callback;
 319
 320   CopyFromFinisher(CopyFromCallback *copy_from_callback)
 321     : copy_from_callback(copy_from_callback) {
 322   }
 323
 324   int execute() override {
 325     // instance will be destructed after this method completes
 326     copy_from_callback->ctx->pg->finish_copyfrom(copy_from_callback);
 327     return 0;
 328   }
 329 };
 330
 331 // ======================
 332 // PGBackend::Listener
 333
/**
 * PGBackend::Listener hook: account for an object that has just been
 * recovered locally (or removed by recovery when @p is_delete is set).
 *
 * All on-disk changes and callbacks are added to transaction @p t: the
 * snap mapping for the object is rebuilt, the object_info attr is rewritten
 * when an old version satisfies a LOST_REVERT log entry, the log is rolled
 * forward if needed, and applied/commit completions are registered.
 *
 * @param hoid            object that was recovered
 * @param _recovery_info  recovery metadata; copied locally because its
 *                        version and object_info may be adjusted below
 * @param obc             object context; dereferenced unconditionally on
 *                        the primary when !is_delete
 * @param is_delete       true when recovery removed the object
 * @param t               transaction receiving the updates and callbacks
 */
void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  bool is_delete,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  // Work on a private copy: version / oi may be rewritten below.
  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  // For a recovered snap object, re-add its snaps to the snap mapper.
  if (!is_delete && recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  // We received a version older than the one the missing set asks for.
  // That is only acted upon when the latest log entry is a LOST_REVERT
  // back to exactly this version.
  if (!is_delete && pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      // Keep the cached attr in step with what the transaction writes.
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    if (!is_delete) {
      obc->obs.exists = true;
      obc->ondisk_write_lock();

      bool got = obc->get_recovery_read();
      assert(got);

      // The object must already be tracked as recovering; store its obc.
      assert(recovering.count(obc->obs.oi.soid));
      recovering[obc->obs.oi.soid] = obc;
      obc->obs.oi = recovery_info.oi;  // may have been updated above
      // Pairs with the ondisk_write_lock() taken above.
      t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));
    }

    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    if (!is_delete)
      missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    // If the object is no longer unreadable, requeue ops that waited on it.
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  // On commit, _committed_pushed_object() is called with the current
  // osdmap epoch and info.last_complete as captured here.
  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}
 448
 449 void PrimaryLogPG::on_global_recover(
 450   const hobject_t &soid,
 451   const object_stat_sum_t &stat_diff,
 452   bool is_delete)
 453 {
 454   info.stats.stats.sum.add(stat_diff);
 455   missing_loc.recovered(soid);
 456   publish_stats_to_osd();
 457   dout(10) << "pushed " << soid << " to all replicas" << dendl;
 458   map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
 459   assert(i != recovering.end());
 460
 461   if (!is_delete) {
 462     // recover missing won't have had an obc, but it gets filled in
 463     // during on_local_recover
 464     assert(i->second);
 465     list<OpRequestRef> requeue_list;
 466     i->second->drop_recovery_read(&requeue_list);
 467     requeue_ops(requeue_list);
 468   }
 469
 470   backfills_in_flight.erase(soid);
 471
 472   recovering.erase(i);
 473   finish_recovery_op(soid);
 474   release_backoffs(soid);
 475   auto degraded_object_entry = waiting_for_degraded_object.find(soid);
 476   if (degraded_object_entry != waiting_for_degraded_object.end()) {
 477     dout(20) << " kicking degraded waiters on " << soid << dendl;
 478     requeue_ops(degraded_object_entry->second);
 479     waiting_for_degraded_object.erase(degraded_object_entry);
 480   }
 481   auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
 482   if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
 483     dout(20) << " kicking unreadable waiters on " << soid << dendl;
 484     requeue_ops(unreadable_object_entry->second);
 485     waiting_for_unreadable_object.erase(unreadable_object_entry);
 486   }
 487   finish_degraded_object(soid);
 488 }
 489
 490 void PrimaryLogPG::on_peer_recover(
 491   pg_shard_t peer,
 492   const hobject_t &soid,
 493   const ObjectRecoveryInfo &recovery_info)
 494 {
 495   publish_stats_to_osd();
 496   // done!
 497   peer_missing[peer].got(soid, recovery_info.version);
 498 }
 499
 500 void PrimaryLogPG::begin_peer_recover(
 501   pg_shard_t peer,
 502   const hobject_t soid)
 503 {
 504   peer_missing[peer].revise_have(soid, eversion_t());
 505 }
 506
 507 void PrimaryLogPG::schedule_recovery_work(
 508   GenContext<ThreadPool::TPHandle&> *c)
 509 {
 510   osd->recovery_gen_wq.queue(c);
 511 }
 512
 513 void PrimaryLogPG::send_message_osd_cluster(
 514   int peer, Message *m, epoch_t from_epoch)
 515 {
 516   osd->send_message_osd_cluster(peer, m, from_epoch);
 517 }
 518
 519 void PrimaryLogPG::send_message_osd_cluster(
 520   Message *m, Connection *con)
 521 {
 522   osd->send_message_osd_cluster(m, con);
 523 }
 524
 525 void PrimaryLogPG::send_message_osd_cluster(
 526   Message *m, const ConnectionRef& con)
 527 {
 528   osd->send_message_osd_cluster(m, con);
 529 }
 530
 531 void PrimaryLogPG::on_primary_error(
 532   const hobject_t &oid,
 533   eversion_t v)
 534 {
 535   dout(0) << __func__ << ": oid " << oid << " version " << v << dendl;
 536   primary_failed(oid);
 537   primary_error(oid, v);
 538   backfills_in_flight.erase(oid);
 539   missing_loc.add_missing(oid, v, eversion_t());
 540 }
 541
 542 ConnectionRef PrimaryLogPG::get_con_osd_cluster(
 543   int peer, epoch_t from_epoch)
 544 {
 545   return osd->get_con_osd_cluster(peer, from_epoch);
 546 }
 547
 548 PerfCounters *PrimaryLogPG::get_logger()
 549 {
 550   return osd->logger;
 551 }
 552
 553
 554 // ====================
 555 // missing objects
 556
 557 bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
 558 {
 559   return pg_log.get_missing().get_items().count(soid);
 560 }
 561
 562 void PrimaryLogPG::maybe_kick_recovery(
 563   const hobject_t &soid)
 564 {
 565   eversion_t v;
 566   if (!missing_loc.needs_recovery(soid, &v))
 567     return;
 568
 569   map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
 570   if (p != recovering.end()) {
 571     dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
 572   } else if (missing_loc.is_unfound(soid)) {
 573     dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
 574   } else {
 575     dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
 576     PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
 577     if (is_missing_object(soid)) {
 578       recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
 579     } else if (missing_loc.is_deleted(soid)) {
 580       prep_object_replica_deletes(soid, v, h);
 581     } else {
 582       prep_object_replica_pushes(soid, v, h);
 583     }
 584     pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
 585   }
 586 }
 587
 588 void PrimaryLogPG::wait_for_unreadable_object(
 589   const hobject_t& soid, OpRequestRef op)
 590 {
 591   assert(is_unreadable_object(soid));
 592   maybe_kick_recovery(soid);
 593   waiting_for_unreadable_object[soid].push_back(op);
 594   op->mark_delayed("waiting for missing object");
 595 }
 596
 597 bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
 598 {
 599   /* The conditions below may clear (on_local_recover, before we queue
 600    * the transaction) before we actually requeue the degraded waiters
 601    * in on_global_recover after the transaction completes.
 602    */
 603   if (waiting_for_degraded_object.count(soid))
 604     return true;
 605   if (pg_log.get_missing().get_items().count(soid))
 606     return true;
 607   assert(!actingbackfill.empty());
 608   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
 609        i != actingbackfill.end();
 610        ++i) {
 611     if (*i == get_primary()) continue;
 612     pg_shard_t peer = *i;
 613     auto peer_missing_entry = peer_missing.find(peer);
 614     if (peer_missing_entry != peer_missing.end() &&
 615         peer_missing_entry->second.get_items().count(soid))
 616       return true;
 617
 618     // Object is degraded if after last_backfill AND
 619     // we are backfilling it
 620     if (is_backfill_targets(peer) &&
 621         peer_info[peer].last_backfill <= soid &&
 622         last_backfill_started >= soid &&
 623         backfills_in_flight.count(soid))
 624       return true;
 625   }
 626   return false;
 627 }
 628
 629 void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
 630 {
 631   assert(is_degraded_or_backfilling_object(soid));
 632
 633   maybe_kick_recovery(soid);
 634   waiting_for_degraded_object[soid].push_back(op);
 635   op->mark_delayed("waiting for degraded object");
 636 }
 637
 638 void PrimaryLogPG::block_write_on_full_cache(
 639   const hobject_t& _oid, OpRequestRef op)
 640 {
 641   const hobject_t oid = _oid.get_head();
 642   dout(20) << __func__ << ": blocking object " << oid
 643            << " on full cache" << dendl;
 644   objects_blocked_on_cache_full.insert(oid);
 645   waiting_for_cache_not_full.push_back(op);
 646   op->mark_delayed("waiting for cache not full");
 647 }
 648
 649 void PrimaryLogPG::block_for_clean(
 650   const hobject_t& oid, OpRequestRef op)
 651 {
 652   dout(20) << __func__ << ": blocking object " << oid
 653            << " on primary repair" << dendl;
 654   waiting_for_clean_to_primary_repair.push_back(op);
 655   op->mark_delayed("waiting for clean to repair");
 656 }
 657
 658 void PrimaryLogPG::block_write_on_snap_rollback(
 659   const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
 660 {
 661   dout(20) << __func__ << ": blocking object " << oid.get_head()
 662            << " on snap promotion " << obc->obs.oi.soid << dendl;
 663   // otherwise, we'd have blocked in do_op
 664   assert(oid.is_head());
 665   assert(objects_blocked_on_snap_promotion.count(oid) == 0);
 666   objects_blocked_on_snap_promotion[oid] = obc;
 667   wait_for_blocked_object(obc->obs.oi.soid, op);
 668 }
 669
 670 void PrimaryLogPG::block_write_on_degraded_snap(
 671   const hobject_t& snap, OpRequestRef op)
 672 {
 673   dout(20) << __func__ << ": blocking object " << snap.get_head()
 674            << " on degraded snap " << snap << dendl;
 675   // otherwise, we'd have blocked in do_op
 676   assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
 677   objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
 678   wait_for_degraded_object(snap, op);
 679 }
 680
 681 bool PrimaryLogPG::maybe_await_blocked_snapset(
 682   const hobject_t &hoid,
 683   OpRequestRef op)
 684 {
 685   ObjectContextRef obc;
 686   obc = object_contexts.lookup(hoid.get_head());
 687   if (obc) {
 688     if (obc->is_blocked()) {
 689       wait_for_blocked_object(obc->obs.oi.soid, op);
 690       return true;
 691     } else {
 692       return false;
 693     }
 694   }
 695   obc = object_contexts.lookup(hoid.get_snapdir());
 696   if (obc) {
 697     if (obc->is_blocked()) {
 698       wait_for_blocked_object(obc->obs.oi.soid, op);
 699       return true;
 700     } else {
 701       return false;
 702     }
 703   }
 704   return false;
 705 }
 706
 707 void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
 708 {
 709   dout(10) << __func__ << " " << soid << " " << op << dendl;
 710   waiting_for_blocked_object[soid].push_back(op);
 711   op->mark_delayed("waiting for blocked object");
 712 }
 713
 714 void PrimaryLogPG::maybe_force_recovery()
 715 {
 716   // no force if not in degraded/recovery/backfill stats
 717   if (!is_degraded() &&
 718       !state_test(PG_STATE_RECOVERING |
 719                   PG_STATE_RECOVERY_WAIT |
 720                   PG_STATE_BACKFILL |
 721                   PG_STATE_BACKFILL_WAIT |
 722                   PG_STATE_BACKFILL_TOOFULL))
 723     return;
 724
 725   if (pg_log.get_log().approx_size() <
 726       cct->_conf->osd_max_pg_log_entries *
 727         cct->_conf->osd_force_recovery_pg_log_entries_factor)
 728     return;
 729
 730   // find the oldest missing object
 731   version_t min_version = 0;
 732   hobject_t soid;
 733   if (!pg_log.get_missing().get_items().empty()) {
 734     min_version = pg_log.get_missing().get_rmissing().begin()->first;
 735     soid = pg_log.get_missing().get_rmissing().begin()->second;
 736   }
 737   assert(!actingbackfill.empty());
 738   for (set<pg_shard_t>::iterator it = actingbackfill.begin();
 739        it != actingbackfill.end();
 740        ++it) {
 741     if (*it == get_primary()) continue;
 742     pg_shard_t peer = *it;
 743     if (peer_missing.count(peer) &&
 744         !peer_missing[peer].get_items().empty() &&
 745         min_version > peer_missing[peer].get_rmissing().begin()->first) {
 746       min_version = peer_missing[peer].get_rmissing().begin()->first;
 747       soid = peer_missing[peer].get_rmissing().begin()->second;
 748     }
 749   }
 750
 751   // recover it
 752   if (soid != hobject_t())
 753     maybe_kick_recovery(soid);
 754 }
 755
 756 class PGLSPlainFilter : public PGLSFilter {
 757   string val;
 758 public:
 759   int init(bufferlist::iterator &params) override
 760   {
 761     try {
 762       ::decode(xattr, params);
 763       ::decode(val, params);
 764     } catch (buffer::error &e) {
 765       return -EINVAL;
 766     }
 767
 768     return 0;
 769   }
 770   ~PGLSPlainFilter() override {}
 771   bool filter(const hobject_t &obj, bufferlist& xattr_data,
 772                       bufferlist& outdata) override;
 773 };
 774
 775 class PGLSParentFilter : public PGLSFilter {
 776   inodeno_t parent_ino;
 777 public:
 778   CephContext* cct;
 779   PGLSParentFilter(CephContext* cct) : cct(cct) {
 780     xattr = "_parent";
 781   }
 782   int init(bufferlist::iterator &params) override
 783   {
 784     try {
 785       ::decode(parent_ino, params);
 786     } catch (buffer::error &e) {
 787       return -EINVAL;
 788     }
 789     generic_dout(0) << "parent_ino=" << parent_ino << dendl;
 790
 791     return 0;
 792   }
 793   ~PGLSParentFilter() override {}
 794   bool filter(const hobject_t &obj, bufferlist& xattr_data,
 795                       bufferlist& outdata) override;
 796 };
 797
 798 bool PGLSParentFilter::filter(const hobject_t &obj,
 799                               bufferlist& xattr_data, bufferlist& outdata)
 800 {
 801   bufferlist::iterator iter = xattr_data.begin();
 802   inode_backtrace_t bt;
 803
 804   generic_dout(0) << "PGLSParentFilter::filter" << dendl;
 805
 806   ::decode(bt, iter);
 807
 808   vector<inode_backpointer_t>::iterator vi;
 809   for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
 810     generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
 811     if (vi->dirino == parent_ino) {
 812       ::encode(*vi, outdata);
 813       return true;
 814     }
 815   }
 816
 817   return false;
 818 }
 819
 820 bool PGLSPlainFilter::filter(const hobject_t &obj,
 821                              bufferlist& xattr_data, bufferlist& outdata)
 822 {
 823   if (val.size() != xattr_data.length())
 824     return false;
 825
 826   if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
 827     return false;
 828
 829   return true;
 830 }
 831
 832 bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
 833 {
 834   bufferlist bl;
 835
 836   // If filter has expressed an interest in an xattr, load it.
 837   if (!filter->get_xattr().empty()) {
 838     int ret = pgbackend->objects_get_attr(
 839       sobj,
 840       filter->get_xattr(),
 841       &bl);
 842     dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
 843     if (ret < 0) {
 844       if (ret != -ENODATA || filter->reject_empty_xattr()) {
 845         return false;
 846       }
 847     }
 848   }
 849
 850   return filter->filter(sobj, bl, outdata);
 851 }
 852
 853 int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
 854 {
 855   string type;
 856   PGLSFilter *filter;
 857
 858   try {
 859     ::decode(type, iter);
 860   }
 861   catch (buffer::error& e) {
 862     return -EINVAL;
 863   }
 864
 865   if (type.compare("parent") == 0) {
 866     filter = new PGLSParentFilter(cct);
 867   } else if (type.compare("plain") == 0) {
 868     filter = new PGLSPlainFilter();
 869   } else {
 870     std::size_t dot = type.find(".");
 871     if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
 872       return -EINVAL;
 873     }
 874
 875     const std::string class_name = type.substr(0, dot);
 876     const std::string filter_name = type.substr(dot + 1);
 877     ClassHandler::ClassData *cls = NULL;
 878     int r = osd->class_handler->open_class(class_name, &cls);
 879     if (r != 0) {
 880       derr << "Error opening class '" << class_name << "': "
 881            << cpp_strerror(r) << dendl;
 882       if (r != -EPERM) // propogate permission error
 883         r = -EINVAL;
 884       return r;
 885     } else {
 886       assert(cls);
 887     }
 888
 889     ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
 890     if (class_filter == NULL) {
 891       derr << "Error finding filter '" << filter_name << "' in class "
 892            << class_name << dendl;
 893       return -EINVAL;
 894     }
 895     filter = class_filter->fn();
 896     if (!filter) {
 897       // Object classes are obliged to return us something, but let's
 898       // give an error rather than asserting out.
 899       derr << "Buggy class " << class_name << " failed to construct "
 900               "filter " << filter_name << dendl;
 901       return -EINVAL;
 902     }
 903   }
 904
 905   assert(filter);
 906   int r = filter->init(iter);
 907   if (r < 0) {
 908     derr << "Error initializing filter " << type << ": "
 909          << cpp_strerror(r) << dendl;
 910     delete filter;
 911     return -EINVAL;
 912   } else {
 913     // Successfully constructed and initialized, return it.
 914     *pfilter = filter;
 915     return 0;
 916   }
 917 }
 918
 919
 920 // ==========================================================
 921
 922 int PrimaryLogPG::do_command(
 923   cmdmap_t cmdmap,
 924   ostream& ss,
 925   bufferlist& idata,
 926   bufferlist& odata,
 927   ConnectionRef con,
 928   ceph_tid_t tid)
 929 {
 930   const auto &missing = pg_log.get_missing();
 931   string prefix;
 932   string format;
 933
 934   cmd_getval(cct, cmdmap, "format", format);
 935   boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));
 936
 937   string command;
 938   cmd_getval(cct, cmdmap, "cmd", command);
 939   if (command == "query") {
 940     f->open_object_section("pg");
 941     f->dump_string("state", pg_state_string(get_state()));
 942     f->dump_stream("snap_trimq") << snap_trimq;
 943     f->dump_unsigned("epoch", get_osdmap()->get_epoch());
 944     f->open_array_section("up");
 945     for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
 946       f->dump_unsigned("osd", *p);
 947     f->close_section();
 948     f->open_array_section("acting");
 949     for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
 950       f->dump_unsigned("osd", *p);
 951     f->close_section();
 952     if (!backfill_targets.empty()) {
 953       f->open_array_section("backfill_targets");
 954       for (set<pg_shard_t>::iterator p = backfill_targets.begin();
 955            p != backfill_targets.end();
 956            ++p)
 957         f->dump_stream("shard") << *p;
 958       f->close_section();
 959     }
 960     if (!actingbackfill.empty()) {
 961       f->open_array_section("actingbackfill");
 962       for (set<pg_shard_t>::iterator p = actingbackfill.begin();
 963            p != actingbackfill.end();
 964            ++p)
 965         f->dump_stream("shard") << *p;
 966       f->close_section();
 967     }
 968     f->open_object_section("info");
 969     _update_calc_stats();
 970     info.dump(f.get());
 971     f->close_section();
 972
 973     f->open_array_section("peer_info");
 974     for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
 975          p != peer_info.end();
 976          ++p) {
 977       f->open_object_section("info");
 978       f->dump_stream("peer") << p->first;
 979       p->second.dump(f.get());
 980       f->close_section();
 981     }
 982     f->close_section();
 983
 984     f->open_array_section("recovery_state");
 985     handle_query_state(f.get());
 986     f->close_section();
 987
 988     f->open_object_section("agent_state");
 989     if (agent_state)
 990       agent_state->dump(f.get());
 991     f->close_section();
 992
 993     f->close_section();
 994     f->flush(odata);
 995     return 0;
 996   }
 997   else if (command == "mark_unfound_lost") {
 998     string mulcmd;
 999     cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
1000     int mode = -1;
1001     if (mulcmd == "revert") {
1002       if (pool.info.ec_pool()) {
1003         ss << "mode must be 'delete' for ec pool";
1004         return -EINVAL;
1005       }
1006       mode = pg_log_entry_t::LOST_REVERT;
1007     } else if (mulcmd == "delete") {
1008       mode = pg_log_entry_t::LOST_DELETE;
1009     } else {
1010       ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
1011       return -EINVAL;
1012     }
1013     assert(mode == pg_log_entry_t::LOST_REVERT ||
1014            mode == pg_log_entry_t::LOST_DELETE);
1015
1016     if (!is_primary()) {
1017       ss << "not primary";
1018       return -EROFS;
1019     }
1020
1021     uint64_t unfound = missing_loc.num_unfound();
1022     if (!unfound) {
1023       ss << "pg has no unfound objects";
1024       return 0;  // make command idempotent
1025     }
1026
1027     if (!all_unfound_are_queried_or_lost(get_osdmap())) {
1028       ss << "pg has " << unfound
1029          << " unfound objects but we haven't probed all sources, not marking lost";
1030       return -EINVAL;
1031     }
1032
1033     mark_all_unfound_lost(mode, con, tid);
1034     return -EAGAIN;
1035   }
1036   else if (command == "list_missing") {
1037     hobject_t offset;
1038     string offset_json;
1039     if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
1040       json_spirit::Value v;
1041       try {
1042         if (!json_spirit::read(offset_json, v))
1043           throw std::runtime_error("bad json");
1044         offset.decode(v);
1045       } catch (std::runtime_error& e) {
1046         ss << "error parsing offset: " << e.what();
1047         return -EINVAL;
1048       }
1049     }
1050     f->open_object_section("missing");
1051     {
1052       f->open_object_section("offset");
1053       offset.dump(f.get());
1054       f->close_section();
1055     }
1056     f->dump_int("num_missing", missing.num_missing());
1057     f->dump_int("num_unfound", get_num_unfound());
1058     const map<hobject_t, pg_missing_item> &needs_recovery_map =
1059       missing_loc.get_needs_recovery();
1060     map<hobject_t, pg_missing_item>::const_iterator p =
1061       needs_recovery_map.upper_bound(offset);
1062     {
1063       f->open_array_section("objects");
1064       int32_t num = 0;
1065       for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
1066         if (missing_loc.is_unfound(p->first)) {
1067           f->open_object_section("object");
1068           {
1069             f->open_object_section("oid");
1070             p->first.dump(f.get());
1071             f->close_section();
1072           }
1073           p->second.dump(f.get()); // have, need keys
1074           {
1075             f->open_array_section("locations");
1076             for (set<pg_shard_t>::iterator r =
1077                 missing_loc.get_locations(p->first).begin();
1078                 r != missing_loc.get_locations(p->first).end();
1079                 ++r)
1080               f->dump_stream("shard") << *r;
1081             f->close_section();
1082           }
1083           f->close_section();
1084           num++;
1085         }
1086       }
1087       f->close_section();
1088     }
1089     f->dump_bool("more", p != needs_recovery_map.end());
1090     f->close_section();
1091     f->flush(odata);
1092     return 0;
1093   }
1094
1095   ss << "unknown pg command " << prefix;
1096   return -EINVAL;
1097 }
1098
1099 // ==========================================================
1100
/**
 * Handle a client MOSDOp whose ops are PG-level operations.
 *
 * The ops handled are object listing (PGNLS / PGLS, optionally preceded by
 * a *_FILTER op that installs a PGLSFilter), hit set listing and retrieval,
 * and scrub error listing.  Ops are processed in order and processing stops
 * at the first op that produces a negative result.
 *
 * A single MOSDOpReply carrying the last result and the ops' outdata is
 * sent to the client, except when a requested archived hit set object is
 * unreadable: then the op is handed to wait_for_unreadable_object() and
 * the function returns without replying.
 *
 * @param op the request; its message must be a CEPH_MSG_OSD_OP
 */
void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  // cname/mname are decoded from *_FILTER ops but not otherwise used here.
  string cname, mname;
  // Owned by this function; deleted on every exit path below.
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  // Local copy of the op vector; its outdata is handed to the reply via
  // claim_op_out_data(ops) at the end.
  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      // Replace any filter installed by an earlier op in this message.
      if (filter) {
        delete filter;
        filter = NULL;
      }
      // The filter description is decoded from the remainder of indata.
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      // Listing is only permitted at the head (no snapshot).
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        // Cap the number of entries returned at osd_max_pgls.
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        // The client-supplied handle is where listing resumes.
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        // A default-constructed handle and a max handle are exempt from
        // the respective bound checks.
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        // NOTE(review): presumably flushes queued transactions so the
        // listing below observes them -- confirm.
        osr->flush();
        // list_size is passed for both count arguments; the backend fills
        // sentries and next.
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        // Merge the backend listing with the missing set in sorted order,
        // so objects present in either one are considered exactly once.
        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        // Sentinel standing in for an exhausted iterator.
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          // Pick the smaller head and advance whichever iterator(s)
          // supplied it.
          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
            << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          // Stop once the candidate reaches the backend's next position.
          if (candidate >= next) {
            break;
          }

          // Response is full: resume from this candidate next time.
          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          // skip anything that is not a head object
          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
               candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
            << candidate.get_hash()
            << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
            << std::dec << " "
            << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        // Both sources are exhausted and the backend reported max as next:
        // the listing of this PG is complete (result is set to 1).
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        // With a filter installed, its output is appended after the response.
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      // Replace any filter installed by an earlier op in this message.
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      // Same flow as PGNLS above, but: no PG bounds check on the handle,
      // no hit set namespace skip, no all_nspaces wildcard, and the
      // response handle is always set to next.
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        // snapid was already checked to be CEPH_NOSNAP above, so the first
        // operand holds on this path.
        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        // Sorted merge of the backend listing and the missing set.
        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          // Response is full: resume from this candidate next time.
          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (missing_loc.is_deleted(candidate))
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        // Listing complete: result is set to 1.
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        // Return the (begin, end) stamps of every archived hit set, plus
        // the in-progress one (with a default-constructed end) if present.
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result= -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          // On success the result is the encoded length.
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          // Find the archived interval that contains the requested stamp.
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          // oid still default-constructed: no interval matched.
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          // The op is queued for retry; no reply is sent from here, so the
          // filter must be released before returning.
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

   case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    // Stop at the first failing op; its error becomes the reply result.
    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}
1514
1515 int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
1516 {
1517   if (m->get_pg() != info.pgid.pgid) {
1518     dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
1519     return -EINVAL; // hmm?
1520   }
1521   auto bp = osd_op->indata.begin();
1522   scrub_ls_arg_t arg;
1523   try {
1524     arg.decode(bp);
1525   } catch (buffer::error&) {
1526     dout(10) << " corrupted scrub_ls_arg_t" << dendl;
1527     return -EINVAL;
1528   }
1529   int r = 0;
1530   scrub_ls_result_t result = {.interval = info.history.same_interval_since};
1531   if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
1532     r = -EAGAIN;
1533   } else if (!scrubber.store) {
1534     r = -ENOENT;
1535   } else if (arg.get_snapsets) {
1536     result.vals = scrubber.store->get_snap_errors(osd->store,
1537                                                   get_pgid().pool(),
1538                                                   arg.start_after,
1539                                                   arg.max_return);
1540   } else {
1541     result.vals = scrubber.store->get_object_errors(osd->store,
1542                                                     get_pgid().pool(),
1543                                                     arg.start_after,
1544                                                     arg.max_return);
1545   }
1546   ::encode(result, osd_op->outdata);
1547   return r;
1548 }
1549
1550 void PrimaryLogPG::calc_trim_to()
1551 {
1552   size_t target = cct->_conf->osd_min_pg_log_entries;
1553   if (is_degraded() ||
1554       state_test(PG_STATE_RECOVERING |
1555                  PG_STATE_RECOVERY_WAIT |
1556                  PG_STATE_BACKFILL |
1557                  PG_STATE_BACKFILL_WAIT |
1558                  PG_STATE_BACKFILL_TOOFULL)) {
1559     target = cct->_conf->osd_max_pg_log_entries;
1560   }
1561
1562   eversion_t limit = MIN(
1563     min_last_complete_ondisk,
1564     pg_log.get_can_rollback_to());
1565   if (limit != eversion_t() &&
1566       limit != pg_trim_to &&
1567       pg_log.get_log().approx_size() > target) {
1568     size_t num_to_trim = pg_log.get_log().approx_size() - target;
1569     if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
1570       return;
1571     }
1572     list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
1573     eversion_t new_trim_to;
1574     for (size_t i = 0; i < num_to_trim; ++i) {
1575       new_trim_to = it->version;
1576       ++it;
1577       if (new_trim_to > limit) {
1578         new_trim_to = limit;
1579         dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
1580         break;
1581       }
1582     }
1583     dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
1584     pg_trim_to = new_trim_to;
1585     assert(pg_trim_to <= pg_log.get_head());
1586     assert(pg_trim_to <= min_last_complete_ondisk);
1587   }
1588 }
1589
/**
 * Construct a PrimaryLogPG.
 *
 * Builds the PG backend for the pool via PGBackend::build_pg_backend(),
 * sizes the object context cache from osd_pg_object_context_cache_count,
 * installs the backend's readable/recoverable predicates on missing_loc and
 * initiates the snap trimmer state machine.
 *
 * @param o      owning OSD service
 * @param curmap current OSD map
 * @param _pool  pool this PG belongs to
 * @param p      id of this PG (also used to form its collection)
 */
PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  // missing_loc asks the backend whether an object is readable/recoverable.
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}
1607
1608 void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
1609 {
1610   src_oloc = oloc;
1611   if (oloc.key.empty())
1612     src_oloc.key = oid.name;
1613 }
1614
1615 void PrimaryLogPG::handle_backoff(OpRequestRef& op)
1616 {
1617   const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
1618   SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
1619   if (!session)
1620     return;  // drop it.
1621   session->put();  // get_priv takes a ref, and so does the SessionRef
1622   hobject_t begin = info.pgid.pgid.get_hobj_start();
1623   hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1624   if (begin < m->begin) {
1625     begin = m->begin;
1626   }
1627   if (end > m->end) {
1628     end = m->end;
1629   }
1630   dout(10) << __func__ << " backoff ack id " << m->id
1631            << " [" << begin << "," << end << ")" << dendl;
1632   session->ack_backoff(cct, m->pgid, m->id, begin, end);
1633 }
1634
1635 void PrimaryLogPG::do_request(
1636   OpRequestRef& op,
1637   ThreadPool::TPHandle &handle)
1638 {
1639   if (op->osd_trace) {
1640     op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
1641     op->pg_trace.event("do request");
1642   }
1643   // make sure we have a new enough map
1644   auto p = waiting_for_map.find(op->get_source());
1645   if (p != waiting_for_map.end()) {
1646     // preserve ordering
1647     dout(20) << __func__ << " waiting_for_map "
1648              << p->first << " not empty, queueing" << dendl;
1649     p->second.push_back(op);
1650     op->mark_delayed("waiting_for_map not empty");
1651     return;
1652   }
1653   if (!have_same_or_newer_map(op->min_epoch)) {
1654     dout(20) << __func__ << " min " << op->min_epoch
1655              << ", queue on waiting_for_map " << op->get_source() << dendl;
1656     waiting_for_map[op->get_source()].push_back(op);
1657     op->mark_delayed("op must wait for map");
1658     return;
1659   }
1660
1661   if (can_discard_request(op)) {
1662     return;
1663   }
1664
1665   // pg-wide backoffs
1666   const Message *m = op->get_req();
1667   if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
1668     SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
1669     if (!session)
1670       return;  // drop it.
1671     session->put();  // get_priv takes a ref, and so does the SessionRef
1672
1673     if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
1674       if (session->check_backoff(cct, info.pgid,
1675                                  info.pgid.pgid.get_hobj_start(), m)) {
1676         return;
1677       }
1678
1679       bool backoff =
1680         is_down() ||
1681         is_incomplete() ||
1682         (!is_active() && is_peered());
1683       if (g_conf->osd_backoff_on_peering && !backoff) {
1684         if (is_peering()) {
1685           backoff = true;
1686         }
1687       }
1688       if (backoff) {
1689         add_pg_backoff(session);
1690         return;
1691       }
1692     }
1693     // pg backoff acks at pg-level
1694     if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
1695       const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
1696       if (ba->begin != ba->end) {
1697         handle_backoff(op);
1698         return;
1699       }
1700     }
1701   }
1702
1703   if (flushes_in_progress > 0) {
1704     dout(20) << flushes_in_progress
1705              << " flushes_in_progress pending "
1706              << "waiting for active on " << op << dendl;
1707     waiting_for_peered.push_back(op);
1708     op->mark_delayed("waiting for peered");
1709     return;
1710   }
1711
1712   if (!is_peered()) {
1713     // Delay unless PGBackend says it's ok
1714     if (pgbackend->can_handle_while_inactive(op)) {
1715       bool handled = pgbackend->handle_message(op);
1716       assert(handled);
1717       return;
1718     } else {
1719       waiting_for_peered.push_back(op);
1720       op->mark_delayed("waiting for peered");
1721       return;
1722     }
1723   }
1724
1725   assert(is_peered() && flushes_in_progress == 0);
1726   if (pgbackend->handle_message(op))
1727     return;
1728
1729   switch (op->get_req()->get_type()) {
1730   case CEPH_MSG_OSD_OP:
1731   case CEPH_MSG_OSD_BACKOFF:
1732     if (!is_active()) {
1733       dout(20) << " peered, not active, waiting for active on " << op << dendl;
1734       waiting_for_active.push_back(op);
1735       op->mark_delayed("waiting for active");
1736       return;
1737     }
1738     switch (op->get_req()->get_type()) {
1739     case CEPH_MSG_OSD_OP:
1740       // verify client features
1741       if ((pool.info.has_tiers() || pool.info.is_tier()) &&
1742           !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
1743         osd->reply_op_error(op, -EOPNOTSUPP);
1744         return;
1745       }
1746       do_op(op);
1747       break;
1748     case CEPH_MSG_OSD_BACKOFF:
1749       // object-level backoff acks handled in osdop context
1750       handle_backoff(op);
1751       break;
1752     }
1753     break;
1754
1755   case MSG_OSD_SUBOP:
1756     do_sub_op(op);
1757     break;
1758
1759   case MSG_OSD_SUBOPREPLY:
1760     do_sub_op_reply(op);
1761     break;
1762
1763   case MSG_OSD_PG_SCAN:
1764     do_scan(op, handle);
1765     break;
1766
1767   case MSG_OSD_PG_BACKFILL:
1768     do_backfill(op);
1769     break;
1770
1771   case MSG_OSD_PG_BACKFILL_REMOVE:
1772     do_backfill_remove(op);
1773     break;
1774
1775   case MSG_OSD_SCRUB_RESERVE:
1776     {
1777       const MOSDScrubReserve *m =
1778         static_cast<const MOSDScrubReserve*>(op->get_req());
1779       switch (m->type) {
1780       case MOSDScrubReserve::REQUEST:
1781         handle_scrub_reserve_request(op);
1782         break;
1783       case MOSDScrubReserve::GRANT:
1784         handle_scrub_reserve_grant(op, m->from);
1785         break;
1786       case MOSDScrubReserve::REJECT:
1787         handle_scrub_reserve_reject(op, m->from);
1788         break;
1789       case MOSDScrubReserve::RELEASE:
1790         handle_scrub_reserve_release(op);
1791         break;
1792       }
1793     }
1794     break;
1795
1796   case MSG_OSD_REP_SCRUB:
1797     replica_scrub(op, handle);
1798     break;
1799
1800   case MSG_OSD_REP_SCRUBMAP:
1801     do_replica_scrub_map(op);
1802     break;
1803
1804   case MSG_OSD_PG_UPDATE_LOG_MISSING:
1805     do_update_log_missing(op);
1806     break;
1807
1808   case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
1809     do_update_log_missing_reply(op);
1810     break;
1811
1812   default:
1813     assert(0 == "bad message type in do_request");
1814   }
1815 }
1816
1817 hobject_t PrimaryLogPG::earliest_backfill() const
1818 {
1819   hobject_t e = hobject_t::get_max();
1820   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
1821        i != backfill_targets.end();
1822        ++i) {
1823     pg_shard_t bt = *i;
1824     map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
1825     assert(iter != peer_info.end());
1826     if (iter->second.last_backfill < e)
1827       e = iter->second.last_backfill;
1828   }
1829   return e;
1830 }
1831
1832 /** do_op - do an op
1833  * pg lock will be held (if multithreaded)
1834  * osd_lock NOT held.
      *
      * Front door for a client CEPH_MSG_OSD_OP (the message type is asserted
      * below).  The function is a long sequence of gates; each gate either
      *   - replies to the client with an error,
      *   - returns without replying (the op is dropped),
      *   - parks the op on a wait list / installs a backoff, or
      *   - hands the op to another handler (do_pg_op, manifest/cache handling)
      * and then returns.  An op that passes every gate gets an OpContext and
      * is run through execute_ctx(); afterwards the prepare-latency counters
      * are updated and maybe_force_recovery() is called.
      *
      * @param op  the request being processed; its message is accessed through
      *            a non-const pointer, see the NOTE at the top of the body.
1835  */
1836 void PrimaryLogPG::do_op(OpRequestRef& op)
1837 {
1838   FUNCTRACE();
1839   // NOTE: take a non-const pointer here; we must be careful not to
1840   // change anything that will break other reads on m (operator<<).
1841   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1842   assert(m->get_type() == CEPH_MSG_OSD_OP);
       // When finish_decode() returns true, refresh the TrackedOp description
       // and clear the message payload.
1843   if (m->finish_decode()) {
1844     op->reset_desc();   // for TrackedOp
1845     m->clear_payload();
1846   }
1847
1848   dout(20) << __func__ << ": op " << *m << dendl;
1849
       // Most of the checks below are made against the head object, i.e. the
       // request's hobject with snap forced to CEPH_NOSNAP.
1850   hobject_t head = m->get_hobj();
1851   head.snap = CEPH_NOSNAP;
1852
       // The object must hash into this PG.  If it does not, complain in the
       // debug log and the cluster log and drop the op without a reply
       // (or assert when osd_debug_misdirected_ops is set).
1853   if (!info.pgid.pgid.contains(
1854         info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1855     derr << __func__ << " " << info.pgid.pgid << " does not contain "
1856          << head << " pg_num " << pool.info.get_pg_num() << " hash "
1857          << std::hex << head.get_hash() << std::dec << dendl;
1858     osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1859                       << " op " << *m;
1860     assert(!cct->_conf->osd_debug_misdirected_ops);
1861     return;
1862   }
1863
       // Backoff-capable clients need a Session.  Without one the op is
       // dropped; it is also dropped when check_backoff() returns true.
1864   bool can_backoff =
1865     m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1866   SessionRef session;
1867   if (can_backoff) {
1868     session = static_cast<Session*>(m->get_connection()->get_priv());
1869     if (!session.get()) {
1870       dout(10) << __func__ << " no session" << dendl;
1871       return;
1872     }
1873     session->put();  // get_priv() takes a ref, and so does the intrusive_ptr
1874
1875     if (session->check_backoff(cct, info.pgid, head, m)) {
1876       return;
1877     }
1878   }
1879
1880   if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1881     // not implemented.
1882     dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1883     osd->reply_op_error(op, -EINVAL);
1884     return;
1885   }
1886
       // rmw_flags == 0 is treated as "flags not computed yet"; compute them
       // now and fail the op if that reports an error.
1887   if (op->rmw_flags == 0) {
1888     int r = osd->osd->init_op_flags(op);
1889     if (r) {
1890       osd->reply_op_error(op, r);
1891       return;
1892     }
1893   }
1894
       // Role check: pure reads flagged BALANCE_READS/LOCALIZE_READS may be
       // served by a replica, everything else requires the primary.
1895   if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1896                          CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1897       op->may_read() &&
1898       !(op->may_write() || op->may_cache())) {
1899     // balanced reads; any replica will do
1900     if (!(is_primary() || is_replica())) {
1901       osd->handle_misdirected_op(this, op);
1902       return;
1903     }
1904   } else {
1905     // normal case; must be primary
1906     if (!is_primary()) {
1907       osd->handle_misdirected_op(this, op);
1908       return;
1909     }
1910   }
1911
1912   if (!op_has_sufficient_caps(op)) {
1913     osd->reply_op_error(op, -EPERM);
1914     return;
1915   }
1916
       // PG-level ops are handled entirely by do_pg_op().
1917   if (op->includes_pg_op()) {
1918     return do_pg_op(op);
1919   }
1920
1921   // object name too long?
1922   if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1923     dout(4) << "do_op name is longer than "
1924             << cct->_conf->osd_max_object_name_len
1925             << " bytes" << dendl;
1926     osd->reply_op_error(op, -ENAMETOOLONG);
1927     return;
1928   }
       // the locator key is checked against the same limit as the object name
1929   if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1930     dout(4) << "do_op locator is longer than "
1931             << cct->_conf->osd_max_object_name_len
1932             << " bytes" << dendl;
1933     osd->reply_op_error(op, -ENAMETOOLONG);
1934     return;
1935   }
1936   if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1937     dout(4) << "do_op namespace is longer than "
1938             << cct->_conf->osd_max_object_namespace_len
1939             << " bytes" << dendl;
1940     osd->reply_op_error(op, -ENAMETOOLONG);
1941     return;
1942   }
1943
       // Let the backing object store reject keys it reports as invalid; its
       // non-zero return value is sent back to the client as the error.
1944   if (int r = osd->store->validate_hobject_key(head)) {
1945     dout(4) << "do_op object " << head << " invalid for backing store: "
1946             << r << dendl;
1947     osd->reply_op_error(op, r);
1948     return;
1949   }
1950
1951   // blacklisted?
1952   if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1953     dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1954     osd->reply_op_error(op, -EBLACKLISTED);
1955     return;
1956   }
1957
1958   // order this op as a write?
1959   bool write_ordered = op->rwordered();
1960
1961   // discard due to cluster full transition?  (we discard any op that
1962   // originates before the cluster or pool is marked full; the client
1963   // will resend after the full flag is removed or if they expect the
1964   // op to succeed despite being full).  The exception is FULL_FORCE and
1965   // FULL_TRY ops, which there is no reason to discard because they
1966   // bypass all full checks anyway.  If this op isn't write or
1967   // read-ordered, we skip.
1968   // FIXME: we exclude mds writes for now.
1969   if (write_ordered && !(m->get_source().is_mds() ||
1970                          m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1971                          m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1972       info.history.last_epoch_marked_full > m->get_map_epoch()) {
1973     dout(10) << __func__ << " discarding op sent before full " << m << " "
1974              << *m << dendl;
1975     return;
1976   }
1977   // mds should have stopped writing before this point.
1978   // We can't allow OSD to become non-startable even if mds
1979   // could be writing as part of file removals.
1980   ostringstream ss;
1981   if (write_ordered && osd->check_failsafe_full(ss)) {
1982     dout(10) << __func__ << " fail-safe full check failed, dropping request"
1983              << ss.str()
1984              << dendl;
1985     return;
1986   }
       // Extra validation that only applies to ops that may write.
1987   int64_t poolid = get_pgid().pool();
1988   if (op->may_write()) {
1989
1990     const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
         // no pool entry in the current osdmap: drop the op without a reply
1991     if (!pi) {
1992       return;
1993     }
1994
1995     // invalid?
1996     if (m->get_snapid() != CEPH_NOSNAP) {
1997       dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1998       osd->reply_op_error(op, -EINVAL);
1999       return;
2000     }
2001
2002     // too big?
         // (a zero osd_max_write_size disables the check; the configured value
         // is shifted left by 20 before being compared with the data length)
2003     if (cct->_conf->osd_max_write_size &&
2004         m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
2005       // journal can't hold commit!
2006       derr << "do_op msg data len " << m->get_data_len()
2007            << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
2008            << " on " << *m << dendl;
2009       osd->reply_op_error(op, -OSD_WRITETOOBIG);
2010       return;
2011     }
2012   }
2013
2014   dout(10) << "do_op " << *m
2015            << (op->may_write() ? " may_write" : "")
2016            << (op->may_read() ? " may_read" : "")
2017            << (op->may_cache() ? " may_cache" : "")
2018            << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
2019            << " flags " << ceph_osd_flag_string(m->get_flags())
2020            << dendl;
2021
2022   // missing object?
       // A non-primary answers -EAGAIN.  The primary either installs a backoff
       // and kicks recovery (when the client supports backoff and the relevant
       // osd_backoff_on_* option is set) or parks the op until the object is
       // readable.
2023   if (is_unreadable_object(head)) {
2024     if (!is_primary()) {
2025       osd->reply_op_error(op, -EAGAIN);
2026       return;
2027     }
2028     if (can_backoff &&
2029         (g_conf->osd_backoff_on_degraded ||
2030          (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
2031       add_backoff(session, head, head);
2032       maybe_kick_recovery(head);
2033     } else {
2034       wait_for_unreadable_object(head, op);
2035     }
2036     return;
2037   }
2038
2039   // degraded object?
2040   if (write_ordered && is_degraded_or_backfilling_object(head)) {
2041     if (can_backoff && g_conf->osd_backoff_on_degraded) {
2042       add_backoff(session, head, head);
2043     } else {
2044       wait_for_degraded_object(head, op);
2045     }
2046     return;
2047   }
2048
       // write-ordered ops wait while scrub reports the object as blocked
2049   if (write_ordered &&
2050       scrubber.write_blocked_by_scrub(head)) {
2051     dout(20) << __func__ << ": waiting for scrub" << dendl;
2052     waiting_for_scrub.push_back(op);
2053     op->mark_delayed("waiting for scrub");
2054     return;
2055   }
2056
2057   // blocked on snap?
       // (wait on the specific degraded snap recorded for this head)
2058   map<hobject_t, snapid_t>::iterator blocked_iter =
2059     objects_blocked_on_degraded_snap.find(head);
2060   if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2061     hobject_t to_wait_on(head);
2062     to_wait_on.snap = blocked_iter->second;
2063     wait_for_degraded_object(to_wait_on, op);
2064     return;
2065   }
2066   map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2067     objects_blocked_on_snap_promotion.find(head);
2068   if (write_ordered &&
2069       blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2070     wait_for_blocked_object(
2071       blocked_snap_promote_iter->second->obs.oi.soid,
2072       op);
2073     return;
2074   }
2075   if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2076     block_write_on_full_cache(head, op);
2077     return;
2078   }
2079
2080   // missing snapdir?
2081   hobject_t snapdir = head.get_snapdir();
2082
2083   if (is_unreadable_object(snapdir)) {
2084     wait_for_unreadable_object(snapdir, op);
2085     return;
2086   }
2087
2088   // degraded object?
2089   if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2090     wait_for_degraded_object(snapdir, op);
2091     return;
2092   }
2093
2094   // dup/resent?
2095   if (op->may_write() || op->may_cache()) {
2096     // warning: we will get back *a* request for this reqid, but not
2097     // necessarily the most recent.  this happens with flush and
2098     // promote ops, but we can't possibly have both in our log where
2099     // the original request is still not stable on disk, so for our
2100     // purposes here it doesn't matter which one we get.
2101     eversion_t version;
2102     version_t user_version;
2103     int return_code = 0;
2104     bool got = check_in_progress_op(
2105       m->get_reqid(), &version, &user_version, &return_code);
2106     if (got) {
2107       dout(3) << __func__ << " dup " << m->get_reqid()
2108               << " version " << version << dendl;
           // Already complete: answer right away with the recorded result.
           // Otherwise queue the op until that version is on disk.
2109       if (already_complete(version)) {
2110         osd->reply_op_error(op, return_code, version, user_version);
2111       } else {
2112         dout(10) << " waiting for " << version << " to commit" << dendl;
2113         // always queue ondisk waiters, so that we can requeue if needed
2114         waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2115         op->mark_delayed("waiting for ondisk");
2116       }
2117       return;
2118     }
2119   }
2120
       // Look up the object context for the exact object requested (which
       // may be a snap, unlike 'head').  can_create is set for ops that may
       // write or cache.
2121   ObjectContextRef obc;
2122   bool can_create = op->may_write() || op->may_cache();
2123   hobject_t missing_oid;
2124   const hobject_t& oid = m->get_hobj();
2125
2126   // io blocked on obc?
2127   if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2128       maybe_await_blocked_snapset(oid, op)) {
2129     return;
2130   }
2131
2132   int r = find_object_context(
2133     oid, &obc, can_create,
2134     m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2135     &missing_oid);
2136
2137   if (r == -EAGAIN) {
2138     // If we're not the primary for this PG, -EAGAIN is left in r for the
2139     // error handling further down. Otherwise we have to wait for the object.
2140     if (is_primary()) {
2141       // missing the specific snap we need; requeue and wait.
2142       assert(!op->may_write()); // only happens on a read/cache
2143       wait_for_unreadable_object(missing_oid, op);
2144       return;
2145     }
2146   } else if (r == 0) {
2147     if (is_unreadable_object(obc->obs.oi.soid)) {
2148       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2149                << " is unreadable, waiting" << dendl;
2150       wait_for_unreadable_object(obc->obs.oi.soid, op);
2151       return;
2152     }
2153
2154     // degraded object?  (the check above was for head; this could be a clone)
2155     if (write_ordered &&
2156         obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2157         is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2158       dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2159                << " is degraded, waiting" << dendl;
2160       wait_for_degraded_object(obc->obs.oi.soid, op);
2161       return;
2162     }
2163   }
2164
       // Hit-set bookkeeping.  The membership test runs before the insert
       // below.  op->hitset_inserted ensures the op is inserted at most once
       // (e.g. if do_op runs again for the same op).  The hit set is persisted
       // when it is full or its period has elapsed.
2165   bool in_hit_set = false;
2166   if (hit_set) {
2167     if (obc.get()) {
2168       if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2169         in_hit_set = true;
2170     } else {
2171       if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2172         in_hit_set = true;
2173     }
2174     if (!op->hitset_inserted) {
2175       hit_set->insert(oid);
2176       op->hitset_inserted = true;
2177       if (hit_set->is_full() ||
2178           hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2179         hit_set_persist();
2180       }
2181     }
2182   }
2183
       // stop here if agent_choose_mode() returns true for this op
2184   if (agent_state) {
2185     if (agent_choose_mode(false, op))
2186       return;
2187   }
2188
       // Existing objects that carry a manifest may be handled by the
       // manifest path instead of being executed here.
2189   if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2190     if (maybe_handle_manifest(op,
2191                                write_ordered,
2192                                obc))
2193     return;
2194   }
2195
       // Likewise the cache-tier path may take over the op.
2196   if (maybe_handle_cache(op,
2197                          write_ordered,
2198                          obc,
2199                          r,
2200                          missing_oid,
2201                          false,
2202                          in_hit_set))
2203     return;
2204
       // Any lookup error other than "-ENOENT with a valid obc" ends the op
       // here.  Write errors are recorded in the log on >= kraken clusters.
2205   if (r && (r != -ENOENT || !obc)) {
2206     // copy the reqids for copy get on ENOENT
2207     if (r == -ENOENT &&
2208         (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2209       fill_in_copy_get_noent(op, oid, m->ops[0]);
2210       return;
2211     }
2212     dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2213     if (op->may_write() &&
2214         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2215       record_write_error(op, oid, nullptr, r);
2216     } else {
2217       osd->reply_op_error(op, r);
2218     }
2219     return;
2220   }
2221
2222   // make sure locator is consistent
       // (a mismatch is only logged; the op still proceeds)
2223   object_locator_t oloc(obc->obs.oi.soid);
2224   if (m->get_object_locator() != oloc) {
2225     dout(10) << " provided locator " << m->get_object_locator()
2226              << " != object's " << obc->obs.oi.soid << dendl;
2227     osd->clog->warn() << "bad locator " << m->get_object_locator()
2228                      << " on object " << oloc
2229                       << " op " << *m;
2230   }
2231
2232   // io blocked on obc?
2233   if (obc->is_blocked() &&
2234       !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2235     wait_for_blocked_object(obc->obs.oi.soid, op);
2236     return;
2237   }
2238
2239   dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2240
2241   for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2242     OSDOp& osd_op = *p;
2243
2244     // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2245     if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2246         m->get_snapid() != CEPH_SNAPDIR) {
2247       dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2248       osd->reply_op_error(op, -EINVAL);
2249       return;
2250     }
2251   }
2252
       // The OpContext is allocated here.  Every early return below goes
       // through reply_ctx() or close_op_ctx(); otherwise ctx is handed to
       // execute_ctx().
2253   OpContext *ctx = new OpContext(op, m->get_reqid(), &m->ops, obc, this);
2254
2255   if (!obc->obs.exists)
2256     ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2257
2258   /* Due to obc caching, we might have a cached non-existent snapset_obc
2259    * for the snapdir.  If so, we can ignore it.  Subsequent parts of the
2260    * do_op pipeline make decisions based on whether snapset_obc is
2261    * populated.
2262    */
2263   if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2264     ctx->snapset_obc = ObjectContextRef();
2265
       // Locking: SKIPRWLOCKS bypasses the rw locks, FLUSH ops bypass them
       // only if a flush is actually registered for the object, and everything
       // else must acquire the rw locks (a failure ends this pass with the op
       // marked as delayed).
2266   if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2267     dout(20) << __func__ << ": skipping rw locks" << dendl;
2268   } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2269     dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2270
2271     // verify there is in fact a flush in progress
2272     // FIXME: we could make this a stronger test.
2273     map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2274     if (p == flush_ops.end()) {
2275       dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2276       reply_ctx(ctx, -EINVAL);
2277       return;
2278     }
2279   } else if (!get_rw_locks(write_ordered, ctx)) {
2280     dout(20) << __func__ << " waiting for rw locks " << dendl;
2281     op->mark_delayed("waiting for rw locks");
2282     close_op_ctx(ctx);
2283     return;
2284   }
2285   dout(20) << __func__ << " obc " << *obc << dendl;
2286
       // Given the error check before the locator test, a non-zero r at this
       // point can only be -ENOENT with a valid obc.
2287   if (r) {
2288     dout(20) << __func__ << " returned an error: " << r << dendl;
2289     close_op_ctx(ctx);
2290     if (op->may_write() &&
2291         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2292       record_write_error(op, oid, nullptr, r);
2293     } else {
2294       osd->reply_op_error(op, r);
2295     }
2296     return;
2297   }
2298
2299   if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2300     ctx->ignore_cache = true;
2301   }
2302
2303   if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2304     // This object is lost. Reading from it returns an error.
2305     dout(20) << __func__ << ": object " << obc->obs.oi.soid
2306              << " is lost" << dendl;
2307     reply_ctx(ctx, -ENFILE);
2308     return;
2309   }
       // Ops that neither write nor cache get -ENOENT when the object does
       // not exist, or when it is a whiteout and the request is not for
       // CEPH_SNAPDIR.
2310   if (!op->may_write() &&
2311       !op->may_cache() &&
2312       (!obc->obs.exists ||
2313        ((m->get_snapid() != CEPH_SNAPDIR) &&
2314         obc->obs.oi.is_whiteout()))) {
2315     // copy the reqids for copy get on ENOENT
2316     if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2317       fill_in_copy_get_noent(op, oid, m->ops[0]);
2318       close_op_ctx(ctx);
2319       return;
2320     }
2321     reply_ctx(ctx, -ENOENT);
2322     return;
2323   }
2324
2325   op->mark_started();
2326
2327   execute_ctx(ctx);
       // "Prepare latency" is the time from dequeue until execute_ctx() has
       // returned.  It is recorded in the overall counter and in one of the
       // rw / r / w counters according to the op's flags.
2328   utime_t prepare_latency = ceph_clock_now();
2329   prepare_latency -= op->get_dequeued_time();
2330   osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2331   if (op->may_read() && op->may_write()) {
2332     osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2333   } else if (op->may_read()) {
2334     osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2335   } else if (op->may_write() || op->may_cache()) {
2336     osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2337   }
2338
2339   // force recovery of the oldest missing object if too many logs
2340   maybe_force_recovery();
2341 }
2342 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2343   OpRequestRef op,
2344   bool write_ordered,
2345   ObjectContextRef obc)
2346 {
2347   if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2348       CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2349     dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2350     return cache_result_t::NOOP;
2351   }
2352
2353   if (obc)
2354     dout(10) << __func__ << " " << obc->obs.oi << " "
2355        << (obc->obs.exists ? "exists" : "DNE")
2356        << dendl;
2357
2358   // if it is write-ordered and blocked, stop now
2359   if (obc.get() && obc->is_blocked() && write_ordered) {
2360     // we're already doing something with this object
2361     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2362     return cache_result_t::NOOP;
2363   }
2364
2365   vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2366   for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2367     OSDOp& osd_op = *p;
2368     ceph_osd_op& op = osd_op.op;
2369     if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2370       return cache_result_t::NOOP;
2371     }
2372   }
2373
2374   switch (obc->obs.oi.manifest.type) {
2375   case object_manifest_t::TYPE_REDIRECT:
2376     if (op->may_write() || write_ordered) {
2377       do_proxy_write(op, obc->obs.oi.soid, obc);
2378     } else {
2379       do_proxy_read(op, obc);
2380     }
2381     return cache_result_t::HANDLED_PROXY;
2382   case object_manifest_t::TYPE_CHUNKED:
2383   default:
2384     assert(0 == "unrecognized manifest type");
2385   }
2386
2387   return cache_result_t::NOOP;
2388 }
2389
2390 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2391                                       MOSDOpReply *orig_reply, int r)
2392 {
2393   dout(20) << __func__ << " r=" << r << dendl;
2394   assert(op->may_write());
2395   const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2396   ObjectContextRef obc;
2397   mempool::osd_pglog::list<pg_log_entry_t> entries;
2398   entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2399                                    get_next_version(), eversion_t(), 0,
2400                                    reqid, utime_t(), r));
2401
2402   struct OnComplete {
2403     PrimaryLogPG *pg;
2404     OpRequestRef op;
2405     boost::intrusive_ptr<MOSDOpReply> orig_reply;
2406     int r;
2407     OnComplete(
2408       PrimaryLogPG *pg,
2409       OpRequestRef op,
2410       MOSDOpReply *orig_reply,
2411       int r)
2412       : pg(pg), op(op),
2413         orig_reply(orig_reply, false /* take over ref */), r(r)
2414       {}
2415     void operator()() {
2416       ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2417       const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2418       int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2419       MOSDOpReply *reply = orig_reply.detach();
2420       if (reply == nullptr) {
2421         reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2422                                 flags, true);
2423       }
2424       ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2425       pg->osd->send_message_osd_client(reply, m->get_connection());
2426     }
2427   };
2428
2429   ObcLockManager lock_manager;
2430   submit_log_entries(
2431     entries,
2432     std::move(lock_manager),
2433     boost::optional<std::function<void(void)> >(
2434       OnComplete(this, op, orig_reply, r)),
2435     op,
2436     r);
2437 }
2438
/**
 * maybe_handle_cache_detail - decide how the cache tier treats an op
 *
 * Depending on pool.info.cache_mode and on whether the object is present,
 * the op is left alone, proxied, redirected, blocked, or made to wait for a
 * promotion.
 *
 * Return values:
 *   NOOP             - no action was taken here (IGNORE_CACHE flag, caching
 *                      disabled, object already present, write-ordered op on
 *                      a blocked obc, logically absent object, op asked to
 *                      skip cache handling, or READONLY mode after a failure)
 *   HANDLED_PROXY    - the op was passed to do_proxy_read()/do_proxy_write()
 *   HANDLED_REDIRECT - the op was passed to do_cache_redirect()
 *   BLOCKED_FULL     - the op was passed to block_write_on_full_cache()
 *   BLOCKED_PROMOTE  - promote_object() was called for the object, or the
 *                      object is already blocked (writeback read path)
 *
 * @param op             the client request
 * @param write_ordered  whether the op is ordered as a write
 * @param obc            object context from the lookup; may be null
 * @param r              result of the object lookup
 * @param missing_oid    object reported missing by the lookup; when empty and
 *                       obc is set, obc's soid is used instead
 * @param must_promote   force promotion; OR-ed with op->need_promote()
 * @param in_hit_set     whether the object was found in the current hit set
 * @param promote_obc    optional out-parameter forwarded to promote_object()
 *                       and maybe_promote(); set to obc on the "avoid
 *                       duplicate promotion" path
 *
 * NOTE(review): op is null-checked in the first condition but dereferenced
 * unconditionally afterwards (op->need_promote()); callers are evidently
 * expected to pass a non-null op.
 */
2439 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2440   OpRequestRef op,
2441   bool write_ordered,
2442   ObjectContextRef obc,
2443   int r, hobject_t missing_oid,
2444   bool must_promote,
2445   bool in_hit_set,
2446   ObjectContextRef *promote_obc)
2447 {
2448   if (op &&
2449       op->get_req() &&
2450       op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2451       (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2452        CEPH_OSD_FLAG_IGNORE_CACHE)) {
2453     dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2454     return cache_result_t::NOOP;
2455   }
2456   // return quickly if caching is not enabled
2457   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2458     return cache_result_t::NOOP;
2459
2460   must_promote = must_promote || op->need_promote();
2461
2462   if (obc)
2463     dout(25) << __func__ << " " << obc->obs.oi << " "
2464              << (obc->obs.exists ? "exists" : "DNE")
2465              << " missing_oid " << missing_oid
2466              << " must_promote " << (int)must_promote
2467              << " in_hit_set " << (int)in_hit_set
2468              << dendl;
2469   else
2470     dout(25) << __func__ << " (no obc)"
2471              << " missing_oid " << missing_oid
2472              << " must_promote " << (int)must_promote
2473              << " in_hit_set " << (int)in_hit_set
2474              << dendl;
2475
2476   // if it is write-ordered and blocked, stop now
2477   if (obc.get() && obc->is_blocked() && write_ordered) {
2478     // we're already doing something with this object
2479     dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2480     return cache_result_t::NOOP;
2481   }
2482
2483   if (r == -ENOENT && missing_oid == hobject_t()) {
2484     // we know this object is logically absent (e.g., an undefined clone)
2485     return cache_result_t::NOOP;
2486   }
2487
       // the object is present in this pool: count a cache hit, nothing to do
2488   if (obc.get() && obc->obs.exists) {
2489     osd->logger->inc(l_osd_op_cache_hit);
2490     return cache_result_t::NOOP;
2491   }
2492
       // from here on the object is not present; make sure missing_oid names it
2493   if (missing_oid == hobject_t() && obc.get()) {
2494     missing_oid = obc->obs.oi.soid;
2495   }
2496
2497   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2498   const object_locator_t oloc = m->get_object_locator();
2499
2500   if (op->need_skip_handle_cache()) {
2501     return cache_result_t::NOOP;
2502   }
2503
2504   // older versions do not proxy the feature bits.
2505   bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2506     CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
       // promote_op is left empty; it is only passed to maybe_promote() on
       // the writeback read path below.
2507   OpRequestRef promote_op;
2508
2509   switch (pool.info.cache_mode) {
2510   case pg_pool_t::CACHEMODE_WRITEBACK:
         // Cache full: plain reads are proxied, everything else waits.
2511     if (agent_state &&
2512         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2513       if (!op->may_write() && !op->may_cache() &&
2514           !write_ordered && !must_promote) {
2515         dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2516         do_proxy_read(op);
2517         return cache_result_t::HANDLED_PROXY;
2518       }
2519       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2520       block_write_on_full_cache(missing_oid, op);
2521       return cache_result_t::BLOCKED_FULL;
2522     }
2523
         // Promote unconditionally when required, or when there is no hit
         // set to base a recency decision on (unless the op opts out).
2524     if (must_promote || (!hit_set && !op->need_skip_promote())) {
2525       promote_object(obc, missing_oid, oloc, op, promote_obc);
2526       return cache_result_t::BLOCKED_PROMOTE;
2527     }
2528
2529     if (op->may_write() || op->may_cache()) {
2530       if (can_proxy_write) {
2531         do_proxy_write(op, missing_oid);
2532       } else {
2533         // promote if can't proxy the write
2534         promote_object(obc, missing_oid, oloc, op, promote_obc);
2535         return cache_result_t::BLOCKED_PROMOTE;
2536       }
2537
2538       // Promote too?
           // (an empty OpRequestRef is passed here, not the op itself)
2539       if (!op->need_skip_promote() &&
2540           maybe_promote(obc, missing_oid, oloc, in_hit_set,
2541                       pool.info.min_write_recency_for_promote,
2542                       OpRequestRef(),
2543                       promote_obc)) {
2544         return cache_result_t::BLOCKED_PROMOTE;
2545       }
2546       return cache_result_t::HANDLED_PROXY;
2547     } else {
2548       do_proxy_read(op);
2549
2550       // Avoid duplicate promotion
2551       if (obc.get() && obc->is_blocked()) {
2552         if (promote_obc)
2553           *promote_obc = obc;
2554         return cache_result_t::BLOCKED_PROMOTE;
2555       }
2556
2557       // Promote too?
           // (the result is deliberately ignored: the read was proxied above)
2558       if (!op->need_skip_promote()) {
2559         (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2560                             pool.info.min_read_recency_for_promote,
2561                             promote_op, promote_obc);
2562       }
2563
2564       return cache_result_t::HANDLED_PROXY;
2565     }
         // NOTE(review): both branches above return, so the two statements
         // below are never executed.
2566     assert(0 == "unreachable");
2567     return cache_result_t::NOOP;
2568
2569   case pg_pool_t::CACHEMODE_FORWARD:
2570     // FIXME: this mode allows requests to be reordered.
2571     do_cache_redirect(op);
2572     return cache_result_t::HANDLED_REDIRECT;
2573
2574   case pg_pool_t::CACHEMODE_READONLY:
2575     // TODO: clean this case up
2576     if (!obc.get() && r == -ENOENT) {
2577       // we don't have the object and op's a read
2578       promote_object(obc, missing_oid, oloc, op, promote_obc);
2579       return cache_result_t::BLOCKED_PROMOTE;
2580     }
2581     if (!r) { // it must be a write
2582       do_cache_redirect(op);
2583       return cache_result_t::HANDLED_REDIRECT;
2584     }
2585     // crap, there was a failure of some kind
2586     return cache_result_t::NOOP;
2587
2588   case pg_pool_t::CACHEMODE_READFORWARD:
2589     // Do writeback to the cache tier for writes
2590     if (op->may_write() || write_ordered || must_promote) {
2591       if (agent_state &&
2592           agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2593         dout(20) << __func__ << " cache pool full, waiting" << dendl;
2594         block_write_on_full_cache(missing_oid, op);
2595         return cache_result_t::BLOCKED_FULL;
2596       }
2597       promote_object(obc, missing_oid, oloc, op, promote_obc);
2598       return cache_result_t::BLOCKED_PROMOTE;
2599     }
2600
2601     // It is a plain read: redirect the client instead of serving it here
2602     do_cache_redirect(op);
2603     return cache_result_t::HANDLED_REDIRECT;
2604
2605   case pg_pool_t::CACHEMODE_PROXY:
         // Proxy whenever possible; fall through to promotion when promotion
         // is required or a write cannot be proxied.
2606     if (!must_promote) {
2607       if (op->may_write() || op->may_cache() || write_ordered) {
2608         if (can_proxy_write) {
2609           do_proxy_write(op, missing_oid);
2610           return cache_result_t::HANDLED_PROXY;
2611         }
2612       } else {
2613         do_proxy_read(op);
2614         return cache_result_t::HANDLED_PROXY;
2615       }
2616     }
2617     // ugh, we're forced to promote.
2618     if (agent_state &&
2619         agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2620       dout(20) << __func__ << " cache pool full, waiting" << dendl;
2621       block_write_on_full_cache(missing_oid, op);
2622       return cache_result_t::BLOCKED_FULL;
2623     }
2624     promote_object(obc, missing_oid, oloc, op, promote_obc);
2625     return cache_result_t::BLOCKED_PROMOTE;
2626
2627   case pg_pool_t::CACHEMODE_READPROXY:
2628     // Do writeback to the cache tier for writes
2629     if (op->may_write() || write_ordered || must_promote) {
2630       if (agent_state &&
2631           agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2632         dout(20) << __func__ << " cache pool full, waiting" << dendl;
2633         block_write_on_full_cache(missing_oid, op);
2634         return cache_result_t::BLOCKED_FULL;
2635       }
2636       promote_object(obc, missing_oid, oloc, op, promote_obc);
2637       return cache_result_t::BLOCKED_PROMOTE;
2638     }
2639
2640     // It is a plain read: proxy it instead of serving it here
2641     do_proxy_read(op);
2642     return cache_result_t::HANDLED_PROXY;
2643
2644   default:
2645     assert(0 == "unrecognized cache_mode");
2646   }
2647   return cache_result_t::NOOP;
2648 }
2649
2650 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2651                                  const hobject_t& missing_oid,
2652                                  const object_locator_t& oloc,
2653                                  bool in_hit_set,
2654                                  uint32_t recency,
2655                                  OpRequestRef promote_op,
2656                                  ObjectContextRef *promote_obc)
2657 {
2658   dout(20) << __func__ << " missing_oid " << missing_oid
2659            << "  in_hit_set " << in_hit_set << dendl;
2660
2661   switch (recency) {
2662   case 0:
2663     break;
2664   case 1:
2665     // Check if in the current hit set
2666     if (in_hit_set) {
2667       break;
2668     } else {
2669       // not promoting
2670       return false;
2671     }
2672     break;
2673   default:
2674     {
2675       unsigned count = (int)in_hit_set;
2676       if (count) {
2677         // Check if in other hit sets
2678         const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2679         for (map<time_t,HitSetRef>::reverse_iterator itor =
2680                agent_state->hit_set_map.rbegin();
2681              itor != agent_state->hit_set_map.rend();
2682              ++itor) {
2683           if (!itor->second->contains(oid)) {
2684             break;
2685           }
2686           ++count;
2687           if (count >= recency) {
2688             break;
2689           }
2690         }
2691       }
2692       if (count >= recency) {
2693         break;
2694       }
2695       return false;     // not promoting
2696     }
2697     break;
2698   }
2699
2700   if (osd->promote_throttle()) {
2701     dout(10) << __func__ << " promote throttled" << dendl;
2702     return false;
2703   }
2704   promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2705   return true;
2706 }
2707
2708 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2709 {
2710   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2711   int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2712   MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2713                                        get_osdmap()->get_epoch(), flags, false);
2714   request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2715   reply->set_redirect(redir);
2716   dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2717            << op << dendl;
2718   m->get_connection()->send_message(reply);
2719   return;
2720 }
2721
2722 struct C_ProxyRead : public Context {
2723   PrimaryLogPGRef pg;
2724   hobject_t oid;
2725   epoch_t last_peering_reset;
2726   ceph_tid_t tid;
2727   PrimaryLogPG::ProxyReadOpRef prdop;
2728   utime_t start;
2729   C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2730              const PrimaryLogPG::ProxyReadOpRef& prd)
2731     : pg(p), oid(o), last_peering_reset(lpr),
2732       tid(0), prdop(prd), start(ceph_clock_now())
2733   {}
2734   void finish(int r) override {
2735     if (prdop->canceled)
2736       return;
2737     pg->lock();
2738     if (prdop->canceled) {
2739       pg->unlock();
2740       return;
2741     }
2742     if (last_peering_reset == pg->get_last_peering_reset()) {
2743       pg->finish_proxy_read(oid, tid, r);
2744       pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2745     }
2746     pg->unlock();
2747   }
2748 };
2749
2750 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2751 {
2752   // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2753   // stash the result in the request's OSDOp vector
2754   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2755   object_locator_t oloc;
2756   hobject_t soid;
2757   /* extensible tier */
2758   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2759     switch (obc->obs.oi.manifest.type) {
2760       case object_manifest_t::TYPE_REDIRECT:
2761           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2762           soid = obc->obs.oi.manifest.redirect_target;
2763           break;
2764       case object_manifest_t::TYPE_CHUNKED:
2765       default:
2766         assert(0 == "unrecognized manifest type");
2767     }
2768   } else {
2769   /* proxy */
2770     soid = m->get_hobj();
2771     oloc = object_locator_t(m->get_object_locator());
2772     oloc.pool = pool.info.tier_of;
2773   }
2774   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2775
2776   // pass through some original flags that make sense.
2777   //  - leave out redirection and balancing flags since we are
2778   //    already proxying through the primary
2779   //  - leave off read/write/exec flags that are derived from the op
2780   flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2781                              CEPH_OSD_FLAG_ORDERSNAP |
2782                              CEPH_OSD_FLAG_ENFORCE_SNAPC |
2783                              CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2784
2785   dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2786
2787   ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2788
2789   ObjectOperation obj_op;
2790   obj_op.dup(prdop->ops);
2791
2792   if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2793       (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2794     for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2795       ceph_osd_op op = obj_op.ops[i].op;
2796       switch (op.op) {
2797         case CEPH_OSD_OP_READ:
2798         case CEPH_OSD_OP_SYNC_READ:
2799         case CEPH_OSD_OP_SPARSE_READ:
2800         case CEPH_OSD_OP_CHECKSUM:
2801         case CEPH_OSD_OP_CMPEXT:
2802           op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2803                        ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2804       }
2805     }
2806   }
2807
2808   C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2809                                      prdop);
2810   ceph_tid_t tid = osd->objecter->read(
2811     soid.oid, oloc, obj_op,
2812     m->get_snapid(), NULL,
2813     flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2814     &prdop->user_version,
2815     &prdop->data_offset,
2816     m->get_features());
2817   fin->tid = tid;
2818   prdop->objecter_tid = tid;
2819   proxyread_ops[tid] = prdop;
2820   in_progress_proxy_ops[soid].push_back(op);
2821 }
2822
2823 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2824 {
2825   dout(10) << __func__ << " " << oid << " tid " << tid
2826            << " " << cpp_strerror(r) << dendl;
2827
2828   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2829   if (p == proxyread_ops.end()) {
2830     dout(10) << __func__ << " no proxyread_op found" << dendl;
2831     return;
2832   }
2833   ProxyReadOpRef prdop = p->second;
2834   if (tid != prdop->objecter_tid) {
2835     dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2836              << " tid " << prdop->objecter_tid << dendl;
2837     return;
2838   }
2839   if (oid != prdop->soid) {
2840     dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2841              << " soid " << prdop->soid << dendl;
2842     return;
2843   }
2844   proxyread_ops.erase(tid);
2845
2846   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2847   if (q == in_progress_proxy_ops.end()) {
2848     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2849     return;
2850   }
2851   assert(q->second.size());
2852   list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2853                                               q->second.end(),
2854                                               prdop->op);
2855   assert(it != q->second.end());
2856   OpRequestRef op = *it;
2857   q->second.erase(it);
2858   if (q->second.size() == 0) {
2859     in_progress_proxy_ops.erase(oid);
2860   }
2861
2862   osd->logger->inc(l_osd_tier_proxy_read);
2863
2864   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2865   OpContext *ctx = new OpContext(op, m->get_reqid(), &prdop->ops, this);
2866   ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2867   ctx->user_at_version = prdop->user_version;
2868   ctx->data_off = prdop->data_offset;
2869   ctx->ignore_log_op_stats = true;
2870   complete_read_ctx(r, ctx);
2871 }
2872
2873 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2874 {
2875   map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2876   if (p == in_progress_proxy_ops.end())
2877     return;
2878
2879   list<OpRequestRef>& ls = p->second;
2880   dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2881   requeue_ops(ls);
2882   in_progress_proxy_ops.erase(p);
2883 }
2884
2885 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2886 {
2887   dout(10) << __func__ << " " << prdop->soid << dendl;
2888   prdop->canceled = true;
2889
2890   // cancel objecter op, if we can
2891   if (prdop->objecter_tid) {
2892     osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2893     for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2894       prdop->ops[i].outdata.clear();
2895     }
2896     proxyread_ops.erase(prdop->objecter_tid);
2897     prdop->objecter_tid = 0;
2898   }
2899 }
2900
2901 void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2902 {
2903   dout(10) << __func__ << dendl;
2904
2905   // cancel proxy reads
2906   map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2907   while (p != proxyread_ops.end()) {
2908     cancel_proxy_read((p++)->second);
2909   }
2910
2911   // cancel proxy writes
2912   map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2913   while (q != proxywrite_ops.end()) {
2914     cancel_proxy_write((q++)->second);
2915   }
2916
2917   if (requeue) {
2918     map<hobject_t, list<OpRequestRef>>::iterator p =
2919       in_progress_proxy_ops.begin();
2920     while (p != in_progress_proxy_ops.end()) {
2921       list<OpRequestRef>& ls = p->second;
2922       dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2923                << " requests" << dendl;
2924       requeue_ops(ls);
2925       in_progress_proxy_ops.erase(p++);
2926     }
2927   } else {
2928     in_progress_proxy_ops.clear();
2929   }
2930 }
2931
2932 struct C_ProxyWrite_Commit : public Context {
2933   PrimaryLogPGRef pg;
2934   hobject_t oid;
2935   epoch_t last_peering_reset;
2936   ceph_tid_t tid;
2937   PrimaryLogPG::ProxyWriteOpRef pwop;
2938   C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2939                       const PrimaryLogPG::ProxyWriteOpRef& pw)
2940     : pg(p), oid(o), last_peering_reset(lpr),
2941       tid(0), pwop(pw)
2942   {}
2943   void finish(int r) override {
2944     if (pwop->canceled)
2945       return;
2946     pg->lock();
2947     if (pwop->canceled) {
2948       pg->unlock();
2949       return;
2950     }
2951     if (last_peering_reset == pg->get_last_peering_reset()) {
2952       pg->finish_proxy_write(oid, tid, r);
2953     }
2954     pg->unlock();
2955   }
2956 };
2957
2958 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2959 {
2960   // NOTE: non-const because ProxyWriteOp takes a mutable ref
2961   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2962   object_locator_t oloc;
2963   SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2964   hobject_t soid;
2965   /* extensible tier */
2966   if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2967     switch (obc->obs.oi.manifest.type) {
2968       case object_manifest_t::TYPE_REDIRECT:
2969           oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2970           soid = obc->obs.oi.manifest.redirect_target;
2971           break;
2972       case object_manifest_t::TYPE_CHUNKED:
2973       default:
2974         assert(0 == "unrecognized manifest type");
2975     }
2976   } else {
2977   /* proxy */
2978     soid = m->get_hobj();
2979     oloc = object_locator_t(m->get_object_locator());
2980     oloc.pool = pool.info.tier_of;
2981   }
2982
2983   unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2984   if (!(op->may_write() || op->may_cache())) {
2985     flags |= CEPH_OSD_FLAG_RWORDERED;
2986   }
2987   dout(10) << __func__ << " Start proxy write for " << *m << dendl;
2988
2989   ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
2990   pwop->ctx = new OpContext(op, m->get_reqid(), &pwop->ops, this);
2991   pwop->mtime = m->get_mtime();
2992
2993   ObjectOperation obj_op;
2994   obj_op.dup(pwop->ops);
2995
2996   C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
2997       this, soid, get_last_peering_reset(), pwop);
2998   ceph_tid_t tid = osd->objecter->mutate(
2999     soid.oid, oloc, obj_op, snapc,
3000     ceph::real_clock::from_ceph_timespec(pwop->mtime),
3001     flags, new C_OnFinisher(fin, &osd->objecter_finisher),
3002     &pwop->user_version, pwop->reqid);
3003   fin->tid = tid;
3004   pwop->objecter_tid = tid;
3005   proxywrite_ops[tid] = pwop;
3006   in_progress_proxy_ops[soid].push_back(op);
3007 }
3008
3009 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
3010 {
3011   dout(10) << __func__ << " " << oid << " tid " << tid
3012            << " " << cpp_strerror(r) << dendl;
3013
3014   map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
3015   if (p == proxywrite_ops.end()) {
3016     dout(10) << __func__ << " no proxywrite_op found" << dendl;
3017     return;
3018   }
3019   ProxyWriteOpRef pwop = p->second;
3020   assert(tid == pwop->objecter_tid);
3021   assert(oid == pwop->soid);
3022
3023   proxywrite_ops.erase(tid);
3024
3025   map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
3026   if (q == in_progress_proxy_ops.end()) {
3027     dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
3028     delete pwop->ctx;
3029     pwop->ctx = NULL;
3030     return;
3031   }
3032   list<OpRequestRef>& in_progress_op = q->second;
3033   assert(in_progress_op.size());
3034   list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
3035                                               in_progress_op.end(),
3036                                               pwop->op);
3037   assert(it != in_progress_op.end());
3038   in_progress_op.erase(it);
3039   if (in_progress_op.size() == 0) {
3040     in_progress_proxy_ops.erase(oid);
3041   }
3042
3043   osd->logger->inc(l_osd_tier_proxy_write);
3044
3045   const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3046   assert(m != NULL);
3047
3048   if (!pwop->sent_reply) {
3049     // send commit.
3050     MOSDOpReply *reply = pwop->ctx->reply;
3051     if (reply)
3052       pwop->ctx->reply = NULL;
3053     else {
3054       reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3055       reply->set_reply_versions(eversion_t(), pwop->user_version);
3056     }
3057     reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3058     dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3059     osd->send_message_osd_client(reply, m->get_connection());
3060     pwop->sent_reply = true;
3061     pwop->ctx->op->mark_commit_sent();
3062   }
3063
3064   delete pwop->ctx;
3065   pwop->ctx = NULL;
3066 }
3067
3068 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3069 {
3070   dout(10) << __func__ << " " << pwop->soid << dendl;
3071   pwop->canceled = true;
3072
3073   // cancel objecter op, if we can
3074   if (pwop->objecter_tid) {
3075     osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3076     delete pwop->ctx;
3077     pwop->ctx = NULL;
3078     proxywrite_ops.erase(pwop->objecter_tid);
3079     pwop->objecter_tid = 0;
3080   }
3081 }
3082
3083 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3084   ObjectContextRef obc;
3085   PrimaryLogPG *pg;
3086   utime_t start;
3087 public:
3088   PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3089     : obc(obc_),
3090       pg(pg_),
3091       start(ceph_clock_now()) {}
3092
3093   void finish(PrimaryLogPG::CopyCallbackResults results) override {
3094     PrimaryLogPG::CopyResults *results_data = results.get<1>();
3095     int r = results.get<0>();
3096     pg->finish_promote(r, results_data, obc);
3097     pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3098   }
3099 };
3100
3101 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3102                                   const hobject_t& missing_oid,
3103                                   const object_locator_t& oloc,
3104                                   OpRequestRef op,
3105                                   ObjectContextRef *promote_obc)
3106 {
3107   hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3108   assert(hoid != hobject_t());
3109   if (scrubber.write_blocked_by_scrub(hoid)) {
3110     dout(10) << __func__ << " " << hoid
3111              << " blocked by scrub" << dendl;
3112     if (op) {
3113       waiting_for_scrub.push_back(op);
3114       op->mark_delayed("waiting for scrub");
3115       dout(10) << __func__ << " " << hoid
3116                << " placing op in waiting_for_scrub" << dendl;
3117     } else {
3118       dout(10) << __func__ << " " << hoid
3119                << " no op, dropping on the floor" << dendl;
3120     }
3121     return;
3122   }
3123   if (!obc) { // we need to create an ObjectContext
3124     assert(missing_oid != hobject_t());
3125     obc = get_object_context(missing_oid, true);
3126   }
3127   if (promote_obc)
3128     *promote_obc = obc;
3129
3130   /*
3131    * Before promote complete, if there are  proxy-reads for the object,
3132    * for this case we don't use DONTNEED.
3133    */
3134   unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3135   map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3136   if (q == in_progress_proxy_ops.end()) {
3137     src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3138   }
3139
3140   PromoteCallback *cb = new PromoteCallback(obc, this);
3141   object_locator_t my_oloc = oloc;
3142   my_oloc.pool = pool.info.tier_of;
3143
3144   unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3145                    CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3146                    CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3147                    CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3148   start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3149              obc->obs.oi.soid.snap == CEPH_NOSNAP,
3150              src_fadvise_flags, 0);
3151
3152   assert(obc->is_blocked());
3153
3154   if (op)
3155     wait_for_blocked_object(obc->obs.oi.soid, op);
3156   info.stats.stats.sum.num_promote++;
3157 }
3158
3159 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3160 {
3161   FUNCTRACE();
3162   dout(10) << __func__ << " " << ctx << dendl;
3163   ctx->reset_obs(ctx->obc);
3164   ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3165   OpRequestRef op = ctx->op;
3166   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3167   ObjectContextRef obc = ctx->obc;
3168   const hobject_t& soid = obc->obs.oi.soid;
3169
3170   // this method must be idempotent since we may call it several times
3171   // before we finally apply the resulting transaction.
3172   ctx->op_t.reset(new PGTransaction);
3173
3174   if (op->may_write() || op->may_cache()) {
3175     // snap
3176     if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3177         pool.info.is_pool_snaps_mode()) {
3178       // use pool's snapc
3179       ctx->snapc = pool.snapc;
3180     } else {
3181       // client specified snapc
3182       ctx->snapc.seq = m->get_snap_seq();
3183       ctx->snapc.snaps = m->get_snaps();
3184       filter_snapc(ctx->snapc.snaps);
3185     }
3186     if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3187         ctx->snapc.seq < obc->ssc->snapset.seq) {
3188       dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3189                << " < snapset seq " << obc->ssc->snapset.seq
3190                << " on " << obc->obs.oi.soid << dendl;
3191       reply_ctx(ctx, -EOLDSNAPC);
3192       return;
3193     }
3194
3195     // version
3196     ctx->at_version = get_next_version();
3197     ctx->mtime = m->get_mtime();
3198
3199     dout(10) << __func__ << " " << soid << " " << *ctx->ops
3200              << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3201              << " snapc " << ctx->snapc
3202              << " snapset " << obc->ssc->snapset
3203              << dendl;
3204   } else {
3205     dout(10) << __func__ << " " << soid << " " << *ctx->ops
3206              << " ov " << obc->obs.oi.version
3207              << dendl;
3208   }
3209
3210   if (!ctx->user_at_version)
3211     ctx->user_at_version = obc->obs.oi.user_version;
3212   dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3213
3214   if (op->may_read()) {
3215     dout(10) << " taking ondisk_read_lock" << dendl;
3216     obc->ondisk_read_lock();
3217   }
3218
3219   {
3220 #ifdef WITH_LTTNG
3221     osd_reqid_t reqid = ctx->op->get_reqid();
3222 #endif
3223     tracepoint(osd, prepare_tx_enter, reqid.name._type,
3224         reqid.name._num, reqid.tid, reqid.inc);
3225   }
3226
3227   int result = prepare_transaction(ctx);
3228
3229   {
3230 #ifdef WITH_LTTNG
3231     osd_reqid_t reqid = ctx->op->get_reqid();
3232 #endif
3233     tracepoint(osd, prepare_tx_exit, reqid.name._type,
3234         reqid.name._num, reqid.tid, reqid.inc);
3235   }
3236
3237   if (op->may_read()) {
3238     dout(10) << " dropping ondisk_read_lock" << dendl;
3239     obc->ondisk_read_unlock();
3240   }
3241
3242   bool pending_async_reads = !ctx->pending_async_reads.empty();
3243   if (result == -EINPROGRESS || pending_async_reads) {
3244     // come back later.
3245     if (pending_async_reads) {
3246       in_progress_async_reads.push_back(make_pair(op, ctx));
3247       ctx->start_async_reads(this);
3248     }
3249     return;
3250   }
3251
3252   if (result == -EAGAIN) {
3253     // clean up after the ctx
3254     close_op_ctx(ctx);
3255     return;
3256   }
3257
3258   bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3259   // prepare the reply
3260   ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3261                                successful_write);
3262
3263   // Write operations aren't allowed to return a data payload because
3264   // we can't do so reliably. If the client has to resend the request
3265   // and it has already been applied, we will return 0 with no
3266   // payload.  Non-deterministic behavior is no good.  However, it is
3267   // possible to construct an operation that does a read, does a guard
3268   // check (e.g., CMPXATTR), and then a write.  Then we either succeed
3269   // with the write, or return a CMPXATTR and the read value.
3270   if (successful_write) {
3271     // write.  normalize the result code.
3272     dout(20) << " zeroing write result code " << result << dendl;
3273     result = 0;
3274   }
3275   ctx->reply->set_result(result);
3276
3277   // read or error?
3278   if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3279     // finish side-effects
3280     if (result >= 0)
3281       do_osd_op_effects(ctx, m->get_connection());
3282
3283     complete_read_ctx(result, ctx);
3284     return;
3285   }
3286
3287   ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3288
3289   assert(op->may_write() || op->may_cache());
3290
3291   // trim log?
3292   calc_trim_to();
3293
3294   // verify that we are doing this in order?
3295   if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3296       !pool.info.is_tier() && !pool.info.has_tiers()) {
3297     map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3298     ceph_tid_t t = m->get_tid();
3299     client_t n = m->get_source().num();
3300     map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3301     if (p == cm.end()) {
3302       dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3303       cm[n] = t;
3304     } else {
3305       dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3306       if (p->second > t) {
3307         derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3308         assert(0 == "out of order op");
3309       }
3310       p->second = t;
3311     }
3312   }
3313
3314   if (ctx->update_log_only) {
3315     if (result >= 0)
3316       do_osd_op_effects(ctx, m->get_connection());
3317
3318     dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3319     // save just what we need from ctx
3320     MOSDOpReply *reply = ctx->reply;
3321     ctx->reply = nullptr;
3322     reply->claim_op_out_data(*ctx->ops);
3323     reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
3324     close_op_ctx(ctx);
3325
3326     if (result == -ENOENT) {
3327       reply->set_enoent_reply_versions(info.last_update,
3328                                        info.last_user_version);
3329     }
3330     reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3331     // append to pg log for dup detection - don't save buffers for now
3332     record_write_error(op, soid, reply, result);
3333     return;
3334   }
3335
3336   // no need to capture PG ref, repop cancel will handle that
3337   // Can capture the ctx by pointer, it's owned by the repop
3338   ctx->register_on_commit(
3339     [m, ctx, this](){
3340       if (ctx->op)
3341         log_op_stats(
3342           ctx);
3343
3344       if (m && !ctx->sent_reply) {
3345         MOSDOpReply *reply = ctx->reply;
3346         if (reply)
3347           ctx->reply = nullptr;
3348         else {
3349           reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3350           reply->set_reply_versions(ctx->at_version,
3351                                     ctx->user_at_version);
3352         }
3353         reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3354         dout(10) << " sending reply on " << *m << " " << reply << dendl;
3355         osd->send_message_osd_client(reply, m->get_connection());
3356         ctx->sent_reply = true;
3357         ctx->op->mark_commit_sent();
3358       }
3359     });
3360   ctx->register_on_success(
3361     [ctx, this]() {
3362       do_osd_op_effects(
3363         ctx,
3364         ctx->op ? ctx->op->get_req()->get_connection() :
3365         ConnectionRef());
3366     });
3367   ctx->register_on_finish(
3368     [ctx, this]() {
3369       delete ctx;
3370     });
3371
3372   // issue replica writes
3373   ceph_tid_t rep_tid = osd->get_tid();
3374
3375   RepGather *repop = new_repop(ctx, obc, rep_tid);
3376
3377   issue_repop(repop, ctx);
3378   eval_repop(repop);
3379   repop->put();
3380 }
3381
3382 void PrimaryLogPG::close_op_ctx(OpContext *ctx) {
3383   release_object_locks(ctx->lock_manager);
3384
3385   ctx->op_t.reset();
3386
3387   for (auto p = ctx->on_finish.begin(); p != ctx->on_finish.end();
3388        ctx->on_finish.erase(p++)) {
3389     (*p)();
3390   }
3391   delete ctx;
3392 }
3393
3394 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3395 {
3396   if (ctx->op)
3397     osd->reply_op_error(ctx->op, r);
3398   close_op_ctx(ctx);
3399 }
3400
3401 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3402 {
3403   if (ctx->op)
3404     osd->reply_op_error(ctx->op, r, v, uv);
3405   close_op_ctx(ctx);
3406 }
3407
3408 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3409 {
3410   OpRequestRef op = ctx->op;
3411   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3412
3413   utime_t now = ceph_clock_now();
3414   utime_t latency = now;
3415   latency -= ctx->op->get_req()->get_recv_stamp();
3416   utime_t process_latency = now;
3417   process_latency -= ctx->op->get_dequeued_time();
3418
3419   uint64_t inb = ctx->bytes_written;
3420   uint64_t outb = ctx->bytes_read;
3421
3422   osd->logger->inc(l_osd_op);
3423
3424   osd->logger->inc(l_osd_op_outb, outb);
3425   osd->logger->inc(l_osd_op_inb, inb);
3426   osd->logger->tinc(l_osd_op_lat, latency);
3427   osd->logger->tinc(l_osd_op_process_lat, process_latency);
3428
3429   if (op->may_read() && op->may_write()) {
3430     osd->logger->inc(l_osd_op_rw);
3431     osd->logger->inc(l_osd_op_rw_inb, inb);
3432     osd->logger->inc(l_osd_op_rw_outb, outb);
3433     osd->logger->tinc(l_osd_op_rw_lat, latency);
3434     osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3435     osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3436     osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3437   } else if (op->may_read()) {
3438     osd->logger->inc(l_osd_op_r);
3439     osd->logger->inc(l_osd_op_r_outb, outb);
3440     osd->logger->tinc(l_osd_op_r_lat, latency);
3441     osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3442     osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3443   } else if (op->may_write() || op->may_cache()) {
3444     osd->logger->inc(l_osd_op_w);
3445     osd->logger->inc(l_osd_op_w_inb, inb);
3446     osd->logger->tinc(l_osd_op_w_lat, latency);
3447     osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3448     osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3449   } else
3450     ceph_abort();
3451
3452   dout(15) << "log_op_stats " << *m
3453            << " inb " << inb
3454            << " outb " << outb
3455            << " lat " << latency << dendl;
3456 }
3457
3458 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3459 {
3460   const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3461   assert(have_same_or_newer_map(m->map_epoch));
3462   assert(m->get_type() == MSG_OSD_SUBOP);
3463   dout(15) << "do_sub_op " << *op->get_req() << dendl;
3464
3465   if (!is_peered()) {
3466     waiting_for_peered.push_back(op);
3467     op->mark_delayed("waiting for active");
3468     return;
3469   }
3470
3471   const OSDOp *first = NULL;
3472   if (m->ops.size() >= 1) {
3473     first = &m->ops[0];
3474   }
3475
3476   if (first) {
3477     switch (first->op.op) {
3478     case CEPH_OSD_OP_DELETE:
3479       sub_op_remove(op);
3480       return;
3481     case CEPH_OSD_OP_SCRUB_RESERVE:
3482       handle_scrub_reserve_request(op);
3483       return;
3484     case CEPH_OSD_OP_SCRUB_UNRESERVE:
3485       handle_scrub_reserve_release(op);
3486       return;
3487     case CEPH_OSD_OP_SCRUB_MAP:
3488       sub_op_scrub_map(op);
3489       return;
3490     }
3491   }
3492 }
3493
3494 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3495 {
3496   const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3497   assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3498   if (r->ops.size() >= 1) {
3499     const OSDOp& first = r->ops[0];
3500     switch (first.op.op) {
3501     case CEPH_OSD_OP_SCRUB_RESERVE:
3502       {
3503         pg_shard_t from = r->from;
3504         bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3505         bool reserved;
3506         ::decode(reserved, p);
3507         if (reserved) {
3508           handle_scrub_reserve_grant(op, from);
3509         } else {
3510           handle_scrub_reserve_reject(op, from);
3511         }
3512       }
3513       return;
3514     }
3515   }
3516 }
3517
3518 void PrimaryLogPG::do_scan(
3519   OpRequestRef op,
3520   ThreadPool::TPHandle &handle)
3521 {
3522   const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3523   assert(m->get_type() == MSG_OSD_PG_SCAN);
3524   dout(10) << "do_scan " << *m << dendl;
3525
3526   op->mark_started();
3527
3528   switch (m->op) {
3529   case MOSDPGScan::OP_SCAN_GET_DIGEST:
3530     {
3531       ostringstream ss;
3532       if (osd->check_backfill_full(ss)) {
3533         dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3534         queue_peering_event(
3535           CephPeeringEvtRef(
3536             std::make_shared<CephPeeringEvt>(
3537               get_osdmap()->get_epoch(),
3538               get_osdmap()->get_epoch(),
3539               BackfillTooFull())));
3540         return;
3541       }
3542
3543       BackfillInterval bi;
3544       bi.begin = m->begin;
3545       // No need to flush, there won't be any in progress writes occuring
3546       // past m->begin
3547       scan_range(
3548         cct->_conf->osd_backfill_scan_min,
3549         cct->_conf->osd_backfill_scan_max,
3550         &bi,
3551         handle);
3552       MOSDPGScan *reply = new MOSDPGScan(
3553         MOSDPGScan::OP_SCAN_DIGEST,
3554         pg_whoami,
3555         get_osdmap()->get_epoch(), m->query_epoch,
3556         spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3557       ::encode(bi.objects, reply->get_data());
3558       osd->send_message_osd_cluster(reply, m->get_connection());
3559     }
3560     break;
3561
3562   case MOSDPGScan::OP_SCAN_DIGEST:
3563     {
3564       pg_shard_t from = m->from;
3565
3566       // Check that from is in backfill_targets vector
3567       assert(is_backfill_targets(from));
3568
3569       BackfillInterval& bi = peer_backfill_info[from];
3570       bi.begin = m->begin;
3571       bi.end = m->end;
3572       bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3573
3574       // take care to preserve ordering!
3575       bi.clear_objects();
3576       ::decode_noclear(bi.objects, p);
3577
3578       if (waiting_on_backfill.erase(from)) {
3579         if (waiting_on_backfill.empty()) {
3580           assert(peer_backfill_info.size() == backfill_targets.size());
3581           finish_recovery_op(hobject_t::get_max());
3582         }
3583       } else {
3584         // we canceled backfill for a while due to a too full, and this
3585         // is an extra response from a non-too-full peer
3586       }
3587     }
3588     break;
3589   }
3590 }
3591
3592 void PrimaryLogPG::do_backfill(OpRequestRef op)
3593 {
3594   const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3595   assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3596   dout(10) << "do_backfill " << *m << dendl;
3597
3598   op->mark_started();
3599
3600   switch (m->op) {
3601   case MOSDPGBackfill::OP_BACKFILL_FINISH:
3602     {
3603       assert(cct->_conf->osd_kill_backfill_at != 1);
3604
3605       MOSDPGBackfill *reply = new MOSDPGBackfill(
3606         MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3607         get_osdmap()->get_epoch(),
3608         m->query_epoch,
3609         spg_t(info.pgid.pgid, get_primary().shard));
3610       reply->set_priority(get_recovery_op_priority());
3611       osd->send_message_osd_cluster(reply, m->get_connection());
3612       queue_peering_event(
3613         CephPeeringEvtRef(
3614           std::make_shared<CephPeeringEvt>(
3615             get_osdmap()->get_epoch(),
3616             get_osdmap()->get_epoch(),
3617             RecoveryDone())));
3618     }
3619     // fall-thru
3620
3621   case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3622     {
3623       assert(cct->_conf->osd_kill_backfill_at != 2);
3624
3625       info.set_last_backfill(m->last_backfill);
3626       info.stats = m->stats;
3627
3628       ObjectStore::Transaction t;
3629       dirty_info = true;
3630       write_if_dirty(t);
3631       int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3632       assert(tr == 0);
3633     }
3634     break;
3635
3636   case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3637     {
3638       assert(is_primary());
3639       assert(cct->_conf->osd_kill_backfill_at != 3);
3640       finish_recovery_op(hobject_t::get_max());
3641     }
3642     break;
3643   }
3644 }
3645
3646 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3647 {
3648   const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3649     op->get_req());
3650   assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3651   dout(7) << __func__ << " " << m->ls << dendl;
3652
3653   op->mark_started();
3654
3655   ObjectStore::Transaction t;
3656   for (auto& p : m->ls) {
3657     remove_snap_mapped_object(t, p.first);
3658   }
3659   int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3660   assert(r == 0);
3661 }
3662
// Prepare the op context that trims removed snaps from the clone `coid`.
//
// The clone's current snaps are filtered through
// pool.info.is_removed_snap().  If no snap survives, the clone is deleted and
// dropped from the snapset; otherwise only its snap list is rewritten.  The
// object carrying the snapset (head or snapdir, `snapoid`) is then either
// removed or rewritten with the updated snapset.  All changes are recorded in
// the returned context (log entries, delta_stats, op_t); nothing is submitted
// here.
//
// @param first  forwarded to lock_manager.get_snaptrimmer_write()
// @param coid   clone object to trim
// @param ctxp   out: set to the prepared context on success, null otherwise
// @return 0 on success; -ENOENT if required metadata is missing or
//         inconsistent (reported through osd->clog); -ENOLCK if the write
//         lock on the clone or on the snapset object could not be taken
3663 int PrimaryLogPG::trim_object(
3664   bool first, const hobject_t &coid, PrimaryLogPG::OpContextUPtr *ctxp)
3665 {
3666   *ctxp = NULL;
3667   // load clone info
3668   bufferlist bl;
3669   ObjectContextRef obc = get_object_context(coid, false, NULL);
3670   if (!obc || !obc->ssc || !obc->ssc->exists) {
3671     osd->clog->error() << __func__ << ": Can not trim " << coid
3672       << " repair needed " << (obc ? "(no obc->ssc or !exists)" : "(no obc)");
3673     return -ENOENT;
3674   }
3675
       // the snapset is stored on the head if it exists, else on the snapdir
3676   hobject_t snapoid(
3677     coid.oid, coid.get_key(),
3678     obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3679     info.pgid.pool(), coid.get_namespace());
3680   ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3681   if (!snapset_obc) {
3682     osd->clog->error() << __func__ << ": Can not trim " << coid
3683       << " repair needed, no snapset obc for " << snapoid;
3684     return -ENOENT;
3685   }
3686
3687   SnapSet& snapset = obc->ssc->snapset;
3688
       // legacy layout: the clone's snaps live in its object_info
       // (legacy_snaps); otherwise they live in snapset.clone_snaps
3689   bool legacy = snapset.is_legacy() ||
3690     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3691
3692   object_info_t &coi = obc->obs.oi;
3693   set<snapid_t> old_snaps;
3694   if (legacy) {
3695     old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3696   } else {
3697     auto p = snapset.clone_snaps.find(coid.snap);
3698     if (p == snapset.clone_snaps.end()) {
3699       osd->clog->error() << "No clone_snaps in snapset " << snapset
3700                          << " for object " << coid << "\n";
3701       return -ENOENT;
3702     }
3703     old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3704                      snapset.clone_snaps[coid.snap].end());
3705   }
3706   if (old_snaps.empty()) {
3707     osd->clog->error() << "No object info snaps for object " << coid;
3708     return -ENOENT;
3709   }
3710
3711   dout(10) << coid << " old_snaps " << old_snaps
3712            << " old snapset " << snapset << dendl;
3713   if (snapset.seq == 0) {
3714     osd->clog->error() << "No snapset.seq for object " << coid;
3715     return -ENOENT;
3716   }
3717
       // keep only the snaps the pool has not removed
3718   set<snapid_t> new_snaps;
3719   for (set<snapid_t>::iterator i = old_snaps.begin();
3720        i != old_snaps.end();
3721        ++i) {
3722     if (!pool.info.is_removed_snap(*i))
3723       new_snaps.insert(*i);
3724   }
3725
3726   vector<snapid_t>::iterator p = snapset.clones.end();
3727
       // the clone's position in snapset.clones is only needed when the
       // clone itself is going to be deleted (no snaps left)
3728   if (new_snaps.empty()) {
3729     p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3730     if (p == snapset.clones.end()) {
3731       osd->clog->error() << "Snap " << coid.snap << " not in clones";
3732       return -ENOENT;
3733     }
3734   }
3735
3736   OpContextUPtr ctx = simple_opc_create(obc);
3737   ctx->snapset_obc = snapset_obc;
3738
       // both the clone and the snapset object must be write-locked; on
       // failure the half-built context is closed and -ENOLCK returned
3739   if (!ctx->lock_manager.get_snaptrimmer_write(
3740         coid,
3741         obc,
3742         first)) {
3743     close_op_ctx(ctx.release());
3744     dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3745     return -ENOLCK;
3746   }
3747
3748   if (!ctx->lock_manager.get_snaptrimmer_write(
3749         snapoid,
3750         snapset_obc,
3751         first)) {
3752     close_op_ctx(ctx.release());
3753     dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3754     return -ENOLCK;
3755   }
3756
3757   ctx->at_version = get_next_version();
3758
3759   PGTransaction *t = ctx->op_t.get();
3760
3761   if (new_snaps.empty()) {
3762     // remove clone
3763     dout(10) << coid << " snaps " << old_snaps << " -> "
3764              << new_snaps << " ... deleting" << dendl;
3765
3766     // ...from snapset
3767     assert(p != snapset.clones.end());
3768
3769     snapid_t last = coid.snap;
         // the deleted clone's bytes no longer count
3770     ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3771
3772     if (p != snapset.clones.begin()) {
3773       // not the oldest... merge overlap into next older clone
3774       vector<snapid_t>::iterator n = p - 1;
3775       hobject_t prev_coid = coid;
3776       prev_coid.snap = *n;
           // the older clone's byte count is re-accounted around the overlap
           // change (subtract before, add after), but only if
           // is_present_clone() reports that clone as present
3777       bool adjust_prev_bytes = is_present_clone(prev_coid);
3778
3779       if (adjust_prev_bytes)
3780         ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3781
3782       snapset.clone_overlap[*n].intersection_of(
3783         snapset.clone_overlap[*p]);
3784
3785       if (adjust_prev_bytes)
3786         ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3787     }
         // undo every per-object counter the clone contributed to
3788     ctx->delta_stats.num_objects--;
3789     if (coi.is_dirty())
3790       ctx->delta_stats.num_objects_dirty--;
3791     if (coi.is_omap())
3792       ctx->delta_stats.num_objects_omap--;
3793     if (coi.is_whiteout()) {
3794       dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3795       ctx->delta_stats.num_whiteouts--;
3796     }
3797     ctx->delta_stats.num_object_clones--;
3798     if (coi.is_cache_pinned())
3799       ctx->delta_stats.num_objects_pinned--;
3800     obc->obs.exists = false;
3801
         // drop all snapset bookkeeping for this clone; `p` is invalid after
         // the erase, so `last` is used as the key from here on
3802     snapset.clones.erase(p);
3803     snapset.clone_overlap.erase(last);
3804     snapset.clone_size.erase(last);
3805     snapset.clone_snaps.erase(last);
3806
3807     ctx->log.push_back(
3808       pg_log_entry_t(
3809         pg_log_entry_t::DELETE,
3810         coid,
3811         ctx->at_version,
3812         ctx->obs->oi.version,
3813         0,
3814         osd_reqid_t(),
3815         ctx->mtime,
3816         0)
3817       );
3818     t->remove(coid);
3819     t->update_snaps(
3820       coid,
3821       old_snaps,
3822       new_snaps);
3823
         // reset the cached object info now that the clone is gone
3824     coi = object_info_t(coid);
3825
         // the snapset object's log entry below uses the next version
3826     ctx->at_version.version++;
3827   } else {
3828     // save adjusted snaps for this object
3829     dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
         // new_snaps is an ascending set; the stored vectors are filled via
         // reverse iterators, i.e. in descending snap id order
3830     if (legacy) {
3831       coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3832     } else {
3833       snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3834                                                         new_snaps.rend());
3835       // we still do a 'modify' event on this object just to trigger a
3836       // snapmapper.update ... :(
3837     }
3838
3839     coi.prior_version = coi.version;
3840     coi.version = ctx->at_version;
3841     bl.clear();
3842     ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3843     t->setattr(coid, OI_ATTR, bl);
3844
3845     ctx->log.push_back(
3846       pg_log_entry_t(
3847         pg_log_entry_t::MODIFY,
3848         coid,
3849         coi.version,
3850         coi.prior_version,
3851         0,
3852         osd_reqid_t(),
3853         ctx->mtime,
3854         0)
3855       );
         // the snapset object's log entry below uses the next version
3856     ctx->at_version.version++;
3857
3858     t->update_snaps(
3859       coid,
3860       old_snaps,
3861       new_snaps);
3862   }
3863
3864   // save head snapset
3865   dout(10) << coid << " new snapset " << snapset << " on "
3866            << snapset_obc->obs.oi << dendl;
       // The snapset object is removed when no clones remain and either the
       // head does not exist, or the head is a whiteout that is neither
       // (dirty while the pool is a tier) nor cache-pinned.
3867   if (snapset.clones.empty() &&
3868       (!snapset.head_exists ||
3869        (snapset_obc->obs.oi.is_whiteout() &&
3870         !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3871         !snapset_obc->obs.oi.is_cache_pinned()))) {
3872     // NOTE: this arguably constitutes minor interference with the
3873     // tiering agent if this is a cache tier since a snap trim event
3874     // is effectively evicting a whiteout we might otherwise want to
3875     // keep around.
3876     dout(10) << coid << " removing " << snapoid << dendl;
3877     ctx->log.push_back(
3878       pg_log_entry_t(
3879         pg_log_entry_t::DELETE,
3880         snapoid,
3881         ctx->at_version,
3882         ctx->snapset_obc->obs.oi.version,
3883         0,
3884         osd_reqid_t(),
3885         ctx->mtime,
3886         0)
3887       );
         // object counters are only adjusted here when the removed object is
         // the head (not for a snapdir)
3888     if (snapoid.is_head()) {
3889       derr << "removing snap head" << dendl;
3890       object_info_t& oi = ctx->snapset_obc->obs.oi;
3891       ctx->delta_stats.num_objects--;
3892       if (oi.is_dirty()) {
3893         ctx->delta_stats.num_objects_dirty--;
3894       }
3895       if (oi.is_omap())
3896         ctx->delta_stats.num_objects_omap--;
3897       if (oi.is_whiteout()) {
3898         dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3899         ctx->delta_stats.num_whiteouts--;
3900       }
3901       if (oi.is_cache_pinned()) {
3902         ctx->delta_stats.num_objects_pinned--;
3903       }
3904     }
3905     ctx->snapset_obc->obs.exists = false;
3906     ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3907     t->remove(snapoid);
3908   } else {
3909     dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3910     snapset.filter(pool.info);
3911     dout(10) << coid << " writing updated snapset on " << snapoid
3912              << ", snapset is " << snapset << dendl;
3913     ctx->log.push_back(
3914       pg_log_entry_t(
3915         pg_log_entry_t::MODIFY,
3916         snapoid,
3917         ctx->at_version,
3918         ctx->snapset_obc->obs.oi.version,
3919         0,
3920         osd_reqid_t(),
3921         ctx->mtime,
3922         0)
3923       );
3924
3925     ctx->snapset_obc->obs.oi.prior_version =
3926       ctx->snapset_obc->obs.oi.version;
3927     ctx->snapset_obc->obs.oi.version = ctx->at_version;
3928
         // rewrite both the snapset (SS_ATTR) and the bumped object info
         // (OI_ATTR) on the snapset object in one setattrs call
3929     map <string, bufferlist> attrs;
3930     bl.clear();
3931     ::encode(snapset, bl);
3932     attrs[SS_ATTR].claim(bl);
3933
3934     bl.clear();
3935     ::encode(ctx->snapset_obc->obs.oi, bl,
3936              get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3937     attrs[OI_ATTR].claim(bl);
3938     t->setattrs(snapoid, attrs);
3939   }
3940
       // hand ownership of the prepared context to the caller
3941   *ctxp = std::move(ctx);
3942   return 0;
3943 }
3944
3945 void PrimaryLogPG::kick_snap_trim()
3946 {
3947   assert(is_active());
3948   assert(is_primary());
3949   if (is_clean() && !snap_trimq.empty()) {
3950     dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3951     snap_trimmer_machine.process_event(KickTrim());
3952   }
3953 }
3954
3955 void PrimaryLogPG::snap_trimmer_scrub_complete()
3956 {
3957   if (is_primary() && is_active() && is_clean()) {
3958     assert(!snap_trimq.empty());
3959     snap_trimmer_machine.process_event(ScrubComplete());
3960   }
3961 }
3962
3963 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3964 {
3965   if (deleting || pg_has_reset_since(queued)) {
3966     return;
3967   }
3968
3969   assert(is_primary());
3970
3971   dout(10) << "snap_trimmer posting" << dendl;
3972   snap_trimmer_machine.process_event(DoSnapWork());
3973   dout(10) << "snap_trimmer complete" << dendl;
3974   return;
3975 }
3976
3977 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3978 {
3979   __u64 v2;
3980
3981   string v2s(xattr.c_str(), xattr.length());
3982   if (v2s.length())
3983     v2 = strtoull(v2s.c_str(), NULL, 10);
3984   else
3985     v2 = 0;
3986
3987   dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
3988
3989   switch (op) {
3990   case CEPH_OSD_CMPXATTR_OP_EQ:
3991     return (v1 == v2);
3992   case CEPH_OSD_CMPXATTR_OP_NE:
3993     return (v1 != v2);
3994   case CEPH_OSD_CMPXATTR_OP_GT:
3995     return (v1 > v2);
3996   case CEPH_OSD_CMPXATTR_OP_GTE:
3997     return (v1 >= v2);
3998   case CEPH_OSD_CMPXATTR_OP_LT:
3999     return (v1 < v2);
4000   case CEPH_OSD_CMPXATTR_OP_LTE:
4001     return (v1 <= v2);
4002   default:
4003     return -EINVAL;
4004   }
4005 }
4006
4007 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
4008 {
4009   string v2s(xattr.c_str(), xattr.length());
4010
4011   dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
4012
4013   switch (op) {
4014   case CEPH_OSD_CMPXATTR_OP_EQ:
4015     return (v1s.compare(v2s) == 0);
4016   case CEPH_OSD_CMPXATTR_OP_NE:
4017     return (v1s.compare(v2s) != 0);
4018   case CEPH_OSD_CMPXATTR_OP_GT:
4019     return (v1s.compare(v2s) > 0);
4020   case CEPH_OSD_CMPXATTR_OP_GTE:
4021     return (v1s.compare(v2s) >= 0);
4022   case CEPH_OSD_CMPXATTR_OP_LT:
4023     return (v1s.compare(v2s) < 0);
4024   case CEPH_OSD_CMPXATTR_OP_LTE:
4025     return (v1s.compare(v2s) <= 0);
4026   default:
4027     return -EINVAL;
4028   }
4029 }
4030
4031 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4032 {
4033   ceph_osd_op& op = osd_op.op;
4034   vector<OSDOp> write_ops(1);
4035   OSDOp& write_op = write_ops[0];
4036   uint64_t write_length = op.writesame.length;
4037   int result = 0;
4038
4039   if (!write_length)
4040     return 0;
4041
4042   if (!op.writesame.data_length || write_length % op.writesame.data_length)
4043     return -EINVAL;
4044
4045   if (op.writesame.data_length != osd_op.indata.length()) {
4046     derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4047     return -EINVAL;
4048   }
4049
4050   while (write_length) {
4051     write_op.indata.append(osd_op.indata);
4052     write_length -= op.writesame.data_length;
4053   }
4054
4055   write_op.op.op = CEPH_OSD_OP_WRITE;
4056   write_op.op.extent.offset = op.writesame.offset;
4057   write_op.op.extent.length = op.writesame.length;
4058   result = do_osd_ops(ctx, write_ops);
4059   if (result < 0)
4060     derr << "do_writesame do_osd_ops failed " << result << dendl;
4061
4062   return result;
4063 }
4064
4065 // ========================================================================
4066 // low level osd ops
4067
4068 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4069 {
4070   dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4071   bufferlist header, vals;
4072   int r = _get_tmap(ctx, &header, &vals);
4073   if (r < 0) {
4074     if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4075       r = 0;
4076     return r;
4077   }
4078
4079   vector<OSDOp> ops(3);
4080
4081   ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4082   ops[0].op.extent.offset = 0;
4083   ops[0].op.extent.length = 0;
4084
4085   ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4086   ops[1].indata.claim(header);
4087
4088   ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4089   ops[2].indata.claim(vals);
4090
4091   return do_osd_ops(ctx, ops);
4092 }
4093
4094 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4095                                     bufferlist& bl)
4096 {
4097   // decode
4098   bufferlist header;
4099   map<string, bufferlist> m;
4100   if (bl.length()) {
4101     bufferlist::iterator p = bl.begin();
4102     ::decode(header, p);
4103     ::decode(m, p);
4104     assert(p.end());
4105   }
4106
4107   // do the update(s)
4108   while (!bp.end()) {
4109     __u8 op;
4110     string key;
4111     ::decode(op, bp);
4112
4113     switch (op) {
4114     case CEPH_OSD_TMAP_SET: // insert key
4115       {
4116         ::decode(key, bp);
4117         bufferlist data;
4118         ::decode(data, bp);
4119         m[key] = data;
4120       }
4121       break;
4122     case CEPH_OSD_TMAP_RM: // remove key
4123       ::decode(key, bp);
4124       if (!m.count(key)) {
4125         return -ENOENT;
4126       }
4127       m.erase(key);
4128       break;
4129     case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4130       ::decode(key, bp);
4131       m.erase(key);
4132       break;
4133     case CEPH_OSD_TMAP_HDR: // update header
4134       {
4135         ::decode(header, bp);
4136       }
4137       break;
4138     default:
4139       return -EINVAL;
4140     }
4141   }
4142
4143   // reencode
4144   bufferlist obl;
4145   ::encode(header, obl);
4146   ::encode(m, obl);
4147
4148   // write it out
4149   vector<OSDOp> nops(1);
4150   OSDOp& newop = nops[0];
4151   newop.op.op = CEPH_OSD_OP_WRITEFULL;
4152   newop.op.extent.offset = 0;
4153   newop.op.extent.length = obl.length();
4154   newop.indata = obl;
4155   do_osd_ops(ctx, nops);
4156   osd_op.outdata.claim(newop.outdata);
4157   return 0;
4158 }
4159
4160 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4161 {
4162   bufferlist::iterator orig_bp = bp;
4163   int result = 0;
4164   if (bp.end()) {
4165     dout(10) << "tmapup is a no-op" << dendl;
4166   } else {
4167     // read the whole object
4168     vector<OSDOp> nops(1);
4169     OSDOp& newop = nops[0];
4170     newop.op.op = CEPH_OSD_OP_READ;
4171     newop.op.extent.offset = 0;
4172     newop.op.extent.length = 0;
4173     result = do_osd_ops(ctx, nops);
4174
4175     dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4176
4177     dout(30) << " starting is \n";
4178     newop.outdata.hexdump(*_dout);
4179     *_dout << dendl;
4180
4181     bufferlist::iterator ip = newop.outdata.begin();
4182     bufferlist obl;
4183
4184     dout(30) << "the update command is: \n";
4185     osd_op.indata.hexdump(*_dout);
4186     *_dout << dendl;
4187
4188     // header
4189     bufferlist header;
4190     __u32 nkeys = 0;
4191     if (newop.outdata.length()) {
4192       ::decode(header, ip);
4193       ::decode(nkeys, ip);
4194     }
4195     dout(10) << "tmapup header " << header.length() << dendl;
4196
4197     if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4198       ++bp;
4199       ::decode(header, bp);
4200       dout(10) << "tmapup new header " << header.length() << dendl;
4201     }
4202
4203     ::encode(header, obl);
4204
4205     dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4206
4207     // update keys
4208     bufferlist newkeydata;
4209     string nextkey, last_in_key;
4210     bufferlist nextval;
4211     bool have_next = false;
4212     if (!ip.end()) {
4213       have_next = true;
4214       ::decode(nextkey, ip);
4215       ::decode(nextval, ip);
4216     }
4217     while (!bp.end() && !result) {
4218       __u8 op;
4219       string key;
4220       try {
4221         ::decode(op, bp);
4222         ::decode(key, bp);
4223       }
4224       catch (buffer::error& e) {
4225         return -EINVAL;
4226       }
4227       if (key < last_in_key) {
4228         dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4229                 << "', falling back to an inefficient (unsorted) update" << dendl;
4230         bp = orig_bp;
4231         return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4232       }
4233       last_in_key = key;
4234
4235       dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4236
4237       // skip existing intervening keys
4238       bool key_exists = false;
4239       while (have_next && !key_exists) {
4240         dout(20) << "  (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4241         if (nextkey > key)
4242           break;
4243         if (nextkey < key) {
4244           // copy untouched.
4245           ::encode(nextkey, newkeydata);
4246           ::encode(nextval, newkeydata);
4247           dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
4248         } else {
4249           // don't copy; discard old value.  and stop.
4250           dout(20) << "  drop " << nextkey << " " << nextval.length() << dendl;
4251           key_exists = true;
4252           nkeys--;
4253         }
4254         if (!ip.end()) {
4255           ::decode(nextkey, ip);
4256           ::decode(nextval, ip);
4257         } else {
4258           have_next = false;
4259         }
4260       }
4261
4262       if (op == CEPH_OSD_TMAP_SET) {
4263         bufferlist val;
4264         try {
4265           ::decode(val, bp);
4266         }
4267         catch (buffer::error& e) {
4268           return -EINVAL;
4269         }
4270         ::encode(key, newkeydata);
4271         ::encode(val, newkeydata);
4272         dout(20) << "   set " << key << " " << val.length() << dendl;
4273         nkeys++;
4274       } else if (op == CEPH_OSD_TMAP_CREATE) {
4275         if (key_exists) {
4276           return -EEXIST;
4277         }
4278         bufferlist val;
4279         try {
4280           ::decode(val, bp);
4281         }
4282         catch (buffer::error& e) {
4283           return -EINVAL;
4284         }
4285         ::encode(key, newkeydata);
4286         ::encode(val, newkeydata);
4287         dout(20) << "   create " << key << " " << val.length() << dendl;
4288         nkeys++;
4289       } else if (op == CEPH_OSD_TMAP_RM) {
4290         // do nothing.
4291         if (!key_exists) {
4292           return -ENOENT;
4293         }
4294       } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4295         // do nothing
4296       } else {
4297         dout(10) << "  invalid tmap op " << (int)op << dendl;
4298         return -EINVAL;
4299       }
4300     }
4301
4302     // copy remaining
4303     if (have_next) {
4304       ::encode(nextkey, newkeydata);
4305       ::encode(nextval, newkeydata);
4306       dout(20) << "  keep " << nextkey << " " << nextval.length() << dendl;
4307     }
4308     if (!ip.end()) {
4309       bufferlist rest;
4310       rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4311       dout(20) << "  keep trailing " << rest.length()
4312                << " at " << newkeydata.length() << dendl;
4313       newkeydata.claim_append(rest);
4314     }
4315
4316     // encode final key count + key data
4317     dout(20) << "tmapup final nkeys " << nkeys << dendl;
4318     ::encode(nkeys, obl);
4319     obl.claim_append(newkeydata);
4320
4321     if (0) {
4322       dout(30) << " final is \n";
4323       obl.hexdump(*_dout);
4324       *_dout << dendl;
4325
4326       // sanity check
4327       bufferlist::iterator tp = obl.begin();
4328       bufferlist h;
4329       ::decode(h, tp);
4330       map<string,bufferlist> d;
4331       ::decode(d, tp);
4332       assert(tp.end());
4333       dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4334     }
4335
4336     // write it out
4337     if (!result) {
4338       dout(20) << "tmapput write " << obl.length() << dendl;
4339       newop.op.op = CEPH_OSD_OP_WRITEFULL;
4340       newop.op.extent.offset = 0;
4341       newop.op.extent.length = obl.length();
4342       newop.indata = obl;
4343       do_osd_ops(ctx, nops);
4344       osd_op.outdata.claim(newop.outdata);
4345     }
4346   }
4347   return result;
4348 }
4349
// Validate that the extent [offset, offset + length) fits below `max`.
//
// @return -EFBIG if offset is at or beyond max, or if the extent would end
//         past max; 0 otherwise
//
// The end of the extent is checked as "length > max - offset" rather than
// "offset + length > max": the sum can wrap around for very large inputs and
// make an out-of-range extent look valid.  The subtraction cannot underflow
// because offset < max has already been established.
static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
{
  if (offset >= max ||
      length > max - offset)
    return -EFBIG;

  return 0;
}
4359
4360 struct FillInVerifyExtent : public Context {
4361   ceph_le64 *r;
4362   int32_t *rval;
4363   bufferlist *outdatap;
4364   boost::optional<uint32_t> maybe_crc;
4365   uint64_t size;
4366   OSDService *osd;
4367   hobject_t soid;
4368   __le32 flags;
4369   FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4370                      boost::optional<uint32_t> mc, uint64_t size,
4371                      OSDService *osd, hobject_t soid, __le32 flags) :
4372     r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4373     size(size), osd(osd), soid(soid), flags(flags) {}
4374   void finish(int len) override {
4375     *r = len;
4376     if (len < 0) {
4377       *rval = len;
4378       return;
4379     }
4380     *rval = 0;
4381
4382     // whole object?  can we verify the checksum?
4383     if (maybe_crc && *r == size) {
4384       uint32_t crc = outdatap->crc32c(-1);
4385       if (maybe_crc != crc) {
4386         osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4387                            << " != expected 0x" << *maybe_crc
4388                            << std::dec << " on " << soid;
4389         if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4390           *rval = -EIO;
4391           *r = 0;
4392         }
4393       }
4394     }
4395   }
4396 };
4397
4398 struct ToSparseReadResult : public Context {
4399   int* result;
4400   bufferlist* data_bl;
4401   uint64_t data_offset;
4402   ceph_le64* len;
4403   ToSparseReadResult(int* result, bufferlist* bl, uint64_t offset,
4404                      ceph_le64* len)
4405     : result(result), data_bl(bl), data_offset(offset),len(len) {}
4406   void finish(int r) override {
4407     if (r < 0) {
4408       *result = r;
4409       return;
4410     }
4411     *result = 0;
4412     *len = r;
4413     bufferlist outdata;
4414     map<uint64_t, uint64_t> extents = {{data_offset, r}};
4415     ::encode(extents, outdata);
4416     ::encode_destructively(*data_bl, outdata);
4417     data_bl->swap(outdata);
4418   }
4419 };
4420
// Join the keys of `m` (in map order) into one comma-separated string.
// A separator is only inserted once the result is non-empty, so leading
// empty keys leave no trace in the output.
template<typename V>
static std::string list_keys(const std::map<std::string, V>& m) {
  std::string joined;
  for (const auto& kv : m) {
    if (!joined.empty())
      joined += ',';
    joined.append(kv.first);
  }
  return joined;
}
4432
// Join the elements of container `m` (in iteration order) into one
// comma-separated string.  A separator is only inserted once the result is
// non-empty.
template<typename T>
static std::string list_entries(const T& m) {
  std::string joined;
  for (const auto& entry : m) {
    if (!joined.empty())
      joined += ',';
    joined.append(entry);
  }
  return joined;
}
4444
4445 void PrimaryLogPG::maybe_create_new_object(
4446   OpContext *ctx,
4447   bool ignore_transaction)
4448 {
4449   ObjectState& obs = ctx->new_obs;
4450   if (!obs.exists) {
4451     ctx->delta_stats.num_objects++;
4452     obs.exists = true;
4453     assert(!obs.oi.is_whiteout());
4454     obs.oi.new_object();
4455     if (!ignore_transaction)
4456       ctx->op_t->create(obs.oi.soid);
4457   } else if (obs.oi.is_whiteout()) {
4458     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4459     ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4460     --ctx->delta_stats.num_whiteouts;
4461   }
4462 }
4463
4464 struct ReadFinisher : public PrimaryLogPG::OpFinisher {
4465   OSDOp& osd_op;
4466
4467   ReadFinisher(OSDOp& osd_op) : osd_op(osd_op) {
4468   }
4469
4470   int execute() override {
4471     return osd_op.rval;
4472   }
4473 };
4474
4475 struct C_ChecksumRead : public Context {
4476   PrimaryLogPG *primary_log_pg;
4477   OSDOp &osd_op;
4478   Checksummer::CSumType csum_type;
4479   bufferlist init_value_bl;
4480   ceph_le64 read_length;
4481   bufferlist read_bl;
4482   Context *fill_extent_ctx;
4483
4484   C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4485                  Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4486                  boost::optional<uint32_t> maybe_crc, uint64_t size,
4487                  OSDService *osd, hobject_t soid, __le32 flags)
4488     : primary_log_pg(primary_log_pg), osd_op(osd_op),
4489       csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4490       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4491                                              &read_bl, maybe_crc, size,
4492                                              osd, soid, flags)) {
4493   }
4494   ~C_ChecksumRead() override {
4495     delete fill_extent_ctx;
4496   }
4497
4498   void finish(int r) override {
4499     fill_extent_ctx->complete(r);
4500     fill_extent_ctx = nullptr;
4501
4502     if (osd_op.rval >= 0) {
4503       bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4504       osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4505                                                     &init_value_bl_it, read_bl);
4506     }
4507   }
4508 };
4509
4510 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4511                               bufferlist::iterator *bl_it)
4512 {
4513   dout(20) << __func__ << dendl;
4514
4515   auto& op = osd_op.op;
4516   if (op.checksum.chunk_size > 0) {
4517     if (op.checksum.length == 0) {
4518       dout(10) << __func__ << ": length required when chunk size provided"
4519                << dendl;
4520       return -EINVAL;
4521     }
4522     if (op.checksum.length % op.checksum.chunk_size != 0) {
4523       dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4524       return -EINVAL;
4525     }
4526   }
4527
4528   auto& oi = ctx->new_obs.oi;
4529   if (op.checksum.offset == 0 && op.checksum.length == 0) {
4530     // zeroed offset+length implies checksum whole object
4531     op.checksum.length = oi.size;
4532   } else if (op.checksum.offset + op.checksum.length > oi.size) {
4533     return -EOVERFLOW;
4534   }
4535
4536   Checksummer::CSumType csum_type;
4537   switch (op.checksum.type) {
4538   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4539     csum_type = Checksummer::CSUM_XXHASH32;
4540     break;
4541   case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4542     csum_type = Checksummer::CSUM_XXHASH64;
4543     break;
4544   case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4545     csum_type = Checksummer::CSUM_CRC32C;
4546     break;
4547   default:
4548     dout(10) << __func__ << ": unknown crc type ("
4549              << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4550     return -EINVAL;
4551   }
4552
4553   size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4554   if (bl_it->get_remaining() < csum_init_value_size) {
4555     dout(10) << __func__ << ": init value not provided" << dendl;
4556     return -EINVAL;
4557   }
4558
4559   bufferlist init_value_bl;
4560   init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4561                           csum_init_value_size);
4562   bl_it->advance(csum_init_value_size);
4563
4564   if (pool.info.require_rollback() && op.checksum.length > 0) {
4565     // If there is a data digest and it is possible we are reading
4566     // entire object, pass the digest.
4567     boost::optional<uint32_t> maybe_crc;
4568     if (oi.is_data_digest() && op.checksum.offset == 0 &&
4569         op.checksum.length >= oi.size) {
4570       maybe_crc = oi.data_digest;
4571     }
4572
4573     // async read
4574     auto& soid = oi.soid;
4575     auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4576                                            std::move(init_value_bl), maybe_crc,
4577                                            oi.size, osd, soid, op.flags);
4578
4579     ctx->pending_async_reads.push_back({
4580       {op.checksum.offset, op.checksum.length, op.flags},
4581       {&checksum_ctx->read_bl, checksum_ctx}});
4582
4583     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4584     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4585       new ReadFinisher(osd_op));
4586     return -EINPROGRESS;
4587   }
4588
4589   // sync read
4590   std::vector<OSDOp> read_ops(1);
4591   auto& read_op = read_ops[0];
4592   if (op.checksum.length > 0) {
4593     read_op.op.op = CEPH_OSD_OP_READ;
4594     read_op.op.flags = op.flags;
4595     read_op.op.extent.offset = op.checksum.offset;
4596     read_op.op.extent.length = op.checksum.length;
4597     read_op.op.extent.truncate_size = 0;
4598     read_op.op.extent.truncate_seq = 0;
4599
4600     int r = do_osd_ops(ctx, read_ops);
4601     if (r < 0) {
4602       derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4603       return r;
4604     }
4605   }
4606
4607   bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4608   return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4609                          read_op.outdata);
4610 }
4611
4612 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4613                                   Checksummer::CSumType csum_type,
4614                                   bufferlist::iterator *init_value_bl_it,
4615                                   const bufferlist &read_bl) {
4616   dout(20) << __func__ << dendl;
4617
4618   auto& op = osd_op.op;
4619
4620   if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4621     derr << __func__ << ": bytes read " << read_bl.length() << " != "
4622          << op.checksum.length << dendl;
4623     return -EINVAL;
4624   }
4625
4626   size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4627                               op.checksum.chunk_size : read_bl.length());
4628   uint32_t csum_count = (csum_chunk_size > 0 ?
4629                            read_bl.length() / csum_chunk_size : 0);
4630
4631   bufferlist csum;
4632   bufferptr csum_data;
4633   if (csum_count > 0) {
4634     size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4635     csum_data = buffer::create(csum_value_size * csum_count);
4636     csum_data.zero();
4637     csum.append(csum_data);
4638
4639     switch (csum_type) {
4640     case Checksummer::CSUM_XXHASH32:
4641       {
4642         Checksummer::xxhash32::init_value_t init_value;
4643         ::decode(init_value, *init_value_bl_it);
4644         Checksummer::calculate<Checksummer::xxhash32>(
4645           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4646           &csum_data);
4647       }
4648       break;
4649     case Checksummer::CSUM_XXHASH64:
4650       {
4651         Checksummer::xxhash64::init_value_t init_value;
4652         ::decode(init_value, *init_value_bl_it);
4653         Checksummer::calculate<Checksummer::xxhash64>(
4654           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4655           &csum_data);
4656       }
4657       break;
4658     case Checksummer::CSUM_CRC32C:
4659       {
4660         Checksummer::crc32c::init_value_t init_value;
4661         ::decode(init_value, *init_value_bl_it);
4662         Checksummer::calculate<Checksummer::crc32c>(
4663           init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4664           &csum_data);
4665       }
4666       break;
4667     default:
4668       break;
4669     }
4670   }
4671
4672   ::encode(csum_count, osd_op.outdata);
4673   osd_op.outdata.claim_append(csum);
4674   return 0;
4675 }
4676
4677 struct C_ExtentCmpRead : public Context {
4678   PrimaryLogPG *primary_log_pg;
4679   OSDOp &osd_op;
4680   ceph_le64 read_length;
4681   bufferlist read_bl;
4682   Context *fill_extent_ctx;
4683
4684   C_ExtentCmpRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4685                   boost::optional<uint32_t> maybe_crc, uint64_t size,
4686                   OSDService *osd, hobject_t soid, __le32 flags)
4687     : primary_log_pg(primary_log_pg), osd_op(osd_op),
4688       fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4689                                              &read_bl, maybe_crc, size,
4690                                              osd, soid, flags)) {
4691   }
4692   ~C_ExtentCmpRead() override {
4693     delete fill_extent_ctx;
4694   }
4695
4696   void finish(int r) override {
4697     if (r == -ENOENT) {
4698       osd_op.rval = 0;
4699       read_bl.clear();
4700       delete fill_extent_ctx;
4701     } else {
4702       fill_extent_ctx->complete(r);
4703     }
4704     fill_extent_ctx = nullptr;
4705
4706     if (osd_op.rval >= 0) {
4707       osd_op.rval = primary_log_pg->finish_extent_cmp(osd_op, read_bl);
4708     }
4709   }
4710 };
4711
4712 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
4713 {
4714   dout(20) << __func__ << dendl;
4715   ceph_osd_op& op = osd_op.op;
4716
4717   if (!ctx->obs->exists || ctx->obs->oi.is_whiteout()) {
4718     dout(20) << __func__ << " object DNE" << dendl;
4719     return finish_extent_cmp(osd_op, {});
4720   } else if (pool.info.require_rollback()) {
4721     // If there is a data digest and it is possible we are reading
4722     // entire object, pass the digest.
4723     auto& oi = ctx->new_obs.oi;
4724     boost::optional<uint32_t> maybe_crc;
4725     if (oi.is_data_digest() && op.checksum.offset == 0 &&
4726         op.checksum.length >= oi.size) {
4727       maybe_crc = oi.data_digest;
4728     }
4729
4730     // async read
4731     auto& soid = oi.soid;
4732     auto extent_cmp_ctx = new C_ExtentCmpRead(this, osd_op, maybe_crc, oi.size,
4733                                               osd, soid, op.flags);
4734     ctx->pending_async_reads.push_back({
4735       {op.extent.offset, op.extent.length, op.flags},
4736       {&extent_cmp_ctx->read_bl, extent_cmp_ctx}});
4737
4738     dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4739
4740     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4741       new ReadFinisher(osd_op));
4742     return -EINPROGRESS;
4743   }
4744
4745   // sync read
4746   vector<OSDOp> read_ops(1);
4747   OSDOp& read_op = read_ops[0];
4748
4749   read_op.op.op = CEPH_OSD_OP_SYNC_READ;
4750   read_op.op.extent.offset = op.extent.offset;
4751   read_op.op.extent.length = op.extent.length;
4752   read_op.op.extent.truncate_seq = op.extent.truncate_seq;
4753   read_op.op.extent.truncate_size = op.extent.truncate_size;
4754
4755   int result = do_osd_ops(ctx, read_ops);
4756   if (result < 0) {
4757     derr << __func__ << " failed " << result << dendl;
4758     return result;
4759   }
4760   return finish_extent_cmp(osd_op, read_op.outdata);
4761 }
4762
4763 int PrimaryLogPG::finish_extent_cmp(OSDOp& osd_op, const bufferlist &read_bl)
4764 {
4765   for (uint64_t idx = 0; idx < osd_op.indata.length(); ++idx) {
4766     char read_byte = (idx < read_bl.length() ? read_bl[idx] : 0);
4767     if (osd_op.indata[idx] != read_byte) {
4768         return (-MAX_ERRNO - idx);
4769     }
4770   }
4771
4772   return 0;
4773 }
4774
4775 int PrimaryLogPG::do_read(OpContext *ctx, OSDOp& osd_op) {
4776   dout(20) << __func__ << dendl;
4777   auto& op = osd_op.op;
4778   auto& oi = ctx->new_obs.oi;
4779   auto& soid = oi.soid;
4780   __u32 seq = oi.truncate_seq;
4781   uint64_t size = oi.size;
4782   bool trimmed_read = false;
4783
4784   // are we beyond truncate_size?
4785   if ( (seq < op.extent.truncate_seq) &&
4786        (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4787     size = op.extent.truncate_size;
4788
4789   if (op.extent.length == 0) //length is zero mean read the whole object
4790     op.extent.length = size;
4791
4792   if (op.extent.offset >= size) {
4793     op.extent.length = 0;
4794     trimmed_read = true;
4795   } else if (op.extent.offset + op.extent.length > size) {
4796     op.extent.length = size - op.extent.offset;
4797     trimmed_read = true;
4798   }
4799
4800   // read into a buffer
4801   int result = 0;
4802   if (trimmed_read && op.extent.length == 0) {
4803     // read size was trimmed to zero and it is expected to do nothing
4804     // a read operation of 0 bytes does *not* do nothing, this is why
4805     // the trimmed_read boolean is needed
4806   } else if (pool.info.require_rollback()) {
4807     boost::optional<uint32_t> maybe_crc;
4808     // If there is a data digest and it is possible we are reading
4809     // entire object, pass the digest.  FillInVerifyExtent will
4810     // will check the oi.size again.
4811     if (oi.is_data_digest() && op.extent.offset == 0 &&
4812         op.extent.length >= oi.size)
4813       maybe_crc = oi.data_digest;
4814     ctx->pending_async_reads.push_back(
4815       make_pair(
4816         boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4817         make_pair(&osd_op.outdata,
4818                   new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4819                                          &osd_op.outdata, maybe_crc, oi.size,
4820                                          osd, soid, op.flags))));
4821     dout(10) << " async_read noted for " << soid << dendl;
4822
4823     ctx->op_finishers[ctx->current_osd_subop_num].reset(
4824       new ReadFinisher(osd_op));
4825   } else {
4826     int r = pgbackend->objects_read_sync(
4827       soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4828     if (r == -EIO) {
4829       r = rep_repair_primary_object(soid, ctx->op);
4830     }
4831     if (r >= 0)
4832       op.extent.length = r;
4833     else {
4834       result = r;
4835       op.extent.length = 0;
4836     }
4837     dout(10) << " read got " << r << " / " << op.extent.length
4838              << " bytes from obj " << soid << dendl;
4839
4840     // whole object?  can we verify the checksum?
4841     if (op.extent.length == oi.size && oi.is_data_digest()) {
4842       uint32_t crc = osd_op.outdata.crc32c(-1);
4843       if (oi.data_digest != crc) {
4844         osd->clog->error() << info.pgid << std::hex
4845                            << " full-object read crc 0x" << crc
4846                            << " != expected 0x" << oi.data_digest
4847                            << std::dec << " on " << soid;
4848         // FIXME fall back to replica or something?
4849         result = -EIO;
4850       }
4851     }
4852   }
4853
4854   // XXX the op.extent.length is the requested length for async read
4855   // On error this length is changed to 0 after the error comes back.
4856   ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4857   ctx->delta_stats.num_rd++;
4858   return result;
4859 }
4860
4861 int PrimaryLogPG::do_sparse_read(OpContext *ctx, OSDOp& osd_op) {
4862   dout(20) << __func__ << dendl;
4863   auto& op = osd_op.op;
4864   auto& oi = ctx->new_obs.oi;
4865   auto& soid = oi.soid;
4866
4867   if (op.extent.truncate_seq) {
4868     dout(0) << "sparse_read does not support truncation sequence " << dendl;
4869     return -EINVAL;
4870   }
4871
4872   ++ctx->num_read;
4873   if (pool.info.ec_pool()) {
4874     // translate sparse read to a normal one if not supported
4875     uint64_t offset = op.extent.offset;
4876     uint64_t length = op.extent.length;
4877     if (offset > oi.size) {
4878       length = 0;
4879     } else if (offset + length > oi.size) {
4880       length = oi.size - offset;
4881     }
4882
4883     if (length > 0) {
4884       ctx->pending_async_reads.push_back(
4885         make_pair(
4886           boost::make_tuple(offset, length, op.flags),
4887           make_pair(
4888             &osd_op.outdata,
4889             new ToSparseReadResult(&osd_op.rval, &osd_op.outdata, offset,
4890                                    &op.extent.length))));
4891       dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4892
4893       ctx->op_finishers[ctx->current_osd_subop_num].reset(
4894         new ReadFinisher(osd_op));
4895     } else {
4896       dout(10) << " sparse read ended up empty for " << soid << dendl;
4897       map<uint64_t, uint64_t> extents;
4898       ::encode(extents, osd_op.outdata);
4899     }
4900   } else {
4901     // read into a buffer
4902     map<uint64_t, uint64_t> m;
4903     uint32_t total_read = 0;
4904     int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4905                                               info.pgid.shard),
4906                                op.extent.offset, op.extent.length, m);
4907     if (r < 0)  {
4908       return r;
4909     }
4910
4911     map<uint64_t, uint64_t>::iterator miter;
4912     bufferlist data_bl;
4913     uint64_t last = op.extent.offset;
4914     for (miter = m.begin(); miter != m.end(); ++miter) {
4915       // verify hole?
4916       if (cct->_conf->osd_verify_sparse_read_holes &&
4917           last < miter->first) {
4918         bufferlist t;
4919         uint64_t len = miter->first - last;
4920         r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4921         if (r == -EIO) {
4922           r = rep_repair_primary_object(soid, ctx->op);
4923         }
4924         if (r < 0) {
4925           osd->clog->error() << coll << " " << soid
4926                              << " sparse-read failed to read: "
4927                              << r;
4928         } else if (!t.is_zero()) {
4929           osd->clog->error() << coll << " " << soid
4930                              << " sparse-read found data in hole "
4931                              << last << "~" << len;
4932         }
4933       }
4934
4935       bufferlist tmpbl;
4936       r = pgbackend->objects_read_sync(soid, miter->first, miter->second,
4937                                        op.flags, &tmpbl);
4938       if (r < 0) {
4939         return r;
4940       }
4941
4942       // this is usually happen when we get extent that exceeds the actual file
4943       // size
4944       if (r < (int)miter->second)
4945         miter->second = r;
4946       total_read += r;
4947       dout(10) << "sparse-read " << miter->first << "@" << miter->second
4948                << dendl;
4949       data_bl.claim_append(tmpbl);
4950       last = miter->first + r;
4951     }
4952
4953     if (r < 0) {
4954       return r;
4955     }
4956
4957     // verify trailing hole?
4958     if (cct->_conf->osd_verify_sparse_read_holes) {
4959       uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4960       if (last < end) {
4961         bufferlist t;
4962         uint64_t len = end - last;
4963         r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4964         if (r < 0) {
4965           osd->clog->error() << coll << " " << soid
4966                              << " sparse-read failed to read: " << r;
4967         } else if (!t.is_zero()) {
4968           osd->clog->error() << coll << " " << soid
4969                              << " sparse-read found data in hole "
4970                              << last << "~" << len;
4971         }
4972       }
4973     }
4974
4975     // Why SPARSE_READ need checksum? In fact, librbd always use sparse-read.
4976     // Maybe at first, there is no much whole objects. With continued use, more
4977     // and more whole object exist. So from this point, for spare-read add
4978     // checksum make sense.
4979     if (total_read == oi.size && oi.is_data_digest()) {
4980       uint32_t crc = data_bl.crc32c(-1);
4981       if (oi.data_digest != crc) {
4982         osd->clog->error() << info.pgid << std::hex
4983           << " full-object read crc 0x" << crc
4984           << " != expected 0x" << oi.data_digest
4985           << std::dec << " on " << soid;
4986         // FIXME fall back to replica or something?
4987         return -EIO;
4988       }
4989     }
4990
4991     op.extent.length = total_read;
4992
4993     ::encode(m, osd_op.outdata); // re-encode since it might be modified
4994     ::encode_destructively(data_bl, osd_op.outdata);
4995
4996     dout(10) << " sparse_read got " << total_read << " bytes from object "
4997              << soid << dendl;
4998   }
4999
5000   ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
5001   ctx->delta_stats.num_rd++;
5002   return 0;
5003 }
5004
5005 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
5006 {
5007   int result = 0;
5008   SnapSetContext *ssc = ctx->obc->ssc;
5009   ObjectState& obs = ctx->new_obs;
5010   object_info_t& oi = obs.oi;
5011   const hobject_t& soid = oi.soid;
5012
5013   PGTransaction* t = ctx->op_t.get();
5014
5015   dout(10) << "do_osd_op " << soid << " " << ops << dendl;
5016
5017   ctx->current_osd_subop_num = 0;
5018   for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
5019     OSDOp& osd_op = *p;
5020     ceph_osd_op& op = osd_op.op;
5021
5022     OpFinisher* op_finisher = nullptr;
5023     {
5024       auto op_finisher_it = ctx->op_finishers.find(ctx->current_osd_subop_num);
5025       if (op_finisher_it != ctx->op_finishers.end()) {
5026         op_finisher = op_finisher_it->second.get();
5027       }
5028     }
5029
5030     // TODO: check endianness (__le32 vs uint32_t, etc.)
5031     // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
5032     // but the code in this function seems to treat them as native-endian.  What should the
5033     // tracepoints do?
5034     tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
5035
5036     dout(10) << "do_osd_op  " << osd_op << dendl;
5037
5038     bufferlist::iterator bp = osd_op.indata.begin();
5039
5040     // user-visible modifcation?
5041     switch (op.op) {
5042       // non user-visible modifications
5043     case CEPH_OSD_OP_WATCH:
5044     case CEPH_OSD_OP_CACHE_EVICT:
5045     case CEPH_OSD_OP_CACHE_FLUSH:
5046     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5047     case CEPH_OSD_OP_UNDIRTY:
5048     case CEPH_OSD_OP_COPY_FROM:  // we handle user_version update explicitly
5049     case CEPH_OSD_OP_CACHE_PIN:
5050     case CEPH_OSD_OP_CACHE_UNPIN:
5051     case CEPH_OSD_OP_SET_REDIRECT:
5052       break;
5053     default:
5054       if (op.op & CEPH_OSD_OP_MODE_WR)
5055         ctx->user_modify = true;
5056     }
5057
5058     // munge -1 truncate to 0 truncate
5059     if (ceph_osd_op_uses_extent(op.op) &&
5060         op.extent.truncate_seq == 1 &&
5061         op.extent.truncate_size == (-1ULL)) {
5062       op.extent.truncate_size = 0;
5063       op.extent.truncate_seq = 0;
5064     }
5065
5066     // munge ZERO -> TRUNCATE?  (don't munge to DELETE or we risk hosing attributes)
5067     if (op.op == CEPH_OSD_OP_ZERO &&
5068         obs.exists &&
5069         op.extent.offset < cct->_conf->osd_max_object_size &&
5070         op.extent.length >= 1 &&
5071         op.extent.length <= cct->_conf->osd_max_object_size &&
5072         op.extent.offset + op.extent.length >= oi.size) {
5073       if (op.extent.offset >= oi.size) {
5074         // no-op
5075         goto fail;
5076       }
5077       dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
5078                << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
5079       op.op = CEPH_OSD_OP_TRUNCATE;
5080     }
5081
5082     switch (op.op) {
5083
5084       // --- READS ---
5085
5086     case CEPH_OSD_OP_CMPEXT:
5087       ++ctx->num_read;
5088       tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(),
5089                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5090                  op.extent.length, op.extent.truncate_size,
5091                  op.extent.truncate_seq);
5092
5093       if (op_finisher == nullptr) {
5094         result = do_extent_cmp(ctx, osd_op);
5095       } else {
5096         result = op_finisher->execute();
5097       }
5098       break;
5099
5100     case CEPH_OSD_OP_SYNC_READ:
5101       if (pool.info.require_rollback()) {
5102         result = -EOPNOTSUPP;
5103         break;
5104       }
5105       // fall through
5106     case CEPH_OSD_OP_READ:
5107       ++ctx->num_read;
5108       tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(),
5109                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5110                  op.extent.length, op.extent.truncate_size,
5111                  op.extent.truncate_seq);
5112       if (op_finisher == nullptr) {
5113         if (!ctx->data_off) {
5114           ctx->data_off = op.extent.offset;
5115         }
5116         result = do_read(ctx, osd_op);
5117       } else {
5118         result = op_finisher->execute();
5119       }
5120       break;
5121
5122     case CEPH_OSD_OP_CHECKSUM:
5123       ++ctx->num_read;
5124       {
5125         tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
5126                    soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
5127                    op.checksum.offset, op.checksum.length,
5128                    op.checksum.chunk_size);
5129
5130         if (op_finisher == nullptr) {
5131           result = do_checksum(ctx, osd_op, &bp);
5132         } else {
5133           result = op_finisher->execute();
5134         }
5135       }
5136       break;
5137
5138     /* map extents */
5139     case CEPH_OSD_OP_MAPEXT:
5140       tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5141       if (pool.info.require_rollback()) {
5142         result = -EOPNOTSUPP;
5143         break;
5144       }
5145       ++ctx->num_read;
5146       {
5147         // read into a buffer
5148         bufferlist bl;
5149         int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
5150                                                   info.pgid.shard),
5151                                    op.extent.offset, op.extent.length, bl);
5152         osd_op.outdata.claim(bl);
5153         if (r < 0)
5154           result = r;
5155         else
5156           ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5157         ctx->delta_stats.num_rd++;
5158         dout(10) << " map_extents done on object " << soid << dendl;
5159       }
5160       break;
5161
5162     /* map extents */
5163     case CEPH_OSD_OP_SPARSE_READ:
5164       tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(),
5165                  soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset,
5166                  op.extent.length, op.extent.truncate_size,
5167                  op.extent.truncate_seq);
5168       if (op_finisher == nullptr) {
5169         result = do_sparse_read(ctx, osd_op);
5170       } else {
5171         result = op_finisher->execute();
5172       }
5173       break;
5174
5175     case CEPH_OSD_OP_CALL:
5176       {
5177         string cname, mname;
5178         bufferlist indata;
5179         try {
5180           bp.copy(op.cls.class_len, cname);
5181           bp.copy(op.cls.method_len, mname);
5182           bp.copy(op.cls.indata_len, indata);
5183         } catch (buffer::error& e) {
5184           dout(10) << "call unable to decode class + method + indata" << dendl;
5185           dout(30) << "in dump: ";
5186           osd_op.indata.hexdump(*_dout);
5187           *_dout << dendl;
5188           result = -EINVAL;
5189           tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
5190           break;
5191         }
5192         tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
5193
5194         ClassHandler::ClassData *cls;
5195         result = osd->class_handler->open_class(cname, &cls);
5196         assert(result == 0);   // init_op_flags() already verified this works.
5197
5198         ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
5199         if (!method) {
5200           dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
5201           result = -EOPNOTSUPP;
5202           break;
5203         }
5204
5205         int flags = method->get_flags();
5206         if (flags & CLS_METHOD_WR)
5207           ctx->user_modify = true;
5208
5209         bufferlist outdata;
5210         dout(10) << "call method " << cname << "." << mname << dendl;
5211         int prev_rd = ctx->num_read;
5212         int prev_wr = ctx->num_write;
5213         result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5214
5215         if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5216           derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5217           result = -EIO;
5218           break;
5219         }
5220         if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5221           derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5222           result = -EIO;
5223           break;
5224         }
5225
5226         dout(10) << "method called response length=" << outdata.length() << dendl;
5227         op.extent.length = outdata.length();
5228         osd_op.outdata.claim_append(outdata);
5229         dout(30) << "out dump: ";
5230         osd_op.outdata.hexdump(*_dout);
5231         *_dout << dendl;
5232       }
5233       break;
5234
5235     case CEPH_OSD_OP_STAT:
5236       // note: stat does not require RD
5237       {
5238         tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5239
5240         if (obs.exists && !oi.is_whiteout()) {
5241           ::encode(oi.size, osd_op.outdata);
5242           ::encode(oi.mtime, osd_op.outdata);
5243           dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5244         } else {
5245           result = -ENOENT;
5246           dout(10) << "stat oi object does not exist" << dendl;
5247         }
5248
5249         ctx->delta_stats.num_rd++;
5250       }
5251       break;
5252
5253     case CEPH_OSD_OP_ISDIRTY:
5254       ++ctx->num_read;
5255       {
5256         tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5257         bool is_dirty = obs.oi.is_dirty();
5258         ::encode(is_dirty, osd_op.outdata);
5259         ctx->delta_stats.num_rd++;
5260         result = 0;
5261       }
5262       break;
5263
5264     case CEPH_OSD_OP_UNDIRTY:
5265       ++ctx->num_write;
5266       {
5267         tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5268         if (oi.is_dirty()) {
5269           ctx->undirty = true;  // see make_writeable()
5270           ctx->modify = true;
5271           ctx->delta_stats.num_wr++;
5272         }
5273         result = 0;
5274       }
5275       break;
5276
5277     case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5278       ++ctx->num_write;
5279       {
5280         tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5281         if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5282           dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5283           result = -EINVAL;
5284           break;
5285         }
5286         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5287           result = -EINVAL;
5288           break;
5289         }
5290         if (!obs.exists) {
5291           result = 0;
5292           break;
5293         }
5294         if (oi.is_cache_pinned()) {
5295           dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5296           result = -EPERM;
5297           break;
5298         }
5299         if (oi.is_dirty()) {
5300           result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5301           if (result == -EINPROGRESS)
5302             result = -EAGAIN;
5303         } else {
5304           result = 0;
5305         }
5306       }
5307       break;
5308
5309     case CEPH_OSD_OP_CACHE_FLUSH:
5310       ++ctx->num_write;
5311       {
5312         tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5313         if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5314           dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5315           result = -EINVAL;
5316           break;
5317         }
5318         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5319           result = -EINVAL;
5320           break;
5321         }
5322         if (!obs.exists) {
5323           result = 0;
5324           break;
5325         }
5326         if (oi.is_cache_pinned()) {
5327           dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5328           result = -EPERM;
5329           break;
5330         }
5331         hobject_t missing;
5332         if (oi.is_dirty()) {
5333           result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5334           if (result == -EINPROGRESS)
5335             result = -EAGAIN;
5336         } else {
5337           result = 0;
5338         }
5339         // Check special return value which has set missing_return
5340         if (result == -ENOENT) {
5341           dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5342           assert(!missing.is_min());
5343           wait_for_unreadable_object(missing, ctx->op);
5344           // Error code which is used elsewhere when wait_for_unreadable_object() is used
5345           result = -EAGAIN;
5346         }
5347       }
5348       break;
5349
5350     case CEPH_OSD_OP_CACHE_EVICT:
5351       ++ctx->num_write;
5352       {
5353         tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5354         if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5355           result = -EINVAL;
5356           break;
5357         }
5358         if (!obs.exists) {
5359           result = 0;
5360           break;
5361         }
5362         if (oi.is_cache_pinned()) {
5363           dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5364           result = -EPERM;
5365           break;
5366         }
5367         if (oi.is_dirty()) {
5368           result = -EBUSY;
5369           break;
5370         }
5371         if (!oi.watchers.empty()) {
5372           result = -EBUSY;
5373           break;
5374         }
5375         if (soid.snap == CEPH_NOSNAP) {
5376           result = _verify_no_head_clones(soid, ssc->snapset);
5377           if (result < 0)
5378             break;
5379         }
5380         result = _delete_oid(ctx, true, false);
5381         if (result >= 0) {
5382           // mark that this is a cache eviction to avoid triggering normal
5383           // make_writeable() clone or snapdir object creation in finish_ctx()
5384           ctx->cache_evict = true;
5385         }
5386         osd->logger->inc(l_osd_tier_evict);
5387       }
5388       break;
5389
5390     case CEPH_OSD_OP_GETXATTR:
5391       ++ctx->num_read;
5392       {
5393         string aname;
5394         bp.copy(op.xattr.name_len, aname);
5395         tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5396         string name = "_" + aname;
5397         int r = getattr_maybe_cache(
5398           ctx->obc,
5399           name,
5400           &(osd_op.outdata));
5401         if (r >= 0) {
5402           op.xattr.value_len = osd_op.outdata.length();
5403           result = 0;
5404           ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5405         } else
5406           result = r;
5407
5408         ctx->delta_stats.num_rd++;
5409       }
5410       break;
5411
5412    case CEPH_OSD_OP_GETXATTRS:
5413       ++ctx->num_read;
5414       {
5415         tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5416         map<string, bufferlist> out;
5417         result = getattrs_maybe_cache(
5418           ctx->obc,
5419           &out,
5420           true);
5421
5422         bufferlist bl;
5423         ::encode(out, bl);
5424         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5425         ctx->delta_stats.num_rd++;
5426         osd_op.outdata.claim_append(bl);
5427       }
5428       break;
5429
5430     case CEPH_OSD_OP_CMPXATTR:
5431       ++ctx->num_read;
5432       {
5433         string aname;
5434         bp.copy(op.xattr.name_len, aname);
5435         tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5436         string name = "_" + aname;
5437         name[op.xattr.name_len + 1] = 0;
5438
5439         bufferlist xattr;
5440         result = getattr_maybe_cache(
5441           ctx->obc,
5442           name,
5443           &xattr);
5444         if (result < 0 && result != -EEXIST && result != -ENODATA)
5445           break;
5446
5447         ctx->delta_stats.num_rd++;
5448         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5449
5450         switch (op.xattr.cmp_mode) {
5451         case CEPH_OSD_CMPXATTR_MODE_STRING:
5452           {
5453             string val;
5454             bp.copy(op.xattr.value_len, val);
5455             val[op.xattr.value_len] = 0;
5456             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5457                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5458             result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5459           }
5460           break;
5461
5462         case CEPH_OSD_CMPXATTR_MODE_U64:
5463           {
5464             uint64_t u64val;
5465             try {
5466               ::decode(u64val, bp);
5467             }
5468             catch (buffer::error& e) {
5469               result = -EINVAL;
5470               goto fail;
5471             }
5472             dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5473                      << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5474             result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5475           }
5476           break;
5477
5478         default:
5479           dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5480           result = -EINVAL;
5481         }
5482
5483         if (!result) {
5484           dout(10) << "comparison returned false" << dendl;
5485           result = -ECANCELED;
5486           break;
5487         }
5488         if (result < 0) {
5489           dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5490           break;
5491         }
5492
5493         dout(10) << "comparison returned true" << dendl;
5494       }
5495       break;
5496
5497     case CEPH_OSD_OP_ASSERT_VER:
5498       ++ctx->num_read;
5499       {
5500         uint64_t ver = op.assert_ver.ver;
5501         tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5502         if (!ver)
5503           result = -EINVAL;
5504         else if (ver < oi.user_version)
5505           result = -ERANGE;
5506         else if (ver > oi.user_version)
5507           result = -EOVERFLOW;
5508       }
5509       break;
5510
5511     case CEPH_OSD_OP_LIST_WATCHERS:
5512       ++ctx->num_read;
5513       {
5514         tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5515         obj_list_watch_response_t resp;
5516
5517         map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5518         for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5519                                        ++oi_iter) {
5520           dout(20) << "key cookie=" << oi_iter->first.first
5521                << " entity=" << oi_iter->first.second << " "
5522                << oi_iter->second << dendl;
5523           assert(oi_iter->first.first == oi_iter->second.cookie);
5524           assert(oi_iter->first.second.is_client());
5525
5526           watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5527                  oi_iter->second.timeout_seconds, oi_iter->second.addr);
5528           resp.entries.push_back(wi);
5529         }
5530
5531         resp.encode(osd_op.outdata, ctx->get_features());
5532         result = 0;
5533
5534         ctx->delta_stats.num_rd++;
5535         break;
5536       }
5537
5538     case CEPH_OSD_OP_LIST_SNAPS:
5539       ++ctx->num_read;
5540       {
5541         tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5542         obj_list_snap_response_t resp;
5543
5544         if (!ssc) {
5545           ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5546         }
5547         assert(ssc);
5548
5549         int clonecount = ssc->snapset.clones.size();
5550         if (ssc->snapset.head_exists)
5551           clonecount++;
5552         resp.clones.reserve(clonecount);
5553         for (auto clone_iter = ssc->snapset.clones.begin();
5554              clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5555           clone_info ci;
5556           ci.cloneid = *clone_iter;
5557
5558           hobject_t clone_oid = soid;
5559           clone_oid.snap = *clone_iter;
5560
5561           if (!ssc->snapset.is_legacy()) {
5562             auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5563             if (p == ssc->snapset.clone_snaps.end()) {
5564               osd->clog->error() << "osd." << osd->whoami
5565                                  << ": inconsistent clone_snaps found for oid "
5566                                  << soid << " clone " << *clone_iter
5567                                  << " snapset " << ssc->snapset;
5568               result = -EINVAL;
5569               break;
5570             }
5571             for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5572               ci.snaps.push_back(*q);
5573             }
5574           } else {
5575             /* No need to take a lock here.  We are only inspecting state cached
5576              * in the ObjectContext, so we aren't performing an actual read unless
5577              * the clone obc is not already loaded (in which case, it cannot have
5578              * an in progress write).  We also do not risk exposing uncommitted
5579              * state since we do have a read lock on the head object or snapdir,
5580              * which we would have to write lock in order to make user visible
5581              * modifications to the snapshot state (snap trim related mutations
5582              * are not user visible).
5583              */
5584             if (is_missing_object(clone_oid)) {
5585               dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5586               wait_for_unreadable_object(clone_oid, ctx->op);
5587               result = -EAGAIN;
5588               break;
5589             }
5590
5591             ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5592             if (!clone_obc) {
5593               if (maybe_handle_cache(
5594                     ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5595                 // promoting the clone
5596                 result = -EAGAIN;
5597               } else {
5598                 osd->clog->error() << "osd." << osd->whoami
5599                                    << ": missing clone " << clone_oid
5600                                    << " for oid "
5601                                    << soid;
5602                 // should not happen
5603                 result = -ENOENT;
5604               }
5605               break;
5606             }
5607             for (vector<snapid_t>::reverse_iterator p =
5608                    clone_obc->obs.oi.legacy_snaps.rbegin();
5609                  p != clone_obc->obs.oi.legacy_snaps.rend();
5610                  ++p) {
5611               ci.snaps.push_back(*p);
5612             }
5613           }
5614
5615           dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5616
5617           map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5618           coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5619           if (coi == ssc->snapset.clone_overlap.end()) {
5620             osd->clog->error() << "osd." << osd->whoami
5621                                << ": inconsistent clone_overlap found for oid "
5622                               << soid << " clone " << *clone_iter;
5623             result = -EINVAL;
5624             break;
5625           }
5626           const interval_set<uint64_t> &o = coi->second;
5627           ci.overlap.reserve(o.num_intervals());
5628           for (interval_set<uint64_t>::const_iterator r = o.begin();
5629                r != o.end(); ++r) {
5630             ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5631                                                          r.get_len()));
5632           }
5633
5634           map<snapid_t, uint64_t>::const_iterator si;
5635           si = ssc->snapset.clone_size.find(ci.cloneid);
5636           if (si == ssc->snapset.clone_size.end()) {
5637             osd->clog->error() << "osd." << osd->whoami
5638                                << ": inconsistent clone_size found for oid "
5639                                << soid << " clone " << *clone_iter;
5640             result = -EINVAL;
5641             break;
5642           }
5643           ci.size = si->second;
5644
5645           resp.clones.push_back(ci);
5646         }
5647         if (result < 0) {
5648           break;
5649         }
5650         if (ssc->snapset.head_exists &&
5651             !ctx->obc->obs.oi.is_whiteout()) {
5652           assert(obs.exists);
5653           clone_info ci;
5654           ci.cloneid = CEPH_NOSNAP;
5655
5656           //Size for HEAD is oi.size
5657           ci.size = oi.size;
5658
5659           resp.clones.push_back(ci);
5660         }
5661         resp.seq = ssc->snapset.seq;
5662
5663         resp.encode(osd_op.outdata);
5664         result = 0;
5665
5666         ctx->delta_stats.num_rd++;
5667         break;
5668       }
5669
5670    case CEPH_OSD_OP_NOTIFY:
5671       ++ctx->num_read;
5672       {
5673         uint32_t timeout;
5674         bufferlist bl;
5675
5676         try {
5677           uint32_t ver; // obsolete
5678           ::decode(ver, bp);
5679           ::decode(timeout, bp);
5680           ::decode(bl, bp);
5681         } catch (const buffer::error &e) {
5682           timeout = 0;
5683         }
5684         tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5685         if (!timeout)
5686           timeout = cct->_conf->osd_default_notify_timeout;
5687
5688         notify_info_t n;
5689         n.timeout = timeout;
5690         n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5691         n.cookie = op.watch.cookie;
5692         n.bl = bl;
5693         ctx->notifies.push_back(n);
5694
5695         // return our unique notify id to the client
5696         ::encode(n.notify_id, osd_op.outdata);
5697       }
5698       break;
5699
5700     case CEPH_OSD_OP_NOTIFY_ACK:
5701       ++ctx->num_read;
5702       {
5703         try {
5704           uint64_t notify_id = 0;
5705           uint64_t watch_cookie = 0;
5706           ::decode(notify_id, bp);
5707           ::decode(watch_cookie, bp);
5708           bufferlist reply_bl;
5709           if (!bp.end()) {
5710             ::decode(reply_bl, bp);
5711           }
5712           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5713           OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5714           ctx->notify_acks.push_back(ack);
5715         } catch (const buffer::error &e) {
5716           tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5717           OpContext::NotifyAck ack(
5718             // op.watch.cookie is actually the notify_id for historical reasons
5719             op.watch.cookie
5720             );
5721           ctx->notify_acks.push_back(ack);
5722         }
5723       }
5724       break;
5725
5726     case CEPH_OSD_OP_SETALLOCHINT:
5727       ++ctx->num_write;
5728       {
5729         tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5730         maybe_create_new_object(ctx);
5731         oi.expected_object_size = op.alloc_hint.expected_object_size;
5732         oi.expected_write_size = op.alloc_hint.expected_write_size;
5733         oi.alloc_hint_flags = op.alloc_hint.flags;
5734         t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5735                           op.alloc_hint.expected_write_size,
5736                           op.alloc_hint.flags);
5737         ctx->delta_stats.num_wr++;
5738         result = 0;
5739       }
5740       break;
5741
5742
5743       // --- WRITES ---
5744
5745       // -- object data --
5746
5747     case CEPH_OSD_OP_WRITE:
5748       ++ctx->num_write;
5749       { // write
5750         __u32 seq = oi.truncate_seq;
5751         tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5752         if (op.extent.length != osd_op.indata.length()) {
5753           result = -EINVAL;
5754           break;
5755         }
5756
5757         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5758           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5759
5760         if (pool.info.requires_aligned_append() &&
5761             (op.extent.offset % pool.info.required_alignment() != 0)) {
5762           result = -EOPNOTSUPP;
5763           break;
5764         }
5765
5766         if (!obs.exists) {
5767           if (pool.info.requires_aligned_append() && op.extent.offset) {
5768             result = -EOPNOTSUPP;
5769             break;
5770           }
5771         } else if (op.extent.offset != oi.size &&
5772                    pool.info.requires_aligned_append()) {
5773           result = -EOPNOTSUPP;
5774           break;
5775         }
5776
5777         if (seq && (seq > op.extent.truncate_seq) &&
5778             (op.extent.offset + op.extent.length > oi.size)) {
5779           // old write, arrived after trimtrunc
5780           op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5781           dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5782                    << ", adjusting write length to " << op.extent.length << dendl;
5783           bufferlist t;
5784           t.substr_of(osd_op.indata, 0, op.extent.length);
5785           osd_op.indata.swap(t);
5786         }
5787         if (op.extent.truncate_seq > seq) {
5788           // write arrives before trimtrunc
5789           if (obs.exists && !oi.is_whiteout()) {
5790             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5791                      << ", truncating to " << op.extent.truncate_size << dendl;
5792             t->truncate(soid, op.extent.truncate_size);
5793             oi.truncate_seq = op.extent.truncate_seq;
5794             oi.truncate_size = op.extent.truncate_size;
5795             if (op.extent.truncate_size != oi.size) {
5796               ctx->delta_stats.num_bytes -= oi.size;
5797               ctx->delta_stats.num_bytes += op.extent.truncate_size;
5798               oi.size = op.extent.truncate_size;
5799             }
5800           } else {
5801             dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5802                      << ", but object is new" << dendl;
5803             oi.truncate_seq = op.extent.truncate_seq;
5804             oi.truncate_size = op.extent.truncate_size;
5805           }
5806         }
5807         result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5808         if (result < 0)
5809           break;
5810
5811         maybe_create_new_object(ctx);
5812
5813         if (op.extent.length == 0) {
5814           if (op.extent.offset > oi.size) {
5815             t->truncate(
5816               soid, op.extent.offset);
5817           } else {
5818             t->nop(soid);
5819           }
5820         } else {
5821           t->write(
5822             soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5823         }
5824
5825         if (op.extent.offset == 0 && op.extent.length >= oi.size)
5826           obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5827         else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5828           obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5829         else
5830           obs.oi.clear_data_digest();
5831         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5832                                     op.extent.offset, op.extent.length);
5833
5834       }
5835       break;
5836
5837     case CEPH_OSD_OP_WRITEFULL:
5838       ++ctx->num_write;
5839       { // write full object
5840         tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5841
5842         if (op.extent.length != osd_op.indata.length()) {
5843           result = -EINVAL;
5844           break;
5845         }
5846         result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5847         if (result < 0)
5848           break;
5849
5850         if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5851           op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5852
5853         maybe_create_new_object(ctx);
5854         if (pool.info.require_rollback()) {
5855           t->truncate(soid, 0);
5856         } else if (obs.exists && op.extent.length < oi.size) {
5857           t->truncate(soid, op.extent.length);
5858         }
5859         if (op.extent.length) {
5860           t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5861         }
5862         obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5863
5864         write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5865             0, op.extent.length, true);
5866       }
5867       break;
5868
5869     case CEPH_OSD_OP_WRITESAME:
5870       ++ctx->num_write;
5871       tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5872       result = do_writesame(ctx, osd_op);
5873       break;
5874
5875     case CEPH_OSD_OP_ROLLBACK :
5876       ++ctx->num_write;
5877       tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5878       result = _rollback_to(ctx, op);
5879       break;
5880
5881     case CEPH_OSD_OP_ZERO:
5882       tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5883       if (pool.info.requires_aligned_append()) {
5884         result = -EOPNOTSUPP;
5885         break;
5886       }
5887       ++ctx->num_write;
5888       { // zero
5889         result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5890         if (result < 0)
5891           break;
5892         assert(op.extent.length);
5893         if (obs.exists && !oi.is_whiteout()) {
5894           t->zero(soid, op.extent.offset, op.extent.length);
5895           interval_set<uint64_t> ch;
5896           ch.insert(op.extent.offset, op.extent.length);
5897           ctx->modified_ranges.union_of(ch);
5898           ctx->delta_stats.num_wr++;
5899           oi.clear_data_digest();
5900         } else {
5901           // no-op
5902         }
5903       }
5904       break;
5905     case CEPH_OSD_OP_CREATE:
5906       ++ctx->num_write;
5907       {
5908         tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5909         int flags = le32_to_cpu(op.flags);
5910         if (obs.exists && !oi.is_whiteout() &&
5911             (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5912           result = -EEXIST; /* this is an exclusive create */
5913         } else {
5914           if (osd_op.indata.length()) {
5915             bufferlist::iterator p = osd_op.indata.begin();
5916             string category;
5917             try {
5918               ::decode(category, p);
5919             }
5920             catch (buffer::error& e) {
5921               result = -EINVAL;
5922               goto fail;
5923             }
5924             // category is no longer implemented.
5925           }
5926           if (result >= 0) {
5927             maybe_create_new_object(ctx);
5928             t->nop(soid);
5929           }
5930         }
5931       }
5932       break;
5933
5934     case CEPH_OSD_OP_TRIMTRUNC:
5935       op.extent.offset = op.extent.truncate_size;
5936       // falling through
5937
5938     case CEPH_OSD_OP_TRUNCATE:
5939       tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5940       if (pool.info.requires_aligned_append()) {
5941         result = -EOPNOTSUPP;
5942         break;
5943       }
5944       ++ctx->num_write;
5945       {
5946         // truncate
5947         if (!obs.exists || oi.is_whiteout()) {
5948           dout(10) << " object dne, truncate is a no-op" << dendl;
5949           break;
5950         }
5951
5952         if (op.extent.offset > cct->_conf->osd_max_object_size) {
5953           result = -EFBIG;
5954           break;
5955         }
5956
5957         if (op.extent.truncate_seq) {
5958           assert(op.extent.offset == op.extent.truncate_size);
5959           if (op.extent.truncate_seq <= oi.truncate_seq) {
5960             dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5961                      << ", no-op" << dendl;
5962             break; // old
5963           }
5964           dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5965                    << ", truncating" << dendl;
5966           oi.truncate_seq = op.extent.truncate_seq;
5967           oi.truncate_size = op.extent.truncate_size;
5968         }
5969
5970         maybe_create_new_object(ctx);
5971         t->truncate(soid, op.extent.offset);
5972         if (oi.size > op.extent.offset) {
5973           interval_set<uint64_t> trim;
5974           trim.insert(op.extent.offset, oi.size-op.extent.offset);
5975           ctx->modified_ranges.union_of(trim);
5976         }
5977         if (op.extent.offset != oi.size) {
5978           ctx->delta_stats.num_bytes -= oi.size;
5979           ctx->delta_stats.num_bytes += op.extent.offset;
5980           oi.size = op.extent.offset;
5981         }
5982         ctx->delta_stats.num_wr++;
5983         // do not set exists, or we will break above DELETE -> TRUNCATE munging.
5984
5985         oi.clear_data_digest();
5986       }
5987       break;
5988
5989     case CEPH_OSD_OP_DELETE:
5990       ++ctx->num_write;
5991       tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
5992       {
5993         result = _delete_oid(ctx, false, ctx->ignore_cache);
5994       }
5995       break;
5996
5997     case CEPH_OSD_OP_WATCH:
5998       ++ctx->num_write;
5999       {
6000         tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
6001                    op.watch.cookie, op.watch.op);
6002         if (!obs.exists) {
6003           result = -ENOENT;
6004           break;
6005         }
6006         uint64_t cookie = op.watch.cookie;
6007         entity_name_t entity = ctx->reqid.name;
6008         ObjectContextRef obc = ctx->obc;
6009
6010         dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
6011                  << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
6012                  << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
6013         dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
6014         dout(10) << "watch: peer_addr="
6015           << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
6016
6017         uint32_t timeout = cct->_conf->osd_client_watch_timeout;
6018         if (op.watch.timeout != 0) {
6019           timeout = op.watch.timeout;
6020         }
6021
6022         watch_info_t w(cookie, timeout,
6023           ctx->op->get_req()->get_connection()->get_peer_addr());
6024         if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
6025             op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
6026           if (oi.watchers.count(make_pair(cookie, entity))) {
6027             dout(10) << " found existing watch " << w << " by " << entity << dendl;
6028           } else {
6029             dout(10) << " registered new watch " << w << " by " << entity << dendl;
6030             oi.watchers[make_pair(cookie, entity)] = w;
6031             t->nop(soid);  // make sure update the object_info on disk!
6032           }
6033           bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
6034           ctx->watch_connects.push_back(make_pair(w, will_ping));
6035         } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
6036           if (!oi.watchers.count(make_pair(cookie, entity))) {
6037             result = -ENOTCONN;
6038             break;
6039           }
6040           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6041           ctx->watch_connects.push_back(make_pair(w, true));
6042         } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
6043           /* Note: WATCH with PING doesn't cause may_write() to return true,
6044            * so if there is nothing else in the transaction, this is going
6045            * to run do_osd_op_effects, but not write out a log entry */
6046           if (!oi.watchers.count(make_pair(cookie, entity))) {
6047             result = -ENOTCONN;
6048             break;
6049           }
6050           map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
6051             obc->watchers.find(make_pair(cookie, entity));
6052           if (p == obc->watchers.end() ||
6053               !p->second->is_connected()) {
6054             // client needs to reconnect
6055             result = -ETIMEDOUT;
6056             break;
6057           }
6058           dout(10) << " found existing watch " << w << " by " << entity << dendl;
6059           p->second->got_ping(ceph_clock_now());
6060           result = 0;
6061         } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
6062           map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
6063             oi.watchers.find(make_pair(cookie, entity));
6064           if (oi_iter != oi.watchers.end()) {
6065             dout(10) << " removed watch " << oi_iter->second << " by "
6066                      << entity << dendl;
6067             oi.watchers.erase(oi_iter);
6068             t->nop(soid);  // update oi on disk
6069             ctx->watch_disconnects.push_back(
6070               watch_disconnect_t(cookie, entity, false));
6071           } else {
6072             dout(10) << " can't remove: no watch by " << entity << dendl;
6073           }
6074         }
6075       }
6076       break;
6077
6078     case CEPH_OSD_OP_CACHE_PIN:
6079       tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
6080       if ((!pool.info.is_tier() ||
6081           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6082         result = -EINVAL;
6083         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6084         break;
6085       }
6086       ++ctx->num_write;
6087       {
6088         if (!obs.exists || oi.is_whiteout()) {
6089           result = -ENOENT;
6090           break;
6091         }
6092
6093         if (!oi.is_cache_pinned()) {
6094           oi.set_flag(object_info_t::FLAG_CACHE_PIN);
6095           ctx->modify = true;
6096           ctx->delta_stats.num_objects_pinned++;
6097           ctx->delta_stats.num_wr++;
6098         }
6099         result = 0;
6100       }
6101       break;
6102
6103     case CEPH_OSD_OP_CACHE_UNPIN:
6104       tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
6105       if ((!pool.info.is_tier() ||
6106           pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
6107         result = -EINVAL;
6108         dout(10) << " pin object is only allowed on the cache tier " << dendl;
6109         break;
6110       }
6111       ++ctx->num_write;
6112       {
6113         if (!obs.exists || oi.is_whiteout()) {
6114           result = -ENOENT;
6115           break;
6116         }
6117
6118         if (oi.is_cache_pinned()) {
6119           oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
6120           ctx->modify = true;
6121           ctx->delta_stats.num_objects_pinned--;
6122           ctx->delta_stats.num_wr++;
6123         }
6124         result = 0;
6125       }
6126       break;
6127
6128     case CEPH_OSD_OP_SET_REDIRECT:
6129       ++ctx->num_write;
6130       {
6131         if (pool.info.is_tier()) {
6132           result = -EINVAL;
6133           break;
6134         }
6135         if (!obs.exists) {
6136           result = -ENOENT;
6137           break;
6138         }
6139         if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
6140           result = -EOPNOTSUPP;
6141           break;
6142         }
6143
6144         object_t target_name;
6145         object_locator_t target_oloc;
6146         snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
6147         version_t target_version = op.copy_from.src_version;
6148         try {
6149           ::decode(target_name, bp);
6150           ::decode(target_oloc, bp);
6151         }
6152         catch (buffer::error& e) {
6153           result = -EINVAL;
6154           goto fail;
6155         }
6156         pg_t raw_pg;
6157         get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
6158         hobject_t target(target_name, target_oloc.key, target_snapid,
6159                 raw_pg.ps(), raw_pg.pool(),
6160                 target_oloc.nspace);
6161         if (target == soid) {
6162           dout(20) << " set-redirect self is invalid" << dendl;
6163           result = -EINVAL;
6164           break;
6165         }
6166         oi.set_flag(object_info_t::FLAG_MANIFEST);
6167         oi.manifest.redirect_target = target;
6168         oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
6169         t->truncate(soid, 0);
6170         if (oi.is_omap() && pool.info.supports_omap()) {
6171           t->omap_clear(soid);
6172           obs.oi.clear_omap_digest();
6173           obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6174         }
6175         ctx->delta_stats.num_bytes -= oi.size;
6176         oi.size = 0;
6177         oi.new_object();
6178         oi.user_version = target_version;
6179         ctx->user_at_version = target_version;
6180         /* rm_attrs */
6181         map<string,bufferlist> rmattrs;
6182         result = getattrs_maybe_cache(ctx->obc,
6183                     &rmattrs,
6184                     true);
6185         if (result < 0) {
6186           return result;
6187         }
6188         map<string, bufferlist>::iterator iter;
6189         for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
6190           const string& name = iter->first;
6191           t->rmattr(soid, name);
6192         }
6193         dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
6194       }
6195
6196       break;
6197
6198       // -- object attrs --
6199
6200     case CEPH_OSD_OP_SETXATTR:
6201       ++ctx->num_write;
6202       {
6203         if (cct->_conf->osd_max_attr_size > 0 &&
6204             op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6205           tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6206           result = -EFBIG;
6207           break;
6208         }
6209         unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6210                                     cct->_conf->osd_max_attr_name_len);
6211         if (op.xattr.name_len > max_name_len) {
6212           result = -ENAMETOOLONG;
6213           break;
6214         }
6215         maybe_create_new_object(ctx);
6216         string aname;
6217         bp.copy(op.xattr.name_len, aname);
6218         tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6219         string name = "_" + aname;
6220         bufferlist bl;
6221         bp.copy(op.xattr.value_len, bl);
6222         t->setattr(soid, name, bl);
6223         ctx->delta_stats.num_wr++;
6224       }
6225       break;
6226
6227     case CEPH_OSD_OP_RMXATTR:
6228       ++ctx->num_write;
6229       {
6230         string aname;
6231         bp.copy(op.xattr.name_len, aname);
6232         tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6233         if (!obs.exists || oi.is_whiteout()) {
6234           result = -ENOENT;
6235           break;
6236         }
6237         string name = "_" + aname;
6238         t->rmattr(soid, name);
6239         ctx->delta_stats.num_wr++;
6240       }
6241       break;
6242
6243
6244       // -- fancy writers --
6245     case CEPH_OSD_OP_APPEND:
6246       {
6247         tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6248         // just do it inline; this works because we are happy to execute
6249         // fancy op on replicas as well.
6250         vector<OSDOp> nops(1);
6251         OSDOp& newop = nops[0];
6252         newop.op.op = CEPH_OSD_OP_WRITE;
6253         newop.op.extent.offset = oi.size;
6254         newop.op.extent.length = op.extent.length;
6255         newop.op.extent.truncate_seq = oi.truncate_seq;
6256         newop.indata = osd_op.indata;
6257         result = do_osd_ops(ctx, nops);
6258         osd_op.outdata.claim(newop.outdata);
6259       }
6260       break;
6261
6262     case CEPH_OSD_OP_STARTSYNC:
6263       tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6264       t->nop(soid);
6265       break;
6266
6267
6268       // -- trivial map --
6269     case CEPH_OSD_OP_TMAPGET:
6270       tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6271       if (pool.info.require_rollback()) {
6272         result = -EOPNOTSUPP;
6273         break;
6274       }
6275       {
6276         vector<OSDOp> nops(1);
6277         OSDOp& newop = nops[0];
6278         newop.op.op = CEPH_OSD_OP_SYNC_READ;
6279         newop.op.extent.offset = 0;
6280         newop.op.extent.length = 0;
6281         do_osd_ops(ctx, nops);
6282         osd_op.outdata.claim(newop.outdata);
6283       }
6284       break;
6285
6286     case CEPH_OSD_OP_TMAPPUT:
6287       tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6288       if (pool.info.require_rollback()) {
6289         result = -EOPNOTSUPP;
6290         break;
6291       }
6292       {
6293         //_dout_lock.Lock();
6294         //osd_op.data.hexdump(*_dout);
6295         //_dout_lock.Unlock();
6296
6297         // verify sort order
6298         bool unsorted = false;
6299         if (true) {
6300           bufferlist header;
6301           ::decode(header, bp);
6302           uint32_t n;
6303           ::decode(n, bp);
6304           string last_key;
6305           while (n--) {
6306             string key;
6307             ::decode(key, bp);
6308             dout(10) << "tmapput key " << key << dendl;
6309             bufferlist val;
6310             ::decode(val, bp);
6311             if (key < last_key) {
6312               dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6313               unsorted = true;
6314               break;
6315             }
6316             last_key = key;
6317           }
6318         }
6319
6320         // write it
6321         vector<OSDOp> nops(1);
6322         OSDOp& newop = nops[0];
6323         newop.op.op = CEPH_OSD_OP_WRITEFULL;
6324         newop.op.extent.offset = 0;
6325         newop.op.extent.length = osd_op.indata.length();
6326         newop.indata = osd_op.indata;
6327
6328         if (unsorted) {
6329           bp = osd_op.indata.begin();
6330           bufferlist header;
6331           map<string, bufferlist> m;
6332           ::decode(header, bp);
6333           ::decode(m, bp);
6334           assert(bp.end());
6335           bufferlist newbl;
6336           ::encode(header, newbl);
6337           ::encode(m, newbl);
6338           newop.indata = newbl;
6339         }
6340         result = do_osd_ops(ctx, nops);
6341         assert(result == 0);
6342       }
6343       break;
6344
6345     case CEPH_OSD_OP_TMAPUP:
6346       tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6347       if (pool.info.require_rollback()) {
6348         result = -EOPNOTSUPP;
6349         break;
6350       }
6351       ++ctx->num_write;
6352       result = do_tmapup(ctx, bp, osd_op);
6353       break;
6354
6355     case CEPH_OSD_OP_TMAP2OMAP:
6356       ++ctx->num_write;
6357       tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6358       result = do_tmap2omap(ctx, op.tmap2omap.flags);
6359       break;
6360
6361       // OMAP Read ops
6362     case CEPH_OSD_OP_OMAPGETKEYS:
6363       ++ctx->num_read;
6364       {
6365         string start_after;
6366         uint64_t max_return;
6367         try {
6368           ::decode(start_after, bp);
6369           ::decode(max_return, bp);
6370         }
6371         catch (buffer::error& e) {
6372           result = -EINVAL;
6373           tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6374           goto fail;
6375         }
6376         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6377           max_return = cct->_conf->osd_max_omap_entries_per_request;
6378         }
6379         tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6380
6381         bufferlist bl;
6382         uint32_t num = 0;
6383         bool truncated = false;
6384         if (oi.is_omap()) {
6385           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6386             coll, ghobject_t(soid)
6387             );
6388           assert(iter);
6389           iter->upper_bound(start_after);
6390           for (num = 0; iter->valid(); ++num, iter->next(false)) {
6391             if (num >= max_return ||
6392                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6393               truncated = true;
6394               break;
6395             }
6396             ::encode(iter->key(), bl);
6397           }
6398         } // else return empty out_set
6399         ::encode(num, osd_op.outdata);
6400         osd_op.outdata.claim_append(bl);
6401         ::encode(truncated, osd_op.outdata);
6402         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6403         ctx->delta_stats.num_rd++;
6404       }
6405       break;
6406
6407     case CEPH_OSD_OP_OMAPGETVALS:
6408       ++ctx->num_read;
6409       {
6410         string start_after;
6411         uint64_t max_return;
6412         string filter_prefix;
6413         try {
6414           ::decode(start_after, bp);
6415           ::decode(max_return, bp);
6416           ::decode(filter_prefix, bp);
6417         }
6418         catch (buffer::error& e) {
6419           result = -EINVAL;
6420           tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6421           goto fail;
6422         }
6423         if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6424           max_return = cct->_conf->osd_max_omap_entries_per_request;
6425         }
6426         tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6427
6428         uint32_t num = 0;
6429         bool truncated = false;
6430         bufferlist bl;
6431         if (oi.is_omap()) {
6432           ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6433             coll, ghobject_t(soid)
6434             );
6435           if (!iter) {
6436             result = -ENOENT;
6437             goto fail;
6438           }
6439           iter->upper_bound(start_after);
6440           if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6441           for (num = 0;
6442                iter->valid() &&
6443                  iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6444                ++num, iter->next(false)) {
6445             dout(20) << "Found key " << iter->key() << dendl;
6446             if (num >= max_return ||
6447                 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6448               truncated = true;
6449               break;
6450             }
6451             ::encode(iter->key(), bl);
6452             ::encode(iter->value(), bl);
6453           }
6454         } // else return empty out_set
6455         ::encode(num, osd_op.outdata);
6456         osd_op.outdata.claim_append(bl);
6457         ::encode(truncated, osd_op.outdata);
6458         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6459         ctx->delta_stats.num_rd++;
6460       }
6461       break;
6462
6463     case CEPH_OSD_OP_OMAPGETHEADER:
6464       tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6465       if (!oi.is_omap()) {
6466         // return empty header
6467         break;
6468       }
6469       ++ctx->num_read;
6470       {
6471         osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6472         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6473         ctx->delta_stats.num_rd++;
6474       }
6475       break;
6476
6477     case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6478       ++ctx->num_read;
6479       {
6480         set<string> keys_to_get;
6481         try {
6482           ::decode(keys_to_get, bp);
6483         }
6484         catch (buffer::error& e) {
6485           result = -EINVAL;
6486           tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6487           goto fail;
6488         }
6489         tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6490         map<string, bufferlist> out;
6491         if (oi.is_omap()) {
6492           osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6493         } // else return empty omap entries
6494         ::encode(out, osd_op.outdata);
6495         ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6496         ctx->delta_stats.num_rd++;
6497       }
6498       break;
6499
6500     case CEPH_OSD_OP_OMAP_CMP:
6501       ++ctx->num_read;
6502       {
6503         if (!obs.exists || oi.is_whiteout()) {
6504           result = -ENOENT;
6505           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6506           break;
6507         }
6508         map<string, pair<bufferlist, int> > assertions;
6509         try {
6510           ::decode(assertions, bp);
6511         }
6512         catch (buffer::error& e) {
6513           result = -EINVAL;
6514           tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6515           goto fail;
6516         }
6517         tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6518
6519         map<string, bufferlist> out;
6520
6521         if (oi.is_omap()) {
6522           set<string> to_get;
6523           for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6524                i != assertions.end();
6525                ++i)
6526             to_get.insert(i->first);
6527           int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6528                                               to_get, &out);
6529           if (r < 0) {
6530             result = r;
6531             break;
6532           }
6533         } // else leave out empty
6534
6535         //Should set num_rd_kb based on encode length of map
6536         ctx->delta_stats.num_rd++;
6537
6538         int r = 0;
6539         bufferlist empty;
6540         for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6541              i != assertions.end();
6542              ++i) {
6543           auto out_entry = out.find(i->first);
6544           bufferlist &bl = (out_entry != out.end()) ?
6545             out_entry->second : empty;
6546           switch (i->second.second) {
6547           case CEPH_OSD_CMPXATTR_OP_EQ:
6548             if (!(bl == i->second.first)) {
6549               r = -ECANCELED;
6550             }
6551             break;
6552           case CEPH_OSD_CMPXATTR_OP_LT:
6553             if (!(bl < i->second.first)) {
6554               r = -ECANCELED;
6555             }
6556             break;
6557           case CEPH_OSD_CMPXATTR_OP_GT:
6558             if (!(bl > i->second.first)) {
6559               r = -ECANCELED;
6560             }
6561             break;
6562           default:
6563             r = -EINVAL;
6564             break;
6565           }
6566           if (r < 0)
6567             break;
6568         }
6569         if (r < 0) {
6570           result = r;
6571         }
6572       }
6573       break;
6574
6575       // OMAP Write ops
6576     case CEPH_OSD_OP_OMAPSETVALS:
6577       if (!pool.info.supports_omap()) {
6578         result = -EOPNOTSUPP;
6579         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6580         break;
6581       }
6582       ++ctx->num_write;
6583       {
6584         maybe_create_new_object(ctx);
6585         bufferlist to_set_bl;
6586         try {
6587           decode_str_str_map_to_bl(bp, &to_set_bl);
6588         }
6589         catch (buffer::error& e) {
6590           result = -EINVAL;
6591           tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6592           goto fail;
6593         }
6594         tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6595         if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6596           dout(20) << "setting vals: " << dendl;
6597           map<string,bufferlist> to_set;
6598           bufferlist::iterator pt = to_set_bl.begin();
6599           ::decode(to_set, pt);
6600           for (map<string, bufferlist>::iterator i = to_set.begin();
6601                i != to_set.end();
6602                ++i) {
6603             dout(20) << "\t" << i->first << dendl;
6604           }
6605         }
6606         t->omap_setkeys(soid, to_set_bl);
6607         ctx->delta_stats.num_wr++;
6608       }
6609       obs.oi.set_flag(object_info_t::FLAG_OMAP);
6610       obs.oi.clear_omap_digest();
6611       break;
6612
6613     case CEPH_OSD_OP_OMAPSETHEADER:
6614       tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6615       if (!pool.info.supports_omap()) {
6616         result = -EOPNOTSUPP;
6617         break;
6618       }
6619       ++ctx->num_write;
6620       {
6621         maybe_create_new_object(ctx);
6622         t->omap_setheader(soid, osd_op.indata);
6623         ctx->delta_stats.num_wr++;
6624       }
6625       obs.oi.set_flag(object_info_t::FLAG_OMAP);
6626       obs.oi.clear_omap_digest();
6627       break;
6628
6629     case CEPH_OSD_OP_OMAPCLEAR:
6630       tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6631       if (!pool.info.supports_omap()) {
6632         result = -EOPNOTSUPP;
6633         break;
6634       }
6635       ++ctx->num_write;
6636       {
6637         if (!obs.exists || oi.is_whiteout()) {
6638           result = -ENOENT;
6639           break;
6640         }
6641         if (oi.is_omap()) {
6642           t->omap_clear(soid);
6643           ctx->delta_stats.num_wr++;
6644           obs.oi.clear_omap_digest();
6645           obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6646         }
6647       }
6648       break;
6649
6650     case CEPH_OSD_OP_OMAPRMKEYS:
6651       if (!pool.info.supports_omap()) {
6652         result = -EOPNOTSUPP;
6653         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6654         break;
6655       }
6656       ++ctx->num_write;
6657       {
6658         if (!obs.exists || oi.is_whiteout()) {
6659           result = -ENOENT;
6660           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6661           break;
6662         }
6663         bufferlist to_rm_bl;
6664         try {
6665           decode_str_set_to_bl(bp, &to_rm_bl);
6666         }
6667         catch (buffer::error& e) {
6668           result = -EINVAL;
6669           tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6670           goto fail;
6671         }
6672         tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6673         t->omap_rmkeys(soid, to_rm_bl);
6674         ctx->delta_stats.num_wr++;
6675       }
6676       obs.oi.clear_omap_digest();
6677       break;
6678
6679     case CEPH_OSD_OP_COPY_GET:
6680       ++ctx->num_read;
6681       tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(),
6682                  soid.snap.val);
6683       if (op_finisher == nullptr) {
6684         result = do_copy_get(ctx, bp, osd_op, ctx->obc);
6685       } else {
6686         result = op_finisher->execute();
6687       }
6688       break;
6689
6690     case CEPH_OSD_OP_COPY_FROM:
6691       ++ctx->num_write;
6692       {
6693         object_t src_name;
6694         object_locator_t src_oloc;
6695         snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6696         version_t src_version = op.copy_from.src_version;
6697         try {
6698           ::decode(src_name, bp);
6699           ::decode(src_oloc, bp);
6700         }
6701         catch (buffer::error& e) {
6702           result = -EINVAL;
6703           tracepoint(osd,
6704                      do_osd_op_pre_copy_from,
6705                      soid.oid.name.c_str(),
6706                      soid.snap.val,
6707                      "???",
6708                      0,
6709                      "???",
6710                      "???",
6711                      0,
6712                      src_snapid,
6713                      src_version);
6714           goto fail;
6715         }
6716         tracepoint(osd,
6717                    do_osd_op_pre_copy_from,
6718                    soid.oid.name.c_str(),
6719                    soid.snap.val,
6720                    src_name.name.c_str(),
6721                    src_oloc.pool,
6722                    src_oloc.key.c_str(),
6723                    src_oloc.nspace.c_str(),
6724                    src_oloc.hash,
6725                    src_snapid,
6726                    src_version);
6727         if (op_finisher == nullptr) {
6728           // start
6729           pg_t raw_pg;
6730           get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6731           hobject_t src(src_name, src_oloc.key, src_snapid,
6732                         raw_pg.ps(), raw_pg.pool(),
6733                         src_oloc.nspace);
6734           if (src == soid) {
6735             dout(20) << " copy from self is invalid" << dendl;
6736             result = -EINVAL;
6737             break;
6738           }
6739           CopyFromCallback *cb = new CopyFromCallback(ctx, osd_op);
6740           ctx->op_finishers[ctx->current_osd_subop_num].reset(
6741             new CopyFromFinisher(cb));
6742           start_copy(cb, ctx->obc, src, src_oloc, src_version,
6743                      op.copy_from.flags,
6744                      false,
6745                      op.copy_from.src_fadvise_flags,
6746                      op.flags);
6747           result = -EINPROGRESS;
6748         } else {
6749           // finish
6750           result = op_finisher->execute();
6751           assert(result == 0);
6752
6753           // COPY_FROM cannot be executed multiple times -- it must restart
6754           ctx->op_finishers.erase(ctx->current_osd_subop_num);
6755         }
6756       }
6757       break;
6758
6759     default:
6760       tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6761       dout(1) << "unrecognized osd op " << op.op
6762               << " " << ceph_osd_op_name(op.op)
6763               << dendl;
6764       result = -EOPNOTSUPP;
6765     }
6766
6767   fail:
6768     osd_op.rval = result;
6769     tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6770     if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6771       result = 0;
6772
6773     if (result < 0)
6774       break;
6775   }
6776   return result;
6777 }
6778
6779 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6780 {
6781   if (ctx->new_obs.oi.size == 0) {
6782     dout(20) << "unable to get tmap for zero sized " << ctx->new_obs.oi.soid << dendl;
6783     return -ENODATA;
6784   }
6785   vector<OSDOp> nops(1);
6786   OSDOp &newop = nops[0];
6787   newop.op.op = CEPH_OSD_OP_TMAPGET;
6788   do_osd_ops(ctx, nops);
6789   try {
6790     bufferlist::iterator i = newop.outdata.begin();
6791     ::decode(*header, i);
6792     (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6793   } catch (...) {
6794     dout(20) << "unsuccessful at decoding tmap for " << ctx->new_obs.oi.soid
6795              << dendl;
6796     return -EINVAL;
6797   }
6798   dout(20) << "successful at decoding tmap for " << ctx->new_obs.oi.soid
6799            << dendl;
6800   return 0;
6801 }
6802
6803 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6804                                         const SnapSet& ss)
6805 {
6806   // verify that all clones have been evicted
6807   dout(20) << __func__ << " verifying clones are absent "
6808            << ss << dendl;
6809   for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6810        p != ss.clones.end();
6811        ++p) {
6812     hobject_t clone_oid = soid;
6813     clone_oid.snap = *p;
6814     if (is_missing_object(clone_oid))
6815       return -EBUSY;
6816     ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6817     if (clone_obc && clone_obc->obs.exists) {
6818       dout(10) << __func__ << " cannot evict head before clone "
6819                << clone_oid << dendl;
6820       return -EBUSY;
6821     }
6822     if (copy_ops.count(clone_oid)) {
6823       dout(10) << __func__ << " cannot evict head, pending promote on clone "
6824                << clone_oid << dendl;
6825       return -EBUSY;
6826     }
6827   }
6828   return 0;
6829 }
6830
6831 inline int PrimaryLogPG::_delete_oid(
6832   OpContext *ctx,
6833   bool no_whiteout,     // no whiteouts, no matter what.
6834   bool try_no_whiteout) // try not to whiteout
6835 {
6836   SnapSet& snapset = ctx->new_snapset;
6837   ObjectState& obs = ctx->new_obs;
6838   object_info_t& oi = obs.oi;
6839   const hobject_t& soid = oi.soid;
6840   PGTransaction* t = ctx->op_t.get();
6841
6842   // cache: cache: set whiteout on delete?
6843   bool whiteout = false;
6844   if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6845       && !no_whiteout
6846       && !try_no_whiteout) {
6847     whiteout = true;
6848   }
6849   bool legacy;
6850   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6851     legacy = false;
6852     // in luminous or later, we can't delete the head if there are
6853     // clones. we trust the caller passing no_whiteout has already
6854     // verified they don't exist.
6855     if (!snapset.clones.empty() ||
6856         (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6857       if (no_whiteout) {
6858         dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6859                  << dendl;
6860       } else {
6861         dout(20) << __func__ << " has or will have clones; will whiteout"
6862                  << dendl;
6863         whiteout = true;
6864       }
6865     }
6866   } else {
6867     legacy = false;
6868   }
6869   dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6870            << " no_whiteout=" << (int)no_whiteout
6871            << " try_no_whiteout=" << (int)try_no_whiteout
6872            << dendl;
6873   if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6874     return -ENOENT;
6875
6876   t->remove(soid);
6877
6878   if (oi.size > 0) {
6879     interval_set<uint64_t> ch;
6880     ch.insert(0, oi.size);
6881     ctx->modified_ranges.union_of(ch);
6882   }
6883
6884   ctx->delta_stats.num_wr++;
6885   if (soid.is_snap()) {
6886     assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6887     ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6888   } else {
6889     ctx->delta_stats.num_bytes -= oi.size;
6890   }
6891   oi.size = 0;
6892   oi.new_object();
6893
6894   // disconnect all watchers
6895   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6896          oi.watchers.begin();
6897        p != oi.watchers.end();
6898        ++p) {
6899     dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6900     ctx->watch_disconnects.push_back(
6901       watch_disconnect_t(p->first.first, p->first.second, true));
6902   }
6903   oi.watchers.clear();
6904
6905   if (whiteout) {
6906     dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6907     oi.set_flag(object_info_t::FLAG_WHITEOUT);
6908     ctx->delta_stats.num_whiteouts++;
6909     t->create(soid);
6910     osd->logger->inc(l_osd_tier_whiteout);
6911     return 0;
6912   }
6913
6914   // delete the head
6915   ctx->delta_stats.num_objects--;
6916   if (soid.is_snap())
6917     ctx->delta_stats.num_object_clones--;
6918   if (oi.is_whiteout()) {
6919     dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6920     ctx->delta_stats.num_whiteouts--;
6921     oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6922   }
6923   if (oi.is_cache_pinned()) {
6924     ctx->delta_stats.num_objects_pinned--;
6925   }
6926   if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6927     snapset.head_exists = false;
6928   }
6929   obs.exists = false;
6930   return 0;
6931 }
6932
6933 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6934 {
6935   SnapSet& snapset = ctx->new_snapset;
6936   ObjectState& obs = ctx->new_obs;
6937   object_info_t& oi = obs.oi;
6938   const hobject_t& soid = oi.soid;
6939   PGTransaction* t = ctx->op_t.get();
6940   snapid_t snapid = (uint64_t)op.snap.snapid;
6941   hobject_t missing_oid;
6942
6943   dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6944
6945   ObjectContextRef rollback_to;
6946   int ret = find_object_context(
6947     hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6948               soid.get_namespace()),
6949     &rollback_to, false, false, &missing_oid);
6950   if (ret == -EAGAIN) {
6951     /* clone must be missing */
6952     assert(is_degraded_or_backfilling_object(missing_oid));
6953     dout(20) << "_rollback_to attempted to roll back to a missing or backfilling clone "
6954              << missing_oid << " (requested snapid: ) " << snapid << dendl;
6955     block_write_on_degraded_snap(missing_oid, ctx->op);
6956     return ret;
6957   }
6958   {
6959     ObjectContextRef promote_obc;
6960     cache_result_t tier_mode_result;
6961     if (obs.exists && obs.oi.has_manifest()) {
6962       tier_mode_result =
6963         maybe_handle_manifest_detail(
6964           ctx->op,
6965           true,
6966           rollback_to);
6967     } else {
6968       tier_mode_result =
6969         maybe_handle_cache_detail(
6970           ctx->op,
6971           true,
6972           rollback_to,
6973           ret,
6974           missing_oid,
6975           true,
6976           false,
6977           &promote_obc);
6978     }
6979     switch (tier_mode_result) {
6980     case cache_result_t::NOOP:
6981       break;
6982     case cache_result_t::BLOCKED_PROMOTE:
6983       assert(promote_obc);
6984       block_write_on_snap_rollback(soid, promote_obc, ctx->op);
6985       return -EAGAIN;
6986     case cache_result_t::BLOCKED_FULL:
6987       block_write_on_full_cache(soid, ctx->op);
6988       return -EAGAIN;
6989     default:
6990       assert(0 == "must promote was set, other values are not valid");
6991       return -EAGAIN;
6992     }
6993   }
6994
6995   if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
6996     // there's no snapshot here, or there's no object.
6997     // if there's no snapshot, we delete the object; otherwise, do nothing.
6998     dout(20) << "_rollback_to deleting head on " << soid.oid
6999              << " because got ENOENT|whiteout on find_object_context" << dendl;
7000     if (ctx->obc->obs.oi.watchers.size()) {
7001       // Cannot delete an object with watchers
7002       ret = -EBUSY;
7003     } else {
7004       _delete_oid(ctx, false, false);
7005       ret = 0;
7006     }
7007   } else if (ret) {
7008     // ummm....huh? It *can't* return anything else at time of writing.
7009     assert(0 == "unexpected error code in _rollback_to");
7010   } else { //we got our context, let's use it to do the rollback!
7011     hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
7012     if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
7013       dout(20) << "_rollback_to attempted to roll back to a degraded object "
7014                << rollback_to_sobject << " (requested snapid: ) " << snapid << dendl;
7015       block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
7016       ret = -EAGAIN;
7017     } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
7018       // rolling back to the head; we just need to clone it.
7019       ctx->modify = true;
7020     } else {
7021       /* 1) Delete current head
7022        * 2) Clone correct snapshot into head
7023        * 3) Calculate clone_overlaps by following overlaps
7024        *    forward from rollback snapshot */
7025       dout(10) << "_rollback_to deleting " << soid.oid
7026                << " and rolling back to old snap" << dendl;
7027
7028       if (obs.exists) {
7029         t->remove(soid);
7030       }
7031       t->clone(soid, rollback_to_sobject);
7032       snapset.head_exists = true;
7033       t->add_obc(rollback_to);
7034
7035       map<snapid_t, interval_set<uint64_t> >::iterator iter =
7036         snapset.clone_overlap.lower_bound(snapid);
7037       interval_set<uint64_t> overlaps = iter->second;
7038       assert(iter != snapset.clone_overlap.end());
7039       for ( ;
7040             iter != snapset.clone_overlap.end();
7041             ++iter)
7042         overlaps.intersection_of(iter->second);
7043
7044       if (obs.oi.size > 0) {
7045         interval_set<uint64_t> modified;
7046         modified.insert(0, obs.oi.size);
7047         overlaps.intersection_of(modified);
7048         modified.subtract(overlaps);
7049         ctx->modified_ranges.union_of(modified);
7050       }
7051
7052       // Adjust the cached objectcontext
7053       maybe_create_new_object(ctx, true);
7054       ctx->delta_stats.num_bytes -= obs.oi.size;
7055       ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
7056       obs.oi.size = rollback_to->obs.oi.size;
7057       if (rollback_to->obs.oi.is_data_digest())
7058         obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
7059       else
7060         obs.oi.clear_data_digest();
7061       if (rollback_to->obs.oi.is_omap_digest())
7062         obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
7063       else
7064         obs.oi.clear_omap_digest();
7065
7066       if (rollback_to->obs.oi.is_omap()) {
7067         dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
7068         obs.oi.set_flag(object_info_t::FLAG_OMAP);
7069       } else {
7070         dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
7071         obs.oi.clear_flag(object_info_t::FLAG_OMAP);
7072       }
7073
7074       snapset.head_exists = true;
7075     }
7076   }
7077   return ret;
7078 }
7079
// Queues on transaction `t` the operations that create clone `coid` from
// `head`: the object clone itself, an OI_ATTR update carrying the encoded
// *poi, and the removal of SS_ATTR. The two attr updates go through the
// *_maybe_cache helpers with `obc` and `ctx`.
//
//   ctx   op context, forwarded to the attr helpers
//   t     transaction the operations are appended to
//   obc   object context handed to the attr helpers (the visible caller,
//         make_writeable, passes ctx->clone_obc)
//   head  source object of the clone
//   coid  destination (clone) object
//   poi   object_info to encode as the OI_ATTR value; must not be null
//         (it is dereferenced unconditionally)
7080 void PrimaryLogPG::_make_clone(
7081   OpContext *ctx,
7082   PGTransaction* t,
7083   ObjectContextRef obc,
7084   const hobject_t& head, const hobject_t& coid,
7085   object_info_t *poi)
7086 {
       // Encode the object_info using the feature bits the current osdmap
       // reports for CEPH_ENTITY_TYPE_OSD.
7087   bufferlist bv;
7088   ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7089
7090   t->clone(coid, head);
       // Set the encoded object_info as OI_ATTR and drop SS_ATTR.
       // NOTE(review): presumably SS_ATTR is dropped because only the head
       // carries the snapset -- confirm.
7091   setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
7092   rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
7093 }
7094
// Prepares ctx for a mutation of a head object (asserts that
// soid.snap == CEPH_NOSNAP). In order, it:
//  - adjusts the DIRTY flag on ctx->new_obs and the num_objects_dirty /
//    num_objects_omap deltas for the transition from ctx->obc->obs to
//    ctx->new_obs;
//  - replaces the op's snap context (ctx->snapc, modified in place) with
//    the snapset's seq/snaps when the snapset's seq is newer;
//  - clones the existing head into a new clone object when the snap
//    context's first snap is newer than ctx->new_snapset.seq (skipped when
//    the head does not exist, is a whiteout, or this is a cache evict),
//    pushing a CLONE log entry and bumping ctx->at_version;
//  - if the newest clone is present, trims its clone_overlap by
//    ctx->modified_ranges and charges those bytes to ctx->delta_stats;
//  - stores the snap context's seq/snaps into ctx->new_snapset.
7095 void PrimaryLogPG::make_writeable(OpContext *ctx)
7096 {
7097   const hobject_t& soid = ctx->obs->oi.soid;
7098   SnapContext& snapc = ctx->snapc;
7099
7100   // clone?
7101   assert(soid.snap == CEPH_NOSNAP);
7102   dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
7103            << "  snapc=" << snapc << dendl;
7104
       // Reconcile the DIRTY flag / dirty-object count between the state
       // before the op (ctx->obc->obs) and the state after it (ctx->new_obs).
7105   bool was_dirty = ctx->obc->obs.oi.is_dirty();
7106   if (ctx->new_obs.exists) {
7107     // we will mark the object dirty
7108     if (ctx->undirty && was_dirty) {
7109       dout(20) << " clearing DIRTY flag" << dendl;
7110       assert(ctx->new_obs.oi.is_dirty());
7111       ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7112       --ctx->delta_stats.num_objects_dirty;
7113       osd->logger->inc(l_osd_tier_clean);
7114     } else if (!was_dirty && !ctx->undirty) {
7115       dout(20) << " setting DIRTY flag" << dendl;
7116       ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
7117       ++ctx->delta_stats.num_objects_dirty;
7118       osd->logger->inc(l_osd_tier_dirty);
7119     }
7120   } else {
7121     if (was_dirty) {
7122       dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
7123       ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
7124       --ctx->delta_stats.num_objects_dirty;
7125     }
7126   }
7127
       // num_objects_omap: +1 when the object gains omap (or is created with
       // it), -1 when it loses omap (or is removed while having it).
7128   if ((ctx->new_obs.exists &&
7129        ctx->new_obs.oi.is_omap()) &&
7130       (!ctx->obc->obs.exists ||
7131        !ctx->obc->obs.oi.is_omap())) {
7132     ++ctx->delta_stats.num_objects_omap;
7133   }
7134   if ((!ctx->new_obs.exists ||
7135        !ctx->new_obs.oi.is_omap()) &&
7136       (ctx->obc->obs.exists &&
7137        ctx->obc->obs.oi.is_omap())) {
7138     --ctx->delta_stats.num_objects_omap;
7139   }
7140
7141   // use newer snapc?
7142   if (ctx->new_snapset.seq > snapc.seq) {
7143     snapc.seq = ctx->new_snapset.seq;
7144     snapc.snaps = ctx->new_snapset.snaps;
7145     filter_snapc(snapc.snaps);
7146     dout(10) << " using newer snapc " << snapc << dendl;
7147   }
7148
7149   if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
7150       snapc.snaps.size() &&                 // there are snaps
7151       !ctx->cache_evict &&
7152       snapc.snaps[0] > ctx->new_snapset.seq) {  // existing object is old
7153     // clone
7154     hobject_t coid = soid;
7155     coid.snap = snapc.seq;
7156
         // l = number of leading entries of snapc.snaps that are newer than
         // new_snapset.seq (entry 0 was already checked above); those
         // entries become the snap list recorded for the new clone.
7157     unsigned l;
7158     for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
7159
7160     vector<snapid_t> snaps(l);
7161     for (unsigned i=0; i<l; i++)
7162       snaps[i] = snapc.snaps[i];
7163
7164     // prepare clone
7165     object_info_t static_snap_oi(coid);
7166     object_info_t *snap_oi;
         // On the primary the clone gets an object context from
         // object_contexts that shares the head's SnapSetContext (with a
         // manual ref++) and is write-locked for this op. Otherwise the
         // stack-local object_info_t above is used.
7167     if (is_primary()) {
7168       ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
7169       ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
7170       ctx->clone_obc->obs.oi = static_snap_oi;
7171       ctx->clone_obc->obs.exists = true;
7172       ctx->clone_obc->ssc = ctx->obc->ssc;
7173       ctx->clone_obc->ssc->ref++;
7174       if (pool.info.require_rollback())
7175         ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
7176       snap_oi = &ctx->clone_obc->obs.oi;
7177       bool got = ctx->lock_manager.get_write_greedy(
7178         coid,
7179         ctx->clone_obc,
7180         ctx->op);
7181       assert(got);
7182       dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
7183     } else {
7184       snap_oi = &static_snap_oi;
7185     }
         // The clone is written at the current at_version and takes the
         // head's pre-op version as prior_version.
7186     snap_oi->version = ctx->at_version;
7187     snap_oi->prior_version = ctx->obs->oi.version;
7188     snap_oi->copy_user_bits(ctx->obs->oi);
7189
         // Legacy snapsets (or require_osd_release < luminous) keep the snap
         // list in the clone's object_info; otherwise it is stored in
         // new_snapset.clone_snaps further down.
7190     bool legacy = ctx->new_snapset.is_legacy() ||
7191       get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7192     if (legacy) {
7193       snap_oi->legacy_snaps = snaps;
7194     }
7195
7196     _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
7197
         // Account for the new clone object in the stat deltas.
7198     ctx->delta_stats.num_objects++;
7199     if (snap_oi->is_dirty()) {
7200       ctx->delta_stats.num_objects_dirty++;
7201       osd->logger->inc(l_osd_tier_dirty);
7202     }
7203     if (snap_oi->is_omap())
7204       ctx->delta_stats.num_objects_omap++;
7205     if (snap_oi->is_cache_pinned())
7206       ctx->delta_stats.num_objects_pinned++;
7207     ctx->delta_stats.num_object_clones++;
7208     ctx->new_snapset.clones.push_back(coid.snap);
7209     ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
7210     if (!legacy) {
7211       ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7212     }
7213
7214     // clone_overlap should contain an entry for each clone
7215     // (an empty interval_set if there is no overlap)
7216     ctx->new_snapset.clone_overlap[coid.snap];
7217     if (ctx->obs->oi.size)
7218       ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7219
7220     // log clone
7221     dout(10) << " cloning v " << ctx->obs->oi.version
7222              << " to " << coid << " v " << ctx->at_version
7223              << " snaps=" << snaps
7224              << " snapset=" << ctx->new_snapset << dendl;
7225     ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7226                                       ctx->obs->oi.version,
7227                                       ctx->obs->oi.user_version,
7228                                       osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7229     ::encode(snaps, ctx->log.back().snaps);
7230
         // The CLONE entry used ctx->at_version; advance it for whatever is
         // logged next in this op.
7231     ctx->at_version.version++;
7232   }
7233
7234   // update most recent clone_overlap and usage stats
7235   if (ctx->new_snapset.clones.size() > 0) {
7236     /* we need to check whether the most recent clone exists, if it's been evicted,
7237      * it's not included in the stats */
7238     hobject_t last_clone_oid = soid;
7239     last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7240     if (is_present_clone(last_clone_oid)) {
7241       interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7242       ctx->modified_ranges.intersection_of(newest_overlap);
7243       // modified_ranges is still in use by the clone
7244       add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7245       newest_overlap.subtract(ctx->modified_ranges);
7246     }
7247   }
7248
7249   // update snapset with latest snap context
7250   ctx->new_snapset.seq = snapc.seq;
7251   ctx->new_snapset.snaps = snapc.snaps;
7252   if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7253     // pessimistic assumption that this is a net-new legacy SnapSet
7254     ctx->delta_stats.num_legacy_snapsets++;
7255     ctx->new_snapset.head_exists = ctx->new_obs.exists;
7256   } else if (ctx->new_snapset.is_legacy()) {
7257     ctx->new_snapset.head_exists = ctx->new_obs.exists;
7258   }
7259   dout(20) << "make_writeable " << soid
7260            << " done, snapset=" << ctx->new_snapset << dendl;
7261 }
7262
7263
// Book-keeping for a write of `length` bytes at `offset`, or for a full
// overwrite when `write_full` is set.
//
//   delta_stats  stat deltas to update (num_bytes, num_wr, num_wr_kb)
//   oi           object_info whose size is updated in place
//   modified     accumulates the byte ranges touched by the write
//   offset       start of the write
//   length       number of bytes written
//   write_full   true when the write replaces the whole object
7264 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7265                                                interval_set<uint64_t>& modified, uint64_t offset,
7266                                                uint64_t length, bool write_full)
7267 {
       // Touched range: the whole *old* extent for a full write (nothing
       // if the object was empty), otherwise [offset, offset+length) when
       // length is non-zero.
7268   interval_set<uint64_t> ch;
7269   if (write_full) {
7270     if (oi.size)
7271       ch.insert(0, oi.size);
7272   } else if (length)
7273     ch.insert(offset, length);
7274   modified.union_of(ch);
       // A full write always sets the size to offset+length (it may shrink);
       // a partial write only grows it. Note that a zero-length write with
       // offset > oi.size also takes this branch and sets the size to offset.
7275   if (write_full || offset + length > oi.size) {
7276     uint64_t new_size = offset + length;
7277     delta_stats.num_bytes -= oi.size;
7278     delta_stats.num_bytes += new_size;
7279     oi.size = new_size;
7280   }
7281   delta_stats.num_wr++;
       // NOTE(review): SHIFT_ROUND_UP(length, 10) presumably rounds length
       // up to whole KiB -- the macro is defined elsewhere; confirm.
7282   delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7283 }
7284
// Adds the length of every interval in `s` to delta_stats.num_bytes.
// `s` is only read.
7285 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7286 {
7287   for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7288     delta_stats.num_bytes += p.get_len();
7289   }
7290 }
7291
// Removes from obc->watchers every watcher named in `to_disconnect` and
// calls remove() on the corresponding watch. Entries that are not found
// in obc->watchers are only logged.
//
//   obc            object context whose watchers map is updated
//   to_disconnect  watchers to drop; each entry supplies the cookie and
//                  entity name forming the map key, plus the
//                  send_disconnect flag forwarded to Watch::remove()
7292 void PrimaryLogPG::complete_disconnect_watches(
7293   ObjectContextRef obc,
7294   const list<watch_disconnect_t> &to_disconnect)
7295 {
7296   for (list<watch_disconnect_t>::const_iterator i =
7297          to_disconnect.begin();
7298        i != to_disconnect.end();
7299        ++i) {
7300     pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7301     auto watchers_entry = obc->watchers.find(watcher);
7302     if (watchers_entry != obc->watchers.end()) {
           // Copy the WatchRef out of the map before erasing the entry so
           // remove() can still be called through it afterwards.
7303       WatchRef watch = watchers_entry->second;
7304       dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7305       obc->watchers.erase(watcher);
7306       watch->remove(i->send_disconnect);
7307     } else {
7308       dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7309                << watcher << dendl;
7310     }
7311   }
7312 }
7313
// Applies the watch/notify side effects that were collected in `ctx`, in
// this order: watch disconnects, watch connects, notifies, notify acks.
//
//   ctx   op context holding watch_disconnects, watch_connects, notifies
//         and notify_acks, plus the target object context (ctx->obc)
//   conn  connection used for new/reconnected watches; must be non-null
//         (asserted after the disconnects have been processed)
//
// Disconnects are always processed. Everything after them is skipped if
// the connection has no Session attached (get_priv() returns null).
7314 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7315 {
7316   entity_name_t entity = ctx->reqid.name;
7317   dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7318
7319   // disconnects first
7320   complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7321
7322   assert(conn);
7323
7324   boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7325   if (!session.get())
7326     return;
7327   session->put();  // get_priv() takes a ref, and so does the intrusive_ptr
7328
       // Watch connects: reuse the existing watch for (cookie, entity) if
       // there is one, otherwise create and register a new one; either way
       // (re)connect it to `conn`.
7329   for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7330        i != ctx->watch_connects.end();
7331        ++i) {
7332     pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7333     dout(15) << "do_osd_op_effects applying watch connect on session "
7334              << session.get() << " watcher " << watcher << dendl;
7335     WatchRef watch;
7336     if (ctx->obc->watchers.count(watcher)) {
7337       dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7338                << dendl;
7339       watch = ctx->obc->watchers[watcher];
7340     } else {
7341       dout(15) << "do_osd_op_effects new watcher " << watcher
7342                << dendl;
7343       watch = Watch::makeWatchRef(
7344         this, osd, ctx->obc, i->first.timeout_seconds,
7345         i->first.cookie, entity, conn->get_peer_addr());
7346       ctx->obc->watchers.insert(
7347         make_pair(
7348           watcher,
7349           watch));
7350     }
7351     watch->connect(conn, i->second);
7352   }
7353
       // Notifies: create one Notify per request and start it on every
       // watch currently registered on the object, then init() it.
7354   for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7355        p != ctx->notifies.end();
7356        ++p) {
7357     dout(10) << "do_osd_op_effects, notify " << *p << dendl;
         // NOTE: this local `conn` shadows the function parameter; it is
         // taken from the request message of ctx->op.
7358     ConnectionRef conn(ctx->op->get_req()->get_connection());
7359     NotifyRef notif(
7360       Notify::makeNotifyRef(
7361         conn,
7362         ctx->reqid.name.num(),
7363         p->bl,
7364         p->timeout,
7365         p->cookie,
7366         p->notify_id,
7367         ctx->obc->obs.oi.user_version,
7368         osd));
7369     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7370            ctx->obc->watchers.begin();
7371          i != ctx->obc->watchers.end();
7372          ++i) {
7373       dout(10) << "starting notify on watch " << i->first << dendl;
7374       i->second->start_notify(notif);
7375     }
7376     notif->init();
7377   }
7378
       // Notify acks: deliver each ack to the watches owned by this op's
       // entity; if the ack names a watch cookie, only to that watch.
7379   for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7380        p != ctx->notify_acks.end();
7381        ++p) {
7382     if (p->watch_cookie)
7383       dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7384     else
7385       dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7386     for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7387            ctx->obc->watchers.begin();
7388          i != ctx->obc->watchers.end();
7389          ++i) {
7390       if (i->first.second != entity) continue;
7391       if (p->watch_cookie &&
7392           p->watch_cookie.get() != i->first.first) continue;
7393       dout(10) << "acking notify on watch " << i->first << dendl;
7394       i->second->notify_ack(p->notify_id, p->reply_bl);
7395     }
7396   }
7397 }
7398
// Returns a temp hobject derived from `target` (via make_temp_hobject)
// whose name is "temp_<pgid>_<role>_<monc global id>_<seq>".
// Side effect: pre-increments the member temp_seq on every call, so
// successive calls yield different names.
7399 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7400 {
7401   ostringstream ss;
7402   ss << "temp_" << info.pgid << "_" << get_role()
7403      << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7404   hobject_t hoid = target.make_temp_hobject(ss.str());
7405   dout(20) << __func__ << " " << hoid << dendl;
7406   return hoid;
7407 }
7408
// Returns a temp hobject derived from `target` (via make_temp_hobject)
// whose name is
// "temp_recovering_<pgid>_<version>_<same_interval_since>_<target.snap>".
// Unlike generate_temp_object() there is no sequence counter: the name is
// built only from the arguments and the PG's info.
//
//   target   object the temp object is derived from
//   version  version embedded in the temp object's name
7409 hobject_t PrimaryLogPG::get_temp_recovery_object(
7410   const hobject_t& target,
7411   eversion_t version)
7412 {
7413   ostringstream ss;
7414   ss << "temp_recovering_" << info.pgid  // (note this includes the shardid)
7415      << "_" << version
7416      << "_" << info.history.same_interval_since
7417      << "_" << target.snap;
7418   // pgid + version + interval + snapid is unique, and short
7419   hobject_t hoid = target.make_temp_hobject(ss.str());
7420   dout(20) << __func__ << " " << hoid << dendl;
7421   return hoid;
7422 }
7423
// Runs the ops of `ctx` through do_osd_ops() and, if they produced a
// mutation, finishes the context (clone-on-write via make_writeable() for
// head objects, then finish_ctx()).
//
// Returns:
//   -EINVAL            the op's snap context is not valid
//   <0 from do_osd_ops the ops failed
//   -EDQUOT / -ENOSPC  pool / cluster is full and the op has FULL_TRY
//   -EAGAIN            pool or cluster is full and the op has neither
//                      FULL_TRY nor FULL_FORCE and is not from an MDS
//   otherwise          the (non-negative) result of do_osd_ops()
//
// On the error and no-op paths, ctx->update_log_only is set when the op
// may write and require_osd_release >= kraken.
7424 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7425 {
7426   assert(!ctx->ops->empty());
7427
7428   const hobject_t& soid = ctx->obs->oi.soid;
7429
7430   // valid snap context?
7431   if (!ctx->snapc.is_valid()) {
7432     dout(10) << " invalid snapc " << ctx->snapc << dendl;
7433     return -EINVAL;
7434   }
7435
7436   // prepare the actual mutation
7437   int result = do_osd_ops(ctx, *ctx->ops);
7438   if (result < 0) {
7439     if (ctx->op->may_write() &&
7440         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7441       // need to save the error code in the pg log, to detect dup ops,
7442       // but do nothing else
7443       ctx->update_log_only = true;
7444     }
7445     return result;
7446   }
7447
7448   // read-op?  write-op noop? done?
7449   if (ctx->op_t->empty() && !ctx->modify) {
         // Nothing to write: fold the stat deltas into unstable_stats and
         // return without touching the transaction.
7450     unstable_stats.add(ctx->delta_stats);
7451     if (ctx->op->may_write() &&
7452         get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7453       ctx->update_log_only = true;
7454     }
7455     return result;
7456   }
7457
7458   // check for full
       // Only ops that add bytes or objects are subject to the full check.
7459   if ((ctx->delta_stats.num_bytes > 0 ||
7460        ctx->delta_stats.num_objects > 0) &&  // FIXME: keys?
7461       (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7462        get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7463     const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7464     if (ctx->reqid.name.is_mds() ||   // FIXME: ignore MDS for now
7465         m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7466       dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7467                << dendl;
7468     } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7469       // they tried, they failed.
7470       dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7471       return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7472     } else {
7473       // drop request
7474       dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7475       return -EAGAIN;
7476     }
7477   }
7478
7479   // clone, if necessary
7480   if (soid.snap == CEPH_NOSNAP)
7481     make_writeable(ctx);
7482
       // Log as MODIFY if the object exists after the op, DELETE otherwise.
7483   finish_ctx(ctx,
7484              ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7485              pg_log_entry_t::DELETE);
7486
7487   return result;
7488 }
7489
// Finalizes a mutating op context: maintains the snapdir object, assigns
// the user/object versions, queues the OI_ATTR (and, for heads, SS_ATTR)
// update, appends the op's pg log entry, and installs ctx->new_obs /
// ctx->new_snapset as the cached object and snapset state.
//
//   ctx           op context being finished
//   log_op_type   pg_log_entry_t op code recorded for the object
//   maintain_ssc  when false, the snapset/snapdir section is skipped, and
//                 a head that no longer exists also gets its cached
//                 snapset reset (see the end of the function)
7490 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7491 {
7492   const hobject_t& soid = ctx->obs->oi.soid;
7493   dout(20) << __func__ << " " << soid << " " << ctx
7494            << " op " << pg_log_entry_t::get_op_name(log_op_type)
7495            << dendl;
7496   utime_t now = ceph_clock_now();
7497
7498   // snapset
       // `bss` holds the encoded snapset. It is claimed at most once:
       // either into the snapdir object's attrs below (head does not exist
       // after the op) or into the head's attrs further down (head exists).
7499   bufferlist bss;
7500
7501   if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7502     ::encode(ctx->new_snapset, bss);
7503     assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7504            !ctx->new_snapset.is_legacy());
7505
7506     if (ctx->new_obs.exists) {
7507       if (!ctx->obs->exists) {
             // The head is being created while a snapdir object exists:
             // log its DELETE, remove it, and consume a version for that.
7508         if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7509           hobject_t snapoid = soid.get_snapdir();
7510           dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7511           ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7512               ctx->at_version,
7513               ctx->snapset_obc->obs.oi.version,
7514               0, osd_reqid_t(), ctx->mtime, 0));
7515           ctx->op_t->remove(snapoid);
7516
7517           ctx->at_version.version++;
7518
7519           ctx->snapset_obc->obs.exists = false;
7520         }
7521       }
7522     } else if (!ctx->new_snapset.clones.empty() &&
7523                !ctx->cache_evict &&
7524                !ctx->new_snapset.head_exists &&
7525                (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
           // The head does not exist after the op, clones remain, and
           // there is no snapdir object yet: create one to carry the
           // snapset. Asserted to happen only while require_osd_release
           // is older than luminous.
7526       // save snapset on _snap
7527       hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7528                         info.pgid.pool(), soid.get_namespace());
7529       dout(10) << " final snapset " << ctx->new_snapset
7530                << " in " << snapoid << dendl;
7531       assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7532       ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7533                                         ctx->at_version,
7534                                         eversion_t(),
7535                                         0, osd_reqid_t(), ctx->mtime, 0));
7536
7537       if (!ctx->snapset_obc)
7538         ctx->snapset_obc = get_object_context(snapoid, true);
           // Lock the snapdir object context with the same lock type this
           // op holds (write or exclusive); the lock must be obtained.
7539       bool got = false;
7540       if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7541         got = ctx->lock_manager.get_write_greedy(
7542           snapoid,
7543           ctx->snapset_obc,
7544           ctx->op);
7545       } else {
7546         assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7547         got = ctx->lock_manager.get_lock_type(
7548           ObjectContext::RWState::RWEXCL,
7549           snapoid,
7550           ctx->snapset_obc,
7551           ctx->op);
7552       }
7553       assert(got);
7554       dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7555       ctx->snapset_obc->obs.exists = true;
7556       ctx->snapset_obc->obs.oi.version = ctx->at_version;
7557       ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7558       ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7559       ctx->snapset_obc->obs.oi.local_mtime = now;
7560
7561       map<string, bufferlist> attrs;
7562       bufferlist bv(sizeof(ctx->new_obs.oi));
7563       ::encode(ctx->snapset_obc->obs.oi, bv,
7564                get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7565       ctx->op_t->create(snapoid);
7566       attrs[OI_ATTR].claim(bv);
7567       attrs[SS_ATTR].claim(bss);
7568       setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7569       ctx->at_version.version++;
7570     }
7571   }
7572
7573   // finish and log the op.
7574   if (ctx->user_modify) {
7575     // update the user_version for any modify ops, except for the watch op
7576     ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7577     /* In order for new clients and old clients to interoperate properly
7578      * when exchanging versions, we need to lower bound the user_version
7579      * (which our new clients pay proper attention to)
7580      * by the at_version (which is all the old clients can ever see). */
7581     if (ctx->at_version.version > ctx->user_at_version)
7582       ctx->user_at_version = ctx->at_version.version;
7583     ctx->new_obs.oi.user_version = ctx->user_at_version;
7584   }
7585   ctx->bytes_written = ctx->op_t->get_bytes_written();
7586
7587   if (ctx->new_obs.exists) {
7588     // on the head object
7589     ctx->new_obs.oi.version = ctx->at_version;
7590     ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7591     ctx->new_obs.oi.last_reqid = ctx->reqid;
         // Only overwrite mtime/local_mtime when ctx->mtime is not a
         // default-constructed utime_t.
7592     if (ctx->mtime != utime_t()) {
7593       ctx->new_obs.oi.mtime = ctx->mtime;
7594       dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7595       ctx->new_obs.oi.local_mtime = now;
7596     } else {
7597       dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7598     }
7599
7600     map <string, bufferlist> attrs;
7601     bufferlist bv(sizeof(ctx->new_obs.oi));
7602     ::encode(ctx->new_obs.oi, bv,
7603              get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7604     attrs[OI_ATTR].claim(bv);
7605
7606     if (soid.snap == CEPH_NOSNAP) {
7607       dout(10) << " final snapset " << ctx->new_snapset
7608                << " in " << soid << dendl;
7609       attrs[SS_ATTR].claim(bss);
7610     } else {
7611       dout(10) << " no snapset (this is a clone)" << dendl;
7612     }
7613     ctx->op_t->setattrs(soid, attrs);
7614   } else {
         // The object does not exist after the op: reset its object_info
         // to a freshly constructed one for the same soid.
7615     ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7616   }
7617
7618   bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7619     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7620
7621   // append to log
7622   ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7623                                     ctx->obs->oi.version,
7624                                     ctx->user_at_version, ctx->reqid,
7625                                     ctx->mtime, 0));
       // For non-head objects (snap < CEPH_NOSNAP) the MODIFY / PROMOTE /
       // CLEAN log entry also carries the object's snaps, taken from
       // legacy_snaps or from new_snapset.clone_snaps depending on the
       // snapset format.
7626   if (soid.snap < CEPH_NOSNAP) {
7627     switch (log_op_type) {
7628     case pg_log_entry_t::MODIFY:
7629     case pg_log_entry_t::PROMOTE:
7630     case pg_log_entry_t::CLEAN:
7631       if (legacy_snapset) {
7632         dout(20) << __func__ << " encoding legacy_snaps "
7633                  << ctx->new_obs.oi.legacy_snaps
7634                  << dendl;
7635         ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7636       } else {
7637         dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7638                  << dendl;
7639         ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7640       }
7641       break;
7642     default:
7643       break;
7644     }
7645   }
7646
7647   if (!ctx->extra_reqids.empty()) {
7648     dout(20) << __func__ << "  extra_reqids " << ctx->extra_reqids << dendl;
7649     ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7650   }
7651
7652   // apply new object state.
7653   ctx->obc->obs = ctx->new_obs;
7654
       // A head that no longer exists, when the snapset is not being
       // maintained or this is a cache evict, gets its cached snapset
       // cleared; in every other case the cached snapset becomes
       // ctx->new_snapset.
7655   if (soid.is_head() && !ctx->obc->obs.exists &&
7656       (!maintain_ssc || ctx->cache_evict)) {
7657     ctx->obc->ssc->exists = false;
7658     ctx->obc->ssc->snapset = SnapSet();
7659   } else {
7660     ctx->obc->ssc->exists = true;
7661     ctx->obc->ssc->snapset = ctx->new_snapset;
7662   }
7663 }
7664
// Adds `delta_stats` for object `soid` to the PG's stats and, depending on
// where `soid` sorts relative to backfill and scrub progress, to the
// per-backfill-target stats and the scrub stat accumulator.
//
//   soid         object the stat delta belongs to
//   delta_stats  stat delta to apply
7665 void PrimaryLogPG::apply_stats(
7666   const hobject_t &soid,
7667   const object_stat_sum_t &delta_stats) {
7668
7669   info.stats.stats.add(delta_stats);
7670
       // Per backfill target: objects at or before the target's
       // last_backfill are counted in that target's stats directly;
       // objects beyond it but at or before last_backfill_started are
       // accumulated in pending_backfill_updates[soid]; anything later is
       // not counted for the target here.
7671   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7672        i != backfill_targets.end();
7673        ++i) {
7674     pg_shard_t bt = *i;
7675     pg_info_t& pinfo = peer_info[bt];
7676     if (soid <= pinfo.last_backfill)
7677       pinfo.stats.stats.add(delta_stats);
7678     else if (soid <= last_backfill_started)
7679       pending_backfill_updates[soid].stats.add(delta_stats);
7680   }
7681
       // While a scrub is active on the primary, only objects sorting
       // before scrubber.start are added to scrub_cstat.
7682   if (is_primary() && scrubber.active) {
7683     if (soid < scrubber.start) {
7684       dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7685                << "," << scrubber.end << ")" << dendl;
7686       scrub_cstat.add(delta_stats);
7687     } else {
7688       dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7689                << "," << scrubber.end << ")" << dendl;
7690     }
7691   }
7692 }
7693
// Completes a read op: derives the final result from the per-op return
// values, fills in and sends the reply to the client, and closes the op
// context. Requires that all async reads of `ctx` have completed
// (asserted).
//
//   result  result so far; if non-negative it is replaced by the rval of
//           the first op that failed without CEPH_OSD_OP_FLAG_FAILOK
//   ctx     op context; its reply is detached and sent, then the context
//           is handed to close_op_ctx()
7694 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7695 {
7696   const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7697   assert(ctx->async_reads_complete());
7698
       // Walk the ops until the first hard failure, counting the output
       // bytes of the ops before it. The loop body is skipped entirely
       // when `result` is already negative.
7699   for (vector<OSDOp>::iterator p = ctx->ops->begin();
7700     p != ctx->ops->end() && result >= 0; ++p) {
7701     if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7702       result = p->rval;
7703       break;
7704     }
7705     ctx->bytes_read += p->outdata.length();
7706   }
7707   ctx->reply->claim_op_out_data(*ctx->ops);
7708   ctx->reply->get_header().data_off = (ctx->data_off ? *ctx->data_off : 0);
7709
       // Detach the reply from ctx before the context is closed below.
       // NOTE(review): presumably so close_op_ctx() does not release the
       // message that is being sent -- confirm.
7710   MOSDOpReply *reply = ctx->reply;
7711   ctx->reply = nullptr;
7712
7713   if (result >= 0) {
7714     if (!ctx->ignore_log_op_stats) {
7715       log_op_stats(ctx);
7716       publish_stats_to_osd();
7717     }
7718
7719     // on read, return the current object version
7720     if (ctx->obs) {
7721       reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7722     } else {
7723       reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7724     }
7725   } else if (result == -ENOENT) {
7726     // on ENOENT, set a floor for what the next user version will be.
7727     reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7728   }
7729
7730   reply->set_result(result);
7731   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7732   osd->send_message_osd_client(reply, m->get_connection());
7733   close_op_ctx(ctx);
7734 }
7735
7736 // ========================================================================
7737 // copyfrom
7738
// Completion context for a copy-from operation. On completion it forwards
// the result to PrimaryLogPG::process_copy_chunk() under the PG lock,
// unless the op was cancelled or the PG's last_peering_reset no longer
// matches the value captured at construction.
7739 struct C_Copyfrom : public Context {
7740   PrimaryLogPGRef pg;
7741   hobject_t oid;
       // Value of the PG's last_peering_reset when this context was
       // created; compared again in finish().
7742   epoch_t last_peering_reset;
       // Initialised to 0 by the constructor and passed to
       // process_copy_chunk(). NOTE(review): presumably assigned by the
       // creator once the remote op is submitted -- confirm.
7743   ceph_tid_t tid;
7744   PrimaryLogPG::CopyOpRef cop;
7745   C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7746              const PrimaryLogPG::CopyOpRef& c)
7747     : pg(p), oid(o), last_peering_reset(lpr),
7748       tid(0), cop(c)
7749   {}
7750   void finish(int r) override {
         // A cancelled op is dropped without taking the PG lock.
7751     if (r == -ECANCELED)
7752       return;
7753     pg->lock();
7754     if (last_peering_reset == pg->get_last_peering_reset()) {
7755       pg->process_copy_chunk(oid, tid, r);
7756     }
7757     pg->unlock();
7758   }
7759 };
7760
// Completion callback for the asynchronous data read of a copy-get op.
// It owns the object_copy_data_t reply; on success it trims the data that
// was read to `len` bytes and encodes the reply into osd_op->outdata.
7761 struct C_CopyFrom_AsyncReadCb : public Context {
       // Op whose rval and outdata are filled in by finish().
7762   OSDOp *osd_op;
7763   object_copy_data_t reply_obj;
       // Feature bits used when encoding reply_obj.
7764   uint64_t features;
       // Number of data bytes to keep. Starts at 0 and must be set to a
       // positive value before a successful completion (asserted in
       // finish()).
7765   size_t len;
7766   C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7767     osd_op(osd_op), features(features), len(0) {}
7768   void finish(int r) override {
7769     osd_op->rval = r;
         // On error only rval is recorded; nothing is encoded.
7770     if (r < 0) {
7771       return;
7772     }
7773
7774     assert(len > 0);
7775     assert(len <= reply_obj.data.length());
         // Keep only the first `len` bytes of the data that was read.
7776     bufferlist bl;
7777     bl.substr_of(reply_obj.data, 0, len);
7778     reply_obj.data.swap(bl);
7779     ::encode(reply_obj, osd_op->outdata, features);
7780   }
7781 };
7782
7783 int PrimaryLogPG::do_copy_get(OpContext *ctx, bufferlist::iterator& bp,
7784                               OSDOp& osd_op, ObjectContextRef &obc)
7785 {
7786   object_info_t& oi = obc->obs.oi;
7787   hobject_t& soid = oi.soid;
7788   int result = 0;
7789   object_copy_cursor_t cursor;
7790   uint64_t out_max;
7791   try {
7792     ::decode(cursor, bp);
7793     ::decode(out_max, bp);
7794   }
7795   catch (buffer::error& e) {
7796     result = -EINVAL;
7797     return result;
7798   }
7799
7800   const MOSDOp *op = reinterpret_cast<const MOSDOp*>(ctx->op->get_req());
7801   uint64_t features = op->get_features();
7802
7803   bool async_read_started = false;
7804   object_copy_data_t _reply_obj;
7805   C_CopyFrom_AsyncReadCb *cb = NULL;
7806   if (pool.info.require_rollback()) {
7807     cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7808   }
7809   object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7810   // size, mtime
7811   reply_obj.size = oi.size;
7812   reply_obj.mtime = oi.mtime;
7813   assert(obc->ssc);
7814   if (soid.snap < CEPH_NOSNAP) {
7815     if (obc->ssc->snapset.is_legacy()) {
7816       reply_obj.snaps = oi.legacy_snaps;
7817     } else {
7818       auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7819       assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7820       reply_obj.snaps = p->second;
7821     }
7822   } else {
7823     reply_obj.snap_seq = obc->ssc->snapset.seq;
7824   }
7825   if (oi.is_data_digest()) {
7826     reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7827     reply_obj.data_digest = oi.data_digest;
7828   }
7829   if (oi.is_omap_digest()) {
7830     reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7831     reply_obj.omap_digest = oi.omap_digest;
7832   }
7833   reply_obj.truncate_seq = oi.truncate_seq;
7834   reply_obj.truncate_size = oi.truncate_size;
7835
7836   // attrs
7837   map<string,bufferlist>& out_attrs = reply_obj.attrs;
7838   if (!cursor.attr_complete) {
7839     result = getattrs_maybe_cache(
7840       ctx->obc,
7841       &out_attrs,
7842       true);
7843     if (result < 0) {
7844       if (cb) {
7845         delete cb;
7846       }
7847       return result;
7848     }
7849     cursor.attr_complete = true;
7850     dout(20) << " got attrs" << dendl;
7851   }
7852
7853   int64_t left = out_max - osd_op.outdata.length();
7854
7855   // data
7856   bufferlist& bl = reply_obj.data;
7857   if (left > 0 && !cursor.data_complete) {
7858     if (cursor.data_offset < oi.size) {
7859       uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7860       if (cb) {
7861         async_read_started = true;
7862         ctx->pending_async_reads.push_back(
7863           make_pair(
7864             boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7865             make_pair(&bl, cb)));
7866         cb->len = max_read;
7867
7868         ctx->op_finishers[ctx->current_osd_subop_num].reset(
7869           new ReadFinisher(osd_op));
7870         result = -EINPROGRESS;
7871
7872         dout(10) << __func__ << ": async_read noted for " << soid << dendl;
7873       } else {
7874         result = pgbackend->objects_read_sync(
7875           oi.soid, cursor.data_offset, max_read, osd_op.op.flags, &bl);
7876         if (result < 0)
7877           return result;
7878       }
7879       left -= max_read;
7880       cursor.data_offset += max_read;
7881     }
7882     if (cursor.data_offset == oi.size) {
7883       cursor.data_complete = true;
7884       dout(20) << " got data" << dendl;
7885     }
7886     assert(cursor.data_offset <= oi.size);
7887   }
7888
7889   // omap
7890   uint32_t omap_keys = 0;
7891   if (!pool.info.supports_omap() || !oi.is_omap()) {
7892     cursor.omap_complete = true;
7893   } else {
7894     if (left > 0 && !cursor.omap_complete) {
7895       assert(cursor.data_complete);
7896       if (cursor.omap_offset.empty()) {
7897         osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7898                                     &reply_obj.omap_header);
7899       }
7900       bufferlist omap_data;
7901       ObjectMap::ObjectMapIterator iter =
7902         osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7903       assert(iter);
7904       iter->upper_bound(cursor.omap_offset);
7905       for (; iter->valid(); iter->next(false)) {
7906         ++omap_keys;
7907         ::encode(iter->key(), omap_data);
7908         ::encode(iter->value(), omap_data);
7909         left -= iter->key().length() + 4 + iter->value().length() + 4;
7910         if (left <= 0)
7911           break;
7912       }
7913       if (omap_keys) {
7914         ::encode(omap_keys, reply_obj.omap_data);
7915         reply_obj.omap_data.claim_append(omap_data);
7916       }
7917       if (iter->valid()) {
7918         cursor.omap_offset = iter->key();
7919       } else {
7920         cursor.omap_complete = true;
7921         dout(20) << " got omap" << dendl;
7922       }
7923     }
7924   }
7925
7926   if (cursor.is_complete()) {
7927     // include reqids only in the final step.  this is a bit fragile
7928     // but it works...
7929     pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7930     dout(20) << " got reqids" << dendl;
7931   }
7932
7933   dout(20) << " cursor.is_complete=" << cursor.is_complete()
7934            << " " << out_attrs.size() << " attrs"
7935            << " " << bl.length() << " bytes"
7936            << " " << reply_obj.omap_header.length() << " omap header bytes"
7937            << " " << reply_obj.omap_data.length() << " omap data bytes in "
7938            << omap_keys << " keys"
7939            << " " << reply_obj.reqids.size() << " reqids"
7940            << dendl;
7941   reply_obj.cursor = cursor;
7942   if (!async_read_started) {
7943     ::encode(reply_obj, osd_op.outdata, features);
7944   }
7945   if (cb && !async_read_started) {
7946     delete cb;
7947   }
7948
7949   if (result > 0) {
7950     result = 0;
7951   }
7952   return result;
7953 }
7954
7955 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7956                                           OSDOp& osd_op)
7957 {
7958   // NOTE: we take non-const ref here for claim_op_out_data below; we must
7959   // be careful not to modify anything else that will upset a racing
7960   // operator<<
7961   MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7962   uint64_t features = m->get_features();
7963   object_copy_data_t reply_obj;
7964
7965   pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7966   dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7967   ::encode(reply_obj, osd_op.outdata, features);
7968   osd_op.rval = -ENOENT;
7969   MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
7970   reply->claim_op_out_data(m->ops);
7971   reply->set_result(-ENOENT);
7972   reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7973   osd->send_message_osd_client(reply, m->get_connection());
7974 }
7975
7976 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
7977                               hobject_t src, object_locator_t oloc,
7978                               version_t version, unsigned flags,
7979                               bool mirror_snapset,
7980                               unsigned src_obj_fadvise_flags,
7981                               unsigned dest_obj_fadvise_flags)
7982 {
7983   const hobject_t& dest = obc->obs.oi.soid;
7984   dout(10) << __func__ << " " << dest
7985            << " from " << src << " " << oloc << " v" << version
7986            << " flags " << flags
7987            << (mirror_snapset ? " mirror_snapset" : "")
7988            << dendl;
7989
7990   assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
7991                              src.snap == CEPH_SNAPDIR));
7992
7993   // cancel a previous in-progress copy?
7994   if (copy_ops.count(dest)) {
7995     // FIXME: if the src etc match, we could avoid restarting from the
7996     // beginning.
7997     CopyOpRef cop = copy_ops[dest];
7998     cancel_copy(cop, false);
7999   }
8000
8001   CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
8002                            mirror_snapset, src_obj_fadvise_flags,
8003                            dest_obj_fadvise_flags));
8004   copy_ops[dest] = cop;
8005   obc->start_block();
8006
8007   _copy_some(obc, cop);
8008 }
8009
8010 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
8011 {
8012   dout(10) << __func__ << " " << obc << " " << cop << dendl;
8013
8014   unsigned flags = 0;
8015   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
8016     flags |= CEPH_OSD_FLAG_FLUSH;
8017   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
8018     flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
8019   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
8020     flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
8021   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
8022     flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
8023   if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
8024     flags |= CEPH_OSD_FLAG_RWORDERED;
8025
8026   C_GatherBuilder gather(cct);
8027
8028   if (cop->cursor.is_initial() && cop->mirror_snapset) {
8029     // list snaps too.
8030     assert(cop->src.snap == CEPH_NOSNAP);
8031     ObjectOperation op;
8032     op.list_snaps(&cop->results.snapset, NULL);
8033     ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8034                                     CEPH_SNAPDIR, NULL,
8035                                     flags, gather.new_sub(), NULL);
8036     cop->objecter_tid2 = tid;
8037   }
8038
8039   ObjectOperation op;
8040   if (cop->results.user_version) {
8041     op.assert_version(cop->results.user_version);
8042   } else {
8043     // we should learn the version after the first chunk, if we didn't know
8044     // it already!
8045     assert(cop->cursor.is_initial());
8046   }
8047   op.copy_get(&cop->cursor, get_copy_chunk_size(),
8048               &cop->results.object_size, &cop->results.mtime,
8049               &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
8050               &cop->results.snaps, &cop->results.snap_seq,
8051               &cop->results.flags,
8052               &cop->results.source_data_digest,
8053               &cop->results.source_omap_digest,
8054               &cop->results.reqids,
8055               &cop->results.truncate_seq,
8056               &cop->results.truncate_size,
8057               &cop->rval);
8058   op.set_last_op_flags(cop->src_obj_fadvise_flags);
8059
8060   C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
8061                                    get_last_peering_reset(), cop);
8062   gather.set_finisher(new C_OnFinisher(fin,
8063                                        &osd->objecter_finisher));
8064
8065   ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
8066                                   cop->src.snap, NULL,
8067                                   flags,
8068                                   gather.new_sub(),
8069                                   // discover the object version if we don't know it yet
8070                                   cop->results.user_version ? NULL : &cop->results.user_version);
8071   fin->tid = tid;
8072   cop->objecter_tid = tid;
8073   gather.activate();
8074 }
8075
/**
 * Handle the completion of one copy-get round trip for the copy keyed on oid.
 *
 * The completion is dropped if no CopyOp is registered for oid in copy_ops,
 * or if tid is not that CopyOp's current objecter_tid.  Otherwise the
 * received chunk is folded into the running digests and attrs.  If the
 * cursor is not complete yet, the chunk is written to a temp object and
 * another chunk is requested via _copy_some().  Once the cursor is complete
 * the digests are checked, results.fill_in_final_tx is installed and the
 * copy callback is completed.
 *
 * @param oid  destination object; key into copy_ops
 * @param tid  objecter tid of the read that completed
 * @param r    result of that read; a negative value jumps straight to the
 *             completion path and is handed to the callback
 */
8076 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
8077 {
8078   dout(10) << __func__ << " " << oid << " tid " << tid
8079            << " " << cpp_strerror(r) << dendl;
8080   map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
8081   if (p == copy_ops.end()) {
8082     dout(10) << __func__ << " no copy_op found" << dendl;
8083     return;
8084   }
8085   CopyOpRef cop = p->second;
       // a tid mismatch means this completion is not for the read the
       // CopyOp is currently waiting on; ignore it
8086   if (tid != cop->objecter_tid) {
8087     dout(10) << __func__ << " tid " << tid << " != cop " << cop
8088              << " tid " << cop->objecter_tid << dendl;
8089     return;
8090   }
8091
       // any omap header or key data in this chunk marks the result as
       // having omap
8092   if (cop->omap_data.length() || cop->omap_header.length())
8093     cop->results.has_omap = true;
8094
       // omap payload received while this pool does not support omap is
       // converted into an error
8095   if (r >= 0 && !pool.info.supports_omap() &&
8096       (cop->omap_data.length() || cop->omap_header.length())) {
8097     r = -EOPNOTSUPP;
8098   }
8099   cop->objecter_tid = 0;
8100   cop->objecter_tid2 = 0;  // assume this ordered before us (if it happened)
       // alias of cop->obc (a reference, not a copy)
8101   ObjectContextRef& cobc = cop->obc;
8102
8103   if (r < 0)
8104     goto out;
8105
8106   assert(cop->rval >= 0);
8107
8108   if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
8109     // verify snap hasn't been deleted
         // note: this 'p' shadows the copy_ops iterator declared above
8110     vector<snapid_t>::iterator p = cop->results.snaps.begin();
8111     while (p != cop->results.snaps.end()) {
8112       if (pool.info.is_removed_snap(*p)) {
8113         dout(10) << __func__ << " clone snap " << *p << " has been deleted"
8114                  << dendl;
             // hand-rolled erase of *p: shift the tail down one slot and
             // drop the last element; p is not advanced, so it then refers
             // to the element that followed the removed one
8115         for (vector<snapid_t>::iterator q = p + 1;
8116              q != cop->results.snaps.end();
8117              ++q)
8118           *(q - 1) = *q;
8119         cop->results.snaps.resize(cop->results.snaps.size() - 1);
8120       } else {
8121         ++p;
8122       }
8123     }
8124     if (cop->results.snaps.empty()) {
8125       dout(10) << __func__ << " no more snaps for " << oid << dendl;
8126       r = -ENOENT;
8127       goto out;
8128     }
8129   }
8130
8131   assert(cop->rval >= 0);
8132
       // fold this chunk into the running crc32c digests.  temp_cursor still
       // describes the state before this chunk; it is only advanced to
       // cop->cursor at the end of _write_copy_chunk()
8133   if (!cop->temp_cursor.data_complete) {
8134     cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
8135   }
8136   if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
8137     if (cop->omap_header.length()) {
8138       cop->results.omap_digest =
8139         cop->omap_header.crc32c(cop->results.omap_digest);
8140     }
8141     if (cop->omap_data.length()) {
           // the first 4 bytes of omap_data are left out of the digest.
           // NOTE(review): presumably the encoded 32-bit key count that
           // prefixes the key/value stream -- confirm against the copy-get
           // encoder
8142       bufferlist keys;
8143       keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
8144       cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
8145     }
8146   }
8147
       // move the received attrs into results.attrs under "_"-prefixed names
8148   if (!cop->temp_cursor.attr_complete) {
8149     for (map<string,bufferlist>::iterator p = cop->attrs.begin();
8150          p != cop->attrs.end();
8151          ++p) {
8152       cop->results.attrs[string("_") + p->first] = p->second;
8153     }
8154     cop->attrs.clear();
8155   }
8156
8157   if (!cop->cursor.is_complete()) {
8158     // write out what we have so far
         // first partial chunk: pick the temp object the data accumulates in
8159     if (cop->temp_cursor.is_initial()) {
8160       assert(!cop->results.started_temp_obj);
8161       cop->results.started_temp_obj = true;
8162       cop->results.temp_oid = generate_temp_object(oid);
8163       dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
8164     }
8165     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8166     OpContextUPtr ctx = simple_opc_create(tempobc);
         // must be tested before _write_copy_chunk(), which advances
         // temp_cursor
8167     if (cop->temp_cursor.is_initial()) {
8168       ctx->new_temp_oid = cop->results.temp_oid;
8169     }
8170     _write_copy_chunk(cop, ctx->op_t.get());
8171     simple_opc_submit(std::move(ctx));
8172     dout(10) << __func__ << " fetching more" << dendl;
8173     _copy_some(cobc, cop);
8174     return;
8175   }
8176
8177   // verify digests?
8178   if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
8179     dout(20) << __func__ << std::hex
8180       << " got digest: rx data 0x" << cop->results.data_digest
8181       << " omap 0x" << cop->results.omap_digest
8182       << ", source: data 0x" << cop->results.source_data_digest
8183       << " omap 0x" <<  cop->results.source_omap_digest
8184       << std::dec
8185       << " flags " << cop->results.flags
8186       << dendl;
8187   }
       // a digest mismatch is logged locally and to the cluster log, and
       // fails the copy with -EIO
8188   if (cop->results.is_data_digest() &&
8189       cop->results.data_digest != cop->results.source_data_digest) {
8190     derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
8191          << " != source 0x" << cop->results.source_data_digest << std::dec
8192          << dendl;
8193     osd->clog->error() << info.pgid << " copy from " << cop->src
8194                        << " to " << cop->obc->obs.oi.soid << std::hex
8195                        << " data digest 0x" << cop->results.data_digest
8196                        << " != source 0x" << cop->results.source_data_digest
8197                        << std::dec;
8198     r = -EIO;
8199     goto out;
8200   }
8201   if (cop->results.is_omap_digest() &&
8202       cop->results.omap_digest != cop->results.source_omap_digest) {
8203     derr << __func__ << std::hex
8204          << " omap digest 0x" << cop->results.omap_digest
8205          << " != source 0x" << cop->results.source_omap_digest
8206          << std::dec << dendl;
8207     osd->clog->error() << info.pgid << " copy from " << cop->src
8208                        << " to " << cop->obc->obs.oi.soid << std::hex
8209                        << " omap digest 0x" << cop->results.omap_digest
8210                        << " != source 0x" << cop->results.source_omap_digest
8211                        << std::dec;
8212     r = -EIO;
8213     goto out;
8214   }
       // config-driven fault injection: fail an otherwise successful copy
8215   if (cct->_conf->osd_debug_inject_copyfrom_error) {
8216     derr << __func__ << " injecting copyfrom failure" << dendl;
8217     r = -EIO;
8218     goto out;
8219   }
8220
       // install the closure that writes the final chunk into the caller's
       // transaction.
       // NOTE(review): 'cop' is captured by reference to this function's
       // local variable, so the closure is only safe to invoke while this
       // frame is live (e.g. from cb->complete() below) -- confirm that no
       // callback stores it and calls it later
8221   cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8222     [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8223       ObjectState& obs = cop->obc->obs;
8224       if (cop->temp_cursor.is_initial()) {
8225         dout(20) << "fill_in_final_tx: writing "
8226                  << "directly to final object" << dendl;
8227         // write directly to final object
8228         cop->results.temp_oid = obs.oi.soid;
8229         _write_copy_chunk(cop, t);
8230       } else {
8231         // finish writing to temp object, then move into place
8232         dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8233         _write_copy_chunk(cop, t);
8234         t->rename(obs.oi.soid, cop->results.temp_oid);
8235       }
8236       t->setattrs(obs.oi.soid, cop->results.attrs);
8237     });
8238
8239   dout(20) << __func__ << " success; committing" << dendl;
8240
       // reached on success and on every failure path above; r carries the
       // outcome to the callback
8241  out:
8242   dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8243   CopyCallbackResults results(r, &cop->results);
8244   cop->cb->complete(results);
8245
8246   copy_ops.erase(cobc->obs.oi.soid);
8247   cobc->stop_block();
8248
       // failure after a temp object was started: submit its removal.  This
       // runs after the callback, so it is skipped if the callback has
       // already cleared started_temp_obj
8249   if (r < 0 && cop->results.started_temp_obj) {
8250     dout(10) << __func__ << " deleting partial temp object "
8251              << cop->results.temp_oid << dendl;
8252     ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8253     OpContextUPtr ctx = simple_opc_create(tempobc);
8254     ctx->op_t->remove(cop->results.temp_oid);
8255     ctx->discard_temp_oid = cop->results.temp_oid;
8256     simple_opc_submit(std::move(ctx));
8257   }
8258
8259   // cancel and requeue proxy ops on this object
       // (only when r == 0).  The iterator is post-incremented before each
       // cancel call, so it has already moved past the entry being cancelled
8260   if (!r) {
8261     for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8262         it != proxyread_ops.end();) {
8263       if (it->second->soid == cobc->obs.oi.soid) {
8264         cancel_proxy_read((it++)->second);
8265       } else {
8266         ++it;
8267       }
8268     }
8269     for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8270          it != proxywrite_ops.end();) {
8271       if (it->second->soid == cobc->obs.oi.soid) {
8272         cancel_proxy_write((it++)->second);
8273       } else {
8274         ++it;
8275       }
8276     }
8277     kick_proxy_ops_blocked(cobc->obs.oi.soid);
8278   }
8279
8280   kick_object_context_blocked(cobc);
8281 }
8282
8283 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8284 {
8285   dout(20) << __func__ << " " << cop
8286            << " " << cop->attrs.size() << " attrs"
8287            << " " << cop->data.length() << " bytes"
8288            << " " << cop->omap_header.length() << " omap header bytes"
8289            << " " << cop->omap_data.length() << " omap data bytes"
8290            << dendl;
8291   if (!cop->temp_cursor.attr_complete) {
8292     t->create(cop->results.temp_oid);
8293   }
8294   if (!cop->temp_cursor.data_complete) {
8295     assert(cop->data.length() + cop->temp_cursor.data_offset ==
8296            cop->cursor.data_offset);
8297     if (pool.info.requires_aligned_append() &&
8298         !cop->cursor.data_complete) {
8299       /**
8300        * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8301        * to pick it up on the next pass.
8302        */
8303       assert(cop->temp_cursor.data_offset %
8304              pool.info.required_alignment() == 0);
8305       if (cop->data.length() % pool.info.required_alignment() != 0) {
8306         uint64_t to_trim =
8307           cop->data.length() % pool.info.required_alignment();
8308         bufferlist bl;
8309         bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8310         cop->data.swap(bl);
8311         cop->cursor.data_offset -= to_trim;
8312         assert(cop->data.length() + cop->temp_cursor.data_offset ==
8313                cop->cursor.data_offset);
8314       }
8315     }
8316     if (cop->data.length()) {
8317       t->write(
8318         cop->results.temp_oid,
8319         cop->temp_cursor.data_offset,
8320         cop->data.length(),
8321         cop->data,
8322         cop->dest_obj_fadvise_flags);
8323     }
8324     cop->data.clear();
8325   }
8326   if (pool.info.supports_omap()) {
8327     if (!cop->temp_cursor.omap_complete) {
8328       if (cop->omap_header.length()) {
8329         t->omap_setheader(
8330           cop->results.temp_oid,
8331           cop->omap_header);
8332         cop->omap_header.clear();
8333       }
8334       if (cop->omap_data.length()) {
8335         map<string,bufferlist> omap;
8336         bufferlist::iterator p = cop->omap_data.begin();
8337         ::decode(omap, p);
8338         t->omap_setkeys(cop->results.temp_oid, omap);
8339         cop->omap_data.clear();
8340       }
8341     }
8342   } else {
8343     assert(cop->omap_header.length() == 0);
8344     assert(cop->omap_data.length() == 0);
8345   }
8346   cop->temp_cursor = cop->cursor;
8347 }
8348
8349 void PrimaryLogPG::finish_copyfrom(CopyFromCallback *cb)
8350 {
8351   OpContext *ctx = cb->ctx;
8352   dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8353
8354   ObjectState& obs = ctx->new_obs;
8355   if (obs.exists) {
8356     dout(20) << __func__ << ": exists, removing" << dendl;
8357     ctx->op_t->remove(obs.oi.soid);
8358   } else {
8359     ctx->delta_stats.num_objects++;
8360     obs.exists = true;
8361   }
8362   if (cb->is_temp_obj_used()) {
8363     ctx->discard_temp_oid = cb->results->temp_oid;
8364   }
8365   cb->results->fill_in_final_tx(ctx->op_t.get());
8366
8367   // CopyFromCallback fills this in for us
8368   obs.oi.user_version = ctx->user_at_version;
8369
8370   obs.oi.set_data_digest(cb->results->data_digest);
8371   obs.oi.set_omap_digest(cb->results->omap_digest);
8372
8373   obs.oi.truncate_seq = cb->results->truncate_seq;
8374   obs.oi.truncate_size = cb->results->truncate_size;
8375
8376   ctx->extra_reqids = cb->results->reqids;
8377
8378   // cache: clear whiteout?
8379   if (obs.oi.is_whiteout()) {
8380     dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8381     obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8382     --ctx->delta_stats.num_whiteouts;
8383   }
8384
8385   if (cb->results->has_omap) {
8386     dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8387     obs.oi.set_flag(object_info_t::FLAG_OMAP);
8388   } else {
8389     dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8390     obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8391   }
8392
8393   interval_set<uint64_t> ch;
8394   if (obs.oi.size > 0)
8395     ch.insert(0, obs.oi.size);
8396   ctx->modified_ranges.union_of(ch);
8397
8398   if (cb->get_data_size() != obs.oi.size) {
8399     ctx->delta_stats.num_bytes -= obs.oi.size;
8400     obs.oi.size = cb->get_data_size();
8401     ctx->delta_stats.num_bytes += obs.oi.size;
8402   }
8403   ctx->delta_stats.num_wr++;
8404   ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8405
8406   osd->logger->inc(l_osd_copyfrom);
8407 }
8408
/**
 * Finish a promote whose copy has completed with result r.
 *
 * Outcomes, in the order they are tested:
 *  - r == -ECANCELED: nothing is done.
 *  - -ENOENT for a snap object (from the copy, or because the filtered snap
 *    list came out empty): the clone is removed from the head's snapset.
 *  - -ENOENT for a head object: a whiteout is created.
 *  - any other error: the error is returned to every op waiting on the
 *    object in waiting_for_blocked_object.
 *  - success: the copied content is installed via results->fill_in_final_tx
 *    and the new object info and stats are filled in.
 *
 * @param r        result of the copy
 * @param results  copy results; snaps and started_temp_obj may be modified
 * @param obc      object context of the object being promoted
 */
8409 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8410                                   ObjectContextRef obc)
8411 {
8412   const hobject_t& soid = obc->obs.oi.soid;
8413   dout(10) << __func__ << " " << soid << " r=" << r
8414            << " uv" << results->user_version << dendl;
8415
8416   if (r == -ECANCELED) {
8417     return;
8418   }
8419
8420   if (r != -ENOENT && soid.is_snap()) {
8421     if (results->snaps.empty()) {
8422       // we must have read "snap" content from the head object in
8423       // the base pool.  use snap_seq to construct what snaps should
8424       // be for this clone (what is was before we evicted the clean
8425       // clone from this pool, and what it will be when we flush and
8426       // the clone eventually happens in the base pool).
8427       SnapSet& snapset = obc->ssc->snapset;
8428       vector<snapid_t>::iterator p = snapset.snaps.begin();
           // skip the snaps greater than soid.snap, then collect those
           // greater than results->snap_seq.
           // NOTE(review): this relies on snapset.snaps being in descending
           // order -- confirm
8429       while (p != snapset.snaps.end() && *p > soid.snap)
8430         ++p;
8431       while (p != snapset.snaps.end() && *p > results->snap_seq) {
8432         results->snaps.push_back(*p);
8433         ++p;
8434       }
8435     }
8436
8437     dout(20) << __func__ << " snaps " << results->snaps << dendl;
8438     filter_snapc(results->snaps);
8439
8440     dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8441     if (results->snaps.empty()) {
8442       dout(20) << __func__
8443                << " snaps are empty, clone is invalid,"
8444                << " setting r to ENOENT" << dendl;
8445       r = -ENOENT;
8446     }
8447   }
8448
       // any failure after a temp object was started: submit its removal and
       // clear the flag.
       // NOTE(review): clearing started_temp_obj presumably keeps the caller's
       // own failure cleanup from removing the temp object a second time --
       // confirm
8449   if (r < 0 && results->started_temp_obj) {
8450     dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8451     ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8452     assert(tempobc);
8453     OpContextUPtr ctx = simple_opc_create(tempobc);
8454     ctx->op_t->remove(results->temp_oid);
8455     simple_opc_submit(std::move(ctx));
8456     results->started_temp_obj = false;
8457   }
8458
8459   if (r == -ENOENT && soid.is_snap()) {
8460     dout(10) << __func__
8461              << ": enoent while trying to promote clone, " << soid
8462              << " must have been trimmed, removing from snapset"
8463              << dendl;
8464     hobject_t head(soid.get_head());
         // note: this 'obc' is the head's context and shadows the function
         // parameter for the rest of this block
8465     ObjectContextRef obc = get_object_context(head, false);
8466     assert(obc);
8467
8468     OpContextUPtr tctx = simple_opc_create(obc);
8469     tctx->at_version = get_next_version();
8470     filter_snapc(tctx->new_snapset.snaps);
         // rebuild clones and clone_snaps without the entry for soid.snap
8471     vector<snapid_t> new_clones;
8472     map<snapid_t, vector<snapid_t>> new_clone_snaps;
8473     for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8474          i != tctx->new_snapset.clones.end();
8475          ++i) {
8476       if (*i != soid.snap) {
8477         new_clones.push_back(*i);
8478         auto p = tctx->new_snapset.clone_snaps.find(*i);
8479         if (p != tctx->new_snapset.clone_snaps.end()) {
8480           new_clone_snaps[*i] = p->second;
8481         }
8482       }
8483     }
8484     tctx->new_snapset.clones.swap(new_clones);
8485     tctx->new_snapset.clone_overlap.erase(soid.snap);
8486     tctx->new_snapset.clone_size.erase(soid.snap);
8487     tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8488
8489     // take RWWRITE lock for duration of our local write.  ignore starvation.
8490     if (!tctx->lock_manager.take_write_lock(
8491           head,
8492           obc)) {
8493       assert(0 == "problem!");
8494     }
8495     dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8496
8497     finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8498
8499     simple_opc_submit(std::move(tctx));
8500     return;
8501   }
8502
       // -ENOENT for a head object is recorded locally as a whiteout
8503   bool whiteout = false;
8504   if (r == -ENOENT) {
8505     assert(soid.snap == CEPH_NOSNAP); // snap case is above
8506     dout(10) << __func__ << " whiteout " << soid << dendl;
8507     whiteout = true;
8508   }
8509
8510   if (r < 0 && !whiteout) {
8511     derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8512     // pass error to everyone blocked on this object
8513     // FIXME: this is pretty sloppy, but at this point we got
8514     // something unexpected and don't have many other options.
8515     map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8516       waiting_for_blocked_object.find(soid);
8517     if (blocked_iter != waiting_for_blocked_object.end()) {
8518       while (!blocked_iter->second.empty()) {
8519         osd->reply_op_error(blocked_iter->second.front(), r);
8520         blocked_iter->second.pop_front();
8521       }
8522       waiting_for_blocked_object.erase(blocked_iter);
8523     }
8524     return;
8525   }
8526
       // from here on: either a successful copy or a whiteout
8527   osd->promote_finish(results->object_size);
8528
8529   OpContextUPtr tctx =  simple_opc_create(obc);
8530   tctx->at_version = get_next_version();
8531
8532   ++tctx->delta_stats.num_objects;
8533   if (soid.snap < CEPH_NOSNAP)
8534     ++tctx->delta_stats.num_object_clones;
8535   tctx->new_obs.exists = true;
8536
8537   tctx->extra_reqids = results->reqids;
8538
       // legacy handling applies when the snapset is in legacy form or the
       // osdmap's require_osd_release is older than luminous
8539   bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8540     get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8541
8542   if (whiteout) {
8543     // create a whiteout
8544     tctx->op_t->create(soid);
8545     tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8546     ++tctx->delta_stats.num_whiteouts;
8547     dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8548     osd->logger->inc(l_osd_tier_whiteout);
8549   } else {
8550     if (results->has_omap) {
8551       dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8552       tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8553       ++tctx->delta_stats.num_objects_omap;
8554     }
8555
         // let the copy results write the final chunk / rename the temp
         // object into place within this transaction
8556     results->fill_in_final_tx(tctx->op_t.get());
8557     if (results->started_temp_obj) {
8558       tctx->discard_temp_oid = results->temp_oid;
8559     }
8560     tctx->new_obs.oi.size = results->object_size;
8561     tctx->new_obs.oi.user_version = results->user_version;
8562     // Don't care src object whether have data or omap digest
8563     if (results->object_size)
8564       tctx->new_obs.oi.set_data_digest(results->data_digest);
8565     if (results->has_omap)
8566       tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8567     tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8568     tctx->new_obs.oi.truncate_size = results->truncate_size;
8569
8570     if (soid.snap != CEPH_NOSNAP) {
8571       if (legacy_snapset) {
8572         tctx->new_obs.oi.legacy_snaps = results->snaps;
8573         assert(!tctx->new_obs.oi.legacy_snaps.empty());
8574       } else {
8575         // it's already in the snapset
8576         assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8577       }
8578       assert(obc->ssc->snapset.clone_size.count(soid.snap));
8579       assert(obc->ssc->snapset.clone_size[soid.snap] ==
8580              results->object_size);
8581       assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8582
           // a clone is accounted with get_clone_bytes(), not its full size
8583       tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8584     } else {
8585       tctx->delta_stats.num_bytes += results->object_size;
8586     }
8587   }
8588
       // when the copy also fetched the source snapset, adopt it as the new
       // snapset of the head
8589   if (results->mirror_snapset) {
8590     assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8591     tctx->new_snapset.from_snap_set(
8592       results->snapset,
8593       get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8594   }
8595   tctx->new_snapset.head_exists = true;
8596   dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8597
8598   // take RWWRITE lock for duration of our local write.  ignore starvation.
8599   if (!tctx->lock_manager.take_write_lock(
8600         obc->obs.oi.soid,
8601         obc)) {
8602     assert(0 == "problem!");
8603   }
8604   dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8605
8606   finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8607
8608   simple_opc_submit(std::move(tctx));
8609
8610   osd->logger->inc(l_osd_tier_promote);
8611
       // a promote added an object; if the agent is idle, re-evaluate its mode
8612   if (agent_state &&
8613       agent_state->is_idle())
8614     agent_choose_mode();
8615 }
8616
8617 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8618 {
8619   dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8620            << " from " << cop->src << " " << cop->oloc
8621            << " v" << cop->results.user_version << dendl;
8622
8623   // cancel objecter op, if we can
8624   if (cop->objecter_tid) {
8625     osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8626     cop->objecter_tid = 0;
8627     if (cop->objecter_tid2) {
8628       osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8629       cop->objecter_tid2 = 0;
8630     }
8631   }
8632
8633   copy_ops.erase(cop->obc->obs.oi.soid);
8634   cop->obc->stop_block();
8635
8636   kick_object_context_blocked(cop->obc);
8637   cop->results.should_requeue = requeue;
8638   CopyCallbackResults result(-ECANCELED, &cop->results);
8639   cop->cb->complete(result);
8640
8641   // There may still be an objecter callback referencing this copy op.
8642   // That callback will not need the obc since it's been canceled, and
8643   // we need the obc reference to go away prior to flush.
8644   cop->obc = ObjectContextRef();
8645 }
8646
8647 void PrimaryLogPG::cancel_copy_ops(bool requeue)
8648 {
8649   dout(10) << __func__ << dendl;
8650   map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8651   while (p != copy_ops.end()) {
8652     // requeue this op? can I queue up all of them?
8653     cancel_copy((p++)->second, requeue);
8654   }
8655 }
8656
8657
8658 // ========================================================================
8659 // flush
8660 //
8661 // Flush a dirty object in the cache tier by writing it back to the
8662 // base tier.  The sequence looks like:
8663 //
8664 //  * send a copy-from operation to the base tier to copy the current
8665 //    version of the object
8666 //  * base tier will pull the object via (perhaps multiple) copy-get(s)
8667 //  * on completion, we check if the object has been modified.  if so,
8668 //    just reply with -EAGAIN.
8669 //  * try to take a write lock so we can clear the dirty flag.  if this
8670 //    fails, wait and retry
8671 //  * start a repop that clears the bit.
8672 //
8673 // If we have to wait, we will retry by coming back through the
8674 // start_flush method.  We check if a flush is already in progress
8675 // and, if so, try to finish it by rechecking the version and trying
8676 // to clear the dirty bit.
8677 //
8678 // In order for the cache-flush (a write op) to not block the copy-get
8679 // from reading the object, the client *must* set the SKIPRWLOCKS
8680 // flag.
8681 //
8682 // NOTE: normally writes are strictly ordered for the client, but
8683 // flushes are special in that they can be reordered with respect to
8684 // other writes.  In particular, we can't have a flush request block
8685 // an update to the cache pool object!
8686
8687 struct C_Flush : public Context {
8688   PrimaryLogPGRef pg;
8689   hobject_t oid;
8690   epoch_t last_peering_reset;
8691   ceph_tid_t tid;
8692   utime_t start;
8693   C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8694     : pg(p), oid(o), last_peering_reset(lpr),
8695       tid(0), start(ceph_clock_now())
8696   {}
8697   void finish(int r) override {
8698     if (r == -ECANCELED)
8699       return;
8700     pg->lock();
8701     if (last_peering_reset == pg->get_last_peering_reset()) {
8702       pg->finish_flush(oid, tid, r);
8703       pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8704     }
8705     pg->unlock();
8706   }
8707 };
8708
/**
 * Start flushing a cache-tier object to the base tier (pool.info.tier_of),
 * or join / retry a flush that is already tracked in flush_ops.
 *
 * @param op        op that requested the flush; may be null (it is tested
 *                  before use below)
 * @param obc       object context of the object to flush
 * @param blocking  when true, obc->start_block() is called and the FlushOp
 *                  is marked blocking
 * @param pmissing  if non-null, set to the older clone that is missing
 *                  when -ENOENT is returned
 * @param on_flush  optional callback moved into the new FlushOp
 * @return -ENOENT if the next older clone is missing, -EBUSY if that clone
 *         is dirty, the result of try_flush_mark_clean() when the same op
 *         is retrying an existing flush, -EAGAIN when piggybacking on an
 *         existing flush of the same version, and -EINPROGRESS once the
 *         write has been handed to the objecter.
 */
8709 int PrimaryLogPG::start_flush(
8710   OpRequestRef op, ObjectContextRef obc,
8711   bool blocking, hobject_t *pmissing,
8712   boost::optional<std::function<void()>> &&on_flush)
8713 {
8714   const object_info_t& oi = obc->obs.oi;
8715   const hobject_t& soid = oi.soid;
8716   dout(10) << __func__ << " " << soid
8717            << " v" << oi.version
8718            << " uv" << oi.user_version
8719            << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8720            << dendl;
8721
8722   // get a filtered snapset, need to remove removed snaps
8723   SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8724
8725   // verify that the next older clone (if any) is neither missing nor dirty
8726   {
8727     dout(20) << " snapset " << snapset << dendl;
8728     vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
         // walk from the end of the clone list, skipping clones >= soid.snap
8729     while (p != snapset.clones.rend() && *p >= soid.snap)
8730       ++p;
8731     if (p != snapset.clones.rend()) {
8732       hobject_t next = soid;
8733       next.snap = *p;
8734       assert(next.snap < soid.snap);
8735       if (pg_log.get_missing().is_missing(next)) {
8736         dout(10) << __func__ << " missing clone is " << next << dendl;
8737         if (pmissing)
8738           *pmissing = next;
8739         return -ENOENT;
8740       }
8741       ObjectContextRef older_obc = get_object_context(next, false);
8742       if (older_obc) {
8743         dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8744                  << dendl;
8745         if (older_obc->obs.oi.is_dirty()) {
8746           dout(10) << __func__ << " next oldest clone is dirty: "
8747                    << older_obc->obs.oi << dendl;
8748           return -EBUSY;
8749         }
8750       } else {
8751         dout(20) << __func__ << " next oldest clone " << next
8752                  << " is not present; implicitly clean" << dendl;
8753       }
8754     } else {
8755       dout(20) << __func__ << " no older clones" << dendl;
8756     }
8757   }
8758
8759   if (blocking)
8760     obc->start_block();
8761
       // is there already a flush in flight for this object?
8762   map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8763   if (p != flush_ops.end()) {
8764     FlushOpRef fop = p->second;
8765     if (fop->op == op) {
8766       // we couldn't take the write lock on a cache-try-flush before;
8767       // now we are trying again for the lock.
8768       return try_flush_mark_clean(fop);
8769     }
8770     if (fop->flushed_version == obc->obs.oi.user_version &&
8771         (fop->blocking || !blocking)) {
8772       // nonblocking can join anything
8773       // blocking can only join a blocking flush
8774       dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8775       if (op)
8776         fop->dup_ops.push_back(op);
8777       return -EAGAIN;   // clean up this ctx; op will retry later
8778     }
8779
8780     // cancel current flush since it will fail anyway, or because we
8781     // are blocking and the existing flush is nonblocking.
8782     dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8783     if (fop->op)
8784       osd->reply_op_error(fop->op, -EBUSY);
8785     while (!fop->dup_ops.empty()) {
8786       osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8787       fop->dup_ops.pop_front();
8788     }
8789     cancel_flush(fop, false);
8790   }
8791
8792   /**
8793    * In general, we need to send a delete and a copyfrom.
8794    * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8795    * where 4 is marked as clean.  To flush 10, we have to:
8796    * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8797    * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8798    *
8799    * There is a complicating case.  Suppose there had been a clone 7
8800    * for snaps [7, 6] which has been trimmed since they no longer exist.
8801    * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head.  When we submit
8802    * the delete, the snap will be promoted to 5, and the head will become
8803    * a snapdir.  When the copy-from goes through, we'll end up with
8804    * 8:[8,4,3,2]:[4(4,3,2)]+head.
8805    *
8806    * Another complication is the case where there is an interval change
8807    * after doing the delete and the flush but before marking the object
8808    * clean.  We'll happily delete head and then recreate it at the same
8809    * sequence number, which works out ok.
8810    */
8811
       // snapc:  snap context used for the flush write itself
       // dsnapc: snap context used for the preceding delete, if one is sent
8812   SnapContext snapc, dsnapc;
8813   if (snapset.seq != 0) {
8814     if (soid.snap == CEPH_NOSNAP) {
8815       snapc.seq = snapset.seq;
8816       snapc.snaps = snapset.snaps;
8817     } else {
8818       snapid_t min_included_snap;
8819       if (snapset.is_legacy()) {
8820         min_included_snap = oi.legacy_snaps.back();
8821       } else {
8822         auto p = snapset.clone_snaps.find(soid.snap);
8823         assert(p != snapset.clone_snaps.end());
8824         min_included_snap = p->second.back();
8825       }
8826       snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8827     }
8828
         // first clone older than soid.snap, searching from the end of the
         // clone list; stays 0 if there is none
8829     snapid_t prev_snapc = 0;
8830     for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8831          citer != snapset.clones.rend();
8832          ++citer) {
8833       if (*citer < soid.snap) {
8834         prev_snapc = *citer;
8835         break;
8836       }
8837     }
8838
8839     dsnapc = snapset.get_ssc_as_of(prev_snapc);
8840   }
8841
8842   object_locator_t base_oloc(soid);
8843   base_oloc.pool = pool.info.tier_of;
8844
       // send the delete (step 1 in the comment above) only when the delete
       // snap context is older than the one used for the flush
8845   if (dsnapc.seq < snapc.seq) {
8846     ObjectOperation o;
8847     o.remove();
8848     osd->objecter->mutate(
8849       soid.oid,
8850       base_oloc,
8851       o,
8852       dsnapc,
8853       ceph::real_clock::from_ceph_timespec(oi.mtime),
8854       (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8855        CEPH_OSD_FLAG_ENFORCE_SNAPC),
8856       NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8857   }
8858
8859   FlushOpRef fop(std::make_shared<FlushOp>());
8860   fop->obc = obc;
8861   fop->flushed_version = oi.user_version;
8862   fop->blocking = blocking;
8863   fop->on_flush = std::move(on_flush);
8864   fop->op = op;
8865
       // a whiteout is flushed as a removal; anything else as a copy-from
8866   ObjectOperation o;
8867   if (oi.is_whiteout()) {
8868     fop->removal = true;
8869     o.remove();
8870   } else {
8871     object_locator_t oloc(soid);
8872     o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8873                 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8874                 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8875                 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8876                 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8877                 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8878
8879     // hint (FADVISE_DONTNEED) that the base tier need not cache this data afterwards
8880     if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8881       o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8882   }
8883   C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8884
8885   ceph_tid_t tid = osd->objecter->mutate(
8886     soid.oid, base_oloc, o, snapc,
8887     ceph::real_clock::from_ceph_timespec(oi.mtime),
8888     CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8889     new C_OnFinisher(fin,
8890                      &osd->objecter_finisher));
8891   /* we're under the pg lock and fin->finish() is grabbing that */
8892   fin->tid = tid;
8893   fop->objecter_tid = tid;
8894
       // track the flush and account for it in the PG stats
8895   flush_ops[soid] = fop;
8896   info.stats.stats.sum.num_flush++;
8897   info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8898   return -EINPROGRESS;
8899 }
8900
8901 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8902 {
8903   dout(10) << __func__ << " " << oid << " tid " << tid
8904            << " " << cpp_strerror(r) << dendl;
8905   map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8906   if (p == flush_ops.end()) {
8907     dout(10) << __func__ << " no flush_op found" << dendl;
8908     return;
8909   }
8910   FlushOpRef fop = p->second;
8911   if (tid != fop->objecter_tid) {
8912     dout(10) << __func__ << " tid " << tid << " != fop " << fop
8913              << " tid " << fop->objecter_tid << dendl;
8914     return;
8915   }
8916   ObjectContextRef obc = fop->obc;
8917   fop->objecter_tid = 0;
8918
8919   if (r < 0 && !(r == -ENOENT && fop->removal)) {
8920     if (fop->op)
8921       osd->reply_op_error(fop->op, -EBUSY);
8922     if (fop->blocking) {
8923       obc->stop_block();
8924       kick_object_context_blocked(obc);
8925     }
8926
8927     if (!fop->dup_ops.empty()) {
8928       dout(20) << __func__ << " requeueing dups" << dendl;
8929       requeue_ops(fop->dup_ops);
8930     }
8931     if (fop->on_flush) {
8932       (*(fop->on_flush))();
8933       fop->on_flush = boost::none;
8934     }
8935     flush_ops.erase(oid);
8936     return;
8937   }
8938
8939   r = try_flush_mark_clean(fop);
8940   if (r == -EBUSY && fop->op) {
8941     osd->reply_op_error(fop->op, r);
8942   }
8943 }
8944
/**
 * Try to finish a flush by clearing the object's DIRTY flag (or, for a
 * flush without an op, by evicting the object).  Called from
 * finish_flush() and, for a retrying op, from start_flush().
 *
 * @param fop  the flush being completed
 * @return -EBUSY if the object no longer exists or its user_version
 *         differs from the flushed version; -EAGAIN if the op(s) were
 *         requeued because of scrub or write-lock contention; -ECANCELED
 *         if an op-less flush was canceled for the same reasons; 0 if the
 *         object was evicted; -EINPROGRESS once the repop that clears the
 *         DIRTY flag has been submitted.
 */
8945 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8946 {
8947   ObjectContextRef obc = fop->obc;
8948   const hobject_t& oid = obc->obs.oi.soid;
8949
8950   if (fop->blocking) {
8951     obc->stop_block();
8952     kick_object_context_blocked(obc);
8953   }
8954
       // the flush is stale if the object changed or disappeared meanwhile
8955   if (fop->flushed_version != obc->obs.oi.user_version ||
8956       !obc->obs.exists) {
8957     if (obc->obs.exists)
8958       dout(10) << __func__ << " flushed_version " << fop->flushed_version
8959                << " != current " << obc->obs.oi.user_version
8960                << dendl;
8961     else
8962       dout(10) << __func__ << " object no longer exists" << dendl;
8963
8964     if (!fop->dup_ops.empty()) {
8965       dout(20) << __func__ << " requeueing dups" << dendl;
8966       requeue_ops(fop->dup_ops);
8967     }
8968     if (fop->on_flush) {
8969       (*(fop->on_flush))();
8970       fop->on_flush = boost::none;
8971     }
8972     flush_ops.erase(oid);
8973     if (fop->blocking)
8974       osd->logger->inc(l_osd_tier_flush_fail);
8975     else
8976       osd->logger->inc(l_osd_tier_try_flush_fail);
8977     return -EBUSY;
8978   }
8979
       // a non-blocking flush gives way to scrub: requeue if there is an op
       // to retry with, otherwise cancel the flush
8980   if (!fop->blocking &&
8981       scrubber.write_blocked_by_scrub(oid)) {
8982     if (fop->op) {
8983       dout(10) << __func__ << " blocked by scrub" << dendl;
8984       requeue_op(fop->op);
8985       requeue_ops(fop->dup_ops);
8986       return -EAGAIN;    // will retry
8987     } else {
8988       osd->logger->inc(l_osd_tier_try_flush_fail);
8989       cancel_flush(fop, false);
8990       return -ECANCELED;
8991     }
8992   }
8993
8994   // successfully flushed, can we evict this object?
       // NOTE(review): agent_state is dereferenced here without a null
       // check, whereas start_flush() tests it first -- presumably a flush
       // without an op can only originate from the tiering agent; confirm.
8995   if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
8996       agent_maybe_evict(obc, true)) {
8997     osd->logger->inc(l_osd_tier_clean);
8998     if (fop->on_flush) {
8999       (*(fop->on_flush))();
9000       fop->on_flush = boost::none;
9001     }
9002     flush_ops.erase(oid);
9003     return 0;
9004   }
9005
9006   dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
9007   OpContextUPtr ctx = simple_opc_create(fop->obc);
9008
9009   // successfully flushed; can we clear the dirty bit?
9010   // try to take the write lock manually through the new ctx's
9011   // lock_manager.
9012   if (ctx->lock_manager.get_lock_type(
9013         ObjectContext::RWState::RWWRITE,
9014         oid,
9015         obc,
9016         fop->op)) {
9017     dout(20) << __func__ << " took write lock" << dendl;
9018   } else if (fop->op) {
9019     dout(10) << __func__ << " waiting on write lock" << dendl;
9020     close_op_ctx(ctx.release());
9021     requeue_op(fop->op);
9022     requeue_ops(fop->dup_ops);
9023     return -EAGAIN;    // will retry
9024   } else {
9025     dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
9026     close_op_ctx(ctx.release());
9027     osd->logger->inc(l_osd_tier_try_flush_fail);
9028     cancel_flush(fop, false);
9029     return -ECANCELED;
9030   }
9031
       // from here on the on_flush callback is owned by the ctx
9032   if (fop->on_flush) {
9033     ctx->register_on_finish(*(fop->on_flush));
9034     fop->on_flush = boost::none;
9035   }
9036
9037   ctx->at_version = get_next_version();
9038
       // the new object state is the current one minus the DIRTY flag
9039   ctx->new_obs = obc->obs;
9040   ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
9041   --ctx->delta_stats.num_objects_dirty;
9042
9043   finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
9044
9045   osd->logger->inc(l_osd_tier_clean);
9046
       // requeue the requesting op (first) and any duplicates
9047   if (!fop->dup_ops.empty() || fop->op) {
9048     dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
9049     list<OpRequestRef> ls;
9050     if (fop->op)
9051       ls.push_back(fop->op);
9052     ls.splice(ls.end(), fop->dup_ops);
9053     requeue_ops(ls);
9054   }
9055
9056   simple_opc_submit(std::move(ctx));
9057
9058   flush_ops.erase(oid);
9059
9060   if (fop->blocking)
9061     osd->logger->inc(l_osd_tier_flush);
9062   else
9063     osd->logger->inc(l_osd_tier_try_flush);
9064
9065   return -EINPROGRESS;
9066 }
9067
9068 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
9069 {
9070   dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
9071            << fop->objecter_tid << dendl;
9072   if (fop->objecter_tid) {
9073     osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
9074     fop->objecter_tid = 0;
9075   }
9076   if (fop->blocking) {
9077     fop->obc->stop_block();
9078     kick_object_context_blocked(fop->obc);
9079   }
9080   if (requeue) {
9081     if (fop->op)
9082       requeue_op(fop->op);
9083     requeue_ops(fop->dup_ops);
9084   }
9085   if (fop->on_flush) {
9086     (*(fop->on_flush))();
9087     fop->on_flush = boost::none;
9088   }
9089   flush_ops.erase(fop->obc->obs.oi.soid);
9090 }
9091
9092 void PrimaryLogPG::cancel_flush_ops(bool requeue)
9093 {
9094   dout(10) << __func__ << dendl;
9095   map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
9096   while (p != flush_ops.end()) {
9097     cancel_flush((p++)->second, requeue);
9098   }
9099 }
9100
9101 bool PrimaryLogPG::is_present_clone(hobject_t coid)
9102 {
9103   if (!pool.info.allow_incomplete_clones())
9104     return true;
9105   if (is_missing_object(coid))
9106     return true;
9107   ObjectContextRef obc = get_object_context(coid, false);
9108   return obc && obc->obs.exists;
9109 }
9110
9111 // ========================================================================
9112 // rep op gather
9113
9114 class C_OSD_RepopApplied : public Context {
9115   PrimaryLogPGRef pg;
9116   boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9117 public:
9118   C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9119   : pg(pg), repop(repop) {}
9120   void finish(int) override {
9121     pg->repop_all_applied(repop.get());
9122   }
9123 };
9124
9125
9126 void PrimaryLogPG::repop_all_applied(RepGather *repop)
9127 {
9128   dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
9129            << dendl;
9130   assert(!repop->applies_with_commit);
9131   repop->all_applied = true;
9132   if (!repop->rep_aborted) {
9133     eval_repop(repop);
9134   }
9135 }
9136
9137 class C_OSD_RepopCommit : public Context {
9138   PrimaryLogPGRef pg;
9139   boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
9140 public:
9141   C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
9142     : pg(pg), repop(repop) {}
9143   void finish(int) override {
9144     pg->repop_all_committed(repop.get());
9145   }
9146 };
9147
9148 void PrimaryLogPG::repop_all_committed(RepGather *repop)
9149 {
9150   dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
9151            << dendl;
9152   repop->all_committed = true;
9153   if (repop->applies_with_commit) {
9154     assert(!repop->all_applied);
9155     repop->all_applied = true;
9156   }
9157
9158   if (!repop->rep_aborted) {
9159     if (repop->v != eversion_t()) {
9160       last_update_ondisk = repop->v;
9161       last_complete_ondisk = repop->pg_local_last_complete;
9162     }
9163     eval_repop(repop);
9164   }
9165 }
9166
9167 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
9168 {
9169   dout(10) << "op_applied version " << applied_version << dendl;
9170   if (applied_version == eversion_t())
9171     return;
9172   assert(applied_version > last_update_applied);
9173   assert(applied_version <= info.last_update);
9174   last_update_applied = applied_version;
9175   if (is_primary()) {
9176     if (scrubber.active) {
9177       if (last_update_applied >= scrubber.subset_last_update) {
9178         if (ops_blocked_by_scrub()) {
9179           requeue_scrub(true);
9180         } else {
9181           requeue_scrub(false);
9182         }
9183
9184       }
9185     } else {
9186       assert(scrubber.start == scrubber.end);
9187     }
9188   } else {
9189     if (scrubber.active_rep_scrub) {
9190       if (last_update_applied >= static_cast<const MOSDRepScrub*>(
9191             scrubber.active_rep_scrub->get_req())->scrub_to) {
9192         osd->enqueue_back(
9193           info.pgid,
9194           PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
9195         scrubber.active_rep_scrub = OpRequestRef();
9196       }
9197     }
9198   }
9199 }
9200
9201 void PrimaryLogPG::eval_repop(RepGather *repop)
9202 {
9203   const MOSDOp *m = NULL;
9204   if (repop->op)
9205     m = static_cast<const MOSDOp *>(repop->op->get_req());
9206
9207   if (m)
9208     dout(10) << "eval_repop " << *repop
9209              << (repop->rep_done ? " DONE" : "")
9210              << dendl;
9211   else
9212     dout(10) << "eval_repop " << *repop << " (no op)"
9213              << (repop->rep_done ? " DONE" : "")
9214              << dendl;
9215
9216   if (repop->rep_done)
9217     return;
9218
9219   // ondisk?
9220   if (repop->all_committed) {
9221     dout(10) << " commit: " << *repop << dendl;
9222     for (auto p = repop->on_committed.begin();
9223          p != repop->on_committed.end();
9224          repop->on_committed.erase(p++)) {
9225       (*p)();
9226     }
9227     // send dup commits, in order
9228     if (waiting_for_ondisk.count(repop->v)) {
9229       assert(waiting_for_ondisk.begin()->first == repop->v);
9230       for (list<pair<OpRequestRef, version_t> >::iterator i =
9231              waiting_for_ondisk[repop->v].begin();
9232            i != waiting_for_ondisk[repop->v].end();
9233            ++i) {
9234         osd->reply_op_error(i->first, repop->r, repop->v,
9235                             i->second);
9236       }
9237       waiting_for_ondisk.erase(repop->v);
9238     }
9239   }
9240
9241   // applied?
9242   if (repop->all_applied) {
9243     if (repop->applies_with_commit) {
9244       assert(repop->on_applied.empty());
9245     }
9246     dout(10) << " applied: " << *repop << " " << dendl;
9247     for (auto p = repop->on_applied.begin();
9248          p != repop->on_applied.end();
9249          repop->on_applied.erase(p++)) {
9250       (*p)();
9251     }
9252   }
9253
9254   // done.
9255   if (repop->all_applied && repop->all_committed) {
9256     repop->rep_done = true;
9257
9258     publish_stats_to_osd();
9259     calc_min_last_complete_ondisk();
9260
9261     dout(10) << " removing " << *repop << dendl;
9262     assert(!repop_queue.empty());
9263     dout(20) << "   q front is " << *repop_queue.front() << dendl;
9264     if (repop_queue.front() != repop) {
9265       if (!repop->applies_with_commit) {
9266         dout(0) << " removing " << *repop << dendl;
9267         dout(0) << "   q front is " << *repop_queue.front() << dendl;
9268         assert(repop_queue.front() == repop);
9269       }
9270     } else {
9271       RepGather *to_remove = nullptr;
9272       while (!repop_queue.empty() &&
9273              (to_remove = repop_queue.front())->rep_done) {
9274         repop_queue.pop_front();
9275         for (auto p = to_remove->on_success.begin();
9276              p != to_remove->on_success.end();
9277              to_remove->on_success.erase(p++)) {
9278           (*p)();
9279         }
9280         remove_repop(to_remove);
9281       }
9282     }
9283   }
9284 }
9285
/**
 * Submit a prepared OpContext to the PG backend on behalf of `repop`.
 *
 * Records ctx->at_version on the repop, advances peer_info for every other
 * shard in actingbackfill, takes the ondisk write locks on the objects
 * involved, appends ctx->log to projected_log and finally calls
 * pgbackend->submit_transaction() with the applied/commit callbacks.
 *
 * @param repop  gather object tracking this operation
 * @param ctx    op context; its op_t is moved into the backend call
 */
9286 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9287 {
9288   FUNCTRACE();
9289   const hobject_t& soid = ctx->obs->oi.soid;
9290   dout(7) << "issue_repop rep_tid " << repop->rep_tid
9291           << " o " << soid
9292           << dendl;
9293
9294   repop->v = ctx->at_version;
9295   if (ctx->at_version > eversion_t()) {
9296     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9297          i != actingbackfill.end();
9298          ++i) {
9299       if (*i == get_primary()) continue;
9300       pg_info_t &pinfo = peer_info[*i];
9301       // keep peer_info up to date
           // (last_complete only advances for a peer that was fully caught up)
9302       if (pinfo.last_complete == pinfo.last_update)
9303         pinfo.last_complete = ctx->at_version;
9304       pinfo.last_update = ctx->at_version;
9305     }
9306   }
9307
9308   ctx->obc->ondisk_write_lock();
9309
       // the snapset obc is locked separately only when it is a different
       // object from the one being written
9310   bool unlock_snapset_obc = false;
9311   ctx->op_t->add_obc(ctx->obc);
9312   if (ctx->clone_obc) {
9313     ctx->clone_obc->ondisk_write_lock();
9314     ctx->op_t->add_obc(ctx->clone_obc);
9315   }
9316   if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9317       ctx->obc->obs.oi.soid) {
9318     ctx->snapset_obc->ondisk_write_lock();
9319     unlock_snapset_obc = true;
9320     ctx->op_t->add_obc(ctx->snapset_obc);
9321   }
9322
       // these contexts are handed to submit_transaction() below
9323   Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9324   Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9325   Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9326     ctx->obc,
9327     ctx->clone_obc,
9328     unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9329   if (!(ctx->log.empty())) {
9330     assert(ctx->at_version >= projected_last_update);
9331     projected_last_update = ctx->at_version;
9332   }
9333   for (auto &&entry: ctx->log) {
9334     projected_log.add(entry);
9335   }
9336   pgbackend->submit_transaction(
9337     soid,
9338     ctx->delta_stats,
9339     ctx->at_version,
9340     std::move(ctx->op_t),
9341     pg_trim_to,
9342     min_last_complete_ondisk,
9343     ctx->log,
9344     ctx->updated_hset_history,
9345     onapplied_sync,
9346     on_all_applied,
9347     on_all_commit,
9348     repop->rep_tid,
9349     ctx->reqid,
9350     ctx->op);
9351 }
9352
9353 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9354   OpContext *ctx, ObjectContextRef obc,
9355   ceph_tid_t rep_tid)
9356 {
9357   if (ctx->op)
9358     dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9359   else
9360     dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9361
9362   RepGather *repop = new RepGather(
9363     ctx, rep_tid, info.last_complete, false);
9364
9365   repop->start = ceph_clock_now();
9366
9367   repop_queue.push_back(&repop->queue_item);
9368   repop->get();
9369
9370   osd->logger->inc(l_osd_op_wip);
9371
9372   dout(10) << __func__ << ": " << *repop << dendl;
9373   return repop;
9374 }
9375
9376 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9377   eversion_t version,
9378   int r,
9379   ObcLockManager &&manager,
9380   OpRequestRef &&op,
9381   boost::optional<std::function<void(void)> > &&on_complete)
9382 {
9383   RepGather *repop = new RepGather(
9384     std::move(manager),
9385     std::move(op),
9386     std::move(on_complete),
9387     osd->get_tid(),
9388     info.last_complete,
9389     true,
9390     r);
9391   repop->v = version;
9392
9393   repop->start = ceph_clock_now();
9394
9395   repop_queue.push_back(&repop->queue_item);
9396
9397   osd->logger->inc(l_osd_op_wip);
9398
9399   dout(10) << __func__ << ": " << *repop << dendl;
9400   return boost::intrusive_ptr<RepGather>(repop);
9401 }
9402
9403 void PrimaryLogPG::remove_repop(RepGather *repop)
9404 {
9405   dout(20) << __func__ << " " << *repop << dendl;
9406
9407   for (auto p = repop->on_finish.begin();
9408        p != repop->on_finish.end();
9409        repop->on_finish.erase(p++)) {
9410     (*p)();
9411   }
9412
9413   release_object_locks(
9414     repop->lock_manager);
9415   repop->put();
9416
9417   osd->logger->dec(l_osd_op_wip);
9418 }
9419
9420 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9421 {
9422   dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9423   ceph_tid_t rep_tid = osd->get_tid();
9424   osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9425   OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, nullptr, obc, this));
9426   ctx->op_t.reset(new PGTransaction());
9427   ctx->mtime = ceph_clock_now();
9428   return ctx;
9429 }
9430
9431 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9432 {
9433   RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9434   dout(20) << __func__ << " " << repop << dendl;
9435   issue_repop(repop, ctx.get());
9436   eval_repop(repop);
9437   calc_trim_to();
9438   repop->put();
9439 }
9440
9441
/**
 * Merge `entries` into the local log and send them to every other shard
 * in actingbackfill.  Primary only (asserted).
 *
 * When the osdmap requires JEWEL or later, a repop is created to track
 * the update: peers get an MOSDPGUpdateLogMissing tagged with the repop's
 * tid, and the set of shards still to acknowledge is kept in
 * log_entry_update_waiting_on.  Otherwise peers get an MOSDPGLog and
 * `_on_complete`, if given, is registered on the local transaction.
 *
 * @param entries       log entries to merge and distribute
 * @param manager       lock manager moved into the repop (JEWEL+ path)
 * @param _on_complete  optional completion callback
 * @param op            originating op, moved into the repop (JEWEL+ path)
 * @param r             result code stored in the repop (JEWEL+ path)
 */
9442 void PrimaryLogPG::submit_log_entries(
9443   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9444   ObcLockManager &&manager,
9445   boost::optional<std::function<void(void)> > &&_on_complete,
9446   OpRequestRef op,
9447   int r)
9448 {
9449   dout(10) << __func__ << " " << entries << dendl;
9450   assert(is_primary());
9451
       // the last entry's version becomes the new projected_last_update
9452   eversion_t version;
9453   if (!entries.empty()) {
9454     assert(entries.rbegin()->version >= projected_last_update);
9455     version = projected_last_update = entries.rbegin()->version;
9456   }
9457
9458   boost::intrusive_ptr<RepGather> repop;
9459   boost::optional<std::function<void(void)> > on_complete;
9460   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9461     repop = new_repop(
9462       version,
9463       r,
9464       std::move(manager),
9465       std::move(op),
9466       std::move(_on_complete));
9467   } else {
9468     on_complete = std::move(_on_complete);
9469   }
9470
       // everything below runs inside the callback given to the backend's
       // call_write_ordered(); entries, repop and on_complete are captured
       // by copy
9471   pgbackend->call_write_ordered(
9472     [this, entries, repop, on_complete]() {
9473       ObjectStore::Transaction t;
9474       eversion_t old_last_update = info.last_update;
9475       merge_new_log_entries(entries, t);
9476
9477
9478       set<pg_shard_t> waiting_on;
9479       for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9480            i != actingbackfill.end();
9481            ++i) {
9482         pg_shard_t peer(*i);
9483         if (peer == pg_whoami) continue;
9484         assert(peer_missing.count(peer));
9485         assert(peer_info.count(peer));
9486         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9487           assert(repop);
9488           MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9489             entries,
9490             spg_t(info.pgid.pgid, i->shard),
9491             pg_whoami.shard,
9492             get_osdmap()->get_epoch(),
9493             last_peering_reset,
9494             repop->rep_tid);
9495           osd->send_message_osd_cluster(
9496             peer.osd, m, get_osdmap()->get_epoch());
9497           waiting_on.insert(peer);
9498         } else {
9499           MOSDPGLog *m = new MOSDPGLog(
9500             peer.shard, pg_whoami.shard,
9501             info.last_update.epoch,
9502             info);
9503           m->log.log = entries;
9504           m->log.tail = old_last_update;
9505           m->log.head = info.last_update;
9506           osd->send_message_osd_cluster(
9507             peer.osd, m, get_osdmap()->get_epoch());
9508         }
9509       }
9510       if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
             // NOTE(review): repop is dereferenced here without the
             // assert(repop) used in the loop above; it is only set when the
             // release check also passed before the lambda was created --
             // confirm the osdmap cannot change in between.
9511         ceph_tid_t rep_tid = repop->rep_tid;
             // the local shard also has to commit before the update is done
9512         waiting_on.insert(pg_whoami);
             // NOTE(review): repop is a by-copy capture of a non-mutable
             // lambda and therefore const here, so std::move(repop) copies
             // rather than moves.
9513         log_entry_update_waiting_on.insert(
9514           make_pair(
9515             rep_tid,
9516             LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9517             ));
             // On local commit: remove this shard from the waiting set and,
             // if it was the last one, complete the repop.  Skipped if the
             // PG has reset since `epoch`.
9518         struct OnComplete : public Context {
9519           PrimaryLogPGRef pg;
9520           ceph_tid_t rep_tid;
9521           epoch_t epoch;
9522           OnComplete(
9523             PrimaryLogPGRef pg,
9524             ceph_tid_t rep_tid,
9525             epoch_t epoch)
9526             : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9527           void finish(int) override {
9528             pg->lock();
9529             if (!pg->pg_has_reset_since(epoch)) {
9530               auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9531               assert(it != pg->log_entry_update_waiting_on.end());
9532               auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9533               assert(it2 != it->second.waiting_on.end());
9534               it->second.waiting_on.erase(it2);
9535               if (it->second.waiting_on.empty()) {
9536                 pg->repop_all_committed(it->second.repop.get());
9537                 pg->log_entry_update_waiting_on.erase(it);
9538               }
9539             }
9540             pg->unlock();
9541           }
9542         };
9543         t.register_on_commit(
9544           new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9545       } else {
9546         if (on_complete) {
               // Runs the caller's callback under the PG lock, unless the PG
               // has reset since `epoch`.
9547           struct OnComplete : public Context {
9548             PrimaryLogPGRef pg;
9549             std::function<void(void)> on_complete;
9550             epoch_t epoch;
9551             OnComplete(
9552               PrimaryLogPGRef pg,
9553               const std::function<void(void)> &on_complete,
9554               epoch_t epoch)
9555               : pg(pg),
                     // NOTE(review): the parameter is a const reference, so
                     // this std::move still copies.
9556                 on_complete(std::move(on_complete)),
9557                 epoch(epoch) {}
9558             void finish(int) override {
9559               pg->lock();
9560               if (!pg->pg_has_reset_since(epoch))
9561                 on_complete();
9562               pg->unlock();
9563             }
9564           };
9565           t.register_on_complete(
9566             new OnComplete{
9567               this, *on_complete, get_osdmap()->get_epoch()
9568                 });
9569         }
9570       }
9571       t.register_on_applied(
9572         new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9573       int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9574       assert(r == 0);
9575     });
9576 }
9577
9578 void PrimaryLogPG::cancel_log_updates()
9579 {
9580   // get rid of all the LogUpdateCtx so their references to repops are
9581   // dropped
9582   log_entry_update_waiting_on.clear();
9583 }
9584
9585 // -------------------------------------------------------
9586
9587 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9588 {
9589   pair<hobject_t, ObjectContextRef> i;
9590   while (object_contexts.get_next(i.first, &i)) {
9591     ObjectContextRef obc(i.second);
9592     get_obc_watchers(obc, pg_watchers);
9593   }
9594 }
9595
9596 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9597 {
9598   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9599          obc->watchers.begin();
9600         j != obc->watchers.end();
9601         ++j) {
9602     obj_watch_item_t owi;
9603
9604     owi.obj = obc->obs.oi.soid;
9605     owi.wi.addr = j->second->get_peer_addr();
9606     owi.wi.name = j->second->get_entity();
9607     owi.wi.cookie = j->second->get_cookie();
9608     owi.wi.timeout_seconds = j->second->get_timeout();
9609
9610     dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9611       << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9612
9613     pg_watchers.push_back(owi);
9614   }
9615 }
9616
9617 void PrimaryLogPG::check_blacklisted_watchers()
9618 {
9619   dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9620   pair<hobject_t, ObjectContextRef> i;
9621   while (object_contexts.get_next(i.first, &i))
9622     check_blacklisted_obc_watchers(i.second);
9623 }
9624
9625 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9626 {
9627   dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9628   for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9629          obc->watchers.begin();
9630         k != obc->watchers.end();
9631         ) {
9632     //Advance iterator now so handle_watch_timeout() can erase element
9633     map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9634     dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9635     entity_addr_t ea = j->second->get_peer_addr();
9636     dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9637     if (get_osdmap()->is_blacklisted(ea)) {
9638       dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9639       assert(j->second->get_pg() == this);
9640       j->second->unregister_cb();
9641       handle_watch_timeout(j->second);
9642     }
9643   }
9644 }
9645
9646 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9647 {
9648   assert(is_active());
9649   assert((recovering.count(obc->obs.oi.soid) ||
9650           !is_missing_object(obc->obs.oi.soid)) ||
9651          (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9652           pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9653             pg_log_entry_t::LOST_REVERT &&
9654           pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9655             obc->obs.oi.version));
9656
9657   dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9658   assert(obc->watchers.empty());
9659   // populate unconnected_watchers
9660   for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9661         obc->obs.oi.watchers.begin();
9662        p != obc->obs.oi.watchers.end();
9663        ++p) {
9664     utime_t expire = info.stats.last_became_active;
9665     expire += p->second.timeout_seconds;
9666     dout(10) << "  unconnected watcher " << p->first << " will expire " << expire << dendl;
9667     WatchRef watch(
9668       Watch::makeWatchRef(
9669         this, osd, obc, p->second.timeout_seconds, p->first.first,
9670         p->first.second, p->second.addr));
9671     watch->disconnect();
9672     obc->watchers.insert(
9673       make_pair(
9674         make_pair(p->first.first, p->first.second),
9675         watch));
9676   }
9677   // Look for watchers from blacklisted clients and drop
9678   check_blacklisted_obc_watchers(obc);
9679 }
9680
/**
 * Remove a timed-out watch from its object by submitting an internal
 * write.
 *
 * Nothing happens when the PG is not active.  If the object is degraded
 * or backfilling, or writes to it are blocked by scrub, the watch's
 * delayed callback is queued on the corresponding wait list and the
 * function returns without modifying the object (presumably the callback
 * re-runs this handler later -- confirm against Watch::get_delayed_cb()).
 * Otherwise the watcher is erased from the new object_info_t, a MODIFY
 * log entry is appended, the re-encoded info is written to OI_ATTR and
 * the op is submitted through simple_opc_submit().
 *
 * @param watch  the watch being timed out
 */
void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
{
  ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
  dout(10) << "handle_watch_timeout obc " << obc << dendl;

  if (!is_active()) {
    dout(10) << "handle_watch_timeout not active, no-op" << dendl;
    return;
  }
  // Defer while the object is degraded or being backfilled.
  if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
    callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
      watch->get_delayed_cb()
      );
    dout(10) << "handle_watch_timeout waiting for degraded on obj "
             << obc->obs.oi.soid
             << dendl;
    return;
  }

  // Defer while scrub is blocking writes to the object.
  if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
    dout(10) << "handle_watch_timeout waiting for scrub on obj "
             << obc->obs.oi.soid
             << dendl;
    scrubber.add_callback(
      watch->get_delayed_cb() // This callback!
      );
    return;
  }

  OpContextUPtr ctx = simple_opc_create(obc);
  ctx->at_version = get_next_version();

  // Drop this watcher from the object info that will be written out.
  object_info_t& oi = ctx->new_obs.oi;
  oi.watchers.erase(make_pair(watch->get_cookie(),
                              watch->get_entity()));

  // The in-memory disconnect is completed only once the op succeeds.
  list<watch_disconnect_t> watch_disconnects = {
    watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
  };
  ctx->register_on_success(
    [this, obc, watch_disconnects]() {
      complete_disconnect_watches(obc, watch_disconnects);
    });


  PGTransaction *t = ctx->op_t.get();
  // oi.version is read here before it is overwritten with at_version
  // below, so the log entry receives the pre-update version.
  ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
                                    ctx->at_version,
                                    oi.version,
                                    0,
                                    osd_reqid_t(), ctx->mtime, 0));

  oi.prior_version = obc->obs.oi.version;
  oi.version = ctx->at_version;
  bufferlist bl;
  ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
  t->setattr(obc->obs.oi.soid, OI_ATTR, bl);

  // apply new object state.
  ctx->obc->obs = ctx->new_obs;

  // no ctx->delta_stats
  simple_opc_submit(std::move(ctx));
}
9745
9746 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9747                                                      SnapSetContext *ssc)
9748 {
9749   ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9750   assert(obc->destructor_callback == NULL);
9751   obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9752   obc->obs.oi = oi;
9753   obc->obs.exists = false;
9754   obc->ssc = ssc;
9755   if (ssc)
9756     register_snapset_context(ssc);
9757   dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9758   if (is_active())
9759     populate_obc_watchers(obc);
9760   return obc;
9761 }
9762
/**
 * Return the object context for soid, from the cache or from stored
 * attributes.
 *
 * On a cache miss the encoded object_info_t is taken from attrs (when
 * given) or read through pgbackend.  If that read fails, a fresh context
 * is created when can_create is set; otherwise an empty ref is returned.
 *
 * @param soid        object to look up
 * @param can_create  allow creating a context for an object whose
 *                    OI_ATTR cannot be read
 * @param attrs       optional attribute map; when non-null it must
 *                    contain OI_ATTR and is used instead of reading
 *                    through pgbackend
 * @return the context, or an empty ref when the object is absent and
 *         !can_create, when the stored object info fails to decode, or
 *         when no snapset context is attached to the result
 */
ObjectContextRef PrimaryLogPG::get_object_context(
  const hobject_t& soid,
  bool can_create,
  const map<string, bufferlist> *attrs)
{
  assert(
    attrs || !pg_log.get_missing().is_missing(soid) ||
    // or this is a revert... see recover_primary()
    (pg_log.get_log().objects.count(soid) &&
      pg_log.get_log().objects.find(soid)->second->op ==
      pg_log_entry_t::LOST_REVERT));
  ObjectContextRef obc = object_contexts.lookup(soid);
  // Count every lookup; hits are counted separately below.
  osd->logger->inc(l_osd_object_ctx_cache_total);
  if (obc) {
    osd->logger->inc(l_osd_object_ctx_cache_hit);
    dout(10) << __func__ << ": found obc in cache: " << obc
             << dendl;
  } else {
    dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
    // check disk
    bufferlist bv;
    if (attrs) {
      assert(attrs->count(OI_ATTR));
      bv = attrs->find(OI_ATTR)->second;
    } else {
      int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
      if (r < 0) {
        if (!can_create) {
          dout(10) << __func__ << ": no obc for soid "
                   << soid << " and !can_create"
                   << dendl;
          return ObjectContextRef();   // -ENOENT!
        }

        dout(10) << __func__ << ": no obc for soid "
                 << soid << " but can_create"
                 << dendl;
        // new object.
        object_info_t oi(soid);
        SnapSetContext *ssc = get_snapset_context(
          soid, true, 0, false);
        assert(ssc);
        obc = create_object_context(oi, ssc);
        dout(10) << __func__ << ": " << obc << " " << soid
                 << " " << obc->rwstate
                 << " oi: " << obc->obs.oi
                 << " ssc: " << obc->ssc
                 << " snapset: " << obc->ssc->snapset << dendl;
        return obc;
      }
    }

    // Decode the stored object info; any decode exception is reported
    // as "no context".
    object_info_t oi;
    try {
      bufferlist::iterator bliter = bv.begin();
      ::decode(oi, bliter);
    } catch (...) {
      dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
      return ObjectContextRef();   // -ENOENT!
    }

    assert(oi.soid.pool == (int64_t)info.pgid.pool());

    obc = object_contexts.lookup_or_create(oi.soid);
    obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
    obc->obs.oi = oi;
    obc->obs.exists = true;

    // attrs is only forwarded when soid.has_snapset() is true; otherwise
    // get_snapset_context() is left to read the snapset itself.
    obc->ssc = get_snapset_context(
      soid, true,
      soid.has_snapset() ? attrs : 0);

    if (is_active())
      populate_obc_watchers(obc);

    // Pools that require rollback also keep the full attribute set
    // in obc->attr_cache.
    if (pool.info.require_rollback()) {
      if (attrs) {
        obc->attr_cache = *attrs;
      } else {
        int r = pgbackend->objects_get_attrs(
          soid,
          &obc->attr_cache);
        assert(r == 0);
      }
    }

    dout(10) << __func__ << ": creating obc from disk: " << obc
             << dendl;
  }

  // XXX: Caller doesn't expect this
  if (obc->ssc == NULL) {
    derr << __func__ << ": obc->ssc not available, not returning context" << dendl;
    return ObjectContextRef();   // -ENOENT!
  }

  dout(10) << __func__ << ": " << obc << " " << soid
           << " " << obc->rwstate
           << " oi: " << obc->obs.oi
           << " exists: " << (int)obc->obs.exists
           << " ssc: " << obc->ssc
           << " snapset: " << obc->ssc->snapset << dendl;
  return obc;
}
9867
9868 void PrimaryLogPG::context_registry_on_change()
9869 {
9870   pair<hobject_t, ObjectContextRef> i;
9871   while (object_contexts.get_next(i.first, &i)) {
9872     ObjectContextRef obc(i.second);
9873     if (obc) {
9874       for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9875              obc->watchers.begin();
9876            j != obc->watchers.end();
9877            obc->watchers.erase(j++)) {
9878         j->second->discard();
9879       }
9880     }
9881   }
9882 }
9883
9884
/*
 * Resolve the object context that serves a request for oid.
 *
 * Three kinds of oid.snap are handled: CEPH_NOSNAP (the head),
 * CEPH_SNAPDIR (head or snapdir, whichever exists), and a snap id, which
 * is resolved through the SnapSet either to the head or to a clone.
 *
 * If we return an error, and set *pmissing, then promoting that
 * object may help.
 *
 * If we return -EAGAIN, we will always set *pmissing to the missing
 * object to wait for.
 *
 * If we return an error but do not set *pmissing, then we know the
 * object does not exist.
 *
 * Parameters:
 *   oid                 - object to look up; its pool must match this PG
 *   pobc                - receives the context on success (it is also
 *                         written on some of the failure paths)
 *   can_create          - forwarded to get_object_context() and
 *                         get_snapset_context()
 *   map_snapid_to_clone - treat oid.snap as an exact clone id (or as the
 *                         head when it is newer than snapset.seq) instead
 *                         of searching for the clone that covers it
 *   pmissing            - optional; see above
 *
 * Returns 0 on success, otherwise -ENOENT or -EAGAIN.
 */
int PrimaryLogPG::find_object_context(const hobject_t& oid,
                                      ObjectContextRef *pobc,
                                      bool can_create,
                                      bool map_snapid_to_clone,
                                      hobject_t *pmissing)
{
  FUNCTRACE();
  assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
  // want the head?
  if (oid.snap == CEPH_NOSNAP) {
    ObjectContextRef obc = get_object_context(oid, can_create);
    if (!obc) {
      if (pmissing)
        *pmissing = oid;
      return -ENOENT;
    }
    dout(10) << "find_object_context " << oid
       << " @" << oid.snap
       << " oi=" << obc->obs.oi
       << dendl;
    *pobc = obc;

    return 0;
  }

  hobject_t head = oid.get_head();

  // want the snapdir?
  if (oid.snap == CEPH_SNAPDIR) {
    // return head or snapdir, whichever exists.
    ObjectContextRef headobc = get_object_context(head, can_create);
    ObjectContextRef obc = headobc;
    if (!obc || !obc->obs.exists)
      obc = get_object_context(oid, can_create);
    if (!obc || !obc->obs.exists) {
      // if we have neither, we would want to promote the head.
      if (pmissing)
        *pmissing = head;
      if (pobc)
        *pobc = headobc; // may be null
      return -ENOENT;
    }
    dout(10) << "find_object_context " << oid
             << " @" << oid.snap
             << " oi=" << obc->obs.oi
             << dendl;
    *pobc = obc;

    // always populate ssc for SNAPDIR...
    if (!obc->ssc)
      obc->ssc = get_snapset_context(
        oid, true);
    return 0;
  }

  // we want a snap
  if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
    dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
    return -ENOENT;
  }

  // From here on this function holds one reference on ssc.  Every exit
  // path below either releases it with put_snapset_context() or stores
  // the pointer in obc->ssc.
  SnapSetContext *ssc = get_snapset_context(oid, can_create);
  if (!ssc || !(ssc->exists || can_create)) {
    dout(20) << __func__ << " " << oid << " no snapset" << dendl;
    if (pmissing)
      *pmissing = head;  // start by getting the head
    if (ssc)
      put_snapset_context(ssc);
    return -ENOENT;
  }

  if (map_snapid_to_clone) {
    dout(10) << "find_object_context " << oid << " @" << oid.snap
             << " snapset " << ssc->snapset
             << " map_snapid_to_clone=true" << dendl;
    if (oid.snap > ssc->snapset.seq) {
      // already must be readable
      ObjectContextRef obc = get_object_context(head, false);
      dout(10) << "find_object_context " << oid << " @" << oid.snap
               << " snapset " << ssc->snapset
               << " maps to head" << dendl;
      // *pobc is written even when the result is -ENOENT.
      *pobc = obc;
      put_snapset_context(ssc);
      return (obc && obc->obs.exists) ? 0 : -ENOENT;
    } else {
      // oid.snap must match a clone id exactly.
      vector<snapid_t>::const_iterator citer = std::find(
        ssc->snapset.clones.begin(),
        ssc->snapset.clones.end(),
        oid.snap);
      if (citer == ssc->snapset.clones.end()) {
        dout(10) << "find_object_context " << oid << " @" << oid.snap
                 << " snapset " << ssc->snapset
                 << " maps to nothing" << dendl;
        put_snapset_context(ssc);
        return -ENOENT;
      }

      dout(10) << "find_object_context " << oid << " @" << oid.snap
               << " snapset " << ssc->snapset
               << " maps to " << oid << dendl;

      if (pg_log.get_missing().is_missing(oid)) {
        dout(10) << "find_object_context " << oid << " @" << oid.snap
                 << " snapset " << ssc->snapset
                 << " " << oid << " is missing" << dendl;
        if (pmissing)
          *pmissing = oid;
        put_snapset_context(ssc);
        return -EAGAIN;
      }

      ObjectContextRef obc = get_object_context(oid, false);
      if (!obc || !obc->obs.exists) {
        dout(10) << "find_object_context " << oid << " @" << oid.snap
                 << " snapset " << ssc->snapset
                 << " " << oid << " is not present" << dendl;
        if (pmissing)
          *pmissing = oid;
        put_snapset_context(ssc);
        return -ENOENT;
      }
      dout(10) << "find_object_context " << oid << " @" << oid.snap
               << " snapset " << ssc->snapset
               << " " << oid << " HIT" << dendl;
      *pobc = obc;
      put_snapset_context(ssc);
      return 0;
    }
    ceph_abort(); //unreachable
  }

  dout(10) << "find_object_context " << oid << " @" << oid.snap
           << " snapset " << ssc->snapset << dendl;

  // head?
  if (oid.snap > ssc->snapset.seq) {
    if (ssc->snapset.head_exists) {
      // NOTE(review): obc is dereferenced below without a null check;
      // this relies on get_object_context() succeeding whenever
      // head_exists is set -- confirm.
      ObjectContextRef obc = get_object_context(head, false);
      dout(10) << "find_object_context  " << head
               << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
               << " -- HIT " << obc->obs
               << dendl;
      // Hand our ssc reference to the obc, or release it if the obc
      // already points at the same context.
      if (!obc->ssc)
        obc->ssc = ssc;
      else {
        assert(ssc == obc->ssc);
        put_snapset_context(ssc);
      }
      *pobc = obc;
      return 0;
    }
    dout(10) << "find_object_context  " << head
             << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
             << " but head dne -- DNE"
             << dendl;
    put_snapset_context(ssc);
    return -ENOENT;
  }

  // which clone would it be?
  // (the first entry of snapset.clones that is >= oid.snap)
  unsigned k = 0;
  while (k < ssc->snapset.clones.size() &&
         ssc->snapset.clones[k] < oid.snap)
    k++;
  if (k == ssc->snapset.clones.size()) {
    dout(10) << "find_object_context  no clones with last >= oid.snap "
             << oid.snap << " -- DNE" << dendl;
    put_snapset_context(ssc);
    return -ENOENT;
  }
  hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
                 info.pgid.pool(), oid.get_namespace());

  if (pg_log.get_missing().is_missing(soid)) {
    dout(20) << "find_object_context  " << soid << " missing, try again later"
             << dendl;
    if (pmissing)
      *pmissing = soid;
    put_snapset_context(ssc);
    return -EAGAIN;
  }

  ObjectContextRef obc = get_object_context(soid, false);
  if (!obc || !obc->obs.exists) {
    if (pmissing)
      *pmissing = soid;
    put_snapset_context(ssc);
    if (is_degraded_or_backfilling_object(soid)) {
      dout(20) << __func__ << " clone is degraded or backfilling " << soid << dendl;
      return -EAGAIN;
    } else {
      dout(20) << __func__ << " missing clone " << soid << dendl;
      return -ENOENT;
    }
  }

  // Hand our ssc reference to the obc, or release it if the obc already
  // points at the same context.
  if (!obc->ssc) {
    obc->ssc = ssc;
  } else {
    assert(obc->ssc == ssc);
    put_snapset_context(ssc);
  }
  ssc = 0;

  // clone
  dout(20) << "find_object_context  " << soid
           << " snapset " << obc->ssc->snapset
           << " legacy_snaps " << obc->obs.oi.legacy_snaps
           << dendl;
  // first/last are taken from the back/front of the clone's snap list;
  // only `first` takes part in the decision below, `last` is logged.
  snapid_t first, last;
  if (obc->ssc->snapset.is_legacy()) {
    first = obc->obs.oi.legacy_snaps.back();
    last = obc->obs.oi.legacy_snaps.front();
  } else {
    auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
    assert(p != obc->ssc->snapset.clone_snaps.end());
    first = p->second.back();
    last = p->second.front();
  }
  if (first <= oid.snap) {
    dout(20) << "find_object_context  " << soid << " [" << first << "," << last
             << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
    *pobc = obc;
    return 0;
  } else {
    dout(20) << "find_object_context  " << soid << " [" << first << "," << last
             << "] does not contain " << oid.snap << " -- DNE" << dendl;
    return -ENOENT;
  }
}
10125
10126 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
10127 {
10128   if (obc->ssc)
10129     put_snapset_context(obc->ssc);
10130 }
10131
10132 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
10133 {
10134   object_info_t& oi = obc->obs.oi;
10135
10136   dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
10137   object_stat_sum_t stat;
10138
10139   stat.num_bytes += oi.size;
10140
10141   if (oi.soid.snap != CEPH_SNAPDIR)
10142     stat.num_objects++;
10143   if (oi.is_dirty())
10144     stat.num_objects_dirty++;
10145   if (oi.is_whiteout())
10146     stat.num_whiteouts++;
10147   if (oi.is_omap())
10148     stat.num_objects_omap++;
10149   if (oi.is_cache_pinned())
10150     stat.num_objects_pinned++;
10151
10152   if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
10153     stat.num_object_clones++;
10154
10155     if (!obc->ssc)
10156       obc->ssc = get_snapset_context(oi.soid, false);
10157     assert(obc->ssc);
10158
10159     // subtract off clone overlap
10160     if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
10161       interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
10162       for (interval_set<uint64_t>::const_iterator r = o.begin();
10163            r != o.end();
10164            ++r) {
10165         stat.num_bytes -= r.get_len();
10166       }
10167     }
10168   }
10169
10170   // add it in
10171   pgstat->stats.sum.add(stat);
10172 }
10173
10174 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
10175 {
10176   const hobject_t& soid = obc->obs.oi.soid;
10177   if (obc->is_blocked()) {
10178     dout(10) << __func__ << " " << soid << " still blocked" << dendl;
10179     return;
10180   }
10181
10182   map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
10183   if (p != waiting_for_blocked_object.end()) {
10184     list<OpRequestRef>& ls = p->second;
10185     dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
10186     requeue_ops(ls);
10187     waiting_for_blocked_object.erase(p);
10188   }
10189
10190   map<hobject_t, ObjectContextRef>::iterator i =
10191     objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
10192   if (i != objects_blocked_on_snap_promotion.end()) {
10193     assert(i->second == obc);
10194     objects_blocked_on_snap_promotion.erase(i);
10195   }
10196
10197   if (obc->requeue_scrub_on_unblock) {
10198     obc->requeue_scrub_on_unblock = false;
10199     requeue_scrub();
10200   }
10201 }
10202
10203 SnapSetContext *PrimaryLogPG::get_snapset_context(
10204   const hobject_t& oid,
10205   bool can_create,
10206   const map<string, bufferlist> *attrs,
10207   bool oid_existed)
10208 {
10209   Mutex::Locker l(snapset_contexts_lock);
10210   SnapSetContext *ssc;
10211   map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
10212     oid.get_snapdir());
10213   if (p != snapset_contexts.end()) {
10214     if (can_create || p->second->exists) {
10215       ssc = p->second;
10216     } else {
10217       return NULL;
10218     }
10219   } else {
10220     bufferlist bv;
10221     if (!attrs) {
10222       int r = -ENOENT;
10223       if (!(oid.is_head() && !oid_existed))
10224         r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
10225       if (r < 0) {
10226         // try _snapset
10227         if (!(oid.is_snapdir() && !oid_existed))
10228           r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
10229         if (r < 0 && !can_create)
10230           return NULL;
10231       }
10232     } else {
10233       assert(attrs->count(SS_ATTR));
10234       bv = attrs->find(SS_ATTR)->second;
10235     }
10236     ssc = new SnapSetContext(oid.get_snapdir());
10237     _register_snapset_context(ssc);
10238     if (bv.length()) {
10239       bufferlist::iterator bvp = bv.begin();
10240       try {
10241         ssc->snapset.decode(bvp);
10242       } catch (buffer::error& e) {
10243         dout(0) << __func__ << " Can't decode snapset: " << e << dendl;
10244         return NULL;
10245       }
10246       ssc->exists = true;
10247     } else {
10248       ssc->exists = false;
10249     }
10250   }
10251   assert(ssc);
10252   ssc->ref++;
10253   return ssc;
10254 }
10255
10256 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10257 {
10258   Mutex::Locker l(snapset_contexts_lock);
10259   --ssc->ref;
10260   if (ssc->ref == 0) {
10261     if (ssc->registered)
10262       snapset_contexts.erase(ssc->oid);
10263     delete ssc;
10264   }
10265 }
10266
10267 /** pull - request object from a peer
10268  */
10269
/*
 * Return values of recover_missing():
 *  PULL_NONE  - didn't pull anything
 *  PULL_YES   - pulled what the caller wanted
 *  PULL_OTHER - needed to pull something else first (_head or _snapdir)
 */
enum { PULL_NONE, PULL_OTHER, PULL_YES };
10277
10278 int PrimaryLogPG::recover_missing(
10279   const hobject_t &soid, eversion_t v,
10280   int priority,
10281   PGBackend::RecoveryHandle *h)
10282 {
10283   if (missing_loc.is_unfound(soid)) {
10284     dout(7) << "pull " << soid
10285             << " v " << v
10286             << " but it is unfound" << dendl;
10287     return PULL_NONE;
10288   }
10289
10290   if (missing_loc.is_deleted(soid)) {
10291     start_recovery_op(soid);
10292     assert(!recovering.count(soid));
10293     recovering.insert(make_pair(soid, ObjectContextRef()));
10294     epoch_t cur_epoch = get_osdmap()->get_epoch();
10295     remove_missing_object(soid, v, new FunctionContext(
10296      [=](int) {
10297        lock();
10298        if (!pg_has_reset_since(cur_epoch)) {
10299          bool object_missing = false;
10300          for (const auto& shard : actingbackfill) {
10301            if (shard == pg_whoami)
10302              continue;
10303            if (peer_missing[shard].is_missing(soid)) {
10304              dout(20) << __func__ << ": soid " << soid << " needs to be deleted from replica " << shard << dendl;
10305              object_missing = true;
10306              break;
10307            }
10308          }
10309          if (!object_missing) {
10310            object_stat_sum_t stat_diff;
10311            stat_diff.num_objects_recovered = 1;
10312            on_global_recover(soid, stat_diff, true);
10313          } else {
10314            auto recovery_handle = pgbackend->open_recovery_op();
10315            pgbackend->recover_delete_object(soid, v, recovery_handle);
10316            pgbackend->run_recovery_op(recovery_handle, priority);
10317          }
10318        }
10319        unlock();
10320      }));
10321     return PULL_YES;
10322   }
10323
10324   // is this a snapped object?  if so, consult the snapset.. we may not need the entire object!
10325   ObjectContextRef obc;
10326   ObjectContextRef head_obc;
10327   if (soid.snap && soid.snap < CEPH_NOSNAP) {
10328     // do we have the head and/or snapdir?
10329     hobject_t head = soid.get_head();
10330     if (pg_log.get_missing().is_missing(head)) {
10331       if (recovering.count(head)) {
10332         dout(10) << " missing but already recovering head " << head << dendl;
10333         return PULL_NONE;
10334       } else {
10335         int r = recover_missing(
10336           head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10337           h);
10338         if (r != PULL_NONE)
10339           return PULL_OTHER;
10340         return PULL_NONE;
10341       }
10342     }
10343     head = soid.get_snapdir();
10344     if (pg_log.get_missing().is_missing(head)) {
10345       if (recovering.count(head)) {
10346         dout(10) << " missing but already recovering snapdir " << head << dendl;
10347         return PULL_NONE;
10348       } else {
10349         int r = recover_missing(
10350           head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10351           h);
10352         if (r != PULL_NONE)
10353           return PULL_OTHER;
10354         return PULL_NONE;
10355       }
10356     }
10357
10358     // we must have one or the other
10359     head_obc = get_object_context(
10360       soid.get_head(),
10361       false,
10362       0);
10363     if (!head_obc)
10364       head_obc = get_object_context(
10365         soid.get_snapdir(),
10366         false,
10367         0);
10368     assert(head_obc);
10369   }
10370   start_recovery_op(soid);
10371   assert(!recovering.count(soid));
10372   recovering.insert(make_pair(soid, obc));
10373   int r = pgbackend->recover_object(
10374     soid,
10375     v,
10376     head_obc,
10377     obc,
10378     h);
10379   // This is only a pull which shouldn't return an error
10380   assert(r >= 0);
10381   return PULL_YES;
10382 }
10383
10384 void PrimaryLogPG::send_remove_op(
10385   const hobject_t& oid, eversion_t v, pg_shard_t peer)
10386 {
10387   ceph_tid_t tid = osd->get_tid();
10388   osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10389
10390   dout(10) << "send_remove_op " << oid << " from osd." << peer
10391            << " tid " << tid << dendl;
10392
10393   MOSDSubOp *subop = new MOSDSubOp(
10394     rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10395     oid, CEPH_OSD_FLAG_ACK,
10396     get_osdmap()->get_epoch(), tid, v);
10397   subop->ops = vector<OSDOp>(1);
10398   subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10399
10400   osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10401 }
10402
10403 void PrimaryLogPG::remove_missing_object(const hobject_t &soid,
10404                                          eversion_t v, Context *on_complete)
10405 {
10406   dout(20) << __func__ << " " << soid << " " << v << dendl;
10407   assert(on_complete != nullptr);
10408   // delete locally
10409   ObjectStore::Transaction t;
10410   remove_snap_mapped_object(t, soid);
10411
10412   ObjectRecoveryInfo recovery_info;
10413   recovery_info.soid = soid;
10414   recovery_info.version = v;
10415
10416   epoch_t cur_epoch = get_osdmap()->get_epoch();
10417   t.register_on_complete(new FunctionContext(
10418      [=](int) {
10419        lock();
10420        if (!pg_has_reset_since(cur_epoch)) {
10421          ObjectStore::Transaction t2;
10422          on_local_recover(soid, recovery_info, ObjectContextRef(), true, &t2);
10423          t2.register_on_complete(on_complete);
10424          int r = osd->store->queue_transaction(osr.get(), std::move(t2), nullptr);
10425          assert(r == 0);
10426          unlock();
10427        } else {
10428          unlock();
10429          on_complete->complete(-EAGAIN);
10430        }
10431      }));
10432   int r = osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
10433   assert(r == 0);
10434 }
10435
10436 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10437 {
10438   dout(10) << "finish_degraded_object " << oid << dendl;
10439   if (callbacks_for_degraded_object.count(oid)) {
10440     list<Context*> contexts;
10441     contexts.swap(callbacks_for_degraded_object[oid]);
10442     callbacks_for_degraded_object.erase(oid);
10443     for (list<Context*>::iterator i = contexts.begin();
10444          i != contexts.end();
10445          ++i) {
10446       (*i)->complete(0);
10447     }
10448   }
10449   map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10450     oid.get_head());
10451   if (i != objects_blocked_on_degraded_snap.end() &&
10452       i->second == oid.snap)
10453     objects_blocked_on_degraded_snap.erase(i);
10454 }
10455
10456 void PrimaryLogPG::_committed_pushed_object(
10457   epoch_t epoch, eversion_t last_complete)
10458 {
10459   lock();
10460   if (!pg_has_reset_since(epoch)) {
10461     dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10462     last_complete_ondisk = last_complete;
10463
10464     if (last_complete_ondisk == info.last_update) {
10465       if (!is_primary()) {
10466         // Either we are a replica or backfill target.
10467         // we are fully up to date.  tell the primary!
10468         osd->send_message_osd_cluster(
10469           get_primary().osd,
10470           new MOSDPGTrim(
10471             get_osdmap()->get_epoch(),
10472             spg_t(info.pgid.pgid, get_primary().shard),
10473             last_complete_ondisk),
10474           get_osdmap()->get_epoch());
10475       } else {
10476         calc_min_last_complete_ondisk();
10477       }
10478     }
10479
10480   } else {
10481     dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10482   }
10483
10484   unlock();
10485 }
10486
10487 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10488 {
10489   lock();
10490   dout(20) << __func__ << dendl;
10491   if (obc) {
10492     dout(20) << "obc = " << *obc << dendl;
10493   }
10494   assert(active_pushes >= 1);
10495   --active_pushes;
10496
10497   // requeue an active chunky scrub waiting on recovery ops
10498   if (!deleting && active_pushes == 0
10499       && scrubber.is_chunky_scrub_active()) {
10500     if (ops_blocked_by_scrub()) {
10501       requeue_scrub(true);
10502     } else {
10503       requeue_scrub(false);
10504     }
10505   }
10506   unlock();
10507 }
10508
10509 void PrimaryLogPG::_applied_recovered_object_replica()
10510 {
10511   lock();
10512   dout(20) << __func__ << dendl;
10513   assert(active_pushes >= 1);
10514   --active_pushes;
10515
10516   // requeue an active chunky scrub waiting on recovery ops
10517   if (!deleting && active_pushes == 0 &&
10518       scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10519         scrubber.active_rep_scrub->get_req())->chunky) {
10520     osd->enqueue_back(
10521       info.pgid,
10522       PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10523     scrubber.active_rep_scrub = OpRequestRef();
10524   }
10525   unlock();
10526 }
10527
10528 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10529 {
10530   dout(10) << "got missing " << oid << " v " << v << dendl;
10531   pg_log.recover_got(oid, v, info);
10532   if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10533     dout(10) << "last_complete now " << info.last_complete
10534              << " log.complete_to " << pg_log.get_log().complete_to->version
10535              << dendl;
10536   } else {
10537     dout(10) << "last_complete now " << info.last_complete
10538              << " log.complete_to at end" << dendl;
10539     //below is not true in the repair case.
10540     //assert(missing.num_missing() == 0);  // otherwise, complete_to was wrong.
10541     assert(info.last_complete == info.last_update);
10542   }
10543 }
10544
10545 void PrimaryLogPG::primary_failed(const hobject_t &soid)
10546 {
10547   list<pg_shard_t> fl = { pg_whoami };
10548   failed_push(fl, soid);
10549 }
10550
10551 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10552 {
10553   dout(20) << __func__ << ": " << soid << dendl;
10554   assert(recovering.count(soid));
10555   auto obc = recovering[soid];
10556   if (obc) {
10557     list<OpRequestRef> blocked_ops;
10558     obc->drop_recovery_read(&blocked_ops);
10559     requeue_ops(blocked_ops);
10560   }
10561   recovering.erase(soid);
10562   for (auto&& i : from)
10563     missing_loc.remove_location(soid, i);
10564   dout(0) << __func__ << " " << soid << " from shard " << from
10565           << ", reps on " << missing_loc.get_locations(soid)
10566           << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10567   finish_recovery_op(soid);  // close out this attempt,
10568 }
10569
10570 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10571 {
10572   const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10573   assert(m->get_type() == MSG_OSD_SUBOP);
10574   dout(7) << "sub_op_remove " << m->poid << dendl;
10575
10576   op->mark_started();
10577
10578   ObjectStore::Transaction t;
10579   remove_snap_mapped_object(t, m->poid);
10580   int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10581   assert(r == 0);
10582 }
10583
10584 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10585 {
10586   eversion_t v;
10587   pg_missing_item pmi;
10588   bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10589   assert(is_missing);
10590   v = pmi.have;
10591   dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10592
10593   assert(!actingbackfill.empty());
10594   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10595        i != actingbackfill.end();
10596        ++i) {
10597     if (*i == get_primary()) continue;
10598     pg_shard_t peer = *i;
10599     if (!peer_missing[peer].is_missing(oid)) {
10600       continue;
10601     }
10602     eversion_t h = peer_missing[peer].get_items().at(oid).have;
10603     dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10604     if (h > v)
10605       v = h;
10606   }
10607
10608   dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10609   return v;
10610 }
10611
10612 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10613 {
10614   const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10615     op->get_req());
10616   assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10617   ObjectStore::Transaction t;
10618   append_log_entries_update_missing(m->entries, t);
10619
10620   Context *complete = new FunctionContext(
10621     [=](int) {
10622       const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10623         op->get_req());
10624       lock();
10625       if (!pg_has_reset_since(msg->get_epoch())) {
10626         MOSDPGUpdateLogMissingReply *reply =
10627           new MOSDPGUpdateLogMissingReply(
10628             spg_t(info.pgid.pgid, primary_shard().shard),
10629             pg_whoami.shard,
10630             msg->get_epoch(),
10631             msg->min_epoch,
10632             msg->get_tid());
10633         reply->set_priority(CEPH_MSG_PRIO_HIGH);
10634         msg->get_connection()->send_message(reply);
10635       }
10636       unlock();
10637     });
10638
10639   if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10640     t.register_on_commit(complete);
10641   } else {
10642     /* Hack to work around the fact that ReplicatedBackend sends
10643      * ack+commit if commit happens first
10644      *
10645      * This behavior is no longer necessary, but we preserve it so old
10646      * primaries can keep their repops in order */
10647     if (pool.info.ec_pool()) {
10648       t.register_on_complete(complete);
10649     } else {
10650       t.register_on_commit(complete);
10651     }
10652   }
10653   t.register_on_applied(
10654     new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10655   int tr = osd->store->queue_transaction(
10656     osr.get(),
10657     std::move(t),
10658     nullptr);
10659   assert(tr == 0);
10660 }
10661
10662 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10663 {
10664   const MOSDPGUpdateLogMissingReply *m =
10665     static_cast<const MOSDPGUpdateLogMissingReply*>(
10666     op->get_req());
10667   dout(20) << __func__ << " got reply from "
10668            << m->get_from() << dendl;
10669
10670   auto it = log_entry_update_waiting_on.find(m->get_tid());
10671   if (it != log_entry_update_waiting_on.end()) {
10672     if (it->second.waiting_on.count(m->get_from())) {
10673       it->second.waiting_on.erase(m->get_from());
10674     } else {
10675       osd->clog->error()
10676         << info.pgid << " got reply "
10677         << *m << " from shard we are not waiting for "
10678         << m->get_from();
10679     }
10680
10681     if (it->second.waiting_on.empty()) {
10682       repop_all_committed(it->second.repop.get());
10683       log_entry_update_waiting_on.erase(it);
10684     }
10685   } else {
10686     osd->clog->error()
10687       << info.pgid << " got reply "
10688       << *m << " on unknown tid " << m->get_tid();
10689   }
10690 }
10691
10692 /* Mark all unfound objects as lost.
10693  */
10694 void PrimaryLogPG::mark_all_unfound_lost(
10695   int what,
10696   ConnectionRef con,
10697   ceph_tid_t tid)
10698 {
10699   dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10700   list<hobject_t> oids;
10701
10702   dout(30) << __func__ << ": log before:\n";
10703   pg_log.get_log().print(*_dout);
10704   *_dout << dendl;
10705
10706   mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10707
10708   utime_t mtime = ceph_clock_now();
10709   map<hobject_t, pg_missing_item>::const_iterator m =
10710     missing_loc.get_needs_recovery().begin();
10711   map<hobject_t, pg_missing_item>::const_iterator mend =
10712     missing_loc.get_needs_recovery().end();
10713
10714   ObcLockManager manager;
10715   eversion_t v = get_next_version();
10716   v.epoch = get_osdmap()->get_epoch();
10717   uint64_t num_unfound = missing_loc.num_unfound();
10718   while (m != mend) {
10719     const hobject_t &oid(m->first);
10720     if (!missing_loc.is_unfound(oid)) {
10721       // We only care about unfound objects
10722       ++m;
10723       continue;
10724     }
10725
10726     ObjectContextRef obc;
10727     eversion_t prev;
10728
10729     switch (what) {
10730     case pg_log_entry_t::LOST_MARK:
10731       assert(0 == "actually, not implemented yet!");
10732       break;
10733
10734     case pg_log_entry_t::LOST_REVERT:
10735       prev = pick_newest_available(oid);
10736       if (prev > eversion_t()) {
10737         // log it
10738         pg_log_entry_t e(
10739           pg_log_entry_t::LOST_REVERT, oid, v,
10740           m->second.need, 0, osd_reqid_t(), mtime, 0);
10741         e.reverting_to = prev;
10742         e.mark_unrollbackable();
10743         log_entries.push_back(e);
10744         dout(10) << e << dendl;
10745
10746         // we are now missing the new version; recovery code will sort it out.
10747         ++v.version;
10748         ++m;
10749         break;
10750       }
10751
10752     case pg_log_entry_t::LOST_DELETE:
10753       {
10754         pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10755                          0, osd_reqid_t(), mtime, 0);
10756         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10757           if (pool.info.require_rollback()) {
10758             e.mod_desc.try_rmobject(v.version);
10759           } else {
10760             e.mark_unrollbackable();
10761           }
10762         } // otherwise, just do what we used to do
10763         dout(10) << e << dendl;
10764         log_entries.push_back(e);
10765         oids.push_back(oid);
10766
10767         ++v.version;
10768         ++m;
10769       }
10770       break;
10771
10772     default:
10773       ceph_abort();
10774     }
10775   }
10776
10777   info.stats.stats_invalid = true;
10778
10779   submit_log_entries(
10780     log_entries,
10781     std::move(manager),
10782     boost::optional<std::function<void(void)> >(
10783       [this, oids, con, num_unfound, tid]() {
10784         if (perform_deletes_during_peering()) {
10785           for (auto oid : oids) {
10786             // clear old locations - merge_new_log_entries will have
10787             // handled rebuilding missing_loc for each of these
10788             // objects if we have the RECOVERY_DELETES flag
10789             missing_loc.recovered(oid);
10790           }
10791         }
10792
10793         for (auto& p : waiting_for_unreadable_object) {
10794           release_backoffs(p.first);
10795         }
10796         requeue_object_waiters(waiting_for_unreadable_object);
10797         queue_recovery();
10798
10799         stringstream ss;
10800         ss << "pg has " << num_unfound
10801            << " objects unfound and apparently lost marking";
10802         string rs = ss.str();
10803         dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10804         osd->clog->info() << rs;
10805         if (con) {
10806           MCommandReply *reply = new MCommandReply(0, rs);
10807           reply->set_tid(tid);
10808           con->send_message(reply);
10809         }
10810       }),
10811     OpRequestRef());
10812 }
10813
/*
 * PrimaryLogPG-specific part of splitting this PG into a child.
 *
 * None of the arguments are used; the only thing done here is to check
 * that no repops are queued at the time of the split.
 */
void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
{
  assert(repop_queue.empty());
}
10818
10819 /*
10820  * pg status change notification
10821  */
10822
10823 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10824 {
10825   list<OpRequestRef> rq;
10826
10827   // apply all repops
10828   while (!repop_queue.empty()) {
10829     RepGather *repop = repop_queue.front();
10830     repop_queue.pop_front();
10831     dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10832     repop->rep_aborted = true;
10833     repop->on_applied.clear();
10834     repop->on_committed.clear();
10835     repop->on_success.clear();
10836
10837     if (requeue) {
10838       if (repop->op) {
10839         dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10840         rq.push_back(repop->op);
10841         repop->op = OpRequestRef();
10842       }
10843
10844       // also requeue any dups, interleaved into position
10845       map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10846         waiting_for_ondisk.find(repop->v);
10847       if (p != waiting_for_ondisk.end()) {
10848         dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10849         for (list<pair<OpRequestRef, version_t> >::iterator i =
10850                p->second.begin();
10851              i != p->second.end();
10852              ++i) {
10853           rq.push_back(i->first);
10854         }
10855         waiting_for_ondisk.erase(p);
10856       }
10857     }
10858
10859     remove_repop(repop);
10860   }
10861
10862   assert(repop_queue.empty());
10863
10864   if (requeue) {
10865     requeue_ops(rq);
10866     if (!waiting_for_ondisk.empty()) {
10867       for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10868              waiting_for_ondisk.begin();
10869            i != waiting_for_ondisk.end();
10870            ++i) {
10871         for (list<pair<OpRequestRef, version_t> >::iterator j =
10872                i->second.begin();
10873              j != i->second.end();
10874              ++j) {
10875           derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10876                << i->first << dendl;
10877         }
10878       }
10879       assert(waiting_for_ondisk.empty());
10880     }
10881   }
10882
10883   waiting_for_ondisk.clear();
10884 }
10885
10886 void PrimaryLogPG::on_flushed()
10887 {
10888   assert(flushes_in_progress > 0);
10889   flushes_in_progress--;
10890   if (flushes_in_progress == 0) {
10891     requeue_ops(waiting_for_peered);
10892   }
10893   if (!is_peered() || !is_primary()) {
10894     pair<hobject_t, ObjectContextRef> i;
10895     while (object_contexts.get_next(i.first, &i)) {
10896       derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10897     }
10898     assert(object_contexts.empty());
10899   }
10900   pgbackend->on_flushed();
10901 }
10902
10903 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10904 {
10905   dout(10) << "on_removal" << dendl;
10906
10907   // adjust info to backfill
10908   info.set_last_backfill(hobject_t());
10909   pg_log.reset_backfill();
10910   dirty_info = true;
10911
10912
10913   // clear log
10914   PGLogEntryHandler rollbacker{this, t};
10915   pg_log.roll_forward(&rollbacker);
10916
10917   write_if_dirty(*t);
10918
10919   if (!deleting)
10920     on_shutdown();
10921 }
10922
10923 void PrimaryLogPG::clear_async_reads()
10924 {
10925   dout(10) << __func__ << dendl;
10926   for(auto& i : in_progress_async_reads) {
10927     dout(10) << "clear ctx: "
10928              << "OpRequestRef " << i.first
10929              << " OpContext " << i.second
10930              << dendl;
10931     close_op_ctx(i.second);
10932   }
10933 }
10934
/*
 * Tear down this PG's runtime state.
 *
 * Removes the PG from the OSD's stat and peering queues, flags it as
 * deleting, and then cancels or clears outstanding work: queued recovery,
 * scrub state, copy/flush/proxy ops, repops, log updates, backoffs, the
 * snap trimmer, cached object contexts, async reads and reservations.
 * The cancel_*() and apply_and_flush_repops() calls are made with `false`,
 * i.e. nothing is requeued.
 */
void PrimaryLogPG::on_shutdown()
{
  dout(10) << "on_shutdown" << dendl;

  // remove from queues
  osd->pg_stat_queue_dequeue(this);
  osd->peering_wq.dequeue(this);

  // handles queue races
  deleting = true;

  if (recovery_queued) {
    recovery_queued = false;
    osd->clear_queued_recovery(this);
  }

  clear_scrub_reserved();
  scrub_clear_state();

  unreg_next_scrub();
  cancel_copy_ops(false);
  cancel_flush_ops(false);
  cancel_proxy_ops(false);
  apply_and_flush_repops(false);
  cancel_log_updates();
  // we must remove PGRefs, so do this prior to release_backoffs() callers
  clear_backoffs();
  // clean up snap trim references
  snap_trimmer_machine.process_event(Reset());

  pgbackend->on_change();

  context_registry_on_change();
  object_contexts.clear();

  clear_async_reads();

  // drop any reservation held for this PG, remote and local
  osd->remote_reserver.cancel_reservation(info.pgid);
  osd->local_reserver.cancel_reservation(info.pgid);

  clear_primary_state();
  cancel_recovery();
}
10978
10979 void PrimaryLogPG::on_activate()
10980 {
10981   // all clean?
10982   if (needs_recovery()) {
10983     dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
10984     queue_peering_event(
10985       CephPeeringEvtRef(
10986         std::make_shared<CephPeeringEvt>(
10987           get_osdmap()->get_epoch(),
10988           get_osdmap()->get_epoch(),
10989           DoRecovery())));
10990   } else if (needs_backfill()) {
10991     dout(10) << "activate queueing backfill" << dendl;
10992     queue_peering_event(
10993       CephPeeringEvtRef(
10994         std::make_shared<CephPeeringEvt>(
10995           get_osdmap()->get_epoch(),
10996           get_osdmap()->get_epoch(),
10997           RequestBackfill())));
10998   } else {
10999     dout(10) << "activate all replicas clean, no recovery" << dendl;
11000     eio_errors_to_process = false;
11001     queue_peering_event(
11002       CephPeeringEvtRef(
11003         std::make_shared<CephPeeringEvt>(
11004           get_osdmap()->get_epoch(),
11005           get_osdmap()->get_epoch(),
11006           AllReplicasRecovered())));
11007   }
11008
11009   publish_stats_to_osd();
11010
11011   if (!backfill_targets.empty()) {
11012     last_backfill_started = earliest_backfill();
11013     new_backfill = true;
11014     assert(!last_backfill_started.is_max());
11015     dout(5) << "on activate: bft=" << backfill_targets
11016            << " from " << last_backfill_started << dendl;
11017     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11018          i != backfill_targets.end();
11019          ++i) {
11020       dout(5) << "target shard " << *i
11021              << " from " << peer_info[*i].last_backfill
11022              << dendl;
11023     }
11024   }
11025
11026   hit_set_setup();
11027   agent_setup();
11028 }
11029
11030 void PrimaryLogPG::_on_new_interval()
11031 {
11032   dout(20) << __func__ << "checking missing set deletes flag. missing = " << pg_log.get_missing() << dendl;
11033   if (!pg_log.get_missing().may_include_deletes &&
11034       get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES)) {
11035     pg_log.rebuild_missing_set_with_deletes(osd->store, coll, info);
11036   }
11037   assert(pg_log.get_missing().may_include_deletes == get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
11038 }
11039
11040 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
11041 {
11042   dout(10) << "on_change" << dendl;
11043
11044   if (hit_set && hit_set->insert_count() == 0) {
11045     dout(20) << " discarding empty hit_set" << dendl;
11046     hit_set_clear();
11047   }
11048
11049   if (recovery_queued) {
11050     recovery_queued = false;
11051     osd->clear_queued_recovery(this);
11052   }
11053
11054   // requeue everything in the reverse order they should be
11055   // reexamined.
11056   requeue_ops(waiting_for_peered);
11057   requeue_ops(waiting_for_active);
11058
11059   clear_scrub_reserved();
11060
11061   cancel_copy_ops(is_primary());
11062   cancel_flush_ops(is_primary());
11063   cancel_proxy_ops(is_primary());
11064
11065   // requeue object waiters
11066   for (auto& p : waiting_for_unreadable_object) {
11067     release_backoffs(p.first);
11068   }
11069   if (is_primary()) {
11070     requeue_object_waiters(waiting_for_unreadable_object);
11071   } else {
11072     waiting_for_unreadable_object.clear();
11073   }
11074   for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
11075        p != waiting_for_degraded_object.end();
11076        waiting_for_degraded_object.erase(p++)) {
11077     release_backoffs(p->first);
11078     if (is_primary())
11079       requeue_ops(p->second);
11080     else
11081       p->second.clear();
11082     finish_degraded_object(p->first);
11083   }
11084
11085   // requeues waiting_for_scrub
11086   scrub_clear_state();
11087
11088   for (auto p = waiting_for_blocked_object.begin();
11089        p != waiting_for_blocked_object.end();
11090        waiting_for_blocked_object.erase(p++)) {
11091     if (is_primary())
11092       requeue_ops(p->second);
11093     else
11094       p->second.clear();
11095   }
11096   for (auto i = callbacks_for_degraded_object.begin();
11097        i != callbacks_for_degraded_object.end();
11098     ) {
11099     finish_degraded_object((i++)->first);
11100   }
11101   assert(callbacks_for_degraded_object.empty());
11102
11103   if (is_primary()) {
11104     requeue_ops(waiting_for_cache_not_full);
11105   } else {
11106     waiting_for_cache_not_full.clear();
11107   }
11108   objects_blocked_on_cache_full.clear();
11109
11110   for (list<pair<OpRequestRef, OpContext*> >::iterator i =
11111          in_progress_async_reads.begin();
11112        i != in_progress_async_reads.end();
11113        in_progress_async_reads.erase(i++)) {
11114     close_op_ctx(i->second);
11115     if (is_primary())
11116       requeue_op(i->first);
11117   }
11118
11119   // this will requeue ops we were working on but didn't finish, and
11120   // any dups
11121   apply_and_flush_repops(is_primary());
11122   cancel_log_updates();
11123
11124   // do this *after* apply_and_flush_repops so that we catch any newly
11125   // registered watches.
11126   context_registry_on_change();
11127
11128   pgbackend->on_change_cleanup(t);
11129   scrubber.cleanup_store(t);
11130   pgbackend->on_change();
11131
11132   // clear snap_trimmer state
11133   snap_trimmer_machine.process_event(Reset());
11134
11135   debug_op_order.clear();
11136   unstable_stats.clear();
11137
11138   // we don't want to cache object_contexts through the interval change
11139   // NOTE: we actually assert that all currently live references are dead
11140   // by the time the flush for the next interval completes.
11141   object_contexts.clear();
11142
11143   // should have been cleared above by finishing all of the degraded objects
11144   assert(objects_blocked_on_degraded_snap.empty());
11145 }
11146
11147 void PrimaryLogPG::on_role_change()
11148 {
11149   dout(10) << "on_role_change" << dendl;
11150   if (get_role() != 0 && hit_set) {
11151     dout(10) << " clearing hit set" << dendl;
11152     hit_set_clear();
11153   }
11154 }
11155
11156 void PrimaryLogPG::on_pool_change()
11157 {
11158   dout(10) << __func__ << dendl;
11159   // requeue cache full waiters just in case the cache_mode is
11160   // changing away from writeback mode.  note that if we are not
11161   // active the normal requeuing machinery is sufficient (and properly
11162   // ordered).
11163   if (is_active() &&
11164       pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
11165       !waiting_for_cache_not_full.empty()) {
11166     dout(10) << __func__ << " requeuing full waiters (not in writeback) "
11167              << dendl;
11168     requeue_ops(waiting_for_cache_not_full);
11169     objects_blocked_on_cache_full.clear();
11170   }
11171   hit_set_setup();
11172   agent_setup();
11173 }
11174
11175 // clear state.  called on recovery completion AND cancellation.
11176 void PrimaryLogPG::_clear_recovery_state()
11177 {
11178   missing_loc.clear();
11179 #ifdef DEBUG_RECOVERY_OIDS
11180   recovering_oids.clear();
11181 #endif
11182   last_backfill_started = hobject_t();
11183   set<hobject_t>::iterator i = backfills_in_flight.begin();
11184   while (i != backfills_in_flight.end()) {
11185     assert(recovering.count(*i));
11186     backfills_in_flight.erase(i++);
11187   }
11188
11189   list<OpRequestRef> blocked_ops;
11190   for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
11191        i != recovering.end();
11192        recovering.erase(i++)) {
11193     if (i->second) {
11194       i->second->drop_recovery_read(&blocked_ops);
11195       requeue_ops(blocked_ops);
11196     }
11197   }
11198   assert(backfills_in_flight.empty());
11199   pending_backfill_updates.clear();
11200   assert(recovering.empty());
11201   pgbackend->clear_recovery_state();
11202 }
11203
11204 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
11205 {
11206   dout(20) << __func__ << ": " << soid << dendl;
11207   assert(recovering.count(soid));
11208   ObjectContextRef obc = recovering[soid];
11209   if (obc) {
11210     list<OpRequestRef> blocked_ops;
11211     obc->drop_recovery_read(&blocked_ops);
11212     requeue_ops(blocked_ops);
11213   }
11214   recovering.erase(soid);
11215   finish_recovery_op(soid);
11216   release_backoffs(soid);
11217   if (waiting_for_degraded_object.count(soid)) {
11218     dout(20) << " kicking degraded waiters on " << soid << dendl;
11219     requeue_ops(waiting_for_degraded_object[soid]);
11220     waiting_for_degraded_object.erase(soid);
11221   }
11222   if (waiting_for_unreadable_object.count(soid)) {
11223     dout(20) << " kicking unreadable waiters on " << soid << dendl;
11224     requeue_ops(waiting_for_unreadable_object[soid]);
11225     waiting_for_unreadable_object.erase(soid);
11226   }
11227   if (is_missing_object(soid))
11228     pg_log.set_last_requested(0); // get recover_primary to start over
11229   finish_degraded_object(soid);
11230 }
11231
11232 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
11233 {
11234   /*
11235    * check that any peers we are planning to (or currently) pulling
11236    * objects from are dealt with.
11237    */
11238   missing_loc.check_recovery_sources(osdmap);
11239   pgbackend->check_recovery_sources(osdmap);
11240
11241   for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
11242        i != peer_log_requested.end();
11243        ) {
11244     if (!osdmap->is_up(i->osd)) {
11245       dout(10) << "peer_log_requested removing " << *i << dendl;
11246       peer_log_requested.erase(i++);
11247     } else {
11248       ++i;
11249     }
11250   }
11251
11252   for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
11253        i != peer_missing_requested.end();
11254        ) {
11255     if (!osdmap->is_up(i->osd)) {
11256       dout(10) << "peer_missing_requested removing " << *i << dendl;
11257       peer_missing_requested.erase(i++);
11258     } else {
11259       ++i;
11260     }
11261   }
11262 }
11263
11264 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
11265 {
11266   set<pg_shard_t> now_down;
11267   for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
11268        p != missing_loc_sources.end();
11269        ) {
11270     if (osdmap->is_up(p->osd)) {
11271       ++p;
11272       continue;
11273     }
11274     ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
11275     now_down.insert(*p);
11276     missing_loc_sources.erase(p++);
11277   }
11278
11279   if (now_down.empty()) {
11280     ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
11281   } else {
11282     ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
11283                        << missing_loc_sources << dendl;
11284
11285     // filter missing_loc
11286     map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
11287     while (p != missing_loc.end()) {
11288       set<pg_shard_t>::iterator q = p->second.begin();
11289       while (q != p->second.end())
11290         if (now_down.count(*q)) {
11291           p->second.erase(q++);
11292         } else {
11293           ++q;
11294         }
11295       if (p->second.empty())
11296         missing_loc.erase(p++);
11297       else
11298         ++p;
11299     }
11300   }
11301 }
11302
11303
11304 bool PrimaryLogPG::start_recovery_ops(
11305   uint64_t max,
11306   ThreadPool::TPHandle &handle,
11307   uint64_t *ops_started)
11308 {
11309   uint64_t& started = *ops_started;
11310   started = 0;
11311   bool work_in_progress = false;
11312   assert(is_primary());
11313
11314   if (!state_test(PG_STATE_RECOVERING) &&
11315       !state_test(PG_STATE_BACKFILL)) {
11316     /* TODO: I think this case is broken and will make do_recovery()
11317      * unhappy since we're returning false */
11318     dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
11319     return false;
11320   }
11321
11322   const auto &missing = pg_log.get_missing();
11323
11324   unsigned int num_missing = missing.num_missing();
11325   uint64_t num_unfound = get_num_unfound();
11326
11327   if (num_missing == 0) {
11328     info.last_complete = info.last_update;
11329   }
11330
11331   if (num_missing == num_unfound) {
11332     // All of the missing objects we have are unfound.
11333     // Recover the replicas.
11334     started = recover_replicas(max, handle);
11335   }
11336   if (!started) {
11337     // We still have missing objects that we should grab from replicas.
11338     started += recover_primary(max, handle);
11339   }
11340   if (!started && num_unfound != get_num_unfound()) {
11341     // second chance to recovery replicas
11342     started = recover_replicas(max, handle);
11343   }
11344
11345   if (started)
11346     work_in_progress = true;
11347
11348   bool deferred_backfill = false;
11349   if (recovering.empty() &&
11350       state_test(PG_STATE_BACKFILL) &&
11351       !backfill_targets.empty() && started < max &&
11352       missing.num_missing() == 0 &&
11353       waiting_on_backfill.empty()) {
11354     if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11355       dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11356       deferred_backfill = true;
11357     } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11358                !is_degraded())  {
11359       dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11360       deferred_backfill = true;
11361     } else if (!backfill_reserved) {
11362       dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11363       if (!backfill_reserving) {
11364         dout(10) << "queueing RequestBackfill" << dendl;
11365         backfill_reserving = true;
11366         queue_peering_event(
11367           CephPeeringEvtRef(
11368             std::make_shared<CephPeeringEvt>(
11369               get_osdmap()->get_epoch(),
11370               get_osdmap()->get_epoch(),
11371               RequestBackfill())));
11372       }
11373       deferred_backfill = true;
11374     } else {
11375       started += recover_backfill(max - started, handle, &work_in_progress);
11376     }
11377   }
11378
11379   dout(10) << " started " << started << dendl;
11380   osd->logger->inc(l_osd_rop, started);
11381
11382   if (!recovering.empty() ||
11383       work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11384     return work_in_progress;
11385
11386   assert(recovering.empty());
11387   assert(recovery_ops_active == 0);
11388
11389   dout(10) << __func__ << " needs_recovery: "
11390            << missing_loc.get_needs_recovery()
11391            << dendl;
11392   dout(10) << __func__ << " missing_loc: "
11393            << missing_loc.get_missing_locs()
11394            << dendl;
11395   int unfound = get_num_unfound();
11396   if (unfound) {
11397     dout(10) << " still have " << unfound << " unfound" << dendl;
11398     return work_in_progress;
11399   }
11400
11401   if (missing.num_missing() > 0) {
11402     // this shouldn't happen!
11403     osd->clog->error() << info.pgid << " Unexpected Error: recovery ending with "
11404                        << missing.num_missing() << ": " << missing.get_items();
11405     return work_in_progress;
11406   }
11407
11408   if (needs_recovery()) {
11409     // this shouldn't happen!
11410     // We already checked num_missing() so we must have missing replicas
11411     osd->clog->error() << info.pgid
11412                        << " Unexpected Error: recovery ending with missing replicas";
11413     return work_in_progress;
11414   }
11415
11416   if (state_test(PG_STATE_RECOVERING)) {
11417     state_clear(PG_STATE_RECOVERING);
11418     state_clear(PG_STATE_FORCED_RECOVERY);
11419     if (needs_backfill()) {
11420       dout(10) << "recovery done, queuing backfill" << dendl;
11421       queue_peering_event(
11422         CephPeeringEvtRef(
11423           std::make_shared<CephPeeringEvt>(
11424             get_osdmap()->get_epoch(),
11425             get_osdmap()->get_epoch(),
11426             RequestBackfill())));
11427     } else {
11428       dout(10) << "recovery done, no backfill" << dendl;
11429       eio_errors_to_process = false;
11430       state_clear(PG_STATE_FORCED_BACKFILL);
11431       queue_peering_event(
11432         CephPeeringEvtRef(
11433           std::make_shared<CephPeeringEvt>(
11434             get_osdmap()->get_epoch(),
11435             get_osdmap()->get_epoch(),
11436             AllReplicasRecovered())));
11437     }
11438   } else { // backfilling
11439     state_clear(PG_STATE_BACKFILL);
11440     state_clear(PG_STATE_FORCED_BACKFILL);
11441     state_clear(PG_STATE_FORCED_RECOVERY);
11442     dout(10) << "recovery done, backfill done" << dendl;
11443     eio_errors_to_process = false;
11444     queue_peering_event(
11445       CephPeeringEvtRef(
11446         std::make_shared<CephPeeringEvt>(
11447           get_osdmap()->get_epoch(),
11448           get_osdmap()->get_epoch(),
11449           Backfilled())));
11450   }
11451
11452   return false;
11453 }
11454
11455 /**
11456  * do one recovery op.
11457  * return true if done, false if nothing left to do.
11458  */
11459 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11460 {
11461   assert(is_primary());
11462
11463   const auto &missing = pg_log.get_missing();
11464
11465   dout(10) << "recover_primary recovering " << recovering.size()
11466            << " in pg" << dendl;
11467   dout(10) << "recover_primary " << missing << dendl;
11468   dout(25) << "recover_primary " << missing.get_items() << dendl;
11469
11470   // look at log!
11471   pg_log_entry_t *latest = 0;
11472   unsigned started = 0;
11473   int skipped = 0;
11474
11475   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11476   map<version_t, hobject_t>::const_iterator p =
11477     missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11478   while (p != missing.get_rmissing().end()) {
11479     handle.reset_tp_timeout();
11480     hobject_t soid;
11481     version_t v = p->first;
11482
11483     if (pg_log.get_log().objects.count(p->second)) {
11484       latest = pg_log.get_log().objects.find(p->second)->second;
11485       assert(latest->is_update() || latest->is_delete());
11486       soid = latest->soid;
11487     } else {
11488       latest = 0;
11489       soid = p->second;
11490     }
11491     const pg_missing_item& item = missing.get_items().find(p->second)->second;
11492     ++p;
11493
11494     hobject_t head = soid.get_head();
11495
11496     eversion_t need = item.need;
11497
11498     dout(10) << "recover_primary "
11499              << soid << " " << item.need
11500              << (missing.is_missing(soid) ? " (missing)":"")
11501              << (missing.is_missing(head) ? " (missing head)":"")
11502              << (recovering.count(soid) ? " (recovering)":"")
11503              << (recovering.count(head) ? " (recovering head)":"")
11504              << dendl;
11505
11506     if (latest) {
11507       switch (latest->op) {
11508       case pg_log_entry_t::CLONE:
11509         /*
11510          * Handling for this special case removed for now, until we
11511          * can correctly construct an accurate SnapSet from the old
11512          * one.
11513          */
11514         break;
11515
11516       case pg_log_entry_t::LOST_REVERT:
11517         {
11518           if (item.have == latest->reverting_to) {
11519             ObjectContextRef obc = get_object_context(soid, true);
11520
11521             if (obc->obs.oi.version == latest->version) {
11522               // I'm already reverting
11523               dout(10) << " already reverting " << soid << dendl;
11524             } else {
11525               dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11526               obc->ondisk_write_lock();
11527               obc->obs.oi.version = latest->version;
11528
11529               ObjectStore::Transaction t;
11530               bufferlist b2;
11531               obc->obs.oi.encode(
11532                 b2,
11533                 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11534               assert(!pool.info.require_rollback());
11535               t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11536
11537               recover_got(soid, latest->version);
11538               missing_loc.add_location(soid, pg_whoami);
11539
11540               ++active_pushes;
11541
11542               osd->store->queue_transaction(osr.get(), std::move(t),
11543                                             new C_OSD_AppliedRecoveredObject(this, obc),
11544                                             new C_OSD_CommittedPushedObject(
11545                                               this,
11546                                               get_osdmap()->get_epoch(),
11547                                               info.last_complete),
11548                                             new C_OSD_OndiskWriteUnlock(obc));
11549               continue;
11550             }
11551           } else {
11552             /*
11553              * Pull the old version of the object.  Update missing_loc here to have the location
11554              * of the version we want.
11555              *
11556              * This doesn't use the usual missing_loc paths, but that's okay:
11557              *  - if we have it locally, we hit the case above, and go from there.
11558              *  - if we don't, we always pass through this case during recovery and set up the location
11559              *    properly.
11560              *  - this way we don't need to mangle the missing code to be general about needing an old
11561              *    version...
11562              */
11563             eversion_t alternate_need = latest->reverting_to;
11564             dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11565
11566             for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11567                  p != peer_missing.end();
11568                  ++p)
11569               if (p->second.is_missing(soid, need) &&
11570                   p->second.get_items().at(soid).have == alternate_need) {
11571                 missing_loc.add_location(soid, p->first);
11572               }
11573             dout(10) << " will pull " << alternate_need << " or " << need
11574                      << " from one of " << missing_loc.get_locations(soid)
11575                      << dendl;
11576           }
11577         }
11578         break;
11579       }
11580     }
11581
11582     if (!recovering.count(soid)) {
11583       if (recovering.count(head)) {
11584         ++skipped;
11585       } else {
11586         int r = recover_missing(
11587           soid, need, get_recovery_op_priority(), h);
11588         switch (r) {
11589         case PULL_YES:
11590           ++started;
11591           break;
11592         case PULL_OTHER:
11593           ++started;
11594         case PULL_NONE:
11595           ++skipped;
11596           break;
11597         default:
11598           ceph_abort();
11599         }
11600         if (started >= max)
11601           break;
11602       }
11603     }
11604
11605     // only advance last_requested if we haven't skipped anything
11606     if (!skipped)
11607       pg_log.set_last_requested(v);
11608   }
11609
11610   pgbackend->run_recovery_op(h, get_recovery_op_priority());
11611   return started;
11612 }
11613
11614 bool PrimaryLogPG::primary_error(
11615   const hobject_t& soid, eversion_t v)
11616 {
11617   pg_log.missing_add(soid, v, eversion_t());
11618   pg_log.set_last_requested(0);
11619   missing_loc.remove_location(soid, pg_whoami);
11620   bool uhoh = true;
11621   assert(!actingbackfill.empty());
11622   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11623        i != actingbackfill.end();
11624        ++i) {
11625     if (*i == get_primary()) continue;
11626     pg_shard_t peer = *i;
11627     if (!peer_missing[peer].is_missing(soid, v)) {
11628       missing_loc.add_location(soid, peer);
11629       dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11630                << ", there should be a copy on shard " << peer << dendl;
11631       uhoh = false;
11632     }
11633   }
11634   if (uhoh)
11635     osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11636   else
11637     osd->clog->error() << info.pgid << " missing primary copy of " << soid
11638                          << ", will try copies on " << missing_loc.get_locations(soid);
11639   return uhoh;
11640 }
11641
11642 int PrimaryLogPG::prep_object_replica_deletes(
11643   const hobject_t& soid, eversion_t v,
11644   PGBackend::RecoveryHandle *h)
11645 {
11646   assert(is_primary());
11647   dout(10) << __func__ << ": on " << soid << dendl;
11648
11649   start_recovery_op(soid);
11650   assert(!recovering.count(soid));
11651   recovering.insert(make_pair(soid, ObjectContextRef()));
11652
11653   pgbackend->recover_delete_object(soid, v, h);
11654   return 1;
11655 }
11656
11657 int PrimaryLogPG::prep_object_replica_pushes(
11658   const hobject_t& soid, eversion_t v,
11659   PGBackend::RecoveryHandle *h)
11660 {
11661   assert(is_primary());
11662   dout(10) << __func__ << ": on " << soid << dendl;
11663
11664   // NOTE: we know we will get a valid oloc off of disk here.
11665   ObjectContextRef obc = get_object_context(soid, false);
11666   if (!obc) {
11667     primary_error(soid, v);
11668     return 0;
11669   }
11670
11671   if (!obc->get_recovery_read()) {
11672     dout(20) << "recovery delayed on " << soid
11673              << "; could not get rw_manager lock" << dendl;
11674     return 0;
11675   } else {
11676     dout(20) << "recovery got recovery read lock on " << soid
11677              << dendl;
11678   }
11679
11680   start_recovery_op(soid);
11681   assert(!recovering.count(soid));
11682   recovering.insert(make_pair(soid, obc));
11683
11684   /* We need this in case there is an in progress write on the object.  In fact,
11685    * the only possible write is an update to the xattr due to a lost_revert --
11686    * a client write would be blocked since the object is degraded.
11687    * In almost all cases, therefore, this lock should be uncontended.
11688    */
11689   obc->ondisk_read_lock();
11690   int r = pgbackend->recover_object(
11691     soid,
11692     v,
11693     ObjectContextRef(),
11694     obc, // has snapset context
11695     h);
11696   obc->ondisk_read_unlock();
11697   if (r < 0) {
11698     dout(0) << __func__ << " Error " << r << " on oid " << soid << dendl;
11699     primary_failed(soid);
11700     primary_error(soid, v);
11701     return 0;
11702   }
11703   return 1;
11704 }
11705
11706 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11707 {
11708   dout(10) << __func__ << "(" << max << ")" << dendl;
11709   uint64_t started = 0;
11710
11711   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11712
11713   // this is FAR from an optimal recovery order.  pretty lame, really.
11714   assert(!actingbackfill.empty());
11715   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11716        i != actingbackfill.end();
11717        ++i) {
11718     if (*i == get_primary()) continue;
11719     pg_shard_t peer = *i;
11720     map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11721     assert(pm != peer_missing.end());
11722     map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11723     assert(pi != peer_info.end());
11724     size_t m_sz = pm->second.num_missing();
11725
11726     dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11727     dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11728
11729     // oldest first!
11730     const pg_missing_t &m(pm->second);
11731     for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11732          p != m.get_rmissing().end() && started < max;
11733            ++p) {
11734       handle.reset_tp_timeout();
11735       const hobject_t soid(p->second);
11736
11737       if (missing_loc.is_unfound(soid)) {
11738         dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11739         continue;
11740       }
11741
11742       if (soid > pi->second.last_backfill) {
11743         if (!recovering.count(soid)) {
11744           derr << __func__ << ": object " << soid << " last_backfill " << pi->second.last_backfill << dendl;
11745           derr << __func__ << ": object added to missing set for backfill, but "
11746                << "is not in recovering, error!" << dendl;
11747           ceph_abort();
11748         }
11749         continue;
11750       }
11751
11752       if (recovering.count(soid)) {
11753         dout(10) << __func__ << ": already recovering " << soid << dendl;
11754         continue;
11755       }
11756
11757       if (missing_loc.is_deleted(soid)) {
11758         dout(10) << __func__ << ": " << soid << " is a delete, removing" << dendl;
11759         map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11760         started += prep_object_replica_deletes(soid, r->second.need, h);
11761         continue;
11762       }
11763
11764       if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11765         dout(10) << __func__ << ": " << soid.get_head()
11766                  << " still missing on primary" << dendl;
11767         continue;
11768       }
11769
11770       if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11771         dout(10) << __func__ << ": " << soid.get_snapdir()
11772                  << " still missing on primary" << dendl;
11773         continue;
11774       }
11775
11776       if (pg_log.get_missing().is_missing(soid)) {
11777         dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11778         continue;
11779       }
11780
11781       dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11782       map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11783       started += prep_object_replica_pushes(soid, r->second.need,
11784                                             h);
11785     }
11786   }
11787
11788   pgbackend->run_recovery_op(h, get_recovery_op_priority());
11789   return started;
11790 }
11791
11792 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11793 {
11794   hobject_t e = hobject_t::get_max();
11795   for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11796        i != backfill_targets.end();
11797        ++i) {
11798     pg_shard_t peer = *i;
11799     map<pg_shard_t, BackfillInterval>::const_iterator iter =
11800       peer_backfill_info.find(peer);
11801     assert(iter != peer_backfill_info.end());
11802     if (iter->second.begin < e)
11803       e = iter->second.begin;
11804   }
11805   return e;
11806 }
11807
11808 bool PrimaryLogPG::all_peer_done() const
11809 {
11810   // Primary hasn't got any more objects
11811   assert(backfill_info.empty());
11812
11813   for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11814        i != backfill_targets.end();
11815        ++i) {
11816     pg_shard_t bt = *i;
11817     map<pg_shard_t, BackfillInterval>::const_iterator piter =
11818       peer_backfill_info.find(bt);
11819     assert(piter != peer_backfill_info.end());
11820     const BackfillInterval& pbi = piter->second;
11821     // See if peer has more to process
11822     if (!pbi.extends_to_end() || !pbi.empty())
11823         return false;
11824   }
11825   return true;
11826 }
11827
11828 /**
11829  * recover_backfill
11830  *
11831  * Invariants:
11832  *
11833  * backfilled: fully pushed to replica or present in replica's missing set (both
11834  * our copy and theirs).
11835  *
11836  * All objects on a backfill_target in
11837  * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11838  * objects have been actually deleted and all logically-valid objects are replicated.
11839  * There may be PG objects in this interval yet to be backfilled.
11840  *
11841  * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11842  * backfill_targets.  There may be objects on backfill_target(s) yet to be deleted.
11843  *
11844  * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11845  *     backfill_info.begin) in PG are backfilled.  No deleted objects in this
11846  * interval remain on the backfill target.
11847  *
11848  * For a backfill target, all objects <= peer_info[target].last_backfill
11849  * have been backfilled to target
11850  *
11851  * There *MAY* be missing/outdated objects between last_backfill_started and
11852  * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11853  * io created objects since the last scan.  For this reason, we call
11854  * update_range() again before continuing backfill.
11855  */
11856 uint64_t PrimaryLogPG::recover_backfill(
11857   uint64_t max,
11858   ThreadPool::TPHandle &handle, bool *work_started)
11859 {
11860   dout(10) << "recover_backfill (" << max << ")"
11861            << " bft=" << backfill_targets
11862            << " last_backfill_started " << last_backfill_started
11863            << (new_backfill ? " new_backfill":"")
11864            << dendl;
11865   assert(!backfill_targets.empty());
11866
11867   // Initialize from prior backfill state
11868   if (new_backfill) {
11869     // on_activate() was called prior to getting here
11870     assert(last_backfill_started == earliest_backfill());
11871     new_backfill = false;
11872
11873     // initialize BackfillIntervals
11874     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11875          i != backfill_targets.end();
11876          ++i) {
11877       peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11878     }
11879     backfill_info.reset(last_backfill_started);
11880
11881     backfills_in_flight.clear();
11882     pending_backfill_updates.clear();
11883   }
11884
11885   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11886        i != backfill_targets.end();
11887        ++i) {
11888     dout(10) << "peer osd." << *i
11889            << " info " << peer_info[*i]
11890            << " interval " << peer_backfill_info[*i].begin
11891            << "-" << peer_backfill_info[*i].end
11892            << " " << peer_backfill_info[*i].objects.size() << " objects"
11893            << dendl;
11894   }
11895
11896   // update our local interval to cope with recent changes
11897   backfill_info.begin = last_backfill_started;
11898   update_range(&backfill_info, handle);
11899
11900   unsigned ops = 0;
11901   vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11902   set<hobject_t> add_to_stat;
11903
11904   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11905        i != backfill_targets.end();
11906        ++i) {
11907     peer_backfill_info[*i].trim_to(
11908       std::max(peer_info[*i].last_backfill, last_backfill_started));
11909   }
11910   backfill_info.trim_to(last_backfill_started);
11911
11912   PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11913   while (ops < max) {
11914     if (backfill_info.begin <= earliest_peer_backfill() &&
11915         !backfill_info.extends_to_end() && backfill_info.empty()) {
11916       hobject_t next = backfill_info.end;
11917       backfill_info.reset(next);
11918       backfill_info.end = hobject_t::get_max();
11919       update_range(&backfill_info, handle);
11920       backfill_info.trim();
11921     }
11922
11923     dout(20) << "   my backfill interval " << backfill_info << dendl;
11924
11925     bool sent_scan = false;
11926     for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11927          i != backfill_targets.end();
11928          ++i) {
11929       pg_shard_t bt = *i;
11930       BackfillInterval& pbi = peer_backfill_info[bt];
11931
11932       dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11933       if (pbi.begin <= backfill_info.begin &&
11934           !pbi.extends_to_end() && pbi.empty()) {
11935         dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11936         epoch_t e = get_osdmap()->get_epoch();
11937         MOSDPGScan *m = new MOSDPGScan(
11938           MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11939           spg_t(info.pgid.pgid, bt.shard),
11940           pbi.end, hobject_t());
11941         osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11942         assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11943         waiting_on_backfill.insert(bt);
11944         sent_scan = true;
11945       }
11946     }
11947
11948     // Count simultaneous scans as a single op and let those complete
11949     if (sent_scan) {
11950       ops++;
11951       start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11952       break;
11953     }
11954
11955     if (backfill_info.empty() && all_peer_done()) {
11956       dout(10) << " reached end for both local and all peers" << dendl;
11957       break;
11958     }
11959
11960     // Get object within set of peers to operate on and
11961     // the set of targets for which that object applies.
11962     hobject_t check = earliest_peer_backfill();
11963
11964     if (check < backfill_info.begin) {
11965
11966       set<pg_shard_t> check_targets;
11967       for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11968            i != backfill_targets.end();
11969            ++i) {
11970         pg_shard_t bt = *i;
11971         BackfillInterval& pbi = peer_backfill_info[bt];
11972         if (pbi.begin == check)
11973           check_targets.insert(bt);
11974       }
11975       assert(!check_targets.empty());
11976
11977       dout(20) << " BACKFILL removing " << check
11978                << " from peers " << check_targets << dendl;
11979       for (set<pg_shard_t>::iterator i = check_targets.begin();
11980            i != check_targets.end();
11981            ++i) {
11982         pg_shard_t bt = *i;
11983         BackfillInterval& pbi = peer_backfill_info[bt];
11984         assert(pbi.begin == check);
11985
11986         to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
11987         pbi.pop_front();
11988       }
11989
11990       /* This requires a bit of explanation.  We compare head against
11991        * last_backfill to determine whether to send an operation
11992        * to the replica.  A single write operation can touch up to three
11993        * objects: head, the snapdir, and a new clone which sorts closer to
11994        * head than any existing clone.  If last_backfill points at a clone,
11995        * the transaction won't be sent and all 3 must lie on the right side
11996        * of the line (i.e., we'll backfill them later).  If last_backfill
11997        * points at snapdir, it sorts greater than head, so we send the
11998        * transaction which is correct because all three must lie to the left
11999        * of the line.
12000        *
12001        * If it points at head, we have a bit of an issue.  If head actually
12002        * exists, no problem, because any transaction which touches snapdir
12003        * must end up creating it (and deleting head), so sending the
12004        * operation won't pose a problem -- we'll end up having to scan it,
12005        * but it'll end up being the right version so we won't bother to
12006        * rebackfill it.  However, if head doesn't exist, any write on head
12007        * will remove snapdir.  For a replicated pool, this isn't a problem,
12008        * ENOENT on remove isn't an issue and it's in backfill future anyway.
12009        * It only poses a problem for EC pools, because we never just delete
12010        * an object, we rename it into a rollback object.  That operation
12011        * will end up crashing the osd with ENOENT.  Tolerating the failure
12012        * wouldn't work either, even if snapdir exists, we'd be creating a
12013        * rollback object past the last_backfill line which wouldn't get
12014        * cleaned up (no rollback objects past the last_backfill line is an
12015        * existing important invariant).  Thus, let's avoid the whole issue
12016        * by just not updating last_backfill_started here if head doesn't
12017        * exist and snapdir does.  We aren't using up a recovery count here,
12018        * so we're going to recover snapdir immediately anyway.  We'll only
12019        * fail "backward" if we fail to get the rw lock and that just means
12020        * we'll re-process this section of the hash space again.
12021        *
12022        * I'm choosing this hack here because the really "correct" answer is
12023        * going to be to unify snapdir and head into a single object (a
12024        * snapdir is really just a confusing way to talk about head existing
12025        * as a whiteout), but doing that is going to be a somewhat larger
12026        * undertaking.
12027        *
12028        * @see http://tracker.ceph.com/issues/17668
12029        */
12030       if (!(check.is_head() &&
12031             backfill_info.begin.is_snapdir() &&
12032             check == backfill_info.begin.get_head()))
12033         last_backfill_started = check;
12034
12035       // Don't increment ops here because deletions
12036       // are cheap and not replied to unlike real recovery_ops,
12037       // and we can't increment ops without requeueing ourself
12038       // for recovery.
12039     } else {
12040       eversion_t& obj_v = backfill_info.objects.begin()->second;
12041
12042       vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
12043       for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12044            i != backfill_targets.end();
12045            ++i) {
12046         pg_shard_t bt = *i;
12047         BackfillInterval& pbi = peer_backfill_info[bt];
12048         // Find all check peers that have the wrong version
12049         if (check == backfill_info.begin && check == pbi.begin) {
12050           if (pbi.objects.begin()->second != obj_v) {
12051             need_ver_targs.push_back(bt);
12052           } else {
12053             keep_ver_targs.push_back(bt);
12054           }
12055         } else {
12056           pg_info_t& pinfo = peer_info[bt];
12057
12058           // Only include peers that we've caught up to their backfill line
12059           // otherwise, they only appear to be missing this object
12060           // because their pbi.begin > backfill_info.begin.
12061           if (backfill_info.begin > pinfo.last_backfill)
12062             missing_targs.push_back(bt);
12063           else
12064             skip_targs.push_back(bt);
12065         }
12066       }
12067
12068       if (!keep_ver_targs.empty()) {
12069         // These peers have version obj_v
12070         dout(20) << " BACKFILL keeping " << check
12071                  << " with ver " << obj_v
12072                  << " on peers " << keep_ver_targs << dendl;
12073         //assert(!waiting_for_degraded_object.count(check));
12074       }
12075       if (!need_ver_targs.empty() || !missing_targs.empty()) {
12076         ObjectContextRef obc = get_object_context(backfill_info.begin, false);
12077         assert(obc);
12078         if (obc->get_recovery_read()) {
12079           if (!need_ver_targs.empty()) {
12080             dout(20) << " BACKFILL replacing " << check
12081                    << " with ver " << obj_v
12082                    << " to peers " << need_ver_targs << dendl;
12083           }
12084           if (!missing_targs.empty()) {
12085             dout(20) << " BACKFILL pushing " << backfill_info.begin
12086                  << " with ver " << obj_v
12087                  << " to peers " << missing_targs << dendl;
12088           }
12089           vector<pg_shard_t> all_push = need_ver_targs;
12090           all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
12091
12092           handle.reset_tp_timeout();
12093           int r = prep_backfill_object_push(backfill_info.begin, obj_v, obc, all_push, h);
12094           if (r < 0) {
12095             *work_started = true;
12096             dout(0) << __func__ << " Error " << r << " trying to backfill " << backfill_info.begin << dendl;
12097             break;
12098           }
12099           ops++;
12100         } else {
12101           *work_started = true;
12102           dout(20) << "backfill blocking on " << backfill_info.begin
12103                    << "; could not get rw_manager lock" << dendl;
12104           break;
12105         }
12106       }
12107       dout(20) << "need_ver_targs=" << need_ver_targs
12108                << " keep_ver_targs=" << keep_ver_targs << dendl;
12109       dout(20) << "backfill_targets=" << backfill_targets
12110                << " missing_targs=" << missing_targs
12111                << " skip_targs=" << skip_targs << dendl;
12112
12113       last_backfill_started = backfill_info.begin;
12114       add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
12115       backfill_info.pop_front();
12116       vector<pg_shard_t> check_targets = need_ver_targs;
12117       check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
12118       for (vector<pg_shard_t>::iterator i = check_targets.begin();
12119            i != check_targets.end();
12120            ++i) {
12121         pg_shard_t bt = *i;
12122         BackfillInterval& pbi = peer_backfill_info[bt];
12123         pbi.pop_front();
12124       }
12125     }
12126   }
12127
12128   hobject_t backfill_pos =
12129     std::min(backfill_info.begin, earliest_peer_backfill());
12130
12131   for (set<hobject_t>::iterator i = add_to_stat.begin();
12132        i != add_to_stat.end();
12133        ++i) {
12134     ObjectContextRef obc = get_object_context(*i, false);
12135     assert(obc);
12136     pg_stat_t stat;
12137     add_object_context_to_pg_stat(obc, &stat);
12138     pending_backfill_updates[*i] = stat;
12139   }
12140   if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
12141     map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
12142     for (unsigned i = 0; i < to_remove.size(); ++i) {
12143       handle.reset_tp_timeout();
12144       const hobject_t& oid = to_remove[i].get<0>();
12145       eversion_t v = to_remove[i].get<1>();
12146       pg_shard_t peer = to_remove[i].get<2>();
12147       MOSDPGBackfillRemove *m;
12148       auto it = reqs.find(peer);
12149       if (it != reqs.end()) {
12150         m = it->second;
12151       } else {
12152         m = reqs[peer] = new MOSDPGBackfillRemove(
12153           spg_t(info.pgid.pgid, peer.shard),
12154           get_osdmap()->get_epoch());
12155       }
12156       m->ls.push_back(make_pair(oid, v));
12157
12158       if (oid <= last_backfill_started)
12159         pending_backfill_updates[oid]; // add empty stat!
12160     }
12161     for (auto p : reqs) {
12162       osd->send_message_osd_cluster(p.first.osd, p.second,
12163                                     get_osdmap()->get_epoch());
12164     }
12165   } else {
12166     // for jewel targets
12167     for (unsigned i = 0; i < to_remove.size(); ++i) {
12168       handle.reset_tp_timeout();
12169
12170       // ordered before any subsequent updates
12171       send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
12172                      to_remove[i].get<2>());
12173
12174       if (to_remove[i].get<0>() <= last_backfill_started)
12175         pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
12176     }
12177   }
12178
12179   pgbackend->run_recovery_op(h, get_recovery_op_priority());
12180
12181   dout(5) << "backfill_pos is " << backfill_pos << dendl;
12182   for (set<hobject_t>::iterator i = backfills_in_flight.begin();
12183        i != backfills_in_flight.end();
12184        ++i) {
12185     dout(20) << *i << " is still in flight" << dendl;
12186   }
12187
12188   hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
12189     backfill_pos : *(backfills_in_flight.begin());
12190   hobject_t new_last_backfill = earliest_backfill();
12191   dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
12192   for (map<hobject_t, pg_stat_t>::iterator i =
12193          pending_backfill_updates.begin();
12194        i != pending_backfill_updates.end() &&
12195          i->first < next_backfill_to_complete;
12196        pending_backfill_updates.erase(i++)) {
12197     dout(20) << " pending_backfill_update " << i->first << dendl;
12198     assert(i->first > new_last_backfill);
12199     for (set<pg_shard_t>::iterator j = backfill_targets.begin();
12200          j != backfill_targets.end();
12201          ++j) {
12202       pg_shard_t bt = *j;
12203       pg_info_t& pinfo = peer_info[bt];
12204       //Add stats to all peers that were missing object
12205       if (i->first > pinfo.last_backfill)
12206         pinfo.stats.add(i->second);
12207     }
12208     new_last_backfill = i->first;
12209   }
12210   dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
12211
12212   assert(!pending_backfill_updates.empty() ||
12213          new_last_backfill == last_backfill_started);
12214   if (pending_backfill_updates.empty() &&
12215       backfill_pos.is_max()) {
12216     assert(backfills_in_flight.empty());
12217     new_last_backfill = backfill_pos;
12218     last_backfill_started = backfill_pos;
12219   }
12220   dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
12221
12222   // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
12223   // all the backfill targets.  Otherwise, we will move last_backfill up on
12224   // those targets need it and send OP_BACKFILL_PROGRESS to them.
12225   for (set<pg_shard_t>::iterator i = backfill_targets.begin();
12226        i != backfill_targets.end();
12227        ++i) {
12228     pg_shard_t bt = *i;
12229     pg_info_t& pinfo = peer_info[bt];
12230
12231     if (new_last_backfill > pinfo.last_backfill) {
12232       pinfo.set_last_backfill(new_last_backfill);
12233       epoch_t e = get_osdmap()->get_epoch();
12234       MOSDPGBackfill *m = NULL;
12235       if (pinfo.last_backfill.is_max()) {
12236         m = new MOSDPGBackfill(
12237           MOSDPGBackfill::OP_BACKFILL_FINISH,
12238           e,
12239           last_peering_reset,
12240           spg_t(info.pgid.pgid, bt.shard));
12241         // Use default priority here, must match sub_op priority
12242         /* pinfo.stats might be wrong if we did log-based recovery on the
12243          * backfilled portion in addition to continuing backfill.
12244          */
12245         pinfo.stats = info.stats;
12246         start_recovery_op(hobject_t::get_max());
12247       } else {
12248         m = new MOSDPGBackfill(
12249           MOSDPGBackfill::OP_BACKFILL_PROGRESS,
12250           e,
12251           last_peering_reset,
12252           spg_t(info.pgid.pgid, bt.shard));
12253         // Use default priority here, must match sub_op priority
12254       }
12255       m->last_backfill = pinfo.last_backfill;
12256       m->stats = pinfo.stats;
12257       osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
12258       dout(10) << " peer " << bt
12259                << " num_objects now " << pinfo.stats.stats.sum.num_objects
12260                << " / " << info.stats.stats.sum.num_objects << dendl;
12261     }
12262   }
12263
12264   if (ops)
12265     *work_started = true;
12266   return ops;
12267 }
12268
12269 int PrimaryLogPG::prep_backfill_object_push(
12270   hobject_t oid, eversion_t v,
12271   ObjectContextRef obc,
12272   vector<pg_shard_t> peers,
12273   PGBackend::RecoveryHandle *h)
12274 {
12275   dout(10) << __func__ << " " << oid << " v " << v << " to peers " << peers << dendl;
12276   assert(!peers.empty());
12277
12278   backfills_in_flight.insert(oid);
12279   for (unsigned int i = 0 ; i < peers.size(); ++i) {
12280     map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
12281     assert(bpm != peer_missing.end());
12282     bpm->second.add(oid, eversion_t(), eversion_t(), false);
12283   }
12284
12285   assert(!recovering.count(oid));
12286
12287   start_recovery_op(oid);
12288   recovering.insert(make_pair(oid, obc));
12289
12290   // We need to take the read_lock here in order to flush in-progress writes
12291   obc->ondisk_read_lock();
12292   int r = pgbackend->recover_object(
12293     oid,
12294     v,
12295     ObjectContextRef(),
12296     obc,
12297     h);
12298   obc->ondisk_read_unlock();
12299   if (r < 0) {
12300     dout(0) << __func__ << " Error " << r << " on oid " << oid << dendl;
12301     primary_failed(oid);
12302     primary_error(oid, v);
12303     backfills_in_flight.erase(oid);
12304     missing_loc.add_missing(oid, v, eversion_t());
12305   }
12306   return r;
12307 }
12308
12309 void PrimaryLogPG::update_range(
12310   BackfillInterval *bi,
12311   ThreadPool::TPHandle &handle)
12312 {
12313   int local_min = cct->_conf->osd_backfill_scan_min;
12314   int local_max = cct->_conf->osd_backfill_scan_max;
12315
12316   if (bi->version < info.log_tail) {
12317     dout(10) << __func__<< ": bi is old, rescanning local backfill_info"
12318              << dendl;
12319     if (last_update_applied >= info.log_tail) {
12320       bi->version = last_update_applied;
12321     } else {
12322       osr->flush();
12323       bi->version = info.last_update;
12324     }
12325     scan_range(local_min, local_max, bi, handle);
12326   }
12327
12328   if (bi->version >= projected_last_update) {
12329     dout(10) << __func__<< ": bi is current " << dendl;
12330     assert(bi->version == projected_last_update);
12331   } else if (bi->version >= info.log_tail) {
12332     if (pg_log.get_log().empty() && projected_log.empty()) {
12333       /* Because we don't move log_tail on split, the log might be
12334        * empty even if log_tail != last_update.  However, the only
12335        * way to get here with an empty log is if log_tail is actually
12336        * eversion_t(), because otherwise the entry which changed
12337        * last_update since the last scan would have to be present.
12338        */
12339       assert(bi->version == eversion_t());
12340       return;
12341     }
12342
12343     dout(10) << __func__<< ": bi is old, (" << bi->version
12344              << ") can be updated with log to projected_last_update "
12345              << projected_last_update << dendl;
12346
12347     auto func = [&](const pg_log_entry_t &e) {
12348       dout(10) << __func__ << ": updating from version " << e.version
12349                << dendl;
12350       const hobject_t &soid = e.soid;
12351       if (soid >= bi->begin &&
12352           soid < bi->end) {
12353         if (e.is_update()) {
12354           dout(10) << __func__ << ": " << e.soid << " updated to version "
12355                    << e.version << dendl;
12356           bi->objects.erase(e.soid);
12357           bi->objects.insert(
12358             make_pair(
12359               e.soid,
12360               e.version));
12361         } else if (e.is_delete()) {
12362           dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
12363           bi->objects.erase(e.soid);
12364         }
12365       }
12366     };
12367     dout(10) << "scanning pg log first" << dendl;
12368     pg_log.get_log().scan_log_after(bi->version, func);
12369     dout(10) << "scanning projected log" << dendl;
12370     projected_log.scan_log_after(bi->version, func);
12371     bi->version = projected_last_update;
12372   } else {
12373     assert(0 == "scan_range should have raised bi->version past log_tail");
12374   }
12375 }
12376
12377 void PrimaryLogPG::scan_range(
12378   int min, int max, BackfillInterval *bi,
12379   ThreadPool::TPHandle &handle)
12380 {
12381   assert(is_locked());
12382   dout(10) << "scan_range from " << bi->begin << dendl;
12383   bi->clear_objects();
12384
12385   vector<hobject_t> ls;
12386   ls.reserve(max);
12387   int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12388   assert(r >= 0);
12389   dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12390   dout(20) << ls << dendl;
12391
12392   for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12393     handle.reset_tp_timeout();
12394     ObjectContextRef obc;
12395     if (is_primary())
12396       obc = object_contexts.lookup(*p);
12397     if (obc) {
12398       bi->objects[*p] = obc->obs.oi.version;
12399       dout(20) << "  " << *p << " " << obc->obs.oi.version << dendl;
12400     } else {
12401       bufferlist bl;
12402       int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12403
12404       /* If the object does not exist here, it must have been removed
12405          * between the collection_list_partial and here.  This can happen
12406          * for the first item in the range, which is usually last_backfill.
12407          */
12408       if (r == -ENOENT)
12409         continue;
12410
12411       assert(r >= 0);
12412       object_info_t oi(bl);
12413       bi->objects[*p] = oi.version;
12414       dout(20) << "  " << *p << " " << oi.version << dendl;
12415     }
12416   }
12417 }
12418
12419
12420 /** check_local
12421  *
12422  * verifies that stray objects have been deleted
12423  */
12424 void PrimaryLogPG::check_local()
12425 {
12426   dout(10) << __func__ << dendl;
12427
12428   assert(info.last_update >= pg_log.get_tail());  // otherwise we need some help!
12429
12430   if (!cct->_conf->osd_debug_verify_stray_on_activate)
12431     return;
12432
12433   // just scan the log.
12434   set<hobject_t> did;
12435   for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12436        p != pg_log.get_log().log.rend();
12437        ++p) {
12438     if (did.count(p->soid))
12439       continue;
12440     did.insert(p->soid);
12441
12442     if (p->is_delete() && !is_missing_object(p->soid)) {
12443       dout(10) << " checking " << p->soid
12444                << " at " << p->version << dendl;
12445       struct stat st;
12446       int r = osd->store->stat(
12447         ch,
12448         ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12449         &st);
12450       if (r != -ENOENT) {
12451         derr << __func__ << " " << p->soid << " exists, but should have been "
12452              << "deleted" << dendl;
12453         assert(0 == "erroneously present object");
12454       }
12455     } else {
12456       // ignore old(+missing) objects
12457     }
12458   }
12459 }
12460
12461
12462
12463 // ===========================
12464 // hit sets
12465
12466 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12467 {
12468   ostringstream ss;
12469   ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12470   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12471                  info.pgid.ps(), info.pgid.pool(),
12472                  cct->_conf->osd_hit_set_namespace);
12473   dout(20) << __func__ << " " << hoid << dendl;
12474   return hoid;
12475 }
12476
12477 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12478                                                    utime_t end,
12479                                                    bool using_gmt)
12480 {
12481   ostringstream ss;
12482   ss << "hit_set_" << info.pgid.pgid << "_archive_";
12483   if (using_gmt) {
12484     start.gmtime(ss) << "_";
12485     end.gmtime(ss);
12486   } else {
12487     start.localtime(ss) << "_";
12488     end.localtime(ss);
12489   }
12490   hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12491                  info.pgid.ps(), info.pgid.pool(),
12492                  cct->_conf->osd_hit_set_namespace);
12493   dout(20) << __func__ << " " << hoid << dendl;
12494   return hoid;
12495 }
12496
12497 void PrimaryLogPG::hit_set_clear()
12498 {
12499   dout(20) << __func__ << dendl;
12500   hit_set.reset();
12501   hit_set_start_stamp = utime_t();
12502 }
12503
12504 void PrimaryLogPG::hit_set_setup()
12505 {
12506   if (!is_active() ||
12507       !is_primary()) {
12508     hit_set_clear();
12509     return;
12510   }
12511
12512   if (is_active() && is_primary() &&
12513       (!pool.info.hit_set_count ||
12514        !pool.info.hit_set_period ||
12515        pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12516     hit_set_clear();
12517
12518     // only primary is allowed to remove all the hit set objects
12519     hit_set_remove_all();
12520     return;
12521   }
12522
12523   // FIXME: discard any previous data for now
12524   hit_set_create();
12525
12526   // include any writes we know about from the pg log.  this doesn't
12527   // capture reads, but it is better than nothing!
12528   hit_set_apply_log();
12529 }
12530
12531 void PrimaryLogPG::hit_set_remove_all()
12532 {
12533   // If any archives are degraded we skip this
12534   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12535        p != info.hit_set.history.end();
12536        ++p) {
12537     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12538
12539     // Once we hit a degraded object just skip
12540     if (is_degraded_or_backfilling_object(aoid))
12541       return;
12542     if (scrubber.write_blocked_by_scrub(aoid))
12543       return;
12544   }
12545
12546   if (!info.hit_set.history.empty()) {
12547     list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12548     assert(p != info.hit_set.history.rend());
12549     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12550     assert(!is_degraded_or_backfilling_object(oid));
12551     ObjectContextRef obc = get_object_context(oid, false);
12552     assert(obc);
12553
12554     OpContextUPtr ctx = simple_opc_create(obc);
12555     ctx->at_version = get_next_version();
12556     ctx->updated_hset_history = info.hit_set;
12557     utime_t now = ceph_clock_now();
12558     ctx->mtime = now;
12559     hit_set_trim(ctx, 0);
12560     simple_opc_submit(std::move(ctx));
12561   }
12562
12563   info.hit_set = pg_hit_set_history_t();
12564   if (agent_state) {
12565     agent_state->discard_hit_sets();
12566   }
12567 }
12568
12569 void PrimaryLogPG::hit_set_create()
12570 {
12571   utime_t now = ceph_clock_now();
12572   // make a copy of the params to modify
12573   HitSet::Params params(pool.info.hit_set_params);
12574
12575   dout(20) << __func__ << " " << params << dendl;
12576   if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12577     BloomHitSet::Params *p =
12578       static_cast<BloomHitSet::Params*>(params.impl.get());
12579
12580     // convert false positive rate so it holds up across the full period
12581     p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12582     if (p->get_fpp() <= 0.0)
12583       p->set_fpp(.01);  // fpp cannot be zero!
12584
12585     // if we don't have specified size, estimate target size based on the
12586     // previous bin!
12587     if (p->target_size == 0 && hit_set) {
12588       utime_t dur = now - hit_set_start_stamp;
12589       unsigned unique = hit_set->approx_unique_insert_count();
12590       dout(20) << __func__ << " previous set had approx " << unique
12591                << " unique items over " << dur << " seconds" << dendl;
12592       p->target_size = (double)unique * (double)pool.info.hit_set_period
12593                      / (double)dur;
12594     }
12595     if (p->target_size <
12596         static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12597       p->target_size = cct->_conf->osd_hit_set_min_size;
12598
12599     if (p->target_size
12600         > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12601       p->target_size = cct->_conf->osd_hit_set_max_size;
12602
12603     p->seed = now.sec();
12604
12605     dout(10) << __func__ << " target_size " << p->target_size
12606              << " fpp " << p->get_fpp() << dendl;
12607   }
12608   hit_set.reset(new HitSet(params));
12609   hit_set_start_stamp = now;
12610 }
12611
12612 /**
12613  * apply log entries to set
12614  *
12615  * this would only happen after peering, to at least capture writes
12616  * during an interval that was potentially lost.
12617  */
12618 bool PrimaryLogPG::hit_set_apply_log()
12619 {
12620   if (!hit_set)
12621     return false;
12622
12623   eversion_t to = info.last_update;
12624   eversion_t from = info.hit_set.current_last_update;
12625   if (to <= from) {
12626     dout(20) << __func__ << " no update" << dendl;
12627     return false;
12628   }
12629
12630   dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
12631   list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12632   while (p != pg_log.get_log().log.rend() && p->version > to)
12633     ++p;
12634   while (p != pg_log.get_log().log.rend() && p->version > from) {
12635     hit_set->insert(p->soid);
12636     ++p;
12637   }
12638
12639   return true;
12640 }
12641
12642 void PrimaryLogPG::hit_set_persist()
12643 {
12644   dout(10) << __func__  << dendl;
12645   bufferlist bl;
12646   unsigned max = pool.info.hit_set_count;
12647
12648   utime_t now = ceph_clock_now();
12649   hobject_t oid;
12650
12651   // If any archives are degraded we skip this persist request
12652   // account for the additional entry being added below
12653   for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12654        p != info.hit_set.history.end();
12655        ++p) {
12656     hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12657
12658     // Once we hit a degraded object just skip further trim
12659     if (is_degraded_or_backfilling_object(aoid))
12660       return;
12661     if (scrubber.write_blocked_by_scrub(aoid))
12662       return;
12663   }
12664
12665   // If backfill is in progress and we could possibly overlap with the
12666   // hit_set_* objects, back off.  Since these all have
12667   // hobject_t::hash set to pgid.ps(), and those sort first, we can
12668   // look just at that.  This is necessary because our transactions
12669   // may include a modify of the new hit_set *and* a delete of the
12670   // old one, and this may span the backfill boundary.
12671   for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12672        p != backfill_targets.end();
12673        ++p) {
12674     assert(peer_info.count(*p));
12675     const pg_info_t& pi = peer_info[*p];
12676     if (pi.last_backfill == hobject_t() ||
12677         pi.last_backfill.get_hash() == info.pgid.ps()) {
12678       dout(10) << __func__ << " backfill target osd." << *p
12679                << " last_backfill has not progressed past pgid ps"
12680                << dendl;
12681       return;
12682     }
12683   }
12684
12685
12686   pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12687   new_hset.begin = hit_set_start_stamp;
12688   new_hset.end = now;
12689   oid = get_hit_set_archive_object(
12690     new_hset.begin,
12691     new_hset.end,
12692     new_hset.using_gmt);
12693
12694   // If the current object is degraded we skip this persist request
12695   if (scrubber.write_blocked_by_scrub(oid))
12696     return;
12697
12698   hit_set->seal();
12699   ::encode(*hit_set, bl);
12700   dout(20) << __func__ << " archive " << oid << dendl;
12701
12702   if (agent_state) {
12703     agent_state->add_hit_set(new_hset.begin, hit_set);
12704     uint32_t size = agent_state->hit_set_map.size();
12705     if (size >= pool.info.hit_set_count) {
12706       size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12707     }
12708     hit_set_in_memory_trim(size);
12709   }
12710
12711   ObjectContextRef obc = get_object_context(oid, true);
12712   OpContextUPtr ctx = simple_opc_create(obc);
12713
12714   ctx->at_version = get_next_version();
12715   ctx->updated_hset_history = info.hit_set;
12716   pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12717
12718   updated_hit_set_hist.current_last_update = info.last_update;
12719   new_hset.version = ctx->at_version;
12720
12721   updated_hit_set_hist.history.push_back(new_hset);
12722   hit_set_create();
12723
12724   // fabricate an object_info_t and SnapSet
12725   obc->obs.oi.version = ctx->at_version;
12726   obc->obs.oi.mtime = now;
12727   obc->obs.oi.size = bl.length();
12728   obc->obs.exists = true;
12729   obc->obs.oi.set_data_digest(bl.crc32c(-1));
12730
12731   ctx->new_obs = obc->obs;
12732
12733   obc->ssc->snapset.head_exists = true;
12734   ctx->new_snapset = obc->ssc->snapset;
12735
12736   ctx->delta_stats.num_objects++;
12737   ctx->delta_stats.num_objects_hit_set_archive++;
12738   ctx->delta_stats.num_bytes += bl.length();
12739   ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12740
12741   bufferlist bss;
12742   ::encode(ctx->new_snapset, bss);
12743   bufferlist boi(sizeof(ctx->new_obs.oi));
12744   ::encode(ctx->new_obs.oi, boi,
12745            get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12746
12747   ctx->op_t->create(oid);
12748   if (bl.length()) {
12749     ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12750   }
12751   map <string, bufferlist> attrs;
12752   attrs[OI_ATTR].claim(boi);
12753   attrs[SS_ATTR].claim(bss);
12754   setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12755   ctx->log.push_back(
12756     pg_log_entry_t(
12757       pg_log_entry_t::MODIFY,
12758       oid,
12759       ctx->at_version,
12760       eversion_t(),
12761       0,
12762       osd_reqid_t(),
12763       ctx->mtime,
12764       0)
12765     );
12766
12767   hit_set_trim(ctx, max);
12768
12769   simple_opc_submit(std::move(ctx));
12770 }
12771
12772 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12773 {
12774   assert(ctx->updated_hset_history);
12775   pg_hit_set_history_t &updated_hit_set_hist =
12776     *(ctx->updated_hset_history);
12777   for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12778     list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12779     assert(p != updated_hit_set_hist.history.end());
12780     hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12781
12782     assert(!is_degraded_or_backfilling_object(oid));
12783
12784     dout(20) << __func__ << " removing " << oid << dendl;
12785     ++ctx->at_version.version;
12786     ctx->log.push_back(
12787         pg_log_entry_t(pg_log_entry_t::DELETE,
12788                        oid,
12789                        ctx->at_version,
12790                        p->version,
12791                        0,
12792                        osd_reqid_t(),
12793                        ctx->mtime,
12794                        0));
12795
12796     ctx->op_t->remove(oid);
12797     updated_hit_set_hist.history.pop_front();
12798
12799     ObjectContextRef obc = get_object_context(oid, false);
12800     assert(obc);
12801     --ctx->delta_stats.num_objects;
12802     --ctx->delta_stats.num_objects_hit_set_archive;
12803     ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12804     ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12805   }
12806 }
12807
12808 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12809 {
12810   while (agent_state->hit_set_map.size() > max_in_memory) {
12811     agent_state->remove_oldest_hit_set();
12812   }
12813 }
12814
12815
12816 // =======================================
12817 // cache agent
12818
12819 void PrimaryLogPG::agent_setup()
12820 {
12821   assert(is_locked());
12822   if (!is_active() ||
12823       !is_primary() ||
12824       pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12825       pool.info.tier_of < 0 ||
12826       !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12827     agent_clear();
12828     return;
12829   }
12830   if (!agent_state) {
12831     agent_state.reset(new TierAgentState);
12832
12833     // choose random starting position
12834     agent_state->position = hobject_t();
12835     agent_state->position.pool = info.pgid.pool();
12836     agent_state->position.set_hash(pool.info.get_random_pg_position(
12837       info.pgid.pgid,
12838       rand()));
12839     agent_state->start = agent_state->position;
12840
12841     dout(10) << __func__ << " allocated new state, position "
12842              << agent_state->position << dendl;
12843   } else {
12844     dout(10) << __func__ << " keeping existing state" << dendl;
12845   }
12846
12847   if (info.stats.stats_invalid) {
12848     osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12849   }
12850
12851   agent_choose_mode();
12852 }
12853
12854 void PrimaryLogPG::agent_clear()
12855 {
12856   agent_stop();
12857   agent_state.reset(NULL);
12858 }
12859
12860 // Return false if no objects operated on since start of object hash space
12861 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12862 {
12863   lock();
12864   if (!agent_state) {
12865     dout(10) << __func__ << " no agent state, stopping" << dendl;
12866     unlock();
12867     return true;
12868   }
12869
12870   assert(!deleting);
12871
12872   if (agent_state->is_idle()) {
12873     dout(10) << __func__ << " idle, stopping" << dendl;
12874     unlock();
12875     return true;
12876   }
12877
12878   osd->logger->inc(l_osd_agent_wake);
12879
12880   dout(10) << __func__
12881            << " max " << start_max
12882            << ", flush " << agent_state->get_flush_mode_name()
12883            << ", evict " << agent_state->get_evict_mode_name()
12884            << ", pos " << agent_state->position
12885            << dendl;
12886   assert(is_primary());
12887   assert(is_active());
12888
12889   agent_load_hit_sets();
12890
12891   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12892   assert(base_pool);
12893
12894   int ls_min = 1;
12895   int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12896
12897   // list some objects.  this conveniently lists clones (oldest to
12898   // newest) before heads... the same order we want to flush in.
12899   //
12900   // NOTE: do not flush the Sequencer.  we will assume that the
12901   // listing we get back is imprecise.
12902   vector<hobject_t> ls;
12903   hobject_t next;
12904   int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12905                                           &ls, &next);
12906   assert(r >= 0);
12907   dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12908   int started = 0;
12909   for (vector<hobject_t>::iterator p = ls.begin();
12910        p != ls.end();
12911        ++p) {
12912     if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12913       dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12914       osd->logger->inc(l_osd_agent_skip);
12915       continue;
12916     }
12917     if (is_degraded_or_backfilling_object(*p)) {
12918       dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12919       osd->logger->inc(l_osd_agent_skip);
12920       continue;
12921     }
12922     if (is_missing_object(p->get_head())) {
12923       dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12924       osd->logger->inc(l_osd_agent_skip);
12925       continue;
12926     }
12927     ObjectContextRef obc = get_object_context(*p, false, NULL);
12928     if (!obc) {
12929       // we didn't flush; we may miss something here.
12930       dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12931       osd->logger->inc(l_osd_agent_skip);
12932       continue;
12933     }
12934     if (!obc->obs.exists) {
12935       dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12936       osd->logger->inc(l_osd_agent_skip);
12937       continue;
12938     }
12939     if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12940       dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12941       osd->logger->inc(l_osd_agent_skip);
12942       continue;
12943     }
12944     if (obc->is_blocked()) {
12945       dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12946       osd->logger->inc(l_osd_agent_skip);
12947       continue;
12948     }
12949     if (obc->is_request_pending()) {
12950       dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
12951       osd->logger->inc(l_osd_agent_skip);
12952       continue;
12953     }
12954
12955     // be careful flushing omap to an EC pool.
12956     if (!base_pool->supports_omap() &&
12957         obc->obs.oi.is_omap()) {
12958       dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
12959       osd->logger->inc(l_osd_agent_skip);
12960       continue;
12961     }
12962
12963     if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
12964         agent_maybe_evict(obc, false))
12965       ++started;
12966     else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
12967              agent_flush_quota > 0 && agent_maybe_flush(obc)) {
12968       ++started;
12969       --agent_flush_quota;
12970     }
12971     if (started >= start_max) {
12972       // If finishing early, set "next" to the next object
12973       if (++p != ls.end())
12974         next = *p;
12975       break;
12976     }
12977   }
12978
12979   if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
12980     dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
12981     agent_state->hist_age = 0;
12982     agent_state->temp_hist.decay();
12983   }
12984
12985   // Total objects operated on so far
12986   int total_started = agent_state->started + started;
12987   bool need_delay = false;
12988
12989   dout(20) << __func__ << " start pos " << agent_state->position
12990     << " next start pos " << next
12991     << " started " << total_started << dendl;
12992
12993   // See if we've made a full pass over the object hash space
12994   // This might check at most ls_max objects a second time to notice that
12995   // we've checked every objects at least once.
12996   if (agent_state->position < agent_state->start &&
12997       next >= agent_state->start) {
12998     dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
12999     if (total_started == 0)
13000       need_delay = true;
13001     else
13002       total_started = 0;
13003     agent_state->start = next;
13004   }
13005   agent_state->started = total_started;
13006
13007   // See if we are starting from beginning
13008   if (next.is_max())
13009     agent_state->position = hobject_t();
13010   else
13011     agent_state->position = next;
13012
13013   // Discard old in memory HitSets
13014   hit_set_in_memory_trim(pool.info.hit_set_count);
13015
13016   if (need_delay) {
13017     assert(agent_state->delaying == false);
13018     agent_delay();
13019     unlock();
13020     return false;
13021   }
13022   agent_choose_mode();
13023   unlock();
13024   return true;
13025 }
13026
13027 void PrimaryLogPG::agent_load_hit_sets()
13028 {
13029   if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
13030     return;
13031   }
13032
13033   if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
13034     dout(10) << __func__ << dendl;
13035     for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
13036          p != info.hit_set.history.end(); ++p) {
13037       if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
13038         dout(10) << __func__ << " loading " << p->begin << "-"
13039                  << p->end << dendl;
13040         if (!pool.info.is_replicated()) {
13041           // FIXME: EC not supported here yet
13042           derr << __func__ << " on non-replicated pool" << dendl;
13043           break;
13044         }
13045
13046         hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
13047         if (is_unreadable_object(oid)) {
13048           dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
13049           break;
13050         }
13051
13052         ObjectContextRef obc = get_object_context(oid, false);
13053         if (!obc) {
13054           derr << __func__ << ": could not load hitset " << oid << dendl;
13055           break;
13056         }
13057
13058         bufferlist bl;
13059         {
13060           obc->ondisk_read_lock();
13061           int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
13062           assert(r >= 0);
13063           obc->ondisk_read_unlock();
13064         }
13065         HitSetRef hs(new HitSet);
13066         bufferlist::iterator pbl = bl.begin();
13067         ::decode(*hs, pbl);
13068         agent_state->add_hit_set(p->begin.sec(), hs);
13069       }
13070     }
13071   }
13072 }
13073
13074 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
13075 {
13076   if (!obc->obs.oi.is_dirty()) {
13077     dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
13078     osd->logger->inc(l_osd_agent_skip);
13079     return false;
13080   }
13081   if (obc->obs.oi.is_cache_pinned()) {
13082     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13083     osd->logger->inc(l_osd_agent_skip);
13084     return false;
13085   }
13086
13087   utime_t now = ceph_clock_now();
13088   utime_t ob_local_mtime;
13089   if (obc->obs.oi.local_mtime != utime_t()) {
13090     ob_local_mtime = obc->obs.oi.local_mtime;
13091   } else {
13092     ob_local_mtime = obc->obs.oi.mtime;
13093   }
13094   bool evict_mode_full =
13095     (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
13096   if (!evict_mode_full &&
13097       obc->obs.oi.soid.snap == CEPH_NOSNAP &&  // snaps immutable; don't delay
13098       (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
13099     dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13100     osd->logger->inc(l_osd_agent_skip);
13101     return false;
13102   }
13103
13104   if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
13105     dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
13106     osd->logger->inc(l_osd_agent_skip);
13107     return false;
13108   }
13109
13110   dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
13111
13112   // FIXME: flush anything dirty, regardless of what distribution of
13113   // ages we expect.
13114
13115   hobject_t oid = obc->obs.oi.soid;
13116   osd->agent_start_op(oid);
13117   // no need to capture a pg ref, can't outlive fop or ctx
13118   std::function<void()> on_flush = [this, oid]() {
13119     osd->agent_finish_op(oid);
13120   };
13121
13122   int result = start_flush(
13123     OpRequestRef(), obc, false, NULL,
13124     on_flush);
13125   if (result != -EINPROGRESS) {
13126     on_flush();
13127     dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
13128       << " with " << result << dendl;
13129     osd->logger->inc(l_osd_agent_skip);
13130     return false;
13131   }
13132
13133   osd->logger->inc(l_osd_agent_flush);
13134   return true;
13135 }
13136
13137 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
13138 {
13139   const hobject_t& soid = obc->obs.oi.soid;
13140   if (!after_flush && obc->obs.oi.is_dirty()) {
13141     dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
13142     return false;
13143   }
13144   if (!obc->obs.oi.watchers.empty()) {
13145     dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
13146     return false;
13147   }
13148   if (obc->is_blocked()) {
13149     dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
13150     return false;
13151   }
13152   if (obc->obs.oi.is_cache_pinned()) {
13153     dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
13154     return false;
13155   }
13156
13157   if (soid.snap == CEPH_NOSNAP) {
13158     int result = _verify_no_head_clones(soid, obc->ssc->snapset);
13159     if (result < 0) {
13160       dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
13161       return false;
13162     }
13163   }
13164
13165   if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
13166     // is this object old than cache_min_evict_age?
13167     utime_t now = ceph_clock_now();
13168     utime_t ob_local_mtime;
13169     if (obc->obs.oi.local_mtime != utime_t()) {
13170       ob_local_mtime = obc->obs.oi.local_mtime;
13171     } else {
13172       ob_local_mtime = obc->obs.oi.mtime;
13173     }
13174     if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
13175       dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
13176       osd->logger->inc(l_osd_agent_skip);
13177       return false;
13178     }
13179     // is this object old and/or cold enough?
13180     int temp = 0;
13181     uint64_t temp_upper = 0, temp_lower = 0;
13182     if (hit_set)
13183       agent_estimate_temp(soid, &temp);
13184     agent_state->temp_hist.add(temp);
13185     agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
13186
13187     dout(20) << __func__
13188              << " temp " << temp
13189              << " pos " << temp_lower << "-" << temp_upper
13190              << ", evict_effort " << agent_state->evict_effort
13191              << dendl;
13192     dout(30) << "agent_state:\n";
13193     Formatter *f = Formatter::create("");
13194     f->open_object_section("agent_state");
13195     agent_state->dump(f);
13196     f->close_section();
13197     f->flush(*_dout);
13198     delete f;
13199     *_dout << dendl;
13200
13201     if (1000000 - temp_upper >= agent_state->evict_effort)
13202       return false;
13203   }
13204
13205   dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
13206   OpContextUPtr ctx = simple_opc_create(obc);
13207
13208   if (!ctx->lock_manager.get_lock_type(
13209         ObjectContext::RWState::RWWRITE,
13210         obc->obs.oi.soid,
13211         obc,
13212         OpRequestRef())) {
13213     close_op_ctx(ctx.release());
13214     dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
13215     return false;
13216   }
13217
13218   osd->agent_start_evict_op();
13219   ctx->register_on_finish(
13220     [this]() {
13221       osd->agent_finish_evict_op();
13222     });
13223
13224   ctx->at_version = get_next_version();
13225   assert(ctx->new_obs.exists);
13226   int r = _delete_oid(ctx.get(), true, false);
13227   if (obc->obs.oi.is_omap())
13228     ctx->delta_stats.num_objects_omap--;
13229   ctx->delta_stats.num_evict++;
13230   ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
13231   if (obc->obs.oi.is_dirty())
13232     --ctx->delta_stats.num_objects_dirty;
13233   assert(r == 0);
13234   finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
13235   simple_opc_submit(std::move(ctx));
13236   osd->logger->inc(l_osd_tier_evict);
13237   osd->logger->inc(l_osd_agent_evict);
13238   return true;
13239 }
13240
13241 void PrimaryLogPG::agent_stop()
13242 {
13243   dout(20) << __func__ << dendl;
13244   if (agent_state && !agent_state->is_idle()) {
13245     agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
13246     agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13247     osd->agent_disable_pg(this, agent_state->evict_effort);
13248   }
13249 }
13250
13251 void PrimaryLogPG::agent_delay()
13252 {
13253   dout(20) << __func__ << dendl;
13254   if (agent_state && !agent_state->is_idle()) {
13255     assert(agent_state->delaying == false);
13256     agent_state->delaying = true;
13257     osd->agent_disable_pg(this, agent_state->evict_effort);
13258   }
13259 }
13260
13261 void PrimaryLogPG::agent_choose_mode_restart()
13262 {
13263   dout(20) << __func__ << dendl;
13264   lock();
13265   if (agent_state && agent_state->delaying) {
13266     agent_state->delaying = false;
13267     agent_choose_mode(true);
13268   }
13269   unlock();
13270 }
13271
13272 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
13273 {
13274   bool requeued = false;
13275   // Let delay play out
13276   if (agent_state->delaying) {
13277     dout(20) << __func__ << this << " delaying, ignored" << dendl;
13278     return requeued;
13279   }
13280
13281   TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
13282   TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
13283   unsigned evict_effort = 0;
13284
13285   if (info.stats.stats_invalid) {
13286     // idle; stats can't be trusted until we scrub.
13287     dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
13288     goto skip_calc;
13289   }
13290
13291   {
13292   uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
13293   assert(divisor > 0);
13294
13295   // adjust (effective) user objects down based on the number
13296   // of HitSet objects, which should not count toward our total since
13297   // they cannot be flushed.
13298   uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
13299
13300   // also exclude omap objects if ec backing pool
13301   const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
13302   assert(base_pool);
13303   if (!base_pool->supports_omap())
13304     unflushable += info.stats.stats.sum.num_objects_omap;
13305
13306   uint64_t num_user_objects = info.stats.stats.sum.num_objects;
13307   if (num_user_objects > unflushable)
13308     num_user_objects -= unflushable;
13309   else
13310     num_user_objects = 0;
13311
13312   uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
13313   uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
13314   num_user_bytes -= unflushable_bytes;
13315   uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
13316   num_user_bytes += num_overhead_bytes;
13317
13318   // also reduce the num_dirty by num_objects_omap
13319   int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
13320   if (!base_pool->supports_omap()) {
13321     if (num_dirty > info.stats.stats.sum.num_objects_omap)
13322       num_dirty -= info.stats.stats.sum.num_objects_omap;
13323     else
13324       num_dirty = 0;
13325   }
13326
13327   dout(10) << __func__
13328            << " flush_mode: "
13329            << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13330            << " evict_mode: "
13331            << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13332            << " num_objects: " << info.stats.stats.sum.num_objects
13333            << " num_bytes: " << info.stats.stats.sum.num_bytes
13334            << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
13335            << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
13336            << " num_dirty: " << num_dirty
13337            << " num_user_objects: " << num_user_objects
13338            << " num_user_bytes: " << num_user_bytes
13339            << " num_overhead_bytes: " << num_overhead_bytes
13340            << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
13341            << " pool.info.target_max_objects: " << pool.info.target_max_objects
13342            << dendl;
13343
13344   // get dirty, full ratios
13345   uint64_t dirty_micro = 0;
13346   uint64_t full_micro = 0;
13347   if (pool.info.target_max_bytes && num_user_objects > 0) {
13348     uint64_t avg_size = num_user_bytes / num_user_objects;
13349     dirty_micro =
13350       num_dirty * avg_size * 1000000 /
13351       MAX(pool.info.target_max_bytes / divisor, 1);
13352     full_micro =
13353       num_user_objects * avg_size * 1000000 /
13354       MAX(pool.info.target_max_bytes / divisor, 1);
13355   }
13356   if (pool.info.target_max_objects > 0) {
13357     uint64_t dirty_objects_micro =
13358       num_dirty * 1000000 /
13359       MAX(pool.info.target_max_objects / divisor, 1);
13360     if (dirty_objects_micro > dirty_micro)
13361       dirty_micro = dirty_objects_micro;
13362     uint64_t full_objects_micro =
13363       num_user_objects * 1000000 /
13364       MAX(pool.info.target_max_objects / divisor, 1);
13365     if (full_objects_micro > full_micro)
13366       full_micro = full_objects_micro;
13367   }
13368   dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
13369            << " full " << ((float)full_micro / 1000000.0)
13370            << dendl;
13371
13372   // flush mode
13373   uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
13374   uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
13375   uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
13376   if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
13377     flush_target += flush_slop;
13378     flush_high_target += flush_slop;
13379   } else {
13380     flush_target -= MIN(flush_target, flush_slop);
13381     flush_high_target -= MIN(flush_high_target, flush_slop);
13382   }
13383
13384   if (dirty_micro > flush_high_target) {
13385     flush_mode = TierAgentState::FLUSH_MODE_HIGH;
13386   } else if (dirty_micro > flush_target) {
13387     flush_mode = TierAgentState::FLUSH_MODE_LOW;
13388   }
13389
13390   // evict mode
13391   uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13392   uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13393   if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13394     evict_target += evict_slop;
13395   else
13396     evict_target -= MIN(evict_target, evict_slop);
13397
13398   if (full_micro > 1000000) {
13399     // evict anything clean
13400     evict_mode = TierAgentState::EVICT_MODE_FULL;
13401     evict_effort = 1000000;
13402   } else if (full_micro > evict_target) {
13403     // set effort in [0..1] range based on where we are between
13404     evict_mode = TierAgentState::EVICT_MODE_SOME;
13405     uint64_t over = full_micro - evict_target;
13406     uint64_t span  = 1000000 - evict_target;
13407     evict_effort = MAX(over * 1000000 / span,
13408                        (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13409
13410     // quantize effort to avoid too much reordering in the agent_queue.
13411     uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13412     assert(inc > 0);
13413     uint64_t was = evict_effort;
13414     evict_effort -= evict_effort % inc;
13415     if (evict_effort < inc)
13416       evict_effort = inc;
13417     assert(evict_effort >= inc && evict_effort <= 1000000);
13418     dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13419   }
13420   }
13421
13422   skip_calc:
13423   bool old_idle = agent_state->is_idle();
13424   if (flush_mode != agent_state->flush_mode) {
13425     dout(5) << __func__ << " flush_mode "
13426             << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13427             << " -> "
13428             << TierAgentState::get_flush_mode_name(flush_mode)
13429             << dendl;
13430     if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13431       osd->agent_inc_high_count();
13432       info.stats.stats.sum.num_flush_mode_high = 1;
13433     } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13434       info.stats.stats.sum.num_flush_mode_low = 1;
13435     }
13436     if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13437       osd->agent_dec_high_count();
13438       info.stats.stats.sum.num_flush_mode_high = 0;
13439     } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13440       info.stats.stats.sum.num_flush_mode_low = 0;
13441     }
13442     agent_state->flush_mode = flush_mode;
13443   }
13444   if (evict_mode != agent_state->evict_mode) {
13445     dout(5) << __func__ << " evict_mode "
13446             << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13447             << " -> "
13448             << TierAgentState::get_evict_mode_name(evict_mode)
13449             << dendl;
13450     if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13451         is_active()) {
13452       if (op)
13453         requeue_op(op);
13454       requeue_ops(waiting_for_active);
13455       requeue_ops(waiting_for_scrub);
13456       requeue_ops(waiting_for_cache_not_full);
13457       objects_blocked_on_cache_full.clear();
13458       requeued = true;
13459     }
13460     if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13461       info.stats.stats.sum.num_evict_mode_some = 1;
13462     } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13463       info.stats.stats.sum.num_evict_mode_full = 1;
13464     }
13465     if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13466       info.stats.stats.sum.num_evict_mode_some = 0;
13467     } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13468       info.stats.stats.sum.num_evict_mode_full = 0;
13469     }
13470     agent_state->evict_mode = evict_mode;
13471   }
13472   uint64_t old_effort = agent_state->evict_effort;
13473   if (evict_effort != agent_state->evict_effort) {
13474     dout(5) << __func__ << " evict_effort "
13475             << ((float)agent_state->evict_effort / 1000000.0)
13476             << " -> "
13477             << ((float)evict_effort / 1000000.0)
13478             << dendl;
13479     agent_state->evict_effort = evict_effort;
13480   }
13481
13482   // NOTE: we are using evict_effort as a proxy for *all* agent effort
13483   // (including flush).  This is probably fine (they should be
13484   // correlated) but it is not precisely correct.
13485   if (agent_state->is_idle()) {
13486     if (!restart && !old_idle) {
13487       osd->agent_disable_pg(this, old_effort);
13488     }
13489   } else {
13490     if (restart || old_idle) {
13491       osd->agent_enable_pg(this, agent_state->evict_effort);
13492     } else if (old_effort != agent_state->evict_effort) {
13493       osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13494     }
13495   }
13496   return requeued;
13497 }
13498
13499 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13500 {
13501   assert(hit_set);
13502   assert(temp);
13503   *temp = 0;
13504   if (hit_set->contains(oid))
13505     *temp = 1000000;
13506   unsigned i = 0;
13507   int last_n = pool.info.hit_set_search_last_n;
13508   for (map<time_t,HitSetRef>::reverse_iterator p =
13509        agent_state->hit_set_map.rbegin(); last_n > 0 &&
13510        p != agent_state->hit_set_map.rend(); ++p, ++i) {
13511     if (p->second->contains(oid)) {
13512       *temp += pool.info.get_grade(i);
13513       --last_n;
13514     }
13515   }
13516 }
13517
13518 // Dup op detection
13519
13520 bool PrimaryLogPG::already_complete(eversion_t v)
13521 {
13522   dout(20) << __func__ << ": " << v << dendl;
13523   for (xlist<RepGather*>::iterator i = repop_queue.begin();
13524        !i.end();
13525        ++i) {
13526     dout(20) << __func__ << ": " << **i << dendl;
13527     // skip copy from temp object ops
13528     if ((*i)->v == eversion_t()) {
13529       dout(20) << __func__ << ": " << **i
13530                << " version is empty" << dendl;
13531       continue;
13532     }
13533     if ((*i)->v > v) {
13534       dout(20) << __func__ << ": " << **i
13535                << " (*i)->v past v" << dendl;
13536       break;
13537     }
13538     if (!(*i)->all_committed) {
13539       dout(20) << __func__ << ": " << **i
13540                << " not committed, returning false"
13541                << dendl;
13542       return false;
13543     }
13544   }
13545   dout(20) << __func__ << ": returning true" << dendl;
13546   return true;
13547 }
13548
13549 bool PrimaryLogPG::already_ack(eversion_t v)
13550 {
13551   dout(20) << __func__ << ": " << v << dendl;
13552   for (xlist<RepGather*>::iterator i = repop_queue.begin();
13553        !i.end();
13554        ++i) {
13555     // skip copy from temp object ops
13556     if ((*i)->v == eversion_t()) {
13557       dout(20) << __func__ << ": " << **i
13558                << " version is empty" << dendl;
13559       continue;
13560     }
13561     if ((*i)->v > v) {
13562       dout(20) << __func__ << ": " << **i
13563                << " (*i)->v past v" << dendl;
13564       break;
13565     }
13566     if (!(*i)->all_applied) {
13567       dout(20) << __func__ << ": " << **i
13568                << " not applied, returning false"
13569                << dendl;
13570       return false;
13571     }
13572   }
13573   dout(20) << __func__ << ": returning true" << dendl;
13574   return true;
13575 }
13576
13577
13578 // ==========================================================================================
13579 // SCRUB
13580
13581
13582 bool PrimaryLogPG::_range_available_for_scrub(
13583   const hobject_t &begin, const hobject_t &end)
13584 {
13585   pair<hobject_t, ObjectContextRef> next;
13586   next.second = object_contexts.lookup(begin);
13587   next.first = begin;
13588   bool more = true;
13589   while (more && next.first < end) {
13590     if (next.second && next.second->is_blocked()) {
13591       next.second->requeue_scrub_on_unblock = true;
13592       dout(10) << __func__ << ": scrub delayed, "
13593                << next.first << " is blocked"
13594                << dendl;
13595       return false;
13596     }
13597     more = object_contexts.get_next(next.first, &next);
13598   }
13599   return true;
13600 }
13601
13602 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13603                          const vector<snapid_t>::reverse_iterator &curclone) {
13604     return snapset && curclone != snapset.get().clones.rend();
13605 }
13606
13607 void PrimaryLogPG::log_missing(unsigned missing,
13608                         const boost::optional<hobject_t> &head,
13609                         LogChannelRef clog,
13610                         const spg_t &pgid,
13611                         const char *func,
13612                         const char *mode,
13613                         bool allow_incomplete_clones)
13614 {
13615   assert(head);
13616   if (allow_incomplete_clones) {
13617     dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13618                << " skipped " << missing << " clone(s) in cache tier" << dendl;
13619   } else {
13620     clog->info() << mode << " " << pgid << " " << head.get()
13621                        << " " << missing << " missing clone(s)";
13622   }
13623 }
13624
13625 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13626   const boost::optional<SnapSet> &snapset,
13627   LogChannelRef clog,
13628   const spg_t &pgid,
13629   const char *mode,
13630   bool allow_incomplete_clones,
13631   boost::optional<snapid_t> target,
13632   vector<snapid_t>::reverse_iterator *curclone,
13633   inconsistent_snapset_wrapper &e)
13634 {
13635   assert(head);
13636   assert(snapset);
13637   unsigned missing = 0;
13638
13639   // NOTE: clones are in descending order, thus **curclone > target test here
13640   hobject_t next_clone(head.get());
13641   while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13642     ++missing;
13643     // it is okay to be missing one or more clones in a cache tier.
13644     // skip higher-numbered clones in the list.
13645     if (!allow_incomplete_clones) {
13646       next_clone.snap = **curclone;
13647       clog->error() << mode << " " << pgid << " " << head.get()
13648                          << " expected clone " << next_clone << " " << missing
13649                          << " missing";
13650       ++scrubber.shallow_errors;
13651       e.set_clone_missing(next_clone.snap);
13652     }
13653     // Clones are descending
13654     ++(*curclone);
13655   }
13656   return missing;
13657 }
13658
13659 /*
13660  * Validate consistency of the object info and snap sets.
13661  *
13662  * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13663  * the comparison of the objects is against multiple snapset.clones. There are
13664  * multiple clone lists and in between lists we expect head or snapdir.
13665  *
13666  * Example
13667  *
13668  * objects              expected
13669  * =======              =======
13670  * obj1 snap 1          head/snapdir, unexpected obj1 snap 1
13671  * obj2 head            head/snapdir, head ok
13672  *              [SnapSet clones 6 4 2 1]
13673  * obj2 snap 7          obj2 snap 6, unexpected obj2 snap 7
13674  * obj2 snap 6          obj2 snap 6, match
13675  * obj2 snap 4          obj2 snap 4, match
13676  * obj3 head            obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13677  *              [Snapset clones 3 1]
13678  * obj3 snap 3          obj3 snap 3 match
13679  * obj3 snap 1          obj3 snap 1 match
13680  * obj4 snapdir         head/snapdir, snapdir ok
13681  *              [Snapset clones 4]
13682  * EOL                  obj4 snap 4, (expected)
13683  */
13684 void PrimaryLogPG::scrub_snapshot_metadata(
13685   ScrubMap &scrubmap,
13686   const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13687 {
13688   dout(10) << __func__ << dendl;
13689
13690   coll_t c(info.pgid);
13691   bool repair = state_test(PG_STATE_REPAIR);
13692   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13693   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13694   boost::optional<snapid_t> all_clones;   // Unspecified snapid_t or boost::none
13695
13696   /// snapsets to repair
13697   map<hobject_t,SnapSet> snapset_to_repair;
13698
13699   // traverse in reverse order.
13700   boost::optional<hobject_t> head;
13701   boost::optional<SnapSet> snapset; // If initialized so will head (above)
13702   vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13703   unsigned missing = 0;
13704   inconsistent_snapset_wrapper soid_error, head_error;
13705
13706   bufferlist last_data;
13707
13708   for (map<hobject_t,ScrubMap::object>::reverse_iterator
13709        p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13710     const hobject_t& soid = p->first;
13711     soid_error = inconsistent_snapset_wrapper{soid};
13712     object_stat_sum_t stat;
13713     boost::optional<object_info_t> oi;
13714
13715     if (!soid.is_snapdir())
13716       stat.num_objects++;
13717
13718     if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13719       stat.num_objects_hit_set_archive++;
13720
13721     if (soid.is_snap()) {
13722       // it's a clone
13723       stat.num_object_clones++;
13724     }
13725
13726     // basic checks.
13727     if (p->second.attrs.count(OI_ATTR) == 0) {
13728       oi = boost::none;
13729       osd->clog->error() << mode << " " << info.pgid << " " << soid
13730                         << " no '" << OI_ATTR << "' attr";
13731       ++scrubber.shallow_errors;
13732       soid_error.set_oi_attr_missing();
13733     } else {
13734       bufferlist bv;
13735       bv.push_back(p->second.attrs[OI_ATTR]);
13736       try {
13737         oi = object_info_t(); // Initialize optional<> before decode into it
13738         oi.get().decode(bv);
13739       } catch (buffer::error& e) {
13740         oi = boost::none;
13741         osd->clog->error() << mode << " " << info.pgid << " " << soid
13742                 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13743         ++scrubber.shallow_errors;
13744         soid_error.set_oi_attr_corrupted();
13745         soid_error.set_oi_attr_missing(); // Not available too
13746       }
13747     }
13748
13749     if (oi) {
13750       if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13751         osd->clog->error() << mode << " " << info.pgid << " " << soid
13752                            << " on disk size (" << p->second.size
13753                            << ") does not match object info size ("
13754                            << oi->size << ") adjusted for ondisk to ("
13755                            << pgbackend->be_get_ondisk_size(oi->size)
13756                            << ")";
13757         soid_error.set_size_mismatch();
13758         ++scrubber.shallow_errors;
13759       }
13760
13761       dout(20) << mode << "  " << soid << " " << oi.get() << dendl;
13762
13763       // A clone num_bytes will be added later when we have snapset
13764       if (!soid.is_snap()) {
13765         stat.num_bytes += oi->size;
13766       }
13767       if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13768         stat.num_bytes_hit_set_archive += oi->size;
13769
13770       if (!soid.is_snapdir()) {
13771         if (oi->is_dirty())
13772           ++stat.num_objects_dirty;
13773         if (oi->is_whiteout())
13774           ++stat.num_whiteouts;
13775         if (oi->is_omap())
13776           ++stat.num_objects_omap;
13777         if (oi->is_cache_pinned())
13778           ++stat.num_objects_pinned;
13779       }
13780     } else {
13781       // pessimistic assumption that this object might contain a
13782       // legacy SnapSet
13783       stat.num_legacy_snapsets++;
13784     }
13785
13786     // Check for any problems while processing clones
13787     if (doing_clones(snapset, curclone)) {
13788       boost::optional<snapid_t> target;
13789       // Expecting an object with snap for current head
13790       if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13791
13792         dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13793                  << soid << " while processing " << head.get() << dendl;
13794
13795         target = all_clones;
13796       } else {
13797         assert(soid.is_snap());
13798         target = soid.snap;
13799       }
13800
13801       // Log any clones we were expecting to be there up to target
13802       // This will set missing, but will be a no-op if snap.soid == *curclone.
13803       missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13804                         pool.info.allow_incomplete_clones(), target, &curclone,
13805                         head_error);
13806     }
13807     bool expected;
13808     // Check doing_clones() again in case we ran process_clones_to()
13809     if (doing_clones(snapset, curclone)) {
13810       // A head/snapdir would have processed all clones above
13811       // or all greater than *curclone.
13812       assert(soid.is_snap() && *curclone <= soid.snap);
13813
13814       // After processing above clone snap should match the expected curclone
13815       expected = (*curclone == soid.snap);
13816     } else {
13817       // If we aren't doing clones any longer, then expecting head/snapdir
13818       expected = soid.has_snapset();
13819     }
13820     if (!expected) {
13821       // If we couldn't read the head's snapset, just ignore clones
13822       if (head && !snapset) {
13823         osd->clog->error() << mode << " " << info.pgid << " " << soid
13824                           << " clone ignored due to missing snapset";
13825       } else {
13826         osd->clog->error() << mode << " " << info.pgid << " " << soid
13827                            << " is an unexpected clone";
13828       }
13829       ++scrubber.shallow_errors;
13830       soid_error.set_headless();
13831       scrubber.store->add_snap_error(pool.id, soid_error);
13832       if (head && soid.get_head() == head->get_head())
13833         head_error.set_clone(soid.snap);
13834       continue;
13835     }
13836
13837     // new snapset?
13838     if (soid.has_snapset()) {
13839
13840       if (missing) {
13841         log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13842                     pool.info.allow_incomplete_clones());
13843       }
13844
13845       // Save previous head error information
13846       if (head && head_error.errors)
13847         scrubber.store->add_snap_error(pool.id, head_error);
13848       // Set this as a new head object
13849       head = soid;
13850       missing = 0;
13851       head_error = soid_error;
13852
13853       dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13854
13855       if (p->second.attrs.count(SS_ATTR) == 0) {
13856         osd->clog->error() << mode << " " << info.pgid << " " << soid
13857                           << " no '" << SS_ATTR << "' attr";
13858         ++scrubber.shallow_errors;
13859         snapset = boost::none;
13860         head_error.set_ss_attr_missing();
13861       } else {
13862         bufferlist bl;
13863         bl.push_back(p->second.attrs[SS_ATTR]);
13864         bufferlist::iterator blp = bl.begin();
13865         try {
13866           snapset = SnapSet(); // Initialize optional<> before decoding into it
13867           ::decode(snapset.get(), blp);
13868         } catch (buffer::error& e) {
13869           snapset = boost::none;
13870           osd->clog->error() << mode << " " << info.pgid << " " << soid
13871                 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13872           ++scrubber.shallow_errors;
13873           head_error.set_ss_attr_corrupted();
13874         }
13875       }
13876
13877       if (snapset) {
13878         // what will be next?
13879         curclone = snapset->clones.rbegin();
13880
13881         if (!snapset->clones.empty()) {
13882           dout(20) << "  snapset " << snapset.get() << dendl;
13883           if (snapset->seq == 0) {
13884             osd->clog->error() << mode << " " << info.pgid << " " << soid
13885                                << " snaps.seq not set";
13886             ++scrubber.shallow_errors;
13887             head_error.set_snapset_mismatch();
13888           }
13889         }
13890
13891         if (soid.is_head() && !snapset->head_exists) {
13892           osd->clog->error() << mode << " " << info.pgid << " " << soid
13893                           << " snapset.head_exists=false, but head exists";
13894           ++scrubber.shallow_errors;
13895           head_error.set_head_mismatch();
13896         }
13897         if (soid.is_snapdir() && snapset->head_exists) {
13898           osd->clog->error() << mode << " " << info.pgid << " " << soid
13899                           << " snapset.head_exists=true, but snapdir exists";
13900           ++scrubber.shallow_errors;
13901           head_error.set_head_mismatch();
13902         }
13903
13904         if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13905           if (soid.is_snapdir()) {
13906             dout(10) << " will move snapset to head from " << soid << dendl;
13907             snapset_to_repair[soid.get_head()] = *snapset;
13908           } else if (snapset->is_legacy()) {
13909             dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13910                      << dendl;
13911             snapset_to_repair[soid.get_head()] = *snapset;
13912           }
13913         } else {
13914           stat.num_legacy_snapsets++;
13915         }
13916       } else {
13917         // pessimistic assumption that this object might contain a
13918         // legacy SnapSet
13919         stat.num_legacy_snapsets++;
13920       }
13921     } else {
13922       assert(soid.is_snap());
13923       assert(head);
13924       assert(snapset);
13925       assert(soid.snap == *curclone);
13926
13927       dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13928
13929       if (snapset->clone_size.count(soid.snap) == 0) {
13930         osd->clog->error() << mode << " " << info.pgid << " " << soid
13931                            << " is missing in clone_size";
13932         ++scrubber.shallow_errors;
13933         soid_error.set_size_mismatch();
13934       } else {
13935         if (oi && oi->size != snapset->clone_size[soid.snap]) {
13936           osd->clog->error() << mode << " " << info.pgid << " " << soid
13937                              << " size " << oi->size << " != clone_size "
13938                              << snapset->clone_size[*curclone];
13939           ++scrubber.shallow_errors;
13940           soid_error.set_size_mismatch();
13941         }
13942
13943         if (snapset->clone_overlap.count(soid.snap) == 0) {
13944           osd->clog->error() << mode << " " << info.pgid << " " << soid
13945                              << " is missing in clone_overlap";
13946           ++scrubber.shallow_errors;
13947           soid_error.set_size_mismatch();
13948         } else {
13949           // This checking is based on get_clone_bytes().  The first 2 asserts
13950           // can't happen because we know we have a clone_size and
13951           // a clone_overlap.  Now we check that the interval_set won't
13952           // cause the last assert.
13953           uint64_t size = snapset->clone_size.find(soid.snap)->second;
13954           const interval_set<uint64_t> &overlap =
13955                 snapset->clone_overlap.find(soid.snap)->second;
13956           bool bad_interval_set = false;
13957           for (interval_set<uint64_t>::const_iterator i = overlap.begin();
13958                i != overlap.end(); ++i) {
13959             if (size < i.get_len()) {
13960               bad_interval_set = true;
13961               break;
13962             }
13963             size -= i.get_len();
13964           }
13965
13966           if (bad_interval_set) {
13967             osd->clog->error() << mode << " " << info.pgid << " " << soid
13968                                << " bad interval_set in clone_overlap";
13969             ++scrubber.shallow_errors;
13970             soid_error.set_size_mismatch();
13971           } else {
13972             stat.num_bytes += snapset->get_clone_bytes(soid.snap);
13973           }
13974         }
13975       }
13976
13977       // migrate legacy_snaps to snapset?
13978       auto p = snapset_to_repair.find(soid.get_head());
13979       if (p != snapset_to_repair.end()) {
13980         if (!oi || oi->legacy_snaps.empty()) {
13981           osd->clog->error() << mode << " " << info.pgid << " " << soid
13982                              << " has no oi or legacy_snaps; cannot convert "
13983                              << *snapset;
13984           ++scrubber.shallow_errors;
13985         } else {
13986           dout(20) << __func__ << "   copying legacy_snaps " << oi->legacy_snaps
13987                    << " to snapset " << p->second << dendl;
13988           p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
13989         }
13990       }
13991
13992       // what's next?
13993       ++curclone;
13994       if (soid_error.errors)
13995         scrubber.store->add_snap_error(pool.id, soid_error);
13996     }
13997
13998     scrub_cstat.add(stat);
13999   }
14000
14001   if (doing_clones(snapset, curclone)) {
14002     dout(10) << __func__ << " " << mode << " " << info.pgid
14003              << " No more objects while processing " << head.get() << dendl;
14004
14005     missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
14006                       pool.info.allow_incomplete_clones(), all_clones, &curclone,
14007                       head_error);
14008   }
14009   // There could be missing found by the test above or even
14010   // before dropping out of the loop for the last head.
14011   if (missing) {
14012     log_missing(missing, head, osd->clog, info.pgid, __func__,
14013                 mode, pool.info.allow_incomplete_clones());
14014   }
14015   if (head && head_error.errors)
14016     scrubber.store->add_snap_error(pool.id, head_error);
14017
14018   for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
14019          missing_digest.begin();
14020        p != missing_digest.end();
14021        ++p) {
14022     if (p->first.is_snapdir())
14023       continue;
14024     dout(10) << __func__ << " recording digests for " << p->first << dendl;
14025     ObjectContextRef obc = get_object_context(p->first, false);
14026     if (!obc) {
14027       osd->clog->error() << info.pgid << " " << mode
14028                          << " cannot get object context for object "
14029                          << p->first;
14030       continue;
14031     } else if (obc->obs.oi.soid != p->first) {
14032       osd->clog->error() << info.pgid << " " << mode
14033                          << " object " << p->first
14034                          << " has a valid oi attr with a mismatched name, "
14035                          << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14036       continue;
14037     }
14038     OpContextUPtr ctx = simple_opc_create(obc);
14039     ctx->at_version = get_next_version();
14040     ctx->mtime = utime_t();      // do not update mtime
14041     ctx->new_obs.oi.set_data_digest(p->second.first);
14042     ctx->new_obs.oi.set_omap_digest(p->second.second);
14043     finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14044
14045     ctx->register_on_success(
14046       [this]() {
14047         dout(20) << "updating scrub digest" << dendl;
14048         if (--scrubber.num_digest_updates_pending == 0) {
14049           requeue_scrub();
14050         }
14051       });
14052
14053     simple_opc_submit(std::move(ctx));
14054     ++scrubber.num_digest_updates_pending;
14055   }
14056   for (auto& p : snapset_to_repair) {
14057     // cache pools may not have the clones, which means we won't know
14058     // what snaps they have.  fake out the clone_snaps entries anyway (with
14059     // blank snap lists).
14060     p.second.head_exists = true;
14061     if (pool.info.allow_incomplete_clones()) {
14062       for (auto s : p.second.clones) {
14063         if (p.second.clone_snaps.count(s) == 0) {
14064           dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
14065                    << s << dendl;
14066           p.second.clone_snaps[s];
14067         }
14068       }
14069     }
14070     if (p.second.clones.size() != p.second.clone_snaps.size() ||
14071         p.second.is_legacy()) {
14072       // this happens if we encounter other errors above, like a missing
14073       // or extra clone.
14074       dout(10) << __func__ << " not writing snapset to " << p.first
14075                << " snapset " << p.second << " clones " << p.second.clones
14076                << "; didn't convert fully" << dendl;
14077       scrub_cstat.sum.num_legacy_snapsets++;
14078       continue;
14079     }
14080     dout(10) << __func__ << " writing snapset to " << p.first
14081              << " " << p.second << dendl;
14082     ObjectContextRef obc = get_object_context(p.first, true);
14083     if (!obc) {
14084       osd->clog->error() << info.pgid << " " << mode
14085                          << " cannot get object context for object "
14086                          << p.first;
14087       continue;
14088     } else if (obc->obs.oi.soid != p.first) {
14089       osd->clog->error() << info.pgid << " " << mode
14090                          << " object " << p.first
14091                          << " has a valid oi attr with a mismatched name, "
14092                          << " obc->obs.oi.soid: " << obc->obs.oi.soid;
14093       continue;
14094     }
14095     ObjectContextRef snapset_obc;
14096     if (!obc->obs.exists) {
14097       snapset_obc = get_object_context(p.first.get_snapdir(), false);
14098       if (!snapset_obc) {
14099         osd->clog->error() << info.pgid << " " << mode
14100                            << " cannot get object context for "
14101                            << p.first.get_snapdir();
14102         continue;
14103       }
14104     }
14105     OpContextUPtr ctx = simple_opc_create(obc);
14106     PGTransaction *t = ctx->op_t.get();
14107     ctx->snapset_obc = snapset_obc;
14108     ctx->at_version = get_next_version();
14109     ctx->mtime = utime_t();      // do not update mtime
14110     ctx->new_snapset = p.second;
14111     if (!ctx->new_obs.exists) {
14112       dout(20) << __func__ << "   making " << p.first << " a whiteout" << dendl;
14113       ctx->new_obs.exists = true;
14114       ctx->new_snapset.head_exists = true;
14115       ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
14116       ++ctx->delta_stats.num_whiteouts;
14117       ++ctx->delta_stats.num_objects;
14118       t->create(p.first);
14119       if (p.first < scrubber.start) {
14120         dout(20) << __func__ << " kludging around update outside of scrub range"
14121                  << dendl;
14122       } else {
14123         scrub_cstat.add(ctx->delta_stats);
14124       }
14125     }
14126     dout(20) << __func__ << "   final snapset " << ctx->new_snapset << dendl;
14127     assert(!ctx->new_snapset.is_legacy());
14128     finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
14129     ctx->register_on_success(
14130       [this]() {
14131         dout(20) << "updating snapset" << dendl;
14132         if (--scrubber.num_digest_updates_pending == 0) {
14133           requeue_scrub();
14134         }
14135       });
14136
14137     simple_opc_submit(std::move(ctx));
14138     ++scrubber.num_digest_updates_pending;
14139   }
14140
14141   dout(10) << __func__ << " (" << mode << ") finish" << dendl;
14142 }
14143
14144 void PrimaryLogPG::_scrub_clear_state()
14145 {
14146   scrub_cstat = object_stat_collection_t();
14147 }
14148
14149 void PrimaryLogPG::_scrub_finish()
14150 {
14151   bool repair = state_test(PG_STATE_REPAIR);
14152   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
14153   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
14154
14155   if (info.stats.stats_invalid) {
14156     info.stats.stats = scrub_cstat;
14157     info.stats.stats_invalid = false;
14158
14159     if (agent_state)
14160       agent_choose_mode();
14161   }
14162
14163   dout(10) << mode << " got "
14164            << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14165            << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14166            << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14167            << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14168            << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14169            << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14170            << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14171            << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
14172            << dendl;
14173
14174   if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
14175       scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
14176       (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
14177        !info.stats.dirty_stats_invalid) ||
14178       (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
14179        !info.stats.omap_stats_invalid) ||
14180       (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
14181        !info.stats.pin_stats_invalid) ||
14182       (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
14183        !info.stats.hitset_stats_invalid) ||
14184       (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
14185        !info.stats.hitset_bytes_stats_invalid) ||
14186       scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
14187       scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
14188     osd->clog->error() << info.pgid << " " << mode
14189                       << " stat mismatch, got "
14190                       << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
14191                       << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
14192                       << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
14193                       << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
14194                       << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
14195                       << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
14196                       << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
14197                       << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
14198                       << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
14199     ++scrubber.shallow_errors;
14200
14201     if (repair) {
14202       ++scrubber.fixed;
14203       info.stats.stats = scrub_cstat;
14204       info.stats.dirty_stats_invalid = false;
14205       info.stats.omap_stats_invalid = false;
14206       info.stats.hitset_stats_invalid = false;
14207       info.stats.hitset_bytes_stats_invalid = false;
14208       publish_stats_to_osd();
14209       share_pg_info();
14210     }
14211   } else if (scrub_cstat.sum.num_legacy_snapsets !=
14212              info.stats.stats.sum.num_legacy_snapsets) {
14213     osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
14214                       << " from " << info.stats.stats.sum.num_legacy_snapsets
14215                       << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
14216     info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
14217     publish_stats_to_osd();
14218     share_pg_info();
14219   }
14220   // Clear object context cache to get repair information
14221   if (repair)
14222     object_contexts.clear();
14223 }
14224
14225 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
14226 {
14227     return osd->check_osdmap_full(missing_on);
14228 }
14229
14230 int PrimaryLogPG::rep_repair_primary_object(const hobject_t& soid, OpRequestRef op)
14231 {
14232   // Only supports replicated pools
14233   assert(!pool.info.require_rollback());
14234   assert(is_primary());
14235
14236   dout(10) << __func__ << " " << soid
14237            << " peers osd.{" << actingbackfill << "}" << dendl;
14238
14239   if (!is_clean()) {
14240     block_for_clean(soid, op);
14241     return -EAGAIN;
14242   }
14243
14244   assert(!pg_log.get_missing().is_missing(soid));
14245   bufferlist bv;
14246   object_info_t oi;
14247   eversion_t v;
14248   int r = get_pgbackend()->objects_get_attr(soid, OI_ATTR, &bv);
14249   if (r < 0) {
14250     // Leave v and try to repair without a version, getting attr failed
14251     dout(0) << __func__ << ": Need version of replica, objects_get_attr failed: "
14252             << soid << " error=" << r << dendl;
14253   } else try {
14254     bufferlist::iterator bliter = bv.begin();
14255     ::decode(oi, bliter);
14256     v = oi.version;
14257   } catch (...) {
14258     // Leave v as default constructed. This will fail when sent to older OSDs, but
14259     // not much worse than failing here.
14260     dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
14261   }
14262
14263   missing_loc.add_missing(soid, v, eversion_t());
14264   if (primary_error(soid, v)) {
14265     dout(0) << __func__ << " No other replicas available for " << soid << dendl;
14266     // XXX: If we knew that there is no down osd which could include this
14267     // object, it would be nice if we could return EIO here.
14268     // If a "never fail" flag was available, that could be used
14269     // for rbd to NOT return EIO until object marked lost.
14270
14271     // Drop through to save this op in case an osd comes up with the object.
14272   }
14273
14274   // Restart the op after object becomes readable again
14275   waiting_for_unreadable_object[soid].push_back(op);
14276   op->mark_delayed("waiting for missing object");
14277
14278   if (!eio_errors_to_process) {
14279     eio_errors_to_process = true;
14280     assert(is_clean());
14281     queue_peering_event(
14282         CephPeeringEvtRef(
14283           std::make_shared<CephPeeringEvt>(
14284           get_osdmap()->get_epoch(),
14285           get_osdmap()->get_epoch(),
14286           DoRecovery())));
14287   } else {
14288     // A prior error must have already cleared clean state and queued recovery
14289     // or a map change has triggered re-peering.
14290     // Not inlining the recovery by calling maybe_kick_recovery(soid);
14291     dout(5) << __func__<< ": Read error on " << soid << ", but already seen errors" << dendl;
14292   }
14293
14294   return -EAGAIN;
14295 }
14296
/*---SnapTrimmer Logging---*/
// From here on, debug output is prefixed with pg->gen_prefix(); code using
// this macro must have a `pg` in scope.
#undef dout_prefix
#define dout_prefix (*_dout << pg->gen_prefix())
14300
14301 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
14302 {
14303   ldout(pg->cct, 20) << "enter " << state_name << dendl;
14304 }
14305
14306 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
14307 {
14308   ldout(pg->cct, 20) << "exit " << state_name << dendl;
14309 }
14310
/*---SnapTrimmer states---*/
// Inside the state classes the PG is reached through the enclosing
// SnapTrimmer machine; the prefix also names the current state.
#undef dout_prefix
#define dout_prefix                                              \
  (*_dout << context< SnapTrimmer >().pg->gen_prefix()           \
          << "SnapTrimmer state<" << get_state_name() << ">: ")
14315
14316 /* NotTrimming */
14317 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
14318   : my_base(ctx),
14319     NamedState(context< SnapTrimmer >().pg, "NotTrimming")
14320 {
14321   context< SnapTrimmer >().log_enter(state_name);
14322 }
14323
14324 void PrimaryLogPG::NotTrimming::exit()
14325 {
14326   context< SnapTrimmer >().log_exit(state_name, enter_time);
14327 }
14328
14329 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
14330 {
14331   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14332   ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
14333
14334   if (!(pg->is_primary() && pg->is_active())) {
14335     ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
14336     return discard_event();
14337   }
14338   if (!pg->is_clean() ||
14339       pg->snap_trimq.empty()) {
14340     ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
14341     return discard_event();
14342   }
14343   if (pg->scrubber.active) {
14344     ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
14345     return transit< WaitScrub >();
14346   } else {
14347     return transit< Trimming >();
14348   }
14349 }
14350
14351 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
14352 {
14353   PrimaryLogPG *pg = context< SnapTrimmer >().pg;
14354   ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
14355
14356   pending = nullptr;
14357   if (!context< SnapTrimmer >().can_trim()) {
14358     post_event(KickTrim());
14359     return transit< NotTrimming >();
14360   }
14361
14362   context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
14363   ldout(pg->cct, 10) << "NotTrimming: trimming "
14364                      << pg->snap_trimq.range_start()
14365                      << dendl;
14366   return transit< AwaitAsyncWork >();
14367 }
14368
14369 /* AwaitAsyncWork */
14370 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
14371   : my_base(ctx),
14372     NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
14373 {
14374   auto *pg = context< SnapTrimmer >().pg;
14375   context< SnapTrimmer >().log_enter(state_name);
14376   context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
14377   pg->state_set(PG_STATE_SNAPTRIM);
14378   pg->state_clear(PG_STATE_SNAPTRIM_ERROR);
14379   pg->publish_stats_to_osd();
14380 }
14381
14382 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
14383 {
14384   PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
14385   snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
14386   auto &in_flight = context<Trimming>().in_flight;
14387   assert(in_flight.empty());
14388
14389   assert(pg->is_primary() && pg->is_active());
14390   if (!context< SnapTrimmer >().can_trim()) {
14391     ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
14392     post_event(KickTrim());
14393     return transit< NotTrimming >();
14394   }
14395
14396   ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
14397
14398   vector<hobject_t> to_trim;
14399   unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
14400   to_trim.reserve(max);
14401   int r = pg->snap_mapper.get_next_objects_to_trim(
14402     snap_to_trim,
14403     max,
14404     &to_trim);
14405   if (r != 0 && r != -ENOENT) {
14406     lderr(pg->cct) << "get_next_objects_to_trim returned "
14407                    << cpp_strerror(r) << dendl;
14408     assert(0 == "get_next_objects_to_trim returned an invalid code");
14409   } else if (r == -ENOENT) {
14410     // Done!
14411     ldout(pg->cct, 10) << "got ENOENT" << dendl;
14412
14413     ldout(pg->cct, 10) << "adding snap " << snap_to_trim
14414                        << " to purged_snaps"
14415                        << dendl;
14416     pg->info.purged_snaps.insert(snap_to_trim);
14417     pg->snap_trimq.erase(snap_to_trim);
14418     ldout(pg->cct, 10) << "purged_snaps now "
14419                        << pg->info.purged_snaps << ", snap_trimq now "
14420                        << pg->snap_trimq << dendl;
14421
14422     ObjectStore::Transaction t;
14423     pg->dirty_big_info = true;
14424     pg->write_if_dirty(t);
14425     int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
14426     assert(tr == 0);
14427
14428     pg->share_pg_info();
14429     post_event(KickTrim());
14430     return transit< NotTrimming >();
14431   }
14432   assert(!to_trim.empty());
14433
14434   for (auto &&object: to_trim) {
14435     // Get next
14436     ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
14437     OpContextUPtr ctx;
14438     int error = pg->trim_object(in_flight.empty(), object, &ctx);
14439     if (error) {
14440       if (error == -ENOLCK) {
14441         ldout(pg->cct, 10) << "could not get write lock on obj "
14442                            << object << dendl;
14443       } else {
14444         pg->state_set(PG_STATE_SNAPTRIM_ERROR);
14445         ldout(pg->cct, 10) << "Snaptrim error=" << error << dendl;
14446       }
14447       if (!in_flight.empty()) {
14448         ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
14449         return transit< WaitRepops >();
14450       }
14451       if (error == -ENOLCK) {
14452         ldout(pg->cct, 10) << "waiting for it to clear"
14453                            << dendl;
14454         return transit< WaitRWLock >();
14455       } else {
14456         return transit< NotTrimming >();
14457       }
14458     }
14459
14460     in_flight.insert(object);
14461     ctx->register_on_success(
14462       [pg, object, &in_flight]() {
14463         assert(in_flight.find(object) != in_flight.end());
14464         in_flight.erase(object);
14465         if (in_flight.empty()) {
14466           if (pg->state_test(PG_STATE_SNAPTRIM_ERROR)) {
14467             pg->snap_trimmer_machine.process_event(Reset());
14468           } else {
14469             pg->snap_trimmer_machine.process_event(RepopsComplete());
14470           }
14471         }
14472       });
14473
14474     pg->simple_opc_submit(std::move(ctx));
14475   }
14476
14477   return transit< WaitRepops >();
14478 }
14479
14480 void PrimaryLogPG::setattr_maybe_cache(
14481   ObjectContextRef obc,
14482   OpContext *op,
14483   PGTransaction *t,
14484   const string &key,
14485   bufferlist &val)
14486 {
14487   t->setattr(obc->obs.oi.soid, key, val);
14488 }
14489
14490 void PrimaryLogPG::setattrs_maybe_cache(
14491   ObjectContextRef obc,
14492   OpContext *op,
14493   PGTransaction *t,
14494   map<string, bufferlist> &attrs)
14495 {
14496   t->setattrs(obc->obs.oi.soid, attrs);
14497 }
14498
14499 void PrimaryLogPG::rmattr_maybe_cache(
14500   ObjectContextRef obc,
14501   OpContext *op,
14502   PGTransaction *t,
14503   const string &key)
14504 {
14505   t->rmattr(obc->obs.oi.soid, key);
14506 }
14507
14508 int PrimaryLogPG::getattr_maybe_cache(
14509   ObjectContextRef obc,
14510   const string &key,
14511   bufferlist *val)
14512 {
14513   if (pool.info.require_rollback()) {
14514     map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14515     if (i != obc->attr_cache.end()) {
14516       if (val)
14517         *val = i->second;
14518       return 0;
14519     } else {
14520       return -ENODATA;
14521     }
14522   }
14523   return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14524 }
14525
14526 int PrimaryLogPG::getattrs_maybe_cache(
14527   ObjectContextRef obc,
14528   map<string, bufferlist> *out,
14529   bool user_only)
14530 {
14531   int r = 0;
14532   if (pool.info.require_rollback()) {
14533     if (out)
14534       *out = obc->attr_cache;
14535   } else {
14536     r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14537   }
14538   if (out && user_only) {
14539     map<string, bufferlist> tmp;
14540     for (map<string, bufferlist>::iterator i = out->begin();
14541          i != out->end();
14542          ++i) {
14543       if (i->first.size() > 1 && i->first[0] == '_')
14544         tmp[i->first.substr(1, i->first.size())].claim(i->second);
14545     }
14546     tmp.swap(*out);
14547   }
14548   return r;
14549 }
14550
14551 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14552     return osd->check_failsafe_full(ss);
14553 }
14554
14555 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14556 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14557
#ifdef PG_DEBUG_REFS
/// Free-function wrapper: returns the id handed out by pg->get_with_id().
uint64_t get_with_id(PrimaryLogPG *pg)
{
  return pg->get_with_id();
}

/// Free-function wrapper: passes @p id to pg->put_with_id().
void put_with_id(PrimaryLogPG *pg, uint64_t id)
{
  pg->put_with_id(id);
}
#endif
14562
14563 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14564 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }