// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
 *
 * Author: Loic Dachary <loic@dachary.org>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include "boost/tuple/tuple.hpp"
#include "boost/intrusive_ptr.hpp"
#include "PG.h"
#include "PrimaryLogPG.h"
#include "OSD.h"
#include "OpRequest.h"
#include "ScrubStore.h"
#include "Session.h"
#include "objclass/objclass.h"

#include "common/errno.h"
#include "common/scrub_types.h"
#include "common/perf_counters.h"

#include "messages/MOSDOp.h"
#include "messages/MOSDBackoff.h"
#include "messages/MOSDSubOp.h"
#include "messages/MOSDSubOpReply.h"
#include "messages/MOSDPGTrim.h"
#include "messages/MOSDPGScan.h"
#include "messages/MOSDRepScrub.h"
#include "messages/MOSDPGBackfill.h"
#include "messages/MOSDPGBackfillRemove.h"
#include "messages/MOSDPGUpdateLogMissing.h"
#include "messages/MOSDPGUpdateLogMissingReply.h"
#include "messages/MCommandReply.h"
#include "messages/MOSDScrubReserve.h"
#include "mds/inode_backtrace.h" // Ugh
#include "common/EventTrace.h"

#include "common/config.h"
#include "include/compat.h"
#include "mon/MonClient.h"
#include "osdc/Objecter.h"
#include "json_spirit/json_spirit_value.h"
#include "json_spirit/json_spirit_reader.h"
#include "include/assert.h"  // json_spirit clobbers it
#include "include/rados/rados_types.hpp"

#ifdef WITH_LTTNG
#include "tracing/osd.h"
#else
#define tracepoint(...)
#endif

#define dout_context cct
#define dout_subsys ceph_subsys_osd
#define DOUT_PREFIX_ARGS this, osd->whoami, get_osdmap()
#undef dout_prefix
#define dout_prefix _prefix(_dout, this)
template <typename T>
static ostream& _prefix(std::ostream *_dout, T *pg) {
  return *_dout << pg->gen_prefix();
}


#include <sstream>
#include <utility>

#include <errno.h>

MEMPOOL_DEFINE_OBJECT_FACTORY(PrimaryLogPG, replicatedpg, osd);

PGLSFilter::PGLSFilter() : cct(nullptr)
{
}

PGLSFilter::~PGLSFilter()
{
}

struct PrimaryLogPG::C_OSD_OnApplied : Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t v;
  C_OSD_OnApplied(
    PrimaryLogPGRef pg,
    epoch_t epoch,
    eversion_t v)
    : pg(pg), epoch(epoch), v(v) {}
  void finish(int) override {
    pg->lock();
    if (!pg->pg_has_reset_since(epoch))
      pg->op_applied(v);
    pg->unlock();
  }
};

/**
 * The CopyCallback class defines an interface for completions to the
 * copy_start code. Users of the copy infrastructure must implement
 * one and give an instance of the class to start_copy.
 *
 * The implementer is responsible for making sure that the CopyCallback
 * can associate itself with the correct copy operation.
 */
class PrimaryLogPG::CopyCallback : public GenContext<CopyCallbackResults> {
protected:
  CopyCallback() {}
  /**
   * results.get<0>() is the return code: 0 for success; -ECANCELED if
   * the operation was cancelled by the local OSD; -errno for other issues.
   * results.get<1>() is a pointer to a CopyResults object, which you are
   * responsible for deleting.
   */
  void finish(CopyCallbackResults results_) override = 0;

public:
  /// Provide the final size of the copied object to the CopyCallback
  ~CopyCallback() override {}
};

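/*
 * "Blessed" completion wrappers: each takes the PG lock before running
 * the wrapped callback and silently drops it if the PG has been reset
 * since the epoch at which the callback was created, so completions
 * from a prior interval never run against the new one.
 */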
template <typename T>
class PrimaryLogPG::BlessedGenContext : public GenContext<T> {
  PrimaryLogPGRef pg;
  unique_ptr<GenContext<T>> c;
  epoch_t e;
public:
  BlessedGenContext(PrimaryLogPG *pg, GenContext<T> *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(T t) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(t);
    pg->unlock();
  }
};

GenContext<ThreadPool::TPHandle&> *PrimaryLogPG::bless_gencontext(
  GenContext<ThreadPool::TPHandle&> *c) {
  return new BlessedGenContext<ThreadPool::TPHandle&>(
    this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::BlessedContext : public Context {
  PrimaryLogPGRef pg;
  unique_ptr<Context> c;
  epoch_t e;
public:
  BlessedContext(PrimaryLogPG *pg, Context *c, epoch_t e)
    : pg(pg), c(c), e(e) {}
  void finish(int r) override {
    pg->lock();
    if (pg->pg_has_reset_since(e))
      c.reset();
    else
      c.release()->complete(r);
    pg->unlock();
  }
};


Context *PrimaryLogPG::bless_context(Context *c) {
  return new BlessedContext(this, c, get_osdmap()->get_epoch());
}

class PrimaryLogPG::C_PG_ObjectContext : public Context {
  PrimaryLogPGRef pg;
  ObjectContext *obc;
public:
  C_PG_ObjectContext(PrimaryLogPG *p, ObjectContext *o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->object_context_destructor_callback(obc);
  }
};

class PrimaryLogPG::C_OSD_OndiskWriteUnlock : public Context {
  ObjectContextRef obc, obc2, obc3;
public:
  C_OSD_OndiskWriteUnlock(
    ObjectContextRef o,
    ObjectContextRef o2 = ObjectContextRef(),
    ObjectContextRef o3 = ObjectContextRef()) : obc(o), obc2(o2), obc3(o3) {}
  void finish(int r) override {
    obc->ondisk_write_unlock();
    if (obc2)
      obc2->ondisk_write_unlock();
    if (obc3)
      obc3->ondisk_write_unlock();
  }
};

struct OnReadComplete : public Context {
  PrimaryLogPG *pg;
  PrimaryLogPG::OpContext *opcontext;
  OnReadComplete(
    PrimaryLogPG *pg,
    PrimaryLogPG::OpContext *ctx) : pg(pg), opcontext(ctx) {}
  void finish(int r) override {
    if (r < 0)
      opcontext->async_read_result = r;
    opcontext->finish_read(pg);
  }
  ~OnReadComplete() override {}
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObject : public Context {
  PrimaryLogPGRef pg;
  ObjectContextRef obc;
public:
  C_OSD_AppliedRecoveredObject(PrimaryLogPG *p, ObjectContextRef o) :
    pg(p), obc(o) {}
  void finish(int r) override {
    pg->_applied_recovered_object(obc);
  }
};

class PrimaryLogPG::C_OSD_CommittedPushedObject : public Context {
  PrimaryLogPGRef pg;
  epoch_t epoch;
  eversion_t last_complete;
public:
  C_OSD_CommittedPushedObject(
    PrimaryLogPG *p, epoch_t epoch, eversion_t lc) :
    pg(p), epoch(epoch), last_complete(lc) {
  }
  void finish(int r) override {
    pg->_committed_pushed_object(epoch, last_complete);
  }
};

class PrimaryLogPG::C_OSD_AppliedRecoveredObjectReplica : public Context {
  PrimaryLogPGRef pg;
public:
  explicit C_OSD_AppliedRecoveredObjectReplica(PrimaryLogPG *p) :
    pg(p) {}
  void finish(int r) override {
    pg->_applied_recovered_object_replica();
  }
};

// OpContext
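// Queue the reads accumulated in pending_async_reads on the backend;
// OnReadComplete (above) funnels each result back into finish_read(),
// which completes the ctx once all in-flight reads have returned.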
void PrimaryLogPG::OpContext::start_async_reads(PrimaryLogPG *pg)
{
  inflightreads = 1;
  list<pair<boost::tuple<uint64_t, uint64_t, unsigned>,
            pair<bufferlist*, Context*> > > in;
  in.swap(pending_async_reads);
  pg->pgbackend->objects_read_async(
    obc->obs.oi.soid,
    in,
    new OnReadComplete(pg, this), pg->get_pool().fast_read);
}
void PrimaryLogPG::OpContext::finish_read(PrimaryLogPG *pg)
{
  assert(inflightreads > 0);
  --inflightreads;
  if (async_reads_complete()) {
    assert(pg->in_progress_async_reads.size());
    assert(pg->in_progress_async_reads.front().second == this);
    pg->in_progress_async_reads.pop_front();
    pg->complete_read_ctx(async_read_result, this);
  }
}

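// Completion for copy-from: on success, resume executing the client's
// op context; on failure (other than a cancel, where the client will
// resend) reply with the error and close the op context.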
class CopyFromCallback: public PrimaryLogPG::CopyCallback {
public:
  PrimaryLogPG::CopyResults *results;
  int retval;
  PrimaryLogPG::OpContext *ctx;
  explicit CopyFromCallback(PrimaryLogPG::OpContext *ctx_)
    : results(NULL),
      retval(0),
      ctx(ctx_) {}
  ~CopyFromCallback() override {}

  void finish(PrimaryLogPG::CopyCallbackResults results_) override {
    results = results_.get<1>();
    int r = results_.get<0>();
    retval = r;

    // for finish_copyfrom
    ctx->user_at_version = results->user_version;

    if (r >= 0) {
      ctx->pg->execute_ctx(ctx);
    }
    ctx->copy_cb = NULL;
    if (r < 0) {
      if (r != -ECANCELED) { // on cancel just toss it out; client resends
        if (ctx->op)
          ctx->pg->osd->reply_op_error(ctx->op, r);
      } else if (results->should_requeue) {
        if (ctx->op)
          ctx->pg->requeue_op(ctx->op);
      }
      ctx->pg->close_op_ctx(ctx);
    }
  }

  bool is_temp_obj_used() {
    return results->started_temp_obj;
  }
  uint64_t get_data_size() {
    return results->object_size;
  }
  int get_result() {
    return retval;
  }
};

// ======================
// PGBackend::Listener

void PrimaryLogPG::on_local_recover(
  const hobject_t &hoid,
  const ObjectRecoveryInfo &_recovery_info,
  ObjectContextRef obc,
  ObjectStore::Transaction *t
  )
{
  dout(10) << __func__ << ": " << hoid << dendl;

  ObjectRecoveryInfo recovery_info(_recovery_info);
  clear_object_snap_mapping(t, hoid);
  if (recovery_info.soid.is_snap()) {
    OSDriver::OSTransaction _t(osdriver.get_transaction(t));
    set<snapid_t> snaps;
    dout(20) << " snapset " << recovery_info.ss
             << " legacy_snaps " << recovery_info.oi.legacy_snaps << dendl;
    if (recovery_info.ss.is_legacy() ||
        recovery_info.ss.seq == 0 /* jewel osd doesn't populate this */) {
      assert(recovery_info.oi.legacy_snaps.size());
      snaps.insert(recovery_info.oi.legacy_snaps.begin(),
                   recovery_info.oi.legacy_snaps.end());
    } else {
      auto p = recovery_info.ss.clone_snaps.find(hoid.snap);
      assert(p != recovery_info.ss.clone_snaps.end());  // hmm, should we warn?
      snaps.insert(p->second.begin(), p->second.end());
    }
    dout(20) << " snaps " << snaps << dendl;
    snap_mapper.add_oid(
      recovery_info.soid,
      snaps,
      &_t);
  }
  if (pg_log.get_missing().is_missing(recovery_info.soid) &&
      pg_log.get_missing().get_items().find(recovery_info.soid)->second.need > recovery_info.version) {
    assert(is_primary());
    const pg_log_entry_t *latest = pg_log.get_log().objects.find(recovery_info.soid)->second;
    if (latest->op == pg_log_entry_t::LOST_REVERT &&
        latest->reverting_to == recovery_info.version) {
      dout(10) << " got old revert version " << recovery_info.version
               << " for " << *latest << dendl;
      recovery_info.version = latest->version;
      // update the attr to the revert event version
      recovery_info.oi.prior_version = recovery_info.oi.version;
      recovery_info.oi.version = latest->version;
      bufferlist bl;
      ::encode(recovery_info.oi, bl,
               get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
      assert(!pool.info.require_rollback());
      t->setattr(coll, ghobject_t(recovery_info.soid), OI_ATTR, bl);
      if (obc)
        obc->attr_cache[OI_ATTR] = bl;
    }
  }

  // keep track of active pushes for scrub
  ++active_pushes;

  if (recovery_info.version > pg_log.get_can_rollback_to()) {
    /* This can only happen during a repair, and even then, it would
     * be one heck of a race.  If we are repairing the object, the
     * write in question must be fully committed, so it's not valid
     * to roll it back anyway (and we'll be rolled forward shortly
     * anyway) */
    PGLogEntryHandler h{this, t};
    pg_log.roll_forward_to(recovery_info.version, &h);
  }
  recover_got(recovery_info.soid, recovery_info.version);

  if (is_primary()) {
    assert(obc);
    obc->obs.exists = true;
    obc->ondisk_write_lock();

    bool got = obc->get_recovery_read();
    assert(got);

    assert(recovering.count(obc->obs.oi.soid));
    recovering[obc->obs.oi.soid] = obc;
    obc->obs.oi = recovery_info.oi;  // may have been updated above


    t->register_on_applied(new C_OSD_AppliedRecoveredObject(this, obc));
    t->register_on_applied_sync(new C_OSD_OndiskWriteUnlock(obc));

    publish_stats_to_osd();
    assert(missing_loc.needs_recovery(hoid));
    missing_loc.add_location(hoid, pg_whoami);
    release_backoffs(hoid);
    if (!is_unreadable_object(hoid)) {
      auto unreadable_object_entry = waiting_for_unreadable_object.find(hoid);
      if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
        dout(20) << " kicking unreadable waiters on " << hoid << dendl;
        requeue_ops(unreadable_object_entry->second);
        waiting_for_unreadable_object.erase(unreadable_object_entry);
      }
    }
    if (pg_log.get_missing().get_items().size() == 0) {
      requeue_ops(waiting_for_all_missing);
      waiting_for_all_missing.clear();
    }
  } else {
    t->register_on_applied(
      new C_OSD_AppliedRecoveredObjectReplica(this));

  }

  t->register_on_commit(
    new C_OSD_CommittedPushedObject(
      this,
      get_osdmap()->get_epoch(),
      info.last_complete));

  // update pg
  dirty_info = true;
  write_if_dirty(*t);
}

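// Called once an object has been pushed to every replica that needed
// it: drop the recovery read lock, requeue any waiters, and retire the
// object from the recovering/backfilling bookkeeping.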
void PrimaryLogPG::on_global_recover(
  const hobject_t &soid,
  const object_stat_sum_t &stat_diff)
{
  info.stats.stats.sum.add(stat_diff);
  missing_loc.recovered(soid);
  publish_stats_to_osd();
  dout(10) << "pushed " << soid << " to all replicas" << dendl;
  map<hobject_t, ObjectContextRef>::iterator i = recovering.find(soid);
  assert(i != recovering.end());

  // recover missing won't have had an obc, but it gets filled in
  // during on_local_recover
  assert(i->second);
  list<OpRequestRef> requeue_list;
  i->second->drop_recovery_read(&requeue_list);
  requeue_ops(requeue_list);

  backfills_in_flight.erase(soid);

  recovering.erase(i);
  finish_recovery_op(soid);
  release_backoffs(soid);
  auto degraded_object_entry = waiting_for_degraded_object.find(soid);
  if (degraded_object_entry != waiting_for_degraded_object.end()) {
    dout(20) << " kicking degraded waiters on " << soid << dendl;
    requeue_ops(degraded_object_entry->second);
    waiting_for_degraded_object.erase(degraded_object_entry);
  }
  auto unreadable_object_entry = waiting_for_unreadable_object.find(soid);
  if (unreadable_object_entry != waiting_for_unreadable_object.end()) {
    dout(20) << " kicking unreadable waiters on " << soid << dendl;
    requeue_ops(unreadable_object_entry->second);
    waiting_for_unreadable_object.erase(unreadable_object_entry);
  }
  finish_degraded_object(soid);
}

void PrimaryLogPG::on_peer_recover(
  pg_shard_t peer,
  const hobject_t &soid,
  const ObjectRecoveryInfo &recovery_info)
{
  publish_stats_to_osd();
  // done!
  peer_missing[peer].got(soid, recovery_info.version);
}

void PrimaryLogPG::begin_peer_recover(
  pg_shard_t peer,
  const hobject_t soid)
{
  peer_missing[peer].revise_have(soid, eversion_t());
}

void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c)
{
  osd->recovery_gen_wq.queue(c);
}

void PrimaryLogPG::send_message_osd_cluster(
  int peer, Message *m, epoch_t from_epoch)
{
  osd->send_message_osd_cluster(peer, m, from_epoch);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, Connection *con)
{
  osd->send_message_osd_cluster(m, con);
}

void PrimaryLogPG::send_message_osd_cluster(
  Message *m, const ConnectionRef& con)
{
  osd->send_message_osd_cluster(m, con);
}

ConnectionRef PrimaryLogPG::get_con_osd_cluster(
  int peer, epoch_t from_epoch)
{
  return osd->get_con_osd_cluster(peer, from_epoch);
}

PerfCounters *PrimaryLogPG::get_logger()
{
  return osd->logger;
}


// ====================
// missing objects

bool PrimaryLogPG::is_missing_object(const hobject_t& soid) const
{
  return pg_log.get_missing().get_items().count(soid);
}

void PrimaryLogPG::maybe_kick_recovery(
  const hobject_t &soid)
{
  eversion_t v;
  if (!missing_loc.needs_recovery(soid, &v))
    return;

  map<hobject_t, ObjectContextRef>::const_iterator p = recovering.find(soid);
  if (p != recovering.end()) {
    dout(7) << "object " << soid << " v " << v << ", already recovering." << dendl;
  } else if (missing_loc.is_unfound(soid)) {
    dout(7) << "object " << soid << " v " << v << ", is unfound." << dendl;
  } else {
    dout(7) << "object " << soid << " v " << v << ", recovering." << dendl;
    PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
    if (is_missing_object(soid)) {
      recover_missing(soid, v, cct->_conf->osd_client_op_priority, h);
    } else {
      prep_object_replica_pushes(soid, v, h);
    }
    pgbackend->run_recovery_op(h, cct->_conf->osd_client_op_priority);
  }
}

void PrimaryLogPG::wait_for_unreadable_object(
  const hobject_t& soid, OpRequestRef op)
{
  assert(is_unreadable_object(soid));
  maybe_kick_recovery(soid);
  waiting_for_unreadable_object[soid].push_back(op);
  op->mark_delayed("waiting for missing object");
}

void PrimaryLogPG::wait_for_all_missing(OpRequestRef op)
{
  waiting_for_all_missing.push_back(op);
  op->mark_delayed("waiting for all missing");
}

bool PrimaryLogPG::is_degraded_or_backfilling_object(const hobject_t& soid)
{
  /* The conditions below may clear (on_local_recover, before we queue
   * the transaction) before we actually requeue the degraded waiters
   * in on_global_recover after the transaction completes.
   */
  if (waiting_for_degraded_object.count(soid))
    return true;
  if (pg_log.get_missing().get_items().count(soid))
    return true;
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator i = actingbackfill.begin();
       i != actingbackfill.end();
       ++i) {
    if (*i == get_primary()) continue;
    pg_shard_t peer = *i;
    auto peer_missing_entry = peer_missing.find(peer);
    if (peer_missing_entry != peer_missing.end() &&
        peer_missing_entry->second.get_items().count(soid))
      return true;

    // Object is degraded if after last_backfill AND
    // we are backfilling it
    if (is_backfill_targets(peer) &&
        peer_info[peer].last_backfill <= soid &&
        last_backfill_started >= soid &&
        backfills_in_flight.count(soid))
      return true;
  }
  return false;
}

void PrimaryLogPG::wait_for_degraded_object(const hobject_t& soid, OpRequestRef op)
{
  assert(is_degraded_or_backfilling_object(soid));

  maybe_kick_recovery(soid);
  waiting_for_degraded_object[soid].push_back(op);
  op->mark_delayed("waiting for degraded object");
}

void PrimaryLogPG::block_write_on_full_cache(
  const hobject_t& _oid, OpRequestRef op)
{
  const hobject_t oid = _oid.get_head();
  dout(20) << __func__ << ": blocking object " << oid
           << " on full cache" << dendl;
  objects_blocked_on_cache_full.insert(oid);
  waiting_for_cache_not_full.push_back(op);
  op->mark_delayed("waiting for cache not full");
}

void PrimaryLogPG::block_write_on_snap_rollback(
  const hobject_t& oid, ObjectContextRef obc, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << oid.get_head()
           << " on snap promotion " << obc->obs.oi.soid << dendl;
  // otherwise, we'd have blocked in do_op
  assert(oid.is_head());
  assert(objects_blocked_on_snap_promotion.count(oid) == 0);
  objects_blocked_on_snap_promotion[oid] = obc;
  wait_for_blocked_object(obc->obs.oi.soid, op);
}

void PrimaryLogPG::block_write_on_degraded_snap(
  const hobject_t& snap, OpRequestRef op)
{
  dout(20) << __func__ << ": blocking object " << snap.get_head()
           << " on degraded snap " << snap << dendl;
  // otherwise, we'd have blocked in do_op
  assert(objects_blocked_on_degraded_snap.count(snap.get_head()) == 0);
  objects_blocked_on_degraded_snap[snap.get_head()] = snap.snap;
  wait_for_degraded_object(snap, op);
}

bool PrimaryLogPG::maybe_await_blocked_snapset(
  const hobject_t &hoid,
  OpRequestRef op)
{
  ObjectContextRef obc;
  obc = object_contexts.lookup(hoid.get_head());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  obc = object_contexts.lookup(hoid.get_snapdir());
  if (obc) {
    if (obc->is_blocked()) {
      wait_for_blocked_object(obc->obs.oi.soid, op);
      return true;
    } else {
      return false;
    }
  }
  return false;
}

void PrimaryLogPG::wait_for_blocked_object(const hobject_t& soid, OpRequestRef op)
{
  dout(10) << __func__ << " " << soid << " " << op << dendl;
  waiting_for_blocked_object[soid].push_back(op);
  op->mark_delayed("waiting for blocked object");
}

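// If the PG log has grown close to osd_max_pg_log_entries while the PG
// is degraded or recovering/backfilling, kick recovery of the oldest
// missing object (on the primary or any actingbackfill peer) so log
// trimming is not held up indefinitely.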
void PrimaryLogPG::maybe_force_recovery()
{
  // no force if not in degraded/recovery/backfill states
  if (!is_degraded() &&
      !state_test(PG_STATE_RECOVERING |
                  PG_STATE_RECOVERY_WAIT |
                  PG_STATE_BACKFILL |
                  PG_STATE_BACKFILL_WAIT |
                  PG_STATE_BACKFILL_TOOFULL))
    return;

  if (pg_log.get_log().approx_size() <
      cct->_conf->osd_max_pg_log_entries *
      cct->_conf->osd_force_recovery_pg_log_entries_factor)
    return;

  // find the oldest missing object
  version_t min_version = 0;
  hobject_t soid;
  if (!pg_log.get_missing().get_items().empty()) {
    min_version = pg_log.get_missing().get_rmissing().begin()->first;
    soid = pg_log.get_missing().get_rmissing().begin()->second;
  }
  assert(!actingbackfill.empty());
  for (set<pg_shard_t>::iterator it = actingbackfill.begin();
       it != actingbackfill.end();
       ++it) {
    if (*it == get_primary()) continue;
    pg_shard_t peer = *it;
    if (peer_missing.count(peer) &&
        !peer_missing[peer].get_items().empty() &&
        min_version > peer_missing[peer].get_rmissing().begin()->first) {
      min_version = peer_missing[peer].get_rmissing().begin()->first;
      soid = peer_missing[peer].get_rmissing().begin()->second;
    }
  }

  // recover it
  if (soid != hobject_t())
    maybe_kick_recovery(soid);
}

class PGLSPlainFilter : public PGLSFilter {
  string val;
public:
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(xattr, params);
      ::decode(val, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }

    return 0;
  }
  ~PGLSPlainFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

class PGLSParentFilter : public PGLSFilter {
  inodeno_t parent_ino;
public:
  CephContext* cct;
  PGLSParentFilter(CephContext* cct) : cct(cct) {
    xattr = "_parent";
  }
  int init(bufferlist::iterator &params) override
  {
    try {
      ::decode(parent_ino, params);
    } catch (buffer::error &e) {
      return -EINVAL;
    }
    generic_dout(0) << "parent_ino=" << parent_ino << dendl;

    return 0;
  }
  ~PGLSParentFilter() override {}
  bool filter(const hobject_t &obj, bufferlist& xattr_data,
              bufferlist& outdata) override;
};

bool PGLSParentFilter::filter(const hobject_t &obj,
                              bufferlist& xattr_data, bufferlist& outdata)
{
  bufferlist::iterator iter = xattr_data.begin();
  inode_backtrace_t bt;

  generic_dout(0) << "PGLSParentFilter::filter" << dendl;

  ::decode(bt, iter);

  vector<inode_backpointer_t>::iterator vi;
  for (vi = bt.ancestors.begin(); vi != bt.ancestors.end(); ++vi) {
    generic_dout(0) << "vi->dirino=" << vi->dirino << " parent_ino=" << parent_ino << dendl;
    if (vi->dirino == parent_ino) {
      ::encode(*vi, outdata);
      return true;
    }
  }

  return false;
}

bool PGLSPlainFilter::filter(const hobject_t &obj,
                             bufferlist& xattr_data, bufferlist& outdata)
{
  if (val.size() != xattr_data.length())
    return false;

  if (memcmp(val.c_str(), xattr_data.c_str(), val.size()))
    return false;

  return true;
}

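// Evaluate a PGLS filter against an object: if the filter registered
// interest in an xattr, fetch it first and hand it to the filter.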
bool PrimaryLogPG::pgls_filter(PGLSFilter *filter, hobject_t& sobj, bufferlist& outdata)
{
  bufferlist bl;

  // If filter has expressed an interest in an xattr, load it.
  if (!filter->get_xattr().empty()) {
    int ret = pgbackend->objects_get_attr(
      sobj,
      filter->get_xattr(),
      &bl);
    dout(0) << "getattr (sobj=" << sobj << ", attr=" << filter->get_xattr() << ") returned " << ret << dendl;
    if (ret < 0) {
      if (ret != -ENODATA || filter->reject_empty_xattr()) {
        return false;
      }
    }
  }

  return filter->filter(sobj, bl, outdata);
}

int PrimaryLogPG::get_pgls_filter(bufferlist::iterator& iter, PGLSFilter **pfilter)
{
  string type;
  PGLSFilter *filter;

  try {
    ::decode(type, iter);
  }
  catch (buffer::error& e) {
    return -EINVAL;
  }

  if (type.compare("parent") == 0) {
    filter = new PGLSParentFilter(cct);
  } else if (type.compare("plain") == 0) {
    filter = new PGLSPlainFilter();
  } else {
    std::size_t dot = type.find(".");
    if (dot == std::string::npos || dot == 0 || dot == type.size() - 1) {
      return -EINVAL;
    }

    const std::string class_name = type.substr(0, dot);
    const std::string filter_name = type.substr(dot + 1);
    ClassHandler::ClassData *cls = NULL;
    int r = osd->class_handler->open_class(class_name, &cls);
    if (r != 0) {
      derr << "Error opening class '" << class_name << "': "
           << cpp_strerror(r) << dendl;
      if (r != -EPERM) // propagate permission error
        r = -EINVAL;
      return r;
    } else {
      assert(cls);
    }

    ClassHandler::ClassFilter *class_filter = cls->get_filter(filter_name);
    if (class_filter == NULL) {
      derr << "Error finding filter '" << filter_name << "' in class "
           << class_name << dendl;
      return -EINVAL;
    }
    filter = class_filter->fn();
    if (!filter) {
      // Object classes are obliged to return us something, but let's
      // give an error rather than asserting out.
      derr << "Buggy class " << class_name << " failed to construct "
              "filter " << filter_name << dendl;
      return -EINVAL;
    }
  }

  assert(filter);
  int r = filter->init(iter);
  if (r < 0) {
    derr << "Error initializing filter " << type << ": "
         << cpp_strerror(r) << dendl;
    delete filter;
    return -EINVAL;
  } else {
    // Successfully constructed and initialized, return it.
    *pfilter = filter;
    return 0;
  }
}


// ==========================================================

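// Handler for pg commands ("ceph pg <pgid> ...") routed to this PG:
// "query", "mark_unfound_lost", and "list_missing".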
int PrimaryLogPG::do_command(
  cmdmap_t cmdmap,
  ostream& ss,
  bufferlist& idata,
  bufferlist& odata,
  ConnectionRef con,
  ceph_tid_t tid)
{
  const pg_missing_t &missing = pg_log.get_missing();
  string format;

  cmd_getval(cct, cmdmap, "format", format);
  boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json"));

  string command;
  cmd_getval(cct, cmdmap, "cmd", command);
  if (command == "query") {
    f->open_object_section("pg");
    f->dump_string("state", pg_state_string(get_state()));
    f->dump_stream("snap_trimq") << snap_trimq;
    f->dump_unsigned("epoch", get_osdmap()->get_epoch());
    f->open_array_section("up");
    for (vector<int>::iterator p = up.begin(); p != up.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    f->open_array_section("acting");
    for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p)
      f->dump_unsigned("osd", *p);
    f->close_section();
    if (!backfill_targets.empty()) {
      f->open_array_section("backfill_targets");
      for (set<pg_shard_t>::iterator p = backfill_targets.begin();
           p != backfill_targets.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    if (!actingbackfill.empty()) {
      f->open_array_section("actingbackfill");
      for (set<pg_shard_t>::iterator p = actingbackfill.begin();
           p != actingbackfill.end();
           ++p)
        f->dump_stream("shard") << *p;
      f->close_section();
    }
    f->open_object_section("info");
    _update_calc_stats();
    info.dump(f.get());
    f->close_section();

    f->open_array_section("peer_info");
    for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
         p != peer_info.end();
         ++p) {
      f->open_object_section("info");
      f->dump_stream("peer") << p->first;
      p->second.dump(f.get());
      f->close_section();
    }
    f->close_section();

    f->open_array_section("recovery_state");
    handle_query_state(f.get());
    f->close_section();

    f->open_object_section("agent_state");
    if (agent_state)
      agent_state->dump(f.get());
    f->close_section();

    f->close_section();
    f->flush(odata);
    return 0;
  }
  else if (command == "mark_unfound_lost") {
    string mulcmd;
    cmd_getval(cct, cmdmap, "mulcmd", mulcmd);
    int mode = -1;
    if (mulcmd == "revert") {
      if (pool.info.ec_pool()) {
        ss << "mode must be 'delete' for ec pool";
        return -EINVAL;
      }
      mode = pg_log_entry_t::LOST_REVERT;
    } else if (mulcmd == "delete") {
      mode = pg_log_entry_t::LOST_DELETE;
    } else {
      ss << "mode must be 'revert' or 'delete'; mark not yet implemented";
      return -EINVAL;
    }
    assert(mode == pg_log_entry_t::LOST_REVERT ||
           mode == pg_log_entry_t::LOST_DELETE);

    if (!is_primary()) {
      ss << "not primary";
      return -EROFS;
    }

    uint64_t unfound = missing_loc.num_unfound();
    if (!unfound) {
      ss << "pg has no unfound objects";
      return 0;  // make command idempotent
    }

    if (!all_unfound_are_queried_or_lost(get_osdmap())) {
      ss << "pg has " << unfound
         << " unfound objects but we haven't probed all sources, not marking lost";
      return -EINVAL;
    }

    mark_all_unfound_lost(mode, con, tid);
    return -EAGAIN;
  }
  else if (command == "list_missing") {
    hobject_t offset;
    string offset_json;
    if (cmd_getval(cct, cmdmap, "offset", offset_json)) {
      json_spirit::Value v;
      try {
        if (!json_spirit::read(offset_json, v))
          throw std::runtime_error("bad json");
        offset.decode(v);
      } catch (std::runtime_error& e) {
        ss << "error parsing offset: " << e.what();
        return -EINVAL;
      }
    }
    f->open_object_section("missing");
    {
      f->open_object_section("offset");
      offset.dump(f.get());
      f->close_section();
    }
    f->dump_int("num_missing", missing.num_missing());
    f->dump_int("num_unfound", get_num_unfound());
    const map<hobject_t, pg_missing_item> &needs_recovery_map =
      missing_loc.get_needs_recovery();
    map<hobject_t, pg_missing_item>::const_iterator p =
      needs_recovery_map.upper_bound(offset);
    {
      f->open_array_section("objects");
      int32_t num = 0;
      for (; p != needs_recovery_map.end() && num < cct->_conf->osd_command_max_records; ++p) {
        if (missing_loc.is_unfound(p->first)) {
          f->open_object_section("object");
          {
            f->open_object_section("oid");
            p->first.dump(f.get());
            f->close_section();
          }
          p->second.dump(f.get());  // have, need keys
          {
            f->open_array_section("locations");
            for (set<pg_shard_t>::iterator r =
                   missing_loc.get_locations(p->first).begin();
                 r != missing_loc.get_locations(p->first).end();
                 ++r)
              f->dump_stream("shard") << *r;
            f->close_section();
          }
          f->close_section();
          num++;
        }
      }
      f->close_section();
    }
    f->dump_bool("more", p != needs_recovery_map.end());
    f->close_section();
    f->flush(odata);
    return 0;
  }

  ss << "unknown pg command " << command;
  return -EINVAL;
}

// ==========================================================

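// Execute pg-scoped ops (PGLS/PGNLS listing, hit_set inspection, scrub
// error listing) directly; these never touch an individual object's
// context.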
void PrimaryLogPG::do_pg_op(OpRequestRef op)
{
  // NOTE: this is non-const because we modify the OSDOp.outdata in
  // place
  MOSDOp *m = static_cast<MOSDOp *>(op->get_nonconst_req());
  assert(m->get_type() == CEPH_MSG_OSD_OP);
  dout(10) << "do_pg_op " << *m << dendl;

  op->mark_started();

  int result = 0;
  string cname, mname;
  PGLSFilter *filter = NULL;
  bufferlist filter_out;

  snapid_t snapid = m->get_snapid();

  vector<OSDOp> ops = m->ops;

  for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
    OSDOp& osd_op = *p;
    bufferlist::iterator bp = p->indata.begin();
    switch (p->op.op) {
    case CEPH_OSD_OP_PGNLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGNLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgnls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgnls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_nls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGNLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t lower_bound = response.handle;
        hobject_t pg_start = info.pgid.pgid.get_hobj_start();
        hobject_t pg_end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        dout(10) << " pgnls lower_bound " << lower_bound
                 << " pg_end " << pg_end << dendl;
        if (((!lower_bound.is_max() && lower_bound >= pg_end) ||
             (lower_bound != hobject_t() && lower_bound < pg_start))) {
          // this should only happen with a buggy client.
          dout(10) << "outside of PG bounds " << pg_start << " .. "
                   << pg_end << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t current = lower_bound;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          dout(10) << " pgnls candidate 0x" << std::hex << candidate.get_hash()
                   << " vs lower bound 0x" << lower_bound.get_hash() << dendl;

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip internal namespace
          if (candidate.get_namespace() == cct->_conf->osd_hit_set_namespace)
            continue;

          // skip wrong namespace
          if (m->get_hobj().nspace != librados::all_nspaces &&
              candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          dout(20) << "pgnls item 0x" << std::hex
                   << candidate.get_hash()
                   << ", rev 0x" << hobject_t::_reverse_bits(candidate.get_hash())
                   << std::dec << " "
                   << candidate.oid.name << dendl;

          librados::ListObjectImpl item;
          item.nspace = candidate.get_namespace();
          item.oid = candidate.oid.name;
          item.locator = candidate.get_key();
          response.entries.push_back(item);
        }

        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;

          // Set response.handle to the start of the next PG according
          // to the object sort order.
          response.handle = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
        } else {
          response.handle = next;
        }
        dout(10) << "pgnls handle=" << response.handle << dendl;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgnls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PGLS_FILTER:
      try {
        ::decode(cname, bp);
        ::decode(mname, bp);
      }
      catch (const buffer::error& e) {
        dout(0) << "unable to decode PGLS_FILTER description in " << *m << dendl;
        result = -EINVAL;
        break;
      }
      if (filter) {
        delete filter;
        filter = NULL;
      }
      result = get_pgls_filter(bp, &filter);
      if (result < 0)
        break;

      assert(filter);

      // fall through

    case CEPH_OSD_OP_PGLS:
      if (snapid != CEPH_NOSNAP) {
        result = -EINVAL;
        break;
      }
      if (get_osdmap()->raw_pg_to_pg(m->get_pg()) != info.pgid.pgid) {
        dout(10) << " pgls pg=" << m->get_pg()
                 << " " << get_osdmap()->raw_pg_to_pg(m->get_pg())
                 << " != " << info.pgid << dendl;
        result = 0; // hmm?
      } else {
        unsigned list_size = MIN(cct->_conf->osd_max_pgls, p->op.pgls.count);

        dout(10) << " pgls pg=" << m->get_pg() << " count " << list_size << dendl;
        // read into a buffer
        vector<hobject_t> sentries;
        pg_ls_response_t response;
        try {
          ::decode(response.handle, bp);
        }
        catch (const buffer::error& e) {
          dout(0) << "unable to decode PGLS handle in " << *m << dendl;
          result = -EINVAL;
          break;
        }

        hobject_t next;
        hobject_t current = response.handle;
        osr->flush();
        int r = pgbackend->objects_list_partial(
          current,
          list_size,
          list_size,
          &sentries,
          &next);
        if (r != 0) {
          result = -EINVAL;
          break;
        }

        assert(snapid == CEPH_NOSNAP || pg_log.get_missing().get_items().empty());

        map<hobject_t, pg_missing_item>::const_iterator missing_iter =
          pg_log.get_missing().get_items().lower_bound(current);
        vector<hobject_t>::iterator ls_iter = sentries.begin();
        hobject_t _max = hobject_t::get_max();
        while (1) {
          const hobject_t &mcand =
            missing_iter == pg_log.get_missing().get_items().end() ?
            _max :
            missing_iter->first;
          const hobject_t &lcand =
            ls_iter == sentries.end() ?
            _max :
            *ls_iter;

          hobject_t candidate;
          if (mcand == lcand) {
            candidate = mcand;
            if (!mcand.is_max()) {
              ++ls_iter;
              ++missing_iter;
            }
          } else if (mcand < lcand) {
            candidate = mcand;
            assert(!mcand.is_max());
            ++missing_iter;
          } else {
            candidate = lcand;
            assert(!lcand.is_max());
            ++ls_iter;
          }

          if (candidate >= next) {
            break;
          }

          if (response.entries.size() == list_size) {
            next = candidate;
            break;
          }

          // skip snapdir objects
          if (candidate.snap == CEPH_SNAPDIR)
            continue;

          if (candidate.snap != CEPH_NOSNAP)
            continue;

          // skip wrong namespace
          if (candidate.get_namespace() != m->get_hobj().nspace)
            continue;

          if (filter && !pgls_filter(filter, candidate, filter_out))
            continue;

          response.entries.push_back(make_pair(candidate.oid,
                                               candidate.get_key()));
        }
        if (next.is_max() &&
            missing_iter == pg_log.get_missing().get_items().end() &&
            ls_iter == sentries.end()) {
          result = 1;
        }
        response.handle = next;
        ::encode(response, osd_op.outdata);
        if (filter)
          ::encode(filter_out, osd_op.outdata);
        dout(10) << " pgls result=" << result << " outdata.length()="
                 << osd_op.outdata.length() << dendl;
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_LS:
      {
        list< pair<utime_t,utime_t> > ls;
        for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
             p != info.hit_set.history.end();
             ++p)
          ls.push_back(make_pair(p->begin, p->end));
        if (hit_set)
          ls.push_back(make_pair(hit_set_start_stamp, utime_t()));
        ::encode(ls, osd_op.outdata);
      }
      break;

    case CEPH_OSD_OP_PG_HITSET_GET:
      {
        utime_t stamp(osd_op.op.hit_set_get.stamp);
        if (hit_set_start_stamp && stamp >= hit_set_start_stamp) {
          // read the current in-memory HitSet, not the version we've
          // checkpointed.
          if (!hit_set) {
            result = -ENOENT;
            break;
          }
          ::encode(*hit_set, osd_op.outdata);
          result = osd_op.outdata.length();
        } else {
          // read an archived HitSet.
          hobject_t oid;
          for (list<pg_hit_set_info_t>::const_iterator p = info.hit_set.history.begin();
               p != info.hit_set.history.end();
               ++p) {
            if (stamp >= p->begin && stamp <= p->end) {
              oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
              break;
            }
          }
          if (oid == hobject_t()) {
            result = -ENOENT;
            break;
          }
          if (!pool.info.is_replicated()) {
            // FIXME: EC not supported yet
            result = -EOPNOTSUPP;
            break;
          }
          if (is_unreadable_object(oid)) {
            wait_for_unreadable_object(oid, op);
            delete filter;
            return;
          }
          result = osd->store->read(ch, ghobject_t(oid), 0, 0, osd_op.outdata);
        }
      }
      break;

    case CEPH_OSD_OP_SCRUBLS:
      result = do_scrub_ls(m, &osd_op);
      break;

    default:
      result = -EINVAL;
      break;
    }

    if (result < 0)
      break;
  }

  // reply
  MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(),
                                       CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK,
                                       false);
  reply->claim_op_out_data(ops);
  reply->set_result(result);
  reply->set_reply_versions(info.last_update, info.last_user_version);
  osd->send_message_osd_client(reply, m->get_connection());
  delete filter;
}

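// Handle CEPH_OSD_OP_SCRUBLS: list the snapset or object errors the
// scrubber recorded for the current interval.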
int PrimaryLogPG::do_scrub_ls(MOSDOp *m, OSDOp *osd_op)
{
  if (m->get_pg() != info.pgid.pgid) {
    dout(10) << " scrubls pg=" << m->get_pg() << " != " << info.pgid << dendl;
    return -EINVAL; // hmm?
  }
  auto bp = osd_op->indata.begin();
  scrub_ls_arg_t arg;
  try {
    arg.decode(bp);
  } catch (buffer::error&) {
    dout(10) << " corrupted scrub_ls_arg_t" << dendl;
    return -EINVAL;
  }
  int r = 0;
  scrub_ls_result_t result = {.interval = info.history.same_interval_since};
  if (arg.interval != 0 && arg.interval != info.history.same_interval_since) {
    r = -EAGAIN;
  } else if (!scrubber.store) {
    r = -ENOENT;
  } else if (arg.get_snapsets) {
    result.vals = scrubber.store->get_snap_errors(osd->store,
                                                  get_pgid().pool(),
                                                  arg.start_after,
                                                  arg.max_return);
  } else {
    result.vals = scrubber.store->get_object_errors(osd->store,
                                                    get_pgid().pool(),
                                                    arg.start_after,
                                                    arg.max_return);
  }
  ::encode(result, osd_op->outdata);
  return r;
}

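// Pick a new pg_trim_to for the PG log: aim for osd_min_pg_log_entries
// (or osd_max_pg_log_entries while degraded/recovering/backfilling),
// never trim past min_last_complete_ondisk or the log's
// can_rollback_to bound, and only act once at least
// osd_pg_log_trim_min entries can go.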
void PrimaryLogPG::calc_trim_to()
{
  size_t target = cct->_conf->osd_min_pg_log_entries;
  if (is_degraded() ||
      state_test(PG_STATE_RECOVERING |
                 PG_STATE_RECOVERY_WAIT |
                 PG_STATE_BACKFILL |
                 PG_STATE_BACKFILL_WAIT |
                 PG_STATE_BACKFILL_TOOFULL)) {
    target = cct->_conf->osd_max_pg_log_entries;
  }

  eversion_t limit = MIN(
    min_last_complete_ondisk,
    pg_log.get_can_rollback_to());
  if (limit != eversion_t() &&
      limit != pg_trim_to &&
      pg_log.get_log().approx_size() > target) {
    size_t num_to_trim = pg_log.get_log().approx_size() - target;
    if (num_to_trim < cct->_conf->osd_pg_log_trim_min) {
      return;
    }
    list<pg_log_entry_t>::const_iterator it = pg_log.get_log().log.begin();
    eversion_t new_trim_to;
    for (size_t i = 0; i < num_to_trim; ++i) {
      new_trim_to = it->version;
      ++it;
      if (new_trim_to > limit) {
        new_trim_to = limit;
        dout(10) << "calc_trim_to trimming to min_last_complete_ondisk" << dendl;
        break;
      }
    }
    dout(10) << "calc_trim_to " << pg_trim_to << " -> " << new_trim_to << dendl;
    pg_trim_to = new_trim_to;
    assert(pg_trim_to <= pg_log.get_head());
    assert(pg_trim_to <= min_last_complete_ondisk);
  }
}

PrimaryLogPG::PrimaryLogPG(OSDService *o, OSDMapRef curmap,
                           const PGPool &_pool, spg_t p) :
  PG(o, curmap, _pool, p),
  pgbackend(
    PGBackend::build_pg_backend(
      _pool.info, curmap, this, coll_t(p), ch, o->store, cct)),
  object_contexts(o->cct, o->cct->_conf->osd_pg_object_context_cache_count),
  snapset_contexts_lock("PrimaryLogPG::snapset_contexts_lock"),
  new_backfill(false),
  temp_seq(0),
  snap_trimmer_machine(this)
{
  missing_loc.set_backend_predicates(
    pgbackend->get_is_readable_predicate(),
    pgbackend->get_is_recoverable_predicate());
  snap_trimmer_machine.initiate();
}

void PrimaryLogPG::get_src_oloc(const object_t& oid, const object_locator_t& oloc, object_locator_t& src_oloc)
{
  src_oloc = oloc;
  if (oloc.key.empty())
    src_oloc.key = oid.name;
}

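// Handle a client's ack of a backoff: clamp the acked range to this
// PG's bounds and clear it from the session's backoff map.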
void PrimaryLogPG::handle_backoff(OpRequestRef& op)
{
  const MOSDBackoff *m = static_cast<const MOSDBackoff*>(op->get_req());
  SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
  if (!session)
    return;  // drop it.
  session->put();  // get_priv takes a ref, and so does the SessionRef
  hobject_t begin = info.pgid.pgid.get_hobj_start();
  hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
  if (begin < m->begin) {
    begin = m->begin;
  }
  if (end > m->end) {
    end = m->end;
  }
  dout(10) << __func__ << " backoff ack id " << m->id
           << " [" << begin << "," << end << ")" << dendl;
  session->ack_backoff(cct, m->pgid, m->id, begin, end);
}

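// Top-level dispatch for everything queued to this PG: apply ordering
// barriers first (waiting_for_map, in-flight flushes, peering), then
// pg-wide backoffs, then route the message by type.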
void PrimaryLogPG::do_request(
  OpRequestRef& op,
  ThreadPool::TPHandle &handle)
{
  if (op->osd_trace) {
    op->pg_trace.init("pg op", &trace_endpoint, &op->osd_trace);
    op->pg_trace.event("do request");
  }
  // make sure we have a new enough map
  auto p = waiting_for_map.find(op->get_source());
  if (p != waiting_for_map.end()) {
    // preserve ordering
    dout(20) << __func__ << " waiting_for_map "
             << p->first << " not empty, queueing" << dendl;
    p->second.push_back(op);
    op->mark_delayed("waiting_for_map not empty");
    return;
  }
  if (!have_same_or_newer_map(op->min_epoch)) {
    dout(20) << __func__ << " min " << op->min_epoch
             << ", queue on waiting_for_map " << op->get_source() << dendl;
    waiting_for_map[op->get_source()].push_back(op);
    op->mark_delayed("op must wait for map");
    return;
  }

  if (can_discard_request(op)) {
    return;
  }

  // pg-wide backoffs
  const Message *m = op->get_req();
  if (m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF)) {
    SessionRef session = static_cast<Session*>(m->get_connection()->get_priv());
    if (!session)
      return;  // drop it.
    session->put();  // get_priv takes a ref, and so does the SessionRef

    if (op->get_req()->get_type() == CEPH_MSG_OSD_OP) {
      if (session->check_backoff(cct, info.pgid,
                                 info.pgid.pgid.get_hobj_start(), m)) {
        return;
      }

      bool backoff =
        is_down() ||
        is_incomplete() ||
        (!is_active() && is_peered());
      if (g_conf->osd_backoff_on_peering && !backoff) {
        if (is_peering()) {
          backoff = true;
        }
      }
      if (backoff) {
        add_pg_backoff(session);
        return;
      }
    }
    // pg backoff acks at pg-level
    if (op->get_req()->get_type() == CEPH_MSG_OSD_BACKOFF) {
      const MOSDBackoff *ba = static_cast<const MOSDBackoff*>(m);
      if (ba->begin != ba->end) {
        handle_backoff(op);
        return;
      }
    }
  }

  if (flushes_in_progress > 0) {
    dout(20) << flushes_in_progress
             << " flushes_in_progress pending "
             << "waiting for active on " << op << dendl;
    waiting_for_peered.push_back(op);
    op->mark_delayed("waiting for peered");
    return;
  }

  if (!is_peered()) {
    // Delay unless PGBackend says it's ok
    if (pgbackend->can_handle_while_inactive(op)) {
      bool handled = pgbackend->handle_message(op);
      assert(handled);
      return;
    } else {
      waiting_for_peered.push_back(op);
      op->mark_delayed("waiting for peered");
      return;
    }
  }

  assert(is_peered() && flushes_in_progress == 0);
  if (pgbackend->handle_message(op))
    return;

  switch (op->get_req()->get_type()) {
  case CEPH_MSG_OSD_OP:
  case CEPH_MSG_OSD_BACKOFF:
    if (!is_active()) {
      dout(20) << " peered, not active, waiting for active on " << op << dendl;
      waiting_for_active.push_back(op);
      op->mark_delayed("waiting for active");
      return;
    }
    switch (op->get_req()->get_type()) {
    case CEPH_MSG_OSD_OP:
      // verify client features
      if ((pool.info.has_tiers() || pool.info.is_tier()) &&
          !op->has_feature(CEPH_FEATURE_OSD_CACHEPOOL)) {
        osd->reply_op_error(op, -EOPNOTSUPP);
        return;
      }
      do_op(op);
      break;
    case CEPH_MSG_OSD_BACKOFF:
      // object-level backoff acks handled in osdop context
      handle_backoff(op);
      break;
    }
    break;

  case MSG_OSD_SUBOP:
    do_sub_op(op);
    break;

  case MSG_OSD_SUBOPREPLY:
    do_sub_op_reply(op);
    break;

  case MSG_OSD_PG_SCAN:
    do_scan(op, handle);
    break;

  case MSG_OSD_PG_BACKFILL:
    do_backfill(op);
    break;

  case MSG_OSD_PG_BACKFILL_REMOVE:
    do_backfill_remove(op);
    break;

  case MSG_OSD_SCRUB_RESERVE:
    {
      const MOSDScrubReserve *m =
        static_cast<const MOSDScrubReserve*>(op->get_req());
      switch (m->type) {
      case MOSDScrubReserve::REQUEST:
        handle_scrub_reserve_request(op);
        break;
      case MOSDScrubReserve::GRANT:
        handle_scrub_reserve_grant(op, m->from);
        break;
      case MOSDScrubReserve::REJECT:
        handle_scrub_reserve_reject(op, m->from);
        break;
      case MOSDScrubReserve::RELEASE:
        handle_scrub_reserve_release(op);
        break;
      }
    }
    break;

  case MSG_OSD_REP_SCRUB:
    replica_scrub(op, handle);
    break;

  case MSG_OSD_REP_SCRUBMAP:
    do_replica_scrub_map(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING:
    do_update_log_missing(op);
    break;

  case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
    do_update_log_missing_reply(op);
    break;

  default:
    assert(0 == "bad message type in do_request");
  }
}

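// Return the smallest last_backfill across all backfill targets;
// objects at or below this bound exist on every backfill peer.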
hobject_t PrimaryLogPG::earliest_backfill() const
{
  hobject_t e = hobject_t::get_max();
  for (set<pg_shard_t>::iterator i = backfill_targets.begin();
       i != backfill_targets.end();
       ++i) {
    pg_shard_t bt = *i;
    map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(bt);
    assert(iter != peer_info.end());
    if (iter->second.last_backfill < e)
      e = iter->second.last_backfill;
  }
  return e;
}

1800 /** do_op - do an op
1801 * pg lock will be held (if multithreaded)
1802 * osd_lock NOT held.
1803 */
1804 void PrimaryLogPG::do_op(OpRequestRef& op)
1805 {
1806 FUNCTRACE();
1807 // NOTE: take a non-const pointer here; we must be careful not to
1808 // change anything that will break other reads on m (operator<<).
1809 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
1810 assert(m->get_type() == CEPH_MSG_OSD_OP);
1811 if (m->finish_decode()) {
1812 op->reset_desc(); // for TrackedOp
1813 m->clear_payload();
1814 }
1815
1816 dout(20) << __func__ << ": op " << *m << dendl;
1817
1818 hobject_t head = m->get_hobj();
1819 head.snap = CEPH_NOSNAP;
1820
1821 if (!info.pgid.pgid.contains(
1822 info.pgid.pgid.get_split_bits(pool.info.get_pg_num()), head)) {
1823 derr << __func__ << " " << info.pgid.pgid << " does not contain "
1824 << head << " pg_num " << pool.info.get_pg_num() << " hash "
1825 << std::hex << head.get_hash() << std::dec << dendl;
1826 osd->clog->warn() << info.pgid.pgid << " does not contain " << head
1827 << " op " << *m;
1828 assert(!cct->_conf->osd_debug_misdirected_ops);
1829 return;
1830 }
1831
1832 bool can_backoff =
1833 m->get_connection()->has_feature(CEPH_FEATURE_RADOS_BACKOFF);
1834 SessionRef session;
1835 if (can_backoff) {
1836 session = static_cast<Session*>(m->get_connection()->get_priv());
1837 if (!session.get()) {
1838 dout(10) << __func__ << " no session" << dendl;
1839 return;
1840 }
1841 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
1842
1843 if (session->check_backoff(cct, info.pgid, head, m)) {
1844 return;
1845 }
1846 }
1847
1848 if (m->has_flag(CEPH_OSD_FLAG_PARALLELEXEC)) {
1849 // not implemented.
1850 dout(20) << __func__ << ": PARALLELEXEC not implemented " << *m << dendl;
1851 osd->reply_op_error(op, -EINVAL);
1852 return;
1853 }
1854
1855 if (op->rmw_flags == 0) {
1856 int r = osd->osd->init_op_flags(op);
1857 if (r) {
1858 osd->reply_op_error(op, r);
1859 return;
1860 }
1861 }
1862
1863 if ((m->get_flags() & (CEPH_OSD_FLAG_BALANCE_READS |
1864 CEPH_OSD_FLAG_LOCALIZE_READS)) &&
1865 op->may_read() &&
1866 !(op->may_write() || op->may_cache())) {
1867 // balanced reads; any replica will do
1868 if (!(is_primary() || is_replica())) {
1869 osd->handle_misdirected_op(this, op);
1870 return;
1871 }
1872 } else {
1873 // normal case; must be primary
1874 if (!is_primary()) {
1875 osd->handle_misdirected_op(this, op);
1876 return;
1877 }
1878 }
1879
1880 if (!op_has_sufficient_caps(op)) {
1881 osd->reply_op_error(op, -EPERM);
1882 return;
1883 }
1884
1885 if (op->includes_pg_op()) {
1886 return do_pg_op(op);
1887 }
1888
1889 // object name too long?
1890 if (m->get_oid().name.size() > cct->_conf->osd_max_object_name_len) {
1891 dout(4) << "do_op name is longer than "
1892 << cct->_conf->osd_max_object_name_len
1893 << " bytes" << dendl;
1894 osd->reply_op_error(op, -ENAMETOOLONG);
1895 return;
1896 }
1897 if (m->get_hobj().get_key().size() > cct->_conf->osd_max_object_name_len) {
1898 dout(4) << "do_op locator is longer than "
1899 << cct->_conf->osd_max_object_name_len
1900 << " bytes" << dendl;
1901 osd->reply_op_error(op, -ENAMETOOLONG);
1902 return;
1903 }
1904 if (m->get_hobj().nspace.size() > cct->_conf->osd_max_object_namespace_len) {
1905 dout(4) << "do_op namespace is longer than "
1906 << cct->_conf->osd_max_object_namespace_len
1907 << " bytes" << dendl;
1908 osd->reply_op_error(op, -ENAMETOOLONG);
1909 return;
1910 }
1911
1912 if (int r = osd->store->validate_hobject_key(head)) {
1913 dout(4) << "do_op object " << head << " invalid for backing store: "
1914 << r << dendl;
1915 osd->reply_op_error(op, r);
1916 return;
1917 }
1918
1919 // blacklisted?
1920 if (get_osdmap()->is_blacklisted(m->get_source_addr())) {
1921 dout(10) << "do_op " << m->get_source_addr() << " is blacklisted" << dendl;
1922 osd->reply_op_error(op, -EBLACKLISTED);
1923 return;
1924 }
1925
1926 // order this op as a write?
1927 bool write_ordered = op->rwordered();
1928
1929 // discard due to cluster full transition? (we discard any op that
1930 // originates before the cluster or pool is marked full; the client
1931 // will resend after the full flag is removed or if they expect the
1932 // op to succeed despite being full). The exceptions are FULL_FORCE and
1933 // FULL_TRY ops, which there is no reason to discard because they
1934 // bypass all full checks anyway. If this op isn't write-ordered,
1935 // we skip the check.
1936 // FIXME: we exclude mds writes for now.
1937 if (write_ordered && !(m->get_source().is_mds() ||
1938 m->has_flag(CEPH_OSD_FLAG_FULL_TRY) ||
1939 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) &&
1940 info.history.last_epoch_marked_full > m->get_map_epoch()) {
1941 dout(10) << __func__ << " discarding op sent before full " << m << " "
1942 << *m << dendl;
1943 return;
1944 }
1945 // The MDS should have stopped writing before this point.
1946 // We can't allow the OSD to become non-startable even if the MDS
1947 // could be writing as part of file removals.
1948 ostringstream ss;
1949 if (write_ordered && osd->check_failsafe_full(ss)) {
1950 dout(10) << __func__ << " fail-safe full check failed, dropping request"
1951 << ss.str()
1952 << dendl;
1953 return;
1954 }
1955 int64_t poolid = get_pgid().pool();
1956 if (op->may_write()) {
1957
1958 const pg_pool_t *pi = get_osdmap()->get_pg_pool(poolid);
1959 if (!pi) {
1960 return;
1961 }
1962
1963 // invalid?
1964 if (m->get_snapid() != CEPH_NOSNAP) {
1965 dout(20) << __func__ << ": write to clone not valid " << *m << dendl;
1966 osd->reply_op_error(op, -EINVAL);
1967 return;
1968 }
1969
1970 // too big?
1971 if (cct->_conf->osd_max_write_size &&
1972 m->get_data_len() > cct->_conf->osd_max_write_size << 20) {
1973 // journal can't hold commit!
1974 derr << "do_op msg data len " << m->get_data_len()
1975 << " > osd_max_write_size " << (cct->_conf->osd_max_write_size << 20)
1976 << " on " << *m << dendl;
1977 osd->reply_op_error(op, -OSD_WRITETOOBIG);
1978 return;
1979 }
1980 }
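// Editorial worked example: osd_max_write_size is expressed in MiB, so
// the << 20 above converts it to bytes before comparing against the
// message payload, e.g. with the common default of 90:
//
//   90 << 20 == 94371840   // 90 MiB
//
// so a single client write larger than that fails with -OSD_WRITETOOBIG
// instead of overrunning the journal.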
1981
1982 dout(10) << "do_op " << *m
1983 << (op->may_write() ? " may_write" : "")
1984 << (op->may_read() ? " may_read" : "")
1985 << (op->may_cache() ? " may_cache" : "")
1986 << " -> " << (write_ordered ? "write-ordered" : "read-ordered")
1987 << " flags " << ceph_osd_flag_string(m->get_flags())
1988 << dendl;
1989
1990 // missing object?
1991 if (is_unreadable_object(head)) {
1992 if (can_backoff &&
1993 (g_conf->osd_backoff_on_degraded ||
1994 (g_conf->osd_backoff_on_unfound && missing_loc.is_unfound(head)))) {
1995 add_backoff(session, head, head);
1996 maybe_kick_recovery(head);
1997 } else {
1998 wait_for_unreadable_object(head, op);
1999 }
2000 return;
2001 }
2002
2003 // degraded object?
2004 if (write_ordered && is_degraded_or_backfilling_object(head)) {
2005 if (can_backoff && g_conf->osd_backoff_on_degraded) {
2006 add_backoff(session, head, head);
2007 } else {
2008 wait_for_degraded_object(head, op);
2009 }
2010 return;
2011 }
2012
2013 if (write_ordered &&
2014 scrubber.write_blocked_by_scrub(head)) {
2015 dout(20) << __func__ << ": waiting for scrub" << dendl;
2016 waiting_for_scrub.push_back(op);
2017 op->mark_delayed("waiting for scrub");
2018 return;
2019 }
2020
2021 // blocked on snap?
2022 map<hobject_t, snapid_t>::iterator blocked_iter =
2023 objects_blocked_on_degraded_snap.find(head);
2024 if (write_ordered && blocked_iter != objects_blocked_on_degraded_snap.end()) {
2025 hobject_t to_wait_on(head);
2026 to_wait_on.snap = blocked_iter->second;
2027 wait_for_degraded_object(to_wait_on, op);
2028 return;
2029 }
2030 map<hobject_t, ObjectContextRef>::iterator blocked_snap_promote_iter =
2031 objects_blocked_on_snap_promotion.find(head);
2032 if (write_ordered &&
2033 blocked_snap_promote_iter != objects_blocked_on_snap_promotion.end()) {
2034 wait_for_blocked_object(
2035 blocked_snap_promote_iter->second->obs.oi.soid,
2036 op);
2037 return;
2038 }
2039 if (write_ordered && objects_blocked_on_cache_full.count(head)) {
2040 block_write_on_full_cache(head, op);
2041 return;
2042 }
2043
2044 // missing snapdir?
2045 hobject_t snapdir = head.get_snapdir();
2046
2047 if (is_unreadable_object(snapdir)) {
2048 wait_for_unreadable_object(snapdir, op);
2049 return;
2050 }
2051
2052 // degraded object?
2053 if (write_ordered && is_degraded_or_backfilling_object(snapdir)) {
2054 wait_for_degraded_object(snapdir, op);
2055 return;
2056 }
2057
2058 // dup/resent?
2059 if (op->may_write() || op->may_cache()) {
2060 // warning: we will get back *a* request for this reqid, but not
2061 // necessarily the most recent. this happens with flush and
2062 // promote ops, but we can't possibly have both in our log while
2063 // the original request is still not stable on disk, so for our
2064 // purposes here it doesn't matter which one we get.
2065 eversion_t version;
2066 version_t user_version;
2067 int return_code = 0;
2068 bool got = check_in_progress_op(
2069 m->get_reqid(), &version, &user_version, &return_code);
2070 if (got) {
2071 dout(3) << __func__ << " dup " << m->get_reqid()
2072 << " version " << version << dendl;
2073 if (already_complete(version)) {
2074 osd->reply_op_error(op, return_code, version, user_version);
2075 } else {
2076 dout(10) << " waiting for " << version << " to commit" << dendl;
2077 // always queue ondisk waiters, so that we can requeue if needed
2078 waiting_for_ondisk[version].push_back(make_pair(op, user_version));
2079 op->mark_delayed("waiting for ondisk");
2080 }
2081 return;
2082 }
2083 }
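// Editorial note: only mutating ops are checked for dups because a
// resent read is harmless to re-execute. A client resend carries the
// same osd_reqid_t -- entity name, incarnation, and tid -- which is what
// check_in_progress_op() keys on; e.g. a hypothetical
//
//   osd_reqid_t(entity_name_t::CLIENT(4123), 0, 57)
//
// stays identical across every retransmission of that client's tid 57.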
2084
2085 ObjectContextRef obc;
2086 bool can_create = op->may_write() || op->may_cache();
2087 hobject_t missing_oid;
2088 const hobject_t& oid = m->get_hobj();
2089
2090 // io blocked on obc?
2091 if (!m->has_flag(CEPH_OSD_FLAG_FLUSH) &&
2092 maybe_await_blocked_snapset(oid, op)) {
2093 return;
2094 }
2095
2096 int r = find_object_context(
2097 oid, &obc, can_create,
2098 m->has_flag(CEPH_OSD_FLAG_MAP_SNAP_CLONE),
2099 &missing_oid);
2100
2101 if (r == -EAGAIN) {
2102 // If we're not the primary for this PG, we just return -EAGAIN; otherwise,
2103 // we have to wait for the object.
2104 if (is_primary()) {
2105 // missing the specific snap we need; requeue and wait.
2106 assert(!op->may_write()); // only happens on a read/cache
2107 wait_for_unreadable_object(missing_oid, op);
2108 return;
2109 }
2110 } else if (r == 0) {
2111 if (is_unreadable_object(obc->obs.oi.soid)) {
2112 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2113 << " is unreadable, waiting" << dendl;
2114 wait_for_unreadable_object(obc->obs.oi.soid, op);
2115 return;
2116 }
2117
2118 // degraded object? (the check above was for head; this could be a clone)
2119 if (write_ordered &&
2120 obc->obs.oi.soid.snap != CEPH_NOSNAP &&
2121 is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
2122 dout(10) << __func__ << ": clone " << obc->obs.oi.soid
2123 << " is degraded, waiting" << dendl;
2124 wait_for_degraded_object(obc->obs.oi.soid, op);
2125 return;
2126 }
2127 }
2128
2129 bool in_hit_set = false;
2130 if (hit_set) {
2131 if (obc.get()) {
2132 if (obc->obs.oi.soid != hobject_t() && hit_set->contains(obc->obs.oi.soid))
2133 in_hit_set = true;
2134 } else {
2135 if (missing_oid != hobject_t() && hit_set->contains(missing_oid))
2136 in_hit_set = true;
2137 }
2138 if (!op->hitset_inserted) {
2139 hit_set->insert(oid);
2140 op->hitset_inserted = true;
2141 if (hit_set->is_full() ||
2142 hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()) {
2143 hit_set_persist();
2144 }
2145 }
2146 }
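// Editorial restatement of the persist trigger above: a hit set is
// flushed to an archived object either when it fills up or when its
// configured period expires, i.e. (same names as the code)
//
//   hit_set->is_full() ||
//   hit_set_start_stamp + pool.info.hit_set_period <= m->get_recv_stamp()
//
// so a lightly used pool still rolls its hit sets over on schedule.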
2147
2148 if (agent_state) {
2149 if (agent_choose_mode(false, op))
2150 return;
2151 }
2152
2153 if (obc.get() && obc->obs.exists && obc->obs.oi.has_manifest()) {
2154 if (maybe_handle_manifest(op,
2155 write_ordered,
2156 obc))
2157 return;
2158 }
2159
2160 if (maybe_handle_cache(op,
2161 write_ordered,
2162 obc,
2163 r,
2164 missing_oid,
2165 false,
2166 in_hit_set))
2167 return;
2168
2169 if (r && (r != -ENOENT || !obc)) {
2170 // copy the reqids for copy get on ENOENT
2171 if (r == -ENOENT &&
2172 (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET)) {
2173 fill_in_copy_get_noent(op, oid, m->ops[0]);
2174 return;
2175 }
2176 dout(20) << __func__ << ": find_object_context got error " << r << dendl;
2177 if (op->may_write() &&
2178 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2179 record_write_error(op, oid, nullptr, r);
2180 } else {
2181 osd->reply_op_error(op, r);
2182 }
2183 return;
2184 }
2185
2186 // make sure locator is consistent
2187 object_locator_t oloc(obc->obs.oi.soid);
2188 if (m->get_object_locator() != oloc) {
2189 dout(10) << " provided locator " << m->get_object_locator()
2190 << " != object's " << obc->obs.oi.soid << dendl;
2191 osd->clog->warn() << "bad locator " << m->get_object_locator()
2192 << " on object " << oloc
2193 << " op " << *m;
2194 }
2195
2196 // io blocked on obc?
2197 if (obc->is_blocked() &&
2198 !m->has_flag(CEPH_OSD_FLAG_FLUSH)) {
2199 wait_for_blocked_object(obc->obs.oi.soid, op);
2200 return;
2201 }
2202
2203 dout(25) << __func__ << " oi " << obc->obs.oi << dendl;
2204
2205 for (vector<OSDOp>::iterator p = m->ops.begin(); p != m->ops.end(); ++p) {
2206 OSDOp& osd_op = *p;
2207
2208 // make sure LIST_SNAPS is on CEPH_SNAPDIR and nothing else
2209 if (osd_op.op.op == CEPH_OSD_OP_LIST_SNAPS &&
2210 m->get_snapid() != CEPH_SNAPDIR) {
2211 dout(10) << "LIST_SNAPS with incorrect context" << dendl;
2212 osd->reply_op_error(op, -EINVAL);
2213 return;
2214 }
2215 }
2216
2217 OpContext *ctx = new OpContext(op, m->get_reqid(), m->ops, obc, this);
2218
2219 if (!obc->obs.exists)
2220 ctx->snapset_obc = get_object_context(obc->obs.oi.soid.get_snapdir(), false);
2221
2222 /* Due to obc caching, we might have a cached non-existent snapset_obc
2223 * for the snapdir. If so, we can ignore it. Subsequent parts of the
2224 * do_op pipeline make decisions based on whether snapset_obc is
2225 * populated.
2226 */
2227 if (ctx->snapset_obc && !ctx->snapset_obc->obs.exists)
2228 ctx->snapset_obc = ObjectContextRef();
2229
2230 if (m->has_flag(CEPH_OSD_FLAG_SKIPRWLOCKS)) {
2231 dout(20) << __func__ << ": skipping rw locks" << dendl;
2232 } else if (m->get_flags() & CEPH_OSD_FLAG_FLUSH) {
2233 dout(20) << __func__ << ": part of flush, will ignore write lock" << dendl;
2234
2235 // verify there is in fact a flush in progress
2236 // FIXME: we could make this a stronger test.
2237 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(obc->obs.oi.soid);
2238 if (p == flush_ops.end()) {
2239 dout(10) << __func__ << " no flush in progress, aborting" << dendl;
2240 reply_ctx(ctx, -EINVAL);
2241 return;
2242 }
2243 } else if (!get_rw_locks(write_ordered, ctx)) {
2244 dout(20) << __func__ << " waiting for rw locks " << dendl;
2245 op->mark_delayed("waiting for rw locks");
2246 close_op_ctx(ctx);
2247 return;
2248 }
2249 dout(20) << __func__ << " obc " << *obc << dendl;
2250
2251 if (r) {
2252 dout(20) << __func__ << " returned an error: " << r << dendl;
2253 close_op_ctx(ctx);
2254 if (op->may_write() &&
2255 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
2256 record_write_error(op, oid, nullptr, r);
2257 } else {
2258 osd->reply_op_error(op, r);
2259 }
2260 return;
2261 }
2262
2263 if (m->has_flag(CEPH_OSD_FLAG_IGNORE_CACHE)) {
2264 ctx->ignore_cache = true;
2265 }
2266
2267 if ((op->may_read()) && (obc->obs.oi.is_lost())) {
2268 // This object is lost. Reading from it returns an error.
2269 dout(20) << __func__ << ": object " << obc->obs.oi.soid
2270 << " is lost" << dendl;
2271 reply_ctx(ctx, -ENFILE);
2272 return;
2273 }
2274 if (!op->may_write() &&
2275 !op->may_cache() &&
2276 (!obc->obs.exists ||
2277 ((m->get_snapid() != CEPH_SNAPDIR) &&
2278 obc->obs.oi.is_whiteout()))) {
2279 // copy the reqids for copy get on ENOENT
2280 if (m->ops[0].op.op == CEPH_OSD_OP_COPY_GET) {
2281 fill_in_copy_get_noent(op, oid, m->ops[0]);
2282 close_op_ctx(ctx);
2283 return;
2284 }
2285 reply_ctx(ctx, -ENOENT);
2286 return;
2287 }
2288
2289 op->mark_started();
2290
2291 execute_ctx(ctx);
2292 utime_t prepare_latency = ceph_clock_now();
2293 prepare_latency -= op->get_dequeued_time();
2294 osd->logger->tinc(l_osd_op_prepare_lat, prepare_latency);
2295 if (op->may_read() && op->may_write()) {
2296 osd->logger->tinc(l_osd_op_rw_prepare_lat, prepare_latency);
2297 } else if (op->may_read()) {
2298 osd->logger->tinc(l_osd_op_r_prepare_lat, prepare_latency);
2299 } else if (op->may_write() || op->may_cache()) {
2300 osd->logger->tinc(l_osd_op_w_prepare_lat, prepare_latency);
2301 }
2302
2303 // force recovery of the oldest missing object if the log has grown too long
2304 maybe_force_recovery();
2305 }
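// Editorial summary (not part of the original file): do_op() above is a
// gauntlet -- PG membership, backoff, caps, name limits, blacklist,
// full/failsafe checks, unreadable/degraded waits, scrub and snap blocks,
// dup detection, obc lookup, manifest/cache handling, and rw locks --
// and only an op that clears every gate reaches execute_ctx().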
2306 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_manifest_detail(
2307 OpRequestRef op,
2308 bool write_ordered,
2309 ObjectContextRef obc)
2310 {
2311 if (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2312 CEPH_OSD_FLAG_IGNORE_REDIRECT) {
2313 dout(20) << __func__ << ": ignoring redirect due to flag" << dendl;
2314 return cache_result_t::NOOP;
2315 }
2316
2317 if (obc)
2318 dout(10) << __func__ << " " << obc->obs.oi << " "
2319 << (obc->obs.exists ? "exists" : "DNE")
2320 << dendl;
2321
2322 // if it is write-ordered and blocked, stop now
2323 if (obc.get() && obc->is_blocked() && write_ordered) {
2324 // we're already doing something with this object
2325 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2326 return cache_result_t::NOOP;
2327 }
2328
2329 vector<OSDOp> ops = static_cast<const MOSDOp*>(op->get_req())->ops;
2330 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p) {
2331 OSDOp& osd_op = *p;
2332 ceph_osd_op& op = osd_op.op;
2333 if (op.op == CEPH_OSD_OP_SET_REDIRECT) {
2334 return cache_result_t::NOOP;
2335 }
2336 }
2337
2338 switch (obc->obs.oi.manifest.type) {
2339 case object_manifest_t::TYPE_REDIRECT:
2340 if (op->may_write() || write_ordered) {
2341 do_proxy_write(op, obc->obs.oi.soid, obc);
2342 } else {
2343 do_proxy_read(op, obc);
2344 }
2345 return cache_result_t::HANDLED_PROXY;
2346 case object_manifest_t::TYPE_CHUNKED:
2347 default:
2348 assert(0 == "unrecognized manifest type");
2349 }
2350
2351 return cache_result_t::NOOP;
2352 }
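// Editorial note (hypothetical client-side usage, not part of this
// file): a TYPE_REDIRECT manifest like the one handled above is normally
// installed with CEPH_OSD_OP_SET_REDIRECT, e.g. via librados (signature
// assumed):
//
//   librados::ObjectWriteOperation wop;
//   wop.set_redirect("target_oid", target_ioctx, 0 /* tgt_version */);
//   ioctx.operate("source_oid", &wop);
//
// after which reads and writes on source_oid are proxied to the target
// as in the switch above.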
2353
2354 void PrimaryLogPG::record_write_error(OpRequestRef op, const hobject_t &soid,
2355 MOSDOpReply *orig_reply, int r)
2356 {
2357 dout(20) << __func__ << " r=" << r << dendl;
2358 assert(op->may_write());
2359 const osd_reqid_t &reqid = static_cast<const MOSDOp*>(op->get_req())->get_reqid();
2360 ObjectContextRef obc;
2361 mempool::osd_pglog::list<pg_log_entry_t> entries;
2362 entries.push_back(pg_log_entry_t(pg_log_entry_t::ERROR, soid,
2363 get_next_version(), eversion_t(), 0,
2364 reqid, utime_t(), r));
2365
2366 struct OnComplete {
2367 PrimaryLogPG *pg;
2368 OpRequestRef op;
2369 boost::intrusive_ptr<MOSDOpReply> orig_reply;
2370 int r;
2371 OnComplete(
2372 PrimaryLogPG *pg,
2373 OpRequestRef op,
2374 MOSDOpReply *orig_reply,
2375 int r)
2376 : pg(pg), op(op),
2377 orig_reply(orig_reply, false /* take over ref */), r(r)
2378 {}
2379 void operator()() {
2380 ldpp_dout(pg, 20) << "finished " << __func__ << " r=" << r << dendl;
2381 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2382 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
2383 MOSDOpReply *reply = orig_reply.detach();
2384 if (reply == nullptr) {
2385 reply = new MOSDOpReply(m, r, pg->get_osdmap()->get_epoch(),
2386 flags, true);
2387 }
2388 ldpp_dout(pg, 10) << " sending commit on " << *m << " " << reply << dendl;
2389 pg->osd->send_message_osd_client(reply, m->get_connection());
2390 }
2391 };
2392
2393 ObcLockManager lock_manager;
2394 submit_log_entries(
2395 entries,
2396 std::move(lock_manager),
2397 boost::optional<std::function<void(void)> >(
2398 OnComplete(this, op, orig_reply, r)),
2399 op,
2400 r);
2401 }
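// Editorial note: the pg_log_entry_t::ERROR entry appended above is what
// makes a failed write's error code replayable -- a client resend with
// the same reqid hits the dup check in do_op() and gets the recorded
// return code back instead of re-executing, and the saved reply (if any)
// is sent once the log entry commits.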
2402
2403 PrimaryLogPG::cache_result_t PrimaryLogPG::maybe_handle_cache_detail(
2404 OpRequestRef op,
2405 bool write_ordered,
2406 ObjectContextRef obc,
2407 int r, hobject_t missing_oid,
2408 bool must_promote,
2409 bool in_hit_set,
2410 ObjectContextRef *promote_obc)
2411 {
2412 if (op &&
2413 op->get_req() &&
2414 op->get_req()->get_type() == CEPH_MSG_OSD_OP &&
2415 (static_cast<const MOSDOp *>(op->get_req())->get_flags() &
2416 CEPH_OSD_FLAG_IGNORE_CACHE)) {
2417 dout(20) << __func__ << ": ignoring cache due to flag" << dendl;
2418 return cache_result_t::NOOP;
2419 }
2420 // return quickly if caching is not enabled
2421 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)
2422 return cache_result_t::NOOP;
2423
2424 must_promote = must_promote || op->need_promote();
2425
2426 if (obc)
2427 dout(25) << __func__ << " " << obc->obs.oi << " "
2428 << (obc->obs.exists ? "exists" : "DNE")
2429 << " missing_oid " << missing_oid
2430 << " must_promote " << (int)must_promote
2431 << " in_hit_set " << (int)in_hit_set
2432 << dendl;
2433 else
2434 dout(25) << __func__ << " (no obc)"
2435 << " missing_oid " << missing_oid
2436 << " must_promote " << (int)must_promote
2437 << " in_hit_set " << (int)in_hit_set
2438 << dendl;
2439
2440 // if it is write-ordered and blocked, stop now
2441 if (obc.get() && obc->is_blocked() && write_ordered) {
2442 // we're already doing something with this object
2443 dout(20) << __func__ << " blocked on " << obc->obs.oi.soid << dendl;
2444 return cache_result_t::NOOP;
2445 }
2446
2447 if (r == -ENOENT && missing_oid == hobject_t()) {
2448 // we know this object is logically absent (e.g., an undefined clone)
2449 return cache_result_t::NOOP;
2450 }
2451
2452 if (obc.get() && obc->obs.exists) {
2453 osd->logger->inc(l_osd_op_cache_hit);
2454 return cache_result_t::NOOP;
2455 }
2456
2457 if (missing_oid == hobject_t() && obc.get()) {
2458 missing_oid = obc->obs.oi.soid;
2459 }
2460
2461 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2462 const object_locator_t oloc = m->get_object_locator();
2463
2464 if (op->need_skip_handle_cache()) {
2465 return cache_result_t::NOOP;
2466 }
2467
2468 // older versions do not proxy the feature bits.
2469 bool can_proxy_write = get_osdmap()->get_up_osd_features() &
2470 CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES;
2471 OpRequestRef promote_op;
2472
2473 switch (pool.info.cache_mode) {
2474 case pg_pool_t::CACHEMODE_WRITEBACK:
2475 if (agent_state &&
2476 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2477 if (!op->may_write() && !op->may_cache() &&
2478 !write_ordered && !must_promote) {
2479 dout(20) << __func__ << " cache pool full, proxying read" << dendl;
2480 do_proxy_read(op);
2481 return cache_result_t::HANDLED_PROXY;
2482 }
2483 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2484 block_write_on_full_cache(missing_oid, op);
2485 return cache_result_t::BLOCKED_FULL;
2486 }
2487
2488 if (must_promote || (!hit_set && !op->need_skip_promote())) {
2489 promote_object(obc, missing_oid, oloc, op, promote_obc);
2490 return cache_result_t::BLOCKED_PROMOTE;
2491 }
2492
2493 if (op->may_write() || op->may_cache()) {
2494 if (can_proxy_write) {
2495 do_proxy_write(op, missing_oid);
2496 } else {
2497 // promote if we can't proxy the write
2498 promote_object(obc, missing_oid, oloc, op, promote_obc);
2499 return cache_result_t::BLOCKED_PROMOTE;
2500 }
2501
2502 // Promote too?
2503 if (!op->need_skip_promote() &&
2504 maybe_promote(obc, missing_oid, oloc, in_hit_set,
2505 pool.info.min_write_recency_for_promote,
2506 OpRequestRef(),
2507 promote_obc)) {
2508 return cache_result_t::BLOCKED_PROMOTE;
2509 }
2510 return cache_result_t::HANDLED_PROXY;
2511 } else {
2512 do_proxy_read(op);
2513
2514 // Avoid duplicate promotion
2515 if (obc.get() && obc->is_blocked()) {
2516 if (promote_obc)
2517 *promote_obc = obc;
2518 return cache_result_t::BLOCKED_PROMOTE;
2519 }
2520
2521 // Promote too?
2522 if (!op->need_skip_promote()) {
2523 (void)maybe_promote(obc, missing_oid, oloc, in_hit_set,
2524 pool.info.min_read_recency_for_promote,
2525 promote_op, promote_obc);
2526 }
2527
2528 return cache_result_t::HANDLED_PROXY;
2529 }
2530 assert(0 == "unreachable");
2531 return cache_result_t::NOOP;
2532
2533 case pg_pool_t::CACHEMODE_FORWARD:
2534 // FIXME: this mode allows requests to be reordered.
2535 do_cache_redirect(op);
2536 return cache_result_t::HANDLED_REDIRECT;
2537
2538 case pg_pool_t::CACHEMODE_READONLY:
2539 // TODO: clean this case up
2540 if (!obc.get() && r == -ENOENT) {
2541 // we don't have the object and op's a read
2542 promote_object(obc, missing_oid, oloc, op, promote_obc);
2543 return cache_result_t::BLOCKED_PROMOTE;
2544 }
2545 if (!r) { // it must be a write
2546 do_cache_redirect(op);
2547 return cache_result_t::HANDLED_REDIRECT;
2548 }
2549 // crap, there was a failure of some kind
2550 return cache_result_t::NOOP;
2551
2552 case pg_pool_t::CACHEMODE_READFORWARD:
2553 // Do writeback to the cache tier for writes
2554 if (op->may_write() || write_ordered || must_promote) {
2555 if (agent_state &&
2556 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2557 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2558 block_write_on_full_cache(missing_oid, op);
2559 return cache_result_t::BLOCKED_FULL;
2560 }
2561 promote_object(obc, missing_oid, oloc, op, promote_obc);
2562 return cache_result_t::BLOCKED_PROMOTE;
2563 }
2564
2565 // This is a read; we do not promote on reads here, so forward it to the base tier
2566 do_cache_redirect(op);
2567 return cache_result_t::HANDLED_REDIRECT;
2568
2569 case pg_pool_t::CACHEMODE_PROXY:
2570 if (!must_promote) {
2571 if (op->may_write() || op->may_cache() || write_ordered) {
2572 if (can_proxy_write) {
2573 do_proxy_write(op, missing_oid);
2574 return cache_result_t::HANDLED_PROXY;
2575 }
2576 } else {
2577 do_proxy_read(op);
2578 return cache_result_t::HANDLED_PROXY;
2579 }
2580 }
2581 // ugh, we're forced to promote.
2582 if (agent_state &&
2583 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2584 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2585 block_write_on_full_cache(missing_oid, op);
2586 return cache_result_t::BLOCKED_FULL;
2587 }
2588 promote_object(obc, missing_oid, oloc, op, promote_obc);
2589 return cache_result_t::BLOCKED_PROMOTE;
2590
2591 case pg_pool_t::CACHEMODE_READPROXY:
2592 // Do writeback to the cache tier for writes
2593 if (op->may_write() || write_ordered || must_promote) {
2594 if (agent_state &&
2595 agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
2596 dout(20) << __func__ << " cache pool full, waiting" << dendl;
2597 block_write_on_full_cache(missing_oid, op);
2598 return cache_result_t::BLOCKED_FULL;
2599 }
2600 promote_object(obc, missing_oid, oloc, op, promote_obc);
2601 return cache_result_t::BLOCKED_PROMOTE;
2602 }
2603
2604 // This is a read; we do not promote on reads here, so proxy it to the base tier
2605 do_proxy_read(op);
2606 return cache_result_t::HANDLED_PROXY;
2607
2608 default:
2609 assert(0 == "unrecognized cache_mode");
2610 }
2611 return cache_result_t::NOOP;
2612 }
2613
2614 bool PrimaryLogPG::maybe_promote(ObjectContextRef obc,
2615 const hobject_t& missing_oid,
2616 const object_locator_t& oloc,
2617 bool in_hit_set,
2618 uint32_t recency,
2619 OpRequestRef promote_op,
2620 ObjectContextRef *promote_obc)
2621 {
2622 dout(20) << __func__ << " missing_oid " << missing_oid
2623 << " in_hit_set " << in_hit_set << dendl;
2624
2625 switch (recency) {
2626 case 0:
2627 break;
2628 case 1:
2629 // Check if in the current hit set
2630 if (in_hit_set) {
2631 break;
2632 } else {
2633 // not promoting
2634 return false;
2635 }
2636 break;
2637 default:
2638 {
2639 unsigned count = (int)in_hit_set;
2640 if (count) {
2641 // Check if in other hit sets
2642 const hobject_t& oid = obc.get() ? obc->obs.oi.soid : missing_oid;
2643 for (map<time_t,HitSetRef>::reverse_iterator itor =
2644 agent_state->hit_set_map.rbegin();
2645 itor != agent_state->hit_set_map.rend();
2646 ++itor) {
2647 if (!itor->second->contains(oid)) {
2648 break;
2649 }
2650 ++count;
2651 if (count >= recency) {
2652 break;
2653 }
2654 }
2655 }
2656 if (count >= recency) {
2657 break;
2658 }
2659 return false; // not promoting
2660 }
2661 break;
2662 }
2663
2664 if (osd->promote_throttle()) {
2665 dout(10) << __func__ << " promote throttled" << dendl;
2666 return false;
2667 }
2668 promote_object(obc, missing_oid, oloc, promote_op, promote_obc);
2669 return true;
2670 }
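// Editorial worked example of the recency gate above: with recency == 3,
// a read promotes only if the object is in the current in-memory hit set
// (count = 1) and in the two most recent archived hit sets consecutively
// (count reaches 3); the first archived set that misses breaks the
// streak, the loop exits, and the op stays proxied.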
2671
2672 void PrimaryLogPG::do_cache_redirect(OpRequestRef op)
2673 {
2674 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2675 int flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
2676 MOSDOpReply *reply = new MOSDOpReply(m, -ENOENT,
2677 get_osdmap()->get_epoch(), flags, false);
2678 request_redirect_t redir(m->get_object_locator(), pool.info.tier_of);
2679 reply->set_redirect(redir);
2680 dout(10) << "sending redirect to pool " << pool.info.tier_of << " for op "
2681 << op << dendl;
2682 m->get_connection()->send_message(reply);
2683 return;
2684 }
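// Editorial note: the redirect reply above carries -ENOENT plus a
// request_redirect_t naming the base pool (pool.info.tier_of); a
// redirect-aware Objecter re-targets and resends the op itself, so the
// application never observes the ENOENT.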
2685
2686 struct C_ProxyRead : public Context {
2687 PrimaryLogPGRef pg;
2688 hobject_t oid;
2689 epoch_t last_peering_reset;
2690 ceph_tid_t tid;
2691 PrimaryLogPG::ProxyReadOpRef prdop;
2692 utime_t start;
2693 C_ProxyRead(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2694 const PrimaryLogPG::ProxyReadOpRef& prd)
2695 : pg(p), oid(o), last_peering_reset(lpr),
2696 tid(0), prdop(prd), start(ceph_clock_now())
2697 {}
2698 void finish(int r) override {
2699 if (prdop->canceled)
2700 return;
2701 pg->lock();
2702 if (prdop->canceled) {
2703 pg->unlock();
2704 return;
2705 }
2706 if (last_peering_reset == pg->get_last_peering_reset()) {
2707 pg->finish_proxy_read(oid, tid, r);
2708 pg->osd->logger->tinc(l_osd_tier_r_lat, ceph_clock_now() - start);
2709 }
2710 pg->unlock();
2711 }
2712 };
2713
2714 void PrimaryLogPG::do_proxy_read(OpRequestRef op, ObjectContextRef obc)
2715 {
2716 // NOTE: non-const here because the ProxyReadOp needs mutable refs to
2717 // stash the result in the request's OSDOp vector
2718 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2719 object_locator_t oloc;
2720 hobject_t soid;
2721 /* extensible tier */
2722 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2723 switch (obc->obs.oi.manifest.type) {
2724 case object_manifest_t::TYPE_REDIRECT:
2725 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2726 soid = obc->obs.oi.manifest.redirect_target;
2727 break;
2728 case object_manifest_t::TYPE_CHUNKED:
2729 default:
2730 assert(0 == "unrecognized manifest type");
2731 }
2732 } else {
2733 /* proxy */
2734 soid = m->get_hobj();
2735 oloc = object_locator_t(m->get_object_locator());
2736 oloc.pool = pool.info.tier_of;
2737 }
2738 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2739
2740 // pass through some original flags that make sense.
2741 // - leave out redirection and balancing flags since we are
2742 // already proxying through the primary
2743 // - leave off read/write/exec flags that are derived from the op
2744 flags |= m->get_flags() & (CEPH_OSD_FLAG_RWORDERED |
2745 CEPH_OSD_FLAG_ORDERSNAP |
2746 CEPH_OSD_FLAG_ENFORCE_SNAPC |
2747 CEPH_OSD_FLAG_MAP_SNAP_CLONE);
2748
2749 dout(10) << __func__ << " Start proxy read for " << *m << dendl;
2750
2751 ProxyReadOpRef prdop(std::make_shared<ProxyReadOp>(op, soid, m->ops));
2752
2753 ObjectOperation obj_op;
2754 obj_op.dup(prdop->ops);
2755
2756 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_WRITEBACK &&
2757 (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)) {
2758 for (unsigned i = 0; i < obj_op.ops.size(); i++) {
2759 ceph_osd_op op = obj_op.ops[i].op;
2760 switch (op.op) {
2761 case CEPH_OSD_OP_READ:
2762 case CEPH_OSD_OP_SYNC_READ:
2763 case CEPH_OSD_OP_SPARSE_READ:
2764 case CEPH_OSD_OP_CHECKSUM:
2765 op.flags = (op.flags | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) &
2766 ~(CEPH_OSD_OP_FLAG_FADVISE_DONTNEED | CEPH_OSD_OP_FLAG_FADVISE_NOCACHE);
2767 }
2768 }
2769 }
2770
2771 C_ProxyRead *fin = new C_ProxyRead(this, soid, get_last_peering_reset(),
2772 prdop);
2773 ceph_tid_t tid = osd->objecter->read(
2774 soid.oid, oloc, obj_op,
2775 m->get_snapid(), NULL,
2776 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2777 &prdop->user_version,
2778 &prdop->data_offset,
2779 m->get_features());
2780 fin->tid = tid;
2781 prdop->objecter_tid = tid;
2782 proxyread_ops[tid] = prdop;
2783 in_progress_proxy_ops[soid].push_back(op);
2784 }
2785
2786 void PrimaryLogPG::finish_proxy_read(hobject_t oid, ceph_tid_t tid, int r)
2787 {
2788 dout(10) << __func__ << " " << oid << " tid " << tid
2789 << " " << cpp_strerror(r) << dendl;
2790
2791 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.find(tid);
2792 if (p == proxyread_ops.end()) {
2793 dout(10) << __func__ << " no proxyread_op found" << dendl;
2794 return;
2795 }
2796 ProxyReadOpRef prdop = p->second;
2797 if (tid != prdop->objecter_tid) {
2798 dout(10) << __func__ << " tid " << tid << " != prdop " << prdop
2799 << " tid " << prdop->objecter_tid << dendl;
2800 return;
2801 }
2802 if (oid != prdop->soid) {
2803 dout(10) << __func__ << " oid " << oid << " != prdop " << prdop
2804 << " soid " << prdop->soid << dendl;
2805 return;
2806 }
2807 proxyread_ops.erase(tid);
2808
2809 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(oid);
2810 if (q == in_progress_proxy_ops.end()) {
2811 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2812 return;
2813 }
2814 assert(q->second.size());
2815 list<OpRequestRef>::iterator it = std::find(q->second.begin(),
2816 q->second.end(),
2817 prdop->op);
2818 assert(it != q->second.end());
2819 OpRequestRef op = *it;
2820 q->second.erase(it);
2821 if (q->second.size() == 0) {
2822 in_progress_proxy_ops.erase(oid);
2823 }
2824
2825 osd->logger->inc(l_osd_tier_proxy_read);
2826
2827 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
2828 OpContext *ctx = new OpContext(op, m->get_reqid(), prdop->ops, this);
2829 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
2830 ctx->user_at_version = prdop->user_version;
2831 ctx->data_off = prdop->data_offset;
2832 ctx->ignore_log_op_stats = true;
2833 complete_read_ctx(r, ctx);
2834 }
2835
2836 void PrimaryLogPG::kick_proxy_ops_blocked(hobject_t& soid)
2837 {
2838 map<hobject_t, list<OpRequestRef>>::iterator p = in_progress_proxy_ops.find(soid);
2839 if (p == in_progress_proxy_ops.end())
2840 return;
2841
2842 list<OpRequestRef>& ls = p->second;
2843 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
2844 requeue_ops(ls);
2845 in_progress_proxy_ops.erase(p);
2846 }
2847
2848 void PrimaryLogPG::cancel_proxy_read(ProxyReadOpRef prdop)
2849 {
2850 dout(10) << __func__ << " " << prdop->soid << dendl;
2851 prdop->canceled = true;
2852
2853 // cancel objecter op, if we can
2854 if (prdop->objecter_tid) {
2855 osd->objecter->op_cancel(prdop->objecter_tid, -ECANCELED);
2856 for (uint32_t i = 0; i < prdop->ops.size(); i++) {
2857 prdop->ops[i].outdata.clear();
2858 }
2859 proxyread_ops.erase(prdop->objecter_tid);
2860 prdop->objecter_tid = 0;
2861 }
2862 }
2863
2864 void PrimaryLogPG::cancel_proxy_ops(bool requeue)
2865 {
2866 dout(10) << __func__ << dendl;
2867
2868 // cancel proxy reads
2869 map<ceph_tid_t, ProxyReadOpRef>::iterator p = proxyread_ops.begin();
2870 while (p != proxyread_ops.end()) {
2871 cancel_proxy_read((p++)->second);
2872 }
2873
2874 // cancel proxy writes
2875 map<ceph_tid_t, ProxyWriteOpRef>::iterator q = proxywrite_ops.begin();
2876 while (q != proxywrite_ops.end()) {
2877 cancel_proxy_write((q++)->second);
2878 }
2879
2880 if (requeue) {
2881 map<hobject_t, list<OpRequestRef>>::iterator p =
2882 in_progress_proxy_ops.begin();
2883 while (p != in_progress_proxy_ops.end()) {
2884 list<OpRequestRef>& ls = p->second;
2885 dout(10) << __func__ << " " << p->first << " requeuing " << ls.size()
2886 << " requests" << dendl;
2887 requeue_ops(ls);
2888 in_progress_proxy_ops.erase(p++);
2889 }
2890 } else {
2891 in_progress_proxy_ops.clear();
2892 }
2893 }
2894
2895 struct C_ProxyWrite_Commit : public Context {
2896 PrimaryLogPGRef pg;
2897 hobject_t oid;
2898 epoch_t last_peering_reset;
2899 ceph_tid_t tid;
2900 PrimaryLogPG::ProxyWriteOpRef pwop;
2901 C_ProxyWrite_Commit(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
2902 const PrimaryLogPG::ProxyWriteOpRef& pw)
2903 : pg(p), oid(o), last_peering_reset(lpr),
2904 tid(0), pwop(pw)
2905 {}
2906 void finish(int r) override {
2907 if (pwop->canceled)
2908 return;
2909 pg->lock();
2910 if (pwop->canceled) {
2911 pg->unlock();
2912 return;
2913 }
2914 if (last_peering_reset == pg->get_last_peering_reset()) {
2915 pg->finish_proxy_write(oid, tid, r);
2916 }
2917 pg->unlock();
2918 }
2919 };
2920
2921 void PrimaryLogPG::do_proxy_write(OpRequestRef op, const hobject_t& missing_oid, ObjectContextRef obc)
2922 {
2923 // NOTE: non-const because ProxyWriteOp takes a mutable ref
2924 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
2925 object_locator_t oloc;
2926 SnapContext snapc(m->get_snap_seq(), m->get_snaps());
2927 hobject_t soid;
2928 /* extensible tier */
2929 if (obc && obc->obs.exists && obc->obs.oi.has_manifest()) {
2930 switch (obc->obs.oi.manifest.type) {
2931 case object_manifest_t::TYPE_REDIRECT:
2932 oloc = object_locator_t(obc->obs.oi.manifest.redirect_target);
2933 soid = obc->obs.oi.manifest.redirect_target;
2934 break;
2935 case object_manifest_t::TYPE_CHUNKED:
2936 default:
2937 assert(0 == "unrecognized manifest type");
2938 }
2939 } else {
2940 /* proxy */
2941 soid = m->get_hobj();
2942 oloc = object_locator_t(m->get_object_locator());
2943 oloc.pool = pool.info.tier_of;
2944 }
2945
2946 unsigned flags = CEPH_OSD_FLAG_IGNORE_CACHE | CEPH_OSD_FLAG_IGNORE_OVERLAY;
2947 if (!(op->may_write() || op->may_cache())) {
2948 flags |= CEPH_OSD_FLAG_RWORDERED;
2949 }
2950 dout(10) << __func__ << " Start proxy write for " << *m << dendl;
2951
2952 ProxyWriteOpRef pwop(std::make_shared<ProxyWriteOp>(op, soid, m->ops, m->get_reqid()));
2953 pwop->ctx = new OpContext(op, m->get_reqid(), pwop->ops, this);
2954 pwop->mtime = m->get_mtime();
2955
2956 ObjectOperation obj_op;
2957 obj_op.dup(pwop->ops);
2958
2959 C_ProxyWrite_Commit *fin = new C_ProxyWrite_Commit(
2960 this, soid, get_last_peering_reset(), pwop);
2961 ceph_tid_t tid = osd->objecter->mutate(
2962 soid.oid, oloc, obj_op, snapc,
2963 ceph::real_clock::from_ceph_timespec(pwop->mtime),
2964 flags, new C_OnFinisher(fin, &osd->objecter_finisher),
2965 &pwop->user_version, pwop->reqid);
2966 fin->tid = tid;
2967 pwop->objecter_tid = tid;
2968 proxywrite_ops[tid] = pwop;
2969 in_progress_proxy_ops[soid].push_back(op);
2970 }
2971
2972 void PrimaryLogPG::finish_proxy_write(hobject_t oid, ceph_tid_t tid, int r)
2973 {
2974 dout(10) << __func__ << " " << oid << " tid " << tid
2975 << " " << cpp_strerror(r) << dendl;
2976
2977 map<ceph_tid_t, ProxyWriteOpRef>::iterator p = proxywrite_ops.find(tid);
2978 if (p == proxywrite_ops.end()) {
2979 dout(10) << __func__ << " no proxywrite_op found" << dendl;
2980 return;
2981 }
2982 ProxyWriteOpRef pwop = p->second;
2983 assert(tid == pwop->objecter_tid);
2984 assert(oid == pwop->soid);
2985
2986 proxywrite_ops.erase(tid);
2987
2988 map<hobject_t, list<OpRequestRef> >::iterator q = in_progress_proxy_ops.find(oid);
2989 if (q == in_progress_proxy_ops.end()) {
2990 dout(10) << __func__ << " no in_progress_proxy_ops found" << dendl;
2991 delete pwop->ctx;
2992 pwop->ctx = NULL;
2993 return;
2994 }
2995 list<OpRequestRef>& in_progress_op = q->second;
2996 assert(in_progress_op.size());
2997 list<OpRequestRef>::iterator it = std::find(in_progress_op.begin(),
2998 in_progress_op.end(),
2999 pwop->op);
3000 assert(it != in_progress_op.end());
3001 in_progress_op.erase(it);
3002 if (in_progress_op.size() == 0) {
3003 in_progress_proxy_ops.erase(oid);
3004 }
3005
3006 osd->logger->inc(l_osd_tier_proxy_write);
3007
3008 const MOSDOp *m = static_cast<const MOSDOp*>(pwop->op->get_req());
3009 assert(m != NULL);
3010
3011 if (!pwop->sent_reply) {
3012 // send commit.
3013 MOSDOpReply *reply = pwop->ctx->reply;
3014 if (reply)
3015 pwop->ctx->reply = NULL;
3016 else {
3017 reply = new MOSDOpReply(m, r, get_osdmap()->get_epoch(), 0, true);
3018 reply->set_reply_versions(eversion_t(), pwop->user_version);
3019 }
3020 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3021 dout(10) << " sending commit on " << pwop << " " << reply << dendl;
3022 osd->send_message_osd_client(reply, m->get_connection());
3023 pwop->sent_reply = true;
3024 pwop->ctx->op->mark_commit_sent();
3025 }
3026
3027 delete pwop->ctx;
3028 pwop->ctx = NULL;
3029 }
3030
3031 void PrimaryLogPG::cancel_proxy_write(ProxyWriteOpRef pwop)
3032 {
3033 dout(10) << __func__ << " " << pwop->soid << dendl;
3034 pwop->canceled = true;
3035
3036 // cancel objecter op, if we can
3037 if (pwop->objecter_tid) {
3038 osd->objecter->op_cancel(pwop->objecter_tid, -ECANCELED);
3039 delete pwop->ctx;
3040 pwop->ctx = NULL;
3041 proxywrite_ops.erase(pwop->objecter_tid);
3042 pwop->objecter_tid = 0;
3043 }
3044 }
3045
3046 class PromoteCallback: public PrimaryLogPG::CopyCallback {
3047 ObjectContextRef obc;
3048 PrimaryLogPG *pg;
3049 utime_t start;
3050 public:
3051 PromoteCallback(ObjectContextRef obc_, PrimaryLogPG *pg_)
3052 : obc(obc_),
3053 pg(pg_),
3054 start(ceph_clock_now()) {}
3055
3056 void finish(PrimaryLogPG::CopyCallbackResults results) override {
3057 PrimaryLogPG::CopyResults *results_data = results.get<1>();
3058 int r = results.get<0>();
3059 pg->finish_promote(r, results_data, obc);
3060 pg->osd->logger->tinc(l_osd_tier_promote_lat, ceph_clock_now() - start);
3061 }
3062 };
3063
3064 void PrimaryLogPG::promote_object(ObjectContextRef obc,
3065 const hobject_t& missing_oid,
3066 const object_locator_t& oloc,
3067 OpRequestRef op,
3068 ObjectContextRef *promote_obc)
3069 {
3070 hobject_t hoid = obc ? obc->obs.oi.soid : missing_oid;
3071 assert(hoid != hobject_t());
3072 if (scrubber.write_blocked_by_scrub(hoid)) {
3073 dout(10) << __func__ << " " << hoid
3074 << " blocked by scrub" << dendl;
3075 if (op) {
3076 waiting_for_scrub.push_back(op);
3077 op->mark_delayed("waiting for scrub");
3078 dout(10) << __func__ << " " << hoid
3079 << " placing op in waiting_for_scrub" << dendl;
3080 } else {
3081 dout(10) << __func__ << " " << hoid
3082 << " no op, dropping on the floor" << dendl;
3083 }
3084 return;
3085 }
3086 if (!obc) { // we need to create an ObjectContext
3087 assert(missing_oid != hobject_t());
3088 obc = get_object_context(missing_oid, true);
3089 }
3090 if (promote_obc)
3091 *promote_obc = obc;
3092
3093 /*
3094 * If there are proxy reads in flight for this object while the promote
3095 * is still in progress, skip DONTNEED: the data is about to be read again.
3096 */
3097 unsigned src_fadvise_flags = LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL;
3098 map<hobject_t, list<OpRequestRef>>::iterator q = in_progress_proxy_ops.find(obc->obs.oi.soid);
3099 if (q == in_progress_proxy_ops.end()) {
3100 src_fadvise_flags |= LIBRADOS_OP_FLAG_FADVISE_DONTNEED;
3101 }
3102
3103 PromoteCallback *cb = new PromoteCallback(obc, this);
3104 object_locator_t my_oloc = oloc;
3105 my_oloc.pool = pool.info.tier_of;
3106
3107 unsigned flags = CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
3108 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
3109 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE |
3110 CEPH_OSD_COPY_FROM_FLAG_RWORDERED;
3111 start_copy(cb, obc, obc->obs.oi.soid, my_oloc, 0, flags,
3112 obc->obs.oi.soid.snap == CEPH_NOSNAP,
3113 src_fadvise_flags, 0);
3114
3115 assert(obc->is_blocked());
3116
3117 if (op)
3118 wait_for_blocked_object(obc->obs.oi.soid, op);
3119 info.stats.stats.sum.num_promote++;
3120 }
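// Editorial note: promotion is an internal copy-from of the base-tier
// object into this pool. RWORDERED keeps the copy ordered against client
// writes, MAP_SNAP_CLONE lets a snap read resolve to the right clone on
// the source, and the obc stays blocked -- with any triggering op parked
// in wait_for_blocked_object() -- until PromoteCallback::finish() runs.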
3121
3122 void PrimaryLogPG::execute_ctx(OpContext *ctx)
3123 {
3124 FUNCTRACE();
3125 dout(10) << __func__ << " " << ctx << dendl;
3126 ctx->reset_obs(ctx->obc);
3127 ctx->update_log_only = false; // reset in case finish_copyfrom() is re-running execute_ctx
3128 OpRequestRef op = ctx->op;
3129 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3130 ObjectContextRef obc = ctx->obc;
3131 const hobject_t& soid = obc->obs.oi.soid;
3132
3133 // this method must be idempotent since we may call it several times
3134 // before we finally apply the resulting transaction.
3135 ctx->op_t.reset(new PGTransaction);
3136
3137 if (op->may_write() || op->may_cache()) {
3138 // snap
3139 if (!(m->has_flag(CEPH_OSD_FLAG_ENFORCE_SNAPC)) &&
3140 pool.info.is_pool_snaps_mode()) {
3141 // use pool's snapc
3142 ctx->snapc = pool.snapc;
3143 } else {
3144 // client specified snapc
3145 ctx->snapc.seq = m->get_snap_seq();
3146 ctx->snapc.snaps = m->get_snaps();
3147 filter_snapc(ctx->snapc.snaps);
3148 }
3149 if ((m->has_flag(CEPH_OSD_FLAG_ORDERSNAP)) &&
3150 ctx->snapc.seq < obc->ssc->snapset.seq) {
3151 dout(10) << " ORDERSNAP flag set and snapc seq " << ctx->snapc.seq
3152 << " < snapset seq " << obc->ssc->snapset.seq
3153 << " on " << obc->obs.oi.soid << dendl;
3154 reply_ctx(ctx, -EOLDSNAPC);
3155 return;
3156 }
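// Editorial note: ORDERSNAP turns a stale SnapContext into a hard error
// (-EOLDSNAPC) rather than silently cloning against old snap metadata;
// the client is expected to refresh its snap context and resend.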
3157
3158 // version
3159 ctx->at_version = get_next_version();
3160 ctx->mtime = m->get_mtime();
3161
3162 dout(10) << __func__ << " " << soid << " " << ctx->ops
3163 << " ov " << obc->obs.oi.version << " av " << ctx->at_version
3164 << " snapc " << ctx->snapc
3165 << " snapset " << obc->ssc->snapset
3166 << dendl;
3167 } else {
3168 dout(10) << __func__ << " " << soid << " " << ctx->ops
3169 << " ov " << obc->obs.oi.version
3170 << dendl;
3171 }
3172
3173 if (!ctx->user_at_version)
3174 ctx->user_at_version = obc->obs.oi.user_version;
3175 dout(30) << __func__ << " user_at_version " << ctx->user_at_version << dendl;
3176
3177 if (op->may_read()) {
3178 dout(10) << " taking ondisk_read_lock" << dendl;
3179 obc->ondisk_read_lock();
3180 }
3181
3182 {
3183 #ifdef WITH_LTTNG
3184 osd_reqid_t reqid = ctx->op->get_reqid();
3185 #endif
3186 tracepoint(osd, prepare_tx_enter, reqid.name._type,
3187 reqid.name._num, reqid.tid, reqid.inc);
3188 }
3189
3190 int result = prepare_transaction(ctx);
3191
3192 {
3193 #ifdef WITH_LTTNG
3194 osd_reqid_t reqid = ctx->op->get_reqid();
3195 #endif
3196 tracepoint(osd, prepare_tx_exit, reqid.name._type,
3197 reqid.name._num, reqid.tid, reqid.inc);
3198 }
3199
3200 if (op->may_read()) {
3201 dout(10) << " dropping ondisk_read_lock" << dendl;
3202 obc->ondisk_read_unlock();
3203 }
3204
3205 if (result == -EINPROGRESS) {
3206 // come back later.
3207 return;
3208 }
3209
3210 if (result == -EAGAIN) {
3211 // clean up after the ctx
3212 close_op_ctx(ctx);
3213 return;
3214 }
3215
3216 bool successful_write = !ctx->op_t->empty() && op->may_write() && result >= 0;
3217 // prepare the reply
3218 ctx->reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0,
3219 successful_write);
3220
3221 // Write operations aren't allowed to return a data payload because
3222 // we can't do so reliably. If the client has to resend the request
3223 // and it has already been applied, we will return 0 with no
3224 // payload. Non-deterministic behavior is no good. However, it is
3225 // possible to construct an operation that does a read, does a guard
3226 // check (e.g., CMPXATTR), and then a write. Then we either succeed
3227 // with the write, or return a CMPXATTR and the read value.
3228 if (successful_write) {
3229 // write. normalize the result code.
3230 dout(20) << " zeroing write result code " << result << dendl;
3231 result = 0;
3232 }
3233 ctx->reply->set_result(result);
3234
3235 // read or error?
3236 if ((ctx->op_t->empty() || result < 0) && !ctx->update_log_only) {
3237 // finish side-effects
3238 if (result >= 0)
3239 do_osd_op_effects(ctx, m->get_connection());
3240
3241 if (ctx->pending_async_reads.empty()) {
3242 complete_read_ctx(result, ctx);
3243 } else {
3244 in_progress_async_reads.push_back(make_pair(op, ctx));
3245 ctx->start_async_reads(this);
3246 }
3247
3248 return;
3249 }
3250
3251 ctx->reply->set_reply_versions(ctx->at_version, ctx->user_at_version);
3252
3253 assert(op->may_write() || op->may_cache());
3254
3255 // trim log?
3256 calc_trim_to();
3257
3258 // verify that we are doing this in order?
3259 if (cct->_conf->osd_debug_op_order && m->get_source().is_client() &&
3260 !pool.info.is_tier() && !pool.info.has_tiers()) {
3261 map<client_t,ceph_tid_t>& cm = debug_op_order[obc->obs.oi.soid];
3262 ceph_tid_t t = m->get_tid();
3263 client_t n = m->get_source().num();
3264 map<client_t,ceph_tid_t>::iterator p = cm.find(n);
3265 if (p == cm.end()) {
3266 dout(20) << " op order client." << n << " tid " << t << " (first)" << dendl;
3267 cm[n] = t;
3268 } else {
3269 dout(20) << " op order client." << n << " tid " << t << " last was " << p->second << dendl;
3270 if (p->second > t) {
3271 derr << "bad op order, already applied " << p->second << " > this " << t << dendl;
3272 assert(0 == "out of order op");
3273 }
3274 p->second = t;
3275 }
3276 }
3277
3278 if (ctx->update_log_only) {
3279 if (result >= 0)
3280 do_osd_op_effects(ctx, m->get_connection());
3281
3282 dout(20) << __func__ << " update_log_only -- result=" << result << dendl;
3283 // save just what we need from ctx
3284 MOSDOpReply *reply = ctx->reply;
3285 ctx->reply = nullptr;
3286 reply->claim_op_out_data(ctx->ops);
3287 reply->get_header().data_off = ctx->data_off;
3288 close_op_ctx(ctx);
3289
3290 if (result == -ENOENT) {
3291 reply->set_enoent_reply_versions(info.last_update,
3292 info.last_user_version);
3293 }
3294 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3295 // append to pg log for dup detection - don't save buffers for now
3296 record_write_error(op, soid, reply, result);
3297 return;
3298 }
3299
3300 // no need to capture PG ref, repop cancel will handle that
3301 // Can capture the ctx by pointer, it's owned by the repop
3302 ctx->register_on_commit(
3303 [m, ctx, this](){
3304 if (ctx->op)
3305 log_op_stats(
3306 ctx);
3307
3308 if (m && !ctx->sent_reply) {
3309 MOSDOpReply *reply = ctx->reply;
3310 if (reply)
3311 ctx->reply = nullptr;
3312 else {
3313 reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, true);
3314 reply->set_reply_versions(ctx->at_version,
3315 ctx->user_at_version);
3316 }
3317 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
3318 dout(10) << " sending reply on " << *m << " " << reply << dendl;
3319 osd->send_message_osd_client(reply, m->get_connection());
3320 ctx->sent_reply = true;
3321 ctx->op->mark_commit_sent();
3322 }
3323 });
3324 ctx->register_on_success(
3325 [ctx, this]() {
3326 do_osd_op_effects(
3327 ctx,
3328 ctx->op ? ctx->op->get_req()->get_connection() :
3329 ConnectionRef());
3330 });
3331 ctx->register_on_finish(
3332 [ctx, this]() {
3333 delete ctx;
3334 });
3335
3336 // issue replica writes
3337 ceph_tid_t rep_tid = osd->get_tid();
3338
3339 RepGather *repop = new_repop(ctx, obc, rep_tid);
3340
3341 issue_repop(repop, ctx);
3342 eval_repop(repop);
3343 repop->put();
3344 }
3345
3346 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r)
3347 {
3348 if (ctx->op)
3349 osd->reply_op_error(ctx->op, r);
3350 close_op_ctx(ctx);
3351 }
3352
3353 void PrimaryLogPG::reply_ctx(OpContext *ctx, int r, eversion_t v, version_t uv)
3354 {
3355 if (ctx->op)
3356 osd->reply_op_error(ctx->op, r, v, uv);
3357 close_op_ctx(ctx);
3358 }
3359
3360 void PrimaryLogPG::log_op_stats(OpContext *ctx)
3361 {
3362 OpRequestRef op = ctx->op;
3363 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
3364
3365 utime_t now = ceph_clock_now();
3366 utime_t latency = now;
3367 latency -= ctx->op->get_req()->get_recv_stamp();
3368 utime_t process_latency = now;
3369 process_latency -= ctx->op->get_dequeued_time();
3370
3371 uint64_t inb = ctx->bytes_written;
3372 uint64_t outb = ctx->bytes_read;
3373
3374 osd->logger->inc(l_osd_op);
3375
3376 osd->logger->inc(l_osd_op_outb, outb);
3377 osd->logger->inc(l_osd_op_inb, inb);
3378 osd->logger->tinc(l_osd_op_lat, latency);
3379 osd->logger->tinc(l_osd_op_process_lat, process_latency);
3380
3381 if (op->may_read() && op->may_write()) {
3382 osd->logger->inc(l_osd_op_rw);
3383 osd->logger->inc(l_osd_op_rw_inb, inb);
3384 osd->logger->inc(l_osd_op_rw_outb, outb);
3385 osd->logger->tinc(l_osd_op_rw_lat, latency);
3386 osd->logger->hinc(l_osd_op_rw_lat_inb_hist, latency.to_nsec(), inb);
3387 osd->logger->hinc(l_osd_op_rw_lat_outb_hist, latency.to_nsec(), outb);
3388 osd->logger->tinc(l_osd_op_rw_process_lat, process_latency);
3389 } else if (op->may_read()) {
3390 osd->logger->inc(l_osd_op_r);
3391 osd->logger->inc(l_osd_op_r_outb, outb);
3392 osd->logger->tinc(l_osd_op_r_lat, latency);
3393 osd->logger->hinc(l_osd_op_r_lat_outb_hist, latency.to_nsec(), outb);
3394 osd->logger->tinc(l_osd_op_r_process_lat, process_latency);
3395 } else if (op->may_write() || op->may_cache()) {
3396 osd->logger->inc(l_osd_op_w);
3397 osd->logger->inc(l_osd_op_w_inb, inb);
3398 osd->logger->tinc(l_osd_op_w_lat, latency);
3399 osd->logger->hinc(l_osd_op_w_lat_inb_hist, latency.to_nsec(), inb);
3400 osd->logger->tinc(l_osd_op_w_process_lat, process_latency);
3401 } else
3402 ceph_abort();
3403
3404 dout(15) << "log_op_stats " << *m
3405 << " inb " << inb
3406 << " outb " << outb
3407 << " lat " << latency << dendl;
3408 }
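// Editorial worked example for the two latencies above: for an op
// received at t = 100.000s, dequeued at 100.002s, and completed at
// 100.010s,
//
//   latency         = 100.010 - 100.000 = 10 ms   (queue + execution)
//   process_latency = 100.010 - 100.002 =  8 ms   (execution only)
//
// so l_osd_op_lat always bounds l_osd_op_process_lat from above.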
3409
3410 void PrimaryLogPG::do_sub_op(OpRequestRef op)
3411 {
3412 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
3413 assert(have_same_or_newer_map(m->map_epoch));
3414 assert(m->get_type() == MSG_OSD_SUBOP);
3415 dout(15) << "do_sub_op " << *op->get_req() << dendl;
3416
3417 if (!is_peered()) {
3418 waiting_for_peered.push_back(op);
3419 op->mark_delayed("waiting for active");
3420 return;
3421 }
3422
3423 const OSDOp *first = NULL;
3424 if (m->ops.size() >= 1) {
3425 first = &m->ops[0];
3426 }
3427
3428 if (first) {
3429 switch (first->op.op) {
3430 case CEPH_OSD_OP_DELETE:
3431 sub_op_remove(op);
3432 return;
3433 case CEPH_OSD_OP_SCRUB_RESERVE:
3434 handle_scrub_reserve_request(op);
3435 return;
3436 case CEPH_OSD_OP_SCRUB_UNRESERVE:
3437 handle_scrub_reserve_release(op);
3438 return;
3439 case CEPH_OSD_OP_SCRUB_MAP:
3440 sub_op_scrub_map(op);
3441 return;
3442 }
3443 }
3444 }
3445
3446 void PrimaryLogPG::do_sub_op_reply(OpRequestRef op)
3447 {
3448 const MOSDSubOpReply *r = static_cast<const MOSDSubOpReply *>(op->get_req());
3449 assert(r->get_type() == MSG_OSD_SUBOPREPLY);
3450 if (r->ops.size() >= 1) {
3451 const OSDOp& first = r->ops[0];
3452 switch (first.op.op) {
3453 case CEPH_OSD_OP_SCRUB_RESERVE:
3454 {
3455 pg_shard_t from = r->from;
3456 bufferlist::iterator p = const_cast<bufferlist&>(r->get_data()).begin();
3457 bool reserved;
3458 ::decode(reserved, p);
3459 if (reserved) {
3460 handle_scrub_reserve_grant(op, from);
3461 } else {
3462 handle_scrub_reserve_reject(op, from);
3463 }
3464 }
3465 return;
3466 }
3467 }
3468 }
3469
3470 void PrimaryLogPG::do_scan(
3471 OpRequestRef op,
3472 ThreadPool::TPHandle &handle)
3473 {
3474 const MOSDPGScan *m = static_cast<const MOSDPGScan*>(op->get_req());
3475 assert(m->get_type() == MSG_OSD_PG_SCAN);
3476 dout(10) << "do_scan " << *m << dendl;
3477
3478 op->mark_started();
3479
3480 switch (m->op) {
3481 case MOSDPGScan::OP_SCAN_GET_DIGEST:
3482 {
3483 ostringstream ss;
3484 if (osd->check_backfill_full(ss)) {
3485 dout(1) << __func__ << ": Canceling backfill, " << ss.str() << dendl;
3486 queue_peering_event(
3487 CephPeeringEvtRef(
3488 std::make_shared<CephPeeringEvt>(
3489 get_osdmap()->get_epoch(),
3490 get_osdmap()->get_epoch(),
3491 BackfillTooFull())));
3492 return;
3493 }
3494
3495 BackfillInterval bi;
3496 bi.begin = m->begin;
3497 // No need to flush; there won't be any in-progress writes occurring
3498 // past m->begin
3499 scan_range(
3500 cct->_conf->osd_backfill_scan_min,
3501 cct->_conf->osd_backfill_scan_max,
3502 &bi,
3503 handle);
3504 MOSDPGScan *reply = new MOSDPGScan(
3505 MOSDPGScan::OP_SCAN_DIGEST,
3506 pg_whoami,
3507 get_osdmap()->get_epoch(), m->query_epoch,
3508 spg_t(info.pgid.pgid, get_primary().shard), bi.begin, bi.end);
3509 ::encode(bi.objects, reply->get_data());
3510 osd->send_message_osd_cluster(reply, m->get_connection());
3511 }
3512 break;
3513
3514 case MOSDPGScan::OP_SCAN_DIGEST:
3515 {
3516 pg_shard_t from = m->from;
3517
3518 // Check that from is in backfill_targets vector
3519 assert(is_backfill_targets(from));
3520
3521 BackfillInterval& bi = peer_backfill_info[from];
3522 bi.begin = m->begin;
3523 bi.end = m->end;
3524 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3525
3526 // take care to preserve ordering!
3527 bi.clear_objects();
3528 ::decode_noclear(bi.objects, p);
3529
3530 if (waiting_on_backfill.erase(from)) {
3531 if (waiting_on_backfill.empty()) {
3532 assert(peer_backfill_info.size() == backfill_targets.size());
3533 finish_recovery_op(hobject_t::get_max());
3534 }
3535 } else {
3536 // we canceled backfill for a while because a peer was too full, and
3537 // this is an extra response from a peer that was not too full
3538 }
3539 }
3540 break;
3541 }
3542 }
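// Editorial note: OP_SCAN_GET_DIGEST / OP_SCAN_DIGEST form the backfill
// scan handshake -- the primary asks a backfill target to enumerate a
// chunk of objects (osd_backfill_scan_min..max at a time) starting at
// m->begin, and the returned bi.objects map (object -> version) is the
// "digest" the primary diffs against its own interval to decide what to
// push or delete.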
3543
3544 void PrimaryLogPG::do_backfill(OpRequestRef op)
3545 {
3546 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill*>(op->get_req());
3547 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
3548 dout(10) << "do_backfill " << *m << dendl;
3549
3550 op->mark_started();
3551
3552 switch (m->op) {
3553 case MOSDPGBackfill::OP_BACKFILL_FINISH:
3554 {
3555 assert(cct->_conf->osd_kill_backfill_at != 1);
3556
3557 MOSDPGBackfill *reply = new MOSDPGBackfill(
3558 MOSDPGBackfill::OP_BACKFILL_FINISH_ACK,
3559 get_osdmap()->get_epoch(),
3560 m->query_epoch,
3561 spg_t(info.pgid.pgid, get_primary().shard));
3562 reply->set_priority(get_recovery_op_priority());
3563 osd->send_message_osd_cluster(reply, m->get_connection());
3564 queue_peering_event(
3565 CephPeeringEvtRef(
3566 std::make_shared<CephPeeringEvt>(
3567 get_osdmap()->get_epoch(),
3568 get_osdmap()->get_epoch(),
3569 RecoveryDone())));
3570 }
3571 // fall-thru
3572
3573 case MOSDPGBackfill::OP_BACKFILL_PROGRESS:
3574 {
3575 assert(cct->_conf->osd_kill_backfill_at != 2);
3576
3577 info.set_last_backfill(m->last_backfill);
3578 info.stats = m->stats;
3579
3580 ObjectStore::Transaction t;
3581 dirty_info = true;
3582 write_if_dirty(t);
3583 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3584 assert(tr == 0);
3585 }
3586 break;
3587
3588 case MOSDPGBackfill::OP_BACKFILL_FINISH_ACK:
3589 {
3590 assert(is_primary());
3591 assert(cct->_conf->osd_kill_backfill_at != 3);
3592 finish_recovery_op(hobject_t::get_max());
3593 }
3594 break;
3595 }
3596 }
3597
3598 void PrimaryLogPG::do_backfill_remove(OpRequestRef op)
3599 {
3600 const MOSDPGBackfillRemove *m = static_cast<const MOSDPGBackfillRemove*>(
3601 op->get_req());
3602 assert(m->get_type() == MSG_OSD_PG_BACKFILL_REMOVE);
3603 dout(7) << __func__ << " " << m->ls << dendl;
3604
3605 op->mark_started();
3606
3607 ObjectStore::Transaction t;
3608 for (auto& p : m->ls) {
3609 remove_snap_mapped_object(t, p.first);
3610 }
3611 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3612 assert(r == 0);
3613 }
3614
3615 PrimaryLogPG::OpContextUPtr PrimaryLogPG::trim_object(
3616 bool first, const hobject_t &coid)
3617 {
3618 // load clone info
3619 bufferlist bl;
3620 ObjectContextRef obc = get_object_context(coid, false, NULL);
3621 if (!obc) {
3622 derr << __func__ << " could not find coid " << coid << dendl;
3623 ceph_abort();
3624 }
3625 assert(obc->ssc);
3626
3627 hobject_t snapoid(
3628 coid.oid, coid.get_key(),
3629 obc->ssc->snapset.head_exists ? CEPH_NOSNAP:CEPH_SNAPDIR, coid.get_hash(),
3630 info.pgid.pool(), coid.get_namespace());
3631 ObjectContextRef snapset_obc = get_object_context(snapoid, false);
3632 assert(snapset_obc);
3633
3634 SnapSet& snapset = obc->ssc->snapset;
3635
3636 bool legacy = snapset.is_legacy() ||
3637 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
3638
3639 object_info_t &coi = obc->obs.oi;
3640 set<snapid_t> old_snaps;
3641 if (legacy) {
3642 old_snaps.insert(coi.legacy_snaps.begin(), coi.legacy_snaps.end());
3643 } else {
3644 auto p = snapset.clone_snaps.find(coid.snap);
3645 if (p == snapset.clone_snaps.end()) {
3646 osd->clog->error() << __func__ << " No clone_snaps in snapset " << snapset
3647 << " for " << coid << "\n";
3648 return NULL;
3649 }
3650 old_snaps.insert(snapset.clone_snaps[coid.snap].begin(),
3651 snapset.clone_snaps[coid.snap].end());
3652 }
3653 if (old_snaps.empty()) {
3654 osd->clog->error() << __func__ << " No object info snaps for " << coid;
3655 return NULL;
3656 }
3657
3658 dout(10) << coid << " old_snaps " << old_snaps
3659 << " old snapset " << snapset << dendl;
3660 if (snapset.seq == 0) {
3661 osd->clog->error() << __func__ << " No snapset.seq for " << coid;
3662 return NULL;
3663 }
3664
3665 set<snapid_t> new_snaps;
3666 for (set<snapid_t>::iterator i = old_snaps.begin();
3667 i != old_snaps.end();
3668 ++i) {
3669 if (!pool.info.is_removed_snap(*i))
3670 new_snaps.insert(*i);
3671 }
3672
3673 vector<snapid_t>::iterator p = snapset.clones.end();
3674
3675 if (new_snaps.empty()) {
3676 p = std::find(snapset.clones.begin(), snapset.clones.end(), coid.snap);
3677 if (p == snapset.clones.end()) {
3678 osd->clog->error() << __func__ << " Snap " << coid.snap << " not in clones";
3679 return NULL;
3680 }
3681 }
3682
3683 OpContextUPtr ctx = simple_opc_create(obc);
3684 ctx->snapset_obc = snapset_obc;
3685
3686 if (!ctx->lock_manager.get_snaptrimmer_write(
3687 coid,
3688 obc,
3689 first)) {
3690 close_op_ctx(ctx.release());
3691 dout(10) << __func__ << ": Unable to get a wlock on " << coid << dendl;
3692 return NULL;
3693 }
3694
3695 if (!ctx->lock_manager.get_snaptrimmer_write(
3696 snapoid,
3697 snapset_obc,
3698 first)) {
3699 close_op_ctx(ctx.release());
3700 dout(10) << __func__ << ": Unable to get a wlock on " << snapoid << dendl;
3701 return NULL;
3702 }
3703
3704 ctx->at_version = get_next_version();
3705
3706 PGTransaction *t = ctx->op_t.get();
3707
3708 if (new_snaps.empty()) {
3709 // remove clone
3710 dout(10) << coid << " snaps " << old_snaps << " -> "
3711 << new_snaps << " ... deleting" << dendl;
3712
3713 // ...from snapset
3714 assert(p != snapset.clones.end());
3715
3716 snapid_t last = coid.snap;
3717 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(last);
3718
3719 if (p != snapset.clones.begin()) {
3720 // not the oldest... merge overlap into next older clone
3721 vector<snapid_t>::iterator n = p - 1;
3722 hobject_t prev_coid = coid;
3723 prev_coid.snap = *n;
3724 bool adjust_prev_bytes = is_present_clone(prev_coid);
3725
3726 if (adjust_prev_bytes)
3727 ctx->delta_stats.num_bytes -= snapset.get_clone_bytes(*n);
3728
3729 snapset.clone_overlap[*n].intersection_of(
3730 snapset.clone_overlap[*p]);
3731
3732 if (adjust_prev_bytes)
3733 ctx->delta_stats.num_bytes += snapset.get_clone_bytes(*n);
3734 }
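// [editorial note] Worked example of the overlap merge above (hypothetical
// values): with clones [1, 4, 7] and coid.snap == 4, p points at 4 and n at
// 1.  If clone_overlap[1] = {0~100} (bytes clone 1 shares with clone 4) and
// clone_overlap[4] = {0~50} (bytes clone 4 shares with clone 7), the
// intersection leaves clone_overlap[1] = {0~50}: only ranges still shared
// with the next surviving clone remain, and the num_bytes delta above
// charges clone 1 for the bytes that just became unique to it.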
3735 ctx->delta_stats.num_objects--;
3736 if (coi.is_dirty())
3737 ctx->delta_stats.num_objects_dirty--;
3738 if (coi.is_omap())
3739 ctx->delta_stats.num_objects_omap--;
3740 if (coi.is_whiteout()) {
3741 dout(20) << __func__ << " trimming whiteout on " << coid << dendl;
3742 ctx->delta_stats.num_whiteouts--;
3743 }
3744 ctx->delta_stats.num_object_clones--;
3745 if (coi.is_cache_pinned())
3746 ctx->delta_stats.num_objects_pinned--;
3747 obc->obs.exists = false;
3748
3749 snapset.clones.erase(p);
3750 snapset.clone_overlap.erase(last);
3751 snapset.clone_size.erase(last);
3752 snapset.clone_snaps.erase(last);
3753
3754 ctx->log.push_back(
3755 pg_log_entry_t(
3756 pg_log_entry_t::DELETE,
3757 coid,
3758 ctx->at_version,
3759 ctx->obs->oi.version,
3760 0,
3761 osd_reqid_t(),
3762 ctx->mtime,
3763 0)
3764 );
3765 t->remove(coid);
3766 t->update_snaps(
3767 coid,
3768 old_snaps,
3769 new_snaps);
3770
3771 coi = object_info_t(coid);
3772
3773 ctx->at_version.version++;
3774 } else {
3775 // save adjusted snaps for this object
3776 dout(10) << coid << " snaps " << old_snaps << " -> " << new_snaps << dendl;
3777 if (legacy) {
3778 coi.legacy_snaps = vector<snapid_t>(new_snaps.rbegin(), new_snaps.rend());
3779 } else {
3780 snapset.clone_snaps[coid.snap] = vector<snapid_t>(new_snaps.rbegin(),
3781 new_snaps.rend());
3782 // we still do a 'modify' event on this object just to trigger a
3783 // snapmapper.update ... :(
3784 }
3785
3786 coi.prior_version = coi.version;
3787 coi.version = ctx->at_version;
3788 bl.clear();
3789 ::encode(coi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3790 t->setattr(coid, OI_ATTR, bl);
3791
3792 ctx->log.push_back(
3793 pg_log_entry_t(
3794 pg_log_entry_t::MODIFY,
3795 coid,
3796 coi.version,
3797 coi.prior_version,
3798 0,
3799 osd_reqid_t(),
3800 ctx->mtime,
3801 0)
3802 );
3803 ctx->at_version.version++;
3804
3805 t->update_snaps(
3806 coid,
3807 old_snaps,
3808 new_snaps);
3809 }
3810
3811 // save head snapset
3812 dout(10) << coid << " new snapset " << snapset << " on "
3813 << snapset_obc->obs.oi << dendl;
3814 if (snapset.clones.empty() &&
3815 (!snapset.head_exists ||
3816 (snapset_obc->obs.oi.is_whiteout() &&
3817 !(snapset_obc->obs.oi.is_dirty() && pool.info.is_tier()) &&
3818 !snapset_obc->obs.oi.is_cache_pinned()))) {
3819 // NOTE: this arguably constitutes minor interference with the
3820 // tiering agent if this is a cache tier since a snap trim event
3821 // is effectively evicting a whiteout we might otherwise want to
3822 // keep around.
3823 dout(10) << coid << " removing " << snapoid << dendl;
3824 ctx->log.push_back(
3825 pg_log_entry_t(
3826 pg_log_entry_t::DELETE,
3827 snapoid,
3828 ctx->at_version,
3829 ctx->snapset_obc->obs.oi.version,
3830 0,
3831 osd_reqid_t(),
3832 ctx->mtime,
3833 0)
3834 );
3835 if (snapoid.is_head()) {
3836 derr << "removing snap head" << dendl;
3837 object_info_t& oi = ctx->snapset_obc->obs.oi;
3838 ctx->delta_stats.num_objects--;
3839 if (oi.is_dirty()) {
3840 ctx->delta_stats.num_objects_dirty--;
3841 }
3842 if (oi.is_omap())
3843 ctx->delta_stats.num_objects_omap--;
3844 if (oi.is_whiteout()) {
3845 dout(20) << __func__ << " trimming whiteout on " << oi.soid << dendl;
3846 ctx->delta_stats.num_whiteouts--;
3847 }
3848 if (oi.is_cache_pinned()) {
3849 ctx->delta_stats.num_objects_pinned--;
3850 }
3851 }
3852 ctx->snapset_obc->obs.exists = false;
3853 ctx->snapset_obc->obs.oi = object_info_t(snapoid);
3854 t->remove(snapoid);
3855 } else {
3856 dout(10) << coid << " filtering snapset on " << snapoid << dendl;
3857 snapset.filter(pool.info);
3858 dout(10) << coid << " writing updated snapset on " << snapoid
3859 << ", snapset is " << snapset << dendl;
3860 ctx->log.push_back(
3861 pg_log_entry_t(
3862 pg_log_entry_t::MODIFY,
3863 snapoid,
3864 ctx->at_version,
3865 ctx->snapset_obc->obs.oi.version,
3866 0,
3867 osd_reqid_t(),
3868 ctx->mtime,
3869 0)
3870 );
3871
3872 ctx->snapset_obc->obs.oi.prior_version =
3873 ctx->snapset_obc->obs.oi.version;
3874 ctx->snapset_obc->obs.oi.version = ctx->at_version;
3875
3876 map <string, bufferlist> attrs;
3877 bl.clear();
3878 ::encode(snapset, bl);
3879 attrs[SS_ATTR].claim(bl);
3880
3881 bl.clear();
3882 ::encode(ctx->snapset_obc->obs.oi, bl,
3883 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
3884 attrs[OI_ATTR].claim(bl);
3885 t->setattrs(snapoid, attrs);
3886 }
3887
3888 return ctx;
3889 }
3890
3891 void PrimaryLogPG::kick_snap_trim()
3892 {
3893 assert(is_active());
3894 assert(is_primary());
3895 if (is_clean() && !snap_trimq.empty()) {
3896 dout(10) << __func__ << ": clean and snaps to trim, kicking" << dendl;
3897 snap_trimmer_machine.process_event(KickTrim());
3898 }
3899 }
3900
3901 void PrimaryLogPG::snap_trimmer_scrub_complete()
3902 {
3903 if (is_primary() && is_active() && is_clean()) {
3904 assert(!snap_trimq.empty());
3905 snap_trimmer_machine.process_event(ScrubComplete());
3906 }
3907 }
3908
3909 void PrimaryLogPG::snap_trimmer(epoch_t queued)
3910 {
3911 if (deleting || pg_has_reset_since(queued)) {
3912 return;
3913 }
3914
3915 assert(is_primary());
3916
3917 dout(10) << "snap_trimmer posting" << dendl;
3918 snap_trimmer_machine.process_event(DoSnapWork());
3919 dout(10) << "snap_trimmer complete" << dendl;
3920 return;
3921 }
3922
3923 int PrimaryLogPG::do_xattr_cmp_u64(int op, __u64 v1, bufferlist& xattr)
3924 {
3925 __u64 v2;
3926
3927 string v2s(xattr.c_str(), xattr.length());
3928 if (v2s.length())
3929 v2 = strtoull(v2s.c_str(), NULL, 10);
3930 else
3931 v2 = 0;
3932
3933 dout(20) << "do_xattr_cmp_u64 '" << v1 << "' vs '" << v2 << "' op " << op << dendl;
3934
3935 switch (op) {
3936 case CEPH_OSD_CMPXATTR_OP_EQ:
3937 return (v1 == v2);
3938 case CEPH_OSD_CMPXATTR_OP_NE:
3939 return (v1 != v2);
3940 case CEPH_OSD_CMPXATTR_OP_GT:
3941 return (v1 > v2);
3942 case CEPH_OSD_CMPXATTR_OP_GTE:
3943 return (v1 >= v2);
3944 case CEPH_OSD_CMPXATTR_OP_LT:
3945 return (v1 < v2);
3946 case CEPH_OSD_CMPXATTR_OP_LTE:
3947 return (v1 <= v2);
3948 default:
3949 return -EINVAL;
3950 }
3951 }
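// [editorial note] A minimal usage sketch of the comparator above, assuming
// the stored xattr holds the ASCII-decimal string "42".  The snippet is
// illustrative only (the 'pg' pointer is hypothetical and not part of the
// original source), hence the #if 0 guard:
#if 0
bufferlist xattr;
xattr.append("42");
int r = pg->do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_GT, 100, xattr); // 1: 100 > 42
r = pg->do_xattr_cmp_u64(CEPH_OSD_CMPXATTR_OP_LT, 100, xattr);     // 0: 100 < 42 is false
#endif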
3952
3953 int PrimaryLogPG::do_xattr_cmp_str(int op, string& v1s, bufferlist& xattr)
3954 {
3955 string v2s(xattr.c_str(), xattr.length());
3956
3957 dout(20) << "do_xattr_cmp_str '" << v1s << "' vs '" << v2s << "' op " << op << dendl;
3958
3959 switch (op) {
3960 case CEPH_OSD_CMPXATTR_OP_EQ:
3961 return (v1s.compare(v2s) == 0);
3962 case CEPH_OSD_CMPXATTR_OP_NE:
3963 return (v1s.compare(v2s) != 0);
3964 case CEPH_OSD_CMPXATTR_OP_GT:
3965 return (v1s.compare(v2s) > 0);
3966 case CEPH_OSD_CMPXATTR_OP_GTE:
3967 return (v1s.compare(v2s) >= 0);
3968 case CEPH_OSD_CMPXATTR_OP_LT:
3969 return (v1s.compare(v2s) < 0);
3970 case CEPH_OSD_CMPXATTR_OP_LTE:
3971 return (v1s.compare(v2s) <= 0);
3972 default:
3973 return -EINVAL;
3974 }
3975 }
3976
3977 int PrimaryLogPG::do_extent_cmp(OpContext *ctx, OSDOp& osd_op)
3978 {
3979 ceph_osd_op& op = osd_op.op;
3980 vector<OSDOp> read_ops(1);
3981 OSDOp& read_op = read_ops[0];
3982 int result = 0;
3983
3984 read_op.op.op = CEPH_OSD_OP_SYNC_READ;
3985 read_op.op.extent.offset = op.extent.offset;
3986 read_op.op.extent.length = op.extent.length;
3987 read_op.op.extent.truncate_seq = op.extent.truncate_seq;
3988 read_op.op.extent.truncate_size = op.extent.truncate_size;
3989
3990 result = do_osd_ops(ctx, read_ops);
3991 if (result < 0) {
3992 derr << "do_extent_cmp do_osd_ops failed " << result << dendl;
3993 return result;
3994 }
3995
3996 if (read_op.outdata.length() != osd_op.indata.length())
3997 return -EINVAL;
3998
3999 for (uint64_t p = 0; p < osd_op.indata.length(); p++) {
4000 if (read_op.outdata[p] != osd_op.indata[p]) {
4001 return (-MAX_ERRNO - p);
4002 }
4003 }
4004
4005 return result;
4006 }
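// [editorial note] On mismatch, the return value above encodes the offset of
// the first differing byte: if the payloads first differ at byte 5,
// do_extent_cmp() returns -MAX_ERRNO - 5, and a caller can recover the
// offset as -(rc + MAX_ERRNO).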
4007
4008 int PrimaryLogPG::do_writesame(OpContext *ctx, OSDOp& osd_op)
4009 {
4010 ceph_osd_op& op = osd_op.op;
4011 vector<OSDOp> write_ops(1);
4012 OSDOp& write_op = write_ops[0];
4013 uint64_t write_length = op.writesame.length;
4014 int result = 0;
4015
4016 if (!write_length)
4017 return 0;
4018
4019 if (!op.writesame.data_length || write_length % op.writesame.data_length)
4020 return -EINVAL;
4021
4022 if (op.writesame.data_length != osd_op.indata.length()) {
4023 derr << "invalid length ws data length " << op.writesame.data_length << " actual len " << osd_op.indata.length() << dendl;
4024 return -EINVAL;
4025 }
4026
4027 while (write_length) {
4028 write_op.indata.append(osd_op.indata);
4029 write_length -= op.writesame.data_length;
4030 }
4031
4032 write_op.op.op = CEPH_OSD_OP_WRITE;
4033 write_op.op.extent.offset = op.writesame.offset;
4034 write_op.op.extent.length = op.writesame.length;
4035 result = do_osd_ops(ctx, write_ops);
4036 if (result < 0)
4037 derr << "do_writesame do_osd_ops failed " << result << dendl;
4038
4039 return result;
4040 }
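// [editorial note] Worked example of the expansion above (hypothetical
// values): with writesame.offset = 0, writesame.length = 4096 and
// writesame.data_length = 512, the 512-byte pattern in indata is appended
// 8 times and submitted as a single 4096-byte CEPH_OSD_OP_WRITE at offset 0.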
4041
4042 // ========================================================================
4043 // low level osd ops
4044
4045 int PrimaryLogPG::do_tmap2omap(OpContext *ctx, unsigned flags)
4046 {
4047 dout(20) << " convert tmap to omap for " << ctx->new_obs.oi.soid << dendl;
4048 bufferlist header, vals;
4049 int r = _get_tmap(ctx, &header, &vals);
4050 if (r < 0) {
4051 if (r == -ENODATA && (flags & CEPH_OSD_TMAP2OMAP_NULLOK))
4052 r = 0;
4053 return r;
4054 }
4055
4056 vector<OSDOp> ops(3);
4057
4058 ops[0].op.op = CEPH_OSD_OP_TRUNCATE;
4059 ops[0].op.extent.offset = 0;
4060 ops[0].op.extent.length = 0;
4061
4062 ops[1].op.op = CEPH_OSD_OP_OMAPSETHEADER;
4063 ops[1].indata.claim(header);
4064
4065 ops[2].op.op = CEPH_OSD_OP_OMAPSETVALS;
4066 ops[2].indata.claim(vals);
4067
4068 return do_osd_ops(ctx, ops);
4069 }
4070
4071 int PrimaryLogPG::do_tmapup_slow(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op,
4072 bufferlist& bl)
4073 {
4074 // decode
4075 bufferlist header;
4076 map<string, bufferlist> m;
4077 if (bl.length()) {
4078 bufferlist::iterator p = bl.begin();
4079 ::decode(header, p);
4080 ::decode(m, p);
4081 assert(p.end());
4082 }
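// [editorial note] Sketch of the TMAP payload layout implied by the
// ::decode calls above: the object data is simply
//
//   bufferlist header;          // opaque, application-defined
//   map<string, bufferlist> m;  // sorted key -> value entries
//
// encoded back-to-back; the same two values are re-encoded below before the
// object is rewritten with WRITEFULL.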
4083
4084 // do the update(s)
4085 while (!bp.end()) {
4086 __u8 op;
4087 string key;
4088 ::decode(op, bp);
4089
4090 switch (op) {
4091 case CEPH_OSD_TMAP_SET: // insert key
4092 {
4093 ::decode(key, bp);
4094 bufferlist data;
4095 ::decode(data, bp);
4096 m[key] = data;
4097 }
4098 break;
4099 case CEPH_OSD_TMAP_RM: // remove key
4100 ::decode(key, bp);
4101 if (!m.count(key)) {
4102 return -ENOENT;
4103 }
4104 m.erase(key);
4105 break;
4106 case CEPH_OSD_TMAP_RMSLOPPY: // remove key
4107 ::decode(key, bp);
4108 m.erase(key);
4109 break;
4110 case CEPH_OSD_TMAP_HDR: // update header
4111 {
4112 ::decode(header, bp);
4113 }
4114 break;
4115 default:
4116 return -EINVAL;
4117 }
4118 }
4119
4120 // reencode
4121 bufferlist obl;
4122 ::encode(header, obl);
4123 ::encode(m, obl);
4124
4125 // write it out
4126 vector<OSDOp> nops(1);
4127 OSDOp& newop = nops[0];
4128 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4129 newop.op.extent.offset = 0;
4130 newop.op.extent.length = obl.length();
4131 newop.indata = obl;
4132 do_osd_ops(ctx, nops);
4133 osd_op.outdata.claim(newop.outdata);
4134 return 0;
4135 }
4136
4137 int PrimaryLogPG::do_tmapup(OpContext *ctx, bufferlist::iterator& bp, OSDOp& osd_op)
4138 {
4139 bufferlist::iterator orig_bp = bp;
4140 int result = 0;
4141 if (bp.end()) {
4142 dout(10) << "tmapup is a no-op" << dendl;
4143 } else {
4144 // read the whole object
4145 vector<OSDOp> nops(1);
4146 OSDOp& newop = nops[0];
4147 newop.op.op = CEPH_OSD_OP_READ;
4148 newop.op.extent.offset = 0;
4149 newop.op.extent.length = 0;
4150 result = do_osd_ops(ctx, nops);
4151
4152 dout(10) << "tmapup read " << newop.outdata.length() << dendl;
4153
4154 dout(30) << " starting is \n";
4155 newop.outdata.hexdump(*_dout);
4156 *_dout << dendl;
4157
4158 bufferlist::iterator ip = newop.outdata.begin();
4159 bufferlist obl;
4160
4161 dout(30) << "the update command is: \n";
4162 osd_op.indata.hexdump(*_dout);
4163 *_dout << dendl;
4164
4165 // header
4166 bufferlist header;
4167 __u32 nkeys = 0;
4168 if (newop.outdata.length()) {
4169 ::decode(header, ip);
4170 ::decode(nkeys, ip);
4171 }
4172 dout(10) << "tmapup header " << header.length() << dendl;
4173
4174 if (!bp.end() && *bp == CEPH_OSD_TMAP_HDR) {
4175 ++bp;
4176 ::decode(header, bp);
4177 dout(10) << "tmapup new header " << header.length() << dendl;
4178 }
4179
4180 ::encode(header, obl);
4181
4182 dout(20) << "tmapup initial nkeys " << nkeys << dendl;
4183
4184 // update keys
4185 bufferlist newkeydata;
4186 string nextkey, last_in_key;
4187 bufferlist nextval;
4188 bool have_next = false;
4189 if (!ip.end()) {
4190 have_next = true;
4191 ::decode(nextkey, ip);
4192 ::decode(nextval, ip);
4193 }
4194 while (!bp.end() && !result) {
4195 __u8 op;
4196 string key;
4197 try {
4198 ::decode(op, bp);
4199 ::decode(key, bp);
4200 }
4201 catch (buffer::error& e) {
4202 return -EINVAL;
4203 }
4204 if (key < last_in_key) {
4205 dout(5) << "tmapup warning: key '" << key << "' < previous key '" << last_in_key
4206 << "', falling back to an inefficient (unsorted) update" << dendl;
4207 bp = orig_bp;
4208 return do_tmapup_slow(ctx, bp, osd_op, newop.outdata);
4209 }
4210 last_in_key = key;
4211
4212 dout(10) << "tmapup op " << (int)op << " key " << key << dendl;
4213
4214 // skip existing intervening keys
4215 bool key_exists = false;
4216 while (have_next && !key_exists) {
4217 dout(20) << " (have_next=" << have_next << " nextkey=" << nextkey << ")" << dendl;
4218 if (nextkey > key)
4219 break;
4220 if (nextkey < key) {
4221 // copy untouched.
4222 ::encode(nextkey, newkeydata);
4223 ::encode(nextval, newkeydata);
4224 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4225 } else {
4226 // don't copy; discard the old value and stop.
4227 dout(20) << " drop " << nextkey << " " << nextval.length() << dendl;
4228 key_exists = true;
4229 nkeys--;
4230 }
4231 if (!ip.end()) {
4232 ::decode(nextkey, ip);
4233 ::decode(nextval, ip);
4234 } else {
4235 have_next = false;
4236 }
4237 }
4238
4239 if (op == CEPH_OSD_TMAP_SET) {
4240 bufferlist val;
4241 try {
4242 ::decode(val, bp);
4243 }
4244 catch (buffer::error& e) {
4245 return -EINVAL;
4246 }
4247 ::encode(key, newkeydata);
4248 ::encode(val, newkeydata);
4249 dout(20) << " set " << key << " " << val.length() << dendl;
4250 nkeys++;
4251 } else if (op == CEPH_OSD_TMAP_CREATE) {
4252 if (key_exists) {
4253 return -EEXIST;
4254 }
4255 bufferlist val;
4256 try {
4257 ::decode(val, bp);
4258 }
4259 catch (buffer::error& e) {
4260 return -EINVAL;
4261 }
4262 ::encode(key, newkeydata);
4263 ::encode(val, newkeydata);
4264 dout(20) << " create " << key << " " << val.length() << dendl;
4265 nkeys++;
4266 } else if (op == CEPH_OSD_TMAP_RM) {
4267 // do nothing.
4268 if (!key_exists) {
4269 return -ENOENT;
4270 }
4271 } else if (op == CEPH_OSD_TMAP_RMSLOPPY) {
4272 // do nothing
4273 } else {
4274 dout(10) << " invalid tmap op " << (int)op << dendl;
4275 return -EINVAL;
4276 }
4277 }
4278
4279 // copy remaining
4280 if (have_next) {
4281 ::encode(nextkey, newkeydata);
4282 ::encode(nextval, newkeydata);
4283 dout(20) << " keep " << nextkey << " " << nextval.length() << dendl;
4284 }
4285 if (!ip.end()) {
4286 bufferlist rest;
4287 rest.substr_of(newop.outdata, ip.get_off(), newop.outdata.length() - ip.get_off());
4288 dout(20) << " keep trailing " << rest.length()
4289 << " at " << newkeydata.length() << dendl;
4290 newkeydata.claim_append(rest);
4291 }
4292
4293 // encode final key count + key data
4294 dout(20) << "tmapup final nkeys " << nkeys << dendl;
4295 ::encode(nkeys, obl);
4296 obl.claim_append(newkeydata);
4297
4298 if (0) {
4299 dout(30) << " final is \n";
4300 obl.hexdump(*_dout);
4301 *_dout << dendl;
4302
4303 // sanity check
4304 bufferlist::iterator tp = obl.begin();
4305 bufferlist h;
4306 ::decode(h, tp);
4307 map<string,bufferlist> d;
4308 ::decode(d, tp);
4309 assert(tp.end());
4310 dout(0) << " **** debug sanity check, looks ok ****" << dendl;
4311 }
4312
4313 // write it out
4314 if (!result) {
4315 dout(20) << "tmapput write " << obl.length() << dendl;
4316 newop.op.op = CEPH_OSD_OP_WRITEFULL;
4317 newop.op.extent.offset = 0;
4318 newop.op.extent.length = obl.length();
4319 newop.indata = obl;
4320 do_osd_ops(ctx, nops);
4321 osd_op.outdata.claim(newop.outdata);
4322 }
4323 }
4324 return result;
4325 }
4326
4327 static int check_offset_and_length(uint64_t offset, uint64_t length, uint64_t max)
4328 {
4329 if (offset >= max ||
4330 length > max ||
4331 offset + length > max)
4332 return -EFBIG;
4333
4334 return 0;
4335 }
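// [editorial note] Worked example (hypothetical values): with max = 100, an
// offset of 60 and a length of 50 pass the individual bounds checks but fail
// offset + length > max (110 > 100), so the helper returns -EFBIG and the
// write is rejected before any transaction is built.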
4336
4337 struct FillInVerifyExtent : public Context {
4338 ceph_le64 *r;
4339 int32_t *rval;
4340 bufferlist *outdatap;
4341 boost::optional<uint32_t> maybe_crc;
4342 uint64_t size;
4343 OSDService *osd;
4344 hobject_t soid;
4345 __le32 flags;
4346 FillInVerifyExtent(ceph_le64 *r, int32_t *rv, bufferlist *blp,
4347 boost::optional<uint32_t> mc, uint64_t size,
4348 OSDService *osd, hobject_t soid, __le32 flags) :
4349 r(r), rval(rv), outdatap(blp), maybe_crc(mc),
4350 size(size), osd(osd), soid(soid), flags(flags) {}
4351 void finish(int len) override {
4352 *rval = len;
4353 *r = len;
4354 if (len < 0)
4355 return;
4356 // whole object? can we verify the checksum?
4357 if (maybe_crc && *r == size) {
4358 uint32_t crc = outdatap->crc32c(-1);
4359 if (maybe_crc != crc) {
4360 osd->clog->error() << std::hex << " full-object read crc 0x" << crc
4361 << " != expected 0x" << *maybe_crc
4362 << std::dec << " on " << soid;
4363 if (!(flags & CEPH_OSD_OP_FLAG_FAILOK)) {
4364 *rval = -EIO;
4365 *r = 0;
4366 }
4367 }
4368 }
4369 }
4370 };
4371
4372 struct ToSparseReadResult : public Context {
4373 bufferlist& data_bl;
4374 uint64_t data_offset;
4375 ceph_le64& len;
4376 ToSparseReadResult(bufferlist& bl, uint64_t offset, ceph_le64& len):
4377 data_bl(bl), data_offset(offset), len(len) {}
4378 void finish(int r) override {
4379 if (r < 0) return;
4380 len = r;
4381 bufferlist outdata;
4382 map<uint64_t, uint64_t> extents = {{data_offset, r}};
4383 ::encode(extents, outdata);
4384 ::encode_destructively(data_bl, outdata);
4385 data_bl.swap(outdata);
4386 }
4387 };
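// [editorial note] Sketch of the reply format produced above: a sparse-read
// result is an encoded map<uint64_t, uint64_t> of extents followed by the
// concatenated data.  Because this fallback issues one contiguous read, the
// map holds the single entry {data_offset, r}; e.g. a 512-byte read at
// offset 4096 yields extents {{4096, 512}} followed by 512 data bytes.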
4388
4389 template<typename V>
4390 static string list_keys(const map<string, V>& m) {
4391 string s;
4392 for (typename map<string, V>::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4393 if (!s.empty()) {
4394 s.push_back(',');
4395 }
4396 s.append(itr->first);
4397 }
4398 return s;
4399 }
4400
4401 template<typename T>
4402 static string list_entries(const T& m) {
4403 string s;
4404 for (typename T::const_iterator itr = m.begin(); itr != m.end(); ++itr) {
4405 if (!s.empty()) {
4406 s.push_back(',');
4407 }
4408 s.append(*itr);
4409 }
4410 return s;
4411 }
4412
4413 void PrimaryLogPG::maybe_create_new_object(
4414 OpContext *ctx,
4415 bool ignore_transaction)
4416 {
4417 ObjectState& obs = ctx->new_obs;
4418 if (!obs.exists) {
4419 ctx->delta_stats.num_objects++;
4420 obs.exists = true;
4421 assert(!obs.oi.is_whiteout());
4422 obs.oi.new_object();
4423 if (!ignore_transaction)
4424 ctx->op_t->create(obs.oi.soid);
4425 } else if (obs.oi.is_whiteout()) {
4426 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
4427 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
4428 --ctx->delta_stats.num_whiteouts;
4429 }
4430 }
4431
4432 struct C_ChecksumRead : public Context {
4433 PrimaryLogPG *primary_log_pg;
4434 OSDOp &osd_op;
4435 Checksummer::CSumType csum_type;
4436 bufferlist init_value_bl;
4437 ceph_le64 read_length;
4438 bufferlist read_bl;
4439 Context *fill_extent_ctx;
4440
4441 C_ChecksumRead(PrimaryLogPG *primary_log_pg, OSDOp &osd_op,
4442 Checksummer::CSumType csum_type, bufferlist &&init_value_bl,
4443 boost::optional<uint32_t> maybe_crc, uint64_t size,
4444 OSDService *osd, hobject_t soid, __le32 flags)
4445 : primary_log_pg(primary_log_pg), osd_op(osd_op),
4446 csum_type(csum_type), init_value_bl(std::move(init_value_bl)),
4447 fill_extent_ctx(new FillInVerifyExtent(&read_length, &osd_op.rval,
4448 &read_bl, maybe_crc, size,
4449 osd, soid, flags)) {
4450 }
4451
4452 void finish(int r) override {
4453 fill_extent_ctx->complete(r);
4454
4455 if (osd_op.rval >= 0) {
4456 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4457 osd_op.rval = primary_log_pg->finish_checksum(osd_op, csum_type,
4458 &init_value_bl_it,
4459 read_bl);
4460 }
4461 }
4462 };
4463
4464 int PrimaryLogPG::do_checksum(OpContext *ctx, OSDOp& osd_op,
4465 bufferlist::iterator *bl_it, bool *async_read)
4466 {
4467 dout(20) << __func__ << dendl;
4468
4469 auto& op = osd_op.op;
4470 if (op.checksum.chunk_size > 0) {
4471 if (op.checksum.length == 0) {
4472 dout(10) << __func__ << ": length required when chunk size provided"
4473 << dendl;
4474 return -EINVAL;
4475 }
4476 if (op.checksum.length % op.checksum.chunk_size != 0) {
4477 dout(10) << __func__ << ": length not aligned to chunk size" << dendl;
4478 return -EINVAL;
4479 }
4480 }
4481
4482 auto& oi = ctx->new_obs.oi;
4483 if (op.checksum.offset == 0 && op.checksum.length == 0) {
4484 // zeroed offset+length implies checksum whole object
4485 op.checksum.length = oi.size;
4486 } else if (op.checksum.offset + op.checksum.length > oi.size) {
4487 return -EOVERFLOW;
4488 }
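// [editorial note] e.g. for a 4 MiB object, {offset = 0, length = 0} is
// rewritten to {offset = 0, length = 4194304} (checksum the whole object),
// while {offset = 4194304, length = 1} fails the bounds check above with
// -EOVERFLOW.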
4489
4490 Checksummer::CSumType csum_type;
4491 switch (op.checksum.type) {
4492 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH32:
4493 csum_type = Checksummer::CSUM_XXHASH32;
4494 break;
4495 case CEPH_OSD_CHECKSUM_OP_TYPE_XXHASH64:
4496 csum_type = Checksummer::CSUM_XXHASH64;
4497 break;
4498 case CEPH_OSD_CHECKSUM_OP_TYPE_CRC32C:
4499 csum_type = Checksummer::CSUM_CRC32C;
4500 break;
4501 default:
4502 dout(10) << __func__ << ": unknown crc type ("
4503 << static_cast<uint32_t>(op.checksum.type) << ")" << dendl;
4504 return -EINVAL;
4505 }
4506
4507 size_t csum_init_value_size = Checksummer::get_csum_init_value_size(csum_type);
4508 if (bl_it->get_remaining() < csum_init_value_size) {
4509 dout(10) << __func__ << ": init value not provided" << dendl;
4510 return -EINVAL;
4511 }
4512
4513 bufferlist init_value_bl;
4514 init_value_bl.substr_of(bl_it->get_bl(), bl_it->get_off(),
4515 csum_init_value_size);
4516 bl_it->advance(csum_init_value_size);
4517
4518 if (pool.info.require_rollback() && op.checksum.length > 0) {
4519 // If there is a data digest and it is possible we are reading the
4520 // entire object, pass the digest.
4521 boost::optional<uint32_t> maybe_crc;
4522 if (oi.is_data_digest() && op.checksum.offset == 0 &&
4523 op.checksum.length >= oi.size) {
4524 maybe_crc = oi.data_digest;
4525 }
4526
4527 // async read
4528 auto& soid = oi.soid;
4529 auto checksum_ctx = new C_ChecksumRead(this, osd_op, csum_type,
4530 std::move(init_value_bl), maybe_crc,
4531 oi.size, osd, soid, op.flags);
4532 ctx->pending_async_reads.push_back({
4533 {op.checksum.offset, op.checksum.length, op.flags},
4534 {&checksum_ctx->read_bl, checksum_ctx}});
4535
4536 dout(10) << __func__ << ": async_read noted for " << soid << dendl;
4537 *async_read = true;
4538 return 0;
4539 }
4540
4541 // sync read
4542 *async_read = false;
4543 std::vector<OSDOp> read_ops(1);
4544 auto& read_op = read_ops[0];
4545 if (op.checksum.length > 0) {
4546 read_op.op.op = CEPH_OSD_OP_READ;
4547 read_op.op.flags = op.flags;
4548 read_op.op.extent.offset = op.checksum.offset;
4549 read_op.op.extent.length = op.checksum.length;
4550 read_op.op.extent.truncate_size = 0;
4551 read_op.op.extent.truncate_seq = 0;
4552
4553 int r = do_osd_ops(ctx, read_ops);
4554 if (r < 0) {
4555 derr << __func__ << ": do_osd_ops failed: " << cpp_strerror(r) << dendl;
4556 return r;
4557 }
4558 }
4559
4560 bufferlist::iterator init_value_bl_it = init_value_bl.begin();
4561 return finish_checksum(osd_op, csum_type, &init_value_bl_it,
4562 read_op.outdata);
4563 }
4564
4565 int PrimaryLogPG::finish_checksum(OSDOp& osd_op,
4566 Checksummer::CSumType csum_type,
4567 bufferlist::iterator *init_value_bl_it,
4568 const bufferlist &read_bl) {
4569 dout(20) << __func__ << dendl;
4570
4571 auto& op = osd_op.op;
4572
4573 if (op.checksum.length > 0 && read_bl.length() != op.checksum.length) {
4574 derr << __func__ << ": bytes read " << read_bl.length() << " != "
4575 << op.checksum.length << dendl;
4576 return -EINVAL;
4577 }
4578
4579 size_t csum_chunk_size = (op.checksum.chunk_size != 0 ?
4580 op.checksum.chunk_size : read_bl.length());
4581 uint32_t csum_count = (csum_chunk_size > 0 ?
4582 read_bl.length() / csum_chunk_size : 0);
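// [editorial note] Worked example (hypothetical values): for a 4096-byte
// read with chunk_size = 1024, csum_count = 4 and one checksum is emitted
// per 1024-byte chunk; with chunk_size = 0 the whole read is a single chunk
// and csum_count = 1 (or 0 for an empty read).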
4583
4584 bufferlist csum;
4585 bufferptr csum_data;
4586 if (csum_count > 0) {
4587 size_t csum_value_size = Checksummer::get_csum_value_size(csum_type);
4588 csum_data = buffer::create(csum_value_size * csum_count);
4589 csum_data.zero();
4590 csum.append(csum_data);
4591
4592 switch (csum_type) {
4593 case Checksummer::CSUM_XXHASH32:
4594 {
4595 Checksummer::xxhash32::init_value_t init_value;
4596 ::decode(init_value, *init_value_bl_it);
4597 Checksummer::calculate<Checksummer::xxhash32>(
4598 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4599 &csum_data);
4600 }
4601 break;
4602 case Checksummer::CSUM_XXHASH64:
4603 {
4604 Checksummer::xxhash64::init_value_t init_value;
4605 ::decode(init_value, *init_value_bl_it);
4606 Checksummer::calculate<Checksummer::xxhash64>(
4607 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4608 &csum_data);
4609 }
4610 break;
4611 case Checksummer::CSUM_CRC32C:
4612 {
4613 Checksummer::crc32c::init_value_t init_value;
4614 ::decode(init_value, *init_value_bl_it);
4615 Checksummer::calculate<Checksummer::crc32c>(
4616 init_value, csum_chunk_size, 0, read_bl.length(), read_bl,
4617 &csum_data);
4618 }
4619 break;
4620 default:
4621 break;
4622 }
4623 }
4624
4625 ::encode(csum_count, osd_op.outdata);
4626 osd_op.outdata.claim_append(csum);
4627 return 0;
4628 }
4629
4630 int PrimaryLogPG::do_osd_ops(OpContext *ctx, vector<OSDOp>& ops)
4631 {
4632 int result = 0;
4633 SnapSetContext *ssc = ctx->obc->ssc;
4634 ObjectState& obs = ctx->new_obs;
4635 object_info_t& oi = obs.oi;
4636 const hobject_t& soid = oi.soid;
4637
4638 bool first_read = true;
4639
4640 PGTransaction* t = ctx->op_t.get();
4641
4642 dout(10) << "do_osd_op " << soid << " " << ops << dendl;
4643
4644 for (vector<OSDOp>::iterator p = ops.begin(); p != ops.end(); ++p, ctx->current_osd_subop_num++) {
4645 OSDOp& osd_op = *p;
4646 ceph_osd_op& op = osd_op.op;
4647
4648 // TODO: check endianness (__le32 vs uint32_t, etc.)
4649 // The fields in ceph_osd_op are little-endian (according to the definition in rados.h),
4650 // but the code in this function seems to treat them as native-endian. What should the
4651 // tracepoints do?
4652 tracepoint(osd, do_osd_op_pre, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags);
4653
4654 dout(10) << "do_osd_op " << osd_op << dendl;
4655
4656 bufferlist::iterator bp = osd_op.indata.begin();
4657
4658 // user-visible modification?
4659 switch (op.op) {
4660 // non user-visible modifications
4661 case CEPH_OSD_OP_WATCH:
4662 case CEPH_OSD_OP_CACHE_EVICT:
4663 case CEPH_OSD_OP_CACHE_FLUSH:
4664 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
4665 case CEPH_OSD_OP_UNDIRTY:
4666 case CEPH_OSD_OP_COPY_FROM: // we handle user_version update explicitly
4667 case CEPH_OSD_OP_CACHE_PIN:
4668 case CEPH_OSD_OP_CACHE_UNPIN:
4669 case CEPH_OSD_OP_SET_REDIRECT:
4670 break;
4671 default:
4672 if (op.op & CEPH_OSD_OP_MODE_WR)
4673 ctx->user_modify = true;
4674 }
4675
4676 // munge -1 truncate to 0 truncate
4677 if (ceph_osd_op_uses_extent(op.op) &&
4678 op.extent.truncate_seq == 1 &&
4679 op.extent.truncate_size == (-1ULL)) {
4680 op.extent.truncate_size = 0;
4681 op.extent.truncate_seq = 0;
4682 }
4683
4684 // munge ZERO -> TRUNCATE? (don't munge to DELETE or we risk hosing attributes)
4685 if (op.op == CEPH_OSD_OP_ZERO &&
4686 obs.exists &&
4687 op.extent.offset < cct->_conf->osd_max_object_size &&
4688 op.extent.length >= 1 &&
4689 op.extent.length <= cct->_conf->osd_max_object_size &&
4690 op.extent.offset + op.extent.length >= oi.size) {
4691 if (op.extent.offset >= oi.size) {
4692 // no-op
4693 goto fail;
4694 }
4695 dout(10) << " munging ZERO " << op.extent.offset << "~" << op.extent.length
4696 << " -> TRUNCATE " << op.extent.offset << " (old size is " << oi.size << ")" << dendl;
4697 op.op = CEPH_OSD_OP_TRUNCATE;
4698 }
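// [editorial note] e.g. with oi.size = 100, ZERO 50~60 covers bytes
// [50, 110), i.e. everything from offset 50 through the end of the object,
// so it is munged to TRUNCATE(50); ZERO 120~10 starts past the end and is
// handled as a no-op above.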
4699
4700 switch (op.op) {
4701
4702 // --- READS ---
4703
4704 case CEPH_OSD_OP_CMPEXT:
4705 ++ctx->num_read;
4706 tracepoint(osd, do_osd_op_pre_extent_cmp, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4707 result = do_extent_cmp(ctx, osd_op);
4708 break;
4709
4710 case CEPH_OSD_OP_SYNC_READ:
4711 if (pool.info.require_rollback()) {
4712 result = -EOPNOTSUPP;
4713 break;
4714 }
4715 // fall through
4716 case CEPH_OSD_OP_READ:
4717 ++ctx->num_read;
4718 {
4719 __u32 seq = oi.truncate_seq;
4720 uint64_t size = oi.size;
4721 tracepoint(osd, do_osd_op_pre_read, soid.oid.name.c_str(), soid.snap.val, size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4722 bool trimmed_read = false;
4723 // are we beyond truncate_size?
4724 if ( (seq < op.extent.truncate_seq) &&
4725 (op.extent.offset + op.extent.length > op.extent.truncate_size) )
4726 size = op.extent.truncate_size;
4727
4728 if (op.extent.length == 0) // a length of zero means read the whole object
4729 op.extent.length = size;
4730
4731 if (op.extent.offset >= size) {
4732 op.extent.length = 0;
4733 trimmed_read = true;
4734 } else if (op.extent.offset + op.extent.length > size) {
4735 op.extent.length = size - op.extent.offset;
4736 trimmed_read = true;
4737 }
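// [editorial note] Worked example (hypothetical values): object size 100,
// read of 40 bytes at offset 80.  If the op carries a newer truncate_seq
// with truncate_size = 90, size is first clamped to 90, and the read is
// trimmed to 80~10 with trimmed_read = true; a read at offset 120 would be
// trimmed to length 0 instead.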
4738
4739 // read into a buffer
4740 bool async = false;
4741 if (trimmed_read && op.extent.length == 0) {
4742 // the read was trimmed to zero bytes, so we should do nothing; note
4743 // that an untrimmed read of 0 bytes means "read the whole object",
4744 // which is why the trimmed_read flag is needed to tell the cases apart
4745 } else if (pool.info.require_rollback()) {
4746 async = true;
4747 boost::optional<uint32_t> maybe_crc;
4748 // If there is a data digest and it is possible we are reading the
4749 // entire object, pass the digest; FillInVerifyExtent will check
4750 // against oi.size again.
4751 if (oi.is_data_digest() && op.extent.offset == 0 &&
4752 op.extent.length >= oi.size)
4753 maybe_crc = oi.data_digest;
4754 ctx->pending_async_reads.push_back(
4755 make_pair(
4756 boost::make_tuple(op.extent.offset, op.extent.length, op.flags),
4757 make_pair(&osd_op.outdata,
4758 new FillInVerifyExtent(&op.extent.length, &osd_op.rval,
4759 &osd_op.outdata, maybe_crc, oi.size, osd,
4760 soid, op.flags))));
4761 dout(10) << " async_read noted for " << soid << dendl;
4762 } else {
4763 int r = pgbackend->objects_read_sync(
4764 soid, op.extent.offset, op.extent.length, op.flags, &osd_op.outdata);
4765 if (r >= 0)
4766 op.extent.length = r;
4767 else {
4768 result = r;
4769 op.extent.length = 0;
4770 }
4771 dout(10) << " read got " << r << " / " << op.extent.length
4772 << " bytes from obj " << soid << dendl;
4773
4774 // whole object? can we verify the checksum?
4775 if (op.extent.length == oi.size && oi.is_data_digest()) {
4776 uint32_t crc = osd_op.outdata.crc32c(-1);
4777 if (oi.data_digest != crc) {
4778 osd->clog->error() << info.pgid << std::hex
4779 << " full-object read crc 0x" << crc
4780 << " != expected 0x" << oi.data_digest
4781 << std::dec << " on " << soid;
4782 // FIXME fall back to replica or something?
4783 result = -EIO;
4784 }
4785 }
4786 }
4787 if (first_read) {
4788 first_read = false;
4789 ctx->data_off = op.extent.offset;
4790 }
4791 // XXX op.extent.length is the requested length for an async read;
4792 // on error it is reset to 0 after the error comes back.
4793 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4794 ctx->delta_stats.num_rd++;
4795
4796 // Skip checking the result and just proceed to the next operation
4797 if (async)
4798 continue;
4799
4800 }
4801 break;
4802
4803 case CEPH_OSD_OP_CHECKSUM:
4804 ++ctx->num_read;
4805 {
4806 tracepoint(osd, do_osd_op_pre_checksum, soid.oid.name.c_str(),
4807 soid.snap.val, oi.size, oi.truncate_seq, op.checksum.type,
4808 op.checksum.offset, op.checksum.length,
4809 op.checksum.chunk_size);
4810
4811 bool async_read;
4812 result = do_checksum(ctx, osd_op, &bp, &async_read);
4813 if (result == 0 && async_read) {
4814 continue;
4815 }
4816 }
4817 break;
4818
4819 /* map extents */
4820 case CEPH_OSD_OP_MAPEXT:
4821 tracepoint(osd, do_osd_op_pre_mapext, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
4822 if (pool.info.require_rollback()) {
4823 result = -EOPNOTSUPP;
4824 break;
4825 }
4826 ++ctx->num_read;
4827 {
4828 // read into a buffer
4829 bufferlist bl;
4830 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4831 info.pgid.shard),
4832 op.extent.offset, op.extent.length, bl);
4833 osd_op.outdata.claim(bl);
4834 if (r < 0)
4835 result = r;
4836 else
4837 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
4838 ctx->delta_stats.num_rd++;
4839 dout(10) << " map_extents done on object " << soid << dendl;
4840 }
4841 break;
4842
4843 /* sparse read */
4844 case CEPH_OSD_OP_SPARSE_READ:
4845 tracepoint(osd, do_osd_op_pre_sparse_read, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
4846 if (op.extent.truncate_seq) {
4847 dout(0) << "sparse_read does not support truncation sequence " << dendl;
4848 result = -EINVAL;
4849 break;
4850 }
4851 ++ctx->num_read;
4852 if (pool.info.ec_pool()) {
4853 // translate sparse read to a normal one if not supported
4854 uint64_t offset = op.extent.offset;
4855 uint64_t length = op.extent.length;
4856 if (offset > oi.size) {
4857 length = 0;
4858 } else if (offset + length > oi.size) {
4859 length = oi.size - offset;
4860 }
4861 if (length > 0) {
4862 ctx->pending_async_reads.push_back(
4863 make_pair(
4864 boost::make_tuple(offset, length, op.flags),
4865 make_pair(
4866 &osd_op.outdata,
4867 new ToSparseReadResult(
4868 osd_op.outdata, offset,
4869 op.extent.length /* updated by the callback */))));
4870 dout(10) << " async_read (was sparse_read) noted for " << soid << dendl;
4871 } else {
4872 dout(10) << " sparse read ended up empty for " << soid << dendl;
4873 map<uint64_t, uint64_t> extents;
4874 ::encode(extents, osd_op.outdata);
4875 }
4876 } else {
4877 // read into a buffer
4878 map<uint64_t, uint64_t> m;
4879 uint32_t total_read = 0;
4880 int r = osd->store->fiemap(ch, ghobject_t(soid, ghobject_t::NO_GEN,
4881 info.pgid.shard),
4882 op.extent.offset, op.extent.length, m);
4883 if (r < 0) {
4884 result = r;
4885 break;
4886 }
4887 map<uint64_t, uint64_t>::iterator miter;
4888 bufferlist data_bl;
4889 uint64_t last = op.extent.offset;
4890 for (miter = m.begin(); miter != m.end(); ++miter) {
4891 // verify hole?
4892 if (cct->_conf->osd_verify_sparse_read_holes &&
4893 last < miter->first) {
4894 bufferlist t;
4895 uint64_t len = miter->first - last;
4896 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4897 if (r < 0) {
4898 osd->clog->error() << coll << " " << soid
4899 << " sparse-read failed to read: "
4900 << r;
4901 } else if (!t.is_zero()) {
4902 osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
4903 << last << "~" << len;
4904 }
4905 }
4906
4907 bufferlist tmpbl;
4908 r = pgbackend->objects_read_sync(soid, miter->first, miter->second, op.flags, &tmpbl);
4909 if (r < 0) {
4910 result = r;
4911 break;
4912 }
4913
4914 if (r < (int)miter->second) /* this usually happens when the returned extent exceeds the actual object size */
4915 miter->second = r;
4916 total_read += r;
4917 dout(10) << "sparse-read " << miter->first << "@" << miter->second << dendl;
4918 data_bl.claim_append(tmpbl);
4919 last = miter->first + r;
4920 }
4921
4922 if (r < 0) {
4923 result = r;
4924 break;
4925 }
4926
4927 // verify trailing hole?
4928 if (cct->_conf->osd_verify_sparse_read_holes) {
4929 uint64_t end = MIN(op.extent.offset + op.extent.length, oi.size);
4930 if (last < end) {
4931 bufferlist t;
4932 uint64_t len = end - last;
4933 r = pgbackend->objects_read_sync(soid, last, len, op.flags, &t);
4934 if (r < 0) {
4935 osd->clog->error() << coll << " " << soid
4936 << " sparse-read failed to read: "
4937 << r;
4938 } else if (!t.is_zero()) {
4939 osd->clog->error() << coll << " " << soid << " sparse-read found data in hole "
4940 << last << "~" << len;
4941 }
4942 }
4943 }
4944
4945 // Why does SPARSE_READ need a checksum? librbd always uses sparse-read.
4946 // There may be few fully-written objects at first, but more appear with
4947 // continued use, so verifying the data digest on sparse reads makes sense.
4948 if (total_read == oi.size && oi.is_data_digest()) {
4949 uint32_t crc = data_bl.crc32c(-1);
4950 if (oi.data_digest != crc) {
4951 osd->clog->error() << info.pgid << std::hex
4952 << " full-object read crc 0x" << crc
4953 << " != expected 0x" << oi.data_digest
4954 << std::dec << " on " << soid;
4955 // FIXME fall back to replica or something?
4956 result = -EIO;
4957 break;
4958 }
4959 }
4960
4961 op.extent.length = total_read;
4962
4963 ::encode(m, osd_op.outdata); // re-encode since it might be modified
4964 ::encode_destructively(data_bl, osd_op.outdata);
4965
4966 dout(10) << " sparse_read got " << total_read << " bytes from object " << soid << dendl;
4967 }
4968 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(op.extent.length, 10);
4969 ctx->delta_stats.num_rd++;
4970 break;
4971
4972 case CEPH_OSD_OP_CALL:
4973 {
4974 string cname, mname;
4975 bufferlist indata;
4976 try {
4977 bp.copy(op.cls.class_len, cname);
4978 bp.copy(op.cls.method_len, mname);
4979 bp.copy(op.cls.indata_len, indata);
4980 } catch (buffer::error& e) {
4981 dout(10) << "call unable to decode class + method + indata" << dendl;
4982 dout(30) << "in dump: ";
4983 osd_op.indata.hexdump(*_dout);
4984 *_dout << dendl;
4985 result = -EINVAL;
4986 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, "???", "???");
4987 break;
4988 }
4989 tracepoint(osd, do_osd_op_pre_call, soid.oid.name.c_str(), soid.snap.val, cname.c_str(), mname.c_str());
4990
4991 ClassHandler::ClassData *cls;
4992 result = osd->class_handler->open_class(cname, &cls);
4993 assert(result == 0); // init_op_flags() already verified this works.
4994
4995 ClassHandler::ClassMethod *method = cls->get_method(mname.c_str());
4996 if (!method) {
4997 dout(10) << "call method " << cname << "." << mname << " does not exist" << dendl;
4998 result = -EOPNOTSUPP;
4999 break;
5000 }
5001
5002 int flags = method->get_flags();
5003 if (flags & CLS_METHOD_WR)
5004 ctx->user_modify = true;
5005
5006 bufferlist outdata;
5007 dout(10) << "call method " << cname << "." << mname << dendl;
5008 int prev_rd = ctx->num_read;
5009 int prev_wr = ctx->num_write;
5010 result = method->exec((cls_method_context_t)&ctx, indata, outdata);
5011
5012 if (ctx->num_read > prev_rd && !(flags & CLS_METHOD_RD)) {
5013 derr << "method " << cname << "." << mname << " tried to read object but is not marked RD" << dendl;
5014 result = -EIO;
5015 break;
5016 }
5017 if (ctx->num_write > prev_wr && !(flags & CLS_METHOD_WR)) {
5018 derr << "method " << cname << "." << mname << " tried to update object but is not marked WR" << dendl;
5019 result = -EIO;
5020 break;
5021 }
5022
5023 dout(10) << "method called response length=" << outdata.length() << dendl;
5024 op.extent.length = outdata.length();
5025 osd_op.outdata.claim_append(outdata);
5026 dout(30) << "out dump: ";
5027 osd_op.outdata.hexdump(*_dout);
5028 *_dout << dendl;
5029 }
5030 break;
5031
5032 case CEPH_OSD_OP_STAT:
5033 // note: stat does not require RD
5034 {
5035 tracepoint(osd, do_osd_op_pre_stat, soid.oid.name.c_str(), soid.snap.val);
5036
5037 if (obs.exists && !oi.is_whiteout()) {
5038 ::encode(oi.size, osd_op.outdata);
5039 ::encode(oi.mtime, osd_op.outdata);
5040 dout(10) << "stat oi has " << oi.size << " " << oi.mtime << dendl;
5041 } else {
5042 result = -ENOENT;
5043 dout(10) << "stat oi object does not exist" << dendl;
5044 }
5045
5046 ctx->delta_stats.num_rd++;
5047 }
5048 break;
5049
5050 case CEPH_OSD_OP_ISDIRTY:
5051 ++ctx->num_read;
5052 {
5053 tracepoint(osd, do_osd_op_pre_isdirty, soid.oid.name.c_str(), soid.snap.val);
5054 bool is_dirty = obs.oi.is_dirty();
5055 ::encode(is_dirty, osd_op.outdata);
5056 ctx->delta_stats.num_rd++;
5057 result = 0;
5058 }
5059 break;
5060
5061 case CEPH_OSD_OP_UNDIRTY:
5062 ++ctx->num_write;
5063 {
5064 tracepoint(osd, do_osd_op_pre_undirty, soid.oid.name.c_str(), soid.snap.val);
5065 if (oi.is_dirty()) {
5066 ctx->undirty = true; // see make_writeable()
5067 ctx->modify = true;
5068 ctx->delta_stats.num_wr++;
5069 }
5070 result = 0;
5071 }
5072 break;
5073
5074 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
5075 ++ctx->num_write;
5076 {
5077 tracepoint(osd, do_osd_op_pre_try_flush, soid.oid.name.c_str(), soid.snap.val);
5078 if (ctx->lock_type != ObjectContext::RWState::RWNONE) {
5079 dout(10) << "cache-try-flush without SKIPRWLOCKS flag set" << dendl;
5080 result = -EINVAL;
5081 break;
5082 }
5083 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5084 result = -EINVAL;
5085 break;
5086 }
5087 if (!obs.exists) {
5088 result = 0;
5089 break;
5090 }
5091 if (oi.is_cache_pinned()) {
5092 dout(10) << "cache-try-flush on a pinned object, consider unpin this object first" << dendl;
5093 result = -EPERM;
5094 break;
5095 }
5096 if (oi.is_dirty()) {
5097 result = start_flush(ctx->op, ctx->obc, false, NULL, boost::none);
5098 if (result == -EINPROGRESS)
5099 result = -EAGAIN;
5100 } else {
5101 result = 0;
5102 }
5103 }
5104 break;
5105
5106 case CEPH_OSD_OP_CACHE_FLUSH:
5107 ++ctx->num_write;
5108 {
5109 tracepoint(osd, do_osd_op_pre_cache_flush, soid.oid.name.c_str(), soid.snap.val);
5110 if (ctx->lock_type == ObjectContext::RWState::RWNONE) {
5111 dout(10) << "cache-flush with SKIPRWLOCKS flag set" << dendl;
5112 result = -EINVAL;
5113 break;
5114 }
5115 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5116 result = -EINVAL;
5117 break;
5118 }
5119 if (!obs.exists) {
5120 result = 0;
5121 break;
5122 }
5123 if (oi.is_cache_pinned()) {
5124 dout(10) << "cache-flush on a pinned object, consider unpin this object first" << dendl;
5125 result = -EPERM;
5126 break;
5127 }
5128 hobject_t missing;
5129 if (oi.is_dirty()) {
5130 result = start_flush(ctx->op, ctx->obc, true, &missing, boost::none);
5131 if (result == -EINPROGRESS)
5132 result = -EAGAIN;
5133 } else {
5134 result = 0;
5135 }
5136 // Check for the special return value indicating that 'missing' was set
5137 if (result == -ENOENT) {
5138 dout(10) << __func__ << " CEPH_OSD_OP_CACHE_FLUSH got ENOENT" << dendl;
5139 assert(!missing.is_min());
5140 wait_for_unreadable_object(missing, ctx->op);
5141 // Return the error code used elsewhere when wait_for_unreadable_object() is invoked
5142 result = -EAGAIN;
5143 }
5144 }
5145 break;
5146
5147 case CEPH_OSD_OP_CACHE_EVICT:
5148 ++ctx->num_write;
5149 {
5150 tracepoint(osd, do_osd_op_pre_cache_evict, soid.oid.name.c_str(), soid.snap.val);
5151 if (pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE) {
5152 result = -EINVAL;
5153 break;
5154 }
5155 if (!obs.exists) {
5156 result = 0;
5157 break;
5158 }
5159 if (oi.is_cache_pinned()) {
5160 dout(10) << "cache-evict on a pinned object, consider unpin this object first" << dendl;
5161 result = -EPERM;
5162 break;
5163 }
5164 if (oi.is_dirty()) {
5165 result = -EBUSY;
5166 break;
5167 }
5168 if (!oi.watchers.empty()) {
5169 result = -EBUSY;
5170 break;
5171 }
5172 if (soid.snap == CEPH_NOSNAP) {
5173 result = _verify_no_head_clones(soid, ssc->snapset);
5174 if (result < 0)
5175 break;
5176 }
5177 result = _delete_oid(ctx, true, false);
5178 if (result >= 0) {
5179 // mark that this is a cache eviction to avoid triggering normal
5180 // make_writeable() clone or snapdir object creation in finish_ctx()
5181 ctx->cache_evict = true;
5182 }
5183 osd->logger->inc(l_osd_tier_evict);
5184 }
5185 break;
5186
5187 case CEPH_OSD_OP_GETXATTR:
5188 ++ctx->num_read;
5189 {
5190 string aname;
5191 bp.copy(op.xattr.name_len, aname);
5192 tracepoint(osd, do_osd_op_pre_getxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5193 string name = "_" + aname;
5194 int r = getattr_maybe_cache(
5195 ctx->obc,
5196 name,
5197 &(osd_op.outdata));
5198 if (r >= 0) {
5199 op.xattr.value_len = osd_op.outdata.length();
5200 result = 0;
5201 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
5202 } else
5203 result = r;
5204
5205 ctx->delta_stats.num_rd++;
5206 }
5207 break;
5208
5209 case CEPH_OSD_OP_GETXATTRS:
5210 ++ctx->num_read;
5211 {
5212 tracepoint(osd, do_osd_op_pre_getxattrs, soid.oid.name.c_str(), soid.snap.val);
5213 map<string, bufferlist> out;
5214 result = getattrs_maybe_cache(
5215 ctx->obc,
5216 &out,
5217 true);
5218
5219 bufferlist bl;
5220 ::encode(out, bl);
5221 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(bl.length(), 10);
5222 ctx->delta_stats.num_rd++;
5223 osd_op.outdata.claim_append(bl);
5224 }
5225 break;
5226
5227 case CEPH_OSD_OP_CMPXATTR:
5228 ++ctx->num_read;
5229 {
5230 string aname;
5231 bp.copy(op.xattr.name_len, aname);
5232 tracepoint(osd, do_osd_op_pre_cmpxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
5233 string name = "_" + aname;
5234 name[op.xattr.name_len + 1] = 0;
5235
5236 bufferlist xattr;
5237 result = getattr_maybe_cache(
5238 ctx->obc,
5239 name,
5240 &xattr);
5241 if (result < 0 && result != -EEXIST && result != -ENODATA)
5242 break;
5243
5244 ctx->delta_stats.num_rd++;
5245 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(xattr.length(), 10);
5246
5247 switch (op.xattr.cmp_mode) {
5248 case CEPH_OSD_CMPXATTR_MODE_STRING:
5249 {
5250 string val;
5251 bp.copy(op.xattr.value_len, val);
5252 val[op.xattr.value_len] = 0;
5253 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << val
5254 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5255 result = do_xattr_cmp_str(op.xattr.cmp_op, val, xattr);
5256 }
5257 break;
5258
5259 case CEPH_OSD_CMPXATTR_MODE_U64:
5260 {
5261 uint64_t u64val;
5262 try {
5263 ::decode(u64val, bp);
5264 }
5265 catch (buffer::error& e) {
5266 result = -EINVAL;
5267 goto fail;
5268 }
5269 dout(10) << "CEPH_OSD_OP_CMPXATTR name=" << name << " val=" << u64val
5270 << " op=" << (int)op.xattr.cmp_op << " mode=" << (int)op.xattr.cmp_mode << dendl;
5271 result = do_xattr_cmp_u64(op.xattr.cmp_op, u64val, xattr);
5272 }
5273 break;
5274
5275 default:
5276 dout(10) << "bad cmp mode " << (int)op.xattr.cmp_mode << dendl;
5277 result = -EINVAL;
5278 }
5279
5280 if (!result) {
5281 dout(10) << "comparison returned false" << dendl;
5282 result = -ECANCELED;
5283 break;
5284 }
5285 if (result < 0) {
5286 dout(10) << "comparison returned " << result << " " << cpp_strerror(-result) << dendl;
5287 break;
5288 }
5289
5290 dout(10) << "comparison returned true" << dendl;
5291 }
5292 break;
5293
5294 case CEPH_OSD_OP_ASSERT_VER:
5295 ++ctx->num_read;
5296 {
5297 uint64_t ver = op.assert_ver.ver;
5298 tracepoint(osd, do_osd_op_pre_assert_ver, soid.oid.name.c_str(), soid.snap.val, ver);
5299 if (!ver)
5300 result = -EINVAL;
5301 else if (ver < oi.user_version)
5302 result = -ERANGE;
5303 else if (ver > oi.user_version)
5304 result = -EOVERFLOW;
5305 }
5306 break;
5307
5308 case CEPH_OSD_OP_LIST_WATCHERS:
5309 ++ctx->num_read;
5310 {
5311 tracepoint(osd, do_osd_op_pre_list_watchers, soid.oid.name.c_str(), soid.snap.val);
5312 obj_list_watch_response_t resp;
5313
5314 map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator oi_iter;
5315 for (oi_iter = oi.watchers.begin(); oi_iter != oi.watchers.end();
5316 ++oi_iter) {
5317 dout(20) << "key cookie=" << oi_iter->first.first
5318 << " entity=" << oi_iter->first.second << " "
5319 << oi_iter->second << dendl;
5320 assert(oi_iter->first.first == oi_iter->second.cookie);
5321 assert(oi_iter->first.second.is_client());
5322
5323 watch_item_t wi(oi_iter->first.second, oi_iter->second.cookie,
5324 oi_iter->second.timeout_seconds, oi_iter->second.addr);
5325 resp.entries.push_back(wi);
5326 }
5327
5328 resp.encode(osd_op.outdata, ctx->get_features());
5329 result = 0;
5330
5331 ctx->delta_stats.num_rd++;
5332 break;
5333 }
5334
5335 case CEPH_OSD_OP_LIST_SNAPS:
5336 ++ctx->num_read;
5337 {
5338 tracepoint(osd, do_osd_op_pre_list_snaps, soid.oid.name.c_str(), soid.snap.val);
5339 obj_list_snap_response_t resp;
5340
5341 if (!ssc) {
5342 ssc = ctx->obc->ssc = get_snapset_context(soid, false);
5343 }
5344 assert(ssc);
5345
5346 int clonecount = ssc->snapset.clones.size();
5347 if (ssc->snapset.head_exists)
5348 clonecount++;
5349 resp.clones.reserve(clonecount);
5350 for (auto clone_iter = ssc->snapset.clones.begin();
5351 clone_iter != ssc->snapset.clones.end(); ++clone_iter) {
5352 clone_info ci;
5353 ci.cloneid = *clone_iter;
5354
5355 hobject_t clone_oid = soid;
5356 clone_oid.snap = *clone_iter;
5357
5358 if (!ssc->snapset.is_legacy()) {
5359 auto p = ssc->snapset.clone_snaps.find(*clone_iter);
5360 if (p == ssc->snapset.clone_snaps.end()) {
5361 osd->clog->error() << "osd." << osd->whoami
5362 << ": inconsistent clone_snaps found for oid "
5363 << soid << " clone " << *clone_iter
5364 << " snapset " << ssc->snapset;
5365 result = -EINVAL;
5366 break;
5367 }
5368 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
5369 ci.snaps.push_back(*q);
5370 }
5371 } else {
5372 /* No need to take a lock here. We are only inspecting state cached
5373 * in the ObjectContext, so we aren't performing an actual read unless
5374 * the clone obc is not already loaded (in which case, it cannot have
5375 * an in progress write). We also do not risk exposing uncommitted
5376 * state since we do have a read lock on the head object or snapdir,
5377 * which we would have to write lock in order to make user visible
5378 * modifications to the snapshot state (snap trim related mutations
5379 * are not user visible).
5380 */
5381 if (is_missing_object(clone_oid)) {
5382 dout(20) << "LIST_SNAPS " << clone_oid << " missing" << dendl;
5383 wait_for_unreadable_object(clone_oid, ctx->op);
5384 result = -EAGAIN;
5385 break;
5386 }
5387
5388 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
5389 if (!clone_obc) {
5390 if (maybe_handle_cache(
5391 ctx->op, true, clone_obc, -ENOENT, clone_oid, true)) {
5392 // promoting the clone
5393 result = -EAGAIN;
5394 } else {
5395 osd->clog->error() << "osd." << osd->whoami
5396 << ": missing clone " << clone_oid
5397 << " for oid "
5398 << soid;
5399 // should not happen
5400 result = -ENOENT;
5401 }
5402 break;
5403 }
5404 for (vector<snapid_t>::reverse_iterator p =
5405 clone_obc->obs.oi.legacy_snaps.rbegin();
5406 p != clone_obc->obs.oi.legacy_snaps.rend();
5407 ++p) {
5408 ci.snaps.push_back(*p);
5409 }
5410 }
5411
5412 dout(20) << " clone " << *clone_iter << " snaps " << ci.snaps << dendl;
5413
5414 map<snapid_t, interval_set<uint64_t> >::const_iterator coi;
5415 coi = ssc->snapset.clone_overlap.find(ci.cloneid);
5416 if (coi == ssc->snapset.clone_overlap.end()) {
5417 osd->clog->error() << "osd." << osd->whoami
5418 << ": inconsistent clone_overlap found for oid "
5419 << soid << " clone " << *clone_iter;
5420 result = -EINVAL;
5421 break;
5422 }
5423 const interval_set<uint64_t> &o = coi->second;
5424 ci.overlap.reserve(o.num_intervals());
5425 for (interval_set<uint64_t>::const_iterator r = o.begin();
5426 r != o.end(); ++r) {
5427 ci.overlap.push_back(pair<uint64_t,uint64_t>(r.get_start(),
5428 r.get_len()));
5429 }
5430
5431 map<snapid_t, uint64_t>::const_iterator si;
5432 si = ssc->snapset.clone_size.find(ci.cloneid);
5433 if (si == ssc->snapset.clone_size.end()) {
5434 osd->clog->error() << "osd." << osd->whoami
5435 << ": inconsistent clone_size found for oid "
5436 << soid << " clone " << *clone_iter;
5437 result = -EINVAL;
5438 break;
5439 }
5440 ci.size = si->second;
5441
5442 resp.clones.push_back(ci);
5443 }
5444 if (result < 0) {
5445 break;
5446 }
5447 if (ssc->snapset.head_exists &&
5448 !ctx->obc->obs.oi.is_whiteout()) {
5449 assert(obs.exists);
5450 clone_info ci;
5451 ci.cloneid = CEPH_NOSNAP;
5452
5453 // Size for HEAD is oi.size
5454 ci.size = oi.size;
5455
5456 resp.clones.push_back(ci);
5457 }
5458 resp.seq = ssc->snapset.seq;
5459
5460 resp.encode(osd_op.outdata);
5461 result = 0;
5462
5463 ctx->delta_stats.num_rd++;
5464 break;
5465 }
5466
5467 case CEPH_OSD_OP_NOTIFY:
5468 ++ctx->num_read;
5469 {
5470 uint32_t timeout;
5471 bufferlist bl;
5472
5473 try {
5474 uint32_t ver; // obsolete
5475 ::decode(ver, bp);
5476 ::decode(timeout, bp);
5477 ::decode(bl, bp);
5478 } catch (const buffer::error &e) {
5479 timeout = 0;
5480 }
5481 tracepoint(osd, do_osd_op_pre_notify, soid.oid.name.c_str(), soid.snap.val, timeout);
5482 if (!timeout)
5483 timeout = cct->_conf->osd_default_notify_timeout;
5484
5485 notify_info_t n;
5486 n.timeout = timeout;
5487 n.notify_id = osd->get_next_id(get_osdmap()->get_epoch());
5488 n.cookie = op.watch.cookie;
5489 n.bl = bl;
5490 ctx->notifies.push_back(n);
5491
5492 // return our unique notify id to the client
5493 ::encode(n.notify_id, osd_op.outdata);
5494 }
5495 break;
5496
5497 case CEPH_OSD_OP_NOTIFY_ACK:
5498 ++ctx->num_read;
5499 {
5500 try {
5501 uint64_t notify_id = 0;
5502 uint64_t watch_cookie = 0;
5503 ::decode(notify_id, bp);
5504 ::decode(watch_cookie, bp);
5505 bufferlist reply_bl;
5506 if (!bp.end()) {
5507 ::decode(reply_bl, bp);
5508 }
5509 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, notify_id, watch_cookie, "Y");
5510 OpContext::NotifyAck ack(notify_id, watch_cookie, reply_bl);
5511 ctx->notify_acks.push_back(ack);
5512 } catch (const buffer::error &e) {
5513 tracepoint(osd, do_osd_op_pre_notify_ack, soid.oid.name.c_str(), soid.snap.val, op.watch.cookie, 0, "N");
5514 OpContext::NotifyAck ack(
5515 // op.watch.cookie is actually the notify_id for historical reasons
5516 op.watch.cookie
5517 );
5518 ctx->notify_acks.push_back(ack);
5519 }
5520 }
5521 break;
5522
5523 case CEPH_OSD_OP_SETALLOCHINT:
5524 ++ctx->num_write;
5525 {
5526 tracepoint(osd, do_osd_op_pre_setallochint, soid.oid.name.c_str(), soid.snap.val, op.alloc_hint.expected_object_size, op.alloc_hint.expected_write_size);
5527 maybe_create_new_object(ctx);
5528 oi.expected_object_size = op.alloc_hint.expected_object_size;
5529 oi.expected_write_size = op.alloc_hint.expected_write_size;
5530 oi.alloc_hint_flags = op.alloc_hint.flags;
5531 t->set_alloc_hint(soid, op.alloc_hint.expected_object_size,
5532 op.alloc_hint.expected_write_size,
5533 op.alloc_hint.flags);
5534 ctx->delta_stats.num_wr++;
5535 result = 0;
5536 }
5537 break;
5538
5539
5540 // --- WRITES ---
5541
5542 // -- object data --
5543
5544 case CEPH_OSD_OP_WRITE:
5545 ++ctx->num_write;
5546 { // write
5547 __u32 seq = oi.truncate_seq;
5548 tracepoint(osd, do_osd_op_pre_write, soid.oid.name.c_str(), soid.snap.val, oi.size, seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5549 if (op.extent.length != osd_op.indata.length()) {
5550 result = -EINVAL;
5551 break;
5552 }
5553
5554 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5555 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5556
5557 if (pool.info.requires_aligned_append() &&
5558 (op.extent.offset % pool.info.required_alignment() != 0)) {
5559 result = -EOPNOTSUPP;
5560 break;
5561 }
5562
5563 if (!obs.exists) {
5564 if (pool.info.requires_aligned_append() && op.extent.offset) {
5565 result = -EOPNOTSUPP;
5566 break;
5567 }
5568 } else if (op.extent.offset != oi.size &&
5569 pool.info.requires_aligned_append()) {
5570 result = -EOPNOTSUPP;
5571 break;
5572 }
5573
5574 if (seq && (seq > op.extent.truncate_seq) &&
5575 (op.extent.offset + op.extent.length > oi.size)) {
5576 // old write, arrived after trimtrunc
5577 op.extent.length = (op.extent.offset > oi.size ? 0 : oi.size - op.extent.offset);
5578 dout(10) << " old truncate_seq " << op.extent.truncate_seq << " < current " << seq
5579 << ", adjusting write length to " << op.extent.length << dendl;
5580 bufferlist t;
5581 t.substr_of(osd_op.indata, 0, op.extent.length);
5582 osd_op.indata.swap(t);
5583 }
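// Worked example of the clip above: oi.size=100, current truncate_seq=6;
// a straggling write (truncate_seq=5) of 50 bytes at offset 80 is clipped
// to 100-80=20 bytes so it cannot resurrect data past the trimtrunc
// point, and a write entirely beyond oi.size is clipped to length 0.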
5584 if (op.extent.truncate_seq > seq) {
5585 // write arrives before trimtrunc
5586 if (obs.exists && !oi.is_whiteout()) {
5587 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5588 << ", truncating to " << op.extent.truncate_size << dendl;
5589 t->truncate(soid, op.extent.truncate_size);
5590 oi.truncate_seq = op.extent.truncate_seq;
5591 oi.truncate_size = op.extent.truncate_size;
5592 if (op.extent.truncate_size != oi.size) {
5593 ctx->delta_stats.num_bytes -= oi.size;
5594 ctx->delta_stats.num_bytes += op.extent.truncate_size;
5595 oi.size = op.extent.truncate_size;
5596 }
5597 } else {
5598 dout(10) << " truncate_seq " << op.extent.truncate_seq << " > current " << seq
5599 << ", but object is new" << dendl;
5600 oi.truncate_seq = op.extent.truncate_seq;
5601 oi.truncate_size = op.extent.truncate_size;
5602 }
5603 }
5604 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5605 if (result < 0)
5606 break;
5607
5608 maybe_create_new_object(ctx);
5609
5610 if (op.extent.length == 0) {
5611 if (op.extent.offset > oi.size) {
5612 t->truncate(
5613 soid, op.extent.offset);
5614 } else {
5615 t->nop(soid);
5616 }
5617 } else {
5618 t->write(
5619 soid, op.extent.offset, op.extent.length, osd_op.indata, op.flags);
5620 }
5621
5622 if (op.extent.offset == 0 && op.extent.length >= oi.size)
5623 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5624 else if (op.extent.offset == oi.size && obs.oi.is_data_digest())
5625 obs.oi.set_data_digest(osd_op.indata.crc32c(obs.oi.data_digest));
5626 else
5627 obs.oi.clear_data_digest();
5628 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5629 op.extent.offset, op.extent.length);
5630
5631 }
5632 break;
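// The digest bookkeeping above relies on crc32c being resumable: a full
// overwrite is seeded with -1, while a pure append is seeded with the old
// digest, which yields the digest of the concatenated bytes, e.g.
//
//   uint32_t d  = head_bl.crc32c(-1);  // digest of the existing data
//   uint32_t d2 = tail_bl.crc32c(d);   // == crc32c of head_bl + tail_bl
//
// Any other overlap pattern leaves the whole-object digest unknown, hence
// clear_data_digest().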
5633
5634 case CEPH_OSD_OP_WRITEFULL:
5635 ++ctx->num_write;
5636 { // write full object
5637 tracepoint(osd, do_osd_op_pre_writefull, soid.oid.name.c_str(), soid.snap.val, oi.size, 0, op.extent.length);
5638
5639 if (op.extent.length != osd_op.indata.length()) {
5640 result = -EINVAL;
5641 break;
5642 }
5643 result = check_offset_and_length(0, op.extent.length, cct->_conf->osd_max_object_size);
5644 if (result < 0)
5645 break;
5646
5647 if (pool.info.has_flag(pg_pool_t::FLAG_WRITE_FADVISE_DONTNEED))
5648 op.flags = op.flags | CEPH_OSD_OP_FLAG_FADVISE_DONTNEED;
5649
5650 maybe_create_new_object(ctx);
5651 if (pool.info.require_rollback()) {
5652 t->truncate(soid, 0);
5653 } else if (obs.exists && op.extent.length < oi.size) {
5654 t->truncate(soid, op.extent.length);
5655 }
5656 if (op.extent.length) {
5657 t->write(soid, 0, op.extent.length, osd_op.indata, op.flags);
5658 }
5659 obs.oi.set_data_digest(osd_op.indata.crc32c(-1));
5660
5661 write_update_size_and_usage(ctx->delta_stats, oi, ctx->modified_ranges,
5662 0, op.extent.length, true);
5663 }
5664 break;
5665
5666 case CEPH_OSD_OP_WRITESAME:
5667 ++ctx->num_write;
5668 tracepoint(osd, do_osd_op_pre_writesame, soid.oid.name.c_str(), soid.snap.val, oi.size, op.writesame.offset, op.writesame.length, op.writesame.data_length);
5669 result = do_writesame(ctx, osd_op);
5670 break;
5671
5672 case CEPH_OSD_OP_ROLLBACK :
5673 ++ctx->num_write;
5674 tracepoint(osd, do_osd_op_pre_rollback, soid.oid.name.c_str(), soid.snap.val);
5675 result = _rollback_to(ctx, op);
5676 break;
5677
5678 case CEPH_OSD_OP_ZERO:
5679 tracepoint(osd, do_osd_op_pre_zero, soid.oid.name.c_str(), soid.snap.val, op.extent.offset, op.extent.length);
5680 if (pool.info.requires_aligned_append()) {
5681 result = -EOPNOTSUPP;
5682 break;
5683 }
5684 ++ctx->num_write;
5685 { // zero
5686 result = check_offset_and_length(op.extent.offset, op.extent.length, cct->_conf->osd_max_object_size);
5687 if (result < 0)
5688 break;
5689 assert(op.extent.length);
5690 if (obs.exists && !oi.is_whiteout()) {
5691 t->zero(soid, op.extent.offset, op.extent.length);
5692 interval_set<uint64_t> ch;
5693 ch.insert(op.extent.offset, op.extent.length);
5694 ctx->modified_ranges.union_of(ch);
5695 ctx->delta_stats.num_wr++;
5696 oi.clear_data_digest();
5697 } else {
5698 // no-op
5699 }
5700 }
5701 break;
5702 case CEPH_OSD_OP_CREATE:
5703 ++ctx->num_write;
5704 {
5705 tracepoint(osd, do_osd_op_pre_create, soid.oid.name.c_str(), soid.snap.val);
5706 int flags = le32_to_cpu(op.flags);
5707 if (obs.exists && !oi.is_whiteout() &&
5708 (flags & CEPH_OSD_OP_FLAG_EXCL)) {
5709 result = -EEXIST; /* this is an exclusive create */
5710 } else {
5711 if (osd_op.indata.length()) {
5712 bufferlist::iterator p = osd_op.indata.begin();
5713 string category;
5714 try {
5715 ::decode(category, p);
5716 }
5717 catch (buffer::error& e) {
5718 result = -EINVAL;
5719 goto fail;
5720 }
5721 // category is no longer implemented.
5722 }
5723 if (result >= 0) {
5724 maybe_create_new_object(ctx);
5725 t->nop(soid);
5726 }
5727 }
5728 }
5729 break;
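// Client-side view of the EXCL handling above (a sketch; the object name
// is illustrative, assuming the public librados C++ API):
//
//   int r = ioctx.create("myobj", true /* exclusive */);
//   // r == -EEXIST if the head already exists and is not a whiteout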
5730
5731 case CEPH_OSD_OP_TRIMTRUNC:
5732 op.extent.offset = op.extent.truncate_size;
5733 // falling through
5734
5735 case CEPH_OSD_OP_TRUNCATE:
5736 tracepoint(osd, do_osd_op_pre_truncate, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
5737 if (pool.info.requires_aligned_append()) {
5738 result = -EOPNOTSUPP;
5739 break;
5740 }
5741 ++ctx->num_write;
5742 {
5743 // truncate
5744 if (!obs.exists || oi.is_whiteout()) {
5745 dout(10) << " object dne, truncate is a no-op" << dendl;
5746 break;
5747 }
5748
5749 if (op.extent.offset > cct->_conf->osd_max_object_size) {
5750 result = -EFBIG;
5751 break;
5752 }
5753
5754 if (op.extent.truncate_seq) {
5755 assert(op.extent.offset == op.extent.truncate_size);
5756 if (op.extent.truncate_seq <= oi.truncate_seq) {
5757 dout(10) << " truncate seq " << op.extent.truncate_seq << " <= current " << oi.truncate_seq
5758 << ", no-op" << dendl;
5759 break; // old
5760 }
5761 dout(10) << " truncate seq " << op.extent.truncate_seq << " > current " << oi.truncate_seq
5762 << ", truncating" << dendl;
5763 oi.truncate_seq = op.extent.truncate_seq;
5764 oi.truncate_size = op.extent.truncate_size;
5765 }
5766
5767 maybe_create_new_object(ctx);
5768 t->truncate(soid, op.extent.offset);
5769 if (oi.size > op.extent.offset) {
5770 interval_set<uint64_t> trim;
5771 trim.insert(op.extent.offset, oi.size-op.extent.offset);
5772 ctx->modified_ranges.union_of(trim);
5773 }
5774 if (op.extent.offset != oi.size) {
5775 ctx->delta_stats.num_bytes -= oi.size;
5776 ctx->delta_stats.num_bytes += op.extent.offset;
5777 oi.size = op.extent.offset;
5778 }
5779 ctx->delta_stats.num_wr++;
5780 // do not set exists, or we will break the DELETE -> TRUNCATE munging above.
5781
5782 oi.clear_data_digest();
5783 }
5784 break;
5785
5786 case CEPH_OSD_OP_DELETE:
5787 ++ctx->num_write;
5788 tracepoint(osd, do_osd_op_pre_delete, soid.oid.name.c_str(), soid.snap.val);
5789 {
5790 result = _delete_oid(ctx, false, ctx->ignore_cache);
5791 }
5792 break;
5793
5794 case CEPH_OSD_OP_WATCH:
5795 ++ctx->num_write;
5796 {
5797 tracepoint(osd, do_osd_op_pre_watch, soid.oid.name.c_str(), soid.snap.val,
5798 op.watch.cookie, op.watch.op);
5799 if (!obs.exists) {
5800 result = -ENOENT;
5801 break;
5802 }
5803 uint64_t cookie = op.watch.cookie;
5804 entity_name_t entity = ctx->reqid.name;
5805 ObjectContextRef obc = ctx->obc;
5806
5807 dout(10) << "watch " << ceph_osd_watch_op_name(op.watch.op)
5808 << ": ctx->obc=" << (void *)obc.get() << " cookie=" << cookie
5809 << " oi.version=" << oi.version.version << " ctx->at_version=" << ctx->at_version << dendl;
5810 dout(10) << "watch: oi.user_version=" << oi.user_version<< dendl;
5811 dout(10) << "watch: peer_addr="
5812 << ctx->op->get_req()->get_connection()->get_peer_addr() << dendl;
5813
5814 uint32_t timeout = cct->_conf->osd_client_watch_timeout;
5815 if (op.watch.timeout != 0) {
5816 timeout = op.watch.timeout;
5817 }
5818
5819 watch_info_t w(cookie, timeout,
5820 ctx->op->get_req()->get_connection()->get_peer_addr());
5821 if (op.watch.op == CEPH_OSD_WATCH_OP_WATCH ||
5822 op.watch.op == CEPH_OSD_WATCH_OP_LEGACY_WATCH) {
5823 if (oi.watchers.count(make_pair(cookie, entity))) {
5824 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5825 } else {
5826 dout(10) << " registered new watch " << w << " by " << entity << dendl;
5827 oi.watchers[make_pair(cookie, entity)] = w;
5828 t->nop(soid); // make sure we update the object_info on disk!
5829 }
5830 bool will_ping = (op.watch.op == CEPH_OSD_WATCH_OP_WATCH);
5831 ctx->watch_connects.push_back(make_pair(w, will_ping));
5832 } else if (op.watch.op == CEPH_OSD_WATCH_OP_RECONNECT) {
5833 if (!oi.watchers.count(make_pair(cookie, entity))) {
5834 result = -ENOTCONN;
5835 break;
5836 }
5837 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5838 ctx->watch_connects.push_back(make_pair(w, true));
5839 } else if (op.watch.op == CEPH_OSD_WATCH_OP_PING) {
5840 /* Note: WATCH with PING doesn't cause may_write() to return true,
5841 * so if there is nothing else in the transaction, this is going
5842 * to run do_osd_op_effects, but not write out a log entry */
5843 if (!oi.watchers.count(make_pair(cookie, entity))) {
5844 result = -ENOTCONN;
5845 break;
5846 }
5847 map<pair<uint64_t,entity_name_t>,WatchRef>::iterator p =
5848 obc->watchers.find(make_pair(cookie, entity));
5849 if (p == obc->watchers.end() ||
5850 !p->second->is_connected()) {
5851 // client needs to reconnect
5852 result = -ETIMEDOUT;
5853 break;
5854 }
5855 dout(10) << " found existing watch " << w << " by " << entity << dendl;
5856 p->second->got_ping(ceph_clock_now());
5857 result = 0;
5858 } else if (op.watch.op == CEPH_OSD_WATCH_OP_UNWATCH) {
5859 map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator oi_iter =
5860 oi.watchers.find(make_pair(cookie, entity));
5861 if (oi_iter != oi.watchers.end()) {
5862 dout(10) << " removed watch " << oi_iter->second << " by "
5863 << entity << dendl;
5864 oi.watchers.erase(oi_iter);
5865 t->nop(soid); // update oi on disk
5866 ctx->watch_disconnects.push_back(
5867 watch_disconnect_t(cookie, entity, false));
5868 } else {
5869 dout(10) << " can't remove: no watch by " << entity << dendl;
5870 }
5871 }
5872 }
5873 break;
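// Client-side lifecycle matching the WATCH/RECONNECT/PING/UNWATCH subops
// above (a sketch; names are illustrative, assuming the public librados
// C++ API):
//
//   uint64_t cookie;
//   ioctx.watch2("myobj", &cookie, &watch_ctx);  // WATCH, with pinging
//   ioctx.unwatch2(cookie);                      // UNWATCH
//
// librados pings the watch in the background; a ping that finds no
// connected server-side watch surfaces as -ENOTCONN or -ETIMEDOUT, and
// the client is expected to re-watch.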
5874
5875 case CEPH_OSD_OP_CACHE_PIN:
5876 tracepoint(osd, do_osd_op_pre_cache_pin, soid.oid.name.c_str(), soid.snap.val);
5877 if ((!pool.info.is_tier() ||
5878 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
5879 result = -EINVAL;
5880 dout(10) << " pin object is only allowed on the cache tier " << dendl;
5881 break;
5882 }
5883 ++ctx->num_write;
5884 {
5885 if (!obs.exists || oi.is_whiteout()) {
5886 result = -ENOENT;
5887 break;
5888 }
5889
5890 if (!oi.is_cache_pinned()) {
5891 oi.set_flag(object_info_t::FLAG_CACHE_PIN);
5892 ctx->modify = true;
5893 ctx->delta_stats.num_objects_pinned++;
5894 ctx->delta_stats.num_wr++;
5895 }
5896 result = 0;
5897 }
5898 break;
5899
5900 case CEPH_OSD_OP_CACHE_UNPIN:
5901 tracepoint(osd, do_osd_op_pre_cache_unpin, soid.oid.name.c_str(), soid.snap.val);
5902 if ((!pool.info.is_tier() ||
5903 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE)) {
5904 result = -EINVAL;
5905 dout(10) << " unpin object is only allowed on the cache tier " << dendl;
5906 break;
5907 }
5908 ++ctx->num_write;
5909 {
5910 if (!obs.exists || oi.is_whiteout()) {
5911 result = -ENOENT;
5912 break;
5913 }
5914
5915 if (oi.is_cache_pinned()) {
5916 oi.clear_flag(object_info_t::FLAG_CACHE_PIN);
5917 ctx->modify = true;
5918 ctx->delta_stats.num_objects_pinned--;
5919 ctx->delta_stats.num_wr++;
5920 }
5921 result = 0;
5922 }
5923 break;
5924
5925 case CEPH_OSD_OP_SET_REDIRECT:
5926 ++ctx->num_write;
5927 {
5928 if (pool.info.is_tier()) {
5929 result = -EINVAL;
5930 break;
5931 }
5932 if (!obs.exists) {
5933 result = -ENOENT;
5934 break;
5935 }
5936 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5937 result = -EOPNOTSUPP;
5938 break;
5939 }
5940
5941 object_t target_name;
5942 object_locator_t target_oloc;
5943 snapid_t target_snapid = (uint64_t)op.copy_from.snapid;
5944 version_t target_version = op.copy_from.src_version;
5945 try {
5946 ::decode(target_name, bp);
5947 ::decode(target_oloc, bp);
5948 }
5949 catch (buffer::error& e) {
5950 result = -EINVAL;
5951 goto fail;
5952 }
5953 pg_t raw_pg;
5954 get_osdmap()->object_locator_to_pg(target_name, target_oloc, raw_pg);
5955 hobject_t target(target_name, target_oloc.key, target_snapid,
5956 raw_pg.ps(), raw_pg.pool(),
5957 target_oloc.nspace);
5958 if (target == soid) {
5959 dout(20) << " set-redirect self is invalid" << dendl;
5960 result = -EINVAL;
5961 break;
5962 }
5963 oi.set_flag(object_info_t::FLAG_MANIFEST);
5964 oi.manifest.redirect_target = target;
5965 oi.manifest.type = object_manifest_t::TYPE_REDIRECT;
5966 t->truncate(soid, 0);
5967 if (oi.is_omap() && pool.info.supports_omap()) {
5968 t->omap_clear(soid);
5969 obs.oi.clear_omap_digest();
5970 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
5971 }
5972 ctx->delta_stats.num_bytes -= oi.size;
5973 oi.size = 0;
5974 oi.new_object();
5975 oi.user_version = target_version;
5976 ctx->user_at_version = target_version;
5977 /* rm_attrs */
5978 map<string,bufferlist> rmattrs;
5979 result = getattrs_maybe_cache(ctx->obc,
5980 &rmattrs,
5981 true);
5982 if (result < 0) {
5983 return result;
5984 }
5985 map<string, bufferlist>::iterator iter;
5986 for (iter = rmattrs.begin(); iter != rmattrs.end(); ++iter) {
5987 const string& name = iter->first;
5988 t->rmattr(soid, name);
5989 }
5990 dout(10) << "set-redirect oid:" << oi.soid << " user_version: " << oi.user_version << dendl;
5991 }
5992
5993 break;
5994
5995 // -- object attrs --
5996
5997 case CEPH_OSD_OP_SETXATTR:
5998 ++ctx->num_write;
5999 {
6000 if (cct->_conf->osd_max_attr_size > 0 &&
6001 op.xattr.value_len > cct->_conf->osd_max_attr_size) {
6002 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, "???");
6003 result = -EFBIG;
6004 break;
6005 }
6006 unsigned max_name_len = MIN(osd->store->get_max_attr_name_length(),
6007 cct->_conf->osd_max_attr_name_len);
6008 if (op.xattr.name_len > max_name_len) {
6009 result = -ENAMETOOLONG;
6010 break;
6011 }
6012 maybe_create_new_object(ctx);
6013 string aname;
6014 bp.copy(op.xattr.name_len, aname);
6015 tracepoint(osd, do_osd_op_pre_setxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6016 string name = "_" + aname;
6017 bufferlist bl;
6018 bp.copy(op.xattr.value_len, bl);
6019 t->setattr(soid, name, bl);
6020 ctx->delta_stats.num_wr++;
6021 }
6022 break;
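// Note the "_" prefix above: user xattrs share the object's attr
// namespace with internal metadata (e.g. OI_ATTR, SS_ATTR), so a user key
// "foo" is stored on disk as "_foo" and translated back on read.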
6023
6024 case CEPH_OSD_OP_RMXATTR:
6025 ++ctx->num_write;
6026 {
6027 string aname;
6028 bp.copy(op.xattr.name_len, aname);
6029 tracepoint(osd, do_osd_op_pre_rmxattr, soid.oid.name.c_str(), soid.snap.val, aname.c_str());
6030 if (!obs.exists || oi.is_whiteout()) {
6031 result = -ENOENT;
6032 break;
6033 }
6034 string name = "_" + aname;
6035 t->rmattr(soid, name);
6036 ctx->delta_stats.num_wr++;
6037 }
6038 break;
6039
6040
6041 // -- fancy writers --
6042 case CEPH_OSD_OP_APPEND:
6043 {
6044 tracepoint(osd, do_osd_op_pre_append, soid.oid.name.c_str(), soid.snap.val, oi.size, oi.truncate_seq, op.extent.offset, op.extent.length, op.extent.truncate_size, op.extent.truncate_seq);
6045 // just do it inline; this works because we are happy to execute
6046 // fancy ops on replicas as well.
6047 vector<OSDOp> nops(1);
6048 OSDOp& newop = nops[0];
6049 newop.op.op = CEPH_OSD_OP_WRITE;
6050 newop.op.extent.offset = oi.size;
6051 newop.op.extent.length = op.extent.length;
6052 newop.op.extent.truncate_seq = oi.truncate_seq;
6053 newop.indata = osd_op.indata;
6054 result = do_osd_ops(ctx, nops);
6055 osd_op.outdata.claim(newop.outdata);
6056 }
6057 break;
6058
6059 case CEPH_OSD_OP_STARTSYNC:
6060 tracepoint(osd, do_osd_op_pre_startsync, soid.oid.name.c_str(), soid.snap.val);
6061 t->nop(soid);
6062 break;
6063
6064
6065 // -- trivial map --
6066 case CEPH_OSD_OP_TMAPGET:
6067 tracepoint(osd, do_osd_op_pre_tmapget, soid.oid.name.c_str(), soid.snap.val);
6068 if (pool.info.require_rollback()) {
6069 result = -EOPNOTSUPP;
6070 break;
6071 }
6072 {
6073 vector<OSDOp> nops(1);
6074 OSDOp& newop = nops[0];
6075 newop.op.op = CEPH_OSD_OP_SYNC_READ;
6076 newop.op.extent.offset = 0;
6077 newop.op.extent.length = 0;
6078 do_osd_ops(ctx, nops);
6079 osd_op.outdata.claim(newop.outdata);
6080 }
6081 break;
6082
6083 case CEPH_OSD_OP_TMAPPUT:
6084 tracepoint(osd, do_osd_op_pre_tmapput, soid.oid.name.c_str(), soid.snap.val);
6085 if (pool.info.require_rollback()) {
6086 result = -EOPNOTSUPP;
6087 break;
6088 }
6089 {
6090 //_dout_lock.Lock();
6091 //osd_op.data.hexdump(*_dout);
6092 //_dout_lock.Unlock();
6093
6094 // verify sort order
6095 bool unsorted = false;
6096 if (true) {
6097 bufferlist header;
6098 ::decode(header, bp);
6099 uint32_t n;
6100 ::decode(n, bp);
6101 string last_key;
6102 while (n--) {
6103 string key;
6104 ::decode(key, bp);
6105 dout(10) << "tmapput key " << key << dendl;
6106 bufferlist val;
6107 ::decode(val, bp);
6108 if (key < last_key) {
6109 dout(10) << "TMAPPUT is unordered; resorting" << dendl;
6110 unsorted = true;
6111 break;
6112 }
6113 last_key = key;
6114 }
6115 }
6116
6117 // write it
6118 vector<OSDOp> nops(1);
6119 OSDOp& newop = nops[0];
6120 newop.op.op = CEPH_OSD_OP_WRITEFULL;
6121 newop.op.extent.offset = 0;
6122 newop.op.extent.length = osd_op.indata.length();
6123 newop.indata = osd_op.indata;
6124
6125 if (unsorted) {
6126 bp = osd_op.indata.begin();
6127 bufferlist header;
6128 map<string, bufferlist> m;
6129 ::decode(header, bp);
6130 ::decode(m, bp);
6131 assert(bp.end());
6132 bufferlist newbl;
6133 ::encode(header, newbl);
6134 ::encode(m, newbl);
6135 newop.indata = newbl;
6136 }
6137 result = do_osd_ops(ctx, nops);
6138 assert(result == 0);
6139 }
6140 break;
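// A tmap payload, as validated above, is simply an encoded header plus a
// key-sorted map; re-encoding through std::map restores the order:
//
//   bufferlist header, bl;
//   map<string, bufferlist> m;   // std::map iterates in key order
//   ::encode(header, bl);
//   ::encode(m, bl);             // same wire format the loop above checks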
6141
6142 case CEPH_OSD_OP_TMAPUP:
6143 tracepoint(osd, do_osd_op_pre_tmapup, soid.oid.name.c_str(), soid.snap.val);
6144 if (pool.info.require_rollback()) {
6145 result = -EOPNOTSUPP;
6146 break;
6147 }
6148 ++ctx->num_write;
6149 result = do_tmapup(ctx, bp, osd_op);
6150 break;
6151
6152 case CEPH_OSD_OP_TMAP2OMAP:
6153 ++ctx->num_write;
6154 tracepoint(osd, do_osd_op_pre_tmap2omap, soid.oid.name.c_str(), soid.snap.val);
6155 result = do_tmap2omap(ctx, op.tmap2omap.flags);
6156 break;
6157
6158 // OMAP Read ops
6159 case CEPH_OSD_OP_OMAPGETKEYS:
6160 ++ctx->num_read;
6161 {
6162 string start_after;
6163 uint64_t max_return;
6164 try {
6165 ::decode(start_after, bp);
6166 ::decode(max_return, bp);
6167 }
6168 catch (buffer::error& e) {
6169 result = -EINVAL;
6170 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, "???", 0);
6171 goto fail;
6172 }
6173 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6174 max_return = cct->_conf->osd_max_omap_entries_per_request;
6175 }
6176 tracepoint(osd, do_osd_op_pre_omapgetkeys, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return);
6177
6178 bufferlist bl;
6179 uint32_t num = 0;
6180 bool truncated = false;
6181 if (oi.is_omap()) {
6182 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6183 coll, ghobject_t(soid)
6184 );
6185 assert(iter);
6186 iter->upper_bound(start_after);
6187 for (num = 0; iter->valid(); ++num, iter->next(false)) {
6188 if (num >= max_return ||
6189 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6190 truncated = true;
6191 break;
6192 }
6193 ::encode(iter->key(), bl);
6194 }
6195 } // else return empty out_set
6196 ::encode(num, osd_op.outdata);
6197 osd_op.outdata.claim_append(bl);
6198 ::encode(truncated, osd_op.outdata);
6199 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6200 ctx->delta_stats.num_rd++;
6201 }
6202 break;
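// The (start_after, max_return, truncated) triple above lets clients page
// through large omaps; a sketch of the client loop (illustrative names,
// assuming the librados C++ convenience wrapper):
//
//   std::set<std::string> keys;
//   std::string cursor;          // last key of the previous page
//   do {
//     keys.clear();
//     ioctx.omap_get_keys("myobj", cursor, 500, &keys);
//     if (!keys.empty()) cursor = *keys.rbegin();
//   } while (!keys.empty());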
6203
6204 case CEPH_OSD_OP_OMAPGETVALS:
6205 ++ctx->num_read;
6206 {
6207 string start_after;
6208 uint64_t max_return;
6209 string filter_prefix;
6210 try {
6211 ::decode(start_after, bp);
6212 ::decode(max_return, bp);
6213 ::decode(filter_prefix, bp);
6214 }
6215 catch (buffer::error& e) {
6216 result = -EINVAL;
6217 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, "???", 0, "???");
6218 goto fail;
6219 }
6220 if (max_return > cct->_conf->osd_max_omap_entries_per_request) {
6221 max_return = cct->_conf->osd_max_omap_entries_per_request;
6222 }
6223 tracepoint(osd, do_osd_op_pre_omapgetvals, soid.oid.name.c_str(), soid.snap.val, start_after.c_str(), max_return, filter_prefix.c_str());
6224
6225 uint32_t num = 0;
6226 bool truncated = false;
6227 bufferlist bl;
6228 if (oi.is_omap()) {
6229 ObjectMap::ObjectMapIterator iter = osd->store->get_omap_iterator(
6230 coll, ghobject_t(soid)
6231 );
6232 if (!iter) {
6233 result = -ENOENT;
6234 goto fail;
6235 }
6236 iter->upper_bound(start_after);
6237 if (filter_prefix > start_after) iter->lower_bound(filter_prefix);
6238 for (num = 0;
6239 iter->valid() &&
6240 iter->key().substr(0, filter_prefix.size()) == filter_prefix;
6241 ++num, iter->next(false)) {
6242 dout(20) << "Found key " << iter->key() << dendl;
6243 if (num >= max_return ||
6244 bl.length() >= cct->_conf->osd_max_omap_bytes_per_request) {
6245 truncated = true;
6246 break;
6247 }
6248 ::encode(iter->key(), bl);
6249 ::encode(iter->value(), bl);
6250 }
6251 } // else return empty out_set
6252 ::encode(num, osd_op.outdata);
6253 osd_op.outdata.claim_append(bl);
6254 ::encode(truncated, osd_op.outdata);
6255 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6256 ctx->delta_stats.num_rd++;
6257 }
6258 break;
6259
6260 case CEPH_OSD_OP_OMAPGETHEADER:
6261 tracepoint(osd, do_osd_op_pre_omapgetheader, soid.oid.name.c_str(), soid.snap.val);
6262 if (!oi.is_omap()) {
6263 // return empty header
6264 break;
6265 }
6266 ++ctx->num_read;
6267 {
6268 osd->store->omap_get_header(ch, ghobject_t(soid), &osd_op.outdata);
6269 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6270 ctx->delta_stats.num_rd++;
6271 }
6272 break;
6273
6274 case CEPH_OSD_OP_OMAPGETVALSBYKEYS:
6275 ++ctx->num_read;
6276 {
6277 set<string> keys_to_get;
6278 try {
6279 ::decode(keys_to_get, bp);
6280 }
6281 catch (buffer::error& e) {
6282 result = -EINVAL;
6283 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, "???");
6284 goto fail;
6285 }
6286 tracepoint(osd, do_osd_op_pre_omapgetvalsbykeys, soid.oid.name.c_str(), soid.snap.val, list_entries(keys_to_get).c_str());
6287 map<string, bufferlist> out;
6288 if (oi.is_omap()) {
6289 osd->store->omap_get_values(ch, ghobject_t(soid), keys_to_get, &out);
6290 } // else return empty omap entries
6291 ::encode(out, osd_op.outdata);
6292 ctx->delta_stats.num_rd_kb += SHIFT_ROUND_UP(osd_op.outdata.length(), 10);
6293 ctx->delta_stats.num_rd++;
6294 }
6295 break;
6296
6297 case CEPH_OSD_OP_OMAP_CMP:
6298 ++ctx->num_read;
6299 {
6300 if (!obs.exists || oi.is_whiteout()) {
6301 result = -ENOENT;
6302 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6303 break;
6304 }
6305 map<string, pair<bufferlist, int> > assertions;
6306 try {
6307 ::decode(assertions, bp);
6308 }
6309 catch (buffer::error& e) {
6310 result = -EINVAL;
6311 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, "???");
6312 goto fail;
6313 }
6314 tracepoint(osd, do_osd_op_pre_omap_cmp, soid.oid.name.c_str(), soid.snap.val, list_keys(assertions).c_str());
6315
6316 map<string, bufferlist> out;
6317
6318 if (oi.is_omap()) {
6319 set<string> to_get;
6320 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6321 i != assertions.end();
6322 ++i)
6323 to_get.insert(i->first);
6324 int r = osd->store->omap_get_values(ch, ghobject_t(soid),
6325 to_get, &out);
6326 if (r < 0) {
6327 result = r;
6328 break;
6329 }
6330 } // else leave out empty
6331
6332 // Should set num_rd_kb based on encode length of map
6333 ctx->delta_stats.num_rd++;
6334
6335 int r = 0;
6336 bufferlist empty;
6337 for (map<string, pair<bufferlist, int> >::iterator i = assertions.begin();
6338 i != assertions.end();
6339 ++i) {
6340 auto out_entry = out.find(i->first);
6341 bufferlist &bl = (out_entry != out.end()) ?
6342 out_entry->second : empty;
6343 switch (i->second.second) {
6344 case CEPH_OSD_CMPXATTR_OP_EQ:
6345 if (!(bl == i->second.first)) {
6346 r = -ECANCELED;
6347 }
6348 break;
6349 case CEPH_OSD_CMPXATTR_OP_LT:
6350 if (!(bl < i->second.first)) {
6351 r = -ECANCELED;
6352 }
6353 break;
6354 case CEPH_OSD_CMPXATTR_OP_GT:
6355 if (!(bl > i->second.first)) {
6356 r = -ECANCELED;
6357 }
6358 break;
6359 default:
6360 r = -EINVAL;
6361 break;
6362 }
6363 if (r < 0)
6364 break;
6365 }
6366 if (r < 0) {
6367 result = r;
6368 }
6369 }
6370 break;
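// Semantics of the loop above: each assertion maps a key to a (value, op)
// pair with op one of CEPH_OSD_CMPXATTR_OP_{EQ,LT,GT}; a missing key
// compares as an empty bufferlist, and the first failing assertion fails
// the whole op with -ECANCELED.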
6371
6372 // OMAP Write ops
6373 case CEPH_OSD_OP_OMAPSETVALS:
6374 if (!pool.info.supports_omap()) {
6375 result = -EOPNOTSUPP;
6376 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6377 break;
6378 }
6379 ++ctx->num_write;
6380 {
6381 maybe_create_new_object(ctx);
6382 bufferlist to_set_bl;
6383 try {
6384 decode_str_str_map_to_bl(bp, &to_set_bl);
6385 }
6386 catch (buffer::error& e) {
6387 result = -EINVAL;
6388 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6389 goto fail;
6390 }
6391 tracepoint(osd, do_osd_op_pre_omapsetvals, soid.oid.name.c_str(), soid.snap.val);
6392 if (cct->_conf->subsys.should_gather(dout_subsys, 20)) {
6393 dout(20) << "setting vals: " << dendl;
6394 map<string,bufferlist> to_set;
6395 bufferlist::iterator pt = to_set_bl.begin();
6396 ::decode(to_set, pt);
6397 for (map<string, bufferlist>::iterator i = to_set.begin();
6398 i != to_set.end();
6399 ++i) {
6400 dout(20) << "\t" << i->first << dendl;
6401 }
6402 }
6403 t->omap_setkeys(soid, to_set_bl);
6404 ctx->delta_stats.num_wr++;
6405 }
6406 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6407 obs.oi.clear_omap_digest();
6408 break;
6409
6410 case CEPH_OSD_OP_OMAPSETHEADER:
6411 tracepoint(osd, do_osd_op_pre_omapsetheader, soid.oid.name.c_str(), soid.snap.val);
6412 if (!pool.info.supports_omap()) {
6413 result = -EOPNOTSUPP;
6414 break;
6415 }
6416 ++ctx->num_write;
6417 {
6418 maybe_create_new_object(ctx);
6419 t->omap_setheader(soid, osd_op.indata);
6420 ctx->delta_stats.num_wr++;
6421 }
6422 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6423 obs.oi.clear_omap_digest();
6424 break;
6425
6426 case CEPH_OSD_OP_OMAPCLEAR:
6427 tracepoint(osd, do_osd_op_pre_omapclear, soid.oid.name.c_str(), soid.snap.val);
6428 if (!pool.info.supports_omap()) {
6429 result = -EOPNOTSUPP;
6430 break;
6431 }
6432 ++ctx->num_write;
6433 {
6434 if (!obs.exists || oi.is_whiteout()) {
6435 result = -ENOENT;
6436 break;
6437 }
6438 if (oi.is_omap()) {
6439 t->omap_clear(soid);
6440 ctx->delta_stats.num_wr++;
6441 obs.oi.clear_omap_digest();
6442 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6443 }
6444 }
6445 break;
6446
6447 case CEPH_OSD_OP_OMAPRMKEYS:
6448 if (!pool.info.supports_omap()) {
6449 result = -EOPNOTSUPP;
6450 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6451 break;
6452 }
6453 ++ctx->num_write;
6454 {
6455 if (!obs.exists || oi.is_whiteout()) {
6456 result = -ENOENT;
6457 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6458 break;
6459 }
6460 bufferlist to_rm_bl;
6461 try {
6462 decode_str_set_to_bl(bp, &to_rm_bl);
6463 }
6464 catch (buffer::error& e) {
6465 result = -EINVAL;
6466 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6467 goto fail;
6468 }
6469 tracepoint(osd, do_osd_op_pre_omaprmkeys, soid.oid.name.c_str(), soid.snap.val);
6470 t->omap_rmkeys(soid, to_rm_bl);
6471 ctx->delta_stats.num_wr++;
6472 }
6473 obs.oi.clear_omap_digest();
6474 break;
6475
6476 case CEPH_OSD_OP_COPY_GET:
6477 ++ctx->num_read;
6478 tracepoint(osd, do_osd_op_pre_copy_get, soid.oid.name.c_str(), soid.snap.val);
6479 result = fill_in_copy_get(ctx, bp, osd_op, ctx->obc);
6480 break;
6481
6482 case CEPH_OSD_OP_COPY_FROM:
6483 ++ctx->num_write;
6484 {
6485 object_t src_name;
6486 object_locator_t src_oloc;
6487 snapid_t src_snapid = (uint64_t)op.copy_from.snapid;
6488 version_t src_version = op.copy_from.src_version;
6489 try {
6490 ::decode(src_name, bp);
6491 ::decode(src_oloc, bp);
6492 }
6493 catch (buffer::error& e) {
6494 result = -EINVAL;
6495 tracepoint(osd,
6496 do_osd_op_pre_copy_from,
6497 soid.oid.name.c_str(),
6498 soid.snap.val,
6499 "???",
6500 0,
6501 "???",
6502 "???",
6503 0,
6504 src_snapid,
6505 src_version);
6506 goto fail;
6507 }
6508 tracepoint(osd,
6509 do_osd_op_pre_copy_from,
6510 soid.oid.name.c_str(),
6511 soid.snap.val,
6512 src_name.name.c_str(),
6513 src_oloc.pool,
6514 src_oloc.key.c_str(),
6515 src_oloc.nspace.c_str(),
6516 src_oloc.hash,
6517 src_snapid,
6518 src_version);
6519 if (!ctx->copy_cb) {
6520 // start
6521 pg_t raw_pg;
6522 get_osdmap()->object_locator_to_pg(src_name, src_oloc, raw_pg);
6523 hobject_t src(src_name, src_oloc.key, src_snapid,
6524 raw_pg.ps(), raw_pg.pool(),
6525 src_oloc.nspace);
6526 if (src == soid) {
6527 dout(20) << " copy from self is invalid" << dendl;
6528 result = -EINVAL;
6529 break;
6530 }
6531 CopyFromCallback *cb = new CopyFromCallback(ctx);
6532 ctx->copy_cb = cb;
6533 start_copy(cb, ctx->obc, src, src_oloc, src_version,
6534 op.copy_from.flags,
6535 false,
6536 op.copy_from.src_fadvise_flags,
6537 op.flags);
6538 result = -EINPROGRESS;
6539 } else {
6540 // finish
6541 assert(ctx->copy_cb->get_result() >= 0);
6542 finish_copyfrom(ctx);
6543 result = 0;
6544 }
6545 }
6546 break;
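// COPY_FROM executes in two passes: the first pass starts the async copy
// and returns -EINPROGRESS, parking the op; once the copy completes the
// op is requeued, do_osd_ops() runs again, and this time ctx->copy_cb is
// set so we take the finish branch above.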
6547
6548 default:
6549 tracepoint(osd, do_osd_op_pre_unknown, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op));
6550 dout(1) << "unrecognized osd op " << op.op
6551 << " " << ceph_osd_op_name(op.op)
6552 << dendl;
6553 result = -EOPNOTSUPP;
6554 }
6555
6556 fail:
6557 osd_op.rval = result;
6558 tracepoint(osd, do_osd_op_post, soid.oid.name.c_str(), soid.snap.val, op.op, ceph_osd_op_name(op.op), op.flags, result);
6559 if (result < 0 && (op.flags & CEPH_OSD_OP_FLAG_FAILOK))
6560 result = 0;
6561
6562 if (result < 0)
6563 break;
6564 }
6565 return result;
6566 }
6567
6568 int PrimaryLogPG::_get_tmap(OpContext *ctx, bufferlist *header, bufferlist *vals)
6569 {
6570 if (ctx->new_obs.oi.size == 0) {
6571 dout(20) << "unable to get tmap for zero-sized " << ctx->new_obs.oi.soid << dendl;
6572 return -ENODATA;
6573 }
6574 vector<OSDOp> nops(1);
6575 OSDOp &newop = nops[0];
6576 newop.op.op = CEPH_OSD_OP_TMAPGET;
6577 do_osd_ops(ctx, nops);
6578 try {
6579 bufferlist::iterator i = newop.outdata.begin();
6580 ::decode(*header, i);
6581 (*vals).substr_of(newop.outdata, i.get_off(), i.get_remaining());
6582 } catch (...) {
6583 dout(20) << "failed to decode tmap for " << ctx->new_obs.oi.soid
6584 << dendl;
6585 return -EINVAL;
6586 }
6587 dout(20) << "successfully decoded tmap for " << ctx->new_obs.oi.soid
6588 << dendl;
6589 return 0;
6590 }
6591
6592 int PrimaryLogPG::_verify_no_head_clones(const hobject_t& soid,
6593 const SnapSet& ss)
6594 {
6595 // verify that all clones have been evicted
6596 dout(20) << __func__ << " verifying clones are absent "
6597 << ss << dendl;
6598 for (vector<snapid_t>::const_iterator p = ss.clones.begin();
6599 p != ss.clones.end();
6600 ++p) {
6601 hobject_t clone_oid = soid;
6602 clone_oid.snap = *p;
6603 if (is_missing_object(clone_oid))
6604 return -EBUSY;
6605 ObjectContextRef clone_obc = get_object_context(clone_oid, false);
6606 if (clone_obc && clone_obc->obs.exists) {
6607 dout(10) << __func__ << " cannot evict head before clone "
6608 << clone_oid << dendl;
6609 return -EBUSY;
6610 }
6611 if (copy_ops.count(clone_oid)) {
6612 dout(10) << __func__ << " cannot evict head, pending promote on clone "
6613 << clone_oid << dendl;
6614 return -EBUSY;
6615 }
6616 }
6617 return 0;
6618 }
6619
6620 inline int PrimaryLogPG::_delete_oid(
6621 OpContext *ctx,
6622 bool no_whiteout, // no whiteouts, no matter what.
6623 bool try_no_whiteout) // try not to whiteout
6624 {
6625 SnapSet& snapset = ctx->new_snapset;
6626 ObjectState& obs = ctx->new_obs;
6627 object_info_t& oi = obs.oi;
6628 const hobject_t& soid = oi.soid;
6629 PGTransaction* t = ctx->op_t.get();
6630
6631 // cache: set whiteout on delete?
6632 bool whiteout = false;
6633 if (pool.info.cache_mode != pg_pool_t::CACHEMODE_NONE
6634 && !no_whiteout
6635 && !try_no_whiteout) {
6636 whiteout = true;
6637 }
6638 bool legacy;
6639 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6640 legacy = false;
6641 // in luminous or later, we can't delete the head if there are
6642 // clones. we trust the caller passing no_whiteout has already
6643 // verified they don't exist.
6644 if (!snapset.clones.empty() ||
6645 (!ctx->snapc.snaps.empty() && ctx->snapc.snaps[0] > snapset.seq)) {
6646 if (no_whiteout) {
6647 dout(20) << __func__ << " has or will have clones but no_whiteout=1"
6648 << dendl;
6649 } else {
6650 dout(20) << __func__ << " has or will have clones; will whiteout"
6651 << dendl;
6652 whiteout = true;
6653 }
6654 }
6655 } else {
6656 legacy = true;
6657 }
6658 dout(20) << __func__ << " " << soid << " whiteout=" << (int)whiteout
6659 << " no_whiteout=" << (int)no_whiteout
6660 << " try_no_whiteout=" << (int)try_no_whiteout
6661 << dendl;
6662 if (!obs.exists || (obs.oi.is_whiteout() && whiteout))
6663 return -ENOENT;
6664
6665 t->remove(soid);
6666
6667 if (oi.size > 0) {
6668 interval_set<uint64_t> ch;
6669 ch.insert(0, oi.size);
6670 ctx->modified_ranges.union_of(ch);
6671 }
6672
6673 ctx->delta_stats.num_wr++;
6674 if (soid.is_snap()) {
6675 assert(ctx->obc->ssc->snapset.clone_overlap.count(soid.snap));
6676 ctx->delta_stats.num_bytes -= ctx->obc->ssc->snapset.get_clone_bytes(soid.snap);
6677 } else {
6678 ctx->delta_stats.num_bytes -= oi.size;
6679 }
6680 oi.size = 0;
6681 oi.new_object();
6682
6683 // disconnect all watchers
6684 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
6685 oi.watchers.begin();
6686 p != oi.watchers.end();
6687 ++p) {
6688 dout(20) << __func__ << " will disconnect watcher " << p->first << dendl;
6689 ctx->watch_disconnects.push_back(
6690 watch_disconnect_t(p->first.first, p->first.second, true));
6691 }
6692 oi.watchers.clear();
6693
6694 if (whiteout) {
6695 dout(20) << __func__ << " setting whiteout on " << soid << dendl;
6696 oi.set_flag(object_info_t::FLAG_WHITEOUT);
6697 ctx->delta_stats.num_whiteouts++;
6698 t->create(soid);
6699 osd->logger->inc(l_osd_tier_whiteout);
6700 return 0;
6701 }
6702
6703 // delete the head
6704 ctx->delta_stats.num_objects--;
6705 if (soid.is_snap())
6706 ctx->delta_stats.num_object_clones--;
6707 if (oi.is_whiteout()) {
6708 dout(20) << __func__ << " deleting whiteout on " << soid << dendl;
6709 ctx->delta_stats.num_whiteouts--;
6710 oi.clear_flag(object_info_t::FLAG_WHITEOUT);
6711 }
6712 if (oi.is_cache_pinned()) {
6713 ctx->delta_stats.num_objects_pinned--;
6714 }
6715 if ((legacy || snapset.is_legacy()) && soid.is_head()) {
6716 snapset.head_exists = false;
6717 }
6718 obs.exists = false;
6719 return 0;
6720 }
6721
6722 int PrimaryLogPG::_rollback_to(OpContext *ctx, ceph_osd_op& op)
6723 {
6724 SnapSet& snapset = ctx->new_snapset;
6725 ObjectState& obs = ctx->new_obs;
6726 object_info_t& oi = obs.oi;
6727 const hobject_t& soid = oi.soid;
6728 PGTransaction* t = ctx->op_t.get();
6729 snapid_t snapid = (uint64_t)op.snap.snapid;
6730 hobject_t missing_oid;
6731
6732 dout(10) << "_rollback_to " << soid << " snapid " << snapid << dendl;
6733
6734 ObjectContextRef rollback_to;
6735 int ret = find_object_context(
6736 hobject_t(soid.oid, soid.get_key(), snapid, soid.get_hash(), info.pgid.pool(),
6737 soid.get_namespace()),
6738 &rollback_to, false, false, &missing_oid);
6739 if (ret == -EAGAIN) {
6740 /* clone must be missing */
6741 assert(is_missing_object(missing_oid));
6742 dout(20) << "_rollback_to attempted to roll back to a missing object "
6743 << missing_oid << " (requested snapid: " << snapid << ")" << dendl;
6744 block_write_on_degraded_snap(missing_oid, ctx->op);
6745 return ret;
6746 }
6747 {
6748 ObjectContextRef promote_obc;
6749 cache_result_t tier_mode_result;
6750 if (obs.exists && obs.oi.has_manifest()) {
6751 tier_mode_result =
6752 maybe_handle_manifest_detail(
6753 ctx->op,
6754 true,
6755 rollback_to);
6756 } else {
6757 tier_mode_result =
6758 maybe_handle_cache_detail(
6759 ctx->op,
6760 true,
6761 rollback_to,
6762 ret,
6763 missing_oid,
6764 true,
6765 false,
6766 &promote_obc);
6767 }
6768 switch (tier_mode_result) {
6769 case cache_result_t::NOOP:
6770 break;
6771 case cache_result_t::BLOCKED_PROMOTE:
6772 assert(promote_obc);
6773 block_write_on_snap_rollback(soid, promote_obc, ctx->op);
6774 return -EAGAIN;
6775 case cache_result_t::BLOCKED_FULL:
6776 block_write_on_full_cache(soid, ctx->op);
6777 return -EAGAIN;
6778 default:
6779 assert(0 == "must promote was set, other values are not valid");
6780 return -EAGAIN;
6781 }
6782 }
6783
6784 if (ret == -ENOENT || (rollback_to && rollback_to->obs.oi.is_whiteout())) {
6785 // there's no snapshot here, or there's no object.
6786 // if there's no snapshot, we delete the object; otherwise, do nothing.
6787 dout(20) << "_rollback_to deleting head on " << soid.oid
6788 << " because got ENOENT|whiteout on find_object_context" << dendl;
6789 if (ctx->obc->obs.oi.watchers.size()) {
6790 // Cannot delete an object with watchers
6791 ret = -EBUSY;
6792 } else {
6793 _delete_oid(ctx, false, false);
6794 ret = 0;
6795 }
6796 } else if (ret) {
6797 // ummm....huh? It *can't* return anything else at time of writing.
6798 assert(0 == "unexpected error code in _rollback_to");
6799 } else { //we got our context, let's use it to do the rollback!
6800 hobject_t& rollback_to_sobject = rollback_to->obs.oi.soid;
6801 if (is_degraded_or_backfilling_object(rollback_to_sobject)) {
6802 dout(20) << "_rollback_to attempted to roll back to a degraded object "
6803 << rollback_to_sobject << " (requested snapid: " << snapid << ")" << dendl;
6804 block_write_on_degraded_snap(rollback_to_sobject, ctx->op);
6805 ret = -EAGAIN;
6806 } else if (rollback_to->obs.oi.soid.snap == CEPH_NOSNAP) {
6807 // rolling back to the head; we just need to clone it.
6808 ctx->modify = true;
6809 } else {
6810 /* 1) Delete current head
6811 * 2) Clone correct snapshot into head
6812 * 3) Calculate clone_overlaps by following overlaps
6813 * forward from rollback snapshot */
6814 dout(10) << "_rollback_to deleting " << soid.oid
6815 << " and rolling back to old snap" << dendl;
6816
6817 if (obs.exists) {
6818 t->remove(soid);
6819 }
6820 t->clone(soid, rollback_to_sobject);
6821 snapset.head_exists = true;
6822 t->add_obc(rollback_to);
6823
6824 map<snapid_t, interval_set<uint64_t> >::iterator iter =
6825 snapset.clone_overlap.lower_bound(snapid);
6826 assert(iter != snapset.clone_overlap.end());
6827 interval_set<uint64_t> overlaps = iter->second;
6828 for ( ;
6829 iter != snapset.clone_overlap.end();
6830 ++iter)
6831 overlaps.intersection_of(iter->second);
6832
6833 if (obs.oi.size > 0) {
6834 interval_set<uint64_t> modified;
6835 modified.insert(0, obs.oi.size);
6836 overlaps.intersection_of(modified);
6837 modified.subtract(overlaps);
6838 ctx->modified_ranges.union_of(modified);
6839 }
6840
6841 // Adjust the cached objectcontext
6842 maybe_create_new_object(ctx, true);
6843 ctx->delta_stats.num_bytes -= obs.oi.size;
6844 ctx->delta_stats.num_bytes += rollback_to->obs.oi.size;
6845 obs.oi.size = rollback_to->obs.oi.size;
6846 if (rollback_to->obs.oi.is_data_digest())
6847 obs.oi.set_data_digest(rollback_to->obs.oi.data_digest);
6848 else
6849 obs.oi.clear_data_digest();
6850 if (rollback_to->obs.oi.is_omap_digest())
6851 obs.oi.set_omap_digest(rollback_to->obs.oi.omap_digest);
6852 else
6853 obs.oi.clear_omap_digest();
6854
6855 if (rollback_to->obs.oi.is_omap()) {
6856 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
6857 obs.oi.set_flag(object_info_t::FLAG_OMAP);
6858 } else {
6859 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
6860 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
6861 }
6862
6863 snapset.head_exists = true;
6864 }
6865 }
6866 return ret;
6867 }
6868
6869 void PrimaryLogPG::_make_clone(
6870 OpContext *ctx,
6871 PGTransaction* t,
6872 ObjectContextRef obc,
6873 const hobject_t& head, const hobject_t& coid,
6874 object_info_t *poi)
6875 {
6876 bufferlist bv;
6877 ::encode(*poi, bv, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
6878
6879 t->clone(coid, head);
6880 setattr_maybe_cache(obc, ctx, t, OI_ATTR, bv);
6881 rmattr_maybe_cache(obc, ctx, t, SS_ATTR);
6882 }
6883
6884 void PrimaryLogPG::make_writeable(OpContext *ctx)
6885 {
6886 const hobject_t& soid = ctx->obs->oi.soid;
6887 SnapContext& snapc = ctx->snapc;
6888
6889 // clone?
6890 assert(soid.snap == CEPH_NOSNAP);
6891 dout(20) << "make_writeable " << soid << " snapset=" << ctx->new_snapset
6892 << " snapc=" << snapc << dendl;
6893
6894 bool was_dirty = ctx->obc->obs.oi.is_dirty();
6895 if (ctx->new_obs.exists) {
6896 // we will mark the object dirty
6897 if (ctx->undirty && was_dirty) {
6898 dout(20) << " clearing DIRTY flag" << dendl;
6899 assert(ctx->new_obs.oi.is_dirty());
6900 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
6901 --ctx->delta_stats.num_objects_dirty;
6902 osd->logger->inc(l_osd_tier_clean);
6903 } else if (!was_dirty && !ctx->undirty) {
6904 dout(20) << " setting DIRTY flag" << dendl;
6905 ctx->new_obs.oi.set_flag(object_info_t::FLAG_DIRTY);
6906 ++ctx->delta_stats.num_objects_dirty;
6907 osd->logger->inc(l_osd_tier_dirty);
6908 }
6909 } else {
6910 if (was_dirty) {
6911 dout(20) << " deletion, decrementing num_dirty and clearing flag" << dendl;
6912 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
6913 --ctx->delta_stats.num_objects_dirty;
6914 }
6915 }
6916
6917 if ((ctx->new_obs.exists &&
6918 ctx->new_obs.oi.is_omap()) &&
6919 (!ctx->obc->obs.exists ||
6920 !ctx->obc->obs.oi.is_omap())) {
6921 ++ctx->delta_stats.num_objects_omap;
6922 }
6923 if ((!ctx->new_obs.exists ||
6924 !ctx->new_obs.oi.is_omap()) &&
6925 (ctx->obc->obs.exists &&
6926 ctx->obc->obs.oi.is_omap())) {
6927 --ctx->delta_stats.num_objects_omap;
6928 }
6929
6930 // use newer snapc?
6931 if (ctx->new_snapset.seq > snapc.seq) {
6932 snapc.seq = ctx->new_snapset.seq;
6933 snapc.snaps = ctx->new_snapset.snaps;
6934 filter_snapc(snapc.snaps);
6935 dout(10) << " using newer snapc " << snapc << dendl;
6936 }
6937
6938 if ((ctx->obs->exists && !ctx->obs->oi.is_whiteout()) && // head exist(ed)
6939 snapc.snaps.size() && // there are snaps
6940 !ctx->cache_evict &&
6941 snapc.snaps[0] > ctx->new_snapset.seq) { // existing object is old
6942 // clone
6943 hobject_t coid = soid;
6944 coid.snap = snapc.seq;
6945
6946 unsigned l;
6947 for (l=1; l<snapc.snaps.size() && snapc.snaps[l] > ctx->new_snapset.seq; l++) ;
6948
6949 vector<snapid_t> snaps(l);
6950 for (unsigned i=0; i<l; i++)
6951 snaps[i] = snapc.snaps[i];
6952
6953 // prepare clone
6954 object_info_t static_snap_oi(coid);
6955 object_info_t *snap_oi;
6956 if (is_primary()) {
6957 ctx->clone_obc = object_contexts.lookup_or_create(static_snap_oi.soid);
6958 ctx->clone_obc->destructor_callback = new C_PG_ObjectContext(this, ctx->clone_obc.get());
6959 ctx->clone_obc->obs.oi = static_snap_oi;
6960 ctx->clone_obc->obs.exists = true;
6961 ctx->clone_obc->ssc = ctx->obc->ssc;
6962 ctx->clone_obc->ssc->ref++;
6963 if (pool.info.require_rollback())
6964 ctx->clone_obc->attr_cache = ctx->obc->attr_cache;
6965 snap_oi = &ctx->clone_obc->obs.oi;
6966 bool got = ctx->lock_manager.get_write_greedy(
6967 coid,
6968 ctx->clone_obc,
6969 ctx->op);
6970 assert(got);
6971 dout(20) << " got greedy write on clone_obc " << *ctx->clone_obc << dendl;
6972 } else {
6973 snap_oi = &static_snap_oi;
6974 }
6975 snap_oi->version = ctx->at_version;
6976 snap_oi->prior_version = ctx->obs->oi.version;
6977 snap_oi->copy_user_bits(ctx->obs->oi);
6978
6979 bool legacy = ctx->new_snapset.is_legacy() ||
6980 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
6981 if (legacy) {
6982 snap_oi->legacy_snaps = snaps;
6983 }
6984
6985 _make_clone(ctx, ctx->op_t.get(), ctx->clone_obc, soid, coid, snap_oi);
6986
6987 ctx->delta_stats.num_objects++;
6988 if (snap_oi->is_dirty()) {
6989 ctx->delta_stats.num_objects_dirty++;
6990 osd->logger->inc(l_osd_tier_dirty);
6991 }
6992 if (snap_oi->is_omap())
6993 ctx->delta_stats.num_objects_omap++;
6994 if (snap_oi->is_cache_pinned())
6995 ctx->delta_stats.num_objects_pinned++;
6996 ctx->delta_stats.num_object_clones++;
6997 ctx->new_snapset.clones.push_back(coid.snap);
6998 ctx->new_snapset.clone_size[coid.snap] = ctx->obs->oi.size;
6999 if (!legacy) {
7000 ctx->new_snapset.clone_snaps[coid.snap] = snaps;
7001 }
7002
7003 // clone_overlap should contain an entry for each clone
7004 // (an empty interval_set if there is no overlap)
7005 ctx->new_snapset.clone_overlap[coid.snap];
7006 if (ctx->obs->oi.size)
7007 ctx->new_snapset.clone_overlap[coid.snap].insert(0, ctx->obs->oi.size);
7008
7009 // log clone
7010 dout(10) << " cloning v " << ctx->obs->oi.version
7011 << " to " << coid << " v " << ctx->at_version
7012 << " snaps=" << snaps
7013 << " snapset=" << ctx->new_snapset << dendl;
7014 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::CLONE, coid, ctx->at_version,
7015 ctx->obs->oi.version,
7016 ctx->obs->oi.user_version,
7017 osd_reqid_t(), ctx->new_obs.oi.mtime, 0));
7018 ::encode(snaps, ctx->log.back().snaps);
7019
7020 ctx->at_version.version++;
7021 }
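// Worked example of the clone decision above: with snapset.seq=3 and an
// incoming snapc {seq=5, snaps=[5,4,2]}, snaps 5 and 4 are newer than the
// object's last clone point, so head is cloned to coid.snap=5 with
// snaps=[5,4]; snap 2 (<= seq) is already covered by an earlier clone.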
7022
7023 // update most recent clone_overlap and usage stats
7024 if (ctx->new_snapset.clones.size() > 0) {
7025 /* we need to check whether the most recent clone exists; if it has been
7026 * evicted, it is not included in the stats */
7027 hobject_t last_clone_oid = soid;
7028 last_clone_oid.snap = ctx->new_snapset.clone_overlap.rbegin()->first;
7029 if (is_present_clone(last_clone_oid)) {
7030 interval_set<uint64_t> &newest_overlap = ctx->new_snapset.clone_overlap.rbegin()->second;
7031 ctx->modified_ranges.intersection_of(newest_overlap);
7032 // modified_ranges is still in use by the clone
7033 add_interval_usage(ctx->modified_ranges, ctx->delta_stats);
7034 newest_overlap.subtract(ctx->modified_ranges);
7035 }
7036 }
7037
7038 // update snapset with latest snap context
7039 ctx->new_snapset.seq = snapc.seq;
7040 ctx->new_snapset.snaps = snapc.snaps;
7041 if (get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS) {
7042 // pessimistic assumption that this is a net-new legacy SnapSet
7043 ctx->delta_stats.num_legacy_snapsets++;
7044 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7045 } else if (ctx->new_snapset.is_legacy()) {
7046 ctx->new_snapset.head_exists = ctx->new_obs.exists;
7047 }
7048 dout(20) << "make_writeable " << soid
7049 << " done, snapset=" << ctx->new_snapset << dendl;
7050 }
7051
7052
7053 void PrimaryLogPG::write_update_size_and_usage(object_stat_sum_t& delta_stats, object_info_t& oi,
7054 interval_set<uint64_t>& modified, uint64_t offset,
7055 uint64_t length, bool write_full)
7056 {
7057 interval_set<uint64_t> ch;
7058 if (write_full) {
7059 if (oi.size)
7060 ch.insert(0, oi.size);
7061 } else if (length)
7062 ch.insert(offset, length);
7063 modified.union_of(ch);
7064 if (write_full || offset + length > oi.size) {
7065 uint64_t new_size = offset + length;
7066 delta_stats.num_bytes -= oi.size;
7067 delta_stats.num_bytes += new_size;
7068 oi.size = new_size;
7069 }
7070 delta_stats.num_wr++;
7071 delta_stats.num_wr_kb += SHIFT_ROUND_UP(length, 10);
7072 }
7073
7074 void PrimaryLogPG::add_interval_usage(interval_set<uint64_t>& s, object_stat_sum_t& delta_stats)
7075 {
7076 for (interval_set<uint64_t>::const_iterator p = s.begin(); p != s.end(); ++p) {
7077 delta_stats.num_bytes += p.get_len();
7078 }
7079 }
7080
7081 void PrimaryLogPG::complete_disconnect_watches(
7082 ObjectContextRef obc,
7083 const list<watch_disconnect_t> &to_disconnect)
7084 {
7085 for (list<watch_disconnect_t>::const_iterator i =
7086 to_disconnect.begin();
7087 i != to_disconnect.end();
7088 ++i) {
7089 pair<uint64_t, entity_name_t> watcher(i->cookie, i->name);
7090 auto watchers_entry = obc->watchers.find(watcher);
7091 if (watchers_entry != obc->watchers.end()) {
7092 WatchRef watch = watchers_entry->second;
7093 dout(10) << "do_osd_op_effects disconnect watcher " << watcher << dendl;
7094 obc->watchers.erase(watcher);
7095 watch->remove(i->send_disconnect);
7096 } else {
7097 dout(10) << "do_osd_op_effects disconnect failed to find watcher "
7098 << watcher << dendl;
7099 }
7100 }
7101 }
7102
7103 void PrimaryLogPG::do_osd_op_effects(OpContext *ctx, const ConnectionRef& conn)
7104 {
7105 entity_name_t entity = ctx->reqid.name;
7106 dout(15) << "do_osd_op_effects " << entity << " con " << conn.get() << dendl;
7107
7108 // disconnects first
7109 complete_disconnect_watches(ctx->obc, ctx->watch_disconnects);
7110
7111 assert(conn);
7112
7113 boost::intrusive_ptr<Session> session((Session *)conn->get_priv());
7114 if (!session.get())
7115 return;
7116 session->put(); // get_priv() takes a ref, and so does the intrusive_ptr
7117
7118 for (list<pair<watch_info_t,bool> >::iterator i = ctx->watch_connects.begin();
7119 i != ctx->watch_connects.end();
7120 ++i) {
7121 pair<uint64_t, entity_name_t> watcher(i->first.cookie, entity);
7122 dout(15) << "do_osd_op_effects applying watch connect on session "
7123 << session.get() << " watcher " << watcher << dendl;
7124 WatchRef watch;
7125 if (ctx->obc->watchers.count(watcher)) {
7126 dout(15) << "do_osd_op_effects found existing watch watcher " << watcher
7127 << dendl;
7128 watch = ctx->obc->watchers[watcher];
7129 } else {
7130 dout(15) << "do_osd_op_effects new watcher " << watcher
7131 << dendl;
7132 watch = Watch::makeWatchRef(
7133 this, osd, ctx->obc, i->first.timeout_seconds,
7134 i->first.cookie, entity, conn->get_peer_addr());
7135 ctx->obc->watchers.insert(
7136 make_pair(
7137 watcher,
7138 watch));
7139 }
7140 watch->connect(conn, i->second);
7141 }
7142
7143 for (list<notify_info_t>::iterator p = ctx->notifies.begin();
7144 p != ctx->notifies.end();
7145 ++p) {
7146 dout(10) << "do_osd_op_effects, notify " << *p << dendl;
7147 ConnectionRef conn(ctx->op->get_req()->get_connection());
7148 NotifyRef notif(
7149 Notify::makeNotifyRef(
7150 conn,
7151 ctx->reqid.name.num(),
7152 p->bl,
7153 p->timeout,
7154 p->cookie,
7155 p->notify_id,
7156 ctx->obc->obs.oi.user_version,
7157 osd));
7158 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7159 ctx->obc->watchers.begin();
7160 i != ctx->obc->watchers.end();
7161 ++i) {
7162 dout(10) << "starting notify on watch " << i->first << dendl;
7163 i->second->start_notify(notif);
7164 }
7165 notif->init();
7166 }
7167
7168 for (list<OpContext::NotifyAck>::iterator p = ctx->notify_acks.begin();
7169 p != ctx->notify_acks.end();
7170 ++p) {
7171 if (p->watch_cookie)
7172 dout(10) << "notify_ack " << make_pair(p->watch_cookie.get(), p->notify_id) << dendl;
7173 else
7174 dout(10) << "notify_ack " << make_pair("NULL", p->notify_id) << dendl;
7175 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator i =
7176 ctx->obc->watchers.begin();
7177 i != ctx->obc->watchers.end();
7178 ++i) {
7179 if (i->first.second != entity) continue;
7180 if (p->watch_cookie &&
7181 p->watch_cookie.get() != i->first.first) continue;
7182 dout(10) << "acking notify on watch " << i->first << dendl;
7183 i->second->notify_ack(p->notify_id, p->reply_bl);
7184 }
7185 }
7186 }
7187
7188 hobject_t PrimaryLogPG::generate_temp_object(const hobject_t& target)
7189 {
7190 ostringstream ss;
7191 ss << "temp_" << info.pgid << "_" << get_role()
7192 << "_" << osd->monc->get_global_id() << "_" << (++temp_seq);
7193 hobject_t hoid = target.make_temp_hobject(ss.str());
7194 dout(20) << __func__ << " " << hoid << dendl;
7195 return hoid;
7196 }
7197
7198 hobject_t PrimaryLogPG::get_temp_recovery_object(
7199 const hobject_t& target,
7200 eversion_t version)
7201 {
7202 ostringstream ss;
7203 ss << "temp_recovering_" << info.pgid // (note this includes the shardid)
7204 << "_" << version
7205 << "_" << info.history.same_interval_since
7206 << "_" << target.snap;
7207 // pgid + version + interval + snapid is unique, and short
7208 hobject_t hoid = target.make_temp_hobject(ss.str());
7209 dout(20) << __func__ << " " << hoid << dendl;
7210 return hoid;
7211 }
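// The resulting name looks like, e.g., "temp_recovering_2.3s1_184'7_42_head"
// (illustrative values: pgid+shard 2.3s1, version 184'7, interval 42,
// snapid "head"); uniqueness follows from that tuple.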
7212
7213 int PrimaryLogPG::prepare_transaction(OpContext *ctx)
7214 {
7215 assert(!ctx->ops.empty());
7216
7217 const hobject_t& soid = ctx->obs->oi.soid;
7218
7219 // valid snap context?
7220 if (!ctx->snapc.is_valid()) {
7221 dout(10) << " invalid snapc " << ctx->snapc << dendl;
7222 return -EINVAL;
7223 }
7224
7225 // prepare the actual mutation
7226 int result = do_osd_ops(ctx, ctx->ops);
7227 if (result < 0) {
7228 if (ctx->op->may_write() &&
7229 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7230 // need to save the error code in the pg log, to detect dup ops,
7231 // but do nothing else
7232 ctx->update_log_only = true;
7233 }
7234 return result;
7235 }
7236
7237 // read-op? write-op noop? done?
7238 if (ctx->op_t->empty() && !ctx->modify) {
7239 unstable_stats.add(ctx->delta_stats);
7240 if (ctx->op->may_write() &&
7241 get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
7242 ctx->update_log_only = true;
7243 }
7244 return result;
7245 }
7246
7247 // check for full
7248 if ((ctx->delta_stats.num_bytes > 0 ||
7249 ctx->delta_stats.num_objects > 0) && // FIXME: keys?
7250 (pool.info.has_flag(pg_pool_t::FLAG_FULL) ||
7251 get_osdmap()->test_flag(CEPH_OSDMAP_FULL))) {
7252 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7253 if (ctx->reqid.name.is_mds() || // FIXME: ignore MDS for now
7254 m->has_flag(CEPH_OSD_FLAG_FULL_FORCE)) {
7255 dout(20) << __func__ << " full, but proceeding due to FULL_FORCE or MDS"
7256 << dendl;
7257 } else if (m->has_flag(CEPH_OSD_FLAG_FULL_TRY)) {
7258 // they tried, they failed.
7259 dout(20) << __func__ << " full, replying to FULL_TRY op" << dendl;
7260 return pool.info.has_flag(pg_pool_t::FLAG_FULL) ? -EDQUOT : -ENOSPC;
7261 } else {
7262 // drop request
7263 dout(20) << __func__ << " full, dropping request (bad client)" << dendl;
7264 return -EAGAIN;
7265 }
7266 }
7267
7268 // clone, if necessary
7269 if (soid.snap == CEPH_NOSNAP)
7270 make_writeable(ctx);
7271
7272 finish_ctx(ctx,
7273 ctx->new_obs.exists ? pg_log_entry_t::MODIFY :
7274 pg_log_entry_t::DELETE);
7275
7276 return result;
7277 }
7278
7279 void PrimaryLogPG::finish_ctx(OpContext *ctx, int log_op_type, bool maintain_ssc)
7280 {
7281 const hobject_t& soid = ctx->obs->oi.soid;
7282 dout(20) << __func__ << " " << soid << " " << ctx
7283 << " op " << pg_log_entry_t::get_op_name(log_op_type)
7284 << dendl;
7285 utime_t now = ceph_clock_now();
7286
7287 // snapset
7288 bufferlist bss;
7289
7290 if (soid.snap == CEPH_NOSNAP && maintain_ssc) {
7291 ::encode(ctx->new_snapset, bss);
7292 assert(ctx->new_obs.exists == ctx->new_snapset.head_exists ||
7293 !ctx->new_snapset.is_legacy());
7294
7295 if (ctx->new_obs.exists) {
7296 if (!ctx->obs->exists) {
7297 if (ctx->snapset_obc && ctx->snapset_obc->obs.exists) {
7298 hobject_t snapoid = soid.get_snapdir();
7299 dout(10) << " removing unneeded snapdir " << snapoid << dendl;
7300 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::DELETE, snapoid,
7301 ctx->at_version,
7302 ctx->snapset_obc->obs.oi.version,
7303 0, osd_reqid_t(), ctx->mtime, 0));
7304 ctx->op_t->remove(snapoid);
7305
7306 ctx->at_version.version++;
7307
7308 ctx->snapset_obc->obs.exists = false;
7309 }
7310 }
7311 } else if (!ctx->new_snapset.clones.empty() &&
7312 !ctx->cache_evict &&
7313 !ctx->new_snapset.head_exists &&
7314 (!ctx->snapset_obc || !ctx->snapset_obc->obs.exists)) {
7315 // save snapset on _snap
7316 hobject_t snapoid(soid.oid, soid.get_key(), CEPH_SNAPDIR, soid.get_hash(),
7317 info.pgid.pool(), soid.get_namespace());
7318 dout(10) << " final snapset " << ctx->new_snapset
7319 << " in " << snapoid << dendl;
7320 assert(get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
7321 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, snapoid,
7322 ctx->at_version,
7323 eversion_t(),
7324 0, osd_reqid_t(), ctx->mtime, 0));
7325
7326 if (!ctx->snapset_obc)
7327 ctx->snapset_obc = get_object_context(snapoid, true);
7328 bool got = false;
7329 if (ctx->lock_type == ObjectContext::RWState::RWWRITE) {
7330 got = ctx->lock_manager.get_write_greedy(
7331 snapoid,
7332 ctx->snapset_obc,
7333 ctx->op);
7334 } else {
7335 assert(ctx->lock_type == ObjectContext::RWState::RWEXCL);
7336 got = ctx->lock_manager.get_lock_type(
7337 ObjectContext::RWState::RWEXCL,
7338 snapoid,
7339 ctx->snapset_obc,
7340 ctx->op);
7341 }
7342 assert(got);
7343 dout(20) << " got greedy write on snapset_obc " << *ctx->snapset_obc << dendl;
7344 ctx->snapset_obc->obs.exists = true;
7345 ctx->snapset_obc->obs.oi.version = ctx->at_version;
7346 ctx->snapset_obc->obs.oi.last_reqid = ctx->reqid;
7347 ctx->snapset_obc->obs.oi.mtime = ctx->mtime;
7348 ctx->snapset_obc->obs.oi.local_mtime = now;
7349
7350 map<string, bufferlist> attrs;
7351 bufferlist bv(sizeof(ctx->new_obs.oi));
7352 ::encode(ctx->snapset_obc->obs.oi, bv,
7353 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7354 ctx->op_t->create(snapoid);
7355 attrs[OI_ATTR].claim(bv);
7356 attrs[SS_ATTR].claim(bss);
7357 setattrs_maybe_cache(ctx->snapset_obc, ctx, ctx->op_t.get(), attrs);
7358 ctx->at_version.version++;
7359 }
7360 }
7361
7362 // finish and log the op.
7363 if (ctx->user_modify) {
7364 // update the user_version for any modify ops, except for the watch op
7365 ctx->user_at_version = MAX(info.last_user_version, ctx->new_obs.oi.user_version) + 1;
7366 /* In order for new clients and old clients to interoperate properly
7367 * when exchanging versions, we need to lower bound the user_version
7368 * (which our new clients pay proper attention to)
7369 * by the at_version (which is all the old clients can ever see). */
7370 if (ctx->at_version.version > ctx->user_at_version)
7371 ctx->user_at_version = ctx->at_version.version;
7372 ctx->new_obs.oi.user_version = ctx->user_at_version;
7373 }
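// Worked example of the lower-bounding above: with info.last_user_version=41
// and oi.user_version=40, MAX(41,40)+1 yields 42; if at_version.version is 45,
// user_at_version is raised to 45 so that old clients (which only ever see
// at_version) and new clients (which track user_version) agree on ordering.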
7374 ctx->bytes_written = ctx->op_t->get_bytes_written();
7375
7376 if (ctx->new_obs.exists) {
7377 // on the head object
7378 ctx->new_obs.oi.version = ctx->at_version;
7379 ctx->new_obs.oi.prior_version = ctx->obs->oi.version;
7380 ctx->new_obs.oi.last_reqid = ctx->reqid;
7381 if (ctx->mtime != utime_t()) {
7382 ctx->new_obs.oi.mtime = ctx->mtime;
7383 dout(10) << " set mtime to " << ctx->new_obs.oi.mtime << dendl;
7384 ctx->new_obs.oi.local_mtime = now;
7385 } else {
7386 dout(10) << " mtime unchanged at " << ctx->new_obs.oi.mtime << dendl;
7387 }
7388
7389 map <string, bufferlist> attrs;
7390 bufferlist bv(sizeof(ctx->new_obs.oi));
7391 ::encode(ctx->new_obs.oi, bv,
7392 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
7393 attrs[OI_ATTR].claim(bv);
7394
7395 if (soid.snap == CEPH_NOSNAP) {
7396 dout(10) << " final snapset " << ctx->new_snapset
7397 << " in " << soid << dendl;
7398 attrs[SS_ATTR].claim(bss);
7399 } else {
7400 dout(10) << " no snapset (this is a clone)" << dendl;
7401 }
7402 ctx->op_t->setattrs(soid, attrs);
7403 } else {
7404 ctx->new_obs.oi = object_info_t(ctx->obc->obs.oi.soid);
7405 }
7406
7407 bool legacy_snapset = ctx->new_snapset.is_legacy() ||
7408 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
7409
7410 // append to log
7411 ctx->log.push_back(pg_log_entry_t(log_op_type, soid, ctx->at_version,
7412 ctx->obs->oi.version,
7413 ctx->user_at_version, ctx->reqid,
7414 ctx->mtime, 0));
7415 if (soid.snap < CEPH_NOSNAP) {
7416 switch (log_op_type) {
7417 case pg_log_entry_t::MODIFY:
7418 case pg_log_entry_t::PROMOTE:
7419 case pg_log_entry_t::CLEAN:
7420 if (legacy_snapset) {
7421 dout(20) << __func__ << " encoding legacy_snaps "
7422 << ctx->new_obs.oi.legacy_snaps
7423 << dendl;
7424 ::encode(ctx->new_obs.oi.legacy_snaps, ctx->log.back().snaps);
7425 } else {
7426 dout(20) << __func__ << " encoding snaps from " << ctx->new_snapset
7427 << dendl;
7428 ::encode(ctx->new_snapset.clone_snaps[soid.snap], ctx->log.back().snaps);
7429 }
7430 break;
7431 default:
7432 break;
7433 }
7434 }
7435
7436 if (!ctx->extra_reqids.empty()) {
7437 dout(20) << __func__ << " extra_reqids " << ctx->extra_reqids << dendl;
7438 ctx->log.back().extra_reqids.swap(ctx->extra_reqids);
7439 }
7440
7441 // apply new object state.
7442 ctx->obc->obs = ctx->new_obs;
7443
7444 if (soid.is_head() && !ctx->obc->obs.exists &&
7445 (!maintain_ssc || ctx->cache_evict)) {
7446 ctx->obc->ssc->exists = false;
7447 ctx->obc->ssc->snapset = SnapSet();
7448 } else {
7449 ctx->obc->ssc->exists = true;
7450 ctx->obc->ssc->snapset = ctx->new_snapset;
7451 }
7452 }
7453
7454 void PrimaryLogPG::apply_stats(
7455 const hobject_t &soid,
7456 const object_stat_sum_t &delta_stats) {
7457
7458 info.stats.stats.add(delta_stats);
7459
7460 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
7461 i != backfill_targets.end();
7462 ++i) {
7463 pg_shard_t bt = *i;
7464 pg_info_t& pinfo = peer_info[bt];
7465 if (soid <= pinfo.last_backfill)
7466 pinfo.stats.stats.add(delta_stats);
7467 else if (soid <= last_backfill_started)
7468 pending_backfill_updates[soid].stats.add(delta_stats);
7469 }
7470
7471 if (is_primary() && scrubber.active) {
7472 if (soid < scrubber.start) {
7473 dout(20) << __func__ << " " << soid << " < [" << scrubber.start
7474 << "," << scrubber.end << ")" << dendl;
7475 scrub_cstat.add(delta_stats);
7476 } else {
7477 dout(20) << __func__ << " " << soid << " >= [" << scrubber.start
7478 << "," << scrubber.end << ")" << dendl;
7479 }
7480 }
7481 }
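// Stat-routing sketch for the loop above: deltas for objects at or below a
// backfill peer's last_backfill (already copied to the peer) are mirrored
// into that peer's stats immediately; deltas for objects between
// last_backfill and last_backfill_started are parked in
// pending_backfill_updates until the in-flight backfill completes; anything
// beyond last_backfill_started will be picked up when the object itself is
// backfilled, so no delta is recorded for the peer at all.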
7482
7483 void PrimaryLogPG::complete_read_ctx(int result, OpContext *ctx)
7484 {
7485 const MOSDOp *m = static_cast<const MOSDOp*>(ctx->op->get_req());
7486 assert(ctx->async_reads_complete());
7487
7488 for (vector<OSDOp>::iterator p = ctx->ops.begin();
7489 p != ctx->ops.end() && result >= 0; ++p) {
7490 if (p->rval < 0 && !(p->op.flags & CEPH_OSD_OP_FLAG_FAILOK)) {
7491 result = p->rval;
7492 break;
7493 }
7494 ctx->bytes_read += p->outdata.length();
7495 }
7496 ctx->reply->claim_op_out_data(ctx->ops);
7497 ctx->reply->get_header().data_off = ctx->data_off;
7498
7499 MOSDOpReply *reply = ctx->reply;
7500 ctx->reply = nullptr;
7501
7502 if (result >= 0) {
7503 if (!ctx->ignore_log_op_stats) {
7504 log_op_stats(ctx);
7505 publish_stats_to_osd();
7506 }
7507
7508 // on read, return the current object version
7509 if (ctx->obs) {
7510 reply->set_reply_versions(eversion_t(), ctx->obs->oi.user_version);
7511 } else {
7512 reply->set_reply_versions(eversion_t(), ctx->user_at_version);
7513 }
7514 } else if (result == -ENOENT) {
7515 // on ENOENT, set a floor for what the next user version will be.
7516 reply->set_enoent_reply_versions(info.last_update, info.last_user_version);
7517 }
7518
7519 reply->set_result(result);
7520 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7521 osd->send_message_osd_client(reply, m->get_connection());
7522 close_op_ctx(ctx);
7523 }
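// Per-op result folding, as a standalone sketch (hypothetical mini-types;
// the loop above is the authoritative version): the reply result is the
// first negative per-op rval whose op did not set CEPH_OSD_OP_FLAG_FAILOK.
//
//   #include <vector>
//   struct MiniOp { int rval; bool failok; };
//   int fold_result(const std::vector<MiniOp>& ops) {
//     for (const auto& op : ops)
//       if (op.rval < 0 && !op.failok)
//         return op.rval;   // first hard failure wins
//     return 0;             // all ops succeeded or were marked FAILOK
//   }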
7524
7525 // ========================================================================
7526 // copyfrom
7527
7528 struct C_Copyfrom : public Context {
7529 PrimaryLogPGRef pg;
7530 hobject_t oid;
7531 epoch_t last_peering_reset;
7532 ceph_tid_t tid;
7533 PrimaryLogPG::CopyOpRef cop;
7534 C_Copyfrom(PrimaryLogPG *p, hobject_t o, epoch_t lpr,
7535 const PrimaryLogPG::CopyOpRef& c)
7536 : pg(p), oid(o), last_peering_reset(lpr),
7537 tid(0), cop(c)
7538 {}
7539 void finish(int r) override {
7540 if (r == -ECANCELED)
7541 return;
7542 pg->lock();
7543 if (last_peering_reset == pg->get_last_peering_reset()) {
7544 pg->process_copy_chunk(oid, tid, r);
7545 }
7546 pg->unlock();
7547 }
7548 };
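// Note the guard idiom used here (and by C_Flush below): the completion
// captures last_peering_reset at submission time and, after retaking the PG
// lock, proceeds only if no peering reset happened in the interim -- a stale
// epoch means the copy state was already torn down. A minimal sketch of the
// idiom (the wrapper name and std::function body are hypothetical):
//
//   void finish_guarded(PrimaryLogPG *pg, epoch_t submitted_lpr,
//                       std::function<void()> body) {
//     pg->lock();
//     if (submitted_lpr == pg->get_last_peering_reset())
//       body();     // PG state is still the one we submitted against
//     pg->unlock(); // otherwise silently drop the stale completion
//   }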
7549
7550 struct C_CopyFrom_AsyncReadCb : public Context {
7551 OSDOp *osd_op;
7552 object_copy_data_t reply_obj;
7553 uint64_t features;
7554 size_t len;
7555 C_CopyFrom_AsyncReadCb(OSDOp *osd_op, uint64_t features) :
7556 osd_op(osd_op), features(features), len(0) {}
7557 void finish(int r) override {
7558 assert(len > 0);
7559 assert(len <= reply_obj.data.length());
7560 bufferlist bl;
7561 bl.substr_of(reply_obj.data, 0, len);
7562 reply_obj.data.swap(bl);
7563 ::encode(reply_obj, osd_op->outdata, features);
7564 }
7565 };
7566
7567 int PrimaryLogPG::fill_in_copy_get(
7568 OpContext *ctx,
7569 bufferlist::iterator& bp,
7570 OSDOp& osd_op,
7571 ObjectContextRef &obc)
7572 {
7573 object_info_t& oi = obc->obs.oi;
7574 hobject_t& soid = oi.soid;
7575 int result = 0;
7576 object_copy_cursor_t cursor;
7577 uint64_t out_max;
7578 try {
7579 ::decode(cursor, bp);
7580 ::decode(out_max, bp);
7581 }
7582 catch (buffer::error& e) {
7583 result = -EINVAL;
7584 return result;
7585 }
7586
7587 const MOSDOp *op = static_cast<const MOSDOp*>(ctx->op->get_req());
7588 uint64_t features = op->get_features();
7589
7590 bool async_read_started = false;
7591 object_copy_data_t _reply_obj;
7592 C_CopyFrom_AsyncReadCb *cb = NULL;
7593 if (pool.info.require_rollback()) {
7594 cb = new C_CopyFrom_AsyncReadCb(&osd_op, features);
7595 }
7596 object_copy_data_t &reply_obj = cb ? cb->reply_obj : _reply_obj;
7597 // size, mtime
7598 reply_obj.size = oi.size;
7599 reply_obj.mtime = oi.mtime;
7600 assert(obc->ssc);
7601 if (soid.snap < CEPH_NOSNAP) {
7602 if (obc->ssc->snapset.is_legacy()) {
7603 reply_obj.snaps = oi.legacy_snaps;
7604 } else {
7605 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
7606 assert(p != obc->ssc->snapset.clone_snaps.end()); // warn?
7607 reply_obj.snaps = p->second;
7608 }
7609 } else {
7610 reply_obj.snap_seq = obc->ssc->snapset.seq;
7611 }
7612 if (oi.is_data_digest()) {
7613 reply_obj.flags |= object_copy_data_t::FLAG_DATA_DIGEST;
7614 reply_obj.data_digest = oi.data_digest;
7615 }
7616 if (oi.is_omap_digest()) {
7617 reply_obj.flags |= object_copy_data_t::FLAG_OMAP_DIGEST;
7618 reply_obj.omap_digest = oi.omap_digest;
7619 }
7620 reply_obj.truncate_seq = oi.truncate_seq;
7621 reply_obj.truncate_size = oi.truncate_size;
7622
7623 // attrs
7624 map<string,bufferlist>& out_attrs = reply_obj.attrs;
7625 if (!cursor.attr_complete) {
7626 result = getattrs_maybe_cache(
7627 ctx->obc,
7628 &out_attrs,
7629 true);
7630 if (result < 0) {
7631 if (cb) {
7632 delete cb;
7633 }
7634 return result;
7635 }
7636 cursor.attr_complete = true;
7637 dout(20) << " got attrs" << dendl;
7638 }
7639
7640 int64_t left = out_max - osd_op.outdata.length();
7641
7642 // data
7643 bufferlist& bl = reply_obj.data;
7644 if (left > 0 && !cursor.data_complete) {
7645 if (cursor.data_offset < oi.size) {
7646 uint64_t max_read = MIN(oi.size - cursor.data_offset, (uint64_t)left);
7647 if (cb) {
7648 async_read_started = true;
7649 ctx->pending_async_reads.push_back(
7650 make_pair(
7651 boost::make_tuple(cursor.data_offset, max_read, osd_op.op.flags),
7652 make_pair(&bl, cb)));
7653 result = max_read;
7654 cb->len = result;
7655 } else {
7656 result = pgbackend->objects_read_sync(
7657 oi.soid, cursor.data_offset, left, osd_op.op.flags, &bl);
7658 if (result < 0)
7659 return result;
7660 }
7661 assert(result <= left);
7662 left -= result;
7663 cursor.data_offset += result;
7664 }
7665 if (cursor.data_offset == oi.size) {
7666 cursor.data_complete = true;
7667 dout(20) << " got data" << dendl;
7668 }
7669 assert(cursor.data_offset <= oi.size);
7670 }
7671
7672 // omap
7673 uint32_t omap_keys = 0;
7674 if (!pool.info.supports_omap() || !oi.is_omap()) {
7675 cursor.omap_complete = true;
7676 } else {
7677 if (left > 0 && !cursor.omap_complete) {
7678 assert(cursor.data_complete);
7679 if (cursor.omap_offset.empty()) {
7680 osd->store->omap_get_header(ch, ghobject_t(oi.soid),
7681 &reply_obj.omap_header);
7682 }
7683 bufferlist omap_data;
7684 ObjectMap::ObjectMapIterator iter =
7685 osd->store->get_omap_iterator(coll, ghobject_t(oi.soid));
7686 assert(iter);
7687 iter->upper_bound(cursor.omap_offset);
7688 for (; iter->valid(); iter->next(false)) {
7689 ++omap_keys;
7690 ::encode(iter->key(), omap_data);
7691 ::encode(iter->value(), omap_data);
7692 left -= iter->key().length() + 4 + iter->value().length() + 4;
7693 if (left <= 0)
7694 break;
7695 }
7696 if (omap_keys) {
7697 ::encode(omap_keys, reply_obj.omap_data);
7698 reply_obj.omap_data.claim_append(omap_data);
7699 }
7700 if (iter->valid()) {
7701 cursor.omap_offset = iter->key();
7702 } else {
7703 cursor.omap_complete = true;
7704 dout(20) << " got omap" << dendl;
7705 }
7706 }
7707 }
7708
7709 if (cursor.is_complete()) {
7710 // include reqids only in the final step. this is a bit fragile
7711 // but it works...
7712 pg_log.get_log().get_object_reqids(ctx->obc->obs.oi.soid, 10, &reply_obj.reqids);
7713 dout(20) << " got reqids" << dendl;
7714 }
7715
7716 dout(20) << " cursor.is_complete=" << cursor.is_complete()
7717 << " " << out_attrs.size() << " attrs"
7718 << " " << bl.length() << " bytes"
7719 << " " << reply_obj.omap_header.length() << " omap header bytes"
7720 << " " << reply_obj.omap_data.length() << " omap data bytes in "
7721 << omap_keys << " keys"
7722 << " " << reply_obj.reqids.size() << " reqids"
7723 << dendl;
7724 reply_obj.cursor = cursor;
7725 if (!async_read_started) {
7726 ::encode(reply_obj, osd_op.outdata, features);
7727 }
7728 if (cb && !async_read_started) {
7729 delete cb;
7730 }
7731 result = 0;
7732 return result;
7733 }
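// Wire-format note for the omap chunk assembled above: omap_data carries a
// u32 key count followed by alternating encoded key (string) and value
// (bufferlist) pairs -- byte-for-byte the encoding of a
// map<string,bufferlist>, which is why _write_copy_chunk below can simply
// ::decode() it into a map. The "+ 4"s in the size accounting are the u32
// length headers each string/bufferlist encoding carries:
//
//   // [u32 count][u32 klen][klen bytes][u32 vlen][vlen bytes]...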
7734
7735 void PrimaryLogPG::fill_in_copy_get_noent(OpRequestRef& op, hobject_t oid,
7736 OSDOp& osd_op)
7737 {
7738 // NOTE: we take non-const ref here for claim_op_out_data below; we must
7739 // be careful not to modify anything else that will upset a racing
7740 // operator<<
7741 MOSDOp *m = static_cast<MOSDOp*>(op->get_nonconst_req());
7742 uint64_t features = m->get_features();
7743 object_copy_data_t reply_obj;
7744
7745 pg_log.get_log().get_object_reqids(oid, 10, &reply_obj.reqids);
7746 dout(20) << __func__ << " got reqids " << reply_obj.reqids << dendl;
7747 ::encode(reply_obj, osd_op.outdata, features);
7748 osd_op.rval = -ENOENT;
7749 MOSDOpReply *reply = new MOSDOpReply(m, 0, get_osdmap()->get_epoch(), 0, false);
7750 reply->claim_op_out_data(m->ops);
7751 reply->set_result(-ENOENT);
7752 reply->add_flags(CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK);
7753 osd->send_message_osd_client(reply, m->get_connection());
7754 }
7755
7756 void PrimaryLogPG::start_copy(CopyCallback *cb, ObjectContextRef obc,
7757 hobject_t src, object_locator_t oloc,
7758 version_t version, unsigned flags,
7759 bool mirror_snapset,
7760 unsigned src_obj_fadvise_flags,
7761 unsigned dest_obj_fadvise_flags)
7762 {
7763 const hobject_t& dest = obc->obs.oi.soid;
7764 dout(10) << __func__ << " " << dest
7765 << " from " << src << " " << oloc << " v" << version
7766 << " flags " << flags
7767 << (mirror_snapset ? " mirror_snapset" : "")
7768 << dendl;
7769
7770 assert(!mirror_snapset || (src.snap == CEPH_NOSNAP ||
7771 src.snap == CEPH_SNAPDIR));
7772
7773 // cancel a previous in-progress copy?
7774 if (copy_ops.count(dest)) {
7775 // FIXME: if the src etc match, we could avoid restarting from the
7776 // beginning.
7777 CopyOpRef cop = copy_ops[dest];
7778 cancel_copy(cop, false);
7779 }
7780
7781 CopyOpRef cop(std::make_shared<CopyOp>(cb, obc, src, oloc, version, flags,
7782 mirror_snapset, src_obj_fadvise_flags,
7783 dest_obj_fadvise_flags));
7784 copy_ops[dest] = cop;
7785 obc->start_block();
7786
7787 _copy_some(obc, cop);
7788 }
7789
7790 void PrimaryLogPG::_copy_some(ObjectContextRef obc, CopyOpRef cop)
7791 {
7792 dout(10) << __func__ << " " << obc << " " << cop << dendl;
7793
7794 unsigned flags = 0;
7795 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_FLUSH)
7796 flags |= CEPH_OSD_FLAG_FLUSH;
7797 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE)
7798 flags |= CEPH_OSD_FLAG_IGNORE_CACHE;
7799 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY)
7800 flags |= CEPH_OSD_FLAG_IGNORE_OVERLAY;
7801 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE)
7802 flags |= CEPH_OSD_FLAG_MAP_SNAP_CLONE;
7803 if (cop->flags & CEPH_OSD_COPY_FROM_FLAG_RWORDERED)
7804 flags |= CEPH_OSD_FLAG_RWORDERED;
7805
7806 C_GatherBuilder gather(cct);
7807
7808 if (cop->cursor.is_initial() && cop->mirror_snapset) {
7809 // list snaps too.
7810 assert(cop->src.snap == CEPH_NOSNAP);
7811 ObjectOperation op;
7812 op.list_snaps(&cop->results.snapset, NULL);
7813 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
7814 CEPH_SNAPDIR, NULL,
7815 flags, gather.new_sub(), NULL);
7816 cop->objecter_tid2 = tid;
7817 }
7818
7819 ObjectOperation op;
7820 if (cop->results.user_version) {
7821 op.assert_version(cop->results.user_version);
7822 } else {
7823 // we should learn the version after the first chunk, if we didn't know
7824 // it already!
7825 assert(cop->cursor.is_initial());
7826 }
7827 op.copy_get(&cop->cursor, get_copy_chunk_size(),
7828 &cop->results.object_size, &cop->results.mtime,
7829 &cop->attrs, &cop->data, &cop->omap_header, &cop->omap_data,
7830 &cop->results.snaps, &cop->results.snap_seq,
7831 &cop->results.flags,
7832 &cop->results.source_data_digest,
7833 &cop->results.source_omap_digest,
7834 &cop->results.reqids,
7835 &cop->results.truncate_seq,
7836 &cop->results.truncate_size,
7837 &cop->rval);
7838 op.set_last_op_flags(cop->src_obj_fadvise_flags);
7839
7840 C_Copyfrom *fin = new C_Copyfrom(this, obc->obs.oi.soid,
7841 get_last_peering_reset(), cop);
7842 gather.set_finisher(new C_OnFinisher(fin,
7843 &osd->objecter_finisher));
7844
7845 ceph_tid_t tid = osd->objecter->read(cop->src.oid, cop->oloc, op,
7846 cop->src.snap, NULL,
7847 flags,
7848 gather.new_sub(),
7849 // discover the object version if we don't know it yet
7850 cop->results.user_version ? NULL : &cop->results.user_version);
7851 fin->tid = tid;
7852 cop->objecter_tid = tid;
7853 gather.activate();
7854 }
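// The gather above is the standard C_GatherBuilder idiom: one sub-Context
// per outstanding objecter read (the optional list_snaps plus the copy_get),
// with the finisher fired only once every sub has completed. A minimal
// sketch of the idiom (issue_async_op_* and C_DoneAll are hypothetical):
//
//   C_GatherBuilder gather(cct);
//   issue_async_op_a(gather.new_sub());   // each new_sub() is one completion
//   issue_async_op_b(gather.new_sub());
//   gather.set_finisher(new C_DoneAll);   // runs after all subs finish
//   gather.activate();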
7855
7856 void PrimaryLogPG::process_copy_chunk(hobject_t oid, ceph_tid_t tid, int r)
7857 {
7858 dout(10) << __func__ << " " << oid << " tid " << tid
7859 << " " << cpp_strerror(r) << dendl;
7860 map<hobject_t,CopyOpRef>::iterator p = copy_ops.find(oid);
7861 if (p == copy_ops.end()) {
7862 dout(10) << __func__ << " no copy_op found" << dendl;
7863 return;
7864 }
7865 CopyOpRef cop = p->second;
7866 if (tid != cop->objecter_tid) {
7867 dout(10) << __func__ << " tid " << tid << " != cop " << cop
7868 << " tid " << cop->objecter_tid << dendl;
7869 return;
7870 }
7871
7872 if (cop->omap_data.length() || cop->omap_header.length())
7873 cop->results.has_omap = true;
7874
7875 if (r >= 0 && !pool.info.supports_omap() &&
7876 (cop->omap_data.length() || cop->omap_header.length())) {
7877 r = -EOPNOTSUPP;
7878 }
7879 cop->objecter_tid = 0;
7880 cop->objecter_tid2 = 0; // assume this was ordered before us (if it happened)
7881 ObjectContextRef& cobc = cop->obc;
7882
7883 if (r < 0)
7884 goto out;
7885
7886 assert(cop->rval >= 0);
7887
7888 if (oid.snap < CEPH_NOSNAP && !cop->results.snaps.empty()) {
7889 // verify snap hasn't been deleted
7890 vector<snapid_t>::iterator p = cop->results.snaps.begin();
7891 while (p != cop->results.snaps.end()) {
7892 if (pool.info.is_removed_snap(*p)) {
7893 dout(10) << __func__ << " clone snap " << *p << " has been deleted"
7894 << dendl;
7895 for (vector<snapid_t>::iterator q = p + 1;
7896 q != cop->results.snaps.end();
7897 ++q)
7898 *(q - 1) = *q;
7899 cop->results.snaps.resize(cop->results.snaps.size() - 1);
7900 } else {
7901 ++p;
7902 }
7903 }
7904 if (cop->results.snaps.empty()) {
7905 dout(10) << __func__ << " no more snaps for " << oid << dendl;
7906 r = -ENOENT;
7907 goto out;
7908 }
7909 }
7910
7911 assert(cop->rval >= 0);
7912
7913 if (!cop->temp_cursor.data_complete) {
7914 cop->results.data_digest = cop->data.crc32c(cop->results.data_digest);
7915 }
7916 if (pool.info.supports_omap() && !cop->temp_cursor.omap_complete) {
7917 if (cop->omap_header.length()) {
7918 cop->results.omap_digest =
7919 cop->omap_header.crc32c(cop->results.omap_digest);
7920 }
7921 if (cop->omap_data.length()) {
7922 bufferlist keys;
7923 keys.substr_of(cop->omap_data, 4, cop->omap_data.length() - 4);
7924 cop->results.omap_digest = keys.crc32c(cop->results.omap_digest);
7925 }
7926 }
7927
7928 if (!cop->temp_cursor.attr_complete) {
7929 for (map<string,bufferlist>::iterator p = cop->attrs.begin();
7930 p != cop->attrs.end();
7931 ++p) {
7932 cop->results.attrs[string("_") + p->first] = p->second;
7933 }
7934 cop->attrs.clear();
7935 }
7936
7937 if (!cop->cursor.is_complete()) {
7938 // write out what we have so far
7939 if (cop->temp_cursor.is_initial()) {
7940 assert(!cop->results.started_temp_obj);
7941 cop->results.started_temp_obj = true;
7942 cop->results.temp_oid = generate_temp_object(oid);
7943 dout(20) << __func__ << " using temp " << cop->results.temp_oid << dendl;
7944 }
7945 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
7946 OpContextUPtr ctx = simple_opc_create(tempobc);
7947 if (cop->temp_cursor.is_initial()) {
7948 ctx->new_temp_oid = cop->results.temp_oid;
7949 }
7950 _write_copy_chunk(cop, ctx->op_t.get());
7951 simple_opc_submit(std::move(ctx));
7952 dout(10) << __func__ << " fetching more" << dendl;
7953 _copy_some(cobc, cop);
7954 return;
7955 }
7956
7957 // verify digests?
7958 if (cop->results.is_data_digest() || cop->results.is_omap_digest()) {
7959 dout(20) << __func__ << std::hex
7960 << " got digest: rx data 0x" << cop->results.data_digest
7961 << " omap 0x" << cop->results.omap_digest
7962 << ", source: data 0x" << cop->results.source_data_digest
7963 << " omap 0x" << cop->results.source_omap_digest
7964 << std::dec
7965 << " flags " << cop->results.flags
7966 << dendl;
7967 }
7968 if (cop->results.is_data_digest() &&
7969 cop->results.data_digest != cop->results.source_data_digest) {
7970 derr << __func__ << std::hex << " data digest 0x" << cop->results.data_digest
7971 << " != source 0x" << cop->results.source_data_digest << std::dec
7972 << dendl;
7973 osd->clog->error() << info.pgid << " copy from " << cop->src
7974 << " to " << cop->obc->obs.oi.soid << std::hex
7975 << " data digest 0x" << cop->results.data_digest
7976 << " != source 0x" << cop->results.source_data_digest
7977 << std::dec;
7978 r = -EIO;
7979 goto out;
7980 }
7981 if (cop->results.is_omap_digest() &&
7982 cop->results.omap_digest != cop->results.source_omap_digest) {
7983 derr << __func__ << std::hex
7984 << " omap digest 0x" << cop->results.omap_digest
7985 << " != source 0x" << cop->results.source_omap_digest
7986 << std::dec << dendl;
7987 osd->clog->error() << info.pgid << " copy from " << cop->src
7988 << " to " << cop->obc->obs.oi.soid << std::hex
7989 << " omap digest 0x" << cop->results.omap_digest
7990 << " != source 0x" << cop->results.source_omap_digest
7991 << std::dec;
7992 r = -EIO;
7993 goto out;
7994 }
7995 if (cct->_conf->osd_debug_inject_copyfrom_error) {
7996 derr << __func__ << " injecting copyfrom failure" << dendl;
7997 r = -EIO;
7998 goto out;
7999 }
8000
8001 cop->results.fill_in_final_tx = std::function<void(PGTransaction*)>(
8002 [this, &cop /* avoid ref cycle */](PGTransaction *t) {
8003 ObjectState& obs = cop->obc->obs;
8004 if (cop->temp_cursor.is_initial()) {
8005 dout(20) << "fill_in_final_tx: writing "
8006 << "directly to final object" << dendl;
8007 // write directly to final object
8008 cop->results.temp_oid = obs.oi.soid;
8009 _write_copy_chunk(cop, t);
8010 } else {
8011 // finish writing to temp object, then move into place
8012 dout(20) << "fill_in_final_tx: writing to temp object" << dendl;
8013 _write_copy_chunk(cop, t);
8014 t->rename(obs.oi.soid, cop->results.temp_oid);
8015 }
8016 t->setattrs(obs.oi.soid, cop->results.attrs);
8017 });
8018
8019 dout(20) << __func__ << " success; committing" << dendl;
8020
8021 out:
8022 dout(20) << __func__ << " complete r = " << cpp_strerror(r) << dendl;
8023 CopyCallbackResults results(r, &cop->results);
8024 cop->cb->complete(results);
8025
8026 copy_ops.erase(cobc->obs.oi.soid);
8027 cobc->stop_block();
8028
8029 if (r < 0 && cop->results.started_temp_obj) {
8030 dout(10) << __func__ << " deleting partial temp object "
8031 << cop->results.temp_oid << dendl;
8032 ObjectContextRef tempobc = get_object_context(cop->results.temp_oid, true);
8033 OpContextUPtr ctx = simple_opc_create(tempobc);
8034 ctx->op_t->remove(cop->results.temp_oid);
8035 ctx->discard_temp_oid = cop->results.temp_oid;
8036 simple_opc_submit(std::move(ctx));
8037 }
8038
8039 // cancel and requeue proxy ops on this object
8040 if (!r) {
8041 for (map<ceph_tid_t, ProxyReadOpRef>::iterator it = proxyread_ops.begin();
8042 it != proxyread_ops.end();) {
8043 if (it->second->soid == cobc->obs.oi.soid) {
8044 cancel_proxy_read((it++)->second);
8045 } else {
8046 ++it;
8047 }
8048 }
8049 for (map<ceph_tid_t, ProxyWriteOpRef>::iterator it = proxywrite_ops.begin();
8050 it != proxywrite_ops.end();) {
8051 if (it->second->soid == cobc->obs.oi.soid) {
8052 cancel_proxy_write((it++)->second);
8053 } else {
8054 ++it;
8055 }
8056 }
8057 kick_proxy_ops_blocked(cobc->obs.oi.soid);
8058 }
8059
8060 kick_object_context_blocked(cobc);
8061 }
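// Digest note: the per-chunk crc32c calls above chain -- feeding the running
// digest back in as the seed makes the accumulated value equal the crc of
// the concatenated data, so a multi-chunk copy and a whole-object read yield
// the same data_digest. Sketch of the chaining (bufferlist API as used
// above; -1 is this codebase's conventional initial seed):
//
//   uint32_t digest = -1;
//   digest = chunk1.crc32c(digest);   // bufferlist::crc32c(seed)
//   digest = chunk2.crc32c(digest);   // == concat(chunk1,chunk2).crc32c(-1)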
8062
8063 void PrimaryLogPG::_write_copy_chunk(CopyOpRef cop, PGTransaction *t)
8064 {
8065 dout(20) << __func__ << " " << cop
8066 << " " << cop->attrs.size() << " attrs"
8067 << " " << cop->data.length() << " bytes"
8068 << " " << cop->omap_header.length() << " omap header bytes"
8069 << " " << cop->omap_data.length() << " omap data bytes"
8070 << dendl;
8071 if (!cop->temp_cursor.attr_complete) {
8072 t->create(cop->results.temp_oid);
8073 }
8074 if (!cop->temp_cursor.data_complete) {
8075 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8076 cop->cursor.data_offset);
8077 if (pool.info.requires_aligned_append() &&
8078 !cop->cursor.data_complete) {
8079 /**
8080 * Trim off the unaligned bit at the end, we'll adjust cursor.data_offset
8081 * to pick it up on the next pass.
8082 */
8083 assert(cop->temp_cursor.data_offset %
8084 pool.info.required_alignment() == 0);
8085 if (cop->data.length() % pool.info.required_alignment() != 0) {
8086 uint64_t to_trim =
8087 cop->data.length() % pool.info.required_alignment();
8088 bufferlist bl;
8089 bl.substr_of(cop->data, 0, cop->data.length() - to_trim);
8090 cop->data.swap(bl);
8091 cop->cursor.data_offset -= to_trim;
8092 assert(cop->data.length() + cop->temp_cursor.data_offset ==
8093 cop->cursor.data_offset);
8094 }
8095 }
8096 if (cop->data.length()) {
8097 t->write(
8098 cop->results.temp_oid,
8099 cop->temp_cursor.data_offset,
8100 cop->data.length(),
8101 cop->data,
8102 cop->dest_obj_fadvise_flags);
8103 }
8104 cop->data.clear();
8105 }
8106 if (pool.info.supports_omap()) {
8107 if (!cop->temp_cursor.omap_complete) {
8108 if (cop->omap_header.length()) {
8109 t->omap_setheader(
8110 cop->results.temp_oid,
8111 cop->omap_header);
8112 cop->omap_header.clear();
8113 }
8114 if (cop->omap_data.length()) {
8115 map<string,bufferlist> omap;
8116 bufferlist::iterator p = cop->omap_data.begin();
8117 ::decode(omap, p);
8118 t->omap_setkeys(cop->results.temp_oid, omap);
8119 cop->omap_data.clear();
8120 }
8121 }
8122 } else {
8123 assert(cop->omap_header.length() == 0);
8124 assert(cop->omap_data.length() == 0);
8125 }
8126 cop->temp_cursor = cop->cursor;
8127 }
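// Worked example of the unaligned-tail trim above: with
// required_alignment()=4096 and a 10000-byte chunk landing at an aligned
// temp_cursor.data_offset, to_trim = 10000 % 4096 = 1808, so 8192 bytes are
// written now and cursor.data_offset is pulled back by 1808; the next
// copy-get re-fetches the trimmed tail, keeping every append aligned.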
8128
8129 void PrimaryLogPG::finish_copyfrom(OpContext *ctx)
8130 {
8131 dout(20) << "finish_copyfrom on " << ctx->obs->oi.soid << dendl;
8132 ObjectState& obs = ctx->new_obs;
8133 CopyFromCallback *cb = static_cast<CopyFromCallback*>(ctx->copy_cb);
8134
8135 if (obs.exists) {
8136 dout(20) << __func__ << ": exists, removing" << dendl;
8137 ctx->op_t->remove(obs.oi.soid);
8138 } else {
8139 ctx->delta_stats.num_objects++;
8140 obs.exists = true;
8141 }
8142 if (cb->is_temp_obj_used()) {
8143 ctx->discard_temp_oid = cb->results->temp_oid;
8144 }
8145 cb->results->fill_in_final_tx(ctx->op_t.get());
8146
8147 // CopyFromCallback fills this in for us
8148 obs.oi.user_version = ctx->user_at_version;
8149
8150 obs.oi.set_data_digest(cb->results->data_digest);
8151 obs.oi.set_omap_digest(cb->results->omap_digest);
8152
8153 obs.oi.truncate_seq = cb->results->truncate_seq;
8154 obs.oi.truncate_size = cb->results->truncate_size;
8155
8156 ctx->extra_reqids = cb->results->reqids;
8157
8158 // cache: clear whiteout?
8159 if (obs.oi.is_whiteout()) {
8160 dout(10) << __func__ << " clearing whiteout on " << obs.oi.soid << dendl;
8161 obs.oi.clear_flag(object_info_t::FLAG_WHITEOUT);
8162 --ctx->delta_stats.num_whiteouts;
8163 }
8164
8165 if (cb->results->has_omap) {
8166 dout(10) << __func__ << " setting omap flag on " << obs.oi.soid << dendl;
8167 obs.oi.set_flag(object_info_t::FLAG_OMAP);
8168 } else {
8169 dout(10) << __func__ << " clearing omap flag on " << obs.oi.soid << dendl;
8170 obs.oi.clear_flag(object_info_t::FLAG_OMAP);
8171 }
8172
8173 interval_set<uint64_t> ch;
8174 if (obs.oi.size > 0)
8175 ch.insert(0, obs.oi.size);
8176 ctx->modified_ranges.union_of(ch);
8177
8178 if (cb->get_data_size() != obs.oi.size) {
8179 ctx->delta_stats.num_bytes -= obs.oi.size;
8180 obs.oi.size = cb->get_data_size();
8181 ctx->delta_stats.num_bytes += obs.oi.size;
8182 }
8183 ctx->delta_stats.num_wr++;
8184 ctx->delta_stats.num_wr_kb += SHIFT_ROUND_UP(obs.oi.size, 10);
8185
8186 osd->logger->inc(l_osd_copyfrom);
8187 }
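// Stats note: SHIFT_ROUND_UP(x, 10) is a power-of-two ceiling division,
// i.e. the size in KiB rounded up -- (x + (1<<10) - 1) >> 10 assuming the
// usual definition of the macro -- so a 1536-byte object accounts for
// 2 in num_wr_kb.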
8188
8189 void PrimaryLogPG::finish_promote(int r, CopyResults *results,
8190 ObjectContextRef obc)
8191 {
8192 const hobject_t& soid = obc->obs.oi.soid;
8193 dout(10) << __func__ << " " << soid << " r=" << r
8194 << " uv" << results->user_version << dendl;
8195
8196 if (r == -ECANCELED) {
8197 return;
8198 }
8199
8200 if (r != -ENOENT && soid.is_snap()) {
8201 if (results->snaps.empty()) {
8202 // we must have read "snap" content from the head object in
8203 // the base pool. use snap_seq to construct what snaps should
8204 // be for this clone (what it was before we evicted the clean
8205 // clone from this pool, and what it will be when we flush and
8206 // the clone eventually happens in the base pool).
8207 SnapSet& snapset = obc->ssc->snapset;
8208 vector<snapid_t>::iterator p = snapset.snaps.begin();
8209 while (p != snapset.snaps.end() && *p > soid.snap)
8210 ++p;
8211 while (p != snapset.snaps.end() && *p > results->snap_seq) {
8212 results->snaps.push_back(*p);
8213 ++p;
8214 }
8215 }
8216
8217 dout(20) << __func__ << " snaps " << results->snaps << dendl;
8218 filter_snapc(results->snaps);
8219
8220 dout(20) << __func__ << " filtered snaps " << results->snaps << dendl;
8221 if (results->snaps.empty()) {
8222 dout(20) << __func__
8223 << " snaps are empty, clone is invalid,"
8224 << " setting r to ENOENT" << dendl;
8225 r = -ENOENT;
8226 }
8227 }
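// Worked example of the reconstruction and filter above: with snapset.snaps
// = [10,8,6,4] (newest first), soid.snap = 8 and results->snap_seq = 5, the
// first loop skips 10 (> 8) and the second collects 8 and 6 (both >
// snap_seq), giving snaps [8,6] for the promoted clone; filter_snapc then
// drops any of those that were deleted while the clone lived only in the
// base pool, and an empty result marks the clone invalid (-ENOENT).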
8228
8229 if (r < 0 && results->started_temp_obj) {
8230 dout(10) << __func__ << " abort; will clean up partial work" << dendl;
8231 ObjectContextRef tempobc = get_object_context(results->temp_oid, false);
8232 assert(tempobc);
8233 OpContextUPtr ctx = simple_opc_create(tempobc);
8234 ctx->op_t->remove(results->temp_oid);
8235 simple_opc_submit(std::move(ctx));
8236 results->started_temp_obj = false;
8237 }
8238
8239 if (r == -ENOENT && soid.is_snap()) {
8240 dout(10) << __func__
8241 << ": enoent while trying to promote clone, " << soid
8242 << " must have been trimmed, removing from snapset"
8243 << dendl;
8244 hobject_t head(soid.get_head());
8245 ObjectContextRef obc = get_object_context(head, false);
8246 assert(obc);
8247
8248 OpContextUPtr tctx = simple_opc_create(obc);
8249 tctx->at_version = get_next_version();
8250 filter_snapc(tctx->new_snapset.snaps);
8251 vector<snapid_t> new_clones;
8252 map<snapid_t, vector<snapid_t>> new_clone_snaps;
8253 for (vector<snapid_t>::iterator i = tctx->new_snapset.clones.begin();
8254 i != tctx->new_snapset.clones.end();
8255 ++i) {
8256 if (*i != soid.snap) {
8257 new_clones.push_back(*i);
8258 auto p = tctx->new_snapset.clone_snaps.find(*i);
8259 if (p != tctx->new_snapset.clone_snaps.end()) {
8260 new_clone_snaps[*i] = p->second;
8261 }
8262 }
8263 }
8264 tctx->new_snapset.clones.swap(new_clones);
8265 tctx->new_snapset.clone_overlap.erase(soid.snap);
8266 tctx->new_snapset.clone_size.erase(soid.snap);
8267 tctx->new_snapset.clone_snaps.swap(new_clone_snaps);
8268
8269 // take RWWRITE lock for duration of our local write. ignore starvation.
8270 if (!tctx->lock_manager.take_write_lock(
8271 head,
8272 obc)) {
8273 assert(0 == "problem!");
8274 }
8275 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8276
8277 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8278
8279 simple_opc_submit(std::move(tctx));
8280 return;
8281 }
8282
8283 bool whiteout = false;
8284 if (r == -ENOENT) {
8285 assert(soid.snap == CEPH_NOSNAP); // snap case is above
8286 dout(10) << __func__ << " whiteout " << soid << dendl;
8287 whiteout = true;
8288 }
8289
8290 if (r < 0 && !whiteout) {
8291 derr << __func__ << " unexpected promote error " << cpp_strerror(r) << dendl;
8292 // pass error to everyone blocked on this object
8293 // FIXME: this is pretty sloppy, but at this point we got
8294 // something unexpected and don't have many other options.
8295 map<hobject_t,list<OpRequestRef>>::iterator blocked_iter =
8296 waiting_for_blocked_object.find(soid);
8297 if (blocked_iter != waiting_for_blocked_object.end()) {
8298 while (!blocked_iter->second.empty()) {
8299 osd->reply_op_error(blocked_iter->second.front(), r);
8300 blocked_iter->second.pop_front();
8301 }
8302 waiting_for_blocked_object.erase(blocked_iter);
8303 }
8304 return;
8305 }
8306
8307 osd->promote_finish(results->object_size);
8308
8309 OpContextUPtr tctx = simple_opc_create(obc);
8310 tctx->at_version = get_next_version();
8311
8312 ++tctx->delta_stats.num_objects;
8313 if (soid.snap < CEPH_NOSNAP)
8314 ++tctx->delta_stats.num_object_clones;
8315 tctx->new_obs.exists = true;
8316
8317 tctx->extra_reqids = results->reqids;
8318
8319 bool legacy_snapset = tctx->new_snapset.is_legacy() ||
8320 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
8321
8322 if (whiteout) {
8323 // create a whiteout
8324 tctx->op_t->create(soid);
8325 tctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
8326 ++tctx->delta_stats.num_whiteouts;
8327 dout(20) << __func__ << " creating whiteout on " << soid << dendl;
8328 osd->logger->inc(l_osd_tier_whiteout);
8329 } else {
8330 if (results->has_omap) {
8331 dout(10) << __func__ << " setting omap flag on " << soid << dendl;
8332 tctx->new_obs.oi.set_flag(object_info_t::FLAG_OMAP);
8333 ++tctx->delta_stats.num_objects_omap;
8334 }
8335
8336 results->fill_in_final_tx(tctx->op_t.get());
8337 if (results->started_temp_obj) {
8338 tctx->discard_temp_oid = results->temp_oid;
8339 }
8340 tctx->new_obs.oi.size = results->object_size;
8341 tctx->new_obs.oi.user_version = results->user_version;
8342 // It doesn't matter whether the src object has data or omap digests
8343 if (results->object_size)
8344 tctx->new_obs.oi.set_data_digest(results->data_digest);
8345 if (results->has_omap)
8346 tctx->new_obs.oi.set_omap_digest(results->omap_digest);
8347 tctx->new_obs.oi.truncate_seq = results->truncate_seq;
8348 tctx->new_obs.oi.truncate_size = results->truncate_size;
8349
8350 if (soid.snap != CEPH_NOSNAP) {
8351 if (legacy_snapset) {
8352 tctx->new_obs.oi.legacy_snaps = results->snaps;
8353 assert(!tctx->new_obs.oi.legacy_snaps.empty());
8354 } else {
8355 // it's already in the snapset
8356 assert(obc->ssc->snapset.clone_snaps.count(soid.snap));
8357 }
8358 assert(obc->ssc->snapset.clone_size.count(soid.snap));
8359 assert(obc->ssc->snapset.clone_size[soid.snap] ==
8360 results->object_size);
8361 assert(obc->ssc->snapset.clone_overlap.count(soid.snap));
8362
8363 tctx->delta_stats.num_bytes += obc->ssc->snapset.get_clone_bytes(soid.snap);
8364 } else {
8365 tctx->delta_stats.num_bytes += results->object_size;
8366 }
8367 }
8368
8369 if (results->mirror_snapset) {
8370 assert(tctx->new_obs.oi.soid.snap == CEPH_NOSNAP);
8371 tctx->new_snapset.from_snap_set(
8372 results->snapset,
8373 get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS);
8374 }
8375 tctx->new_snapset.head_exists = true;
8376 dout(20) << __func__ << " new_snapset " << tctx->new_snapset << dendl;
8377
8378 // take RWWRITE lock for duration of our local write. ignore starvation.
8379 if (!tctx->lock_manager.take_write_lock(
8380 obc->obs.oi.soid,
8381 obc)) {
8382 assert(0 == "problem!");
8383 }
8384 dout(20) << __func__ << " took lock on obc, " << obc->rwstate << dendl;
8385
8386 finish_ctx(tctx.get(), pg_log_entry_t::PROMOTE);
8387
8388 simple_opc_submit(std::move(tctx));
8389
8390 osd->logger->inc(l_osd_tier_promote);
8391
8392 if (agent_state &&
8393 agent_state->is_idle())
8394 agent_choose_mode();
8395 }
8396
8397 void PrimaryLogPG::cancel_copy(CopyOpRef cop, bool requeue)
8398 {
8399 dout(10) << __func__ << " " << cop->obc->obs.oi.soid
8400 << " from " << cop->src << " " << cop->oloc
8401 << " v" << cop->results.user_version << dendl;
8402
8403 // cancel objecter op, if we can
8404 if (cop->objecter_tid) {
8405 osd->objecter->op_cancel(cop->objecter_tid, -ECANCELED);
8406 cop->objecter_tid = 0;
8407 if (cop->objecter_tid2) {
8408 osd->objecter->op_cancel(cop->objecter_tid2, -ECANCELED);
8409 cop->objecter_tid2 = 0;
8410 }
8411 }
8412
8413 copy_ops.erase(cop->obc->obs.oi.soid);
8414 cop->obc->stop_block();
8415
8416 kick_object_context_blocked(cop->obc);
8417 cop->results.should_requeue = requeue;
8418 CopyCallbackResults result(-ECANCELED, &cop->results);
8419 cop->cb->complete(result);
8420
8421 // There may still be an objecter callback referencing this copy op.
8422 // That callback will not need the obc since it's been canceled, and
8423 // we need the obc reference to go away prior to flush.
8424 cop->obc = ObjectContextRef();
8425 }
8426
8427 void PrimaryLogPG::cancel_copy_ops(bool requeue)
8428 {
8429 dout(10) << __func__ << dendl;
8430 map<hobject_t,CopyOpRef>::iterator p = copy_ops.begin();
8431 while (p != copy_ops.end()) {
8432 // requeue this op? can I queue up all of them?
8433 cancel_copy((p++)->second, requeue);
8434 }
8435 }
8436
8437
8438 // ========================================================================
8439 // flush
8440 //
8441 // Flush a dirty object in the cache tier by writing it back to the
8442 // base tier. The sequence looks like:
8443 //
8444 // * send a copy-from operation to the base tier to copy the current
8445 // version of the object
8446 // * base tier will pull the object via (perhaps multiple) copy-get(s)
8447 // * on completion, we check if the object has been modified. if so,
8448 // just reply with -EAGAIN.
8449 // * try to take a write lock so we can clear the dirty flag. if this
8450 // fails, wait and retry
8451 // * start a repop that clears the bit.
8452 //
8453 // If we have to wait, we will retry by coming back through the
8454 // start_flush method. We check if a flush is already in progress
8455 // and, if so, try to finish it by rechecking the version and trying
8456 // to clear the dirty bit.
8457 //
8458 // In order for the cache-flush (a write op) to not block the copy-get
8459 // from reading the object, the client *must* set the SKIPRWLOCKS
8460 // flag.
8461 //
8462 // NOTE: normally writes are strictly ordered for the client, but
8463 // flushes are special in that they can be reordered with respect to
8464 // other writes. In particular, we can't have a flush request block
8465 // an update to the cache pool object!
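// What the client side of this protocol looks like, as a hedged librados
// sketch (API names per librados.hpp of this era; the object name and the
// surrounding setup are made up for illustration):
//
//   #include "include/rados/librados.hpp"
//   void flush_one(librados::Rados& cluster, librados::IoCtx& cache_ioctx,
//                  const std::string& oid) {
//     librados::ObjectReadOperation op;
//     op.cache_try_flush();  // non-blocking variant; cache_flush() blocks
//     librados::AioCompletion *c = cluster.aio_create_completion();
//     cache_ioctx.aio_operate(oid, c, &op,
//                             librados::OPERATION_IGNORE_CACHE |
//                             librados::OPERATION_IGNORE_OVERLAY |
//                             librados::OPERATION_SKIPRWLOCKS, // see NOTE above
//                             nullptr);
//     c->wait_for_complete();
//     c->release();
//   }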
8466
8467 struct C_Flush : public Context {
8468 PrimaryLogPGRef pg;
8469 hobject_t oid;
8470 epoch_t last_peering_reset;
8471 ceph_tid_t tid;
8472 utime_t start;
8473 C_Flush(PrimaryLogPG *p, hobject_t o, epoch_t lpr)
8474 : pg(p), oid(o), last_peering_reset(lpr),
8475 tid(0), start(ceph_clock_now())
8476 {}
8477 void finish(int r) override {
8478 if (r == -ECANCELED)
8479 return;
8480 pg->lock();
8481 if (last_peering_reset == pg->get_last_peering_reset()) {
8482 pg->finish_flush(oid, tid, r);
8483 pg->osd->logger->tinc(l_osd_tier_flush_lat, ceph_clock_now() - start);
8484 }
8485 pg->unlock();
8486 }
8487 };
8488
8489 int PrimaryLogPG::start_flush(
8490 OpRequestRef op, ObjectContextRef obc,
8491 bool blocking, hobject_t *pmissing,
8492 boost::optional<std::function<void()>> &&on_flush)
8493 {
8494 const object_info_t& oi = obc->obs.oi;
8495 const hobject_t& soid = oi.soid;
8496 dout(10) << __func__ << " " << soid
8497 << " v" << oi.version
8498 << " uv" << oi.user_version
8499 << " " << (blocking ? "blocking" : "non-blocking/best-effort")
8500 << dendl;
8501
8502 // get a filtered snapset; snaps that have since been deleted must be excluded
8503 SnapSet snapset = obc->ssc->snapset.get_filtered(pool.info);
8504
8505 // check that no older clone is still dirty; if one is, it must be flushed first
8506 {
8507 dout(20) << " snapset " << snapset << dendl;
8508 vector<snapid_t>::reverse_iterator p = snapset.clones.rbegin();
8509 while (p != snapset.clones.rend() && *p >= soid.snap)
8510 ++p;
8511 if (p != snapset.clones.rend()) {
8512 hobject_t next = soid;
8513 next.snap = *p;
8514 assert(next.snap < soid.snap);
8515 if (pg_log.get_missing().is_missing(next)) {
8516 dout(10) << __func__ << " missing clone is " << next << dendl;
8517 if (pmissing)
8518 *pmissing = next;
8519 return -ENOENT;
8520 }
8521 ObjectContextRef older_obc = get_object_context(next, false);
8522 if (older_obc) {
8523 dout(20) << __func__ << " next oldest clone is " << older_obc->obs.oi
8524 << dendl;
8525 if (older_obc->obs.oi.is_dirty()) {
8526 dout(10) << __func__ << " next oldest clone is dirty: "
8527 << older_obc->obs.oi << dendl;
8528 return -EBUSY;
8529 }
8530 } else {
8531 dout(20) << __func__ << " next oldest clone " << next
8532 << " is not present; implicitly clean" << dendl;
8533 }
8534 } else {
8535 dout(20) << __func__ << " no older clones" << dendl;
8536 }
8537 }
8538
8539 if (blocking)
8540 obc->start_block();
8541
8542 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(soid);
8543 if (p != flush_ops.end()) {
8544 FlushOpRef fop = p->second;
8545 if (fop->op == op) {
8546 // we couldn't take the write lock on a cache-try-flush before;
8547 // now we are trying again for the lock.
8548 return try_flush_mark_clean(fop);
8549 }
8550 if (fop->flushed_version == obc->obs.oi.user_version &&
8551 (fop->blocking || !blocking)) {
8552 // nonblocking can join anything
8553 // blocking can only join a blocking flush
8554 dout(20) << __func__ << " piggybacking on existing flush " << dendl;
8555 if (op)
8556 fop->dup_ops.push_back(op);
8557 return -EAGAIN; // clean up this ctx; op will retry later
8558 }
8559
8560 // cancel current flush since it will fail anyway, or because we
8561 // are blocking and the existing flush is nonblocking.
8562 dout(20) << __func__ << " canceling previous flush; it will fail" << dendl;
8563 if (fop->op)
8564 osd->reply_op_error(fop->op, -EBUSY);
8565 while (!fop->dup_ops.empty()) {
8566 osd->reply_op_error(fop->dup_ops.front(), -EBUSY);
8567 fop->dup_ops.pop_front();
8568 }
8569 cancel_flush(fop, false);
8570 }
8571
8572 /**
8573 * In general, we need to send a delete and a copyfrom.
8574 * Consider snapc 10:[10, 9, 8, 4, 3, 2]:[10(10, 9), 4(4,3,2)]
8575 * where 4 is marked as clean. To flush 10, we have to:
8576 * 1) delete 4:[4,3,2] -- Logically, the object does not exist after 4
8577 * 2) copyfrom 8:[8,4,3,2] -- flush object after snap 8
8578 *
8579 * There is a complicating case. Suppose there had been a clone 7
8580 * for snaps [7, 6] which has been trimmed since they no longer exist.
8581 * In the base pool, we'd have 5:[4,3,2]:[4(4,3,2)]+head. When we submit
8582 * the delete, the snap will be promoted to 5, and the head will become
8583 * a snapdir. When the copy-from goes through, we'll end up with
8584 * 8:[8,4,3,2]:[4(4,3,2)]+head.
8585 *
8586 * Another complication is the case where there is an interval change
8587 * after doing the delete and the flush but before marking the object
8588 * clean. We'll happily delete head and then recreate it at the same
8589 * sequence number, which works out ok.
8590 */
8591
8592 SnapContext snapc, dsnapc;
8593 if (snapset.seq != 0) {
8594 if (soid.snap == CEPH_NOSNAP) {
8595 snapc.seq = snapset.seq;
8596 snapc.snaps = snapset.snaps;
8597 } else {
8598 snapid_t min_included_snap;
8599 if (snapset.is_legacy()) {
8600 min_included_snap = oi.legacy_snaps.back();
8601 } else {
8602 auto p = snapset.clone_snaps.find(soid.snap);
8603 assert(p != snapset.clone_snaps.end());
8604 min_included_snap = p->second.back();
8605 }
8606 snapc = snapset.get_ssc_as_of(min_included_snap - 1);
8607 }
8608
8609 snapid_t prev_snapc = 0;
8610 for (vector<snapid_t>::reverse_iterator citer = snapset.clones.rbegin();
8611 citer != snapset.clones.rend();
8612 ++citer) {
8613 if (*citer < soid.snap) {
8614 prev_snapc = *citer;
8615 break;
8616 }
8617 }
8618
8619 dsnapc = snapset.get_ssc_as_of(prev_snapc);
8620 }
8621
8622 object_locator_t base_oloc(soid);
8623 base_oloc.pool = pool.info.tier_of;
8624
8625 if (dsnapc.seq < snapc.seq) {
8626 ObjectOperation o;
8627 o.remove();
8628 osd->objecter->mutate(
8629 soid.oid,
8630 base_oloc,
8631 o,
8632 dsnapc,
8633 ceph::real_clock::from_ceph_timespec(oi.mtime),
8634 (CEPH_OSD_FLAG_IGNORE_OVERLAY |
8635 CEPH_OSD_FLAG_ENFORCE_SNAPC),
8636 NULL /* no callback, we'll rely on the ordering w.r.t the next op */);
8637 }
8638
8639 FlushOpRef fop(std::make_shared<FlushOp>());
8640 fop->obc = obc;
8641 fop->flushed_version = oi.user_version;
8642 fop->blocking = blocking;
8643 fop->on_flush = std::move(on_flush);
8644 fop->op = op;
8645
8646 ObjectOperation o;
8647 if (oi.is_whiteout()) {
8648 fop->removal = true;
8649 o.remove();
8650 } else {
8651 object_locator_t oloc(soid);
8652 o.copy_from(soid.oid.name, soid.snap, oloc, oi.user_version,
8653 CEPH_OSD_COPY_FROM_FLAG_FLUSH |
8654 CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY |
8655 CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE |
8656 CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE,
8657 LIBRADOS_OP_FLAG_FADVISE_SEQUENTIAL|LIBRADOS_OP_FLAG_FADVISE_NOCACHE);
8658
8659 // hint that the base tier need not cache this data after the flush
8660 if (agent_state && agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL)
8661 o.set_last_op_flags(LIBRADOS_OP_FLAG_FADVISE_DONTNEED);
8662 }
8663 C_Flush *fin = new C_Flush(this, soid, get_last_peering_reset());
8664
8665 ceph_tid_t tid = osd->objecter->mutate(
8666 soid.oid, base_oloc, o, snapc,
8667 ceph::real_clock::from_ceph_timespec(oi.mtime),
8668 CEPH_OSD_FLAG_IGNORE_OVERLAY | CEPH_OSD_FLAG_ENFORCE_SNAPC,
8669 new C_OnFinisher(fin,
8670 &osd->objecter_finisher));
8671 /* we're under the pg lock and fin->finish() is grabbing that */
8672 fin->tid = tid;
8673 fop->objecter_tid = tid;
8674
8675 flush_ops[soid] = fop;
8676 info.stats.stats.sum.num_flush++;
8677 info.stats.stats.sum.num_flush_kb += SHIFT_ROUND_UP(oi.size, 10);
8678 return -EINPROGRESS;
8679 }
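// Worked trace of the snapc/dsnapc selection above, using the example from
// the comment block: snapset 10:[10,9,8,4,3,2]:[10(10,9),4(4,3,2)], flushing
// clone 10 whose snaps are [10,9]. min_included_snap = 9, so snapc =
// get_ssc_as_of(8) = 8:[8,4,3,2]; the newest clone below 10 is 4, so dsnapc =
// get_ssc_as_of(4) = 4:[4,3,2]. Since dsnapc.seq(4) < snapc.seq(8), we first
// send the delete with 4:[4,3,2] and then the copy-from with 8:[8,4,3,2] --
// exactly steps 1) and 2) of the comment above.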
8680
8681 void PrimaryLogPG::finish_flush(hobject_t oid, ceph_tid_t tid, int r)
8682 {
8683 dout(10) << __func__ << " " << oid << " tid " << tid
8684 << " " << cpp_strerror(r) << dendl;
8685 map<hobject_t,FlushOpRef>::iterator p = flush_ops.find(oid);
8686 if (p == flush_ops.end()) {
8687 dout(10) << __func__ << " no flush_op found" << dendl;
8688 return;
8689 }
8690 FlushOpRef fop = p->second;
8691 if (tid != fop->objecter_tid) {
8692 dout(10) << __func__ << " tid " << tid << " != fop " << fop
8693 << " tid " << fop->objecter_tid << dendl;
8694 return;
8695 }
8696 ObjectContextRef obc = fop->obc;
8697 fop->objecter_tid = 0;
8698
8699 if (r < 0 && !(r == -ENOENT && fop->removal)) {
8700 if (fop->op)
8701 osd->reply_op_error(fop->op, -EBUSY);
8702 if (fop->blocking) {
8703 obc->stop_block();
8704 kick_object_context_blocked(obc);
8705 }
8706
8707 if (!fop->dup_ops.empty()) {
8708 dout(20) << __func__ << " requeueing dups" << dendl;
8709 requeue_ops(fop->dup_ops);
8710 }
8711 if (fop->on_flush) {
8712 (*(fop->on_flush))();
8713 fop->on_flush = boost::none;
8714 }
8715 flush_ops.erase(oid);
8716 return;
8717 }
8718
8719 r = try_flush_mark_clean(fop);
8720 if (r == -EBUSY && fop->op) {
8721 osd->reply_op_error(fop->op, r);
8722 }
8723 }
8724
8725 int PrimaryLogPG::try_flush_mark_clean(FlushOpRef fop)
8726 {
8727 ObjectContextRef obc = fop->obc;
8728 const hobject_t& oid = obc->obs.oi.soid;
8729
8730 if (fop->blocking) {
8731 obc->stop_block();
8732 kick_object_context_blocked(obc);
8733 }
8734
8735 if (fop->flushed_version != obc->obs.oi.user_version ||
8736 !obc->obs.exists) {
8737 if (obc->obs.exists)
8738 dout(10) << __func__ << " flushed_version " << fop->flushed_version
8739 << " != current " << obc->obs.oi.user_version
8740 << dendl;
8741 else
8742 dout(10) << __func__ << " object no longer exists" << dendl;
8743
8744 if (!fop->dup_ops.empty()) {
8745 dout(20) << __func__ << " requeueing dups" << dendl;
8746 requeue_ops(fop->dup_ops);
8747 }
8748 if (fop->on_flush) {
8749 (*(fop->on_flush))();
8750 fop->on_flush = boost::none;
8751 }
8752 flush_ops.erase(oid);
8753 if (fop->blocking)
8754 osd->logger->inc(l_osd_tier_flush_fail);
8755 else
8756 osd->logger->inc(l_osd_tier_try_flush_fail);
8757 return -EBUSY;
8758 }
8759
8760 if (!fop->blocking &&
8761 scrubber.write_blocked_by_scrub(oid)) {
8762 if (fop->op) {
8763 dout(10) << __func__ << " blocked by scrub" << dendl;
8764 requeue_op(fop->op);
8765 requeue_ops(fop->dup_ops);
8766 return -EAGAIN; // will retry
8767 } else {
8768 osd->logger->inc(l_osd_tier_try_flush_fail);
8769 cancel_flush(fop, false);
8770 return -ECANCELED;
8771 }
8772 }
8773
8774 // successfully flushed, can we evict this object?
8775 if (!fop->op && agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
8776 agent_maybe_evict(obc, true)) {
8777 osd->logger->inc(l_osd_tier_clean);
8778 if (fop->on_flush) {
8779 (*(fop->on_flush))();
8780 fop->on_flush = boost::none;
8781 }
8782 flush_ops.erase(oid);
8783 return 0;
8784 }
8785
8786 dout(10) << __func__ << " clearing DIRTY flag for " << oid << dendl;
8787 OpContextUPtr ctx = simple_opc_create(fop->obc);
8788
8789 // successfully flushed; can we clear the dirty bit?
8790 // try to take the lock manually, since we don't
8791 // have a ctx yet.
8792 if (ctx->lock_manager.get_lock_type(
8793 ObjectContext::RWState::RWWRITE,
8794 oid,
8795 obc,
8796 fop->op)) {
8797 dout(20) << __func__ << " took write lock" << dendl;
8798 } else if (fop->op) {
8799 dout(10) << __func__ << " waiting on write lock" << dendl;
8800 close_op_ctx(ctx.release());
8801 requeue_op(fop->op);
8802 requeue_ops(fop->dup_ops);
8803 return -EAGAIN; // will retry
8804 } else {
8805 dout(10) << __func__ << " failed write lock, no op; failing" << dendl;
8806 close_op_ctx(ctx.release());
8807 osd->logger->inc(l_osd_tier_try_flush_fail);
8808 cancel_flush(fop, false);
8809 return -ECANCELED;
8810 }
8811
8812 if (fop->on_flush) {
8813 ctx->register_on_finish(*(fop->on_flush));
8814 fop->on_flush = boost::none;
8815 }
8816
8817 ctx->at_version = get_next_version();
8818
8819 ctx->new_obs = obc->obs;
8820 ctx->new_obs.oi.clear_flag(object_info_t::FLAG_DIRTY);
8821 --ctx->delta_stats.num_objects_dirty;
8822
8823 finish_ctx(ctx.get(), pg_log_entry_t::CLEAN);
8824
8825 osd->logger->inc(l_osd_tier_clean);
8826
8827 if (!fop->dup_ops.empty() || fop->op) {
8828 dout(20) << __func__ << " requeueing for " << ctx->at_version << dendl;
8829 list<OpRequestRef> ls;
8830 if (fop->op)
8831 ls.push_back(fop->op);
8832 ls.splice(ls.end(), fop->dup_ops);
8833 requeue_ops(ls);
8834 }
8835
8836 simple_opc_submit(std::move(ctx));
8837
8838 flush_ops.erase(oid);
8839
8840 if (fop->blocking)
8841 osd->logger->inc(l_osd_tier_flush);
8842 else
8843 osd->logger->inc(l_osd_tier_try_flush);
8844
8845 return -EINPROGRESS;
8846 }
8847
8848 void PrimaryLogPG::cancel_flush(FlushOpRef fop, bool requeue)
8849 {
8850 dout(10) << __func__ << " " << fop->obc->obs.oi.soid << " tid "
8851 << fop->objecter_tid << dendl;
8852 if (fop->objecter_tid) {
8853 osd->objecter->op_cancel(fop->objecter_tid, -ECANCELED);
8854 fop->objecter_tid = 0;
8855 }
8856 if (fop->blocking) {
8857 fop->obc->stop_block();
8858 kick_object_context_blocked(fop->obc);
8859 }
8860 if (requeue) {
8861 if (fop->op)
8862 requeue_op(fop->op);
8863 requeue_ops(fop->dup_ops);
8864 }
8865 if (fop->on_flush) {
8866 (*(fop->on_flush))();
8867 fop->on_flush = boost::none;
8868 }
8869 flush_ops.erase(fop->obc->obs.oi.soid);
8870 }
8871
8872 void PrimaryLogPG::cancel_flush_ops(bool requeue)
8873 {
8874 dout(10) << __func__ << dendl;
8875 map<hobject_t,FlushOpRef>::iterator p = flush_ops.begin();
8876 while (p != flush_ops.end()) {
8877 cancel_flush((p++)->second, requeue);
8878 }
8879 }
8880
8881 bool PrimaryLogPG::is_present_clone(hobject_t coid)
8882 {
8883 if (!pool.info.allow_incomplete_clones())
8884 return true;
8885 if (is_missing_object(coid))
8886 return true;
8887 ObjectContextRef obc = get_object_context(coid, false);
8888 return obc && obc->obs.exists;
8889 }
8890
8891 // ========================================================================
8892 // rep op gather
8893
8894 class C_OSD_RepopApplied : public Context {
8895 PrimaryLogPGRef pg;
8896 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
8897 public:
8898 C_OSD_RepopApplied(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
8899 : pg(pg), repop(repop) {}
8900 void finish(int) override {
8901 pg->repop_all_applied(repop.get());
8902 }
8903 };
8904
8905
8906 void PrimaryLogPG::repop_all_applied(RepGather *repop)
8907 {
8908 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all applied "
8909 << dendl;
8910 assert(!repop->applies_with_commit);
8911 repop->all_applied = true;
8912 if (!repop->rep_aborted) {
8913 eval_repop(repop);
8914 }
8915 }
8916
8917 class C_OSD_RepopCommit : public Context {
8918 PrimaryLogPGRef pg;
8919 boost::intrusive_ptr<PrimaryLogPG::RepGather> repop;
8920 public:
8921 C_OSD_RepopCommit(PrimaryLogPG *pg, PrimaryLogPG::RepGather *repop)
8922 : pg(pg), repop(repop) {}
8923 void finish(int) override {
8924 pg->repop_all_committed(repop.get());
8925 }
8926 };
8927
8928 void PrimaryLogPG::repop_all_committed(RepGather *repop)
8929 {
8930 dout(10) << __func__ << ": repop tid " << repop->rep_tid << " all committed "
8931 << dendl;
8932 repop->all_committed = true;
8933 if (repop->applies_with_commit) {
8934 assert(!repop->all_applied);
8935 repop->all_applied = true;
8936 }
8937
8938 if (!repop->rep_aborted) {
8939 if (repop->v != eversion_t()) {
8940 last_update_ondisk = repop->v;
8941 last_complete_ondisk = repop->pg_local_last_complete;
8942 }
8943 eval_repop(repop);
8944 }
8945 }
8946
8947 void PrimaryLogPG::op_applied(const eversion_t &applied_version)
8948 {
8949 dout(10) << "op_applied version " << applied_version << dendl;
8950 if (applied_version == eversion_t())
8951 return;
8952 assert(applied_version > last_update_applied);
8953 assert(applied_version <= info.last_update);
8954 last_update_applied = applied_version;
8955 if (is_primary()) {
8956 if (scrubber.active) {
8957 if (last_update_applied == scrubber.subset_last_update) {
8958 if (ops_blocked_by_scrub()) {
8959 requeue_scrub(true);
8960 } else {
8961 requeue_scrub(false);
8962 }
8963
8964 }
8965 } else {
8966 assert(scrubber.start == scrubber.end);
8967 }
8968 } else {
8969 if (scrubber.active_rep_scrub) {
8970 if (last_update_applied == static_cast<const MOSDRepScrub*>(
8971 scrubber.active_rep_scrub->get_req())->scrub_to) {
8972 osd->enqueue_back(
8973 info.pgid,
8974 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
8975 scrubber.active_rep_scrub = OpRequestRef();
8976 }
8977 }
8978 }
8979 }
8980
8981 void PrimaryLogPG::eval_repop(RepGather *repop)
8982 {
8983 const MOSDOp *m = NULL;
8984 if (repop->op)
8985 m = static_cast<const MOSDOp *>(repop->op->get_req());
8986
8987 if (m)
8988 dout(10) << "eval_repop " << *repop
8989 << (repop->rep_done ? " DONE" : "")
8990 << dendl;
8991 else
8992 dout(10) << "eval_repop " << *repop << " (no op)"
8993 << (repop->rep_done ? " DONE" : "")
8994 << dendl;
8995
8996 if (repop->rep_done)
8997 return;
8998
8999 // ondisk?
9000 if (repop->all_committed) {
9001 dout(10) << " commit: " << *repop << dendl;
9002 for (auto p = repop->on_committed.begin();
9003 p != repop->on_committed.end();
9004 repop->on_committed.erase(p++)) {
9005 (*p)();
9006 }
9007 // send dup commits, in order
9008 if (waiting_for_ondisk.count(repop->v)) {
9009 assert(waiting_for_ondisk.begin()->first == repop->v);
9010 for (list<pair<OpRequestRef, version_t> >::iterator i =
9011 waiting_for_ondisk[repop->v].begin();
9012 i != waiting_for_ondisk[repop->v].end();
9013 ++i) {
9014 osd->reply_op_error(i->first, repop->r, repop->v,
9015 i->second);
9016 }
9017 waiting_for_ondisk.erase(repop->v);
9018 }
9019 }
9020
9021 // applied?
9022 if (repop->all_applied) {
9023 if (repop->applies_with_commit) {
9024 assert(repop->on_applied.empty());
9025 }
9026 dout(10) << " applied: " << *repop << " " << dendl;
9027 for (auto p = repop->on_applied.begin();
9028 p != repop->on_applied.end();
9029 repop->on_applied.erase(p++)) {
9030 (*p)();
9031 }
9032 }
9033
9034 // done.
9035 if (repop->all_applied && repop->all_committed) {
9036 repop->rep_done = true;
9037
9038 publish_stats_to_osd();
9039 calc_min_last_complete_ondisk();
9040
9041 dout(10) << " removing " << *repop << dendl;
9042 assert(!repop_queue.empty());
9043 dout(20) << " q front is " << *repop_queue.front() << dendl;
9044 if (repop_queue.front() != repop) {
9045 if (!repop->applies_with_commit) {
9046 dout(0) << " removing " << *repop << dendl;
9047 dout(0) << " q front is " << *repop_queue.front() << dendl;
9048 assert(repop_queue.front() == repop);
9049 }
9050 } else {
9051 RepGather *to_remove = nullptr;
9052 while (!repop_queue.empty() &&
9053 (to_remove = repop_queue.front())->rep_done) {
9054 repop_queue.pop_front();
9055 for (auto p = to_remove->on_success.begin();
9056 p != to_remove->on_success.end();
9057 to_remove->on_success.erase(p++)) {
9058 (*p)();
9059 }
9060 remove_repop(to_remove);
9061 }
9062 }
9063 }
9064 }
9065
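// Hand the prepared transaction to the backend.  Peer last_update /
// last_complete bookkeeping and projected_log are advanced here;
// at_version is asserted not to regress past projected_last_update.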
9066 void PrimaryLogPG::issue_repop(RepGather *repop, OpContext *ctx)
9067 {
9068 FUNCTRACE();
9069 const hobject_t& soid = ctx->obs->oi.soid;
9070 dout(7) << "issue_repop rep_tid " << repop->rep_tid
9071 << " o " << soid
9072 << dendl;
9073
9074 repop->v = ctx->at_version;
9075 if (ctx->at_version > eversion_t()) {
9076 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
9077 i != actingbackfill.end();
9078 ++i) {
9079 if (*i == get_primary()) continue;
9080 pg_info_t &pinfo = peer_info[*i];
9081 // keep peer_info up to date
9082 if (pinfo.last_complete == pinfo.last_update)
9083 pinfo.last_complete = ctx->at_version;
9084 pinfo.last_update = ctx->at_version;
9085 }
9086 }
9087
9088 ctx->obc->ondisk_write_lock();
9089
9090 bool unlock_snapset_obc = false;
9091 ctx->op_t->add_obc(ctx->obc);
9092 if (ctx->clone_obc) {
9093 ctx->clone_obc->ondisk_write_lock();
9094 ctx->op_t->add_obc(ctx->clone_obc);
9095 }
9096 if (ctx->snapset_obc && ctx->snapset_obc->obs.oi.soid !=
9097 ctx->obc->obs.oi.soid) {
9098 ctx->snapset_obc->ondisk_write_lock();
9099 unlock_snapset_obc = true;
9100 ctx->op_t->add_obc(ctx->snapset_obc);
9101 }
9102
9103 Context *on_all_commit = new C_OSD_RepopCommit(this, repop);
9104 Context *on_all_applied = new C_OSD_RepopApplied(this, repop);
9105 Context *onapplied_sync = new C_OSD_OndiskWriteUnlock(
9106 ctx->obc,
9107 ctx->clone_obc,
9108 unlock_snapset_obc ? ctx->snapset_obc : ObjectContextRef());
9109 if (!(ctx->log.empty())) {
9110 assert(ctx->at_version >= projected_last_update);
9111 projected_last_update = ctx->at_version;
9112 }
9113 for (auto &&entry: ctx->log) {
9114 projected_log.add(entry);
9115 }
9116 pgbackend->submit_transaction(
9117 soid,
9118 ctx->delta_stats,
9119 ctx->at_version,
9120 std::move(ctx->op_t),
9121 pg_trim_to,
9122 min_last_complete_ondisk,
9123 ctx->log,
9124 ctx->updated_hset_history,
9125 onapplied_sync,
9126 on_all_applied,
9127 on_all_commit,
9128 repop->rep_tid,
9129 ctx->reqid,
9130 ctx->op);
9131 }
9132
9133 PrimaryLogPG::RepGather *PrimaryLogPG::new_repop(
9134 OpContext *ctx, ObjectContextRef obc,
9135 ceph_tid_t rep_tid)
9136 {
9137 if (ctx->op)
9138 dout(10) << "new_repop rep_tid " << rep_tid << " on " << *ctx->op->get_req() << dendl;
9139 else
9140 dout(10) << "new_repop rep_tid " << rep_tid << " (no op)" << dendl;
9141
9142 RepGather *repop = new RepGather(
9143 ctx, rep_tid, info.last_complete, false);
9144
9145 repop->start = ceph_clock_now();
9146
9147 repop_queue.push_back(&repop->queue_item);
9148 repop->get();
9149
9150 osd->logger->inc(l_osd_op_wip);
9151
9152 dout(10) << __func__ << ": " << *repop << dendl;
9153 return repop;
9154 }
9155
9156 boost::intrusive_ptr<PrimaryLogPG::RepGather> PrimaryLogPG::new_repop(
9157 eversion_t version,
9158 int r,
9159 ObcLockManager &&manager,
9160 OpRequestRef &&op,
9161 boost::optional<std::function<void(void)> > &&on_complete)
9162 {
9163 RepGather *repop = new RepGather(
9164 std::move(manager),
9165 std::move(op),
9166 std::move(on_complete),
9167 osd->get_tid(),
9168 info.last_complete,
9169 true,
9170 r);
9171 repop->v = version;
9172
9173 repop->start = ceph_clock_now();
9174
9175 repop_queue.push_back(&repop->queue_item);
9176
9177 osd->logger->inc(l_osd_op_wip);
9178
9179 dout(10) << __func__ << ": " << *repop << dendl;
9180 return boost::intrusive_ptr<RepGather>(repop);
9181 }
9182
9183 void PrimaryLogPG::remove_repop(RepGather *repop)
9184 {
9185 dout(20) << __func__ << " " << *repop << dendl;
9186
9187 for (auto p = repop->on_finish.begin();
9188 p != repop->on_finish.end();
9189 repop->on_finish.erase(p++)) {
9190 (*p)();
9191 }
9192
9193 release_object_locks(
9194 repop->lock_manager);
9195 repop->put();
9196
9197 osd->logger->dec(l_osd_op_wip);
9198 }
9199
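// simple_opc_create()/simple_opc_submit() are a convenience pair for
// internally generated mutations that have no client MOSDOp.  A
// minimal usage sketch (mirroring handle_watch_timeout() below):
//
//   OpContextUPtr ctx = simple_opc_create(obc);
//   ctx->at_version = get_next_version();
//   // ... mutate ctx->new_obs, append to ctx->log, fill in ctx->op_t ...
//   simple_opc_submit(std::move(ctx));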
9200 PrimaryLogPG::OpContextUPtr PrimaryLogPG::simple_opc_create(ObjectContextRef obc)
9201 {
9202 dout(20) << __func__ << " " << obc->obs.oi.soid << dendl;
9203 vector<OSDOp> ops;
9204 ceph_tid_t rep_tid = osd->get_tid();
9205 osd_reqid_t reqid(osd->get_cluster_msgr_name(), 0, rep_tid);
9206 OpContextUPtr ctx(new OpContext(OpRequestRef(), reqid, ops, obc, this));
9207 ctx->op_t.reset(new PGTransaction());
9208 ctx->mtime = ceph_clock_now();
9209 return ctx;
9210 }
9211
9212 void PrimaryLogPG::simple_opc_submit(OpContextUPtr ctx)
9213 {
9214 RepGather *repop = new_repop(ctx.get(), ctx->obc, ctx->reqid.tid);
9215 dout(20) << __func__ << " " << repop << dendl;
9216 issue_repop(repop, ctx.get());
9217 eval_repop(repop);
9218 repop->put();
9219 }
9220
9221
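// Replicate a batch of log entries (no object data) to all acting
// and backfill shards.  On clusters requiring >= jewel this goes out
// as MOSDPGUpdateLogMissing and acks are tracked in
// log_entry_update_waiting_on; older peers instead receive the
// entries as an MOSDPGLog tail, and completion is driven locally.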
9222 void PrimaryLogPG::submit_log_entries(
9223 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
9224 ObcLockManager &&manager,
9225 boost::optional<std::function<void(void)> > &&_on_complete,
9226 OpRequestRef op,
9227 int r)
9228 {
9229 dout(10) << __func__ << " " << entries << dendl;
9230 assert(is_primary());
9231
9232 eversion_t version;
9233 if (!entries.empty()) {
9234 assert(entries.rbegin()->version >= projected_last_update);
9235 version = projected_last_update = entries.rbegin()->version;
9236 }
9237
9238 boost::intrusive_ptr<RepGather> repop;
9239 boost::optional<std::function<void(void)> > on_complete;
9240 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9241 repop = new_repop(
9242 version,
9243 r,
9244 std::move(manager),
9245 std::move(op),
9246 std::move(_on_complete));
9247 } else {
9248 on_complete = std::move(_on_complete);
9249 }
9250
9251 pgbackend->call_write_ordered(
9252 [this, entries, repop, on_complete]() {
9253 ObjectStore::Transaction t;
9254 eversion_t old_last_update = info.last_update;
9255 merge_new_log_entries(entries, t);
9256
9257
9258 set<pg_shard_t> waiting_on;
9259 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
9260 i != actingbackfill.end();
9261 ++i) {
9262 pg_shard_t peer(*i);
9263 if (peer == pg_whoami) continue;
9264 assert(peer_missing.count(peer));
9265 assert(peer_info.count(peer));
9266 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9267 assert(repop);
9268 MOSDPGUpdateLogMissing *m = new MOSDPGUpdateLogMissing(
9269 entries,
9270 spg_t(info.pgid.pgid, i->shard),
9271 pg_whoami.shard,
9272 get_osdmap()->get_epoch(),
9273 last_peering_reset,
9274 repop->rep_tid);
9275 osd->send_message_osd_cluster(
9276 peer.osd, m, get_osdmap()->get_epoch());
9277 waiting_on.insert(peer);
9278 } else {
9279 MOSDPGLog *m = new MOSDPGLog(
9280 peer.shard, pg_whoami.shard,
9281 info.last_update.epoch,
9282 info);
9283 m->log.log = entries;
9284 m->log.tail = old_last_update;
9285 m->log.head = info.last_update;
9286 osd->send_message_osd_cluster(
9287 peer.osd, m, get_osdmap()->get_epoch());
9288 }
9289 }
9290 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
9291 ceph_tid_t rep_tid = repop->rep_tid;
9292 waiting_on.insert(pg_whoami);
9293 log_entry_update_waiting_on.insert(
9294 make_pair(
9295 rep_tid,
9296 LogUpdateCtx{std::move(repop), std::move(waiting_on)}
9297 ));
9298 struct OnComplete : public Context {
9299 PrimaryLogPGRef pg;
9300 ceph_tid_t rep_tid;
9301 epoch_t epoch;
9302 OnComplete(
9303 PrimaryLogPGRef pg,
9304 ceph_tid_t rep_tid,
9305 epoch_t epoch)
9306 : pg(pg), rep_tid(rep_tid), epoch(epoch) {}
9307 void finish(int) override {
9308 pg->lock();
9309 if (!pg->pg_has_reset_since(epoch)) {
9310 auto it = pg->log_entry_update_waiting_on.find(rep_tid);
9311 assert(it != pg->log_entry_update_waiting_on.end());
9312 auto it2 = it->second.waiting_on.find(pg->pg_whoami);
9313 assert(it2 != it->second.waiting_on.end());
9314 it->second.waiting_on.erase(it2);
9315 if (it->second.waiting_on.empty()) {
9316 pg->repop_all_committed(it->second.repop.get());
9317 pg->log_entry_update_waiting_on.erase(it);
9318 }
9319 }
9320 pg->unlock();
9321 }
9322 };
9323 t.register_on_commit(
9324 new OnComplete{this, rep_tid, get_osdmap()->get_epoch()});
9325 } else {
9326 if (on_complete) {
9327 struct OnComplete : public Context {
9328 PrimaryLogPGRef pg;
9329 std::function<void(void)> on_complete;
9330 epoch_t epoch;
9331 OnComplete(
9332 PrimaryLogPGRef pg,
9333             std::function<void(void)> on_complete,
9334 epoch_t epoch)
9335 : pg(pg),
9336 on_complete(std::move(on_complete)),
9337 epoch(epoch) {}
9338 void finish(int) override {
9339 pg->lock();
9340 if (!pg->pg_has_reset_since(epoch))
9341 on_complete();
9342 pg->unlock();
9343 }
9344 };
9345 t.register_on_complete(
9346 new OnComplete{
9347 this, *on_complete, get_osdmap()->get_epoch()
9348 });
9349 }
9350 }
9351 t.register_on_applied(
9352 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
9353 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
9354 assert(r == 0);
9355 });
9356 }
9357
9358 void PrimaryLogPG::cancel_log_updates()
9359 {
9360 // get rid of all the LogUpdateCtx so their references to repops are
9361 // dropped
9362 log_entry_update_waiting_on.clear();
9363 }
9364
9365 // -------------------------------------------------------
9366
9367 void PrimaryLogPG::get_watchers(list<obj_watch_item_t> &pg_watchers)
9368 {
9369 pair<hobject_t, ObjectContextRef> i;
9370 while (object_contexts.get_next(i.first, &i)) {
9371 ObjectContextRef obc(i.second);
9372 get_obc_watchers(obc, pg_watchers);
9373 }
9374 }
9375
9376 void PrimaryLogPG::get_obc_watchers(ObjectContextRef obc, list<obj_watch_item_t> &pg_watchers)
9377 {
9378 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9379 obc->watchers.begin();
9380 j != obc->watchers.end();
9381 ++j) {
9382 obj_watch_item_t owi;
9383
9384 owi.obj = obc->obs.oi.soid;
9385 owi.wi.addr = j->second->get_peer_addr();
9386 owi.wi.name = j->second->get_entity();
9387 owi.wi.cookie = j->second->get_cookie();
9388 owi.wi.timeout_seconds = j->second->get_timeout();
9389
9390 dout(30) << "watch: Found oid=" << owi.obj << " addr=" << owi.wi.addr
9391 << " name=" << owi.wi.name << " cookie=" << owi.wi.cookie << dendl;
9392
9393 pg_watchers.push_back(owi);
9394 }
9395 }
9396
9397 void PrimaryLogPG::check_blacklisted_watchers()
9398 {
9399 dout(20) << "PrimaryLogPG::check_blacklisted_watchers for pg " << get_pgid() << dendl;
9400 pair<hobject_t, ObjectContextRef> i;
9401 while (object_contexts.get_next(i.first, &i))
9402 check_blacklisted_obc_watchers(i.second);
9403 }
9404
9405 void PrimaryLogPG::check_blacklisted_obc_watchers(ObjectContextRef obc)
9406 {
9407 dout(20) << "PrimaryLogPG::check_blacklisted_obc_watchers for obc " << obc->obs.oi.soid << dendl;
9408 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator k =
9409 obc->watchers.begin();
9410 k != obc->watchers.end();
9411 ) {
9412     // Advance iterator now so handle_watch_timeout() can erase the element
9413 map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j = k++;
9414 dout(30) << "watch: Found " << j->second->get_entity() << " cookie " << j->second->get_cookie() << dendl;
9415 entity_addr_t ea = j->second->get_peer_addr();
9416 dout(30) << "watch: Check entity_addr_t " << ea << dendl;
9417 if (get_osdmap()->is_blacklisted(ea)) {
9418 dout(10) << "watch: Found blacklisted watcher for " << ea << dendl;
9419 assert(j->second->get_pg() == this);
9420 j->second->unregister_cb();
9421 handle_watch_timeout(j->second);
9422 }
9423 }
9424 }
9425
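// Rebuild Watch objects for the watchers recorded in a freshly
// loaded object_info.  They start disconnected and, if the client
// never reconnects, expire relative to last_became_active; watchers
// from blacklisted clients are dropped immediately.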
9426 void PrimaryLogPG::populate_obc_watchers(ObjectContextRef obc)
9427 {
9428 assert(is_active());
9429 assert((recovering.count(obc->obs.oi.soid) ||
9430 !is_missing_object(obc->obs.oi.soid)) ||
9431 (pg_log.get_log().objects.count(obc->obs.oi.soid) && // or this is a revert... see recover_primary()
9432 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->op ==
9433 pg_log_entry_t::LOST_REVERT &&
9434 pg_log.get_log().objects.find(obc->obs.oi.soid)->second->reverting_to ==
9435 obc->obs.oi.version));
9436
9437 dout(10) << "populate_obc_watchers " << obc->obs.oi.soid << dendl;
9438 assert(obc->watchers.empty());
9439 // populate unconnected_watchers
9440 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::iterator p =
9441 obc->obs.oi.watchers.begin();
9442 p != obc->obs.oi.watchers.end();
9443 ++p) {
9444 utime_t expire = info.stats.last_became_active;
9445 expire += p->second.timeout_seconds;
9446 dout(10) << " unconnected watcher " << p->first << " will expire " << expire << dendl;
9447 WatchRef watch(
9448 Watch::makeWatchRef(
9449 this, osd, obc, p->second.timeout_seconds, p->first.first,
9450 p->first.second, p->second.addr));
9451 watch->disconnect();
9452 obc->watchers.insert(
9453 make_pair(
9454 make_pair(p->first.first, p->first.second),
9455 watch));
9456 }
9457   // Look for watchers from blacklisted clients and drop them
9458 check_blacklisted_obc_watchers(obc);
9459 }
9460
9461 void PrimaryLogPG::handle_watch_timeout(WatchRef watch)
9462 {
9463 ObjectContextRef obc = watch->get_obc(); // handle_watch_timeout owns this ref
9464 dout(10) << "handle_watch_timeout obc " << obc << dendl;
9465
9466 if (!is_active()) {
9467 dout(10) << "handle_watch_timeout not active, no-op" << dendl;
9468 return;
9469 }
9470 if (is_degraded_or_backfilling_object(obc->obs.oi.soid)) {
9471 callbacks_for_degraded_object[obc->obs.oi.soid].push_back(
9472 watch->get_delayed_cb()
9473 );
9474 dout(10) << "handle_watch_timeout waiting for degraded on obj "
9475 << obc->obs.oi.soid
9476 << dendl;
9477 return;
9478 }
9479
9480 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
9481 dout(10) << "handle_watch_timeout waiting for scrub on obj "
9482 << obc->obs.oi.soid
9483 << dendl;
9484 scrubber.add_callback(
9485       watch->get_delayed_cb() // re-run handle_watch_timeout() after scrub
9486 );
9487 return;
9488 }
9489
9490 OpContextUPtr ctx = simple_opc_create(obc);
9491 ctx->at_version = get_next_version();
9492
9493 object_info_t& oi = ctx->new_obs.oi;
9494 oi.watchers.erase(make_pair(watch->get_cookie(),
9495 watch->get_entity()));
9496
9497 list<watch_disconnect_t> watch_disconnects = {
9498 watch_disconnect_t(watch->get_cookie(), watch->get_entity(), true)
9499 };
9500 ctx->register_on_success(
9501 [this, obc, watch_disconnects]() {
9502 complete_disconnect_watches(obc, watch_disconnects);
9503 });
9504
9505
9506 PGTransaction *t = ctx->op_t.get();
9507 ctx->log.push_back(pg_log_entry_t(pg_log_entry_t::MODIFY, obc->obs.oi.soid,
9508 ctx->at_version,
9509 oi.version,
9510 0,
9511 osd_reqid_t(), ctx->mtime, 0));
9512
9513 oi.prior_version = obc->obs.oi.version;
9514 oi.version = ctx->at_version;
9515 bufferlist bl;
9516 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
9517 t->setattr(obc->obs.oi.soid, OI_ATTR, bl);
9518
9519 // apply new object state.
9520 ctx->obc->obs = ctx->new_obs;
9521
9522 // no ctx->delta_stats
9523 simple_opc_submit(std::move(ctx));
9524 }
9525
9526 ObjectContextRef PrimaryLogPG::create_object_context(const object_info_t& oi,
9527 SnapSetContext *ssc)
9528 {
9529 ObjectContextRef obc(object_contexts.lookup_or_create(oi.soid));
9530 assert(obc->destructor_callback == NULL);
9531 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9532 obc->obs.oi = oi;
9533 obc->obs.exists = false;
9534 obc->ssc = ssc;
9535 if (ssc)
9536 register_snapset_context(ssc);
9537 dout(10) << "create_object_context " << (void*)obc.get() << " " << oi.soid << " " << dendl;
9538 if (is_active())
9539 populate_obc_watchers(obc);
9540 return obc;
9541 }
9542
9543 ObjectContextRef PrimaryLogPG::get_object_context(
9544 const hobject_t& soid,
9545 bool can_create,
9546 const map<string, bufferlist> *attrs)
9547 {
9548 assert(
9549 attrs || !pg_log.get_missing().is_missing(soid) ||
9550 // or this is a revert... see recover_primary()
9551 (pg_log.get_log().objects.count(soid) &&
9552 pg_log.get_log().objects.find(soid)->second->op ==
9553 pg_log_entry_t::LOST_REVERT));
9554 ObjectContextRef obc = object_contexts.lookup(soid);
9555 osd->logger->inc(l_osd_object_ctx_cache_total);
9556 if (obc) {
9557 osd->logger->inc(l_osd_object_ctx_cache_hit);
9558 dout(10) << __func__ << ": found obc in cache: " << obc
9559 << dendl;
9560 } else {
9561 dout(10) << __func__ << ": obc NOT found in cache: " << soid << dendl;
9562 // check disk
9563 bufferlist bv;
9564 if (attrs) {
9565 assert(attrs->count(OI_ATTR));
9566 bv = attrs->find(OI_ATTR)->second;
9567 } else {
9568 int r = pgbackend->objects_get_attr(soid, OI_ATTR, &bv);
9569 if (r < 0) {
9570 if (!can_create) {
9571 dout(10) << __func__ << ": no obc for soid "
9572 << soid << " and !can_create"
9573 << dendl;
9574 return ObjectContextRef(); // -ENOENT!
9575 }
9576
9577 dout(10) << __func__ << ": no obc for soid "
9578 << soid << " but can_create"
9579 << dendl;
9580 // new object.
9581 object_info_t oi(soid);
9582 SnapSetContext *ssc = get_snapset_context(
9583 soid, true, 0, false);
9584 obc = create_object_context(oi, ssc);
9585 dout(10) << __func__ << ": " << obc << " " << soid
9586 << " " << obc->rwstate
9587 << " oi: " << obc->obs.oi
9588 << " ssc: " << obc->ssc
9589 << " snapset: " << obc->ssc->snapset << dendl;
9590 return obc;
9591 }
9592 }
9593
9594 object_info_t oi;
9595 try {
9596 bufferlist::iterator bliter = bv.begin();
9597 ::decode(oi, bliter);
9598 } catch (...) {
9599 dout(0) << __func__ << ": obc corrupt: " << soid << dendl;
9600 return ObjectContextRef(); // -ENOENT!
9601 }
9602
9603 assert(oi.soid.pool == (int64_t)info.pgid.pool());
9604
9605 obc = object_contexts.lookup_or_create(oi.soid);
9606 obc->destructor_callback = new C_PG_ObjectContext(this, obc.get());
9607 obc->obs.oi = oi;
9608 obc->obs.exists = true;
9609
9610 obc->ssc = get_snapset_context(
9611 soid, true,
9612 soid.has_snapset() ? attrs : 0);
9613
9614 if (is_active())
9615 populate_obc_watchers(obc);
9616
9617 if (pool.info.require_rollback()) {
9618 if (attrs) {
9619 obc->attr_cache = *attrs;
9620 } else {
9621 int r = pgbackend->objects_get_attrs(
9622 soid,
9623 &obc->attr_cache);
9624 assert(r == 0);
9625 }
9626 }
9627
9628 dout(10) << __func__ << ": creating obc from disk: " << obc
9629 << dendl;
9630 }
9631 assert(obc->ssc);
9632 dout(10) << __func__ << ": " << obc << " " << soid
9633 << " " << obc->rwstate
9634 << " oi: " << obc->obs.oi
9635 << " exists: " << (int)obc->obs.exists
9636 << " ssc: " << obc->ssc
9637 << " snapset: " << obc->ssc->snapset << dendl;
9638 return obc;
9639 }
9640
9641 void PrimaryLogPG::context_registry_on_change()
9642 {
9643 pair<hobject_t, ObjectContextRef> i;
9644 while (object_contexts.get_next(i.first, &i)) {
9645 ObjectContextRef obc(i.second);
9646 if (obc) {
9647 for (map<pair<uint64_t, entity_name_t>, WatchRef>::iterator j =
9648 obc->watchers.begin();
9649 j != obc->watchers.end();
9650 obc->watchers.erase(j++)) {
9651 j->second->discard();
9652 }
9653 }
9654 }
9655 }
9656
9657
9658 /*
9659 * If we return an error, and set *pmissing, then promoting that
9660 * object may help.
9661 *
9662 * If we return -EAGAIN, we will always set *pmissing to the missing
9663 * object to wait for.
9664 *
9665 * If we return an error but do not set *pmissing, then we know the
9666 * object does not exist.
9667 */
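// An illustrative caller pattern (a sketch, not a verbatim call
// site; see do_op() for the real one):
//
//   ObjectContextRef obc;
//   hobject_t wait_oid;
//   int r = find_object_context(oid, &obc, can_create,
//                               false /* map_snapid_to_clone */,
//                               &wait_oid);
//   if (r == -EAGAIN) {
//     // queue the op until wait_oid has been recovered, then retry
//   }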
9668 int PrimaryLogPG::find_object_context(const hobject_t& oid,
9669 ObjectContextRef *pobc,
9670 bool can_create,
9671 bool map_snapid_to_clone,
9672 hobject_t *pmissing)
9673 {
9674 FUNCTRACE();
9675 assert(oid.pool == static_cast<int64_t>(info.pgid.pool()));
9676 // want the head?
9677 if (oid.snap == CEPH_NOSNAP) {
9678 ObjectContextRef obc = get_object_context(oid, can_create);
9679 if (!obc) {
9680 if (pmissing)
9681 *pmissing = oid;
9682 return -ENOENT;
9683 }
9684 dout(10) << "find_object_context " << oid
9685 << " @" << oid.snap
9686 << " oi=" << obc->obs.oi
9687 << dendl;
9688 *pobc = obc;
9689
9690 return 0;
9691 }
9692
9693 hobject_t head = oid.get_head();
9694
9695 // want the snapdir?
9696 if (oid.snap == CEPH_SNAPDIR) {
9697 // return head or snapdir, whichever exists.
9698 ObjectContextRef headobc = get_object_context(head, can_create);
9699 ObjectContextRef obc = headobc;
9700 if (!obc || !obc->obs.exists)
9701 obc = get_object_context(oid, can_create);
9702 if (!obc || !obc->obs.exists) {
9703 // if we have neither, we would want to promote the head.
9704 if (pmissing)
9705 *pmissing = head;
9706 if (pobc)
9707 *pobc = headobc; // may be null
9708 return -ENOENT;
9709 }
9710 dout(10) << "find_object_context " << oid
9711 << " @" << oid.snap
9712 << " oi=" << obc->obs.oi
9713 << dendl;
9714 *pobc = obc;
9715
9716 // always populate ssc for SNAPDIR...
9717 if (!obc->ssc)
9718 obc->ssc = get_snapset_context(
9719 oid, true);
9720 return 0;
9721 }
9722
9723 // we want a snap
9724 if (!map_snapid_to_clone && pool.info.is_removed_snap(oid.snap)) {
9725 dout(10) << __func__ << " snap " << oid.snap << " is removed" << dendl;
9726 return -ENOENT;
9727 }
9728
9729 SnapSetContext *ssc = get_snapset_context(oid, can_create);
9730 if (!ssc || !(ssc->exists || can_create)) {
9731 dout(20) << __func__ << " " << oid << " no snapset" << dendl;
9732 if (pmissing)
9733 *pmissing = head; // start by getting the head
9734 if (ssc)
9735 put_snapset_context(ssc);
9736 return -ENOENT;
9737 }
9738
9739 if (map_snapid_to_clone) {
9740 dout(10) << "find_object_context " << oid << " @" << oid.snap
9741 << " snapset " << ssc->snapset
9742 << " map_snapid_to_clone=true" << dendl;
9743 if (oid.snap > ssc->snapset.seq) {
9744 // already must be readable
9745 ObjectContextRef obc = get_object_context(head, false);
9746 dout(10) << "find_object_context " << oid << " @" << oid.snap
9747 << " snapset " << ssc->snapset
9748 << " maps to head" << dendl;
9749 *pobc = obc;
9750 put_snapset_context(ssc);
9751 return (obc && obc->obs.exists) ? 0 : -ENOENT;
9752 } else {
9753 vector<snapid_t>::const_iterator citer = std::find(
9754 ssc->snapset.clones.begin(),
9755 ssc->snapset.clones.end(),
9756 oid.snap);
9757 if (citer == ssc->snapset.clones.end()) {
9758 dout(10) << "find_object_context " << oid << " @" << oid.snap
9759 << " snapset " << ssc->snapset
9760 << " maps to nothing" << dendl;
9761 put_snapset_context(ssc);
9762 return -ENOENT;
9763 }
9764
9765 dout(10) << "find_object_context " << oid << " @" << oid.snap
9766 << " snapset " << ssc->snapset
9767 << " maps to " << oid << dendl;
9768
9769 if (pg_log.get_missing().is_missing(oid)) {
9770 dout(10) << "find_object_context " << oid << " @" << oid.snap
9771 << " snapset " << ssc->snapset
9772 << " " << oid << " is missing" << dendl;
9773 if (pmissing)
9774 *pmissing = oid;
9775 put_snapset_context(ssc);
9776 return -EAGAIN;
9777 }
9778
9779 ObjectContextRef obc = get_object_context(oid, false);
9780 if (!obc || !obc->obs.exists) {
9781 dout(10) << "find_object_context " << oid << " @" << oid.snap
9782 << " snapset " << ssc->snapset
9783 << " " << oid << " is not present" << dendl;
9784 if (pmissing)
9785 *pmissing = oid;
9786 put_snapset_context(ssc);
9787 return -ENOENT;
9788 }
9789 dout(10) << "find_object_context " << oid << " @" << oid.snap
9790 << " snapset " << ssc->snapset
9791 << " " << oid << " HIT" << dendl;
9792 *pobc = obc;
9793 put_snapset_context(ssc);
9794 return 0;
9795 }
9796 ceph_abort(); //unreachable
9797 }
9798
9799 dout(10) << "find_object_context " << oid << " @" << oid.snap
9800 << " snapset " << ssc->snapset << dendl;
9801
9802 // head?
9803 if (oid.snap > ssc->snapset.seq) {
9804 if (ssc->snapset.head_exists) {
9805 ObjectContextRef obc = get_object_context(head, false);
9806 dout(10) << "find_object_context " << head
9807 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
9808 << " -- HIT " << obc->obs
9809 << dendl;
9810 if (!obc->ssc)
9811 obc->ssc = ssc;
9812 else {
9813 assert(ssc == obc->ssc);
9814 put_snapset_context(ssc);
9815 }
9816 *pobc = obc;
9817 return 0;
9818 }
9819 dout(10) << "find_object_context " << head
9820 << " want " << oid.snap << " > snapset seq " << ssc->snapset.seq
9821 << " but head dne -- DNE"
9822 << dendl;
9823 put_snapset_context(ssc);
9824 return -ENOENT;
9825 }
9826
9827 // which clone would it be?
9828 unsigned k = 0;
9829 while (k < ssc->snapset.clones.size() &&
9830 ssc->snapset.clones[k] < oid.snap)
9831 k++;
9832 if (k == ssc->snapset.clones.size()) {
9833 dout(10) << "find_object_context no clones with last >= oid.snap "
9834 << oid.snap << " -- DNE" << dendl;
9835 put_snapset_context(ssc);
9836 return -ENOENT;
9837 }
9838 hobject_t soid(oid.oid, oid.get_key(), ssc->snapset.clones[k], oid.get_hash(),
9839 info.pgid.pool(), oid.get_namespace());
9840
9841 if (pg_log.get_missing().is_missing(soid)) {
9842 dout(20) << "find_object_context " << soid << " missing, try again later"
9843 << dendl;
9844 if (pmissing)
9845 *pmissing = soid;
9846 put_snapset_context(ssc);
9847 return -EAGAIN;
9848 }
9849
9850 ObjectContextRef obc = get_object_context(soid, false);
9851 if (!obc || !obc->obs.exists) {
9852 dout(20) << __func__ << " missing clone " << soid << dendl;
9853 if (pmissing)
9854 *pmissing = soid;
9855 put_snapset_context(ssc);
9856 return -ENOENT;
9857 }
9858
9859 if (!obc->ssc) {
9860 obc->ssc = ssc;
9861 } else {
9862 assert(obc->ssc == ssc);
9863 put_snapset_context(ssc);
9864 }
9865 ssc = 0;
9866
9867 // clone
9868 dout(20) << "find_object_context " << soid
9869 << " snapset " << obc->ssc->snapset
9870 << " legacy_snaps " << obc->obs.oi.legacy_snaps
9871 << dendl;
9872 snapid_t first, last;
9873 if (obc->ssc->snapset.is_legacy()) {
9874 first = obc->obs.oi.legacy_snaps.back();
9875 last = obc->obs.oi.legacy_snaps.front();
9876 } else {
9877 auto p = obc->ssc->snapset.clone_snaps.find(soid.snap);
9878 assert(p != obc->ssc->snapset.clone_snaps.end());
9879 first = p->second.back();
9880 last = p->second.front();
9881 }
9882 if (first <= oid.snap) {
9883 dout(20) << "find_object_context " << soid << " [" << first << "," << last
9884 << "] contains " << oid.snap << " -- HIT " << obc->obs << dendl;
9885 *pobc = obc;
9886 return 0;
9887 } else {
9888 dout(20) << "find_object_context " << soid << " [" << first << "," << last
9889 << "] does not contain " << oid.snap << " -- DNE" << dendl;
9890 return -ENOENT;
9891 }
9892 }
9893
9894 void PrimaryLogPG::object_context_destructor_callback(ObjectContext *obc)
9895 {
9896 if (obc->ssc)
9897 put_snapset_context(obc->ssc);
9898 }
9899
9900 void PrimaryLogPG::add_object_context_to_pg_stat(ObjectContextRef obc, pg_stat_t *pgstat)
9901 {
9902 object_info_t& oi = obc->obs.oi;
9903
9904 dout(10) << "add_object_context_to_pg_stat " << oi.soid << dendl;
9905 object_stat_sum_t stat;
9906
9907 stat.num_bytes += oi.size;
9908
9909 if (oi.soid.snap != CEPH_SNAPDIR)
9910 stat.num_objects++;
9911 if (oi.is_dirty())
9912 stat.num_objects_dirty++;
9913 if (oi.is_whiteout())
9914 stat.num_whiteouts++;
9915 if (oi.is_omap())
9916 stat.num_objects_omap++;
9917 if (oi.is_cache_pinned())
9918 stat.num_objects_pinned++;
9919
9920 if (oi.soid.snap && oi.soid.snap != CEPH_NOSNAP && oi.soid.snap != CEPH_SNAPDIR) {
9921 stat.num_object_clones++;
9922
9923 if (!obc->ssc)
9924 obc->ssc = get_snapset_context(oi.soid, false);
9925 assert(obc->ssc);
9926
9927 // subtract off clone overlap
9928 if (obc->ssc->snapset.clone_overlap.count(oi.soid.snap)) {
9929 interval_set<uint64_t>& o = obc->ssc->snapset.clone_overlap[oi.soid.snap];
9930 for (interval_set<uint64_t>::const_iterator r = o.begin();
9931 r != o.end();
9932 ++r) {
9933 stat.num_bytes -= r.get_len();
9934 }
9935 }
9936 }
9937
9938 // add it in
9939 pgstat->stats.sum.add(stat);
9940 }
9941
9942 void PrimaryLogPG::kick_object_context_blocked(ObjectContextRef obc)
9943 {
9944 const hobject_t& soid = obc->obs.oi.soid;
9945 if (obc->is_blocked()) {
9946 dout(10) << __func__ << " " << soid << " still blocked" << dendl;
9947 return;
9948 }
9949
9950 map<hobject_t, list<OpRequestRef>>::iterator p = waiting_for_blocked_object.find(soid);
9951 if (p != waiting_for_blocked_object.end()) {
9952 list<OpRequestRef>& ls = p->second;
9953 dout(10) << __func__ << " " << soid << " requeuing " << ls.size() << " requests" << dendl;
9954 requeue_ops(ls);
9955 waiting_for_blocked_object.erase(p);
9956 }
9957
9958 map<hobject_t, ObjectContextRef>::iterator i =
9959 objects_blocked_on_snap_promotion.find(obc->obs.oi.soid.get_head());
9960 if (i != objects_blocked_on_snap_promotion.end()) {
9961 assert(i->second == obc);
9962 objects_blocked_on_snap_promotion.erase(i);
9963 }
9964
9965 if (obc->requeue_scrub_on_unblock) {
9966 obc->requeue_scrub_on_unblock = false;
9967 requeue_scrub();
9968 }
9969 }
9970
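// SnapSetContexts are refcounted under snapset_contexts_lock: each
// successful get_snapset_context() must be balanced by a
// put_snapset_context(), or by handing the reference to an obc
// (whose destructor callback puts it).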
9971 SnapSetContext *PrimaryLogPG::get_snapset_context(
9972 const hobject_t& oid,
9973 bool can_create,
9974 const map<string, bufferlist> *attrs,
9975 bool oid_existed)
9976 {
9977 Mutex::Locker l(snapset_contexts_lock);
9978 SnapSetContext *ssc;
9979 map<hobject_t, SnapSetContext*>::iterator p = snapset_contexts.find(
9980 oid.get_snapdir());
9981 if (p != snapset_contexts.end()) {
9982 if (can_create || p->second->exists) {
9983 ssc = p->second;
9984 } else {
9985 return NULL;
9986 }
9987 } else {
9988 bufferlist bv;
9989 if (!attrs) {
9990 int r = -ENOENT;
9991 if (!(oid.is_head() && !oid_existed))
9992 r = pgbackend->objects_get_attr(oid.get_head(), SS_ATTR, &bv);
9993 if (r < 0) {
9994 // try _snapset
9995 if (!(oid.is_snapdir() && !oid_existed))
9996 r = pgbackend->objects_get_attr(oid.get_snapdir(), SS_ATTR, &bv);
9997 if (r < 0 && !can_create)
9998 return NULL;
9999 }
10000 } else {
10001 assert(attrs->count(SS_ATTR));
10002 bv = attrs->find(SS_ATTR)->second;
10003 }
10004 ssc = new SnapSetContext(oid.get_snapdir());
10005 _register_snapset_context(ssc);
10006 if (bv.length()) {
10007 bufferlist::iterator bvp = bv.begin();
10008 ssc->snapset.decode(bvp);
10009 ssc->exists = true;
10010 } else {
10011 ssc->exists = false;
10012 }
10013 }
10014 assert(ssc);
10015 ssc->ref++;
10016 return ssc;
10017 }
10018
10019 void PrimaryLogPG::put_snapset_context(SnapSetContext *ssc)
10020 {
10021 Mutex::Locker l(snapset_contexts_lock);
10022 --ssc->ref;
10023 if (ssc->ref == 0) {
10024 if (ssc->registered)
10025 snapset_contexts.erase(ssc->oid);
10026 delete ssc;
10027 }
10028 }
10029
10030 /** pull - request object from a peer
10031 */
10032
10033 /*
10034 * Return values:
10035 * NONE - didn't pull anything
10036 * YES - pulled what the caller wanted
10037 * OTHER - needed to pull something else first (_head or _snapdir)
10038 */
10039 enum { PULL_NONE, PULL_OTHER, PULL_YES };
10040
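// Recover a single missing object.  For a clone we first make sure
// the head (or snapdir) is recovered, since that is where the
// authoritative snapset lives; in that case PULL_OTHER is returned
// and the caller should come back for the clone later.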
10041 int PrimaryLogPG::recover_missing(
10042 const hobject_t &soid, eversion_t v,
10043 int priority,
10044 PGBackend::RecoveryHandle *h)
10045 {
10046 if (missing_loc.is_unfound(soid)) {
10047 dout(7) << "pull " << soid
10048 << " v " << v
10049 << " but it is unfound" << dendl;
10050 return PULL_NONE;
10051 }
10052
10053   // is this a snapped object? if so, consult the snapset... we may not need the entire object!
10054 ObjectContextRef obc;
10055 ObjectContextRef head_obc;
10056 if (soid.snap && soid.snap < CEPH_NOSNAP) {
10057 // do we have the head and/or snapdir?
10058 hobject_t head = soid.get_head();
10059 if (pg_log.get_missing().is_missing(head)) {
10060 if (recovering.count(head)) {
10061 dout(10) << " missing but already recovering head " << head << dendl;
10062 return PULL_NONE;
10063 } else {
10064 int r = recover_missing(
10065 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10066 h);
10067 if (r != PULL_NONE)
10068 return PULL_OTHER;
10069 return PULL_NONE;
10070 }
10071 }
10072 head = soid.get_snapdir();
10073 if (pg_log.get_missing().is_missing(head)) {
10074 if (recovering.count(head)) {
10075 dout(10) << " missing but already recovering snapdir " << head << dendl;
10076 return PULL_NONE;
10077 } else {
10078 int r = recover_missing(
10079 head, pg_log.get_missing().get_items().find(head)->second.need, priority,
10080 h);
10081 if (r != PULL_NONE)
10082 return PULL_OTHER;
10083 return PULL_NONE;
10084 }
10085 }
10086
10087 // we must have one or the other
10088 head_obc = get_object_context(
10089 soid.get_head(),
10090 false,
10091 0);
10092 if (!head_obc)
10093 head_obc = get_object_context(
10094 soid.get_snapdir(),
10095 false,
10096 0);
10097 assert(head_obc);
10098 }
10099 start_recovery_op(soid);
10100 assert(!recovering.count(soid));
10101 recovering.insert(make_pair(soid, obc));
10102 pgbackend->recover_object(
10103 soid,
10104 v,
10105 head_obc,
10106 obc,
10107 h);
10108 return PULL_YES;
10109 }
10110
10111 void PrimaryLogPG::send_remove_op(
10112 const hobject_t& oid, eversion_t v, pg_shard_t peer)
10113 {
10114 ceph_tid_t tid = osd->get_tid();
10115 osd_reqid_t rid(osd->get_cluster_msgr_name(), 0, tid);
10116
10117 dout(10) << "send_remove_op " << oid << " from osd." << peer
10118 << " tid " << tid << dendl;
10119
10120 MOSDSubOp *subop = new MOSDSubOp(
10121 rid, pg_whoami, spg_t(info.pgid.pgid, peer.shard),
10122 oid, CEPH_OSD_FLAG_ACK,
10123 get_osdmap()->get_epoch(), tid, v);
10124 subop->ops = vector<OSDOp>(1);
10125 subop->ops[0].op.op = CEPH_OSD_OP_DELETE;
10126
10127 osd->send_message_osd_cluster(peer.osd, subop, get_osdmap()->get_epoch());
10128 }
10129
10130
10131 void PrimaryLogPG::finish_degraded_object(const hobject_t& oid)
10132 {
10133 dout(10) << "finish_degraded_object " << oid << dendl;
10134 ObjectContextRef obc(object_contexts.lookup(oid));
10135 if (callbacks_for_degraded_object.count(oid)) {
10136 list<Context*> contexts;
10137 contexts.swap(callbacks_for_degraded_object[oid]);
10138 callbacks_for_degraded_object.erase(oid);
10139 for (list<Context*>::iterator i = contexts.begin();
10140 i != contexts.end();
10141 ++i) {
10142 (*i)->complete(0);
10143 }
10144 }
10145 map<hobject_t, snapid_t>::iterator i = objects_blocked_on_degraded_snap.find(
10146 oid.get_head());
10147 if (i != objects_blocked_on_degraded_snap.end() &&
10148 i->second == oid.snap)
10149 objects_blocked_on_degraded_snap.erase(i);
10150 }
10151
10152 void PrimaryLogPG::_committed_pushed_object(
10153 epoch_t epoch, eversion_t last_complete)
10154 {
10155 lock();
10156 if (!pg_has_reset_since(epoch)) {
10157 dout(10) << "_committed_pushed_object last_complete " << last_complete << " now ondisk" << dendl;
10158 last_complete_ondisk = last_complete;
10159
10160 if (last_complete_ondisk == info.last_update) {
10161 if (!is_primary()) {
10162 // Either we are a replica or backfill target.
10163 // we are fully up to date. tell the primary!
10164 osd->send_message_osd_cluster(
10165 get_primary().osd,
10166 new MOSDPGTrim(
10167 get_osdmap()->get_epoch(),
10168 spg_t(info.pgid.pgid, get_primary().shard),
10169 last_complete_ondisk),
10170 get_osdmap()->get_epoch());
10171 } else {
10172 calc_min_last_complete_ondisk();
10173 }
10174 }
10175
10176 } else {
10177 dout(10) << "_committed_pushed_object pg has changed, not touching last_complete_ondisk" << dendl;
10178 }
10179
10180 unlock();
10181 }
10182
10183 void PrimaryLogPG::_applied_recovered_object(ObjectContextRef obc)
10184 {
10185 lock();
10186 dout(10) << "_applied_recovered_object " << *obc << dendl;
10187
10188 assert(active_pushes >= 1);
10189 --active_pushes;
10190
10191 // requeue an active chunky scrub waiting on recovery ops
10192 if (!deleting && active_pushes == 0
10193 && scrubber.is_chunky_scrub_active()) {
10194 if (ops_blocked_by_scrub()) {
10195 requeue_scrub(true);
10196 } else {
10197 requeue_scrub(false);
10198 }
10199 }
10200
10201 unlock();
10202 }
10203
10204 void PrimaryLogPG::_applied_recovered_object_replica()
10205 {
10206 lock();
10207 dout(10) << "_applied_recovered_object_replica" << dendl;
10208
10209 assert(active_pushes >= 1);
10210 --active_pushes;
10211
10212 // requeue an active chunky scrub waiting on recovery ops
10213 if (!deleting && active_pushes == 0 &&
10214 scrubber.active_rep_scrub && static_cast<const MOSDRepScrub*>(
10215 scrubber.active_rep_scrub->get_req())->chunky) {
10216 osd->enqueue_back(
10217 info.pgid,
10218 PGQueueable(scrubber.active_rep_scrub, get_osdmap()->get_epoch()));
10219 scrubber.active_rep_scrub = OpRequestRef();
10220 }
10221
10222 unlock();
10223 }
10224
10225 void PrimaryLogPG::recover_got(hobject_t oid, eversion_t v)
10226 {
10227 dout(10) << "got missing " << oid << " v " << v << dendl;
10228 pg_log.recover_got(oid, v, info);
10229 if (pg_log.get_log().complete_to != pg_log.get_log().log.end()) {
10230 dout(10) << "last_complete now " << info.last_complete
10231 << " log.complete_to " << pg_log.get_log().complete_to->version
10232 << dendl;
10233 } else {
10234 dout(10) << "last_complete now " << info.last_complete
10235 << " log.complete_to at end" << dendl;
10236     // below is not true in the repair case.
10237 //assert(missing.num_missing() == 0); // otherwise, complete_to was wrong.
10238 assert(info.last_complete == info.last_update);
10239 }
10240 }
10241
10242 void PrimaryLogPG::failed_push(const list<pg_shard_t> &from, const hobject_t &soid)
10243 {
10244 dout(20) << __func__ << ": " << soid << dendl;
10245 assert(recovering.count(soid));
10246 auto obc = recovering[soid];
10247 if (obc) {
10248 list<OpRequestRef> blocked_ops;
10249 obc->drop_recovery_read(&blocked_ops);
10250 requeue_ops(blocked_ops);
10251 }
10252 recovering.erase(soid);
10253 for (auto&& i : from)
10254 missing_loc.remove_location(soid, i);
10255 dout(0) << __func__ << " " << soid << " from shard " << from
10256 << ", reps on " << missing_loc.get_locations(soid)
10257 << " unfound? " << missing_loc.is_unfound(soid) << dendl;
10258   finish_recovery_op(soid);  // close out this attempt
10259 }
10260
10261 void PrimaryLogPG::sub_op_remove(OpRequestRef op)
10262 {
10263 const MOSDSubOp *m = static_cast<const MOSDSubOp*>(op->get_req());
10264 assert(m->get_type() == MSG_OSD_SUBOP);
10265 dout(7) << "sub_op_remove " << m->poid << dendl;
10266
10267 op->mark_started();
10268
10269 ObjectStore::Transaction t;
10270 remove_snap_mapped_object(t, m->poid);
10271 int r = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
10272 assert(r == 0);
10273 }
10274
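// Return the newest version of oid that some shard in actingbackfill
// still has (the "have" side of its missing entry); used by
// LOST_REVERT below to choose a revert target.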
10275 eversion_t PrimaryLogPG::pick_newest_available(const hobject_t& oid)
10276 {
10277 eversion_t v;
10278 pg_missing_item pmi;
10279 bool is_missing = pg_log.get_missing().is_missing(oid, &pmi);
10280 assert(is_missing);
10281 v = pmi.have;
10282 dout(10) << "pick_newest_available " << oid << " " << v << " on osd." << osd->whoami << " (local)" << dendl;
10283
10284 assert(!actingbackfill.empty());
10285 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
10286 i != actingbackfill.end();
10287 ++i) {
10288 if (*i == get_primary()) continue;
10289 pg_shard_t peer = *i;
10290 if (!peer_missing[peer].is_missing(oid)) {
10291 assert(is_backfill_targets(peer));
10292 continue;
10293 }
10294 eversion_t h = peer_missing[peer].get_items().at(oid).have;
10295 dout(10) << "pick_newest_available " << oid << " " << h << " on osd." << peer << dendl;
10296 if (h > v)
10297 v = h;
10298 }
10299
10300 dout(10) << "pick_newest_available " << oid << " " << v << " (newest)" << dendl;
10301 return v;
10302 }
10303
10304 void PrimaryLogPG::do_update_log_missing(OpRequestRef &op)
10305 {
10306 const MOSDPGUpdateLogMissing *m = static_cast<const MOSDPGUpdateLogMissing*>(
10307 op->get_req());
10308 assert(m->get_type() == MSG_OSD_PG_UPDATE_LOG_MISSING);
10309 ObjectStore::Transaction t;
10310 append_log_entries_update_missing(m->entries, t);
10311
10312 Context *complete = new FunctionContext(
10313 [=](int) {
10314 const MOSDPGUpdateLogMissing *msg = static_cast<const MOSDPGUpdateLogMissing*>(
10315 op->get_req());
10316 lock();
10317 if (!pg_has_reset_since(msg->get_epoch())) {
10318 MOSDPGUpdateLogMissingReply *reply =
10319 new MOSDPGUpdateLogMissingReply(
10320 spg_t(info.pgid.pgid, primary_shard().shard),
10321 pg_whoami.shard,
10322 msg->get_epoch(),
10323 msg->min_epoch,
10324 msg->get_tid());
10325 reply->set_priority(CEPH_MSG_PRIO_HIGH);
10326 msg->get_connection()->send_message(reply);
10327 }
10328 unlock();
10329 });
10330
10331 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_KRAKEN) {
10332 t.register_on_commit(complete);
10333 } else {
10334 /* Hack to work around the fact that ReplicatedBackend sends
10335 * ack+commit if commit happens first
10336 *
10337 * This behavior is no longer necessary, but we preserve it so old
10338 * primaries can keep their repops in order */
10339 if (pool.info.ec_pool()) {
10340 t.register_on_complete(complete);
10341 } else {
10342 t.register_on_commit(complete);
10343 }
10344 }
10345 t.register_on_applied(
10346 new C_OSD_OnApplied{this, get_osdmap()->get_epoch(), info.last_update});
10347 int tr = osd->store->queue_transaction(
10348 osr.get(),
10349 std::move(t),
10350 nullptr);
10351 assert(tr == 0);
10352 }
10353
10354 void PrimaryLogPG::do_update_log_missing_reply(OpRequestRef &op)
10355 {
10356 const MOSDPGUpdateLogMissingReply *m =
10357 static_cast<const MOSDPGUpdateLogMissingReply*>(
10358 op->get_req());
10359 dout(20) << __func__ << " got reply from "
10360 << m->get_from() << dendl;
10361
10362 auto it = log_entry_update_waiting_on.find(m->get_tid());
10363 if (it != log_entry_update_waiting_on.end()) {
10364 if (it->second.waiting_on.count(m->get_from())) {
10365 it->second.waiting_on.erase(m->get_from());
10366 } else {
10367 osd->clog->error()
10368 << info.pgid << " got reply "
10369 << *m << " from shard we are not waiting for "
10370 << m->get_from();
10371 }
10372
10373 if (it->second.waiting_on.empty()) {
10374 repop_all_committed(it->second.repop.get());
10375 log_entry_update_waiting_on.erase(it);
10376 }
10377 } else {
10378 osd->clog->error()
10379 << info.pgid << " got reply "
10380 << *m << " on unknown tid " << m->get_tid();
10381 }
10382 }
10383
10384 /* Mark all unfound objects as lost.
10385 */
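// Depending on `what`, each unfound object is either reverted to the
// newest version some shard still has (LOST_REVERT, via
// pick_newest_available()) or logged as deleted (LOST_DELETE);
// LOST_MARK is not implemented.  The resulting entries go out through
// submit_log_entries(), and waiters are requeued once they commit.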
10386 void PrimaryLogPG::mark_all_unfound_lost(
10387 int what,
10388 ConnectionRef con,
10389 ceph_tid_t tid)
10390 {
10391 dout(3) << __func__ << " " << pg_log_entry_t::get_op_name(what) << dendl;
10392
10393 dout(30) << __func__ << ": log before:\n";
10394 pg_log.get_log().print(*_dout);
10395 *_dout << dendl;
10396
10397 mempool::osd_pglog::list<pg_log_entry_t> log_entries;
10398
10399 utime_t mtime = ceph_clock_now();
10400 map<hobject_t, pg_missing_item>::const_iterator m =
10401 missing_loc.get_needs_recovery().begin();
10402 map<hobject_t, pg_missing_item>::const_iterator mend =
10403 missing_loc.get_needs_recovery().end();
10404
10405 ObcLockManager manager;
10406 eversion_t v = get_next_version();
10407 v.epoch = get_osdmap()->get_epoch();
10408 uint64_t num_unfound = missing_loc.num_unfound();
10409 while (m != mend) {
10410 const hobject_t &oid(m->first);
10411 if (!missing_loc.is_unfound(oid)) {
10412 // We only care about unfound objects
10413 ++m;
10414 continue;
10415 }
10416
10417 ObjectContextRef obc;
10418 eversion_t prev;
10419
10420 switch (what) {
10421 case pg_log_entry_t::LOST_MARK:
10422 assert(0 == "actually, not implemented yet!");
10423 break;
10424
10425 case pg_log_entry_t::LOST_REVERT:
10426 prev = pick_newest_available(oid);
10427 if (prev > eversion_t()) {
10428 // log it
10429 pg_log_entry_t e(
10430 pg_log_entry_t::LOST_REVERT, oid, v,
10431 m->second.need, 0, osd_reqid_t(), mtime, 0);
10432 e.reverting_to = prev;
10433 e.mark_unrollbackable();
10434 log_entries.push_back(e);
10435 dout(10) << e << dendl;
10436
10437 // we are now missing the new version; recovery code will sort it out.
10438 ++v.version;
10439 ++m;
10440 break;
10441 }
10442
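      // fall through: with no previous version available to revert
      // to, the revert degenerates into a delete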
10443 case pg_log_entry_t::LOST_DELETE:
10444 {
10445 pg_log_entry_t e(pg_log_entry_t::LOST_DELETE, oid, v, m->second.need,
10446 0, osd_reqid_t(), mtime, 0);
10447 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_JEWEL) {
10448 if (pool.info.require_rollback()) {
10449 e.mod_desc.try_rmobject(v.version);
10450 } else {
10451 e.mark_unrollbackable();
10452 }
10453 } // otherwise, just do what we used to do
10454 dout(10) << e << dendl;
10455 log_entries.push_back(e);
10456
10457 ++v.version;
10458 ++m;
10459 }
10460 break;
10461
10462 default:
10463 ceph_abort();
10464 }
10465 }
10466
10467 info.stats.stats_invalid = true;
10468
10469 submit_log_entries(
10470 log_entries,
10471 std::move(manager),
10472 boost::optional<std::function<void(void)> >(
10473 [=]() {
10474 requeue_ops(waiting_for_all_missing);
10475 waiting_for_all_missing.clear();
10476 for (auto& p : waiting_for_unreadable_object) {
10477 release_backoffs(p.first);
10478 }
10479 requeue_object_waiters(waiting_for_unreadable_object);
10480 queue_recovery();
10481
10482 stringstream ss;
10483 ss << "pg has " << num_unfound
10484 << " objects unfound and apparently lost marking";
10485 string rs = ss.str();
10486 dout(0) << "do_command r=" << 0 << " " << rs << dendl;
10487 osd->clog->info() << rs;
10488 if (con) {
10489 MCommandReply *reply = new MCommandReply(0, rs);
10490 reply->set_tid(tid);
10491 con->send_message(reply);
10492 }
10493 }),
10494 OpRequestRef());
10495 }
10496
10497 void PrimaryLogPG::_split_into(pg_t child_pgid, PG *child, unsigned split_bits)
10498 {
10499 assert(repop_queue.empty());
10500 }
10501
10502 /*
10503 * pg status change notification
10504 */
10505
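// Abort every in-flight repop (e.g. across an interval change),
// optionally requeueing the originating client ops, and any dup
// waiters, in their original submission order.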
10506 void PrimaryLogPG::apply_and_flush_repops(bool requeue)
10507 {
10508 list<OpRequestRef> rq;
10509
10510 // apply all repops
10511 while (!repop_queue.empty()) {
10512 RepGather *repop = repop_queue.front();
10513 repop_queue.pop_front();
10514 dout(10) << " canceling repop tid " << repop->rep_tid << dendl;
10515 repop->rep_aborted = true;
10516 repop->on_applied.clear();
10517 repop->on_committed.clear();
10518 repop->on_success.clear();
10519
10520 if (requeue) {
10521 if (repop->op) {
10522 dout(10) << " requeuing " << *repop->op->get_req() << dendl;
10523 rq.push_back(repop->op);
10524 repop->op = OpRequestRef();
10525 }
10526
10527 // also requeue any dups, interleaved into position
10528 map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator p =
10529 waiting_for_ondisk.find(repop->v);
10530 if (p != waiting_for_ondisk.end()) {
10531 dout(10) << " also requeuing ondisk waiters " << p->second << dendl;
10532 for (list<pair<OpRequestRef, version_t> >::iterator i =
10533 p->second.begin();
10534 i != p->second.end();
10535 ++i) {
10536 rq.push_back(i->first);
10537 }
10538 waiting_for_ondisk.erase(p);
10539 }
10540 }
10541
10542 remove_repop(repop);
10543 }
10544
10545 assert(repop_queue.empty());
10546
10547 if (requeue) {
10548 requeue_ops(rq);
10549 if (!waiting_for_ondisk.empty()) {
10550 for (map<eversion_t, list<pair<OpRequestRef, version_t> > >::iterator i =
10551 waiting_for_ondisk.begin();
10552 i != waiting_for_ondisk.end();
10553 ++i) {
10554 for (list<pair<OpRequestRef, version_t> >::iterator j =
10555 i->second.begin();
10556 j != i->second.end();
10557 ++j) {
10558 derr << __func__ << ": op " << *(j->first->get_req()) << " waiting on "
10559 << i->first << dendl;
10560 }
10561 }
10562 assert(waiting_for_ondisk.empty());
10563 }
10564 }
10565
10566 waiting_for_ondisk.clear();
10567 }
10568
10569 void PrimaryLogPG::on_flushed()
10570 {
10571 assert(flushes_in_progress > 0);
10572 flushes_in_progress--;
10573 if (flushes_in_progress == 0) {
10574 requeue_ops(waiting_for_peered);
10575 }
10576 if (!is_peered() || !is_primary()) {
10577 pair<hobject_t, ObjectContextRef> i;
10578 while (object_contexts.get_next(i.first, &i)) {
10579 derr << "on_flushed: object " << i.first << " obc still alive" << dendl;
10580 }
10581 assert(object_contexts.empty());
10582 }
10583 pgbackend->on_flushed();
10584 }
10585
10586 void PrimaryLogPG::on_removal(ObjectStore::Transaction *t)
10587 {
10588 dout(10) << "on_removal" << dendl;
10589
10590 // adjust info to backfill
10591 info.set_last_backfill(hobject_t());
10592 pg_log.reset_backfill();
10593 dirty_info = true;
10594
10595
10596 // clear log
10597 PGLogEntryHandler rollbacker{this, t};
10598 pg_log.roll_forward(&rollbacker);
10599
10600 write_if_dirty(*t);
10601
10602 if (!deleting)
10603 on_shutdown();
10604 }
10605
10606 void PrimaryLogPG::on_shutdown()
10607 {
10608 dout(10) << "on_shutdown" << dendl;
10609
10610 // remove from queues
10611 osd->pg_stat_queue_dequeue(this);
10612 osd->peering_wq.dequeue(this);
10613
10614 // handles queue races
10615 deleting = true;
10616
10617 clear_scrub_reserved();
10618 scrub_clear_state();
10619
10620 unreg_next_scrub();
10621 cancel_copy_ops(false);
10622 cancel_flush_ops(false);
10623 cancel_proxy_ops(false);
10624 apply_and_flush_repops(false);
10625 cancel_log_updates();
10626   // we must remove PGRefs, so do this prior to release_backoffs() callers
10627 clear_backoffs();
10628 // clean up snap trim references
10629 snap_trimmer_machine.process_event(Reset());
10630
10631 pgbackend->on_change();
10632
10633 context_registry_on_change();
10634 object_contexts.clear();
10635
10636 osd->remote_reserver.cancel_reservation(info.pgid);
10637 osd->local_reserver.cancel_reservation(info.pgid);
10638
10639 clear_primary_state();
10640 cancel_recovery();
10641 }
10642
10643 void PrimaryLogPG::on_activate()
10644 {
10645 // all clean?
10646 if (needs_recovery()) {
10647 dout(10) << "activate not all replicas are up-to-date, queueing recovery" << dendl;
10648 queue_peering_event(
10649 CephPeeringEvtRef(
10650 std::make_shared<CephPeeringEvt>(
10651 get_osdmap()->get_epoch(),
10652 get_osdmap()->get_epoch(),
10653 DoRecovery())));
10654 } else if (needs_backfill()) {
10655 dout(10) << "activate queueing backfill" << dendl;
10656 queue_peering_event(
10657 CephPeeringEvtRef(
10658 std::make_shared<CephPeeringEvt>(
10659 get_osdmap()->get_epoch(),
10660 get_osdmap()->get_epoch(),
10661 RequestBackfill())));
10662 } else {
10663 dout(10) << "activate all replicas clean, no recovery" << dendl;
10664 queue_peering_event(
10665 CephPeeringEvtRef(
10666 std::make_shared<CephPeeringEvt>(
10667 get_osdmap()->get_epoch(),
10668 get_osdmap()->get_epoch(),
10669 AllReplicasRecovered())));
10670 }
10671
10672 publish_stats_to_osd();
10673
10674 if (!backfill_targets.empty()) {
10675 last_backfill_started = earliest_backfill();
10676 new_backfill = true;
10677 assert(!last_backfill_started.is_max());
10678 dout(5) << "on activate: bft=" << backfill_targets
10679 << " from " << last_backfill_started << dendl;
10680 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
10681 i != backfill_targets.end();
10682 ++i) {
10683 dout(5) << "target shard " << *i
10684 << " from " << peer_info[*i].last_backfill
10685 << dendl;
10686 }
10687 }
10688
10689 hit_set_setup();
10690 agent_setup();
10691 }
10692
10693 void PrimaryLogPG::_on_new_interval()
10694 {
10695 }
10696
10697 void PrimaryLogPG::on_change(ObjectStore::Transaction *t)
10698 {
10699 dout(10) << "on_change" << dendl;
10700
10701 if (hit_set && hit_set->insert_count() == 0) {
10702 dout(20) << " discarding empty hit_set" << dendl;
10703 hit_set_clear();
10704 }
10705
10706 if (recovery_queued) {
10707 recovery_queued = false;
10708 osd->clear_queued_recovery(this);
10709 }
10710
10711 // requeue everything in the reverse order they should be
10712 // reexamined.
10713 requeue_ops(waiting_for_peered);
10714 requeue_ops(waiting_for_active);
10715
10716 clear_scrub_reserved();
10717
10718 cancel_copy_ops(is_primary());
10719 cancel_flush_ops(is_primary());
10720 cancel_proxy_ops(is_primary());
10721
10722 // requeue object waiters
10723 for (auto& p : waiting_for_unreadable_object) {
10724 release_backoffs(p.first);
10725 }
10726 if (is_primary()) {
10727 requeue_object_waiters(waiting_for_unreadable_object);
10728 } else {
10729 waiting_for_unreadable_object.clear();
10730 }
10731 for (map<hobject_t,list<OpRequestRef>>::iterator p = waiting_for_degraded_object.begin();
10732 p != waiting_for_degraded_object.end();
10733 waiting_for_degraded_object.erase(p++)) {
10734 release_backoffs(p->first);
10735 if (is_primary())
10736 requeue_ops(p->second);
10737 else
10738 p->second.clear();
10739 finish_degraded_object(p->first);
10740 }
10741
10742 // requeues waiting_for_scrub
10743 scrub_clear_state();
10744
10745 for (auto p = waiting_for_blocked_object.begin();
10746 p != waiting_for_blocked_object.end();
10747 waiting_for_blocked_object.erase(p++)) {
10748 if (is_primary())
10749 requeue_ops(p->second);
10750 else
10751 p->second.clear();
10752 }
10753 for (auto i = callbacks_for_degraded_object.begin();
10754 i != callbacks_for_degraded_object.end();
10755 ) {
10756 finish_degraded_object((i++)->first);
10757 }
10758 assert(callbacks_for_degraded_object.empty());
10759
10760 if (is_primary()) {
10761 requeue_ops(waiting_for_cache_not_full);
10762 requeue_ops(waiting_for_all_missing);
10763 } else {
10764 waiting_for_cache_not_full.clear();
10765 waiting_for_all_missing.clear();
10766 }
10767 objects_blocked_on_cache_full.clear();
10768
10769 for (list<pair<OpRequestRef, OpContext*> >::iterator i =
10770 in_progress_async_reads.begin();
10771 i != in_progress_async_reads.end();
10772 in_progress_async_reads.erase(i++)) {
10773 close_op_ctx(i->second);
10774 if (is_primary())
10775 requeue_op(i->first);
10776 }
10777
10778 // this will requeue ops we were working on but didn't finish, and
10779 // any dups
10780 apply_and_flush_repops(is_primary());
10781 cancel_log_updates();
10782
10783 // do this *after* apply_and_flush_repops so that we catch any newly
10784 // registered watches.
10785 context_registry_on_change();
10786
10787 pgbackend->on_change_cleanup(t);
10788 scrubber.cleanup_store(t);
10789 pgbackend->on_change();
10790
10791 // clear snap_trimmer state
10792 snap_trimmer_machine.process_event(Reset());
10793
10794 debug_op_order.clear();
10795 unstable_stats.clear();
10796
10797 // we don't want to cache object_contexts through the interval change
10798 // NOTE: we actually assert that all currently live references are dead
10799 // by the time the flush for the next interval completes.
10800 object_contexts.clear();
10801
10802 // should have been cleared above by finishing all of the degraded objects
10803 assert(objects_blocked_on_degraded_snap.empty());
10804 }
10805
10806 void PrimaryLogPG::on_role_change()
10807 {
10808 dout(10) << "on_role_change" << dendl;
10809 if (get_role() != 0 && hit_set) {
10810 dout(10) << " clearing hit set" << dendl;
10811 hit_set_clear();
10812 }
10813 }
10814
10815 void PrimaryLogPG::on_pool_change()
10816 {
10817 dout(10) << __func__ << dendl;
10818 // requeue cache full waiters just in case the cache_mode is
10819 // changing away from writeback mode. note that if we are not
10820 // active the normal requeuing machinery is sufficient (and properly
10821 // ordered).
10822 if (is_active() &&
10823 pool.info.cache_mode != pg_pool_t::CACHEMODE_WRITEBACK &&
10824 !waiting_for_cache_not_full.empty()) {
10825 dout(10) << __func__ << " requeuing full waiters (not in writeback) "
10826 << dendl;
10827 requeue_ops(waiting_for_cache_not_full);
10828 objects_blocked_on_cache_full.clear();
10829 }
10830 hit_set_setup();
10831 agent_setup();
10832 }
10833
10834 // clear state. called on recovery completion AND cancellation.
10835 void PrimaryLogPG::_clear_recovery_state()
10836 {
10837 missing_loc.clear();
10838 #ifdef DEBUG_RECOVERY_OIDS
10839 recovering_oids.clear();
10840 #endif
10841 last_backfill_started = hobject_t();
10842 set<hobject_t>::iterator i = backfills_in_flight.begin();
10843 while (i != backfills_in_flight.end()) {
10844 assert(recovering.count(*i));
10845 backfills_in_flight.erase(i++);
10846 }
10847
10848 list<OpRequestRef> blocked_ops;
10849 for (map<hobject_t, ObjectContextRef>::iterator i = recovering.begin();
10850 i != recovering.end();
10851 recovering.erase(i++)) {
10852 if (i->second) {
10853 i->second->drop_recovery_read(&blocked_ops);
10854 requeue_ops(blocked_ops);
10855 }
10856 }
10857 assert(backfills_in_flight.empty());
10858 pending_backfill_updates.clear();
10859 assert(recovering.empty());
10860 pgbackend->clear_recovery_state();
10861 }
10862
10863 void PrimaryLogPG::cancel_pull(const hobject_t &soid)
10864 {
10865 dout(20) << __func__ << ": " << soid << dendl;
10866 assert(recovering.count(soid));
10867 ObjectContextRef obc = recovering[soid];
10868 if (obc) {
10869 list<OpRequestRef> blocked_ops;
10870 obc->drop_recovery_read(&blocked_ops);
10871 requeue_ops(blocked_ops);
10872 }
10873 recovering.erase(soid);
10874 finish_recovery_op(soid);
10875 release_backoffs(soid);
10876 if (waiting_for_degraded_object.count(soid)) {
10877 dout(20) << " kicking degraded waiters on " << soid << dendl;
10878 requeue_ops(waiting_for_degraded_object[soid]);
10879 waiting_for_degraded_object.erase(soid);
10880 }
10881 if (waiting_for_unreadable_object.count(soid)) {
10882 dout(20) << " kicking unreadable waiters on " << soid << dendl;
10883 requeue_ops(waiting_for_unreadable_object[soid]);
10884 waiting_for_unreadable_object.erase(soid);
10885 }
10886 if (is_missing_object(soid))
10887 pg_log.set_last_requested(0); // get recover_primary to start over
10888 finish_degraded_object(soid);
10889 }
10890
10891 void PrimaryLogPG::check_recovery_sources(const OSDMapRef& osdmap)
10892 {
10893 /*
10894 * check that any peers we are planning to pull (or are currently
10895 * pulling) objects from are dealt with.
10896 */
10897 missing_loc.check_recovery_sources(osdmap);
10898 pgbackend->check_recovery_sources(osdmap);
10899
10900 for (set<pg_shard_t>::iterator i = peer_log_requested.begin();
10901 i != peer_log_requested.end();
10902 ) {
10903 if (!osdmap->is_up(i->osd)) {
10904 dout(10) << "peer_log_requested removing " << *i << dendl;
10905 peer_log_requested.erase(i++);
10906 } else {
10907 ++i;
10908 }
10909 }
10910
10911 for (set<pg_shard_t>::iterator i = peer_missing_requested.begin();
10912 i != peer_missing_requested.end();
10913 ) {
10914 if (!osdmap->is_up(i->osd)) {
10915 dout(10) << "peer_missing_requested removing " << *i << dendl;
10916 peer_missing_requested.erase(i++);
10917 } else {
10918 ++i;
10919 }
10920 }
10921 }
10922
10923 void PG::MissingLoc::check_recovery_sources(const OSDMapRef& osdmap)
10924 {
10925 set<pg_shard_t> now_down;
10926 for (set<pg_shard_t>::iterator p = missing_loc_sources.begin();
10927 p != missing_loc_sources.end();
10928 ) {
10929 if (osdmap->is_up(p->osd)) {
10930 ++p;
10931 continue;
10932 }
10933 ldout(pg->cct, 10) << "check_recovery_sources source osd." << *p << " now down" << dendl;
10934 now_down.insert(*p);
10935 missing_loc_sources.erase(p++);
10936 }
10937
10938 if (now_down.empty()) {
10939 ldout(pg->cct, 10) << "check_recovery_sources no source osds (" << missing_loc_sources << ") went down" << dendl;
10940 } else {
10941 ldout(pg->cct, 10) << "check_recovery_sources sources osds " << now_down << " now down, remaining sources are "
10942 << missing_loc_sources << dendl;
10943
10944 // filter missing_loc
10945 map<hobject_t, set<pg_shard_t>>::iterator p = missing_loc.begin();
10946 while (p != missing_loc.end()) {
10947 set<pg_shard_t>::iterator q = p->second.begin();
10948 while (q != p->second.end())
10949 if (now_down.count(*q)) {
10950 p->second.erase(q++);
10951 } else {
10952 ++q;
10953 }
10954 if (p->second.empty())
10955 missing_loc.erase(p++);
10956 else
10957 ++p;
10958 }
10959 }
10960 }
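
// A minimal stand-alone sketch of the two-level erase-while-iterating
// idiom used above: prune now-down shards from each object's location
// set, then drop any object left with no sources.  Plain std types stand
// in for hobject_t and pg_shard_t; this helper is hypothetical,
// illustration only.
#include <map>
#include <set>
#include <string>

static void prune_locations(std::map<std::string, std::set<int>> &locs,
                            const std::set<int> &down)
{
  for (auto p = locs.begin(); p != locs.end(); ) {
    for (auto q = p->second.begin(); q != p->second.end(); ) {
      if (down.count(*q))
        q = p->second.erase(q);   // drop the now-down source
      else
        ++q;
    }
    if (p->second.empty())
      p = locs.erase(p);          // object has no remaining sources
    else
      ++p;
  }
}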
10961
10962
10963 bool PrimaryLogPG::start_recovery_ops(
10964 uint64_t max,
10965 ThreadPool::TPHandle &handle,
10966 uint64_t *ops_started)
10967 {
10968 uint64_t& started = *ops_started;
10969 started = 0;
10970 bool work_in_progress = false;
10971 assert(is_primary());
10972
10973 if (!state_test(PG_STATE_RECOVERING) &&
10974 !state_test(PG_STATE_BACKFILL)) {
10975 /* TODO: I think this case is broken and will make do_recovery()
10976 * unhappy since we're returning false */
10977 dout(10) << "recovery raced and were queued twice, ignoring!" << dendl;
10978 return false;
10979 }
10980
10981 const pg_missing_t &missing = pg_log.get_missing();
10982
10983 unsigned int num_missing = missing.num_missing();
10984 uint64_t num_unfound = get_num_unfound();
10985
10986 if (num_missing == 0) {
10987 info.last_complete = info.last_update;
10988 }
10989
10990 if (num_missing == num_unfound) {
10991 // All of the missing objects we have are unfound.
10992 // Recover the replicas.
10993 started = recover_replicas(max, handle);
10994 }
10995 if (!started) {
10996 // We still have missing objects that we should grab from replicas.
10997 started += recover_primary(max, handle);
10998 }
10999 if (!started && num_unfound != get_num_unfound()) {
11000 // second chance to recover replicas
11001 started = recover_replicas(max, handle);
11002 }
11003
11004 if (started)
11005 work_in_progress = true;
11006
11007 bool deferred_backfill = false;
11008 if (recovering.empty() &&
11009 state_test(PG_STATE_BACKFILL) &&
11010 !backfill_targets.empty() && started < max &&
11011 missing.num_missing() == 0 &&
11012 waiting_on_backfill.empty()) {
11013 if (get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL)) {
11014 dout(10) << "deferring backfill due to NOBACKFILL" << dendl;
11015 deferred_backfill = true;
11016 } else if (get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) &&
11017 !is_degraded()) {
11018 dout(10) << "deferring backfill due to NOREBALANCE" << dendl;
11019 deferred_backfill = true;
11020 } else if (!backfill_reserved) {
11021 dout(10) << "deferring backfill due to !backfill_reserved" << dendl;
11022 if (!backfill_reserving) {
11023 dout(10) << "queueing RequestBackfill" << dendl;
11024 backfill_reserving = true;
11025 queue_peering_event(
11026 CephPeeringEvtRef(
11027 std::make_shared<CephPeeringEvt>(
11028 get_osdmap()->get_epoch(),
11029 get_osdmap()->get_epoch(),
11030 RequestBackfill())));
11031 }
11032 deferred_backfill = true;
11033 } else {
11034 started += recover_backfill(max - started, handle, &work_in_progress);
11035 }
11036 }
11037
11038 dout(10) << " started " << started << dendl;
11039 osd->logger->inc(l_osd_rop, started);
11040
11041 if (!recovering.empty() ||
11042 work_in_progress || recovery_ops_active > 0 || deferred_backfill)
11043 return work_in_progress;
11044
11045 assert(recovering.empty());
11046 assert(recovery_ops_active == 0);
11047
11048 dout(10) << __func__ << " needs_recovery: "
11049 << missing_loc.get_needs_recovery()
11050 << dendl;
11051 dout(10) << __func__ << " missing_loc: "
11052 << missing_loc.get_missing_locs()
11053 << dendl;
11054 int unfound = get_num_unfound();
11055 if (unfound) {
11056 dout(10) << " still have " << unfound << " unfound" << dendl;
11057 return work_in_progress;
11058 }
11059
11060 if (missing.num_missing() > 0) {
11061 // this shouldn't happen!
11062 osd->clog->error() << info.pgid << " recovery ending with " << missing.num_missing()
11063 << ": " << missing.get_items();
11064 return work_in_progress;
11065 }
11066
11067 if (needs_recovery()) {
11068 // this shouldn't happen!
11069 // We already checked num_missing() so we must have missing replicas
11070 osd->clog->error() << info.pgid << " recovery ending with missing replicas";
11071 return work_in_progress;
11072 }
11073
11074 if (state_test(PG_STATE_RECOVERING)) {
11075 state_clear(PG_STATE_RECOVERING);
11076 if (needs_backfill()) {
11077 dout(10) << "recovery done, queuing backfill" << dendl;
11078 queue_peering_event(
11079 CephPeeringEvtRef(
11080 std::make_shared<CephPeeringEvt>(
11081 get_osdmap()->get_epoch(),
11082 get_osdmap()->get_epoch(),
11083 RequestBackfill())));
11084 } else {
11085 dout(10) << "recovery done, no backfill" << dendl;
11086 queue_peering_event(
11087 CephPeeringEvtRef(
11088 std::make_shared<CephPeeringEvt>(
11089 get_osdmap()->get_epoch(),
11090 get_osdmap()->get_epoch(),
11091 AllReplicasRecovered())));
11092 }
11093 } else { // backfilling
11094 state_clear(PG_STATE_BACKFILL);
11095 dout(10) << "recovery done, backfill done" << dendl;
11096 queue_peering_event(
11097 CephPeeringEvtRef(
11098 std::make_shared<CephPeeringEvt>(
11099 get_osdmap()->get_epoch(),
11100 get_osdmap()->get_epoch(),
11101 Backfilled())));
11102 }
11103
11104 return false;
11105 }
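
// The decision ladder above reduces, roughly, to a pure choice of phase:
// if everything missing locally is unfound, push to the replicas first;
// otherwise pull to the primary; backfill only proceeds once recovery
// proper has quiesced.  A simplified model of that ordering; the enum and
// helper are hypothetical, not Ceph API.
#include <cstdint>

enum class RecoveryPhase { Replicas, Primary, Backfill, Idle };

static RecoveryPhase choose_recovery_phase(uint64_t num_missing,
                                           uint64_t num_unfound,
                                           bool recovering_empty,
                                           bool backfill_pending)
{
  if (num_missing > 0 && num_missing == num_unfound)
    return RecoveryPhase::Replicas;  // all local misses are unfound
  if (num_missing > 0)
    return RecoveryPhase::Primary;   // pull missing objects to the primary
  if (recovering_empty && backfill_pending)
    return RecoveryPhase::Backfill;  // recovery quiesced; run backfill
  return RecoveryPhase::Idle;
}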
11106
11107 /**
11108 * start recovery ops on objects missing from the primary.
11109 * return the number of recovery ops started.
11110 */
11111 uint64_t PrimaryLogPG::recover_primary(uint64_t max, ThreadPool::TPHandle &handle)
11112 {
11113 assert(is_primary());
11114
11115 const pg_missing_t &missing = pg_log.get_missing();
11116
11117 dout(10) << "recover_primary recovering " << recovering.size()
11118 << " in pg" << dendl;
11119 dout(10) << "recover_primary " << missing << dendl;
11120 dout(25) << "recover_primary " << missing.get_items() << dendl;
11121
11122 // look at log!
11123 pg_log_entry_t *latest = 0;
11124 unsigned started = 0;
11125 int skipped = 0;
11126
11127 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11128 map<version_t, hobject_t>::const_iterator p =
11129 missing.get_rmissing().lower_bound(pg_log.get_log().last_requested);
11130 while (p != missing.get_rmissing().end()) {
11131 handle.reset_tp_timeout();
11132 hobject_t soid;
11133 version_t v = p->first;
11134
11135 if (pg_log.get_log().objects.count(p->second)) {
11136 latest = pg_log.get_log().objects.find(p->second)->second;
11137 assert(latest->is_update());
11138 soid = latest->soid;
11139 } else {
11140 latest = 0;
11141 soid = p->second;
11142 }
11143 const pg_missing_item& item = missing.get_items().find(p->second)->second;
11144 ++p;
11145
11146 hobject_t head = soid;
11147 head.snap = CEPH_NOSNAP;
11148
11149 eversion_t need = item.need;
11150
11151 dout(10) << "recover_primary "
11152 << soid << " " << item.need
11153 << (missing.is_missing(soid) ? " (missing)":"")
11154 << (missing.is_missing(head) ? " (missing head)":"")
11155 << (recovering.count(soid) ? " (recovering)":"")
11156 << (recovering.count(head) ? " (recovering head)":"")
11157 << dendl;
11158
11159 if (latest) {
11160 switch (latest->op) {
11161 case pg_log_entry_t::CLONE:
11162 /*
11163 * Handling for this special case removed for now, until we
11164 * can correctly construct an accurate SnapSet from the old
11165 * one.
11166 */
11167 break;
11168
11169 case pg_log_entry_t::LOST_REVERT:
11170 {
11171 if (item.have == latest->reverting_to) {
11172 ObjectContextRef obc = get_object_context(soid, true);
11173
11174 if (obc->obs.oi.version == latest->version) {
11175 // I'm already reverting
11176 dout(10) << " already reverting " << soid << dendl;
11177 } else {
11178 dout(10) << " reverting " << soid << " to " << latest->prior_version << dendl;
11179 obc->ondisk_write_lock();
11180 obc->obs.oi.version = latest->version;
11181
11182 ObjectStore::Transaction t;
11183 bufferlist b2;
11184 obc->obs.oi.encode(
11185 b2,
11186 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
11187 assert(!pool.info.require_rollback());
11188 t.setattr(coll, ghobject_t(soid), OI_ATTR, b2);
11189
11190 recover_got(soid, latest->version);
11191 missing_loc.add_location(soid, pg_whoami);
11192
11193 ++active_pushes;
11194
11195 osd->store->queue_transaction(osr.get(), std::move(t),
11196 new C_OSD_AppliedRecoveredObject(this, obc),
11197 new C_OSD_CommittedPushedObject(
11198 this,
11199 get_osdmap()->get_epoch(),
11200 info.last_complete),
11201 new C_OSD_OndiskWriteUnlock(obc));
11202 continue;
11203 }
11204 } else {
11205 /*
11206 * Pull the old version of the object. Update missing_loc here to have the location
11207 * of the version we want.
11208 *
11209 * This doesn't use the usual missing_loc paths, but that's okay:
11210 * - if we have it locally, we hit the case above, and go from there.
11211 * - if we don't, we always pass through this case during recovery and set up the location
11212 * properly.
11213 * - this way we don't need to mangle the missing code to be general about needing an old
11214 * version...
11215 */
11216 eversion_t alternate_need = latest->reverting_to;
11217 dout(10) << " need to pull prior_version " << alternate_need << " for revert " << item << dendl;
11218
11219 for (map<pg_shard_t, pg_missing_t>::iterator p = peer_missing.begin();
11220 p != peer_missing.end();
11221 ++p)
11222 if (p->second.is_missing(soid, need) &&
11223 p->second.get_items().at(soid).have == alternate_need) {
11224 missing_loc.add_location(soid, p->first);
11225 }
11226 dout(10) << " will pull " << alternate_need << " or " << need
11227 << " from one of " << missing_loc.get_locations(soid)
11228 << dendl;
11229 }
11230 }
11231 break;
11232 }
11233 }
11234
11235 if (!recovering.count(soid)) {
11236 if (recovering.count(head)) {
11237 ++skipped;
11238 } else {
11239 int r = recover_missing(
11240 soid, need, get_recovery_op_priority(), h);
11241 switch (r) {
11242 case PULL_YES:
11243 ++started;
11244 break;
11245 case PULL_OTHER:
11246 ++started; // fall through: we pulled a different object; count soid as skipped too
11247 case PULL_NONE:
11248 ++skipped;
11249 break;
11250 default:
11251 ceph_abort();
11252 }
11253 if (started >= max)
11254 break;
11255 }
11256 }
11257
11258 // only advance last_requested if we haven't skipped anything
11259 if (!skipped)
11260 pg_log.set_last_requested(v);
11261 }
11262
11263 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11264 return started;
11265 }
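
// recover_primary() walks the missing set in version order from the
// last_requested cursor and only advances that cursor while nothing has
// been skipped, so skipped objects are revisited on the next pass.  A
// minimal stand-alone sketch of that cursor discipline (hypothetical
// helper; process() returns true if the entry was handled).
#include <cstdint>
#include <map>
#include <string>

template <typename Fn>
static uint64_t walk_missing(const std::map<uint64_t, std::string> &rmissing,
                             uint64_t last_requested, Fn process)
{
  unsigned skipped = 0;
  for (auto p = rmissing.lower_bound(last_requested);
       p != rmissing.end();
       ++p) {
    if (!process(p->second))
      ++skipped;
    if (!skipped)
      last_requested = p->first;  // only advance past fully handled entries
  }
  return last_requested;
}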
11266
11267 int PrimaryLogPG::prep_object_replica_pushes(
11268 const hobject_t& soid, eversion_t v,
11269 PGBackend::RecoveryHandle *h)
11270 {
11271 assert(is_primary());
11272 dout(10) << __func__ << ": on " << soid << dendl;
11273
11274 // NOTE: we know we will get a valid oloc off of disk here.
11275 ObjectContextRef obc = get_object_context(soid, false);
11276 if (!obc) {
11277 pg_log.missing_add(soid, v, eversion_t());
11278 missing_loc.remove_location(soid, pg_whoami);
11279 bool uhoh = true;
11280 assert(!actingbackfill.empty());
11281 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11282 i != actingbackfill.end();
11283 ++i) {
11284 if (*i == get_primary()) continue;
11285 pg_shard_t peer = *i;
11286 if (!peer_missing[peer].is_missing(soid, v)) {
11287 missing_loc.add_location(soid, peer);
11288 dout(10) << info.pgid << " unexpectedly missing " << soid << " v" << v
11289 << ", there should be a copy on shard " << peer << dendl;
11290 uhoh = false;
11291 }
11292 }
11293 if (uhoh)
11294 osd->clog->error() << info.pgid << " missing primary copy of " << soid << ", unfound";
11295 else
11296 osd->clog->error() << info.pgid << " missing primary copy of " << soid
11297 << ", will try copies on " << missing_loc.get_locations(soid);
11298 return 0;
11299 }
11300
11301 if (!obc->get_recovery_read()) {
11302 dout(20) << "recovery delayed on " << soid
11303 << "; could not get rw_manager lock" << dendl;
11304 return 0;
11305 } else {
11306 dout(20) << "recovery got recovery read lock on " << soid
11307 << dendl;
11308 }
11309
11310 start_recovery_op(soid);
11311 assert(!recovering.count(soid));
11312 recovering.insert(make_pair(soid, obc));
11313
11314 /* We need this in case there is an in progress write on the object. In fact,
11315 * the only possible write is an update to the xattr due to a lost_revert --
11316 * a client write would be blocked since the object is degraded.
11317 * In almost all cases, therefore, this lock should be uncontended.
11318 */
11319 obc->ondisk_read_lock();
11320 pgbackend->recover_object(
11321 soid,
11322 v,
11323 ObjectContextRef(),
11324 obc, // has snapset context
11325 h);
11326 obc->ondisk_read_unlock();
11327 return 1;
11328 }
11329
11330 uint64_t PrimaryLogPG::recover_replicas(uint64_t max, ThreadPool::TPHandle &handle)
11331 {
11332 dout(10) << __func__ << "(" << max << ")" << dendl;
11333 uint64_t started = 0;
11334
11335 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11336
11337 // this is FAR from an optimal recovery order. pretty lame, really.
11338 assert(!actingbackfill.empty());
11339 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
11340 i != actingbackfill.end();
11341 ++i) {
11342 if (*i == get_primary()) continue;
11343 pg_shard_t peer = *i;
11344 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
11345 assert(pm != peer_missing.end());
11346 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
11347 assert(pi != peer_info.end());
11348 size_t m_sz = pm->second.num_missing();
11349
11350 dout(10) << " peer osd." << peer << " missing " << m_sz << " objects." << dendl;
11351 dout(20) << " peer osd." << peer << " missing " << pm->second.get_items() << dendl;
11352
11353 // oldest first!
11354 const pg_missing_t &m(pm->second);
11355 for (map<version_t, hobject_t>::const_iterator p = m.get_rmissing().begin();
11356 p != m.get_rmissing().end() && started < max;
11357 ++p) {
11358 handle.reset_tp_timeout();
11359 const hobject_t soid(p->second);
11360
11361 if (soid > pi->second.last_backfill) {
11362 if (!recovering.count(soid)) {
11363 derr << __func__ << ": object added to missing set for backfill, but "
11364 << "is not in recovering, error!" << dendl;
11365 ceph_abort();
11366 }
11367 continue;
11368 }
11369
11370 if (recovering.count(soid)) {
11371 dout(10) << __func__ << ": already recovering " << soid << dendl;
11372 continue;
11373 }
11374
11375 if (missing_loc.is_unfound(soid)) {
11376 dout(10) << __func__ << ": " << soid << " still unfound" << dendl;
11377 continue;
11378 }
11379
11380 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_head())) {
11381 dout(10) << __func__ << ": " << soid.get_head()
11382 << " still missing on primary" << dendl;
11383 continue;
11384 }
11385
11386 if (soid.is_snap() && pg_log.get_missing().is_missing(soid.get_snapdir())) {
11387 dout(10) << __func__ << ": " << soid.get_snapdir()
11388 << " still missing on primary" << dendl;
11389 continue;
11390 }
11391
11392 if (pg_log.get_missing().is_missing(soid)) {
11393 dout(10) << __func__ << ": " << soid << " still missing on primary" << dendl;
11394 continue;
11395 }
11396
11397 dout(10) << __func__ << ": recover_object_replicas(" << soid << ")" << dendl;
11398 map<hobject_t,pg_missing_item>::const_iterator r = m.get_items().find(soid);
11399 started += prep_object_replica_pushes(soid, r->second.need,
11400 h);
11401 }
11402 }
11403
11404 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11405 return started;
11406 }
11407
11408 hobject_t PrimaryLogPG::earliest_peer_backfill() const
11409 {
11410 hobject_t e = hobject_t::get_max();
11411 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11412 i != backfill_targets.end();
11413 ++i) {
11414 pg_shard_t peer = *i;
11415 map<pg_shard_t, BackfillInterval>::const_iterator iter =
11416 peer_backfill_info.find(peer);
11417 assert(iter != peer_backfill_info.end());
11418 if (iter->second.begin < e)
11419 e = iter->second.begin;
11420 }
11421 return e;
11422 }
11423
11424 bool PrimaryLogPG::all_peer_done() const
11425 {
11426 // Primary hasn't got any more objects
11427 assert(backfill_info.empty());
11428
11429 for (set<pg_shard_t>::const_iterator i = backfill_targets.begin();
11430 i != backfill_targets.end();
11431 ++i) {
11432 pg_shard_t bt = *i;
11433 map<pg_shard_t, BackfillInterval>::const_iterator piter =
11434 peer_backfill_info.find(bt);
11435 assert(piter != peer_backfill_info.end());
11436 const BackfillInterval& pbi = piter->second;
11437 // See if peer has more to process
11438 if (!pbi.extends_to_end() || !pbi.empty())
11439 return false;
11440 }
11441 return true;
11442 }
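
// Both helpers above are simple folds over the peer backfill intervals:
// the earliest interval begin, and "every peer interval is drained and
// extends to the end of the hash space".  A sketch with a trimmed-down
// interval type; MiniInterval and these helpers are hypothetical.
#include <algorithm>
#include <limits>
#include <map>

struct MiniInterval {
  int begin = 0;        // stands in for BackfillInterval::begin
  bool at_end = false;  // stands in for extends_to_end()
  bool drained = true;  // stands in for empty()
};

static int earliest_begin(const std::map<int, MiniInterval> &peers)
{
  int e = std::numeric_limits<int>::max();  // analogue of hobject_t::get_max()
  for (const auto &p : peers)
    e = std::min(e, p.second.begin);
  return e;
}

static bool all_done(const std::map<int, MiniInterval> &peers)
{
  for (const auto &p : peers)
    if (!p.second.at_end || !p.second.drained)
      return false;
  return true;
}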
11443
11444 /**
11445 * recover_backfill
11446 *
11447 * Invariants:
11448 *
11449 * backfilled: fully pushed to replica or present in replica's missing set (both
11450 * our copy and theirs).
11451 *
11452 * All objects on a backfill_target in
11453 * [MIN,peer_backfill_info[backfill_target].begin) are valid; logically-removed
11454 * objects have been actually deleted and all logically-valid objects are replicated.
11455 * There may be PG objects in this interval yet to be backfilled.
11456 *
11457 * All objects in PG in [MIN,backfill_info.begin) have been backfilled to all
11458 * backfill_targets. There may be objects on backfill_target(s) yet to be deleted.
11459 *
11460 * For a backfill target, all objects < MIN(peer_backfill_info[target].begin,
11461 * backfill_info.begin) in PG are backfilled. No deleted objects in this
11462 * interval remain on the backfill target.
11463 *
11464 * For a backfill target, all objects <= peer_info[target].last_backfill
11465 * have been backfilled to target
11466 *
11467 * There *MAY* be missing/outdated objects between last_backfill_started and
11468 * MIN(peer_backfill_info[*].begin, backfill_info.begin) in the event that client
11469 * io created objects since the last scan. For this reason, we call
11470 * update_range() again before continuing backfill.
11471 */
11472 uint64_t PrimaryLogPG::recover_backfill(
11473 uint64_t max,
11474 ThreadPool::TPHandle &handle, bool *work_started)
11475 {
11476 dout(10) << "recover_backfill (" << max << ")"
11477 << " bft=" << backfill_targets
11478 << " last_backfill_started " << last_backfill_started
11479 << (new_backfill ? " new_backfill":"")
11480 << dendl;
11481 assert(!backfill_targets.empty());
11482
11483 // Initialize from prior backfill state
11484 if (new_backfill) {
11485 // on_activate() was called prior to getting here
11486 assert(last_backfill_started == earliest_backfill());
11487 new_backfill = false;
11488
11489 // initialize BackfillIntervals
11490 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11491 i != backfill_targets.end();
11492 ++i) {
11493 peer_backfill_info[*i].reset(peer_info[*i].last_backfill);
11494 }
11495 backfill_info.reset(last_backfill_started);
11496
11497 backfills_in_flight.clear();
11498 pending_backfill_updates.clear();
11499 }
11500
11501 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11502 i != backfill_targets.end();
11503 ++i) {
11504 dout(10) << "peer osd." << *i
11505 << " info " << peer_info[*i]
11506 << " interval " << peer_backfill_info[*i].begin
11507 << "-" << peer_backfill_info[*i].end
11508 << " " << peer_backfill_info[*i].objects.size() << " objects"
11509 << dendl;
11510 }
11511
11512 // update our local interval to cope with recent changes
11513 backfill_info.begin = last_backfill_started;
11514 update_range(&backfill_info, handle);
11515
11516 unsigned ops = 0;
11517 vector<boost::tuple<hobject_t, eversion_t,
11518 ObjectContextRef, vector<pg_shard_t> > > to_push;
11519 vector<boost::tuple<hobject_t, eversion_t, pg_shard_t> > to_remove;
11520 set<hobject_t> add_to_stat;
11521
11522 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11523 i != backfill_targets.end();
11524 ++i) {
11525 peer_backfill_info[*i].trim_to(
11526 std::max(peer_info[*i].last_backfill, last_backfill_started));
11527 }
11528 backfill_info.trim_to(last_backfill_started);
11529
11530 while (ops < max) {
11531 if (backfill_info.begin <= earliest_peer_backfill() &&
11532 !backfill_info.extends_to_end() && backfill_info.empty()) {
11533 hobject_t next = backfill_info.end;
11534 backfill_info.reset(next);
11535 backfill_info.end = hobject_t::get_max();
11536 update_range(&backfill_info, handle);
11537 backfill_info.trim();
11538 }
11539
11540 dout(20) << " my backfill interval " << backfill_info << dendl;
11541
11542 bool sent_scan = false;
11543 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11544 i != backfill_targets.end();
11545 ++i) {
11546 pg_shard_t bt = *i;
11547 BackfillInterval& pbi = peer_backfill_info[bt];
11548
11549 dout(20) << " peer shard " << bt << " backfill " << pbi << dendl;
11550 if (pbi.begin <= backfill_info.begin &&
11551 !pbi.extends_to_end() && pbi.empty()) {
11552 dout(10) << " scanning peer osd." << bt << " from " << pbi.end << dendl;
11553 epoch_t e = get_osdmap()->get_epoch();
11554 MOSDPGScan *m = new MOSDPGScan(
11555 MOSDPGScan::OP_SCAN_GET_DIGEST, pg_whoami, e, last_peering_reset,
11556 spg_t(info.pgid.pgid, bt.shard),
11557 pbi.end, hobject_t());
11558 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11559 assert(waiting_on_backfill.find(bt) == waiting_on_backfill.end());
11560 waiting_on_backfill.insert(bt);
11561 sent_scan = true;
11562 }
11563 }
11564
11565 // Count simultaneous scans as a single op and let those complete
11566 if (sent_scan) {
11567 ops++;
11568 start_recovery_op(hobject_t::get_max()); // XXX: was pbi.end
11569 break;
11570 }
11571
11572 if (backfill_info.empty() && all_peer_done()) {
11573 dout(10) << " reached end for both local and all peers" << dendl;
11574 break;
11575 }
11576
11577 // Get object within set of peers to operate on and
11578 // the set of targets to which that object applies.
11579 hobject_t check = earliest_peer_backfill();
11580
11581 if (check < backfill_info.begin) {
11582
11583 set<pg_shard_t> check_targets;
11584 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11585 i != backfill_targets.end();
11586 ++i) {
11587 pg_shard_t bt = *i;
11588 BackfillInterval& pbi = peer_backfill_info[bt];
11589 if (pbi.begin == check)
11590 check_targets.insert(bt);
11591 }
11592 assert(!check_targets.empty());
11593
11594 dout(20) << " BACKFILL removing " << check
11595 << " from peers " << check_targets << dendl;
11596 for (set<pg_shard_t>::iterator i = check_targets.begin();
11597 i != check_targets.end();
11598 ++i) {
11599 pg_shard_t bt = *i;
11600 BackfillInterval& pbi = peer_backfill_info[bt];
11601 assert(pbi.begin == check);
11602
11603 to_remove.push_back(boost::make_tuple(check, pbi.objects.begin()->second, bt));
11604 pbi.pop_front();
11605 }
11606
11607 /* This requires a bit of explanation. We compare head against
11608 * last_backfill to determine whether to send an operation
11609 * to the replica. A single write operation can touch up to three
11610 * objects: head, the snapdir, and a new clone which sorts closer to
11611 * head than any existing clone. If last_backfill points at a clone,
11612 * the transaction won't be sent and all 3 must lie on the right side
11613 * of the line (i.e., we'll backfill them later). If last_backfill
11614 * points at snapdir, it sorts greater than head, so we send the
11615 * transaction which is correct because all three must lie to the left
11616 * of the line.
11617 *
11618 * If it points at head, we have a bit of an issue. If head actually
11619 * exists, no problem, because any transaction which touches snapdir
11620 * must end up creating it (and deleting head), so sending the
11621 * operation won't pose a problem -- we'll end up having to scan it,
11622 * but it'll end up being the right version so we won't bother to
11623 * rebackfill it. However, if head doesn't exist, any write on head
11624 * will remove snapdir. For a replicated pool, this isn't a problem,
11625 * ENOENT on remove isn't an issue and it's in backfill future anyway.
11626 * It only poses a problem for EC pools, because we never just delete
11627 * an object, we rename it into a rollback object. That operation
11628 * will end up crashing the osd with ENOENT. Tolerating the failure
11629 * wouldn't work either, even if snapdir exists, we'd be creating a
11630 * rollback object past the last_backfill line which wouldn't get
11631 * cleaned up (no rollback objects past the last_backfill line is an
11632 * existing important invariant). Thus, let's avoid the whole issue
11633 * by just not updating last_backfill_started here if head doesn't
11634 * exist and snapdir does. We aren't using up a recovery count here,
11635 * so we're going to recover snapdir immediately anyway. We'll only
11636 * fail "backward" if we fail to get the rw lock and that just means
11637 * we'll re-process this section of the hash space again.
11638 *
11639 * I'm choosing this hack here because the really "correct" answer is
11640 * going to be to unify snapdir and head into a single object (a
11641 * snapdir is really just a confusing way to talk about head existing
11642 * as a whiteout), but doing that is going to be a somewhat larger
11643 * undertaking.
11644 *
11645 * @see http://tracker.ceph.com/issues/17668
11646 */
11647 if (!(check.is_head() &&
11648 backfill_info.begin.is_snapdir() &&
11649 check == backfill_info.begin.get_head()))
11650 last_backfill_started = check;
11651
11652 // Don't increment ops here: deletions are cheap and, unlike
11653 // real recovery_ops, are not replied to; and we can't
11654 // increment ops without requeueing ourselves
11655 // for recovery.
11656 } else {
11657 eversion_t& obj_v = backfill_info.objects.begin()->second;
11658
11659 vector<pg_shard_t> need_ver_targs, missing_targs, keep_ver_targs, skip_targs;
11660 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11661 i != backfill_targets.end();
11662 ++i) {
11663 pg_shard_t bt = *i;
11664 BackfillInterval& pbi = peer_backfill_info[bt];
11665 // Find all check peers that have the wrong version
11666 if (check == backfill_info.begin && check == pbi.begin) {
11667 if (pbi.objects.begin()->second != obj_v) {
11668 need_ver_targs.push_back(bt);
11669 } else {
11670 keep_ver_targs.push_back(bt);
11671 }
11672 } else {
11673 pg_info_t& pinfo = peer_info[bt];
11674
11675 // Only include peers whose backfill line we've caught up to;
11676 // otherwise, they only appear to be missing this object
11677 // because their pbi.begin > backfill_info.begin.
11678 if (backfill_info.begin > pinfo.last_backfill)
11679 missing_targs.push_back(bt);
11680 else
11681 skip_targs.push_back(bt);
11682 }
11683 }
11684
11685 if (!keep_ver_targs.empty()) {
11686 // These peers have version obj_v
11687 dout(20) << " BACKFILL keeping " << check
11688 << " with ver " << obj_v
11689 << " on peers " << keep_ver_targs << dendl;
11690 //assert(!waiting_for_degraded_object.count(check));
11691 }
11692 if (!need_ver_targs.empty() || !missing_targs.empty()) {
11693 ObjectContextRef obc = get_object_context(backfill_info.begin, false);
11694 assert(obc);
11695 if (obc->get_recovery_read()) {
11696 if (!need_ver_targs.empty()) {
11697 dout(20) << " BACKFILL replacing " << check
11698 << " with ver " << obj_v
11699 << " to peers " << need_ver_targs << dendl;
11700 }
11701 if (!missing_targs.empty()) {
11702 dout(20) << " BACKFILL pushing " << backfill_info.begin
11703 << " with ver " << obj_v
11704 << " to peers " << missing_targs << dendl;
11705 }
11706 vector<pg_shard_t> all_push = need_ver_targs;
11707 all_push.insert(all_push.end(), missing_targs.begin(), missing_targs.end());
11708
11709 to_push.push_back(
11710 boost::tuple<hobject_t, eversion_t, ObjectContextRef, vector<pg_shard_t> >
11711 (backfill_info.begin, obj_v, obc, all_push));
11712 // Count all simultaneous pushes of the same object as a single op
11713 ops++;
11714 } else {
11715 *work_started = true;
11716 dout(20) << "backfill blocking on " << backfill_info.begin
11717 << "; could not get rw_manager lock" << dendl;
11718 break;
11719 }
11720 }
11721 dout(20) << "need_ver_targs=" << need_ver_targs
11722 << " keep_ver_targs=" << keep_ver_targs << dendl;
11723 dout(20) << "backfill_targets=" << backfill_targets
11724 << " missing_targs=" << missing_targs
11725 << " skip_targs=" << skip_targs << dendl;
11726
11727 last_backfill_started = backfill_info.begin;
11728 add_to_stat.insert(backfill_info.begin); // XXX: Only one for all pushes?
11729 backfill_info.pop_front();
11730 vector<pg_shard_t> check_targets = need_ver_targs;
11731 check_targets.insert(check_targets.end(), keep_ver_targs.begin(), keep_ver_targs.end());
11732 for (vector<pg_shard_t>::iterator i = check_targets.begin();
11733 i != check_targets.end();
11734 ++i) {
11735 pg_shard_t bt = *i;
11736 BackfillInterval& pbi = peer_backfill_info[bt];
11737 pbi.pop_front();
11738 }
11739 }
11740 }
11741
11742 hobject_t backfill_pos =
11743 std::min(backfill_info.begin, earliest_peer_backfill());
11744
11745 for (set<hobject_t>::iterator i = add_to_stat.begin();
11746 i != add_to_stat.end();
11747 ++i) {
11748 ObjectContextRef obc = get_object_context(*i, false);
11749 assert(obc);
11750 pg_stat_t stat;
11751 add_object_context_to_pg_stat(obc, &stat);
11752 pending_backfill_updates[*i] = stat;
11753 }
11754 if (HAVE_FEATURE(get_min_upacting_features(), SERVER_LUMINOUS)) {
11755 map<pg_shard_t,MOSDPGBackfillRemove*> reqs;
11756 for (unsigned i = 0; i < to_remove.size(); ++i) {
11757 handle.reset_tp_timeout();
11758 const hobject_t& oid = to_remove[i].get<0>();
11759 eversion_t v = to_remove[i].get<1>();
11760 pg_shard_t peer = to_remove[i].get<2>();
11761 MOSDPGBackfillRemove *m;
11762 auto it = reqs.find(peer);
11763 if (it != reqs.end()) {
11764 m = it->second;
11765 } else {
11766 m = reqs[peer] = new MOSDPGBackfillRemove(
11767 spg_t(info.pgid.pgid, peer.shard),
11768 get_osdmap()->get_epoch());
11769 }
11770 m->ls.push_back(make_pair(oid, v));
11771
11772 if (oid <= last_backfill_started)
11773 pending_backfill_updates[oid]; // add empty stat!
11774 }
11775 for (auto p : reqs) {
11776 osd->send_message_osd_cluster(p.first.osd, p.second,
11777 get_osdmap()->get_epoch());
11778 }
11779 } else {
11780 // for jewel targets
11781 for (unsigned i = 0; i < to_remove.size(); ++i) {
11782 handle.reset_tp_timeout();
11783
11784 // ordered before any subsequent updates
11785 send_remove_op(to_remove[i].get<0>(), to_remove[i].get<1>(),
11786 to_remove[i].get<2>());
11787
11788 if (to_remove[i].get<0>() <= last_backfill_started)
11789 pending_backfill_updates[to_remove[i].get<0>()]; // add empty stat!
11790 }
11791 }
11792
11793 PGBackend::RecoveryHandle *h = pgbackend->open_recovery_op();
11794 for (unsigned i = 0; i < to_push.size(); ++i) {
11795 handle.reset_tp_timeout();
11796 prep_backfill_object_push(to_push[i].get<0>(), to_push[i].get<1>(),
11797 to_push[i].get<2>(), to_push[i].get<3>(), h);
11798 }
11799 pgbackend->run_recovery_op(h, get_recovery_op_priority());
11800
11801 dout(5) << "backfill_pos is " << backfill_pos << dendl;
11802 for (set<hobject_t>::iterator i = backfills_in_flight.begin();
11803 i != backfills_in_flight.end();
11804 ++i) {
11805 dout(20) << *i << " is still in flight" << dendl;
11806 }
11807
11808 hobject_t next_backfill_to_complete = backfills_in_flight.empty() ?
11809 backfill_pos : *(backfills_in_flight.begin());
11810 hobject_t new_last_backfill = earliest_backfill();
11811 dout(10) << "starting new_last_backfill at " << new_last_backfill << dendl;
11812 for (map<hobject_t, pg_stat_t>::iterator i =
11813 pending_backfill_updates.begin();
11814 i != pending_backfill_updates.end() &&
11815 i->first < next_backfill_to_complete;
11816 pending_backfill_updates.erase(i++)) {
11817 dout(20) << " pending_backfill_update " << i->first << dendl;
11818 assert(i->first > new_last_backfill);
11819 for (set<pg_shard_t>::iterator j = backfill_targets.begin();
11820 j != backfill_targets.end();
11821 ++j) {
11822 pg_shard_t bt = *j;
11823 pg_info_t& pinfo = peer_info[bt];
11824 //Add stats to all peers that were missing object
11825 if (i->first > pinfo.last_backfill)
11826 pinfo.stats.add(i->second);
11827 }
11828 new_last_backfill = i->first;
11829 }
11830 dout(10) << "possible new_last_backfill at " << new_last_backfill << dendl;
11831
11832 assert(!pending_backfill_updates.empty() ||
11833 new_last_backfill == last_backfill_started);
11834 if (pending_backfill_updates.empty() &&
11835 backfill_pos.is_max()) {
11836 assert(backfills_in_flight.empty());
11837 new_last_backfill = backfill_pos;
11838 last_backfill_started = backfill_pos;
11839 }
11840 dout(10) << "final new_last_backfill at " << new_last_backfill << dendl;
11841
11842 // If new_last_backfill == MAX, then we will send OP_BACKFILL_FINISH to
11843 // all the backfill targets. Otherwise, we will move last_backfill up on
11844 // those targets that need it and send OP_BACKFILL_PROGRESS to them.
11845 for (set<pg_shard_t>::iterator i = backfill_targets.begin();
11846 i != backfill_targets.end();
11847 ++i) {
11848 pg_shard_t bt = *i;
11849 pg_info_t& pinfo = peer_info[bt];
11850
11851 if (new_last_backfill > pinfo.last_backfill) {
11852 pinfo.set_last_backfill(new_last_backfill);
11853 epoch_t e = get_osdmap()->get_epoch();
11854 MOSDPGBackfill *m = NULL;
11855 if (pinfo.last_backfill.is_max()) {
11856 m = new MOSDPGBackfill(
11857 MOSDPGBackfill::OP_BACKFILL_FINISH,
11858 e,
11859 last_peering_reset,
11860 spg_t(info.pgid.pgid, bt.shard));
11861 // Use default priority here, must match sub_op priority
11862 /* pinfo.stats might be wrong if we did log-based recovery on the
11863 * backfilled portion in addition to continuing backfill.
11864 */
11865 pinfo.stats = info.stats;
11866 start_recovery_op(hobject_t::get_max());
11867 } else {
11868 m = new MOSDPGBackfill(
11869 MOSDPGBackfill::OP_BACKFILL_PROGRESS,
11870 e,
11871 last_peering_reset,
11872 spg_t(info.pgid.pgid, bt.shard));
11873 // Use default priority here, must match sub_op priority
11874 }
11875 m->last_backfill = pinfo.last_backfill;
11876 m->stats = pinfo.stats;
11877 osd->send_message_osd_cluster(bt.osd, m, get_osdmap()->get_epoch());
11878 dout(10) << " peer " << bt
11879 << " num_objects now " << pinfo.stats.stats.sum.num_objects
11880 << " / " << info.stats.stats.sum.num_objects << dendl;
11881 }
11882 }
11883
11884 if (ops)
11885 *work_started = true;
11886 return ops;
11887 }
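
// The last_backfill advancement at the end of recover_backfill() is a
// bounded scan: completed per-object stats are applied in order, and the
// high-water mark only moves up to (never past) the earliest push still
// in flight.  A minimal sketch of that bookkeeping with strings standing
// in for hobject_t; the helper is hypothetical.
#include <map>
#include <set>
#include <string>

static std::string advance_last_backfill(
  std::map<std::string, int> &pending,     // sorted: object -> stat delta
  const std::set<std::string> &in_flight,  // pushes not yet completed
  const std::string &scan_pos,             // analogue of backfill_pos
  std::string last_backfill)
{
  // we cannot complete past the earliest object still being pushed
  const std::string &bound =
    in_flight.empty() ? scan_pos : *in_flight.begin();
  for (auto i = pending.begin();
       i != pending.end() && i->first < bound;
       i = pending.erase(i)) {
    // a real implementation applies i->second to each lagging peer's stats
    last_backfill = i->first;
  }
  return last_backfill;
}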
11888
11889 void PrimaryLogPG::prep_backfill_object_push(
11890 hobject_t oid, eversion_t v,
11891 ObjectContextRef obc,
11892 vector<pg_shard_t> peers,
11893 PGBackend::RecoveryHandle *h)
11894 {
11895 dout(10) << "push_backfill_object " << oid << " v " << v << " to peers " << peers << dendl;
11896 assert(!peers.empty());
11897
11898 backfills_in_flight.insert(oid);
11899 for (unsigned int i = 0 ; i < peers.size(); ++i) {
11900 map<pg_shard_t, pg_missing_t>::iterator bpm = peer_missing.find(peers[i]);
11901 assert(bpm != peer_missing.end());
11902 bpm->second.add(oid, eversion_t(), eversion_t());
11903 }
11904
11905 assert(!recovering.count(oid));
11906
11907 start_recovery_op(oid);
11908 recovering.insert(make_pair(oid, obc));
11909
11910 // We need to take the read_lock here in order to flush in-progress writes
11911 obc->ondisk_read_lock();
11912 pgbackend->recover_object(
11913 oid,
11914 v,
11915 ObjectContextRef(),
11916 obc,
11917 h);
11918 obc->ondisk_read_unlock();
11919 }
11920
11921 void PrimaryLogPG::update_range(
11922 BackfillInterval *bi,
11923 ThreadPool::TPHandle &handle)
11924 {
11925 int local_min = cct->_conf->osd_backfill_scan_min;
11926 int local_max = cct->_conf->osd_backfill_scan_max;
11927
11928 if (bi->version < info.log_tail) {
11929 dout(10) << __func__ << ": bi is old, rescanning local backfill_info"
11930 << dendl;
11931 if (last_update_applied >= info.log_tail) {
11932 bi->version = last_update_applied;
11933 } else {
11934 osr->flush();
11935 bi->version = info.last_update;
11936 }
11937 scan_range(local_min, local_max, bi, handle);
11938 }
11939
11940 if (bi->version >= projected_last_update) {
11941 dout(10) << __func__ << ": bi is current " << dendl;
11942 assert(bi->version == projected_last_update);
11943 } else if (bi->version >= info.log_tail) {
11944 if (pg_log.get_log().empty() && projected_log.empty()) {
11945 /* Because we don't move log_tail on split, the log might be
11946 * empty even if log_tail != last_update. However, the only
11947 * way to get here with an empty log is if log_tail is actually
11948 * eversion_t(), because otherwise the entry which changed
11949 * last_update since the last scan would have to be present.
11950 */
11951 assert(bi->version == eversion_t());
11952 return;
11953 }
11954
11955 dout(10) << __func__ << ": bi is old, (" << bi->version
11956 << ") can be updated with log to projected_last_update "
11957 << projected_last_update << dendl;
11958
11959 auto func = [&](const pg_log_entry_t &e) {
11960 dout(10) << __func__ << ": updating from version " << e.version
11961 << dendl;
11962 const hobject_t &soid = e.soid;
11963 if (soid >= bi->begin &&
11964 soid < bi->end) {
11965 if (e.is_update()) {
11966 dout(10) << __func__ << ": " << e.soid << " updated to version "
11967 << e.version << dendl;
11968 bi->objects.erase(e.soid);
11969 bi->objects.insert(
11970 make_pair(
11971 e.soid,
11972 e.version));
11973 } else if (e.is_delete()) {
11974 dout(10) << __func__ << ": " << e.soid << " removed" << dendl;
11975 bi->objects.erase(e.soid);
11976 }
11977 }
11978 };
11979 dout(10) << "scanning pg log first" << dendl;
11980 pg_log.get_log().scan_log_after(bi->version, func);
11981 dout(10) << "scanning projected log" << dendl;
11982 projected_log.scan_log_after(bi->version, func);
11983 bi->version = projected_last_update;
11984 } else {
11985 assert(0 == "scan_range should have raised bi->version past log_tail");
11986 }
11987 }
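
// The replay lambda in update_range() has a simple contract: entries
// outside [begin, end) are ignored, updates overwrite the recorded
// version, and deletes drop the object from the interval.  A stand-alone
// sketch with simplified types; MiniEntry and apply_entry are
// hypothetical.
#include <map>
#include <string>

struct MiniEntry {
  std::string oid;
  int version;
  bool is_delete;
};

static void apply_entry(std::map<std::string, int> &objects,
                        const std::string &begin, const std::string &end,
                        const MiniEntry &e)
{
  if (e.oid < begin || e.oid >= end)
    return;                       // outside the scanned interval
  if (e.is_delete)
    objects.erase(e.oid);         // object removed since the scan
  else
    objects[e.oid] = e.version;   // record the latest version
}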
11988
11989 void PrimaryLogPG::scan_range(
11990 int min, int max, BackfillInterval *bi,
11991 ThreadPool::TPHandle &handle)
11992 {
11993 assert(is_locked());
11994 dout(10) << "scan_range from " << bi->begin << dendl;
11995 bi->clear_objects();
11996
11997 vector<hobject_t> ls;
11998 ls.reserve(max);
11999 int r = pgbackend->objects_list_partial(bi->begin, min, max, &ls, &bi->end);
12000 assert(r >= 0);
12001 dout(10) << " got " << ls.size() << " items, next " << bi->end << dendl;
12002 dout(20) << ls << dendl;
12003
12004 for (vector<hobject_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
12005 handle.reset_tp_timeout();
12006 ObjectContextRef obc;
12007 if (is_primary())
12008 obc = object_contexts.lookup(*p);
12009 if (obc) {
12010 bi->objects[*p] = obc->obs.oi.version;
12011 dout(20) << " " << *p << " " << obc->obs.oi.version << dendl;
12012 } else {
12013 bufferlist bl;
12014 int r = pgbackend->objects_get_attr(*p, OI_ATTR, &bl);
12015
12016 /* If the object does not exist here, it must have been removed
12017 * between the objects_list_partial call and here. This can happen
12018 * for the first item in the range, which is usually last_backfill.
12019 */
12020 if (r == -ENOENT)
12021 continue;
12022
12023 assert(r >= 0);
12024 object_info_t oi(bl);
12025 bi->objects[*p] = oi.version;
12026 dout(20) << " " << *p << " " << oi.version << dendl;
12027 }
12028 }
12029 }
12030
12031
12032 /** check_local
12033 *
12034 * verifies that stray objects have been deleted
12035 */
12036 void PrimaryLogPG::check_local()
12037 {
12038 dout(10) << __func__ << dendl;
12039
12040 assert(info.last_update >= pg_log.get_tail()); // otherwise we need some help!
12041
12042 if (!cct->_conf->osd_debug_verify_stray_on_activate)
12043 return;
12044
12045 // just scan the log.
12046 set<hobject_t> did;
12047 for (list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12048 p != pg_log.get_log().log.rend();
12049 ++p) {
12050 if (did.count(p->soid))
12051 continue;
12052 did.insert(p->soid);
12053
12054 if (p->is_delete()) {
12055 dout(10) << " checking " << p->soid
12056 << " at " << p->version << dendl;
12057 struct stat st;
12058 int r = osd->store->stat(
12059 ch,
12060 ghobject_t(p->soid, ghobject_t::NO_GEN, pg_whoami.shard),
12061 &st);
12062 if (r != -ENOENT) {
12063 derr << __func__ << " " << p->soid << " exists, but should have been "
12064 << "deleted" << dendl;
12065 assert(0 == "erroneously present object");
12066 }
12067 } else {
12068 // ignore old(+missing) objects
12069 }
12070 }
12071 }
12072
12073
12074
12075 // ===========================
12076 // hit sets
12077
12078 hobject_t PrimaryLogPG::get_hit_set_current_object(utime_t stamp)
12079 {
12080 ostringstream ss;
12081 ss << "hit_set_" << info.pgid.pgid << "_current_" << stamp;
12082 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12083 info.pgid.ps(), info.pgid.pool(),
12084 cct->_conf->osd_hit_set_namespace);
12085 dout(20) << __func__ << " " << hoid << dendl;
12086 return hoid;
12087 }
12088
12089 hobject_t PrimaryLogPG::get_hit_set_archive_object(utime_t start,
12090 utime_t end,
12091 bool using_gmt)
12092 {
12093 ostringstream ss;
12094 ss << "hit_set_" << info.pgid.pgid << "_archive_";
12095 if (using_gmt) {
12096 start.gmtime(ss) << "_";
12097 end.gmtime(ss);
12098 } else {
12099 start.localtime(ss) << "_";
12100 end.localtime(ss);
12101 }
12102 hobject_t hoid(sobject_t(ss.str(), CEPH_NOSNAP), "",
12103 info.pgid.ps(), info.pgid.pool(),
12104 cct->_conf->osd_hit_set_namespace);
12105 dout(20) << __func__ << " " << hoid << dendl;
12106 return hoid;
12107 }
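
// Archive hit sets are addressed purely by name:
// "hit_set_<pgid>_archive_<start>_<end>", with the two stamps rendered in
// GMT or local time depending on the pool's use_gmt_hitset setting.  A
// simplified name builder using std::put_time in place of utime_t's
// formatting; the exact timestamp format below is illustrative only.
#include <ctime>
#include <iomanip>
#include <sstream>
#include <string>

static std::string hit_set_archive_name(const std::string &pgid,
                                        std::time_t start, std::time_t end,
                                        bool using_gmt)
{
  std::ostringstream ss;
  auto stamp = [&ss, using_gmt](std::time_t t) {
    std::tm tm = using_gmt ? *std::gmtime(&t) : *std::localtime(&t);
    ss << std::put_time(&tm, "%Y-%m-%d_%H:%M:%S");  // illustrative format
  };
  ss << "hit_set_" << pgid << "_archive_";
  stamp(start);
  ss << "_";
  stamp(end);
  return ss.str();
}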
12108
12109 void PrimaryLogPG::hit_set_clear()
12110 {
12111 dout(20) << __func__ << dendl;
12112 hit_set.reset();
12113 hit_set_start_stamp = utime_t();
12114 }
12115
12116 void PrimaryLogPG::hit_set_setup()
12117 {
12118 if (!is_active() ||
12119 !is_primary()) {
12120 hit_set_clear();
12121 return;
12122 }
12123
12124 if (is_active() && is_primary() &&
12125 (!pool.info.hit_set_count ||
12126 !pool.info.hit_set_period ||
12127 pool.info.hit_set_params.get_type() == HitSet::TYPE_NONE)) {
12128 hit_set_clear();
12129
12130 // only the primary is allowed to remove all the hit set objects
12131 hit_set_remove_all();
12132 return;
12133 }
12134
12135 // FIXME: discard any previous data for now
12136 hit_set_create();
12137
12138 // include any writes we know about from the pg log. this doesn't
12139 // capture reads, but it is better than nothing!
12140 hit_set_apply_log();
12141 }
12142
12143 void PrimaryLogPG::hit_set_remove_all()
12144 {
12145 // If any archives are degraded we skip this
12146 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12147 p != info.hit_set.history.end();
12148 ++p) {
12149 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12150
12151 // Once we hit a degraded object just skip
12152 if (is_degraded_or_backfilling_object(aoid))
12153 return;
12154 if (scrubber.write_blocked_by_scrub(aoid))
12155 return;
12156 }
12157
12158 if (!info.hit_set.history.empty()) {
12159 list<pg_hit_set_info_t>::reverse_iterator p = info.hit_set.history.rbegin();
12160 assert(p != info.hit_set.history.rend());
12161 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12162 assert(!is_degraded_or_backfilling_object(oid));
12163 ObjectContextRef obc = get_object_context(oid, false);
12164 assert(obc);
12165
12166 OpContextUPtr ctx = simple_opc_create(obc);
12167 ctx->at_version = get_next_version();
12168 ctx->updated_hset_history = info.hit_set;
12169 utime_t now = ceph_clock_now();
12170 ctx->mtime = now;
12171 hit_set_trim(ctx, 0);
12172 simple_opc_submit(std::move(ctx));
12173 }
12174
12175 info.hit_set = pg_hit_set_history_t();
12176 if (agent_state) {
12177 agent_state->discard_hit_sets();
12178 }
12179 }
12180
12181 void PrimaryLogPG::hit_set_create()
12182 {
12183 utime_t now = ceph_clock_now();
12184 // make a copy of the params to modify
12185 HitSet::Params params(pool.info.hit_set_params);
12186
12187 dout(20) << __func__ << " " << params << dendl;
12188 if (pool.info.hit_set_params.get_type() == HitSet::TYPE_BLOOM) {
12189 BloomHitSet::Params *p =
12190 static_cast<BloomHitSet::Params*>(params.impl.get());
12191
12192 // convert false positive rate so it holds up across the full period
12193 p->set_fpp(p->get_fpp() / pool.info.hit_set_count);
12194 if (p->get_fpp() <= 0.0)
12195 p->set_fpp(.01); // fpp cannot be zero!
12196
12197 // if we don't have a specified size, estimate the target size
12198 // based on the previous bin!
12199 if (p->target_size == 0 && hit_set) {
12200 utime_t dur = now - hit_set_start_stamp;
12201 unsigned unique = hit_set->approx_unique_insert_count();
12202 dout(20) << __func__ << " previous set had approx " << unique
12203 << " unique items over " << dur << " seconds" << dendl;
12204 p->target_size = (double)unique * (double)pool.info.hit_set_period
12205 / (double)dur;
12206 }
12207 if (p->target_size <
12208 static_cast<uint64_t>(cct->_conf->osd_hit_set_min_size))
12209 p->target_size = cct->_conf->osd_hit_set_min_size;
12210
12211 if (p->target_size
12212 > static_cast<uint64_t>(cct->_conf->osd_hit_set_max_size))
12213 p->target_size = cct->_conf->osd_hit_set_max_size;
12214
12215 p->seed = now.sec();
12216
12217 dout(10) << __func__ << " target_size " << p->target_size
12218 << " fpp " << p->get_fpp() << dendl;
12219 }
12220 hit_set.reset(new HitSet(params));
12221 hit_set_start_stamp = now;
12222 }
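
// Two tuning rules in hit_set_create() deserve emphasis: the per-bin
// false positive rate is divided by hit_set_count, so the union of all
// bins stays near the configured rate, and the target size is
// extrapolated from the previous bin's observed insert rate over the
// configured period.  The arithmetic in isolation; both helpers are
// hypothetical.
#include <algorithm>
#include <cstdint>

static double per_bin_fpp(double pool_fpp, unsigned hit_set_count)
{
  double fpp = pool_fpp / hit_set_count;
  return fpp > 0.0 ? fpp : 0.01;  // fpp cannot be zero!
}

static uint64_t estimate_target_size(uint64_t unique_inserts,
                                     double prev_duration_secs,
                                     double period_secs,
                                     uint64_t min_size, uint64_t max_size)
{
  // scale the previous bin's unique-insert count to a full period
  uint64_t target = static_cast<uint64_t>(
    static_cast<double>(unique_inserts) * period_secs / prev_duration_secs);
  return std::max(min_size, std::min(max_size, target));
}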
12223
12224 /**
12225 * apply log entries to set
12226 *
12227 * this would only happen after peering, to at least capture writes
12228 * during an interval that was potentially lost.
12229 */
12230 bool PrimaryLogPG::hit_set_apply_log()
12231 {
12232 if (!hit_set)
12233 return false;
12234
12235 eversion_t to = info.last_update;
12236 eversion_t from = info.hit_set.current_last_update;
12237 if (to <= from) {
12238 dout(20) << __func__ << " no update" << dendl;
12239 return false;
12240 }
12241
12242 dout(20) << __func__ << " " << to << " .. " << info.last_update << dendl;
12243 list<pg_log_entry_t>::const_reverse_iterator p = pg_log.get_log().log.rbegin();
12244 while (p != pg_log.get_log().log.rend() && p->version > to)
12245 ++p;
12246 while (p != pg_log.get_log().log.rend() && p->version > from) {
12247 hit_set->insert(p->soid);
12248 ++p;
12249 }
12250
12251 return true;
12252 }
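
// The replay above scans the log newest to oldest: first skip entries
// newer than `to`, then insert every object touched in (from, to].  The
// same window scan in stand-alone form, with (version, oid) pairs
// standing in for pg_log_entry_t; the helper is hypothetical.
#include <list>
#include <set>
#include <string>
#include <utility>

static void replay_window(const std::list<std::pair<int, std::string>> &log,
                          int from, int to,
                          std::set<std::string> &hit_set)
{
  auto p = log.rbegin();
  while (p != log.rend() && p->first > to)
    ++p;                        // skip entries newer than the window
  while (p != log.rend() && p->first > from) {
    hit_set.insert(p->second);  // record the object as recently written
    ++p;
  }
}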
12253
12254 void PrimaryLogPG::hit_set_persist()
12255 {
12256 dout(10) << __func__ << dendl;
12257 bufferlist bl;
12258 unsigned max = pool.info.hit_set_count;
12259
12260 utime_t now = ceph_clock_now();
12261 hobject_t oid;
12262
12263 // If any archives are degraded we skip this persist request
12264 // account for the additional entry being added below
12265 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12266 p != info.hit_set.history.end();
12267 ++p) {
12268 hobject_t aoid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12269
12270 // Once we hit a degraded object just skip further trim
12271 if (is_degraded_or_backfilling_object(aoid))
12272 return;
12273 if (scrubber.write_blocked_by_scrub(aoid))
12274 return;
12275 }
12276
12277 // If backfill is in progress and we could possibly overlap with the
12278 // hit_set_* objects, back off. Since these all have
12279 // hobject_t::hash set to pgid.ps(), and those sort first, we can
12280 // look just at that. This is necessary because our transactions
12281 // may include a modify of the new hit_set *and* a delete of the
12282 // old one, and this may span the backfill boundary.
12283 for (set<pg_shard_t>::iterator p = backfill_targets.begin();
12284 p != backfill_targets.end();
12285 ++p) {
12286 assert(peer_info.count(*p));
12287 const pg_info_t& pi = peer_info[*p];
12288 if (pi.last_backfill == hobject_t() ||
12289 pi.last_backfill.get_hash() == info.pgid.ps()) {
12290 dout(10) << __func__ << " backfill target osd." << *p
12291 << " last_backfill has not progressed past pgid ps"
12292 << dendl;
12293 return;
12294 }
12295 }
12296
12297
12298 pg_hit_set_info_t new_hset = pg_hit_set_info_t(pool.info.use_gmt_hitset);
12299 new_hset.begin = hit_set_start_stamp;
12300 new_hset.end = now;
12301 oid = get_hit_set_archive_object(
12302 new_hset.begin,
12303 new_hset.end,
12304 new_hset.using_gmt);
12305
12306 // If writes to the new archive object are blocked by scrub we skip this persist request
12307 if (scrubber.write_blocked_by_scrub(oid))
12308 return;
12309
12310 hit_set->seal();
12311 ::encode(*hit_set, bl);
12312 dout(20) << __func__ << " archive " << oid << dendl;
12313
12314 if (agent_state) {
12315 agent_state->add_hit_set(new_hset.begin, hit_set);
12316 uint32_t size = agent_state->hit_set_map.size();
12317 if (size >= pool.info.hit_set_count) {
12318 size = pool.info.hit_set_count > 0 ? pool.info.hit_set_count - 1: 0;
12319 }
12320 hit_set_in_memory_trim(size);
12321 }
12322
12323 ObjectContextRef obc = get_object_context(oid, true);
12324 OpContextUPtr ctx = simple_opc_create(obc);
12325
12326 ctx->at_version = get_next_version();
12327 ctx->updated_hset_history = info.hit_set;
12328 pg_hit_set_history_t &updated_hit_set_hist = *(ctx->updated_hset_history);
12329
12330 updated_hit_set_hist.current_last_update = info.last_update;
12331 new_hset.version = ctx->at_version;
12332
12333 updated_hit_set_hist.history.push_back(new_hset);
12334 hit_set_create();
12335
12336 // fabricate an object_info_t and SnapSet
12337 obc->obs.oi.version = ctx->at_version;
12338 obc->obs.oi.mtime = now;
12339 obc->obs.oi.size = bl.length();
12340 obc->obs.exists = true;
12341 obc->obs.oi.set_data_digest(bl.crc32c(-1));
12342
12343 ctx->new_obs = obc->obs;
12344
12345 obc->ssc->snapset.head_exists = true;
12346 ctx->new_snapset = obc->ssc->snapset;
12347
12348 ctx->delta_stats.num_objects++;
12349 ctx->delta_stats.num_objects_hit_set_archive++;
12350 ctx->delta_stats.num_bytes += bl.length();
12351 ctx->delta_stats.num_bytes_hit_set_archive += bl.length();
12352
12353 bufferlist bss;
12354 ::encode(ctx->new_snapset, bss);
12355 bufferlist boi(sizeof(ctx->new_obs.oi));
12356 ::encode(ctx->new_obs.oi, boi,
12357 get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
12358
12359 ctx->op_t->create(oid);
12360 if (bl.length()) {
12361 ctx->op_t->write(oid, 0, bl.length(), bl, 0);
12362 }
12363 map <string, bufferlist> attrs;
12364 attrs[OI_ATTR].claim(boi);
12365 attrs[SS_ATTR].claim(bss);
12366 setattrs_maybe_cache(ctx->obc, ctx.get(), ctx->op_t.get(), attrs);
12367 ctx->log.push_back(
12368 pg_log_entry_t(
12369 pg_log_entry_t::MODIFY,
12370 oid,
12371 ctx->at_version,
12372 eversion_t(),
12373 0,
12374 osd_reqid_t(),
12375 ctx->mtime,
12376 0)
12377 );
12378
12379 hit_set_trim(ctx, max);
12380
12381 simple_opc_submit(std::move(ctx));
12382 }
12383
12384 void PrimaryLogPG::hit_set_trim(OpContextUPtr &ctx, unsigned max)
12385 {
12386 assert(ctx->updated_hset_history);
12387 pg_hit_set_history_t &updated_hit_set_hist =
12388 *(ctx->updated_hset_history);
12389 for (unsigned num = updated_hit_set_hist.history.size(); num > max; --num) {
12390 list<pg_hit_set_info_t>::iterator p = updated_hit_set_hist.history.begin();
12391 assert(p != updated_hit_set_hist.history.end());
12392 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12393
12394 assert(!is_degraded_or_backfilling_object(oid));
12395
12396 dout(20) << __func__ << " removing " << oid << dendl;
12397 ++ctx->at_version.version;
12398 ctx->log.push_back(
12399 pg_log_entry_t(pg_log_entry_t::DELETE,
12400 oid,
12401 ctx->at_version,
12402 p->version,
12403 0,
12404 osd_reqid_t(),
12405 ctx->mtime,
12406 0));
12407
12408 ctx->op_t->remove(oid);
12409 updated_hit_set_hist.history.pop_front();
12410
12411 ObjectContextRef obc = get_object_context(oid, false);
12412 assert(obc);
12413 --ctx->delta_stats.num_objects;
12414 --ctx->delta_stats.num_objects_hit_set_archive;
12415 ctx->delta_stats.num_bytes -= obc->obs.oi.size;
12416 ctx->delta_stats.num_bytes_hit_set_archive -= obc->obs.oi.size;
12417 }
12418 }
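
// hit_set_trim() is a bounded pop-front: while more than `max` archives
// remain in the history, the oldest entry is logged as a delete, removed
// in the transaction, and its stats subtracted.  The control shape in
// isolation, with (name, size) pairs standing in for pg_hit_set_info_t;
// the helper is hypothetical.
#include <cstdint>
#include <list>
#include <string>
#include <utility>

static void trim_history(std::list<std::pair<std::string, int64_t>> &history,
                         unsigned max_entries,
                         int64_t &num_objects, int64_t &num_bytes)
{
  for (auto num = history.size(); num > max_entries; --num) {
    // a real implementation also appends a DELETE log entry and removes
    // the on-disk archive object here
    num_bytes -= history.front().second;
    --num_objects;
    history.pop_front();
  }
}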
12419
12420 void PrimaryLogPG::hit_set_in_memory_trim(uint32_t max_in_memory)
12421 {
12422 while (agent_state->hit_set_map.size() > max_in_memory) {
12423 agent_state->remove_oldest_hit_set();
12424 }
12425 }
12426
12427
12428 // =======================================
12429 // cache agent
12430
12431 void PrimaryLogPG::agent_setup()
12432 {
12433 assert(is_locked());
12434 if (!is_active() ||
12435 !is_primary() ||
12436 pool.info.cache_mode == pg_pool_t::CACHEMODE_NONE ||
12437 pool.info.tier_of < 0 ||
12438 !get_osdmap()->have_pg_pool(pool.info.tier_of)) {
12439 agent_clear();
12440 return;
12441 }
12442 if (!agent_state) {
12443 agent_state.reset(new TierAgentState);
12444
12445 // choose random starting position
12446 agent_state->position = hobject_t();
12447 agent_state->position.pool = info.pgid.pool();
12448 agent_state->position.set_hash(pool.info.get_random_pg_position(
12449 info.pgid.pgid,
12450 rand()));
12451 agent_state->start = agent_state->position;
12452
12453 dout(10) << __func__ << " allocated new state, position "
12454 << agent_state->position << dendl;
12455 } else {
12456 dout(10) << __func__ << " keeping existing state" << dendl;
12457 }
12458
12459 if (info.stats.stats_invalid) {
12460 osd->clog->warn() << "pg " << info.pgid << " has invalid (post-split) stats; must scrub before tier agent can activate";
12461 }
12462
12463 agent_choose_mode();
12464 }
12465
12466 void PrimaryLogPG::agent_clear()
12467 {
12468 agent_stop();
12469 agent_state.reset(NULL);
12470 }
12471
12472 // Return false if no objects have been operated on since the start of the object hash space
12473 bool PrimaryLogPG::agent_work(int start_max, int agent_flush_quota)
12474 {
12475 lock();
12476 if (!agent_state) {
12477 dout(10) << __func__ << " no agent state, stopping" << dendl;
12478 unlock();
12479 return true;
12480 }
12481
12482 assert(!deleting);
12483
12484 if (agent_state->is_idle()) {
12485 dout(10) << __func__ << " idle, stopping" << dendl;
12486 unlock();
12487 return true;
12488 }
12489
12490 osd->logger->inc(l_osd_agent_wake);
12491
12492 dout(10) << __func__
12493 << " max " << start_max
12494 << ", flush " << agent_state->get_flush_mode_name()
12495 << ", evict " << agent_state->get_evict_mode_name()
12496 << ", pos " << agent_state->position
12497 << dendl;
12498 assert(is_primary());
12499 assert(is_active());
12500
12501 agent_load_hit_sets();
12502
12503 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12504 assert(base_pool);
12505
12506 int ls_min = 1;
12507 int ls_max = cct->_conf->osd_pool_default_cache_max_evict_check_size;
12508
12509 // list some objects. this conveniently lists clones (oldest to
12510 // newest) before heads... the same order we want to flush in.
12511 //
12512 // NOTE: do not flush the Sequencer. we will assume that the
12513 // listing we get back is imprecise.
12514 vector<hobject_t> ls;
12515 hobject_t next;
12516 int r = pgbackend->objects_list_partial(agent_state->position, ls_min, ls_max,
12517 &ls, &next);
12518 assert(r >= 0);
12519 dout(20) << __func__ << " got " << ls.size() << " objects" << dendl;
12520 int started = 0;
12521 for (vector<hobject_t>::iterator p = ls.begin();
12522 p != ls.end();
12523 ++p) {
12524 if (p->nspace == cct->_conf->osd_hit_set_namespace) {
12525 dout(20) << __func__ << " skip (hit set) " << *p << dendl;
12526 osd->logger->inc(l_osd_agent_skip);
12527 continue;
12528 }
12529 if (is_degraded_or_backfilling_object(*p)) {
12530 dout(20) << __func__ << " skip (degraded) " << *p << dendl;
12531 osd->logger->inc(l_osd_agent_skip);
12532 continue;
12533 }
12534 if (is_missing_object(p->get_head())) {
12535 dout(20) << __func__ << " skip (missing head) " << *p << dendl;
12536 osd->logger->inc(l_osd_agent_skip);
12537 continue;
12538 }
12539 ObjectContextRef obc = get_object_context(*p, false, NULL);
12540 if (!obc) {
12541 // we didn't flush; we may miss something here.
12542 dout(20) << __func__ << " skip (no obc) " << *p << dendl;
12543 osd->logger->inc(l_osd_agent_skip);
12544 continue;
12545 }
12546 if (!obc->obs.exists) {
12547 dout(20) << __func__ << " skip (dne) " << obc->obs.oi.soid << dendl;
12548 osd->logger->inc(l_osd_agent_skip);
12549 continue;
12550 }
12551 if (scrubber.write_blocked_by_scrub(obc->obs.oi.soid)) {
12552 dout(20) << __func__ << " skip (scrubbing) " << obc->obs.oi << dendl;
12553 osd->logger->inc(l_osd_agent_skip);
12554 continue;
12555 }
12556 if (obc->is_blocked()) {
12557 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12558 osd->logger->inc(l_osd_agent_skip);
12559 continue;
12560 }
12561 if (obc->is_request_pending()) {
12562 dout(20) << __func__ << " skip (request pending) " << obc->obs.oi << dendl;
12563 osd->logger->inc(l_osd_agent_skip);
12564 continue;
12565 }
12566
12567 // be careful flushing omap to an EC pool.
12568 if (!base_pool->supports_omap() &&
12569 obc->obs.oi.is_omap()) {
12570 dout(20) << __func__ << " skip (omap to EC) " << obc->obs.oi << dendl;
12571 osd->logger->inc(l_osd_agent_skip);
12572 continue;
12573 }
12574
12575 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_IDLE &&
12576 agent_maybe_evict(obc, false))
12577 ++started;
12578 else if (agent_state->flush_mode != TierAgentState::FLUSH_MODE_IDLE &&
12579 agent_flush_quota > 0 && agent_maybe_flush(obc)) {
12580 ++started;
12581 --agent_flush_quota;
12582 }
12583 if (started >= start_max) {
12584 // If finishing early, set "next" to the next object
12585 if (++p != ls.end())
12586 next = *p;
12587 break;
12588 }
12589 }
12590
12591 if (++agent_state->hist_age > cct->_conf->osd_agent_hist_halflife) {
12592 dout(20) << __func__ << " resetting atime and temp histograms" << dendl;
12593 agent_state->hist_age = 0;
12594 agent_state->temp_hist.decay();
12595 }
12596
12597 // Total objects operated on so far
12598 int total_started = agent_state->started + started;
12599 bool need_delay = false;
12600
12601 dout(20) << __func__ << " start pos " << agent_state->position
12602 << " next start pos " << next
12603 << " started " << total_started << dendl;
12604
12605 // See if we've made a full pass over the object hash space
12606 // This might check at most ls_max objects a second time to notice that
12607 // we've checked every object at least once.
12608 if (agent_state->position < agent_state->start &&
12609 next >= agent_state->start) {
12610 dout(20) << __func__ << " wrap around " << agent_state->start << dendl;
12611 if (total_started == 0)
12612 need_delay = true;
12613 else
12614 total_started = 0;
12615 agent_state->start = next;
12616 }
12617 agent_state->started = total_started;
12618
12619 // See if we are starting from the beginning
12620 if (next.is_max())
12621 agent_state->position = hobject_t();
12622 else
12623 agent_state->position = next;
12624
12625 // Discard old in memory HitSets
12626 hit_set_in_memory_trim(pool.info.hit_set_count);
12627
12628 if (need_delay) {
12629 assert(agent_state->delaying == false);
12630 agent_delay();
12631 unlock();
12632 return false;
12633 }
12634 agent_choose_mode();
12635 unlock();
12636 return true;
12637 }
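// A worked example of the wrap-around test above (illustrative hash
// values, not from a real cluster): suppose the agent began its pass at
// start = 0x8000:0000.  Listing climbs through the hash space, wraps
// past the end back to hobject_t(), and eventually one call covers
// [position, next) with position = 0x7f00:0000 and next = 0x8100:0000.
// Then
//
//   agent_state->position < agent_state->start   // true
//   next >= agent_state->start                   // true
//
// so the pass has crossed its starting point: every object has been
// visited at least once.  If nothing was started over the whole pass
// (total_started == 0), the agent delays instead of spinning on an
// already-clean PG.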
12638
12639 void PrimaryLogPG::agent_load_hit_sets()
12640 {
12641 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE) {
12642 return;
12643 }
12644
12645 if (agent_state->hit_set_map.size() < info.hit_set.history.size()) {
12646 dout(10) << __func__ << dendl;
12647 for (list<pg_hit_set_info_t>::iterator p = info.hit_set.history.begin();
12648 p != info.hit_set.history.end(); ++p) {
12649 if (agent_state->hit_set_map.count(p->begin.sec()) == 0) {
12650 dout(10) << __func__ << " loading " << p->begin << "-"
12651 << p->end << dendl;
12652 if (!pool.info.is_replicated()) {
12653 // FIXME: EC not supported here yet
12654 derr << __func__ << " on non-replicated pool" << dendl;
12655 break;
12656 }
12657
12658 hobject_t oid = get_hit_set_archive_object(p->begin, p->end, p->using_gmt);
12659 if (is_unreadable_object(oid)) {
12660 dout(10) << __func__ << " unreadable " << oid << ", waiting" << dendl;
12661 break;
12662 }
12663
12664 ObjectContextRef obc = get_object_context(oid, false);
12665 if (!obc) {
12666 derr << __func__ << ": could not load hitset " << oid << dendl;
12667 break;
12668 }
12669
12670 bufferlist bl;
12671 {
12672 obc->ondisk_read_lock();
12673 int r = osd->store->read(ch, ghobject_t(oid), 0, 0, bl);
12674 assert(r >= 0);
12675 obc->ondisk_read_unlock();
12676 }
12677 HitSetRef hs(new HitSet);
12678 bufferlist::iterator pbl = bl.begin();
12679 ::decode(*hs, pbl);
12680 agent_state->add_hit_set(p->begin.sec(), hs);
12681 }
12682 }
12683 }
12684 }
12685
12686 bool PrimaryLogPG::agent_maybe_flush(ObjectContextRef& obc)
12687 {
12688 if (!obc->obs.oi.is_dirty()) {
12689 dout(20) << __func__ << " skip (clean) " << obc->obs.oi << dendl;
12690 osd->logger->inc(l_osd_agent_skip);
12691 return false;
12692 }
12693 if (obc->obs.oi.is_cache_pinned()) {
12694 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
12695 osd->logger->inc(l_osd_agent_skip);
12696 return false;
12697 }
12698
12699 utime_t now = ceph_clock_now();
12700 utime_t ob_local_mtime;
12701 if (obc->obs.oi.local_mtime != utime_t()) {
12702 ob_local_mtime = obc->obs.oi.local_mtime;
12703 } else {
12704 ob_local_mtime = obc->obs.oi.mtime;
12705 }
12706 bool evict_mode_full =
12707 (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL);
12708 if (!evict_mode_full &&
12709 obc->obs.oi.soid.snap == CEPH_NOSNAP && // snaps immutable; don't delay
12710 (ob_local_mtime + utime_t(pool.info.cache_min_flush_age, 0) > now)) {
12711 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
12712 osd->logger->inc(l_osd_agent_skip);
12713 return false;
12714 }
12715
12716 if (osd->agent_is_active_oid(obc->obs.oi.soid)) {
12717 dout(20) << __func__ << " skip (flushing) " << obc->obs.oi << dendl;
12718 osd->logger->inc(l_osd_agent_skip);
12719 return false;
12720 }
12721
12722 dout(10) << __func__ << " flushing " << obc->obs.oi << dendl;
12723
12724 // FIXME: flush anything dirty, regardless of what distribution of
12725 // ages we expect.
12726
12727 hobject_t oid = obc->obs.oi.soid;
12728 osd->agent_start_op(oid);
12729 // no need to capture a pg ref, can't outlive fop or ctx
12730 std::function<void()> on_flush = [this, oid]() {
12731 osd->agent_finish_op(oid);
12732 };
12733
12734 int result = start_flush(
12735 OpRequestRef(), obc, false, NULL,
12736 on_flush);
12737 if (result != -EINPROGRESS) {
12738 on_flush();
12739 dout(10) << __func__ << " start_flush() failed " << obc->obs.oi
12740 << " with " << result << dendl;
12741 osd->logger->inc(l_osd_agent_skip);
12742 return false;
12743 }
12744
12745 osd->logger->inc(l_osd_agent_flush);
12746 return true;
12747 }
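// Worked example of the "too young" test above (illustrative values):
// with cache_min_flush_age = 600 and an object whose local_mtime is
// 09:55:00, a pass at now = 10:00:00 sees
//
//   ob_local_mtime + utime_t(600, 0) = 10:05:00 > now
//
// and skips the object; a pass after 10:05:00 would flush it.  The test
// is deliberately bypassed for snaps (immutable, so there are no writes
// to coalesce) and in EVICT_MODE_FULL, where freeing space takes
// priority over write coalescing.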
12748
12749 bool PrimaryLogPG::agent_maybe_evict(ObjectContextRef& obc, bool after_flush)
12750 {
12751 const hobject_t& soid = obc->obs.oi.soid;
12752 if (!after_flush && obc->obs.oi.is_dirty()) {
12753 dout(20) << __func__ << " skip (dirty) " << obc->obs.oi << dendl;
12754 return false;
12755 }
12756 if (!obc->obs.oi.watchers.empty()) {
12757 dout(20) << __func__ << " skip (watchers) " << obc->obs.oi << dendl;
12758 return false;
12759 }
12760 if (obc->is_blocked()) {
12761 dout(20) << __func__ << " skip (blocked) " << obc->obs.oi << dendl;
12762 return false;
12763 }
12764 if (obc->obs.oi.is_cache_pinned()) {
12765 dout(20) << __func__ << " skip (cache_pinned) " << obc->obs.oi << dendl;
12766 return false;
12767 }
12768
12769 if (soid.snap == CEPH_NOSNAP) {
12770 int result = _verify_no_head_clones(soid, obc->ssc->snapset);
12771 if (result < 0) {
12772 dout(20) << __func__ << " skip (clones) " << obc->obs.oi << dendl;
12773 return false;
12774 }
12775 }
12776
12777 if (agent_state->evict_mode != TierAgentState::EVICT_MODE_FULL) {
12778 // is this object older than cache_min_evict_age?
12779 utime_t now = ceph_clock_now();
12780 utime_t ob_local_mtime;
12781 if (obc->obs.oi.local_mtime != utime_t()) {
12782 ob_local_mtime = obc->obs.oi.local_mtime;
12783 } else {
12784 ob_local_mtime = obc->obs.oi.mtime;
12785 }
12786 if (ob_local_mtime + utime_t(pool.info.cache_min_evict_age, 0) > now) {
12787 dout(20) << __func__ << " skip (too young) " << obc->obs.oi << dendl;
12788 osd->logger->inc(l_osd_agent_skip);
12789 return false;
12790 }
12791 // is this object old and/or cold enough?
12792 int temp = 0;
12793 uint64_t temp_upper = 0, temp_lower = 0;
12794 if (hit_set)
12795 agent_estimate_temp(soid, &temp);
12796 agent_state->temp_hist.add(temp);
12797 agent_state->temp_hist.get_position_micro(temp, &temp_lower, &temp_upper);
12798
12799 dout(20) << __func__
12800 << " temp " << temp
12801 << " pos " << temp_lower << "-" << temp_upper
12802 << ", evict_effort " << agent_state->evict_effort
12803 << dendl;
12804 dout(30) << "agent_state:\n";
12805 Formatter *f = Formatter::create("");
12806 f->open_object_section("agent_state");
12807 agent_state->dump(f);
12808 f->close_section();
12809 f->flush(*_dout);
12810 delete f;
12811 *_dout << dendl;
12812
12813 if (1000000 - temp_upper >= agent_state->evict_effort)
12814 return false;
12815 }
12816
12817 dout(10) << __func__ << " evicting " << obc->obs.oi << dendl;
12818 OpContextUPtr ctx = simple_opc_create(obc);
12819
12820 if (!ctx->lock_manager.get_lock_type(
12821 ObjectContext::RWState::RWWRITE,
12822 obc->obs.oi.soid,
12823 obc,
12824 OpRequestRef())) {
12825 close_op_ctx(ctx.release());
12826 dout(20) << __func__ << " skip (cannot get lock) " << obc->obs.oi << dendl;
12827 return false;
12828 }
12829
12830 osd->agent_start_evict_op();
12831 ctx->register_on_finish(
12832 [this]() {
12833 osd->agent_finish_evict_op();
12834 });
12835
12836 ctx->at_version = get_next_version();
12837 assert(ctx->new_obs.exists);
12838 int r = _delete_oid(ctx.get(), true, false);
12839 if (obc->obs.oi.is_omap())
12840 ctx->delta_stats.num_objects_omap--;
12841 ctx->delta_stats.num_evict++;
12842 ctx->delta_stats.num_evict_kb += SHIFT_ROUND_UP(obc->obs.oi.size, 10);
12843 if (obc->obs.oi.is_dirty())
12844 --ctx->delta_stats.num_objects_dirty;
12845 assert(r == 0);
12846 finish_ctx(ctx.get(), pg_log_entry_t::DELETE, false);
12847 simple_opc_submit(std::move(ctx));
12848 osd->logger->inc(l_osd_tier_evict);
12849 osd->logger->inc(l_osd_agent_evict);
12850 return true;
12851 }
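// Worked example of the effort test above (illustrative numbers): an
// object whose temperature lands at temp_upper = 850000 in the histogram
// leaves 1000000 - 850000 = 150000 micro above it, so it survives while
// evict_effort <= 150000 and becomes eligible once evict_effort exceeds
// 150000 (15%).  Raising evict_effort therefore widens the band of
// temperatures the agent will evict, all the way to EVICT_MODE_FULL,
// where any clean, unpinned, unwatched object may go.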
12852
12853 void PrimaryLogPG::agent_stop()
12854 {
12855 dout(20) << __func__ << dendl;
12856 if (agent_state && !agent_state->is_idle()) {
12857 agent_state->evict_mode = TierAgentState::EVICT_MODE_IDLE;
12858 agent_state->flush_mode = TierAgentState::FLUSH_MODE_IDLE;
12859 osd->agent_disable_pg(this, agent_state->evict_effort);
12860 }
12861 }
12862
12863 void PrimaryLogPG::agent_delay()
12864 {
12865 dout(20) << __func__ << dendl;
12866 if (agent_state && !agent_state->is_idle()) {
12867 assert(agent_state->delaying == false);
12868 agent_state->delaying = true;
12869 osd->agent_disable_pg(this, agent_state->evict_effort);
12870 }
12871 }
12872
12873 void PrimaryLogPG::agent_choose_mode_restart()
12874 {
12875 dout(20) << __func__ << dendl;
12876 lock();
12877 if (agent_state && agent_state->delaying) {
12878 agent_state->delaying = false;
12879 agent_choose_mode(true);
12880 }
12881 unlock();
12882 }
12883
12884 bool PrimaryLogPG::agent_choose_mode(bool restart, OpRequestRef op)
12885 {
12886 bool requeued = false;
12887 // Let delay play out
12888 if (agent_state->delaying) {
12889 dout(20) << __func__ << " " << this << " delaying, ignored" << dendl;
12890 return requeued;
12891 }
12892
12893 TierAgentState::flush_mode_t flush_mode = TierAgentState::FLUSH_MODE_IDLE;
12894 TierAgentState::evict_mode_t evict_mode = TierAgentState::EVICT_MODE_IDLE;
12895 unsigned evict_effort = 0;
12896
12897 if (info.stats.stats_invalid) {
12898 // idle; stats can't be trusted until we scrub.
12899 dout(20) << __func__ << " stats invalid (post-split), idle" << dendl;
12900 goto skip_calc;
12901 }
12902
12903 {
12904 uint64_t divisor = pool.info.get_pg_num_divisor(info.pgid.pgid);
12905 assert(divisor > 0);
12906
12907 // adjust (effective) user objects down based on the number
12908 // of HitSet objects, which should not count toward our total since
12909 // they cannot be flushed.
12910 uint64_t unflushable = info.stats.stats.sum.num_objects_hit_set_archive;
12911
12912 // also exclude omap objects if ec backing pool
12913 const pg_pool_t *base_pool = get_osdmap()->get_pg_pool(pool.info.tier_of);
12914 assert(base_pool);
12915 if (!base_pool->supports_omap())
12916 unflushable += info.stats.stats.sum.num_objects_omap;
12917
12918 uint64_t num_user_objects = info.stats.stats.sum.num_objects;
12919 if (num_user_objects > unflushable)
12920 num_user_objects -= unflushable;
12921 else
12922 num_user_objects = 0;
12923
12924 uint64_t num_user_bytes = info.stats.stats.sum.num_bytes;
12925 uint64_t unflushable_bytes = info.stats.stats.sum.num_bytes_hit_set_archive;
12926 num_user_bytes -= unflushable_bytes;
12927 uint64_t num_overhead_bytes = osd->store->estimate_objects_overhead(num_user_objects);
12928 num_user_bytes += num_overhead_bytes;
12929
12930 // also reduce the num_dirty by num_objects_omap
12931 int64_t num_dirty = info.stats.stats.sum.num_objects_dirty;
12932 if (!base_pool->supports_omap()) {
12933 if (num_dirty > info.stats.stats.sum.num_objects_omap)
12934 num_dirty -= info.stats.stats.sum.num_objects_omap;
12935 else
12936 num_dirty = 0;
12937 }
12938
12939 dout(10) << __func__
12940 << " flush_mode: "
12941 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
12942 << " evict_mode: "
12943 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
12944 << " num_objects: " << info.stats.stats.sum.num_objects
12945 << " num_bytes: " << info.stats.stats.sum.num_bytes
12946 << " num_objects_dirty: " << info.stats.stats.sum.num_objects_dirty
12947 << " num_objects_omap: " << info.stats.stats.sum.num_objects_omap
12948 << " num_dirty: " << num_dirty
12949 << " num_user_objects: " << num_user_objects
12950 << " num_user_bytes: " << num_user_bytes
12951 << " num_overhead_bytes: " << num_overhead_bytes
12952 << " pool.info.target_max_bytes: " << pool.info.target_max_bytes
12953 << " pool.info.target_max_objects: " << pool.info.target_max_objects
12954 << dendl;
12955
12956 // get dirty, full ratios
12957 uint64_t dirty_micro = 0;
12958 uint64_t full_micro = 0;
12959 if (pool.info.target_max_bytes && num_user_objects > 0) {
12960 uint64_t avg_size = num_user_bytes / num_user_objects;
12961 dirty_micro =
12962 num_dirty * avg_size * 1000000 /
12963 MAX(pool.info.target_max_bytes / divisor, 1);
12964 full_micro =
12965 num_user_objects * avg_size * 1000000 /
12966 MAX(pool.info.target_max_bytes / divisor, 1);
12967 }
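// Worked example with illustrative numbers (not from a real pool):
// target_max_bytes = 1 TiB over divisor = 128 PGs gives each PG an
// 8 GiB share.  If this PG holds num_user_objects = 20000 at
// avg_size = 256 KiB with num_dirty = 8000, then
//
//   dirty_micro = 8000 * 262144 * 1000000 / 8589934592  ~= 244140  (24.4%)
//   full_micro  = 20000 * 262144 * 1000000 / 8589934592 ~= 610351  (61.0%)
//
// The object-count targets below are computed the same way, and the
// larger of the byte-based and object-based ratios wins.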
12968 if (pool.info.target_max_objects > 0) {
12969 uint64_t dirty_objects_micro =
12970 num_dirty * 1000000 /
12971 MAX(pool.info.target_max_objects / divisor, 1);
12972 if (dirty_objects_micro > dirty_micro)
12973 dirty_micro = dirty_objects_micro;
12974 uint64_t full_objects_micro =
12975 num_user_objects * 1000000 /
12976 MAX(pool.info.target_max_objects / divisor, 1);
12977 if (full_objects_micro > full_micro)
12978 full_micro = full_objects_micro;
12979 }
12980 dout(20) << __func__ << " dirty " << ((float)dirty_micro / 1000000.0)
12981 << " full " << ((float)full_micro / 1000000.0)
12982 << dendl;
12983
12984 // flush mode
12985 uint64_t flush_target = pool.info.cache_target_dirty_ratio_micro;
12986 uint64_t flush_high_target = pool.info.cache_target_dirty_high_ratio_micro;
12987 uint64_t flush_slop = (float)flush_target * cct->_conf->osd_agent_slop;
12988 if (restart || agent_state->flush_mode == TierAgentState::FLUSH_MODE_IDLE) {
12989 flush_target += flush_slop;
12990 flush_high_target += flush_slop;
12991 } else {
12992 flush_target -= MIN(flush_target, flush_slop);
12993 flush_high_target -= MIN(flush_high_target, flush_slop);
12994 }
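// The +/- slop above is hysteresis.  Illustrative numbers: with
// flush_target = 400000 (40%) and osd_agent_slop = 0.02,
// flush_slop = 8000.  An idle agent only starts flushing once
// dirty_micro exceeds 408000, and an active one keeps flushing until it
// falls to 392000, so small oscillations around 40% do not flap the
// flush mode on and off.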
12995
12996 if (dirty_micro > flush_high_target) {
12997 flush_mode = TierAgentState::FLUSH_MODE_HIGH;
12998 } else if (dirty_micro > flush_target) {
12999 flush_mode = TierAgentState::FLUSH_MODE_LOW;
13000 }
13001
13002 // evict mode
13003 uint64_t evict_target = pool.info.cache_target_full_ratio_micro;
13004 uint64_t evict_slop = (float)evict_target * cct->_conf->osd_agent_slop;
13005 if (restart || agent_state->evict_mode == TierAgentState::EVICT_MODE_IDLE)
13006 evict_target += evict_slop;
13007 else
13008 evict_target -= MIN(evict_target, evict_slop);
13009
13010 if (full_micro > 1000000) {
13011 // evict anything clean
13012 evict_mode = TierAgentState::EVICT_MODE_FULL;
13013 evict_effort = 1000000;
13014 } else if (full_micro > evict_target) {
13015 // set effort in [0..1] range based on where we are between the evict target and 100% full
13016 evict_mode = TierAgentState::EVICT_MODE_SOME;
13017 uint64_t over = full_micro - evict_target;
13018 uint64_t span = 1000000 - evict_target;
13019 evict_effort = MAX(over * 1000000 / span,
13020 (unsigned)(1000000.0 * cct->_conf->osd_agent_min_evict_effort));
13021
13022 // quantize effort to avoid too much reordering in the agent_queue.
13023 uint64_t inc = cct->_conf->osd_agent_quantize_effort * 1000000;
13024 assert(inc > 0);
13025 uint64_t was = evict_effort;
13026 evict_effort -= evict_effort % inc;
13027 if (evict_effort < inc)
13028 evict_effort = inc;
13029 assert(evict_effort >= inc && evict_effort <= 1000000);
13030 dout(30) << __func__ << " evict_effort " << was << " quantized by " << inc << " to " << evict_effort << dendl;
13031 }
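// Quantization example (illustrative config): with
// osd_agent_quantize_effort = 0.1, inc = 100000, so a raw effort of
// 437000 rounds down to 400000 and anything under 100000 clamps up to
// 100000.  Coarse steps keep small load changes from reshuffling this
// PG's position in the agent_queue every pass.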
13032 }
13033
13034 skip_calc:
13035 bool old_idle = agent_state->is_idle();
13036 if (flush_mode != agent_state->flush_mode) {
13037 dout(5) << __func__ << " flush_mode "
13038 << TierAgentState::get_flush_mode_name(agent_state->flush_mode)
13039 << " -> "
13040 << TierAgentState::get_flush_mode_name(flush_mode)
13041 << dendl;
13042 if (flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13043 osd->agent_inc_high_count();
13044 info.stats.stats.sum.num_flush_mode_high = 1;
13045 } else if (flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13046 info.stats.stats.sum.num_flush_mode_low = 1;
13047 }
13048 if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_HIGH) {
13049 osd->agent_dec_high_count();
13050 info.stats.stats.sum.num_flush_mode_high = 0;
13051 } else if (agent_state->flush_mode == TierAgentState::FLUSH_MODE_LOW) {
13052 info.stats.stats.sum.num_flush_mode_low = 0;
13053 }
13054 agent_state->flush_mode = flush_mode;
13055 }
13056 if (evict_mode != agent_state->evict_mode) {
13057 dout(5) << __func__ << " evict_mode "
13058 << TierAgentState::get_evict_mode_name(agent_state->evict_mode)
13059 << " -> "
13060 << TierAgentState::get_evict_mode_name(evict_mode)
13061 << dendl;
13062 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL &&
13063 is_active()) {
13064 if (op)
13065 requeue_op(op);
13066 requeue_ops(waiting_for_active);
13067 requeue_ops(waiting_for_scrub);
13068 requeue_ops(waiting_for_cache_not_full);
13069 objects_blocked_on_cache_full.clear();
13070 requeued = true;
13071 }
13072 if (evict_mode == TierAgentState::EVICT_MODE_SOME) {
13073 info.stats.stats.sum.num_evict_mode_some = 1;
13074 } else if (evict_mode == TierAgentState::EVICT_MODE_FULL) {
13075 info.stats.stats.sum.num_evict_mode_full = 1;
13076 }
13077 if (agent_state->evict_mode == TierAgentState::EVICT_MODE_SOME) {
13078 info.stats.stats.sum.num_evict_mode_some = 0;
13079 } else if (agent_state->evict_mode == TierAgentState::EVICT_MODE_FULL) {
13080 info.stats.stats.sum.num_evict_mode_full = 0;
13081 }
13082 agent_state->evict_mode = evict_mode;
13083 }
13084 uint64_t old_effort = agent_state->evict_effort;
13085 if (evict_effort != agent_state->evict_effort) {
13086 dout(5) << __func__ << " evict_effort "
13087 << ((float)agent_state->evict_effort / 1000000.0)
13088 << " -> "
13089 << ((float)evict_effort / 1000000.0)
13090 << dendl;
13091 agent_state->evict_effort = evict_effort;
13092 }
13093
13094 // NOTE: we are using evict_effort as a proxy for *all* agent effort
13095 // (including flush). This is probably fine (they should be
13096 // correlated) but it is not precisely correct.
13097 if (agent_state->is_idle()) {
13098 if (!restart && !old_idle) {
13099 osd->agent_disable_pg(this, old_effort);
13100 }
13101 } else {
13102 if (restart || old_idle) {
13103 osd->agent_enable_pg(this, agent_state->evict_effort);
13104 } else if (old_effort != agent_state->evict_effort) {
13105 osd->agent_adjust_pg(this, old_effort, agent_state->evict_effort);
13106 }
13107 }
13108 return requeued;
13109 }
13110
13111 void PrimaryLogPG::agent_estimate_temp(const hobject_t& oid, int *temp)
13112 {
13113 assert(hit_set);
13114 assert(temp);
13115 *temp = 0;
13116 if (hit_set->contains(oid))
13117 *temp = 1000000;
13118 unsigned i = 0;
13119 int last_n = pool.info.hit_set_search_last_n;
13120 for (map<time_t,HitSetRef>::reverse_iterator p =
13121 agent_state->hit_set_map.rbegin(); last_n > 0 &&
13122 p != agent_state->hit_set_map.rend(); ++p, ++i) {
13123 if (p->second->contains(oid)) {
13124 *temp += pool.info.get_grade(i);
13125 --last_n;
13126 }
13127 }
13128 }
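// Worked example (grades are pool-dependent; the values below are
// assumptions): an object in the current hit set starts at
// temp = 1000000; if it also appears in the two newest archived hit sets
// with get_grade(0) = 500000 and get_grade(1) = 250000, it ends at
// temp = 1750000.  An object seen only in older archives collects only
// the smaller grades, and one seen nowhere stays at 0, landing at the
// cold end of the histogram consulted by agent_maybe_evict().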
13129
13130 // Dup op detection
13131
13132 bool PrimaryLogPG::already_complete(eversion_t v)
13133 {
13134 dout(20) << __func__ << ": " << v << dendl;
13135 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13136 !i.end();
13137 ++i) {
13138 dout(20) << __func__ << ": " << **i << dendl;
13139 // skip copy from temp object ops
13140 if ((*i)->v == eversion_t()) {
13141 dout(20) << __func__ << ": " << **i
13142 << " version is empty" << dendl;
13143 continue;
13144 }
13145 if ((*i)->v > v) {
13146 dout(20) << __func__ << ": " << **i
13147 << " (*i)->v past v" << dendl;
13148 break;
13149 }
13150 if (!(*i)->all_committed) {
13151 dout(20) << __func__ << ": " << **i
13152 << " not committed, returning false"
13153 << dendl;
13154 return false;
13155 }
13156 }
13157 dout(20) << __func__ << ": returning true" << dendl;
13158 return true;
13159 }
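// Illustrative trace: with repop_queue holding versions 5'10, 5'11,
// 5'12 in submission order,
//
//   already_complete(5'11)
//
// examines 5'10 and 5'11 and returns true only if both have
// all_committed set; the loop breaks at 5'12 (past v) without looking
// at it.  Temp-object copies carry an empty eversion_t and are skipped.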
13160
13161 bool PrimaryLogPG::already_ack(eversion_t v)
13162 {
13163 dout(20) << __func__ << ": " << v << dendl;
13164 for (xlist<RepGather*>::iterator i = repop_queue.begin();
13165 !i.end();
13166 ++i) {
13167 // skip copy from temp object ops
13168 if ((*i)->v == eversion_t()) {
13169 dout(20) << __func__ << ": " << **i
13170 << " version is empty" << dendl;
13171 continue;
13172 }
13173 if ((*i)->v > v) {
13174 dout(20) << __func__ << ": " << **i
13175 << " (*i)->v past v" << dendl;
13176 break;
13177 }
13178 if (!(*i)->all_applied) {
13179 dout(20) << __func__ << ": " << **i
13180 << " not applied, returning false"
13181 << dendl;
13182 return false;
13183 }
13184 }
13185 dout(20) << __func__ << ": returning true" << dendl;
13186 return true;
13187 }
13188
13189
13190 // ==========================================================================================
13191 // SCRUB
13192
13193
13194 bool PrimaryLogPG::_range_available_for_scrub(
13195 const hobject_t &begin, const hobject_t &end)
13196 {
13197 pair<hobject_t, ObjectContextRef> next;
13198 next.second = object_contexts.lookup(begin);
13199 next.first = begin;
13200 bool more = true;
13201 while (more && next.first < end) {
13202 if (next.second && next.second->is_blocked()) {
13203 next.second->requeue_scrub_on_unblock = true;
13204 dout(10) << __func__ << ": scrub delayed, "
13205 << next.first << " is blocked"
13206 << dendl;
13207 return false;
13208 }
13209 more = object_contexts.get_next(next.first, &next);
13210 }
13211 return true;
13212 }
13213
13214 static bool doing_clones(const boost::optional<SnapSet> &snapset,
13215 const vector<snapid_t>::reverse_iterator &curclone) {
13216 return snapset && curclone != snapset.get().clones.rend();
13217 }
13218
13219 void PrimaryLogPG::log_missing(unsigned missing,
13220 const boost::optional<hobject_t> &head,
13221 LogChannelRef clog,
13222 const spg_t &pgid,
13223 const char *func,
13224 const char *mode,
13225 bool allow_incomplete_clones)
13226 {
13227 assert(head);
13228 if (allow_incomplete_clones) {
13229 dout(20) << func << " " << mode << " " << pgid << " " << head.get()
13230 << " skipped " << missing << " clone(s) in cache tier" << dendl;
13231 } else {
13232 clog->info() << mode << " " << pgid << " " << head.get()
13233 << " " << missing << " missing clone(s)";
13234 }
13235 }
13236
13237 unsigned PrimaryLogPG::process_clones_to(const boost::optional<hobject_t> &head,
13238 const boost::optional<SnapSet> &snapset,
13239 LogChannelRef clog,
13240 const spg_t &pgid,
13241 const char *mode,
13242 bool allow_incomplete_clones,
13243 boost::optional<snapid_t> target,
13244 vector<snapid_t>::reverse_iterator *curclone,
13245 inconsistent_snapset_wrapper &e)
13246 {
13247 assert(head);
13248 assert(snapset);
13249 unsigned missing = 0;
13250
13251 // NOTE: clones are in descending order, hence the **curclone > target test here
13252 hobject_t next_clone(head.get());
13253 while(doing_clones(snapset, *curclone) && (!target || **curclone > *target)) {
13254 ++missing;
13255 // it is okay to be missing one or more clones in a cache tier.
13256 // skip higher-numbered clones in the list.
13257 if (!allow_incomplete_clones) {
13258 next_clone.snap = **curclone;
13259 clog->error() << mode << " " << pgid << " " << head.get()
13260 << " expected clone " << next_clone;
13261 ++scrubber.shallow_errors;
13262 e.set_clone_missing(next_clone.snap);
13263 }
13264 // Clones are descending
13265 ++(*curclone);
13266 }
13267 return missing;
13268 }
13269
13270 /*
13271 * Validate consistency of the object info and snap sets.
13272 *
13273 * We are sort of comparing 2 lists. The main loop is on objmap.objects. But
13274 * the comparison of the objects is against multiple snapset.clones. There are
13275 * multiple clone lists and in between lists we expect head or snapdir.
13276 *
13277 * Example
13278 *
13279 * objects expected
13280 * ======= =======
13281 * obj1 snap 1 head/snapdir, unexpected obj1 snap 1
13282 * obj2 head head/snapdir, head ok
13283 * [SnapSet clones 6 4 2 1]
13284 * obj2 snap 7 obj2 snap 6, unexpected obj2 snap 7
13285 * obj2 snap 6 obj2 snap 6, match
13286 * obj2 snap 4 obj2 snap 4, match
13287 * obj3 head obj2 snap 2 (expected), obj2 snap 1 (expected), head ok
13288 * [Snapset clones 3 1]
13289 * obj3 snap 3 obj3 snap 3 match
13290 * obj3 snap 1 obj3 snap 1 match
13291 * obj4 snapdir head/snapdir, snapdir ok
13292 * [Snapset clones 4]
13293 * EOL obj4 snap 4, (expected)
13294 */
13295 void PrimaryLogPG::scrub_snapshot_metadata(
13296 ScrubMap &scrubmap,
13297 const map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest)
13298 {
13299 dout(10) << __func__ << dendl;
13300
13301 coll_t c(info.pgid);
13302 bool repair = state_test(PG_STATE_REPAIR);
13303 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13304 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13305 boost::optional<snapid_t> all_clones; // Unspecified snapid_t or boost::none
13306
13307 /// snapsets to repair
13308 map<hobject_t,SnapSet> snapset_to_repair;
13309
13310 // traverse in reverse order.
13311 boost::optional<hobject_t> head;
13312 boost::optional<SnapSet> snapset; // If this is initialized, head (above) will be too
13313 vector<snapid_t>::reverse_iterator curclone; // Defined only if snapset initialized
13314 unsigned missing = 0;
13315 inconsistent_snapset_wrapper soid_error, head_error;
13316
13317 bufferlist last_data;
13318
13319 for (map<hobject_t,ScrubMap::object>::reverse_iterator
13320 p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); ++p) {
13321 const hobject_t& soid = p->first;
13322 soid_error = inconsistent_snapset_wrapper{soid};
13323 object_stat_sum_t stat;
13324 boost::optional<object_info_t> oi;
13325
13326 if (!soid.is_snapdir())
13327 stat.num_objects++;
13328
13329 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13330 stat.num_objects_hit_set_archive++;
13331
13332 if (soid.is_snap()) {
13333 // it's a clone
13334 stat.num_object_clones++;
13335 }
13336
13337 // basic checks.
13338 if (p->second.attrs.count(OI_ATTR) == 0) {
13339 oi = boost::none;
13340 osd->clog->error() << mode << " " << info.pgid << " " << soid
13341 << " no '" << OI_ATTR << "' attr";
13342 ++scrubber.shallow_errors;
13343 soid_error.set_oi_attr_missing();
13344 } else {
13345 bufferlist bv;
13346 bv.push_back(p->second.attrs[OI_ATTR]);
13347 try {
13348 oi = object_info_t(); // Initialize optional<> before decode into it
13349 oi.get().decode(bv);
13350 } catch (buffer::error& e) {
13351 oi = boost::none;
13352 osd->clog->error() << mode << " " << info.pgid << " " << soid
13353 << " can't decode '" << OI_ATTR << "' attr " << e.what();
13354 ++scrubber.shallow_errors;
13355 soid_error.set_oi_attr_corrupted();
13356 soid_error.set_oi_attr_missing(); // Not available too
13357 }
13358 }
13359
13360 if (oi) {
13361 if (pgbackend->be_get_ondisk_size(oi->size) != p->second.size) {
13362 osd->clog->error() << mode << " " << info.pgid << " " << soid
13363 << " on disk size (" << p->second.size
13364 << ") does not match object info size ("
13365 << oi->size << ") adjusted for ondisk to ("
13366 << pgbackend->be_get_ondisk_size(oi->size)
13367 << ")";
13368 soid_error.set_size_mismatch();
13369 ++scrubber.shallow_errors;
13370 }
13371
13372 dout(20) << mode << " " << soid << " " << oi.get() << dendl;
13373
13374 // A clone num_bytes will be added later when we have snapset
13375 if (!soid.is_snap()) {
13376 stat.num_bytes += oi->size;
13377 }
13378 if (soid.nspace == cct->_conf->osd_hit_set_namespace)
13379 stat.num_bytes_hit_set_archive += oi->size;
13380
13381 if (!soid.is_snapdir()) {
13382 if (oi->is_dirty())
13383 ++stat.num_objects_dirty;
13384 if (oi->is_whiteout())
13385 ++stat.num_whiteouts;
13386 if (oi->is_omap())
13387 ++stat.num_objects_omap;
13388 if (oi->is_cache_pinned())
13389 ++stat.num_objects_pinned;
13390 }
13391 } else {
13392 // pessimistic assumption that this object might contain a
13393 // legacy SnapSet
13394 stat.num_legacy_snapsets++;
13395 }
13396
13397 // Check for any problems while processing clones
13398 if (doing_clones(snapset, curclone)) {
13399 boost::optional<snapid_t> target;
13400 // Expecting an object with snap for current head
13401 if (soid.has_snapset() || soid.get_head() != head->get_head()) {
13402
13403 dout(10) << __func__ << " " << mode << " " << info.pgid << " new object "
13404 << soid << " while processing " << head.get() << dendl;
13405
13406 target = all_clones;
13407 } else {
13408 assert(soid.is_snap());
13409 target = soid.snap;
13410 }
13411
13412 // Log any clones we were expecting to be there up to target
13413 // This will set missing, but will be a no-op if soid.snap == *curclone.
13414 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13415 pool.info.allow_incomplete_clones(), target, &curclone,
13416 head_error);
13417 }
13418 bool expected;
13419 // Check doing_clones() again in case we ran process_clones_to()
13420 if (doing_clones(snapset, curclone)) {
13421 // A head/snapdir would have processed all clones above
13422 // or all greater than *curclone.
13423 assert(soid.is_snap() && *curclone <= soid.snap);
13424
13425 // After processing above clone snap should match the expected curclone
13426 expected = (*curclone == soid.snap);
13427 } else {
13428 // If we aren't doing clones any longer, then expecting head/snapdir
13429 expected = soid.has_snapset();
13430 }
13431 if (!expected) {
13432 // If we couldn't read the head's snapset, just ignore clones
13433 if (head && !snapset) {
13434 osd->clog->error() << mode << " " << info.pgid << " " << soid
13435 << " clone ignored due to missing snapset";
13436 } else {
13437 osd->clog->error() << mode << " " << info.pgid << " " << soid
13438 << " is an unexpected clone";
13439 }
13440 ++scrubber.shallow_errors;
13441 soid_error.set_headless();
13442 scrubber.store->add_snap_error(pool.id, soid_error);
13443 if (head && soid.get_head() == head->get_head())
13444 head_error.set_clone(soid.snap);
13445 continue;
13446 }
13447
13448 // new snapset?
13449 if (soid.has_snapset()) {
13450
13451 if (missing) {
13452 log_missing(missing, head, osd->clog, info.pgid, __func__, mode,
13453 pool.info.allow_incomplete_clones());
13454 }
13455
13456 // Save previous head error information
13457 if (head && head_error.errors)
13458 scrubber.store->add_snap_error(pool.id, head_error);
13459 // Set this as a new head object
13460 head = soid;
13461 missing = 0;
13462 head_error = soid_error;
13463
13464 dout(20) << __func__ << " " << mode << " new head " << head << dendl;
13465
13466 if (p->second.attrs.count(SS_ATTR) == 0) {
13467 osd->clog->error() << mode << " " << info.pgid << " " << soid
13468 << " no '" << SS_ATTR << "' attr";
13469 ++scrubber.shallow_errors;
13470 snapset = boost::none;
13471 head_error.set_ss_attr_missing();
13472 } else {
13473 bufferlist bl;
13474 bl.push_back(p->second.attrs[SS_ATTR]);
13475 bufferlist::iterator blp = bl.begin();
13476 try {
13477 snapset = SnapSet(); // Initialize optional<> before decoding into it
13478 ::decode(snapset.get(), blp);
13479 } catch (buffer::error& e) {
13480 snapset = boost::none;
13481 osd->clog->error() << mode << " " << info.pgid << " " << soid
13482 << " can't decode '" << SS_ATTR << "' attr " << e.what();
13483 ++scrubber.shallow_errors;
13484 head_error.set_ss_attr_corrupted();
13485 }
13486 }
13487
13488 if (snapset) {
13489 // what will be next?
13490 curclone = snapset->clones.rbegin();
13491
13492 if (!snapset->clones.empty()) {
13493 dout(20) << " snapset " << snapset.get() << dendl;
13494 if (snapset->seq == 0) {
13495 osd->clog->error() << mode << " " << info.pgid << " " << soid
13496 << " snaps.seq not set";
13497 ++scrubber.shallow_errors;
13498 head_error.set_snapset_mismatch();
13499 }
13500 }
13501
13502 if (soid.is_head() && !snapset->head_exists) {
13503 osd->clog->error() << mode << " " << info.pgid << " " << soid
13504 << " snapset.head_exists=false, but head exists";
13505 ++scrubber.shallow_errors;
13506 head_error.set_head_mismatch();
13507 }
13508 if (soid.is_snapdir() && snapset->head_exists) {
13509 osd->clog->error() << mode << " " << info.pgid << " " << soid
13510 << " snapset.head_exists=true, but snapdir exists";
13511 ++scrubber.shallow_errors;
13512 head_error.set_head_mismatch();
13513 }
13514
13515 if (get_osdmap()->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
13516 if (soid.is_snapdir()) {
13517 dout(10) << " will move snapset to head from " << soid << dendl;
13518 snapset_to_repair[soid.get_head()] = *snapset;
13519 } else if (snapset->is_legacy()) {
13520 dout(10) << " will convert legacy snapset on " << soid << " " << *snapset
13521 << dendl;
13522 snapset_to_repair[soid.get_head()] = *snapset;
13523 }
13524 } else {
13525 stat.num_legacy_snapsets++;
13526 }
13527 } else {
13528 // pessimistic assumption that this object might contain a
13529 // legacy SnapSet
13530 stat.num_legacy_snapsets++;
13531 }
13532 } else {
13533 assert(soid.is_snap());
13534 assert(head);
13535 assert(snapset);
13536 assert(soid.snap == *curclone);
13537
13538 dout(20) << __func__ << " " << mode << " matched clone " << soid << dendl;
13539
13540 if (snapset->clone_size.count(soid.snap) == 0) {
13541 osd->clog->error() << mode << " " << info.pgid << " " << soid
13542 << " is missing in clone_size";
13543 ++scrubber.shallow_errors;
13544 soid_error.set_size_mismatch();
13545 } else {
13546 if (oi && oi->size != snapset->clone_size[soid.snap]) {
13547 osd->clog->error() << mode << " " << info.pgid << " " << soid
13548 << " size " << oi->size << " != clone_size "
13549 << snapset->clone_size[*curclone];
13550 ++scrubber.shallow_errors;
13551 soid_error.set_size_mismatch();
13552 }
13553
13554 if (snapset->clone_overlap.count(soid.snap) == 0) {
13555 osd->clog->error() << mode << " " << info.pgid << " " << soid
13556 << " is missing in clone_overlap";
13557 ++scrubber.shallow_errors;
13558 soid_error.set_size_mismatch();
13559 } else {
13560 // This checking is based on get_clone_bytes(). The first 2 asserts
13561 // can't happen because we know we have a clone_size and
13562 // a clone_overlap. Now we check that the interval_set won't
13563 // cause the last assert.
13564 uint64_t size = snapset->clone_size.find(soid.snap)->second;
13565 const interval_set<uint64_t> &overlap =
13566 snapset->clone_overlap.find(soid.snap)->second;
13567 bool bad_interval_set = false;
13568 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
13569 i != overlap.end(); ++i) {
13570 if (size < i.get_len()) {
13571 bad_interval_set = true;
13572 break;
13573 }
13574 size -= i.get_len();
13575 }
13576
13577 if (bad_interval_set) {
13578 osd->clog->error() << mode << " " << info.pgid << " " << soid
13579 << " bad interval_set in clone_overlap";
13580 ++scrubber.shallow_errors;
13581 soid_error.set_size_mismatch();
13582 } else {
13583 stat.num_bytes += snapset->get_clone_bytes(soid.snap);
13584 }
13585 }
13586 }
13587
13588 // migrate legacy_snaps to snapset?
13589 auto p = snapset_to_repair.find(soid.get_head());
13590 if (p != snapset_to_repair.end()) {
13591 if (!oi || oi->legacy_snaps.empty()) {
13592 osd->clog->error() << mode << " " << info.pgid << " " << soid
13593 << " has no oi or legacy_snaps; cannot convert "
13594 << *snapset;
13595 ++scrubber.shallow_errors;
13596 } else {
13597 dout(20) << __func__ << " copying legacy_snaps " << oi->legacy_snaps
13598 << " to snapset " << p->second << dendl;
13599 p->second.clone_snaps[soid.snap] = oi->legacy_snaps;
13600 }
13601 }
13602
13603 // what's next?
13604 ++curclone;
13605 if (soid_error.errors)
13606 scrubber.store->add_snap_error(pool.id, soid_error);
13607 }
13608
13609 scrub_cstat.add(stat);
13610 }
13611
13612 if (doing_clones(snapset, curclone)) {
13613 dout(10) << __func__ << " " << mode << " " << info.pgid
13614 << " No more objects while processing " << head.get() << dendl;
13615
13616 missing += process_clones_to(head, snapset, osd->clog, info.pgid, mode,
13617 pool.info.allow_incomplete_clones(), all_clones, &curclone,
13618 head_error);
13619 }
13620 // There could be missing clones counted by the test above, or even
13621 // from before dropping out of the loop for the last head.
13622 if (missing) {
13623 log_missing(missing, head, osd->clog, info.pgid, __func__,
13624 mode, pool.info.allow_incomplete_clones());
13625 }
13626 if (head && head_error.errors)
13627 scrubber.store->add_snap_error(pool.id, head_error);
13628
13629 for (map<hobject_t,pair<uint32_t,uint32_t>>::const_iterator p =
13630 missing_digest.begin();
13631 p != missing_digest.end();
13632 ++p) {
13633 if (p->first.is_snapdir())
13634 continue;
13635 dout(10) << __func__ << " recording digests for " << p->first << dendl;
13636 ObjectContextRef obc = get_object_context(p->first, false);
13637 if (!obc) {
13638 osd->clog->error() << info.pgid << " " << mode
13639 << " cannot get object context for "
13640 << p->first;
13641 continue;
13642 } else if (obc->obs.oi.soid != p->first) {
13643 osd->clog->error() << info.pgid << " " << mode
13644 << " object " << p->first
13645 << " has a valid oi attr with a mismatched name, "
13646 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
13647 continue;
13648 }
13649 OpContextUPtr ctx = simple_opc_create(obc);
13650 ctx->at_version = get_next_version();
13651 ctx->mtime = utime_t(); // do not update mtime
13652 ctx->new_obs.oi.set_data_digest(p->second.first);
13653 ctx->new_obs.oi.set_omap_digest(p->second.second);
13654 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
13655
13656 ctx->register_on_success(
13657 [this]() {
13658 dout(20) << "updating scrub digest" << dendl;
13659 if (--scrubber.num_digest_updates_pending == 0) {
13660 requeue_scrub();
13661 }
13662 });
13663
13664 simple_opc_submit(std::move(ctx));
13665 ++scrubber.num_digest_updates_pending;
13666 }
13667 for (auto& p : snapset_to_repair) {
13668 // cache pools may not have the clones, which means we won't know
13669 // what snaps they have. fake out the clone_snaps entries anyway (with
13670 // blank snap lists).
13671 p.second.head_exists = true;
13672 if (pool.info.allow_incomplete_clones()) {
13673 for (auto s : p.second.clones) {
13674 if (p.second.clone_snaps.count(s) == 0) {
13675 dout(10) << __func__ << " " << p.first << " faking clone_snaps for "
13676 << s << dendl;
13677 p.second.clone_snaps[s];
13678 }
13679 }
13680 }
13681 if (p.second.clones.size() != p.second.clone_snaps.size() ||
13682 p.second.is_legacy()) {
13683 // this happens if we encounter other errors above, like a missing
13684 // or extra clone.
13685 dout(10) << __func__ << " not writing snapset to " << p.first
13686 << " snapset " << p.second << " clones " << p.second.clones
13687 << "; didn't convert fully" << dendl;
13688 scrub_cstat.sum.num_legacy_snapsets++;
13689 continue;
13690 }
13691 dout(10) << __func__ << " writing snapset to " << p.first
13692 << " " << p.second << dendl;
13693 ObjectContextRef obc = get_object_context(p.first, true);
13694 if (!obc) {
13695 osd->clog->error() << info.pgid << " " << mode
13696 << " cannot get object context for "
13697 << p.first;
13698 continue;
13699 } else if (obc->obs.oi.soid != p.first) {
13700 osd->clog->error() << info.pgid << " " << mode
13701 << " object " << p.first
13702 << " has a valid oi attr with a mismatched name, "
13703 << " obc->obs.oi.soid: " << obc->obs.oi.soid;
13704 continue;
13705 }
13706 ObjectContextRef snapset_obc;
13707 if (!obc->obs.exists) {
13708 snapset_obc = get_object_context(p.first.get_snapdir(), false);
13709 if (!snapset_obc) {
13710 osd->clog->error() << info.pgid << " " << mode
13711 << " cannot get object context for "
13712 << p.first.get_snapdir();
13713 continue;
13714 }
13715 }
13716 OpContextUPtr ctx = simple_opc_create(obc);
13717 PGTransaction *t = ctx->op_t.get();
13718 ctx->snapset_obc = snapset_obc;
13719 ctx->at_version = get_next_version();
13720 ctx->mtime = utime_t(); // do not update mtime
13721 ctx->new_snapset = p.second;
13722 if (!ctx->new_obs.exists) {
13723 dout(20) << __func__ << " making " << p.first << " a whiteout" << dendl;
13724 ctx->new_obs.exists = true;
13725 ctx->new_snapset.head_exists = true;
13726 ctx->new_obs.oi.set_flag(object_info_t::FLAG_WHITEOUT);
13727 ++ctx->delta_stats.num_whiteouts;
13728 ++ctx->delta_stats.num_objects;
13729 t->create(p.first);
13730 if (p.first < scrubber.start) {
13731 dout(20) << __func__ << " kludging around update outside of scrub range"
13732 << dendl;
13733 } else {
13734 scrub_cstat.add(ctx->delta_stats);
13735 }
13736 }
13737 dout(20) << __func__ << " final snapset " << ctx->new_snapset << dendl;
13738 assert(!ctx->new_snapset.is_legacy());
13739 finish_ctx(ctx.get(), pg_log_entry_t::MODIFY);
13740 ctx->register_on_success(
13741 [this]() {
13742 dout(20) << "updating snapset" << dendl;
13743 if (--scrubber.num_digest_updates_pending == 0) {
13744 requeue_scrub();
13745 }
13746 });
13747
13748 simple_opc_submit(std::move(ctx));
13749 ++scrubber.num_digest_updates_pending;
13750 }
13751
13752 dout(10) << __func__ << " (" << mode << ") finish" << dendl;
13753 }
13754
13755 void PrimaryLogPG::_scrub_clear_state()
13756 {
13757 scrub_cstat = object_stat_collection_t();
13758 }
13759
13760 void PrimaryLogPG::_scrub_finish()
13761 {
13762 bool repair = state_test(PG_STATE_REPAIR);
13763 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
13764 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
13765
13766 if (info.stats.stats_invalid) {
13767 info.stats.stats = scrub_cstat;
13768 info.stats.stats_invalid = false;
13769
13770 if (agent_state)
13771 agent_choose_mode();
13772 }
13773
13774 dout(10) << mode << " got "
13775 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
13776 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
13777 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
13778 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
13779 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
13780 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
13781 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
13782 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes."
13783 << dendl;
13784
13785 if (scrub_cstat.sum.num_objects != info.stats.stats.sum.num_objects ||
13786 scrub_cstat.sum.num_object_clones != info.stats.stats.sum.num_object_clones ||
13787 (scrub_cstat.sum.num_objects_dirty != info.stats.stats.sum.num_objects_dirty &&
13788 !info.stats.dirty_stats_invalid) ||
13789 (scrub_cstat.sum.num_objects_omap != info.stats.stats.sum.num_objects_omap &&
13790 !info.stats.omap_stats_invalid) ||
13791 (scrub_cstat.sum.num_objects_pinned != info.stats.stats.sum.num_objects_pinned &&
13792 !info.stats.pin_stats_invalid) ||
13793 (scrub_cstat.sum.num_objects_hit_set_archive != info.stats.stats.sum.num_objects_hit_set_archive &&
13794 !info.stats.hitset_stats_invalid) ||
13795 (scrub_cstat.sum.num_bytes_hit_set_archive != info.stats.stats.sum.num_bytes_hit_set_archive &&
13796 !info.stats.hitset_bytes_stats_invalid) ||
13797 scrub_cstat.sum.num_whiteouts != info.stats.stats.sum.num_whiteouts ||
13798 scrub_cstat.sum.num_bytes != info.stats.stats.sum.num_bytes) {
13799 osd->clog->error() << info.pgid << " " << mode
13800 << " stat mismatch, got "
13801 << scrub_cstat.sum.num_objects << "/" << info.stats.stats.sum.num_objects << " objects, "
13802 << scrub_cstat.sum.num_object_clones << "/" << info.stats.stats.sum.num_object_clones << " clones, "
13803 << scrub_cstat.sum.num_objects_dirty << "/" << info.stats.stats.sum.num_objects_dirty << " dirty, "
13804 << scrub_cstat.sum.num_objects_omap << "/" << info.stats.stats.sum.num_objects_omap << " omap, "
13805 << scrub_cstat.sum.num_objects_pinned << "/" << info.stats.stats.sum.num_objects_pinned << " pinned, "
13806 << scrub_cstat.sum.num_objects_hit_set_archive << "/" << info.stats.stats.sum.num_objects_hit_set_archive << " hit_set_archive, "
13807 << scrub_cstat.sum.num_whiteouts << "/" << info.stats.stats.sum.num_whiteouts << " whiteouts, "
13808 << scrub_cstat.sum.num_bytes << "/" << info.stats.stats.sum.num_bytes << " bytes, "
13809 << scrub_cstat.sum.num_bytes_hit_set_archive << "/" << info.stats.stats.sum.num_bytes_hit_set_archive << " hit_set_archive bytes.";
13810 ++scrubber.shallow_errors;
13811
13812 if (repair) {
13813 ++scrubber.fixed;
13814 info.stats.stats = scrub_cstat;
13815 info.stats.dirty_stats_invalid = false;
13816 info.stats.omap_stats_invalid = false;
13817 info.stats.hitset_stats_invalid = false;
13818 info.stats.hitset_bytes_stats_invalid = false;
13819 publish_stats_to_osd();
13820 share_pg_info();
13821 }
13822 } else if (scrub_cstat.sum.num_legacy_snapsets !=
13823 info.stats.stats.sum.num_legacy_snapsets) {
13824 osd->clog->info() << info.pgid << " " << mode << " updated num_legacy_snapsets"
13825 << " from " << info.stats.stats.sum.num_legacy_snapsets
13826 << " -> " << scrub_cstat.sum.num_legacy_snapsets << "\n";
13827 info.stats.stats.sum.num_legacy_snapsets = scrub_cstat.sum.num_legacy_snapsets;
13828 publish_stats_to_osd();
13829 share_pg_info();
13830 }
13831 }
13832
13833 bool PrimaryLogPG::check_osdmap_full(const set<pg_shard_t> &missing_on)
13834 {
13835 return osd->check_osdmap_full(missing_on);
13836 }
13837
13838 /*---SnapTrimmer Logging---*/
13839 #undef dout_prefix
13840 #define dout_prefix *_dout << pg->gen_prefix()
13841
13842 void PrimaryLogPG::SnapTrimmer::log_enter(const char *state_name)
13843 {
13844 ldout(pg->cct, 20) << "enter " << state_name << dendl;
13845 }
13846
13847 void PrimaryLogPG::SnapTrimmer::log_exit(const char *state_name, utime_t enter_time)
13848 {
13849 ldout(pg->cct, 20) << "exit " << state_name << dendl;
13850 }
13851
13852 /*---SnapTrimmer states---*/
13853 #undef dout_prefix
13854 #define dout_prefix (*_dout << context< SnapTrimmer >().pg->gen_prefix() \
13855 << "SnapTrimmer state<" << get_state_name() << ">: ")
13856
13857 /* NotTrimming */
13858 PrimaryLogPG::NotTrimming::NotTrimming(my_context ctx)
13859 : my_base(ctx),
13860 NamedState(context< SnapTrimmer >().pg, "NotTrimming")
13861 {
13862 context< SnapTrimmer >().log_enter(state_name);
13863 }
13864
13865 void PrimaryLogPG::NotTrimming::exit()
13866 {
13867 context< SnapTrimmer >().log_exit(state_name, enter_time);
13868 }
13869
13870 boost::statechart::result PrimaryLogPG::NotTrimming::react(const KickTrim&)
13871 {
13872 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
13873 ldout(pg->cct, 10) << "NotTrimming react KickTrim" << dendl;
13874
13875 if (!(pg->is_primary() && pg->is_active())) {
13876 ldout(pg->cct, 10) << "NotTrimming not primary or active" << dendl;
13877 return discard_event();
13878 }
13879 if (!pg->is_clean() ||
13880 pg->snap_trimq.empty()) {
13881 ldout(pg->cct, 10) << "NotTrimming not clean or nothing to trim" << dendl;
13882 return discard_event();
13883 }
13884 if (pg->scrubber.active) {
13885 ldout(pg->cct, 10) << " scrubbing, will requeue snap_trimmer after" << dendl;
13886 pg->scrubber.queue_snap_trim = true;
13887 return transit< WaitScrub >();
13888 } else {
13889 return transit< Trimming >();
13890 }
13891 }
13892
13893 boost::statechart::result PrimaryLogPG::WaitReservation::react(const SnapTrimReserved&)
13894 {
13895 PrimaryLogPG *pg = context< SnapTrimmer >().pg;
13896 ldout(pg->cct, 10) << "WaitReservation react SnapTrimReserved" << dendl;
13897
13898 pending = nullptr;
13899 if (!context< SnapTrimmer >().can_trim()) {
13900 post_event(KickTrim());
13901 return transit< NotTrimming >();
13902 }
13903
13904 context<Trimming>().snap_to_trim = pg->snap_trimq.range_start();
13905 ldout(pg->cct, 10) << "NotTrimming: trimming "
13906 << pg->snap_trimq.range_start()
13907 << dendl;
13908 return transit< AwaitAsyncWork >();
13909 }
13910
13911 /* AwaitAsyncWork */
13912 PrimaryLogPG::AwaitAsyncWork::AwaitAsyncWork(my_context ctx)
13913 : my_base(ctx),
13914 NamedState(context< SnapTrimmer >().pg, "Trimming/AwaitAsyncWork")
13915 {
13916 auto *pg = context< SnapTrimmer >().pg;
13917 context< SnapTrimmer >().log_enter(state_name);
13918 context< SnapTrimmer >().pg->osd->queue_for_snap_trim(pg);
13919 pg->state_set(PG_STATE_SNAPTRIM);
13920 pg->publish_stats_to_osd();
13921 }
13922
13923 boost::statechart::result PrimaryLogPG::AwaitAsyncWork::react(const DoSnapWork&)
13924 {
13925 PrimaryLogPGRef pg = context< SnapTrimmer >().pg;
13926 snapid_t snap_to_trim = context<Trimming>().snap_to_trim;
13927 auto &in_flight = context<Trimming>().in_flight;
13928 assert(in_flight.empty());
13929
13930 assert(pg->is_primary() && pg->is_active());
13931 if (!context< SnapTrimmer >().can_trim()) {
13932 ldout(pg->cct, 10) << "something changed, reverting to NotTrimming" << dendl;
13933 post_event(KickTrim());
13934 return transit< NotTrimming >();
13935 }
13936
13937 ldout(pg->cct, 10) << "AwaitAsyncWork: trimming snap " << snap_to_trim << dendl;
13938
13939 vector<hobject_t> to_trim;
13940 unsigned max = pg->cct->_conf->osd_pg_max_concurrent_snap_trims;
13941 to_trim.reserve(max);
13942 int r = pg->snap_mapper.get_next_objects_to_trim(
13943 snap_to_trim,
13944 max,
13945 &to_trim);
13946 if (r != 0 && r != -ENOENT) {
13947 lderr(pg->cct) << "get_next_objects_to_trim returned "
13948 << cpp_strerror(r) << dendl;
13949 assert(0 == "get_next_objects_to_trim returned an invalid code");
13950 } else if (r == -ENOENT) {
13951 // Done!
13952 ldout(pg->cct, 10) << "got ENOENT" << dendl;
13953
13954 ldout(pg->cct, 10) << "adding snap " << snap_to_trim
13955 << " to purged_snaps"
13956 << dendl;
13957 pg->info.purged_snaps.insert(snap_to_trim);
13958 pg->snap_trimq.erase(snap_to_trim);
13959 ldout(pg->cct, 10) << "purged_snaps now "
13960 << pg->info.purged_snaps << ", snap_trimq now "
13961 << pg->snap_trimq << dendl;
13962
13963 ObjectStore::Transaction t;
13964 pg->dirty_big_info = true;
13965 pg->write_if_dirty(t);
13966 int tr = pg->osd->store->queue_transaction(pg->osr.get(), std::move(t), NULL);
13967 assert(tr == 0);
13968
13969 pg->share_pg_info();
13970 post_event(KickTrim());
13971 return transit< NotTrimming >();
13972 }
13973 assert(!to_trim.empty());
13974
13975 for (auto &&object: to_trim) {
13976 // Get next
13977 ldout(pg->cct, 10) << "AwaitAsyncWork react trimming " << object << dendl;
13978 OpContextUPtr ctx = pg->trim_object(in_flight.empty(), object);
13979 if (!ctx) {
13980 ldout(pg->cct, 10) << "could not get write lock on obj "
13981 << object << dendl;
13982 if (in_flight.empty()) {
13983 ldout(pg->cct, 10) << "waiting for it to clear"
13984 << dendl;
13985 return transit< WaitRWLock >();
13986
13987 } else {
13988 ldout(pg->cct, 10) << "letting the ones we already started finish" << dendl;
13989 return transit< WaitRepops >();
13990 }
13991 }
13992
13993 in_flight.insert(object);
13994 ctx->register_on_success(
13995 [pg, object, &in_flight]() {
13996 assert(in_flight.find(object) != in_flight.end());
13997 in_flight.erase(object);
13998 if (in_flight.empty())
13999 pg->snap_trimmer_machine.process_event(RepopsComplete());
14000 });
14001
14002 pg->simple_opc_submit(std::move(ctx));
14003 }
14004
14005 return transit< WaitRepops >();
14006 }
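// How the batch above drains (sketch of the callback flow already in
// the code): each object enters in_flight before its repop is
// submitted, and its on_success callback erases it again, e.g.
//
//   in_flight = {A, B, C}  ->  A completes  ->  {B, C}  ->  ...  ->  {}
//                                                   -> RepopsComplete()
//
// Only the callback that empties the set posts the event, so the state
// machine waits in WaitRepops until the whole batch (up to
// osd_pg_max_concurrent_snap_trims objects) has completed.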
14007
14008 void PrimaryLogPG::setattr_maybe_cache(
14009 ObjectContextRef obc,
14010 OpContext *op,
14011 PGTransaction *t,
14012 const string &key,
14013 bufferlist &val)
14014 {
14015 t->setattr(obc->obs.oi.soid, key, val);
14016 }
14017
14018 void PrimaryLogPG::setattrs_maybe_cache(
14019 ObjectContextRef obc,
14020 OpContext *op,
14021 PGTransaction *t,
14022 map<string, bufferlist> &attrs)
14023 {
14024 t->setattrs(obc->obs.oi.soid, attrs);
14025 }
14026
14027 void PrimaryLogPG::rmattr_maybe_cache(
14028 ObjectContextRef obc,
14029 OpContext *op,
14030 PGTransaction *t,
14031 const string &key)
14032 {
14033 t->rmattr(obc->obs.oi.soid, key);
14034 }
14035
14036 int PrimaryLogPG::getattr_maybe_cache(
14037 ObjectContextRef obc,
14038 const string &key,
14039 bufferlist *val)
14040 {
14041 if (pool.info.require_rollback()) {
14042 map<string, bufferlist>::iterator i = obc->attr_cache.find(key);
14043 if (i != obc->attr_cache.end()) {
14044 if (val)
14045 *val = i->second;
14046 return 0;
14047 } else {
14048 return -ENODATA;
14049 }
14050 }
14051 return pgbackend->objects_get_attr(obc->obs.oi.soid, key, val);
14052 }
14053
14054 int PrimaryLogPG::getattrs_maybe_cache(
14055 ObjectContextRef obc,
14056 map<string, bufferlist> *out,
14057 bool user_only)
14058 {
14059 int r = 0;
14060 if (pool.info.require_rollback()) {
14061 if (out)
14062 *out = obc->attr_cache;
14063 } else {
14064 r = pgbackend->objects_get_attrs(obc->obs.oi.soid, out);
14065 }
14066 if (out && user_only) {
14067 map<string, bufferlist> tmp;
14068 for (map<string, bufferlist>::iterator i = out->begin();
14069 i != out->end();
14070 ++i) {
14071 if (i->first.size() > 1 && i->first[0] == '_')
14072 tmp[i->first.substr(1, i->first.size())].claim(i->second);
14073 }
14074 tmp.swap(*out);
14075 }
14076 return r;
14077 }
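// Example of the user_only filter above (attr names follow the usual
// Ceph conventions; the object is hypothetical):
//
//   in:  {"_": <object_info>, "_mykey": "v", "snapset": <SnapSet>}
//   out: {"mykey": "v"}
//
// User attrs are stored with a leading '_', which is stripped; the bare
// "_" (the object-info attr, too short to qualify) and internal attrs
// like "snapset" are dropped.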
14078
14079 bool PrimaryLogPG::check_failsafe_full(ostream &ss) {
14080 return osd->check_failsafe_full(ss);
14081 }
14082
14083 void intrusive_ptr_add_ref(PrimaryLogPG *pg) { pg->get("intptr"); }
14084 void intrusive_ptr_release(PrimaryLogPG *pg) { pg->put("intptr"); }
14085
14086 #ifdef PG_DEBUG_REFS
14087 uint64_t get_with_id(PrimaryLogPG *pg) { return pg->get_with_id(); }
14088 void put_with_id(PrimaryLogPG *pg, uint64_t id) { return pg->put_with_id(id); }
14089 #endif
14090
14091 void intrusive_ptr_add_ref(PrimaryLogPG::RepGather *repop) { repop->get(); }
14092 void intrusive_ptr_release(PrimaryLogPG::RepGather *repop) { repop->put(); }