1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
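// Record entry into a peering state-machine state. Trimming states are
// ignored; if no PGStateInstance is open yet the entry is buffered in tmppi
// (the PG lock cannot be taken reliably here) and is committed to the
// history when exit() runs.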
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106 if ( tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122 if(!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136 if(ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
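// Intrusive reference counting for PG objects. With PG_DEBUG_REFS each
// get/put is additionally tracked per tag so leaked references can be
// attributed via dump_live_ids().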
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179   if (--ref == 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223 dout(0) << "\t\tid: " << *i << dendl;
224 }
225 }
226 #endif
227
228
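// Refresh the cached pool metadata from a newer OSDMap and compute
// newly_removed_snaps, the snaps that were removed since the epoch we last
// cached; cached_removed_snaps accumulates the full removed set.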
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 pi->build_removed_snaps(newly_removed_snaps);
241 interval_set<snapid_t> intersection;
242 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
243 if (intersection == cached_removed_snaps) {
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.union_of(newly_removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps = newly_removed_snaps;
251 newly_removed_snaps.clear();
252 }
253 snapc = pi->get_snap_context();
254 } else {
255 /* 1) map->get_epoch() == cached_epoch + 1 &&
256 * 2) pi->get_snap_epoch() != map->get_epoch()
257 *
258    * Since the if branch was not taken, 1 && 2 must be true. From 2, we
259    * know that
259 * this map didn't change the set of removed snaps. From 1, we
260 * know that our cached_removed_snaps matches the previous map.
261    * Thus, from 1 && 2, cached_removed_snaps matches the current
262 * set of removed snaps and all we have to do is clear
263 * newly_removed_snaps.
264 */
265 newly_removed_snaps.clear();
266 }
267 cached_epoch = map->get_epoch();
268 lgeneric_subdout(cct, osd, 20)
269 << "PGPool::update cached_removed_snaps "
270 << cached_removed_snaps
271 << " newly_removed_snaps "
272 << newly_removed_snaps
273 << " snapc " << snapc
274 << (updated ? " (updated)":" (no change)")
275 << dendl;
276 }
277
278 PG::PG(OSDService *o, OSDMapRef curmap,
279 const PGPool &_pool, spg_t p) :
280 osd(o),
281 cct(o->cct),
282 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
283 snap_mapper(
284 cct,
285 &osdriver,
286 p.ps(),
287 p.get_split_bits(curmap->get_pg_num(_pool.id)),
288 _pool.id,
289 p.shard),
290 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
291 _lock("PG::_lock"),
292 #ifdef PG_DEBUG_REFS
293 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
294 #endif
295 deleting(false),
296 trace_endpoint("0.0.0.0", 0, "PG"),
297 dirty_info(false), dirty_big_info(false),
298 info(p),
299 info_struct_v(0),
300 coll(p),
301 pg_log(cct),
302 pgmeta_oid(p.make_pgmeta_oid()),
303 missing_loc(this),
304 past_intervals(
305 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
306 *curmap),
307 stat_queue_item(this),
308 scrub_queued(false),
309 recovery_queued(false),
310 recovery_ops_active(0),
311 role(-1),
312 state(0),
313 send_notify(false),
314 pg_whoami(osd->whoami, p.shard),
315 need_up_thru(false),
316 last_peering_reset(0),
317 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318 backfill_reserved(false),
319 backfill_reserving(false),
320 flushes_in_progress(0),
321 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322 pg_stats_publish_valid(false),
323 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
324 finish_sync_event(NULL),
325 backoff_lock("PG::backoff_lock"),
326 scrub_after_recovery(false),
327 active_pushes(0),
328 recovery_state(this),
329 pg_id(p),
330 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
331 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
332 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 last_epoch(0)
334 {
335 #ifdef PG_DEBUG_REFS
336 osd->add_pgid(p, this);
337 #endif
338 #ifdef WITH_BLKIN
339 std::stringstream ss;
340 ss << "PG " << info.pgid;
341 trace_endpoint.copy_name(ss.str());
342 #endif
343 osr->shard_hint = p;
344 }
345
346 PG::~PG()
347 {
348 pgstate_history.set_pg_in_destructor();
349 #ifdef PG_DEBUG_REFS
350 osd->remove_pgid(info.pgid, this);
351 #endif
352 }
353
354 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
355 {
356 handle.suspend_tp_timeout();
357 lock();
358 handle.reset_tp_timeout();
359 }
360
361 void PG::lock(bool no_lockdep) const
362 {
363 _lock.Lock(no_lockdep);
364 // if we have unrecorded dirty state with the lock dropped, there is a bug
365 assert(!dirty_info);
366 assert(!dirty_big_info);
367
368 dout(30) << "lock" << dendl;
369 }
370
371 std::string PG::gen_prefix() const
372 {
373 stringstream out;
374 OSDMapRef mapref = osdmap_ref;
375 if (_lock.is_locked_by_me()) {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " " << *this << " ";
379 } else {
380 out << "osd." << osd->whoami
381 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
382 << " pg[" << info.pgid << "(unlocked)] ";
383 }
384 return out.str();
385 }
386
387 /********* PG **********/
388
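// Primary-only: merge the authoritative log chosen during peering into our
// own, record the sender's info and missing set, and advance
// last_epoch_started / last_interval_started if the sender's are newer.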
389 void PG::proc_master_log(
390 ObjectStore::Transaction& t, pg_info_t &oinfo,
391 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
392 {
393 dout(10) << "proc_master_log for osd." << from << ": "
394 << olog << " " << omissing << dendl;
395 assert(!is_peered() && is_primary());
396
397 // merge log into our own log to build master log. no need to
398 // make any adjustments to their missing map; we are taking their
399   // log to be authoritative (i.e., their entries are by definition
400 // non-divergent).
401 merge_log(t, oinfo, olog, from);
402 peer_info[from] = oinfo;
403 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
404 might_have_unfound.insert(from);
405
406 // See doc/dev/osd_internals/last_epoch_started
407 if (oinfo.last_epoch_started > info.last_epoch_started) {
408 info.last_epoch_started = oinfo.last_epoch_started;
409 dirty_info = true;
410 }
411 if (oinfo.last_interval_started > info.last_interval_started) {
412 info.last_interval_started = oinfo.last_interval_started;
413 dirty_info = true;
414 }
415 update_history(oinfo.history);
416 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
417 info.last_epoch_started >= info.history.last_epoch_started);
418
419 peer_missing[from].claim(omissing);
420 }
421
422 void PG::proc_replica_log(
423 pg_info_t &oinfo,
424 const pg_log_t &olog,
425 pg_missing_t& omissing,
426 pg_shard_t from)
427 {
428 dout(10) << "proc_replica_log for osd." << from << ": "
429 << oinfo << " " << olog << " " << omissing << dendl;
430
431 pg_log.proc_replica_log(oinfo, olog, omissing, from);
432
433 peer_info[from] = oinfo;
434 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
435 might_have_unfound.insert(from);
436
437 for (map<hobject_t, pg_missing_item>::const_iterator i =
438 omissing.get_items().begin();
439 i != omissing.get_items().end();
440 ++i) {
441 dout(20) << " after missing " << i->first << " need " << i->second.need
442 << " have " << i->second.have << dendl;
443 }
444 peer_missing[from].claim(omissing);
445 }
446
447 bool PG::proc_replica_info(
448 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
449 {
450 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
451 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
452 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
453 return false;
454 }
455
456 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
457 dout(10) << " got info " << oinfo << " from down osd." << from
458 << " discarding" << dendl;
459 return false;
460 }
461
462 dout(10) << " got osd." << from << " " << oinfo << dendl;
463 assert(is_primary());
464 peer_info[from] = oinfo;
465 might_have_unfound.insert(from);
466
467 update_history(oinfo.history);
468
469 // stray?
470 if (!is_up(from) && !is_acting(from)) {
471 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
472 stray_set.insert(from);
473 if (is_clean()) {
474 purge_strays();
475 }
476 }
477
478 // was this a new info? if so, update peers!
479 if (p == peer_info.end())
480 update_heartbeat_peers();
481
482 return true;
483 }
484
485 void PG::remove_snap_mapped_object(
486 ObjectStore::Transaction &t, const hobject_t &soid)
487 {
488 t.remove(
489 coll,
490 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
491 clear_object_snap_mapping(&t, soid);
492 }
493
494 void PG::clear_object_snap_mapping(
495 ObjectStore::Transaction *t, const hobject_t &soid)
496 {
497 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
498 if (soid.snap < CEPH_MAXSNAP) {
499 int r = snap_mapper.remove_oid(
500 soid,
501 &_t);
502 if (!(r == 0 || r == -ENOENT)) {
503 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
504 ceph_abort();
505 }
506 }
507 }
508
509 void PG::update_object_snap_mapping(
510 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
511 {
512 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
513 assert(soid.snap < CEPH_MAXSNAP);
514 int r = snap_mapper.remove_oid(
515 soid,
516 &_t);
517 if (!(r == 0 || r == -ENOENT)) {
518 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
519 ceph_abort();
520 }
521 snap_mapper.add_oid(
522 soid,
523 snaps,
524 &_t);
525 }
526
527 void PG::merge_log(
528 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
529 {
530 PGLogEntryHandler rollbacker{this, &t};
531 pg_log.merge_log(
532 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
533 }
534
535 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
536 {
537 PGLogEntryHandler rollbacker{this, &t};
538 pg_log.rewind_divergent_log(
539 newhead, info, &rollbacker, dirty_info, dirty_big_info);
540 }
541
542 /*
543 * Process information from a replica to determine if it could have any
544  * objects that I need.
545 *
546 * TODO: if the missing set becomes very large, this could get expensive.
547 * Instead, we probably want to just iterate over our unfound set.
548 */
549 bool PG::search_for_missing(
550 const pg_info_t &oinfo, const pg_missing_t &omissing,
551 pg_shard_t from,
552 RecoveryCtx *ctx)
553 {
554 uint64_t num_unfound_before = missing_loc.num_unfound();
555 bool found_missing = missing_loc.add_source_info(
556 from, oinfo, omissing, ctx->handle);
557 if (found_missing && num_unfound_before != missing_loc.num_unfound())
558 publish_stats_to_osd();
559 if (found_missing &&
560 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
561 CEPH_FEATURE_OSD_ERASURE_CODES)) {
562 pg_info_t tinfo(oinfo);
563 tinfo.pgid.shard = pg_whoami.shard;
564 (*(ctx->info_map))[from.osd].push_back(
565 make_pair(
566 pg_notify_t(
567 from.shard, pg_whoami.shard,
568 get_osdmap()->get_epoch(),
569 get_osdmap()->get_epoch(),
570 tinfo),
571 past_intervals));
572 }
573 return found_missing;
574 }
575
576 bool PG::MissingLoc::readable_with_acting(
577 const hobject_t &hoid,
578 const set<pg_shard_t> &acting) const {
579 if (!needs_recovery(hoid))
580 return true;
581 if (is_deleted(hoid))
582 return false;
583 auto missing_loc_entry = missing_loc.find(hoid);
584 if (missing_loc_entry == missing_loc.end())
585 return false;
586 const set<pg_shard_t> &locs = missing_loc_entry->second;
587 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
588 set<pg_shard_t> have_acting;
589 for (set<pg_shard_t>::const_iterator i = locs.begin();
590 i != locs.end();
591 ++i) {
592 if (acting.count(*i))
593 have_acting.insert(*i);
594 }
595 return (*is_readable)(have_acting);
596 }
597
598 void PG::MissingLoc::add_batch_sources_info(
599 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
600 {
601 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
602 << sources.size() << dendl;
603 unsigned loop = 0;
604 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
605 i != needs_recovery_map.end();
606 ++i) {
607 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
608 handle->reset_tp_timeout();
609 loop = 0;
610 }
611 if (i->second.is_delete())
612 continue;
613 missing_loc[i->first].insert(sources.begin(), sources.end());
614 missing_loc_sources.insert(sources.begin(), sources.end());
615 }
616 }
617
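// Record fromosd as a candidate location for each object in
// needs_recovery_map that it can actually provide, skipping objects beyond
// its last_backfill, objects with a mismatched backfill sort order, and
// objects whose needed version it does not have.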
618 bool PG::MissingLoc::add_source_info(
619 pg_shard_t fromosd,
620 const pg_info_t &oinfo,
621 const pg_missing_t &omissing,
622 ThreadPool::TPHandle* handle)
623 {
624 bool found_missing = false;
625 unsigned loop = 0;
626 // found items?
627 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
628 p != needs_recovery_map.end();
629 ++p) {
630 const hobject_t &soid(p->first);
631 eversion_t need = p->second.need;
632 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
633 handle->reset_tp_timeout();
634 loop = 0;
635 }
636 if (p->second.is_delete()) {
637 ldout(pg->cct, 10) << __func__ << " " << soid
638 << " delete, ignoring source" << dendl;
639 found_missing = true;
640 continue;
641 }
642 if (oinfo.last_update < need) {
643 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
644 << " also missing on osd." << fromosd
645 << " (last_update " << oinfo.last_update
646 << " < needed " << need << ")" << dendl;
647 continue;
648 }
649 if (!oinfo.last_backfill.is_max() &&
650 !oinfo.last_backfill_bitwise) {
651 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
652 << " also missing on osd." << fromosd
653 << " (last_backfill " << oinfo.last_backfill
654 << " but with wrong sort order)"
655 << dendl;
656 continue;
657 }
658 if (p->first >= oinfo.last_backfill) {
659 // FIXME: this is _probably_ true, although it could conceivably
660 // be in the undefined region! Hmm!
661 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662 << " also missing on osd." << fromosd
663 << " (past last_backfill " << oinfo.last_backfill
664 << ")" << dendl;
665 continue;
666 }
667 if (oinfo.last_complete < need) {
668 if (omissing.is_missing(soid)) {
669 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670 << " also missing on osd." << fromosd << dendl;
671 continue;
672 }
673 }
674
675 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
676 << " is on osd." << fromosd << dendl;
677
678 missing_loc[soid].insert(fromosd);
679 missing_loc_sources.insert(fromosd);
680 found_missing = true;
681 }
682
683 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
684 << dendl;
685 return found_missing;
686 }
687
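// Ask every up, non-empty peer in might_have_unfound that we have not
// already queried (for its log or missing set) for a FULLLOG, so that the
// locations of unfound objects can be discovered.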
688 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
689 {
690 auto &missing = pg_log.get_missing();
691 uint64_t unfound = get_num_unfound();
692 assert(unfound > 0);
693
694 dout(10) << __func__ << " "
695 << missing.num_missing() << " missing, "
696 << unfound << " unfound"
697 << dendl;
698
699 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
700 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
701 for (; m != mend; ++m) {
702 pg_shard_t peer(*m);
703
704 if (!get_osdmap()->is_up(peer.osd)) {
705 dout(20) << __func__ << " skipping down osd." << peer << dendl;
706 continue;
707 }
708
709 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
710 if (iter != peer_info.end() &&
711 (iter->second.is_empty() || iter->second.dne())) {
712 // ignore empty peers
713 continue;
714 }
715
716 // If we've requested any of this stuff, the pg_missing_t information
717 // should be on its way.
718     // TODO: coalesce requested_* into a single data structure
719 if (peer_missing.find(peer) != peer_missing.end()) {
720 dout(20) << __func__ << ": osd." << peer
721 << ": we already have pg_missing_t" << dendl;
722 continue;
723 }
724 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
725 dout(20) << __func__ << ": osd." << peer
726 << ": in peer_log_requested" << dendl;
727 continue;
728 }
729 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
730 dout(20) << __func__ << ": osd." << peer
731 << ": in peer_missing_requested" << dendl;
732 continue;
733 }
734
735 // Request missing
736 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
737 << dendl;
738 peer_missing_requested.insert(peer);
739 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
740 pg_query_t(
741 pg_query_t::FULLLOG,
742 peer.shard, pg_whoami.shard,
743 info.history, get_osdmap()->get_epoch());
744 }
745 }
746
747 /******* PG ***********/
748 bool PG::needs_recovery() const
749 {
750 assert(is_primary());
751
752 auto &missing = pg_log.get_missing();
753
754 if (missing.num_missing()) {
755 dout(10) << __func__ << " primary has " << missing.num_missing()
756 << " missing" << dendl;
757 return true;
758 }
759
760 assert(!actingbackfill.empty());
761 set<pg_shard_t>::const_iterator end = actingbackfill.end();
762 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
763 for (; a != end; ++a) {
764 if (*a == get_primary()) continue;
765 pg_shard_t peer = *a;
766 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
767 if (pm == peer_missing.end()) {
768 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
769 << dendl;
770 continue;
771 }
772 if (pm->second.num_missing()) {
773 dout(10) << __func__ << " osd." << peer << " has "
774 << pm->second.num_missing() << " missing" << dendl;
775 return true;
776 }
777 }
778
779 dout(10) << __func__ << " is recovered" << dendl;
780 return false;
781 }
782
783 bool PG::needs_backfill() const
784 {
785 assert(is_primary());
786
787   // We can assume that the only OSDs that might need backfill
788   // are those in backfill_targets.
789 set<pg_shard_t>::const_iterator end = backfill_targets.end();
790 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
791 for (; a != end; ++a) {
792 pg_shard_t peer = *a;
793 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
794 if (!pi->second.last_backfill.is_max()) {
795 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
796 return true;
797 }
798 }
799
800 dout(10) << __func__ << " does not need backfill" << dendl;
801 return false;
802 }
803
804
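// Sanity-check that past_intervals matches the bounds required by our info
// and the oldest stored map: log a cluster error on any discrepancy and
// assert when stored intervals are missing or misaligned.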
805 void PG::check_past_interval_bounds() const
806 {
807 auto rpib = get_required_past_interval_bounds(
808 info,
809 osd->get_superblock().oldest_map);
810 if (rpib.first >= rpib.second) {
811 if (!past_intervals.empty()) {
812 osd->clog->error() << info.pgid << " required past_interval bounds are"
813 << " empty [" << rpib << ") but past_intervals is not: "
814 << past_intervals;
815 derr << info.pgid << " required past_interval bounds are"
816 << " empty [" << rpib << ") but past_intervals is not: "
817 << past_intervals << dendl;
818 }
819 } else {
820 if (past_intervals.empty()) {
821 osd->clog->error() << info.pgid << " required past_interval bounds are"
822 << " not empty [" << rpib << ") but past_intervals "
823 << past_intervals << " is empty";
824 derr << info.pgid << " required past_interval bounds are"
825 << " not empty [" << rpib << ") but past_intervals "
826 << past_intervals << " is empty" << dendl;
827 assert(!past_intervals.empty());
828 }
829
830 auto apib = past_intervals.get_bounds();
831 if (apib.first > rpib.first) {
832 osd->clog->error() << info.pgid << " past_intervals [" << apib
833 << ") start interval does not contain the required"
834 << " bound [" << rpib << ") start";
835 derr << info.pgid << " past_intervals [" << apib
836 << ") start interval does not contain the required"
837 << " bound [" << rpib << ") start" << dendl;
838 assert(0 == "past_interval start interval mismatch");
839 }
840 if (apib.second != rpib.second) {
841       osd->clog->error() << info.pgid << " past_interval bound [" << apib
842 << ") end does not match required [" << rpib
843 << ") end";
844       derr << info.pgid << " past_interval bound [" << apib
845 << ") end does not match required [" << rpib
846 << ") end" << dendl;
847 assert(0 == "past_interval end mismatch");
848 }
849 }
850 }
851
852 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
853 {
854 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
855 if (need_up_thru &&
856 up_thru >= info.history.same_interval_since) {
857 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
858 need_up_thru = false;
859 return true;
860 }
861 return false;
862 }
863
864 void PG::remove_down_peer_info(const OSDMapRef osdmap)
865 {
866 // Remove any downed osds from peer_info
867 bool removed = false;
868 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
869 while (p != peer_info.end()) {
870 if (!osdmap->is_up(p->first.osd)) {
871 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
872 peer_missing.erase(p->first);
873 peer_log_requested.erase(p->first);
874 peer_missing_requested.erase(p->first);
875 peer_info.erase(p++);
876 removed = true;
877 } else
878 ++p;
879 }
880
881 // if we removed anyone, update peers (which include peer_info)
882 if (removed)
883 update_heartbeat_peers();
884 check_recovery_sources(osdmap);
885 }
886
887 /*
888 * Returns true unless there is a non-lost OSD in might_have_unfound.
889 */
890 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
891 {
892 assert(is_primary());
893
894 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
895 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
896 for (; peer != mend; ++peer) {
897 if (peer_missing.count(*peer))
898 continue;
899 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
900 if (iter != peer_info.end() &&
901 (iter->second.is_empty() || iter->second.dne()))
902 continue;
903 if (!osdmap->exists(peer->osd))
904 continue;
905 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
906 if (osd_info.lost_at <= osd_info.up_from) {
907 // If there is even one OSD in might_have_unfound that isn't lost, we
908 // still might retrieve our unfound.
909 return false;
910 }
911 }
912 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
913 << " have been queried or are marked lost" << dendl;
914 return true;
915 }
916
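// Build the set of OSDs to probe during peering from past_intervals,
// classifying each prior OSD as UP, DNE, LOST or DOWN; mark the PG down if
// peering is blocked and decide whether up_thru must be published to the
// monitor first.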
917 PastIntervals::PriorSet PG::build_prior()
918 {
919 if (1) {
920 // sanity check
921 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
922 it != peer_info.end();
923 ++it) {
924 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
925 }
926 }
927
928 const OSDMap &osdmap = *get_osdmap();
929 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
930 pool.info.ec_pool(),
931 info.history.last_epoch_started,
932 get_pgbackend()->get_is_recoverable_predicate(),
933 [&](epoch_t start, int osd, epoch_t *lost_at) {
934 const osd_info_t *pinfo = 0;
935 if (osdmap.exists(osd)) {
936 pinfo = &osdmap.get_info(osd);
937 if (lost_at)
938 *lost_at = pinfo->lost_at;
939 }
940
941 if (osdmap.is_up(osd)) {
942 return PastIntervals::UP;
943 } else if (!pinfo) {
944 return PastIntervals::DNE;
945 } else if (pinfo->lost_at > start) {
946 return PastIntervals::LOST;
947 } else {
948 return PastIntervals::DOWN;
949 }
950 },
951 up,
952 acting,
953 this);
954
955 if (prior.pg_down) {
956 state_set(PG_STATE_DOWN);
957 }
958
959 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
960 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
961 << " < same_since " << info.history.same_interval_since
962 << ", must notify monitor" << dendl;
963 need_up_thru = true;
964 } else {
965 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
966 << " >= same_since " << info.history.same_interval_since
967 << ", all is well" << dendl;
968 need_up_thru = false;
969 }
970 set_probe_targets(prior.probe);
971 return prior;
972 }
973
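// Drop all primary-only peering and recovery state (peer infos and missing
// sets, missing_loc, scrub reservations, ...) so it can be rebuilt on the
// next peering attempt.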
974 void PG::clear_primary_state()
975 {
976 dout(10) << "clear_primary_state" << dendl;
977
978 // clear peering state
979 stray_set.clear();
980 peer_log_requested.clear();
981 peer_missing_requested.clear();
982 peer_info.clear();
983 peer_missing.clear();
984 need_up_thru = false;
985 peer_last_complete_ondisk.clear();
986 peer_activated.clear();
987 min_last_complete_ondisk = eversion_t();
988 pg_trim_to = eversion_t();
989 might_have_unfound.clear();
990 projected_log = PGLog::IndexedLog();
991
992 last_update_ondisk = eversion_t();
993
994 snap_trimq.clear();
995
996 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
997
998 missing_loc.clear();
999
1000 release_pg_backoffs();
1001
1002 pg_log.reset_recovery_pointers();
1003
1004 scrubber.reserved_peers.clear();
1005 scrub_after_recovery = false;
1006
1007 agent_clear();
1008 }
1009
1010 PG::Scrubber::Scrubber()
1011 : reserved(false), reserve_failed(false),
1012 epoch_start(0),
1013 active(false),
1014 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
1015 must_scrub(false), must_deep_scrub(false), must_repair(false),
1016 auto_repair(false),
1017 num_digest_updates_pending(0),
1018 state(INACTIVE),
1019 deep(false),
1020 seed(0)
1021 {}
1022
1023 PG::Scrubber::~Scrubber() {}
1024
1025 /**
1026 * find_best_info
1027 *
1028 * Returns an iterator to the best info in infos sorted by:
1029 * 1) Prefer newer last_update
1030 * 2) Prefer longer tail if it brings another info into contiguity
1031 * 3) Prefer current primary
1032 */
1033 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1034 const map<pg_shard_t, pg_info_t> &infos,
1035 bool restrict_to_up_acting,
1036 bool *history_les_bound) const
1037 {
1038 assert(history_les_bound);
1039 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1040 * to make changes to this process. Also, make sure to update it
1041 * when you find bugs! */
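  // Two passes: first find the newest last_epoch_started across the infos,
  // then take min_last_update_acceptable as the oldest last_update among
  // infos that reached that epoch; anything older cannot be authoritative.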
1042 eversion_t min_last_update_acceptable = eversion_t::max();
1043 epoch_t max_last_epoch_started_found = 0;
1044 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1045 i != infos.end();
1046 ++i) {
1047 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1048 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1049 *history_les_bound = true;
1050 max_last_epoch_started_found = i->second.history.last_epoch_started;
1051 }
1052 if (!i->second.is_incomplete() &&
1053 max_last_epoch_started_found < i->second.last_epoch_started) {
1054 max_last_epoch_started_found = i->second.last_epoch_started;
1055 }
1056 }
1057 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1058 i != infos.end();
1059 ++i) {
1060 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1061 if (min_last_update_acceptable > i->second.last_update)
1062 min_last_update_acceptable = i->second.last_update;
1063 }
1064 }
1065 if (min_last_update_acceptable == eversion_t::max())
1066 return infos.end();
1067
1068 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1069 // find osd with newest last_update (oldest for ec_pool).
1070 // if there are multiples, prefer
1071 // - a longer tail, if it brings another peer into log contiguity
1072 // - the current primary
1073 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1074 p != infos.end();
1075 ++p) {
1076 if (restrict_to_up_acting && !is_up(p->first) &&
1077 !is_acting(p->first))
1078 continue;
1079 // Only consider peers with last_update >= min_last_update_acceptable
1080 if (p->second.last_update < min_last_update_acceptable)
1081 continue;
1082 // Disqualify anyone with a too old last_epoch_started
1083 if (p->second.last_epoch_started < max_last_epoch_started_found)
1084 continue;
1085 // Disqualify anyone who is incomplete (not fully backfilled)
1086 if (p->second.is_incomplete())
1087 continue;
1088 if (best == infos.end()) {
1089 best = p;
1090 continue;
1091 }
1092 // Prefer newer last_update
1093 if (pool.info.require_rollback()) {
1094 if (p->second.last_update > best->second.last_update)
1095 continue;
1096 if (p->second.last_update < best->second.last_update) {
1097 best = p;
1098 continue;
1099 }
1100 } else {
1101 if (p->second.last_update < best->second.last_update)
1102 continue;
1103 if (p->second.last_update > best->second.last_update) {
1104 best = p;
1105 continue;
1106 }
1107 }
1108
1109 // Prefer longer tail
1110 if (p->second.log_tail > best->second.log_tail) {
1111 continue;
1112 } else if (p->second.log_tail < best->second.log_tail) {
1113 best = p;
1114 continue;
1115 }
1116
1117 // prefer current primary (usually the caller), all things being equal
1118 if (p->first == pg_whoami) {
1119 dout(10) << "calc_acting prefer osd." << p->first
1120 << " because it is current primary" << dendl;
1121 best = p;
1122 continue;
1123 }
1124 }
1125 return best;
1126 }
1127
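// Choose the EC acting set shard by shard: prefer the up osd for each
// position, then the current acting osd, then (unless restricted) any
// stray, requiring the candidate to be complete and log-contiguous with the
// auth log shard; up osds that cannot serve their position are marked for
// backfill.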
1128 void PG::calc_ec_acting(
1129 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1130 unsigned size,
1131 const vector<int> &acting,
1132 pg_shard_t acting_primary,
1133 const vector<int> &up,
1134 pg_shard_t up_primary,
1135 const map<pg_shard_t, pg_info_t> &all_info,
1136 bool restrict_to_up_acting,
1137 vector<int> *_want,
1138 set<pg_shard_t> *backfill,
1139 set<pg_shard_t> *acting_backfill,
1140 pg_shard_t *want_primary,
1141 ostream &ss)
1142 {
1143 vector<int> want(size, CRUSH_ITEM_NONE);
1144 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1145 unsigned usable = 0;
1146 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1147 i != all_info.end();
1148 ++i) {
1149 all_info_by_shard[i->first.shard].insert(i->first);
1150 }
1151 for (uint8_t i = 0; i < want.size(); ++i) {
1152 ss << "For position " << (unsigned)i << ": ";
1153 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1154 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1155 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1156 auth_log_shard->second.log_tail) {
1157 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1158 want[i] = up[i];
1159 ++usable;
1160 continue;
1161 }
1162 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1163 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1164 << " and ";
1165 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1166 }
1167
1168 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1169 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1170 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1171 auth_log_shard->second.log_tail) {
1172 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1173 want[i] = acting[i];
1174 ++usable;
1175 } else if (!restrict_to_up_acting) {
1176 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1177 j != all_info_by_shard[shard_id_t(i)].end();
1178 ++j) {
1179 assert(j->shard == i);
1180 if (!all_info.find(*j)->second.is_incomplete() &&
1181 all_info.find(*j)->second.last_update >=
1182 auth_log_shard->second.log_tail) {
1183 ss << " selecting stray: " << *j << std::endl;
1184 want[i] = j->osd;
1185 ++usable;
1186 break;
1187 }
1188 }
1189 if (want[i] == CRUSH_ITEM_NONE)
1190 ss << " failed to fill position " << (int)i << std::endl;
1191 }
1192 }
1193
1194 bool found_primary = false;
1195 for (uint8_t i = 0; i < want.size(); ++i) {
1196 if (want[i] != CRUSH_ITEM_NONE) {
1197 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1198 if (!found_primary) {
1199 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1200 found_primary = true;
1201 }
1202 }
1203 }
1204 acting_backfill->insert(backfill->begin(), backfill->end());
1205 _want->swap(want);
1206 }
1207
1208 /**
1209 * calculate the desired acting set.
1210 *
1211 * Choose an appropriate acting set. Prefer up[0], unless it is
1212 * incomplete, or another osd has a longer tail that allows us to
1213 * bring other up nodes up to date.
1214 */
1215 void PG::calc_replicated_acting(
1216 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1217 unsigned size,
1218 const vector<int> &acting,
1219 pg_shard_t acting_primary,
1220 const vector<int> &up,
1221 pg_shard_t up_primary,
1222 const map<pg_shard_t, pg_info_t> &all_info,
1223 bool restrict_to_up_acting,
1224 vector<int> *want,
1225 set<pg_shard_t> *backfill,
1226 set<pg_shard_t> *acting_backfill,
1227 pg_shard_t *want_primary,
1228 ostream &ss)
1229 {
1230 ss << "calc_acting newest update on osd." << auth_log_shard->first
1231 << " with " << auth_log_shard->second
1232 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1233 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1234
1235 // select primary
1236 map<pg_shard_t,pg_info_t>::const_iterator primary;
1237 if (up.size() &&
1238 !all_info.find(up_primary)->second.is_incomplete() &&
1239 all_info.find(up_primary)->second.last_update >=
1240 auth_log_shard->second.log_tail) {
1241 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1242     primary = all_info.find(up_primary); // prefer up[0], all things being equal
1243 } else {
1244 assert(!auth_log_shard->second.is_incomplete());
1245 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1246 << " selected as primary instead" << std::endl;
1247 primary = auth_log_shard;
1248 }
1249
1250 ss << "calc_acting primary is osd." << primary->first
1251 << " with " << primary->second << std::endl;
1252 *want_primary = primary->first;
1253 want->push_back(primary->first.osd);
1254 acting_backfill->insert(primary->first);
1255 unsigned usable = 1;
1256
1257 // select replicas that have log contiguity with primary.
1258 // prefer up, then acting, then any peer_info osds
1259 for (vector<int>::const_iterator i = up.begin();
1260 i != up.end();
1261 ++i) {
1262 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1263 if (up_cand == primary->first)
1264 continue;
1265 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1266 if (cur_info.is_incomplete() ||
1267 cur_info.last_update < MIN(
1268 primary->second.log_tail,
1269 auth_log_shard->second.log_tail)) {
1270 /* We include auth_log_shard->second.log_tail because in GetLog,
1271 * we will request logs back to the min last_update over our
1272 * acting_backfill set, which will result in our log being extended
1273 * as far backwards as necessary to pick up any peers which can
1274 * be log recovered by auth_log_shard's log */
1275 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1276 backfill->insert(up_cand);
1277 acting_backfill->insert(up_cand);
1278 } else {
1279 want->push_back(*i);
1280 acting_backfill->insert(up_cand);
1281 usable++;
1282 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1283 }
1284 }
1285
1286 // This no longer has backfill OSDs, but they are covered above.
1287 for (vector<int>::const_iterator i = acting.begin();
1288 i != acting.end();
1289 ++i) {
1290 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1291 if (usable >= size)
1292 break;
1293
1294 // skip up osds we already considered above
1295 if (acting_cand == primary->first)
1296 continue;
1297 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1298 if (up_it != up.end())
1299 continue;
1300
1301 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1302 if (cur_info.is_incomplete() ||
1303 cur_info.last_update < primary->second.log_tail) {
1304 ss << " shard " << acting_cand << " (stray) REJECTED "
1305 << cur_info << std::endl;
1306 } else {
1307 want->push_back(*i);
1308 acting_backfill->insert(acting_cand);
1309 ss << " shard " << acting_cand << " (stray) accepted "
1310 << cur_info << std::endl;
1311 usable++;
1312 }
1313 }
1314
1315 if (restrict_to_up_acting) {
1316 return;
1317 }
1318 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1319 i != all_info.end();
1320 ++i) {
1321 if (usable >= size)
1322 break;
1323
1324 // skip up osds we already considered above
1325 if (i->first == primary->first)
1326 continue;
1327 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1328 if (up_it != up.end())
1329 continue;
1330 vector<int>::const_iterator acting_it = find(
1331 acting.begin(), acting.end(), i->first.osd);
1332 if (acting_it != acting.end())
1333 continue;
1334
1335 if (i->second.is_incomplete() ||
1336 i->second.last_update < primary->second.log_tail) {
1337 ss << " shard " << i->first << " (stray) REJECTED "
1338 << i->second << std::endl;
1339 } else {
1340 want->push_back(i->first.osd);
1341 acting_backfill->insert(i->first);
1342 ss << " shard " << i->first << " (stray) accepted "
1343 << i->second << std::endl;
1344 usable++;
1345 }
1346 }
1347 }
1348
1349 /**
1350 * choose acting
1351 *
1352 * calculate the desired acting, and request a change with the monitor
1353 * if it differs from the current acting.
1354 *
1355 * if restrict_to_up_acting=true, we filter out anything that's not in
1356 * up/acting. in order to lift this restriction, we need to
1357 * 1) check whether it's worth switching the acting set any time we get
1358 * a new pg info (not just here, when recovery finishes)
1359 * 2) check whether anything in want_acting went down on each new map
1360 * (and, if so, calculate a new want_acting)
1361 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1362 * TODO!
1363 */
1364 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1365 bool restrict_to_up_acting,
1366 bool *history_les_bound)
1367 {
1368 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1369 all_info[pg_whoami] = info;
1370
1371 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1372 p != all_info.end();
1373 ++p) {
1374 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1375 }
1376
1377 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1378 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1379
1380 if (auth_log_shard == all_info.end()) {
1381 if (up != acting) {
1382 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1383 << " reverting to up" << dendl;
1384 want_acting = up;
1385 vector<int> empty;
1386 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1387 } else {
1388 dout(10) << "choose_acting failed" << dendl;
1389 assert(want_acting.empty());
1390 }
1391 return false;
1392 }
1393
1394 assert(!auth_log_shard->second.is_incomplete());
1395 auth_log_shard_id = auth_log_shard->first;
1396
1397 set<pg_shard_t> want_backfill, want_acting_backfill;
1398 vector<int> want;
1399 pg_shard_t want_primary;
1400 stringstream ss;
1401 if (!pool.info.ec_pool())
1402 calc_replicated_acting(
1403 auth_log_shard,
1404 get_osdmap()->get_pg_size(info.pgid.pgid),
1405 acting,
1406 primary,
1407 up,
1408 up_primary,
1409 all_info,
1410 restrict_to_up_acting,
1411 &want,
1412 &want_backfill,
1413 &want_acting_backfill,
1414 &want_primary,
1415 ss);
1416 else
1417 calc_ec_acting(
1418 auth_log_shard,
1419 get_osdmap()->get_pg_size(info.pgid.pgid),
1420 acting,
1421 primary,
1422 up,
1423 up_primary,
1424 all_info,
1425 restrict_to_up_acting,
1426 &want,
1427 &want_backfill,
1428 &want_acting_backfill,
1429 &want_primary,
1430 ss);
1431 dout(10) << ss.str() << dendl;
1432
1433 unsigned num_want_acting = 0;
1434 set<pg_shard_t> have;
1435 for (int i = 0; i < (int)want.size(); ++i) {
1436 if (want[i] != CRUSH_ITEM_NONE) {
1437 ++num_want_acting;
1438 have.insert(
1439 pg_shard_t(
1440 want[i],
1441 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1442 }
1443 }
1444
1445 // We go incomplete if below min_size for ec_pools since backfill
1446 // does not currently maintain rollbackability
1447 // Otherwise, we will go "peered", but not "active"
1448 if (num_want_acting < pool.info.min_size &&
1449 (pool.info.ec_pool() ||
1450 !cct->_conf->osd_allow_recovery_below_min_size)) {
1451 want_acting.clear();
1452 dout(10) << "choose_acting failed, below min size" << dendl;
1453 return false;
1454 }
1455
1456 /* Check whether we have enough acting shards to later perform recovery */
1457 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1458 get_pgbackend()->get_is_recoverable_predicate());
1459 if (!(*recoverable_predicate)(have)) {
1460 want_acting.clear();
1461 dout(10) << "choose_acting failed, not recoverable" << dendl;
1462 return false;
1463 }
1464
1465 if (want != acting) {
1466 dout(10) << "choose_acting want " << want << " != acting " << acting
1467 << ", requesting pg_temp change" << dendl;
1468 want_acting = want;
1469
1470 if (want_acting == up) {
1471 // There can't be any pending backfill if
1472 // want is the same as crush map up OSDs.
1473 assert(want_backfill.empty());
1474 vector<int> empty;
1475 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1476 } else
1477 osd->queue_want_pg_temp(info.pgid.pgid, want);
1478 return false;
1479 }
1480 want_acting.clear();
1481 actingbackfill = want_acting_backfill;
1482 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1483 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1484 if (backfill_targets.empty()) {
1485 // Caller is GetInfo
1486 backfill_targets = want_backfill;
1487 }
1488 // Will not change if already set because up would have had to change
1489 // Verify that nothing in backfill is in stray_set
1490 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1491 i != want_backfill.end();
1492 ++i) {
1493 assert(stray_set.find(*i) == stray_set.end());
1494 }
1495 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1496 << want_backfill << dendl;
1497 return true;
1498 }
1499
1500 /* Build the might_have_unfound set.
1501 *
1502 * This is used by the primary OSD during recovery.
1503 *
1504 * This set tracks the OSDs which might have unfound objects that the primary
1505 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1506 * will remove the OSD from the set.
1507 */
1508 void PG::build_might_have_unfound()
1509 {
1510 assert(might_have_unfound.empty());
1511 assert(is_primary());
1512
1513 dout(10) << __func__ << dendl;
1514
1515 check_past_interval_bounds();
1516
1517 might_have_unfound = past_intervals.get_might_have_unfound(
1518 pg_whoami,
1519 pool.info.ec_pool());
1520
1521 // include any (stray) peers
1522 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1523 p != peer_info.end();
1524 ++p)
1525 might_have_unfound.insert(p->first);
1526
1527 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1528 }
1529
1530 struct C_PG_ActivateCommitted : public Context {
1531 PGRef pg;
1532 epoch_t epoch;
1533 epoch_t activation_epoch;
1534 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1535 : pg(p), epoch(e), activation_epoch(ae) {}
1536 void finish(int r) override {
1537 pg->_activate_committed(epoch, activation_epoch);
1538 }
1539 };
1540
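// Bring the PG to the active state for this interval: update
// last_epoch_started, register the on-commit callback, and initialize the
// complete pointer; on the primary, also rebuild snap_trimq, send each peer
// either an info, a log tail to catch up from, or a full backfill reset,
// and populate missing_loc, setting DEGRADED/UNDERSIZED as needed.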
1541 void PG::activate(ObjectStore::Transaction& t,
1542 epoch_t activation_epoch,
1543 list<Context*>& tfin,
1544 map<int, map<spg_t,pg_query_t> >& query_map,
1545 map<int,
1546 vector<
1547 pair<pg_notify_t,
1548 PastIntervals> > > *activator_map,
1549 RecoveryCtx *ctx)
1550 {
1551 assert(!is_peered());
1552 assert(scrubber.callbacks.empty());
1553 assert(callbacks_for_degraded_object.empty());
1554
1555 // twiddle pg state
1556 state_clear(PG_STATE_DOWN);
1557
1558 send_notify = false;
1559
1560 if (is_primary()) {
1561 // only update primary last_epoch_started if we will go active
1562 if (acting.size() >= pool.info.min_size) {
1563 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1564 info.last_epoch_started <= activation_epoch);
1565 info.last_epoch_started = activation_epoch;
1566 info.last_interval_started = info.history.same_interval_since;
1567 }
1568 } else if (is_acting(pg_whoami)) {
1569 /* update last_epoch_started on acting replica to whatever the primary sent
1570 * unless it's smaller (could happen if we are going peered rather than
1571 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1572 if (info.last_epoch_started < activation_epoch) {
1573 info.last_epoch_started = activation_epoch;
1574 info.last_interval_started = info.history.same_interval_since;
1575 }
1576 }
1577
1578 auto &missing = pg_log.get_missing();
1579
1580 if (is_primary()) {
1581 last_update_ondisk = info.last_update;
1582 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1583 }
1584 last_update_applied = info.last_update;
1585 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1586
1587 need_up_thru = false;
1588
1589 // write pg info, log
1590 dirty_info = true;
1591 dirty_big_info = true; // maybe
1592
1593 // find out when we commit
1594 t.register_on_complete(
1595 new C_PG_ActivateCommitted(
1596 this,
1597 get_osdmap()->get_epoch(),
1598 activation_epoch));
1599
1600 // initialize snap_trimq
1601 if (is_primary()) {
1602 dout(20) << "activate - purged_snaps " << info.purged_snaps
1603 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1604 snap_trimq = pool.cached_removed_snaps;
1605 interval_set<snapid_t> intersection;
1606 intersection.intersection_of(snap_trimq, info.purged_snaps);
1607 if (intersection == info.purged_snaps) {
1608 snap_trimq.subtract(info.purged_snaps);
1609 } else {
1610 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1611 << ") is not a subset of pool.cached_removed_snaps ("
1612 << pool.cached_removed_snaps << ")" << dendl;
1613 snap_trimq.subtract(intersection);
1614 }
1615 }
1616
1617 // init complete pointer
1618 if (missing.num_missing() == 0) {
1619 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1620 << " -> " << info.last_update << dendl;
1621 info.last_complete = info.last_update;
1622 pg_log.reset_recovery_pointers();
1623 } else {
1624 dout(10) << "activate - not complete, " << missing << dendl;
1625 pg_log.activate_not_complete(info);
1626 }
1627
1628 log_weirdness();
1629
1630 // if primary..
1631 if (is_primary()) {
1632 assert(ctx);
1633 // start up replicas
1634
1635 assert(!actingbackfill.empty());
1636 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1637 i != actingbackfill.end();
1638 ++i) {
1639 if (*i == pg_whoami) continue;
1640 pg_shard_t peer = *i;
1641 assert(peer_info.count(peer));
1642 pg_info_t& pi = peer_info[peer];
1643
1644 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1645
1646 MOSDPGLog *m = 0;
1647 pg_missing_t& pm = peer_missing[peer];
1648
1649 bool needs_past_intervals = pi.dne();
1650
1651 /*
1652 * cover case where peer sort order was different and
1653 * last_backfill cannot be interpreted
1654 */
1655 bool force_restart_backfill =
1656 !pi.last_backfill.is_max() &&
1657 !pi.last_backfill_bitwise;
1658
1659 if (pi.last_update == info.last_update && !force_restart_backfill) {
1660 // empty log
1661 if (!pi.last_backfill.is_max())
1662 osd->clog->info() << info.pgid << " continuing backfill to osd."
1663 << peer
1664 << " from (" << pi.log_tail << "," << pi.last_update
1665 << "] " << pi.last_backfill
1666 << " to " << info.last_update;
1667 if (!pi.is_empty() && activator_map) {
1668 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1669 (*activator_map)[peer.osd].push_back(
1670 make_pair(
1671 pg_notify_t(
1672 peer.shard, pg_whoami.shard,
1673 get_osdmap()->get_epoch(),
1674 get_osdmap()->get_epoch(),
1675 info),
1676 past_intervals));
1677 } else {
1678 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1679 m = new MOSDPGLog(
1680 i->shard, pg_whoami.shard,
1681 get_osdmap()->get_epoch(), info);
1682 }
1683 } else if (
1684 pg_log.get_tail() > pi.last_update ||
1685 pi.last_backfill == hobject_t() ||
1686 force_restart_backfill ||
1687 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1688 /* ^ This last case covers a situation where a replica is not contiguous
1689 * with the auth_log, but is contiguous with this replica. Reshuffling
1690 * the active set to handle this would be tricky, so instead we just go
1691        * ahead and backfill it anyway. This is probably preferable in any
1692 * case since the replica in question would have to be significantly
1693 * behind.
1694 */
1695 // backfill
1696 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1697 << " from (" << pi.log_tail << "," << pi.last_update
1698 << "] " << pi.last_backfill
1699 << " to " << info.last_update;
1700
1701 pi.last_update = info.last_update;
1702 pi.last_complete = info.last_update;
1703 pi.set_last_backfill(hobject_t());
1704 pi.last_epoch_started = info.last_epoch_started;
1705 pi.last_interval_started = info.last_interval_started;
1706 pi.history = info.history;
1707 pi.hit_set = info.hit_set;
1708 pi.stats.stats.clear();
1709
1710 // initialize peer with our purged_snaps.
1711 pi.purged_snaps = info.purged_snaps;
1712
1713 m = new MOSDPGLog(
1714 i->shard, pg_whoami.shard,
1715 get_osdmap()->get_epoch(), pi);
1716
1717 // send some recent log, so that op dup detection works well.
1718 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1719 m->info.log_tail = m->log.tail;
1720 pi.log_tail = m->log.tail; // sigh...
1721
1722 pm.clear();
1723 } else {
1724 // catch up
1725 assert(pg_log.get_tail() <= pi.last_update);
1726 m = new MOSDPGLog(
1727 i->shard, pg_whoami.shard,
1728 get_osdmap()->get_epoch(), info);
1729 // send new stuff to append to replicas log
1730 m->log.copy_after(pg_log.get_log(), pi.last_update);
1731 }
1732
1733       // share past_intervals if we are creating the pg on the replica,
1734       // i.e., if our info for that peer was dne() *before* updating
1735       // pi.history in the backfill block above.
1736 if (m && needs_past_intervals)
1737 m->past_intervals = past_intervals;
1738
1739 // update local version of peer's missing list!
1740 if (m && pi.last_backfill != hobject_t()) {
1741 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1742 p != m->log.log.end();
1743 ++p) {
1744 if (p->soid <= pi.last_backfill &&
1745 !p->is_error()) {
1746 if (perform_deletes_during_peering() && p->is_delete()) {
1747 pm.rm(p->soid, p->version);
1748 } else {
1749 pm.add_next_event(*p);
1750 }
1751 }
1752 }
1753 }
1754
1755 if (m) {
1756 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1757 //m->log.print(cout);
1758 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1759 }
1760
1761 // peer now has the log up to info.last_update
1762 pi.last_update = info.last_update;
1763
1764 // update our missing
1765 if (pm.num_missing() == 0) {
1766 pi.last_complete = pi.last_update;
1767 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1768 } else {
1769 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1770 }
1771 }
1772
1773 // Set up missing_loc
1774 set<pg_shard_t> complete_shards;
1775 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1776 i != actingbackfill.end();
1777 ++i) {
1778 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1779 if (*i == get_primary()) {
1780 missing_loc.add_active_missing(missing);
1781 if (!missing.have_missing())
1782 complete_shards.insert(*i);
1783 } else {
1784 auto peer_missing_entry = peer_missing.find(*i);
1785 assert(peer_missing_entry != peer_missing.end());
1786 missing_loc.add_active_missing(peer_missing_entry->second);
1787 if (!peer_missing_entry->second.have_missing() &&
1788 peer_info[*i].last_backfill.is_max())
1789 complete_shards.insert(*i);
1790 }
1791 }
1792 // If necessary, create might_have_unfound to help us find our unfound objects.
1793 // NOTE: It's important that we build might_have_unfound before trimming the
1794 // past intervals.
1795 might_have_unfound.clear();
1796 if (needs_recovery()) {
1797 // If only one shard has missing objects, we add all others as recovery
1798 // sources; this is considered safe since the PGLogs have been merged locally,
1799 // and it covers the vast majority of use cases, like one OSD/host being down
1800 // for a while for hardware repair
1801 if (complete_shards.size() + 1 == actingbackfill.size()) {
1802 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1803 } else {
1804 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1805 ctx->handle);
1806 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1807 i != actingbackfill.end();
1808 ++i) {
1809 if (*i == pg_whoami) continue;
1810 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1811 assert(peer_missing.count(*i));
1812 assert(peer_info.count(*i));
1813 missing_loc.add_source_info(
1814 *i,
1815 peer_info[*i],
1816 peer_missing[*i],
1817 ctx->handle);
1818 }
1819 }
1820 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1821 i != peer_missing.end();
1822 ++i) {
1823 if (is_actingbackfill(i->first))
1824 continue;
1825 assert(peer_info.count(i->first));
1826 search_for_missing(
1827 peer_info[i->first],
1828 i->second,
1829 i->first,
1830 ctx);
1831 }
1832
1833 build_might_have_unfound();
1834
1835 state_set(PG_STATE_DEGRADED);
1836 if (have_unfound())
1837 discover_all_missing(query_map);
1838 }
1839
1840 // degraded?
1841 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1842 state_set(PG_STATE_DEGRADED);
1843 state_set(PG_STATE_UNDERSIZED);
1844 }
1845
1846 state_set(PG_STATE_ACTIVATING);
1847 release_pg_backoffs();
1848 projected_last_update = info.last_update;
1849 }
1850 if (acting.size() >= pool.info.min_size) {
1851 PGLogEntryHandler handler{this, &t};
1852 pg_log.roll_forward(&handler);
1853 }
1854 }
1855
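// Verify that the client's session caps allow this op on this pool/namespace/object;
// only MOSDOp messages are checked -- everything else is allowed through.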
1856 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1857 {
1858 // only check MOSDOp
1859 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1860 return true;
1861
1862 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1863
1864 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1865 if (!session) {
1866 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1867 return false;
1868 }
1869 OSDCap& caps = session->caps;
1870 session->put();
1871
1872 const string &key = req->get_hobj().get_key().empty() ?
1873 req->get_oid().name :
1874 req->get_hobj().get_key();
1875
1876 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1877 pool.auid, key,
1878 op->need_read_cap(),
1879 op->need_write_cap(),
1880 op->classes());
1881
1882 dout(20) << "op_has_sufficient_caps "
1883 << "session=" << session
1884 << " pool=" << pool.id << " (" << pool.name
1885 << " " << req->get_hobj().nspace
1886 << ") owner=" << pool.auid
1887 << " need_read_cap=" << op->need_read_cap()
1888 << " need_write_cap=" << op->need_write_cap()
1889 << " classes=" << op->classes()
1890 << " -> " << (cap ? "yes" : "NO")
1891 << dendl;
1892 return cap;
1893 }
1894
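// Completion callback for the activation transaction: the primary records its own
// activation (and fires all_activated_and_committed() once every shard has reported),
// while replicas notify the primary and go ACTIVE or PEERED depending on min_size.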
1895 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1896 {
1897 lock();
1898 if (pg_has_reset_since(epoch)) {
1899 dout(10) << "_activate_committed " << epoch
1900 << ", that was an old interval" << dendl;
1901 } else if (is_primary()) {
1902 peer_activated.insert(pg_whoami);
1903 dout(10) << "_activate_committed " << epoch
1904 << " peer_activated now " << peer_activated
1905 << " last_interval_started " << info.history.last_interval_started
1906 << " last_epoch_started " << info.history.last_epoch_started
1907 << " same_interval_since " << info.history.same_interval_since << dendl;
1908 assert(!actingbackfill.empty());
1909 if (peer_activated.size() == actingbackfill.size())
1910 all_activated_and_committed();
1911 } else {
1912 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1913 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1914 pg_notify_t i = pg_notify_t(
1915 get_primary().shard, pg_whoami.shard,
1916 get_osdmap()->get_epoch(),
1917 get_osdmap()->get_epoch(),
1918 info);
1919
1920 i.info.history.last_epoch_started = activation_epoch;
1921 i.info.history.last_interval_started = i.info.history.same_interval_since;
1922 if (acting.size() >= pool.info.min_size) {
1923 state_set(PG_STATE_ACTIVE);
1924 } else {
1925 state_set(PG_STATE_PEERED);
1926 }
1927
1928 m->pg_list.push_back(make_pair(i, PastIntervals()));
1929 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1930
1931 // waiters
1932 if (flushes_in_progress == 0) {
1933 requeue_ops(waiting_for_peered);
1934 }
1935 }
1936
1937 assert(!dirty_info);
1938
1939 unlock();
1940 }
1941
1942 /*
1943 * update info.history.last_epoch_started ONLY after we and all
1944 * replicas have activated AND committed the activate transaction
1945 * (i.e. the peering results are stable on disk).
1946 */
1947 void PG::all_activated_and_committed()
1948 {
1949 dout(10) << "all_activated_and_committed" << dendl;
1950 assert(is_primary());
1951 assert(peer_activated.size() == actingbackfill.size());
1952 assert(!actingbackfill.empty());
1953 assert(blocked_by.empty());
1954
1955 queue_peering_event(
1956 CephPeeringEvtRef(
1957 std::make_shared<CephPeeringEvt>(
1958 get_osdmap()->get_epoch(),
1959 get_osdmap()->get_epoch(),
1960 AllReplicasActivated())));
1961 }
1962
1963 bool PG::requeue_scrub(bool high_priority)
1964 {
1965 assert(is_locked());
1966 if (scrub_queued) {
1967 dout(10) << __func__ << ": already queued" << dendl;
1968 return false;
1969 } else {
1970 dout(10) << __func__ << ": queueing" << dendl;
1971 scrub_queued = true;
1972 osd->queue_for_scrub(this, high_priority);
1973 return true;
1974 }
1975 }
1976
1977 void PG::queue_recovery()
1978 {
1979 if (!is_primary() || !is_peered()) {
1980 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1981 assert(!recovery_queued);
1982 } else if (recovery_queued) {
1983 dout(10) << "queue_recovery -- already queued" << dendl;
1984 } else {
1985 dout(10) << "queue_recovery -- queuing" << dendl;
1986 recovery_queued = true;
1987 osd->queue_for_recovery(this);
1988 }
1989 }
1990
1991 bool PG::queue_scrub()
1992 {
1993 assert(is_locked());
1994 if (is_scrubbing()) {
1995 return false;
1996 }
1997 scrubber.priority = scrubber.must_scrub ?
1998 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
1999 scrubber.must_scrub = false;
2000 state_set(PG_STATE_SCRUBBING);
2001 if (scrubber.must_deep_scrub) {
2002 state_set(PG_STATE_DEEP_SCRUB);
2003 scrubber.must_deep_scrub = false;
2004 }
2005 if (scrubber.must_repair || scrubber.auto_repair) {
2006 state_set(PG_STATE_REPAIR);
2007 scrubber.must_repair = false;
2008 }
2009 requeue_scrub();
2010 return true;
2011 }
2012
2013 unsigned PG::get_scrub_priority()
2014 {
2015 // a higher value -> a higher priority
2016 int pool_scrub_priority = 0;
2017 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2018 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2019 }
2020
2021 struct C_PG_FinishRecovery : public Context {
2022 PGRef pg;
2023 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2024 void finish(int r) override {
2025 pg->_finish_recovery(this);
2026 }
2027 };
2028
2029 void PG::mark_clean()
2030 {
2031 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2032 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2033 state_set(PG_STATE_CLEAN);
2034 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2035 info.history.last_interval_clean = info.history.same_interval_since;
2036 past_intervals.clear();
2037 dirty_big_info = true;
2038 dirty_info = true;
2039 }
2040
2041 kick_snap_trim();
2042 }
2043
2044 void PG::change_recovery_force_mode(int new_mode, bool clear)
2045 {
2046 lock(true);
2047 if (clear) {
2048 state_clear(new_mode);
2049 } else {
2050 state_set(new_mode);
2051 }
2052 publish_stats_to_osd();
2053
2054 unlock();
2055 }
2056
2057 inline int PG::clamp_recovery_priority(int priority)
2058 {
2059 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2060 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2061
2062 // Clamp to valid range
2063 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2064 return OSD_RECOVERY_PRIORITY_MAX;
2065 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2066 return OSD_RECOVERY_PRIORITY_MIN;
2067 } else {
2068 return priority;
2069 }
2070 }
2071
2072 unsigned PG::get_recovery_priority()
2073 {
2074 // a higher value -> a higher priority
2075 int ret = 0;
2076
2077 if (state & PG_STATE_FORCED_RECOVERY) {
2078 ret = OSD_RECOVERY_PRIORITY_FORCED;
2079 } else {
2080 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2081 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2082 }
2083 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2084 return static_cast<unsigned>(ret);
2085 }
2086
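// Backfill priority tiers: forced > inactive (below min_size) > undersized/degraded
// > base, adjusted by the pool's recovery priority and clamped to the valid range.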
2087 unsigned PG::get_backfill_priority()
2088 {
2089 // a higher value -> a higher priority
2090 int ret = OSD_BACKFILL_PRIORITY_BASE;
2091 if (state & PG_STATE_FORCED_BACKFILL) {
2092 ret = OSD_RECOVERY_PRIORITY_FORCED;
2093 } else {
2094 if (acting.size() < pool.info.min_size) {
2095 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2096 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2097
2098 } else if (is_undersized()) {
2099 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2100 assert(pool.info.size > actingset.size());
2101 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2102
2103 } else if (is_degraded()) {
2104 // degraded: baseline degraded
2105 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2106 }
2107
2108 // Adjust with pool's recovery priority
2109 int pool_recovery_priority = 0;
2110 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2111
2112 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2113 }
2114
2115 return static_cast<unsigned>(ret);
2116 }
2117
2118 void PG::finish_recovery(list<Context*>& tfin)
2119 {
2120 dout(10) << "finish_recovery" << dendl;
2121 assert(info.last_complete == info.last_update);
2122
2123 clear_recovery_state();
2124
2125 /*
2126 * sync all this before purging strays. but don't block!
2127 */
2128 finish_sync_event = new C_PG_FinishRecovery(this);
2129 tfin.push_back(finish_sync_event);
2130 }
2131
2132 void PG::_finish_recovery(Context *c)
2133 {
2134 lock();
2135 if (deleting) {
2136 unlock();
2137 return;
2138 }
2139 if (c == finish_sync_event) {
2140 dout(10) << "_finish_recovery" << dendl;
2141 finish_sync_event = 0;
2142 purge_strays();
2143
2144 publish_stats_to_osd();
2145
2146 if (scrub_after_recovery) {
2147 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2148 scrub_after_recovery = false;
2149 scrubber.must_deep_scrub = true;
2150 queue_scrub();
2151 }
2152 } else {
2153 dout(10) << "_finish_recovery -- stale" << dendl;
2154 }
2155 unlock();
2156 }
2157
2158 void PG::start_recovery_op(const hobject_t& soid)
2159 {
2160 dout(10) << "start_recovery_op " << soid
2161 #ifdef DEBUG_RECOVERY_OIDS
2162 << " (" << recovering_oids << ")"
2163 #endif
2164 << dendl;
2165 assert(recovery_ops_active >= 0);
2166 recovery_ops_active++;
2167 #ifdef DEBUG_RECOVERY_OIDS
2168 assert(recovering_oids.count(soid) == 0);
2169 recovering_oids.insert(soid);
2170 #endif
2171 osd->start_recovery_op(this, soid);
2172 }
2173
2174 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2175 {
2176 dout(10) << "finish_recovery_op " << soid
2177 #ifdef DEBUG_RECOVERY_OIDS
2178 << " (" << recovering_oids << ")"
2179 #endif
2180 << dendl;
2181 assert(recovery_ops_active > 0);
2182 recovery_ops_active--;
2183 #ifdef DEBUG_RECOVERY_OIDS
2184 assert(recovering_oids.count(soid));
2185 recovering_oids.erase(soid);
2186 #endif
2187 osd->finish_recovery_op(this, soid, dequeue);
2188
2189 if (!dequeue) {
2190 queue_recovery();
2191 }
2192 }
2193
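// Populate a child PG during a split: divide the log, copy info/history/stats
// (marking stats invalid), recompute up/acting from the current OSDMap, and mark
// both parent and child dirty so the new state is persisted.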
2194 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2195 {
2196 child->update_snap_mapper_bits(split_bits);
2197 child->update_osdmap_ref(get_osdmap());
2198
2199 child->pool = pool;
2200
2201 // Log
2202 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2203 child->info.last_complete = info.last_complete;
2204
2205 info.last_update = pg_log.get_head();
2206 child->info.last_update = child->pg_log.get_head();
2207
2208 child->info.last_user_version = info.last_user_version;
2209
2210 info.log_tail = pg_log.get_tail();
2211 child->info.log_tail = child->pg_log.get_tail();
2212
2213 if (info.last_complete < pg_log.get_tail())
2214 info.last_complete = pg_log.get_tail();
2215 if (child->info.last_complete < child->pg_log.get_tail())
2216 child->info.last_complete = child->pg_log.get_tail();
2217
2218 // Info
2219 child->info.history = info.history;
2220 child->info.history.epoch_created = get_osdmap()->get_epoch();
2221 child->info.purged_snaps = info.purged_snaps;
2222
2223 if (info.last_backfill.is_max()) {
2224 child->info.set_last_backfill(hobject_t::get_max());
2225 } else {
2226 // restart backfill on parent and child to be safe. we could
2227 // probably do better in the bitwise sort case, but it's more
2228 // fragile (there may be special work to do on backfill completion
2229 // in the future).
2230 info.set_last_backfill(hobject_t());
2231 child->info.set_last_backfill(hobject_t());
2232 }
2233
2234 child->info.stats = info.stats;
2235 child->info.stats.parent_split_bits = split_bits;
2236 info.stats.stats_invalid = true;
2237 child->info.stats.stats_invalid = true;
2238 child->info.last_epoch_started = info.last_epoch_started;
2239 child->info.last_interval_started = info.last_interval_started;
2240
2241 child->snap_trimq = snap_trimq;
2242
2243 // There can't be recovery/backfill going on now
2244 int primary, up_primary;
2245 vector<int> newup, newacting;
2246 get_osdmap()->pg_to_up_acting_osds(
2247 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2248 child->init_primary_up_acting(
2249 newup,
2250 newacting,
2251 up_primary,
2252 primary);
2253 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2254
2255 // this comparison includes primary rank via pg_shard_t
2256 if (get_primary() != child->get_primary())
2257 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2258
2259 child->info.stats.up = up;
2260 child->info.stats.up_primary = up_primary;
2261 child->info.stats.acting = acting;
2262 child->info.stats.acting_primary = primary;
2263 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2264
2265 // History
2266 child->past_intervals = past_intervals;
2267
2268 _split_into(child_pgid, child, split_bits);
2269
2270 // release all backoffs for simplicity
2271 release_backoffs(hobject_t(), hobject_t::get_max());
2272
2273 child->on_new_interval();
2274
2275 child->dirty_info = true;
2276 child->dirty_big_info = true;
2277 dirty_info = true;
2278 dirty_big_info = true;
2279 }
2280
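// Register a backoff on [begin,end) for this session and tell the client to block;
// a pre-existing backoff for the same start object is a fatal error.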
2281 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2282 {
2283 ConnectionRef con = s->con;
2284 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2285 return;
2286 BackoffRef b(s->have_backoff(info.pgid, begin));
2287 if (b) {
2288 derr << __func__ << " already have backoff for " << s << " begin " << begin
2289 << " " << *b << dendl;
2290 ceph_abort();
2291 }
2292 Mutex::Locker l(backoff_lock);
2293 {
2294 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2295 backoffs[begin].insert(b);
2296 s->add_backoff(b);
2297 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2298 }
2299 con->send_message(
2300 new MOSDBackoff(
2301 info.pgid,
2302 get_osdmap()->get_epoch(),
2303 CEPH_OSD_BACKOFF_OP_BLOCK,
2304 b->id,
2305 begin,
2306 end));
2307 }
2308
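// Drop the backoffs that fall within [begin,end) and send UNBLOCK messages to the
// owning sessions (when their connections are still open).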
2309 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2310 {
2311 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2312 vector<BackoffRef> bv;
2313 {
2314 Mutex::Locker l(backoff_lock);
2315 auto p = backoffs.lower_bound(begin);
2316 while (p != backoffs.end()) {
2317 int r = cmp(p->first, end);
2318 dout(20) << __func__ << " ? " << r << " " << p->first
2319 << " " << p->second << dendl;
2320 // note: must still examine begin=end=p->first case
2321 if (r > 0 || (r == 0 && begin < end)) {
2322 break;
2323 }
2324 dout(20) << __func__ << " checking " << p->first
2325 << " " << p->second << dendl;
2326 auto q = p->second.begin();
2327 while (q != p->second.end()) {
2328 dout(20) << __func__ << " checking " << *q << dendl;
2329 int r = cmp((*q)->begin, begin);
2330 if (r == 0 || (r > 0 && (*q)->end < end)) {
2331 bv.push_back(*q);
2332 q = p->second.erase(q);
2333 } else {
2334 ++q;
2335 }
2336 }
2337 if (p->second.empty()) {
2338 p = backoffs.erase(p);
2339 } else {
2340 ++p;
2341 }
2342 }
2343 }
2344 for (auto b : bv) {
2345 Mutex::Locker l(b->lock);
2346 dout(10) << __func__ << " " << *b << dendl;
2347 if (b->session) {
2348 assert(b->pg == this);
2349 ConnectionRef con = b->session->con;
2350 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2351 con->send_message(
2352 new MOSDBackoff(
2353 info.pgid,
2354 get_osdmap()->get_epoch(),
2355 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2356 b->id,
2357 b->begin,
2358 b->end));
2359 }
2360 if (b->is_new()) {
2361 b->state = Backoff::STATE_DELETING;
2362 } else {
2363 b->session->rm_backoff(b);
2364 b->session.reset();
2365 }
2366 b->pg.reset();
2367 }
2368 }
2369 }
2370
2371 void PG::clear_backoffs()
2372 {
2373 dout(10) << __func__ << " " << dendl;
2374 map<hobject_t,set<BackoffRef>> ls;
2375 {
2376 Mutex::Locker l(backoff_lock);
2377 ls.swap(backoffs);
2378 }
2379 for (auto& p : ls) {
2380 for (auto& b : p.second) {
2381 Mutex::Locker l(b->lock);
2382 dout(10) << __func__ << " " << *b << dendl;
2383 if (b->session) {
2384 assert(b->pg == this);
2385 if (b->is_new()) {
2386 b->state = Backoff::STATE_DELETING;
2387 } else {
2388 b->session->rm_backoff(b);
2389 b->session.reset();
2390 }
2391 b->pg.reset();
2392 }
2393 }
2394 }
2395 }
2396
2397 // called by Session::clear_backoffs()
2398 void PG::rm_backoff(BackoffRef b)
2399 {
2400 dout(10) << __func__ << " " << *b << dendl;
2401 Mutex::Locker l(backoff_lock);
2402 assert(b->lock.is_locked_by_me());
2403 assert(b->pg == this);
2404 auto p = backoffs.find(b->begin);
2405 // may race with release_backoffs()
2406 if (p != backoffs.end()) {
2407 auto q = p->second.find(b);
2408 if (q != p->second.end()) {
2409 p->second.erase(q);
2410 if (p->second.empty()) {
2411 backoffs.erase(p);
2412 }
2413 }
2414 }
2415 }
2416
2417 void PG::clear_recovery_state()
2418 {
2419 dout(10) << "clear_recovery_state" << dendl;
2420
2421 pg_log.reset_recovery_pointers();
2422 finish_sync_event = 0;
2423
2424 hobject_t soid;
2425 while (recovery_ops_active > 0) {
2426 #ifdef DEBUG_RECOVERY_OIDS
2427 soid = *recovering_oids.begin();
2428 #endif
2429 finish_recovery_op(soid, true);
2430 }
2431
2432 backfill_targets.clear();
2433 backfill_info.clear();
2434 peer_backfill_info.clear();
2435 waiting_on_backfill.clear();
2436 _clear_recovery_state(); // pg impl specific hook
2437 }
2438
2439 void PG::cancel_recovery()
2440 {
2441 dout(10) << "cancel_recovery" << dendl;
2442 clear_recovery_state();
2443 }
2444
2445
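// Ask up-but-stray OSDs to remove their copy of this PG and forget their peer state;
// down strays are skipped.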
2446 void PG::purge_strays()
2447 {
2448 dout(10) << "purge_strays " << stray_set << dendl;
2449
2450 bool removed = false;
2451 for (set<pg_shard_t>::iterator p = stray_set.begin();
2452 p != stray_set.end();
2453 ++p) {
2454 assert(!is_actingbackfill(*p));
2455 if (get_osdmap()->is_up(p->osd)) {
2456 dout(10) << "sending PGRemove to osd." << *p << dendl;
2457 vector<spg_t> to_remove;
2458 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2459 MOSDPGRemove *m = new MOSDPGRemove(
2460 get_osdmap()->get_epoch(),
2461 to_remove);
2462 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2463 } else {
2464 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2465 }
2466 peer_missing.erase(*p);
2467 peer_info.erase(*p);
2468 peer_purged.insert(*p);
2469 removed = true;
2470 }
2471
2472 // if we removed anyone, update peers (which includes peer_info)
2473 if (removed)
2474 update_heartbeat_peers();
2475
2476 stray_set.clear();
2477
2478 // clear _requested maps; we may have to peer() again if we discover
2479 // (more) stray content
2480 peer_log_requested.clear();
2481 peer_missing_requested.clear();
2482 }
2483
2484 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2485 {
2486 Mutex::Locker l(heartbeat_peer_lock);
2487 probe_targets.clear();
2488 for (set<pg_shard_t>::iterator i = probe_set.begin();
2489 i != probe_set.end();
2490 ++i) {
2491 probe_targets.insert(i->osd);
2492 }
2493 }
2494
2495 void PG::clear_probe_targets()
2496 {
2497 Mutex::Locker l(heartbeat_peer_lock);
2498 probe_targets.clear();
2499 }
2500
2501 void PG::update_heartbeat_peers()
2502 {
2503 assert(is_locked());
2504
2505 if (!is_primary())
2506 return;
2507
2508 set<int> new_peers;
2509 for (unsigned i=0; i<acting.size(); i++) {
2510 if (acting[i] != CRUSH_ITEM_NONE)
2511 new_peers.insert(acting[i]);
2512 }
2513 for (unsigned i=0; i<up.size(); i++) {
2514 if (up[i] != CRUSH_ITEM_NONE)
2515 new_peers.insert(up[i]);
2516 }
2517 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2518 p != peer_info.end();
2519 ++p)
2520 new_peers.insert(p->first.osd);
2521
2522 bool need_update = false;
2523 heartbeat_peer_lock.Lock();
2524 if (new_peers == heartbeat_peers) {
2525 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2526 } else {
2527 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2528 heartbeat_peers.swap(new_peers);
2529 need_update = true;
2530 }
2531 heartbeat_peer_lock.Unlock();
2532
2533 if (need_update)
2534 osd->need_heartbeat_peer_update();
2535 }
2536
2537
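// Dup-op lookup: check the projected log first, then the persisted log, filling in
// version/user_version/return_code when the reqid is found.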
2538 bool PG::check_in_progress_op(
2539 const osd_reqid_t &r,
2540 eversion_t *version,
2541 version_t *user_version,
2542 int *return_code) const
2543 {
2544 return (
2545 projected_log.get_request(r, version, user_version, return_code) ||
2546 pg_log.get_log().get_request(r, version, user_version, return_code));
2547 }
2548
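// Recompute derived stat fields (log bounds/size, object copies, and the degraded,
// misplaced and unfound counts) from the current peer info and missing sets.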
2549 void PG::_update_calc_stats()
2550 {
2551 info.stats.version = info.last_update;
2552 info.stats.created = info.history.epoch_created;
2553 info.stats.last_scrub = info.history.last_scrub;
2554 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2555 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2556 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2557 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2558 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2559
2560 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2561 info.stats.ondisk_log_size = info.stats.log_size;
2562 info.stats.log_start = pg_log.get_tail();
2563 info.stats.ondisk_log_start = pg_log.get_tail();
2564
2565 // If actingset is larger than upset we will have misplaced,
2566 // so we will report based on actingset size.
2567
2568 // If upset is larger we will have degraded,
2569 // so we will report based on upset size.
2570
2571 // If target is the largest of them all, it will contribute to
2572 // the degraded count because num_object_copies is
2573 // computed using target and eventually used to get the degraded total.
2574
2575 unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
2576 unsigned nrep = MAX(actingset.size(), upset.size());
2577 // calc num_object_copies
2578 info.stats.stats.calc_copies(MAX(target, nrep));
2579 info.stats.stats.sum.num_objects_degraded = 0;
2580 info.stats.stats.sum.num_objects_unfound = 0;
2581 info.stats.stats.sum.num_objects_misplaced = 0;
2582 if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2583 // NOTE: we only generate copies, degraded, misplaced and unfound
2584 // values for the summation, not individual stat categories.
2585 int64_t num_objects = info.stats.stats.sum.num_objects;
2586
2587 // Total sum of all missing
2588 int64_t missing = 0;
2589 // Objects that have arrived backfilled to up OSDs (not in acting)
2590 int64_t backfilled = 0;
2591 // A misplaced object is not stored on the correct OSD
2592 int64_t misplaced = 0;
2593 // Total of object copies/shards found
2594 int64_t object_copies = 0;
2595
2596 // num_objects_missing on each peer
2597 for (map<pg_shard_t, pg_info_t>::iterator pi =
2598 peer_info.begin();
2599 pi != peer_info.end();
2600 ++pi) {
2601 map<pg_shard_t, pg_missing_t>::const_iterator pm =
2602 peer_missing.find(pi->first);
2603 if (pm != peer_missing.end()) {
2604 pi->second.stats.stats.sum.num_objects_missing =
2605 pm->second.num_missing();
2606 }
2607 }
2608
2609 assert(!actingbackfill.empty());
2610 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
2611 i != actingbackfill.end();
2612 ++i) {
2613 const pg_shard_t &p = *i;
2614
2615 bool in_up = (upset.find(p) != upset.end());
2616 bool in_acting = (actingset.find(p) != actingset.end());
2617 assert(in_up || in_acting);
2618
2619 // in acting                  Compute total objects excluding num_missing
2620 // in acting and not in up    Compute misplaced objects excluding num_missing
2621 // in up and not in acting    Compute total objects already backfilled
2622 if (in_acting) {
2623 unsigned osd_missing;
2624 // primary handling
2625 if (p == pg_whoami) {
2626 osd_missing = pg_log.get_missing().num_missing();
2627 info.stats.stats.sum.num_objects_missing_on_primary =
2628 osd_missing;
2629 object_copies += num_objects; // My local (primary) count
2630 } else {
2631 assert(peer_missing.count(p));
2632 osd_missing = peer_missing[p].num_missing();
2633 object_copies += peer_info[p].stats.stats.sum.num_objects;
2634 }
2635 missing += osd_missing;
2636 // Count non-missing objects not in up as misplaced
2637 if (!in_up && num_objects > osd_missing)
2638 misplaced += num_objects - osd_missing;
2639 } else {
2640 assert(in_up && !in_acting);
2641
2642 // If this peer has more objects than it should, ignore them
2643 backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
2644 }
2645 }
2646
2647 // Any objects that have been backfilled to up OSDs can be deducted from misplaced
2648 misplaced = MAX(0, misplaced - backfilled);
2649
2650 // Deduct computed total missing on acting nodes
2651 object_copies -= missing;
2652 // Include computed backfilled objects on up nodes
2653 object_copies += backfilled;
2654 // a degraded object has fewer replicas or EC shards than the
2655 // pool specifies. num_object_copies will never be smaller than target * num_copies.
2656 int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
2657
2658 info.stats.stats.sum.num_objects_degraded = degraded;
2659 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2660 info.stats.stats.sum.num_objects_misplaced = misplaced;
2661 }
2662 }
2663
2664 void PG::_update_blocked_by()
2665 {
2666 // set a max on the number of blocking peers we report. if we go
2667 // over, report a random subset. keep the result sorted.
2668 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2669 unsigned skip = blocked_by.size() - keep;
2670 info.stats.blocked_by.clear();
2671 info.stats.blocked_by.resize(keep);
2672 unsigned pos = 0;
2673 for (set<int>::iterator p = blocked_by.begin();
2674 p != blocked_by.end() && keep > 0;
2675 ++p) {
2676 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2677 --skip;
2678 } else {
2679 info.stats.blocked_by[pos++] = *p;
2680 --keep;
2681 }
2682 }
2683 }
2684
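// Primary-only: refresh PG state flags and timestamps, recompute stats, and queue a
// pg_stat report for the mon when still required (pre-luminous; otherwise mgr handles it).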
2685 void PG::publish_stats_to_osd()
2686 {
2687 if (!is_primary())
2688 return;
2689
2690 pg_stats_publish_lock.Lock();
2691
2692 if (info.stats.stats.sum.num_scrub_errors)
2693 state_set(PG_STATE_INCONSISTENT);
2694 else
2695 state_clear(PG_STATE_INCONSISTENT);
2696
2697 utime_t now = ceph_clock_now();
2698 if (info.stats.state != state) {
2699 info.stats.last_change = now;
2700 // Optimistic estimation: if we just found out a PG is inactive,
2701 // assume it was active until now.
2702 if (!(state & PG_STATE_ACTIVE) &&
2703 (info.stats.state & PG_STATE_ACTIVE))
2704 info.stats.last_active = now;
2705
2706 if ((state & PG_STATE_ACTIVE) &&
2707 !(info.stats.state & PG_STATE_ACTIVE))
2708 info.stats.last_became_active = now;
2709 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2710 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2711 info.stats.last_became_peered = now;
2712 if (!(state & PG_STATE_CREATING) &&
2713 (info.stats.state & PG_STATE_CREATING)) {
2714 osd->send_pg_created(get_pgid().pgid);
2715 }
2716 info.stats.state = state;
2717 }
2718
2719 _update_calc_stats();
2720 _update_blocked_by();
2721
2722 bool publish = false;
2723 pg_stat_t pre_publish = info.stats;
2724 pre_publish.stats.add(unstable_stats);
2725 utime_t cutoff = now;
2726 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2727 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2728 info.stats.last_fresh > cutoff) {
2729 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2730 << ": no change since " << info.stats.last_fresh << dendl;
2731 } else {
2732 // update our stat summary and timestamps
2733 info.stats.reported_epoch = get_osdmap()->get_epoch();
2734 ++info.stats.reported_seq;
2735
2736 info.stats.last_fresh = now;
2737
2738 if (info.stats.state & PG_STATE_CLEAN)
2739 info.stats.last_clean = now;
2740 if (info.stats.state & PG_STATE_ACTIVE)
2741 info.stats.last_active = now;
2742 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2743 info.stats.last_peered = now;
2744 info.stats.last_unstale = now;
2745 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2746 info.stats.last_undegraded = now;
2747 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2748 info.stats.last_fullsized = now;
2749
2750 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2751 // care of this by sending MMonMgrReport to mon.
2752 publish =
2753 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2754 pg_stats_publish_valid = true;
2755 pg_stats_publish = pre_publish;
2756
2757 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2758 << ":" << pg_stats_publish.reported_seq << dendl;
2759 }
2760 pg_stats_publish_lock.Unlock();
2761
2762 if (publish)
2763 osd->pg_stat_queue_enqueue(this);
2764 }
2765
2766 void PG::clear_publish_stats()
2767 {
2768 dout(15) << "clear_stats" << dendl;
2769 pg_stats_publish_lock.Lock();
2770 pg_stats_publish_valid = false;
2771 pg_stats_publish_lock.Unlock();
2772
2773 osd->pg_stat_queue_dequeue(this);
2774 }
2775
2776 /**
2777 * initialize a newly instantiated pg
2778 *
2779 * Initialize PG state, as when a PG is initially created, or when it
2780 * is first instantiated on the current node.
2781 *
2782 * @param role our role/rank
2783 * @param newup up set
2784 * @param newacting acting set
2785 * @param history pg history
2786 * @param pi past_intervals
2787 * @param backfill true if info should be marked as backfill
2788 * @param t transaction to write out our new state in
2789 */
2790 void PG::init(
2791 int role,
2792 const vector<int>& newup, int new_up_primary,
2793 const vector<int>& newacting, int new_acting_primary,
2794 const pg_history_t& history,
2795 const PastIntervals& pi,
2796 bool backfill,
2797 ObjectStore::Transaction *t)
2798 {
2799 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2800 << " history " << history
2801 << " past_intervals " << pi
2802 << dendl;
2803
2804 set_role(role);
2805 acting = newacting;
2806 up = newup;
2807 init_primary_up_acting(
2808 newup,
2809 newacting,
2810 new_up_primary,
2811 new_acting_primary);
2812
2813 info.history = history;
2814 past_intervals = pi;
2815
2816 info.stats.up = up;
2817 info.stats.up_primary = new_up_primary;
2818 info.stats.acting = acting;
2819 info.stats.acting_primary = new_acting_primary;
2820 info.stats.mapping_epoch = info.history.same_interval_since;
2821
2822 if (backfill) {
2823 dout(10) << __func__ << ": Setting backfill" << dendl;
2824 info.set_last_backfill(hobject_t());
2825 info.last_complete = info.last_update;
2826 pg_log.mark_log_for_rewrite();
2827 }
2828
2829 on_new_interval();
2830
2831 dirty_info = true;
2832 dirty_big_info = true;
2833 write_if_dirty(*t);
2834 }
2835
2836 #pragma GCC diagnostic ignored "-Wpragmas"
2837 #pragma GCC diagnostic push
2838 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2839
2840 void PG::upgrade(ObjectStore *store)
2841 {
2842 assert(info_struct_v <= 10);
2843 ObjectStore::Transaction t;
2844
2845 assert(info_struct_v >= 7);
2846
2847 // 7 -> 8
2848 if (info_struct_v <= 7) {
2849 pg_log.mark_log_for_rewrite();
2850 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2851 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2852 t.remove(coll_t::meta(), log_oid);
2853 t.remove(coll_t::meta(), biginfo_oid);
2854 t.touch(coll, pgmeta_oid);
2855 }
2856
2857 // 8 -> 9
2858 if (info_struct_v <= 8) {
2859 // no special action needed.
2860 }
2861
2862 // 9 -> 10
2863 if (info_struct_v <= 9) {
2864 // previous versions weren't (as) aggressively clearing past_intervals
2865 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2866 dout(20) << __func__ << " clearing past_intervals" << dendl;
2867 past_intervals.clear();
2868 }
2869 }
2870
2871 // update infover_key
2872 if (info_struct_v < cur_struct_v) {
2873 map<string,bufferlist> v;
2874 __u8 ver = cur_struct_v;
2875 ::encode(ver, v[infover_key]);
2876 t.omap_setkeys(coll, pgmeta_oid, v);
2877 }
2878
2879 dirty_info = true;
2880 dirty_big_info = true;
2881 write_if_dirty(t);
2882
2883 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2884 ObjectStore::Sequencer>("upgrade"));
2885 int r = store->apply_transaction(osr.get(), std::move(t));
2886 if (r != 0) {
2887 derr << __func__ << ": apply_transaction returned "
2888 << cpp_strerror(r) << dendl;
2889 ceph_abort();
2890 }
2891 assert(r == 0);
2892
2893 C_SaferCond waiter;
2894 if (!osr->flush_commit(&waiter)) {
2895 waiter.wait();
2896 }
2897 }
2898
2899 #pragma GCC diagnostic pop
2900 #pragma GCC diagnostic warning "-Wpragmas"
2901
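// Encode the pg info for persistence: write the epoch key when dirty, try the compact
// fastinfo delta when only the small fast fields changed, otherwise fall back to the
// full info (plus past_intervals/purged_snaps in biginfo when dirty_big_info is set).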
2902 int PG::_prepare_write_info(CephContext* cct,
2903 map<string,bufferlist> *km,
2904 epoch_t epoch,
2905 pg_info_t &info, pg_info_t &last_written_info,
2906 PastIntervals &past_intervals,
2907 bool dirty_big_info,
2908 bool dirty_epoch,
2909 bool try_fast_info,
2910 PerfCounters *logger)
2911 {
2912 if (dirty_epoch) {
2913 ::encode(epoch, (*km)[epoch_key]);
2914 }
2915
2916 if (logger)
2917 logger->inc(l_osd_pg_info);
2918
2919 // try to do info efficiently?
2920 if (!dirty_big_info && try_fast_info &&
2921 info.last_update > last_written_info.last_update) {
2922 pg_fast_info_t fast;
2923 fast.populate_from(info);
2924 bool did = fast.try_apply_to(&last_written_info);
2925 assert(did); // we verified last_update increased above
2926 if (info == last_written_info) {
2927 ::encode(fast, (*km)[fastinfo_key]);
2928 if (logger)
2929 logger->inc(l_osd_pg_fastinfo);
2930 return 0;
2931 }
2932 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
2933 {
2934 JSONFormatter jf(true);
2935 jf.dump_object("info", info);
2936 jf.flush(*_dout);
2937 }
2938 {
2939 *_dout << "\nlast_written_info:\n";
2940 JSONFormatter jf(true);
2941 jf.dump_object("last_written_info", last_written_info);
2942 jf.flush(*_dout);
2943 }
2944 *_dout << dendl;
2945 }
2946 last_written_info = info;
2947
2948 // info. store purged_snaps separately.
2949 interval_set<snapid_t> purged_snaps;
2950 purged_snaps.swap(info.purged_snaps);
2951 ::encode(info, (*km)[info_key]);
2952 purged_snaps.swap(info.purged_snaps);
2953
2954 if (dirty_big_info) {
2955 // potentially big stuff
2956 bufferlist& bigbl = (*km)[biginfo_key];
2957 ::encode(past_intervals, bigbl);
2958 ::encode(info.purged_snaps, bigbl);
2959 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2960 if (logger)
2961 logger->inc(l_osd_pg_biginfo);
2962 }
2963
2964 return 0;
2965 }
2966
2967 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
2968 {
2969 coll_t coll(pgid);
2970 t.create_collection(coll, bits);
2971 }
2972
2973 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
2974 {
2975 coll_t coll(pgid);
2976
2977 if (pool) {
2978 // Give a hint to the PG collection
2979 bufferlist hint;
2980 uint32_t pg_num = pool->get_pg_num();
2981 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
2982 ::encode(pg_num, hint);
2983 ::encode(expected_num_objects_pg, hint);
2984 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
2985 t.collection_hint(coll, hint_type, hint);
2986 }
2987
2988 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2989 t.touch(coll, pgmeta_oid);
2990 map<string,bufferlist> values;
2991 __u8 struct_v = cur_struct_v;
2992 ::encode(struct_v, values[infover_key]);
2993 t.omap_setkeys(coll, pgmeta_oid, values);
2994 }
2995
2996 void PG::prepare_write_info(map<string,bufferlist> *km)
2997 {
2998 info.stats.stats.add(unstable_stats);
2999 unstable_stats.clear();
3000
3001 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3002 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3003 info,
3004 last_written_info,
3005 past_intervals,
3006 dirty_big_info, need_update_epoch,
3007 cct->_conf->osd_fast_info,
3008 osd->logger);
3009 assert(ret == 0);
3010 if (need_update_epoch)
3011 last_epoch = get_osdmap()->get_epoch();
3012 last_persisted_osdmap_ref = osdmap_ref;
3013
3014 dirty_info = false;
3015 dirty_big_info = false;
3016 }
3017
3018 #pragma GCC diagnostic ignored "-Wpragmas"
3019 #pragma GCC diagnostic push
3020 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3021
3022 bool PG::_has_removal_flag(ObjectStore *store,
3023 spg_t pgid)
3024 {
3025 coll_t coll(pgid);
3026 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3027
3028 // first try new way
3029 set<string> keys;
3030 keys.insert("_remove");
3031 map<string,bufferlist> values;
3032 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3033 values.size() == 1)
3034 return true;
3035
3036 return false;
3037 }
3038
3039 int PG::peek_map_epoch(ObjectStore *store,
3040 spg_t pgid,
3041 epoch_t *pepoch,
3042 bufferlist *bl)
3043 {
3044 coll_t coll(pgid);
3045 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3046 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3047 epoch_t cur_epoch = 0;
3048
3049 assert(bl);
3050 {
3051 // validate collection name
3052 assert(coll.is_pg());
3053 }
3054
3055 // try for v8
3056 set<string> keys;
3057 keys.insert(infover_key);
3058 keys.insert(epoch_key);
3059 map<string,bufferlist> values;
3060 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3061 if (r == 0) {
3062 assert(values.size() == 2);
3063
3064 // sanity check version
3065 bufferlist::iterator bp = values[infover_key].begin();
3066 __u8 struct_v = 0;
3067 ::decode(struct_v, bp);
3068 assert(struct_v >= 8);
3069
3070 // get epoch
3071 bp = values[epoch_key].begin();
3072 ::decode(cur_epoch, bp);
3073 } else {
3074 // probably bug 10617; see OSD::load_pgs()
3075 return -1;
3076 }
3077
3078 *pepoch = cur_epoch;
3079 return 0;
3080 }
3081
3082 #pragma GCC diagnostic pop
3083 #pragma GCC diagnostic warning "-Wpragmas"
3084
3085 void PG::write_if_dirty(ObjectStore::Transaction& t)
3086 {
3087 map<string,bufferlist> km;
3088 if (dirty_big_info || dirty_info)
3089 prepare_write_info(&km);
3090 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3091 if (!km.empty())
3092 t.omap_setkeys(coll, pgmeta_oid, km);
3093 }
3094
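// Primary-only: compute the new trim point, tell all other acting/backfill shards to
// trim their logs to it, then trim the local log.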
3095 void PG::trim_log()
3096 {
3097 assert(is_primary());
3098 calc_trim_to();
3099 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3100 if (pg_trim_to != eversion_t()) {
3101 // inform peers to trim log
3102 assert(!actingbackfill.empty());
3103 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3104 i != actingbackfill.end();
3105 ++i) {
3106 if (*i == pg_whoami) continue;
3107 osd->send_message_osd_cluster(
3108 i->osd,
3109 new MOSDPGTrim(
3110 get_osdmap()->get_epoch(),
3111 spg_t(info.pgid.pgid, i->shard),
3112 pg_trim_to),
3113 get_osdmap()->get_epoch());
3114 }
3115
3116 // trim primary as well
3117 pg_log.trim(pg_trim_to, info);
3118 dirty_info = true;
3119 }
3120 }
3121
3122 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3123 {
3124 // raise last_complete only if we were previously up to date
3125 if (info.last_complete == info.last_update)
3126 info.last_complete = e.version;
3127
3128 // raise last_update.
3129 assert(e.version > info.last_update);
3130 info.last_update = e.version;
3131
3132 // raise user_version, if it increased (it may not have been bumped
3133 // by all logged updates)
3134 if (e.user_version > info.last_user_version)
3135 info.last_user_version = e.user_version;
3136
3137 // log mutation
3138 pg_log.add(e, applied);
3139 dout(10) << "add_log_entry " << e << dendl;
3140 }
3141
3142
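// Append replicated log entries from the current op: update the snap mapper, roll the
// log forward where required (non-applied/backfill cases), trim to trim_to, and persist
// the dirty info and log via the transaction.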
3143 void PG::append_log(
3144 const vector<pg_log_entry_t>& logv,
3145 eversion_t trim_to,
3146 eversion_t roll_forward_to,
3147 ObjectStore::Transaction &t,
3148 bool transaction_applied)
3149 {
3150 if (transaction_applied)
3151 update_snap_map(logv, t);
3152
3153 /* The primary has sent an info updating the history, but it may not
3154 * have arrived yet. We want to make sure that we cannot remember this
3155 * write without remembering that it happened in an interval which went
3156 * active in epoch history.last_epoch_started.
3157 */
3158 if (info.last_epoch_started != info.history.last_epoch_started) {
3159 info.history.last_epoch_started = info.last_epoch_started;
3160 }
3161 if (info.last_interval_started != info.history.last_interval_started) {
3162 info.history.last_interval_started = info.last_interval_started;
3163 }
3164 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3165
3166 PGLogEntryHandler handler{this, &t};
3167 if (!transaction_applied) {
3168 /* We must be a backfill peer, so it's ok if we apply
3169 * out-of-turn since we won't be considered when
3170 * determining a min possible last_update.
3171 */
3172 pg_log.roll_forward(&handler);
3173 }
3174
3175 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3176 p != logv.end();
3177 ++p) {
3178 add_log_entry(*p, transaction_applied);
3179
3180 /* We don't want to leave the rollforward artifacts around
3181 * here past last_backfill. It's ok for the same reason as
3182 * above */
3183 if (transaction_applied &&
3184 p->soid > info.last_backfill) {
3185 pg_log.roll_forward(&handler);
3186 }
3187 }
3188 auto last = logv.rbegin();
3189 if (is_primary() && last != logv.rend()) {
3190 projected_log.skip_can_rollback_to_to_head();
3191 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3192 }
3193
3194 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3195 pg_log.roll_forward_to(
3196 roll_forward_to,
3197 &handler);
3198 t.register_on_applied(
3199 new C_UpdateLastRollbackInfoTrimmedToApplied(
3200 this,
3201 get_osdmap()->get_epoch(),
3202 roll_forward_to));
3203 }
3204
3205 pg_log.trim(trim_to, info);
3206
3207 // update the local pg, pg log
3208 dirty_info = true;
3209 write_if_dirty(t);
3210 }
3211
3212 bool PG::check_log_for_corruption(ObjectStore *store)
3213 {
3214 /// TODO: this method needs to work with the omap log
3215 return true;
3216 }
3217
3218 //! Get the name we're going to save our corrupt pg log as
3219 std::string PG::get_corrupt_pg_log_name() const
3220 {
3221 const int MAX_BUF = 512;
3222 char buf[MAX_BUF];
3223 struct tm tm_buf;
3224 time_t my_time(time(NULL));
3225 const struct tm *t = localtime_r(&my_time, &tm_buf);
3226 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3227 if (ret == 0) {
3228 dout(0) << "strftime failed" << dendl;
3229 return "corrupt_log_unknown_time";
3230 }
3231 string out(buf);
3232 out += stringify(info.pgid);
3233 return out;
3234 }
3235
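// Load pg_info and past_intervals: prefer the pgmeta omap keys (struct_v >= 8, with an
// optional fastinfo overlay), falling back to the legacy infos object for struct_v 7.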
3236 int PG::read_info(
3237 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3238 pg_info_t &info, PastIntervals &past_intervals,
3239 __u8 &struct_v)
3240 {
3241 // try for v8 or later
3242 set<string> keys;
3243 keys.insert(infover_key);
3244 keys.insert(info_key);
3245 keys.insert(biginfo_key);
3246 keys.insert(fastinfo_key);
3247 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3248 map<string,bufferlist> values;
3249 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3250 if (r == 0) {
3251 assert(values.size() == 3 ||
3252 values.size() == 4);
3253
3254 bufferlist::iterator p = values[infover_key].begin();
3255 ::decode(struct_v, p);
3256 assert(struct_v >= 8);
3257
3258 p = values[info_key].begin();
3259 ::decode(info, p);
3260
3261 p = values[biginfo_key].begin();
3262 if (struct_v >= 10) {
3263 ::decode(past_intervals, p);
3264 } else {
3265 past_intervals.decode_classic(p);
3266 }
3267 ::decode(info.purged_snaps, p);
3268
3269 p = values[fastinfo_key].begin();
3270 if (!p.end()) {
3271 pg_fast_info_t fast;
3272 ::decode(fast, p);
3273 fast.try_apply_to(&info);
3274 }
3275 return 0;
3276 }
3277
3278 // legacy (ver < 8)
3279 ghobject_t infos_oid(OSD::make_infos_oid());
3280 bufferlist::iterator p = bl.begin();
3281 ::decode(struct_v, p);
3282 assert(struct_v == 7);
3283
3284 // get info out of leveldb
3285 string k = get_info_key(info.pgid);
3286 string bk = get_biginfo_key(info.pgid);
3287 keys.clear();
3288 keys.insert(k);
3289 keys.insert(bk);
3290 values.clear();
3291 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3292 assert(values.size() == 2);
3293
3294 p = values[k].begin();
3295 ::decode(info, p);
3296
3297 p = values[bk].begin();
3298 ::decode(past_intervals, p);
3299 interval_set<snapid_t> snap_collections; // obsolete
3300 ::decode(snap_collections, p);
3301 ::decode(info.purged_snaps, p);
3302 return 0;
3303 }
3304
3305 void PG::read_state(ObjectStore *store, bufferlist &bl)
3306 {
3307 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3308 info_struct_v);
3309 assert(r >= 0);
3310
3311 last_written_info = info;
3312
3313 ostringstream oss;
3314 pg_log.read_log_and_missing(
3315 store,
3316 coll,
3317 info_struct_v < 8 ? coll_t::meta() : coll,
3318 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3319 info,
3320 oss,
3321 cct->_conf->osd_ignore_stale_divergent_priors,
3322 cct->_conf->osd_debug_verify_missing_on_start);
3323 if (oss.tellp())
3324 osd->clog->error() << oss.rdbuf();
3325
3326 // log any weirdness
3327 log_weirdness();
3328 }
3329
3330 void PG::log_weirdness()
3331 {
3332 if (pg_log.get_tail() != info.log_tail)
3333 osd->clog->error() << info.pgid
3334 << " info mismatch, log.tail " << pg_log.get_tail()
3335 << " != info.log_tail " << info.log_tail;
3336 if (pg_log.get_head() != info.last_update)
3337 osd->clog->error() << info.pgid
3338 << " info mismatch, log.head " << pg_log.get_head()
3339 << " != info.last_update " << info.last_update;
3340
3341 if (!pg_log.get_log().empty()) {
3342 // sloppy check
3343 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3344 osd->clog->error() << info.pgid
3345 << " log bound mismatch, info (tail,head] ("
3346 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3347 << " actual ["
3348 << pg_log.get_log().log.begin()->version << ","
3349 << pg_log.get_log().log.rbegin()->version << "]";
3350 }
3351
3352 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3353 osd->clog->error() << info.pgid
3354 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3355 << " > log size " << pg_log.get_log().log.size();
3356 }
3357 }
3358
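// Keep the snap mapper consistent with the log: remove mappings for deletes, add them
// for clones/promotes, and update them for modifies.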
3359 void PG::update_snap_map(
3360 const vector<pg_log_entry_t> &log_entries,
3361 ObjectStore::Transaction &t)
3362 {
3363 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3364 i != log_entries.end();
3365 ++i) {
3366 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3367 if (i->soid.snap < CEPH_MAXSNAP) {
3368 if (i->is_delete()) {
3369 int r = snap_mapper.remove_oid(
3370 i->soid,
3371 &_t);
3372 assert(r == 0);
3373 } else if (i->is_update()) {
3374 assert(i->snaps.length() > 0);
3375 vector<snapid_t> snaps;
3376 bufferlist snapbl = i->snaps;
3377 bufferlist::iterator p = snapbl.begin();
3378 try {
3379 ::decode(snaps, p);
3380 } catch (...) {
3381 snaps.clear();
3382 }
3383 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3384
3385 if (i->is_clone() || i->is_promote()) {
3386 snap_mapper.add_oid(
3387 i->soid,
3388 _snaps,
3389 &_t);
3390 } else if (i->is_modify()) {
3391 assert(i->is_modify());
3392 int r = snap_mapper.update_snaps(
3393 i->soid,
3394 _snaps,
3395 0,
3396 &_t);
3397 assert(r == 0);
3398 } else {
3399 assert(i->is_clean());
3400 }
3401 }
3402 }
3403 }
3404 }
3405
3406 /**
3407 * filter trimming|trimmed snaps out of snapcontext
3408 */
3409 void PG::filter_snapc(vector<snapid_t> &snaps)
3410 {
3411 // nothing needs trimming, we can return immediately
3412 if(snap_trimq.empty() && info.purged_snaps.empty())
3413 return;
3414
3415 bool filtering = false;
3416 vector<snapid_t> newsnaps;
3417 for (vector<snapid_t>::iterator p = snaps.begin();
3418 p != snaps.end();
3419 ++p) {
3420 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3421 if (!filtering) {
3422 // start building a new vector with what we've seen so far
3423 dout(10) << "filter_snapc filtering " << snaps << dendl;
3424 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3425 filtering = true;
3426 }
3427 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3428 } else {
3429 if (filtering)
3430 newsnaps.push_back(*p); // continue building new vector
3431 }
3432 }
3433 if (filtering) {
3434 snaps.swap(newsnaps);
3435 dout(10) << "filter_snapc result " << snaps << dendl;
3436 }
3437 }
3438
3439 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3440 {
3441 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3442 it != m.end();
3443 ++it)
3444 requeue_ops(it->second);
3445 m.clear();
3446 }
3447
3448 void PG::requeue_op(OpRequestRef op)
3449 {
3450 auto p = waiting_for_map.find(op->get_source());
3451 if (p != waiting_for_map.end()) {
3452 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3453 << dendl;
3454 p->second.push_front(op);
3455 } else {
3456 dout(20) << __func__ << " " << op << dendl;
3457 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3458 }
3459 }
3460
3461 void PG::requeue_ops(list<OpRequestRef> &ls)
3462 {
3463 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3464 i != ls.rend();
3465 ++i) {
3466 auto p = waiting_for_map.find((*i)->get_source());
3467 if (p != waiting_for_map.end()) {
3468 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3469 << ")" << dendl;
3470 p->second.push_front(*i);
3471 } else {
3472 dout(20) << __func__ << " " << *i << dendl;
3473 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3474 }
3475 }
3476 ls.clear();
3477 }
3478
3479 void PG::requeue_map_waiters()
3480 {
3481 epoch_t epoch = get_osdmap()->get_epoch();
3482 auto p = waiting_for_map.begin();
3483 while (p != waiting_for_map.end()) {
3484 if (epoch < p->second.front()->min_epoch) {
3485 dout(20) << __func__ << " " << p->first << " front op "
3486 << p->second.front() << " must still wait, doing nothing"
3487 << dendl;
3488 ++p;
3489 } else {
3490 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3491 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3492 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3493 }
3494 p = waiting_for_map.erase(p);
3495 }
3496 }
3497 }
3498
3499
3500 // ==========================================================================================
3501 // SCRUB
3502
3503 /*
3504 * when holding pg and sched_scrub_lock, then the states are:
3505 * scheduling:
3506 * scrubber.reserved = true
3507 * scrubber.reserved_peers includes whoami
3508 * osd->scrub_pending++
3509 * scheduling, replica declined:
3510 * scrubber.reserved = true
3511 * scrubber.reserved_peers includes -1
3512 * osd->scrub_pending++
3513 * pending:
3514 * scrubber.reserved = true
3515 * scrubber.reserved_peers.size() == acting.size();
3516 * pg on scrub_wq
3517 * osd->scrub_pending++
3518 * scrubbing:
3519 * scrubber.reserved = false;
3520 * scrubber.reserved_peers empty
3521 * osd->scrubber.active++
3522 */
3523
3524 // returns true if a scrub has been newly kicked off
3525 bool PG::sched_scrub()
3526 {
3527 bool nodeep_scrub = false;
3528 assert(is_locked());
3529 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3530 return false;
3531 }
3532
3533 double deep_scrub_interval = 0;
3534 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3535 if (deep_scrub_interval <= 0) {
3536 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3537 }
3538 bool time_for_deep = ceph_clock_now() >=
3539 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3540
3541 bool deep_coin_flip = false;
3542 // Only add random deep scrubs when NOT a user-initiated scrub
3543 if (!scrubber.must_scrub)
3544 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3545 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3546
3547 time_for_deep = (time_for_deep || deep_coin_flip);
3548
3549 //NODEEP_SCRUB so ignore time-initiated deep-scrub
3550 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3551 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3552 time_for_deep = false;
3553 nodeep_scrub = true;
3554 }
3555
3556 if (!scrubber.must_scrub) {
3557 assert(!scrubber.must_deep_scrub);
3558
3559 //NOSCRUB so skip regular scrubs
3560 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3561 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3562 if (scrubber.reserved) {
3563 // cancel the scrub if it is still being scheduled,
3564 // so pgs from other pools where scrubs are still legal
3565 // have a chance to go ahead with scrubbing.
3566 clear_scrub_reserved();
3567 scrub_unreserve_replicas();
3568 }
3569 return false;
3570 }
3571 }
3572
3573 if (cct->_conf->osd_scrub_auto_repair
3574 && get_pgbackend()->auto_repair_supported()
3575 && time_for_deep
3576 // respect the command from the user, and do not do auto-repair
3577 && !scrubber.must_repair
3578 && !scrubber.must_scrub
3579 && !scrubber.must_deep_scrub) {
3580 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3581 scrubber.auto_repair = true;
3582 } else {
3583 // this happens when the user issues a scrub/repair command during
3584 // the scheduling of the scrub/repair (e.g. while requesting reservations)
3585 scrubber.auto_repair = false;
3586 }
3587
3588 bool ret = true;
3589 if (!scrubber.reserved) {
3590 assert(scrubber.reserved_peers.empty());
3591 if (osd->inc_scrubs_pending()) {
3592 dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl;
3593 scrubber.reserved = true;
3594 scrubber.reserved_peers.insert(pg_whoami);
3595 scrub_reserve_replicas();
3596 } else {
3597 dout(20) << "sched_scrub: failed to reserve locally" << dendl;
3598 ret = false;
3599 }
3600 }
3601 if (scrubber.reserved) {
3602 if (scrubber.reserve_failed) {
3603 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3604 clear_scrub_reserved();
3605 scrub_unreserve_replicas();
3606 ret = false;
3607 } else if (scrubber.reserved_peers.size() == acting.size()) {
3608 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3609 if (time_for_deep) {
3610 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3611 state_set(PG_STATE_DEEP_SCRUB);
3612 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3613 if (!nodeep_scrub) {
3614 osd->clog->info() << "osd." << osd->whoami
3615 << " pg " << info.pgid
3616 << " Deep scrub errors, upgrading scrub to deep-scrub";
3617 state_set(PG_STATE_DEEP_SCRUB);
3618 } else if (!scrubber.must_scrub) {
3619 osd->clog->error() << "osd." << osd->whoami
3620 << " pg " << info.pgid
3621 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3622 clear_scrub_reserved();
3623 scrub_unreserve_replicas();
3624 return false;
3625 } else {
3626 osd->clog->error() << "osd." << osd->whoami
3627 << " pg " << info.pgid
3628 << " Regular scrub request, deep-scrub details will be lost";
3629 }
3630 }
3631 queue_scrub();
3632 } else {
3633 // no replica has declined (reserve_failed is not set); still waiting for the remaining grants
3634 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3635 }
3636 }
3637
3638 return ret;
3639 }
3640
3641 void PG::reg_next_scrub()
3642 {
3643 if (!is_primary())
3644 return;
3645
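// a user-requested scrub (or one forced by invalid stats) is scheduled for "now";
// otherwise scheduling is keyed off the last scrub stamp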
3646 utime_t reg_stamp;
3647 if (scrubber.must_scrub ||
3648 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3649 reg_stamp = ceph_clock_now();
3650 } else {
3651 reg_stamp = info.history.last_scrub_stamp;
3652 }
3653 // note down the sched_time, so we can locate this scrub, and remove it
3654 // later on.
3655 double scrub_min_interval = 0, scrub_max_interval = 0;
3656 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3657 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3658 assert(scrubber.scrub_reg_stamp == utime_t());
3659 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3660 reg_stamp,
3661 scrub_min_interval,
3662 scrub_max_interval,
3663 scrubber.must_scrub);
3664 }
3665
3666 void PG::unreg_next_scrub()
3667 {
3668 if (is_primary()) {
3669 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3670 scrubber.scrub_reg_stamp = utime_t();
3671 }
3672 }
3673
3674 void PG::do_replica_scrub_map(OpRequestRef op)
3675 {
3676 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3677 dout(7) << __func__ << " " << *m << dendl;
3678 if (m->map_epoch < info.history.same_interval_since) {
3679 dout(10) << __func__ << " discarding old from "
3680 << m->map_epoch << " < " << info.history.same_interval_since
3681 << dendl;
3682 return;
3683 }
3684 if (!scrubber.is_chunky_scrub_active()) {
3685 dout(10) << __func__ << " scrub isn't active" << dendl;
3686 return;
3687 }
3688
3689 op->mark_started();
3690
3691 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3692 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3693 dout(10) << "map version is "
3694 << scrubber.received_maps[m->from].valid_through
3695 << dendl;
3696
3697 --scrubber.waiting_on;
3698 scrubber.waiting_on_whom.erase(m->from);
3699 if (scrubber.waiting_on == 0) {
3700 if (ops_blocked_by_scrub()) {
3701 requeue_scrub(true);
3702 } else {
3703 requeue_scrub(false);
3704 }
3705 }
3706 }
3707
3708 void PG::sub_op_scrub_map(OpRequestRef op)
3709 {
3710 // for legacy jewel compatibility only
3711 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3712 assert(m->get_type() == MSG_OSD_SUBOP);
3713 dout(7) << "sub_op_scrub_map" << dendl;
3714
3715 if (m->map_epoch < info.history.same_interval_since) {
3716 dout(10) << "sub_op_scrub discarding old sub_op from "
3717 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3718 return;
3719 }
3720
3721 if (!scrubber.is_chunky_scrub_active()) {
3722 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3723 return;
3724 }
3725
3726 op->mark_started();
3727
3728 dout(10) << " got " << m->from << " scrub map" << dendl;
3729 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3730
3731 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3732 dout(10) << "map version is "
3733 << scrubber.received_maps[m->from].valid_through
3734 << dendl;
3735
3736 --scrubber.waiting_on;
3737 scrubber.waiting_on_whom.erase(m->from);
3738
3739 if (scrubber.waiting_on == 0) {
3740 if (ops_blocked_by_scrub()) {
3741 requeue_scrub(true);
3742 } else {
3743 requeue_scrub(false);
3744 }
3745 }
3746 }
3747
3748 // send scrub v3 messages (chunky scrub)
3749 void PG::_request_scrub_map(
3750 pg_shard_t replica, eversion_t version,
3751 hobject_t start, hobject_t end,
3752 bool deep, uint32_t seed)
3753 {
3754 assert(replica != pg_whoami);
3755 dout(10) << "scrub requesting scrubmap from osd." << replica
3756 << " deep " << (int)deep << " seed " << seed << dendl;
3757 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3758 spg_t(info.pgid.pgid, replica.shard), version,
3759 get_osdmap()->get_epoch(),
3760 get_last_peering_reset(),
3761 start, end, deep, seed);
3762 // default priority, we want the rep scrub processed prior to any recovery
3763 // or client io messages (we are holding a lock!)
3764 osd->send_message_osd_cluster(
3765 replica.osd, repscrubop, get_osdmap()->get_epoch());
3766 }
3767
3768 void PG::handle_scrub_reserve_request(OpRequestRef op)
3769 {
3770 dout(7) << __func__ << " " << *op->get_req() << dendl;
3771 op->mark_started();
3772 if (scrubber.reserved) {
3773 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3774 << dendl;
3775 return;
3776 }
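// try to take a local scrub reservation slot on this replica; the result is
// reported back to the primary as GRANT or REJECT (or encoded in the reply
// for pre-luminous peers below)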
3777 scrubber.reserved = osd->inc_scrubs_pending();
3778 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3779 const MOSDScrubReserve *m =
3780 static_cast<const MOSDScrubReserve*>(op->get_req());
3781 Message *reply = new MOSDScrubReserve(
3782 spg_t(info.pgid.pgid, primary.shard),
3783 m->map_epoch,
3784 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3785 pg_whoami);
3786 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3787 } else {
3788 // for jewel compat only
3789 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3790 assert(req->get_type() == MSG_OSD_SUBOP);
3791 MOSDSubOpReply *reply = new MOSDSubOpReply(
3792 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3793 ::encode(scrubber.reserved, reply->get_data());
3794 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3795 }
3796 }
3797
3798 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3799 {
3800 dout(7) << __func__ << " " << *op->get_req() << dendl;
3801 op->mark_started();
3802 if (!scrubber.reserved) {
3803 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3804 return;
3805 }
3806 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3807 dout(10) << " already had osd." << from << " reserved" << dendl;
3808 } else {
3809 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3810 scrubber.reserved_peers.insert(from);
3811 sched_scrub();
3812 }
3813 }
3814
3815 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3816 {
3817 dout(7) << __func__ << " " << *op->get_req() << dendl;
3818 op->mark_started();
3819 if (!scrubber.reserved) {
3820 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3821 return;
3822 }
3823 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3824 dout(10) << " already had osd." << from << " reserved" << dendl;
3825 } else {
3826 /* One decline stops this pg from being scheduled for scrubbing. */
3827 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3828 scrubber.reserve_failed = true;
3829 sched_scrub();
3830 }
3831 }
3832
3833 void PG::handle_scrub_reserve_release(OpRequestRef op)
3834 {
3835 dout(7) << __func__ << " " << *op->get_req() << dendl;
3836 op->mark_started();
3837 clear_scrub_reserved();
3838 }
3839
3840 void PG::reject_reservation()
3841 {
3842 osd->send_message_osd_cluster(
3843 primary.osd,
3844 new MBackfillReserve(
3845 MBackfillReserve::REJECT,
3846 spg_t(info.pgid.pgid, primary.shard),
3847 get_osdmap()->get_epoch()),
3848 get_osdmap()->get_epoch());
3849 }
3850
3851 void PG::schedule_backfill_full_retry()
3852 {
3853 Mutex::Locker lock(osd->recovery_request_lock);
3854 osd->recovery_request_timer.add_event_after(
3855 cct->_conf->osd_backfill_retry_interval,
3856 new QueuePeeringEvt<RequestBackfill>(
3857 this, get_osdmap()->get_epoch(),
3858 RequestBackfill()));
3859 }
3860
3861 void PG::schedule_recovery_full_retry()
3862 {
3863 Mutex::Locker lock(osd->recovery_request_lock);
3864 osd->recovery_request_timer.add_event_after(
3865 cct->_conf->osd_recovery_retry_interval,
3866 new QueuePeeringEvt<DoRecovery>(
3867 this, get_osdmap()->get_epoch(),
3868 DoRecovery()));
3869 }
3870
3871 void PG::clear_scrub_reserved()
3872 {
3873 scrubber.reserved_peers.clear();
3874 scrubber.reserve_failed = false;
3875
3876 if (scrubber.reserved) {
3877 scrubber.reserved = false;
3878 osd->dec_scrubs_pending();
3879 }
3880 }
3881
3882 void PG::scrub_reserve_replicas()
3883 {
3884 assert(backfill_targets.empty());
3885 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3886 i != actingbackfill.end();
3887 ++i) {
3888 if (*i == pg_whoami) continue;
3889 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3890 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3891 osd->send_message_osd_cluster(
3892 i->osd,
3893 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3894 get_osdmap()->get_epoch(),
3895 MOSDScrubReserve::REQUEST, pg_whoami),
3896 get_osdmap()->get_epoch());
3897 } else {
3898 // for jewel compat only
3899 vector<OSDOp> scrub(1);
3900 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
3901 hobject_t poid;
3902 eversion_t v;
3903 osd_reqid_t reqid;
3904 MOSDSubOp *subop = new MOSDSubOp(
3905 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3906 get_osdmap()->get_epoch(), osd->get_tid(), v);
3907 subop->ops = scrub;
3908 osd->send_message_osd_cluster(
3909 i->osd, subop, get_osdmap()->get_epoch());
3910 }
3911 }
3912 }
3913
3914 void PG::scrub_unreserve_replicas()
3915 {
3916 assert(backfill_targets.empty());
3917 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3918 i != actingbackfill.end();
3919 ++i) {
3920 if (*i == pg_whoami) continue;
3921 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
3922 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3923 osd->send_message_osd_cluster(
3924 i->osd,
3925 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3926 get_osdmap()->get_epoch(),
3927 MOSDScrubReserve::RELEASE, pg_whoami),
3928 get_osdmap()->get_epoch());
3929 } else {
3930 // for jewel compat only
3931 vector<OSDOp> scrub(1);
3932 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
3933 hobject_t poid;
3934 eversion_t v;
3935 osd_reqid_t reqid;
3936 MOSDSubOp *subop = new MOSDSubOp(
3937 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3938 get_osdmap()->get_epoch(), osd->get_tid(), v);
3939 subop->ops = scrub;
3940 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
3941 }
3942 }
3943 }
3944
3945 void PG::_scan_rollback_obs(
3946 const vector<ghobject_t> &rollback_obs,
3947 ThreadPool::TPHandle &handle)
3948 {
3949 ObjectStore::Transaction t;
3950 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
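// rollback objects whose generation predates trimmed_to can never be needed
// for a rollback again; treat them as stale leftovers and remove them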
3951 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
3952 i != rollback_obs.end();
3953 ++i) {
3954 if (i->generation < trimmed_to.version) {
3955 osd->clog->error() << "osd." << osd->whoami
3956 << " pg " << info.pgid
3957 << " found obsolete rollback obj "
3958 << *i << " generation < trimmed_to "
3959 << trimmed_to
3960 << "...repaired";
3961 t.remove(coll, *i);
3962 }
3963 }
3964 if (!t.empty()) {
3965 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
3966 << dendl;
3967 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3968 }
3969 }
3970
3971 void PG::_scan_snaps(ScrubMap &smap)
3972 {
3973 hobject_t head;
3974 SnapSet snapset;
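// iterate the map in reverse so each head/snapdir (which carries the SnapSet)
// is seen before the clones that depend on it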
3975 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
3976 i != smap.objects.rend();
3977 ++i) {
3978 const hobject_t &hoid = i->first;
3979 ScrubMap::object &o = i->second;
3980
3981 if (hoid.is_head() || hoid.is_snapdir()) {
3982 // parse the SnapSet
3983 bufferlist bl;
3984 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
3985 continue;
3986 }
3987 bl.push_back(o.attrs[SS_ATTR]);
3988 auto p = bl.begin();
3989 try {
3990 ::decode(snapset, p);
3991 } catch(...) {
3992 continue;
3993 }
3994 head = hoid.get_head();
3995 continue;
3996 }
3997 if (hoid.snap < CEPH_MAXSNAP) {
3998 // check and if necessary fix snap_mapper
3999 if (hoid.get_head() != head) {
4000 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4001 << dendl;
4002 continue;
4003 }
4004 set<snapid_t> obj_snaps;
4005 if (!snapset.is_legacy()) {
4006 auto p = snapset.clone_snaps.find(hoid.snap);
4007 if (p == snapset.clone_snaps.end()) {
4008 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4009 << dendl;
4010 continue;
4011 }
4012 obj_snaps.insert(p->second.begin(), p->second.end());
4013 } else {
4014 bufferlist bl;
4015 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4016 continue;
4017 }
4018 bl.push_back(o.attrs[OI_ATTR]);
4019 object_info_t oi;
4020 try {
4021 oi.decode(bl);
4022 } catch(...) {
4023 continue;
4024 }
4025 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4026 }
4027 set<snapid_t> cur_snaps;
4028 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4029 if (r != 0 && r != -ENOENT) {
4030 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4031 ceph_abort();
4032 }
4033 if (r == -ENOENT || cur_snaps != obj_snaps) {
4034 ObjectStore::Transaction t;
4035 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4036 if (r == 0) {
4037 r = snap_mapper.remove_oid(hoid, &_t);
4038 if (r != 0) {
4039 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4040 << dendl;
4041 ceph_abort();
4042 }
4043 osd->clog->error() << "osd." << osd->whoami
4044 << " found snap mapper error on pg "
4045 << info.pgid
4046 << " oid " << hoid << " snaps in mapper: "
4047 << cur_snaps << ", oi: "
4048 << obj_snaps
4049 << "...repaired";
4050 } else {
4051 osd->clog->error() << "osd." << osd->whoami
4052 << " found snap mapper error on pg "
4053 << info.pgid
4054 << " oid " << hoid << " snaps missing in mapper"
4055 << ", should be: "
4056 << obj_snaps
4057 << "...repaired";
4058 }
4059 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4060 r = osd->store->apply_transaction(osr.get(), std::move(t));
4061 if (r != 0) {
4062 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4063 << dendl;
4064 }
4065 }
4066 }
4067 }
4068 }
4069
4070 void PG::_repair_oinfo_oid(ScrubMap &smap)
4071 {
4072 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4073 i != smap.objects.rend();
4074 ++i) {
4075 const hobject_t &hoid = i->first;
4076 ScrubMap::object &o = i->second;
4077
4078 bufferlist bl;
4079 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4080 continue;
4081 }
4082 bl.push_back(o.attrs[OI_ATTR]);
4083 object_info_t oi;
4084 try {
4085 oi.decode(bl);
4086 } catch(...) {
4087 continue;
4088 }
4089 if (oi.soid != hoid) {
4090 ObjectStore::Transaction t;
4091 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4092 osd->clog->error() << "osd." << osd->whoami
4093 << " found object info error on pg "
4094 << info.pgid
4095 << " oid " << hoid << " oid in object info: "
4096 << oi.soid
4097 << "...repaired";
4098 // Fix object info
4099 oi.soid = hoid;
4100 bl.clear();
4101 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4102
4103 bufferptr bp(bl.c_str(), bl.length());
4104 o.attrs[OI_ATTR] = bp;
4105
4106 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4107 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4108 if (r != 0) {
4109 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4110 << dendl;
4111 }
4112 }
4113 }
4114 }
4115
4116 /*
4117 * build a scrub map over a chunk without releasing the lock
4118 * only used by chunky scrub
4119 */
4120 int PG::build_scrub_map_chunk(
4121 ScrubMap &map,
4122 hobject_t start, hobject_t end, bool deep, uint32_t seed,
4123 ThreadPool::TPHandle &handle)
4124 {
4125 dout(10) << __func__ << " [" << start << "," << end << ") "
4126 << " seed " << seed << dendl;
4127
4128 map.valid_through = info.last_update;
4129
4130 // objects
4131 vector<hobject_t> ls;
4132 vector<ghobject_t> rollback_obs;
4133 int ret = get_pgbackend()->objects_list_range(
4134 start,
4135 end,
4136 0,
4137 &ls,
4138 &rollback_obs);
4139 if (ret < 0) {
4140 dout(5) << "objects_list_range error: " << ret << dendl;
4141 return ret;
4142 }
4143
4144
4145 get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4146 _scan_rollback_obs(rollback_obs, handle);
4147 _scan_snaps(map);
4148 _repair_oinfo_oid(map);
4149
4150 dout(20) << __func__ << " done" << dendl;
4151 return 0;
4152 }
4153
4154 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4155 if (!store)
4156 return;
4157 struct OnComplete : Context {
4158 std::unique_ptr<Scrub::Store> store;
4159 OnComplete(
4160 std::unique_ptr<Scrub::Store> &&store)
4161 : store(std::move(store)) {}
4162 void finish(int) override {}
4163 };
4164 store->cleanup(t);
4165 t->register_on_complete(new OnComplete(std::move(store)));
4166 assert(!store);
4167 }
4168
4169 void PG::repair_object(
4170 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4171 pg_shard_t bad_peer)
4172 {
4173 list<pg_shard_t> op_shards;
4174 for (auto i : *ok_peers) {
4175 op_shards.push_back(i.second);
4176 }
4177 dout(10) << "repair_object " << soid << " bad_peer osd."
4178 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
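// decode the object_info from the last known-good peer; its version is what
// gets recorded in the missing sets below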
4179 ScrubMap::object &po = ok_peers->back().first;
4180 eversion_t v;
4181 bufferlist bv;
4182 bv.push_back(po.attrs[OI_ATTR]);
4183 object_info_t oi;
4184 try {
4185 bufferlist::iterator bliter = bv.begin();
4186 ::decode(oi, bliter);
4187 } catch (...) {
4188 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4189 assert(0);
4190 }
4191 if (bad_peer != primary) {
4192 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4193 } else {
4194 // We should only be scrubbing if the PG is clean.
4195 assert(waiting_for_unreadable_object.empty());
4196
4197 pg_log.missing_add(soid, oi.version, eversion_t());
4198
4199 pg_log.set_last_requested(0);
4200 dout(10) << __func__ << ": primary = " << primary << dendl;
4201 }
4202
4203 if (is_ec_pg() || bad_peer == primary) {
4204 // we'd better collect all shards for an EC pg, and prepare the good peers as the
4205 // source of a pull in the case of a replicated pg.
4206 missing_loc.add_missing(soid, oi.version, eversion_t());
4207 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4208 for (i = ok_peers->begin();
4209 i != ok_peers->end();
4210 ++i)
4211 missing_loc.add_location(soid, i->second);
4212 }
4213 }
4214
4215 /* replica_scrub
4216 *
4217 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4218 * for pushes to complete in case of recent recovery. Build a single
4219 * scrubmap of objects that are in the range [msg->start, msg->end).
4220 */
4221 void PG::replica_scrub(
4222 OpRequestRef op,
4223 ThreadPool::TPHandle &handle)
4224 {
4225 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4226 assert(!scrubber.active_rep_scrub);
4227 dout(7) << "replica_scrub" << dendl;
4228
4229 if (msg->map_epoch < info.history.same_interval_since) {
4230 dout(10) << "replica_scrub discarding old replica_scrub from "
4231 << msg->map_epoch << " < " << info.history.same_interval_since
4232 << dendl;
4233 return;
4234 }
4235
4236 ScrubMap map;
4237
4238 assert(msg->chunky);
4239 if (last_update_applied < msg->scrub_to) {
4240 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4241 scrubber.active_rep_scrub = op;
4242 return;
4243 }
4244
4245 if (active_pushes > 0) {
4246 dout(10) << "waiting for active pushes to finish" << dendl;
4247 scrubber.active_rep_scrub = op;
4248 return;
4249 }
4250
4251 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4252 hobject_t start = msg->start;
4253 hobject_t end = msg->end;
4254 if (!start.is_max())
4255 start.pool = info.pgid.pool();
4256 if (!end.is_max())
4257 end.pool = info.pgid.pool();
4258
4259 build_scrub_map_chunk(
4260 map, start, end, msg->deep, msg->seed,
4261 handle);
4262
4263 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4264 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4265 spg_t(info.pgid.pgid, get_primary().shard),
4266 msg->map_epoch,
4267 pg_whoami);
4268 ::encode(map, reply->get_data());
4269 osd->send_message_osd_cluster(reply, msg->get_connection());
4270 } else {
4271 // for jewel compatibility
4272 vector<OSDOp> scrub(1);
4273 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4274 hobject_t poid;
4275 eversion_t v;
4276 osd_reqid_t reqid;
4277 MOSDSubOp *subop = new MOSDSubOp(
4278 reqid,
4279 pg_whoami,
4280 spg_t(info.pgid.pgid, get_primary().shard),
4281 poid,
4282 0,
4283 msg->map_epoch,
4284 osd->get_tid(),
4285 v);
4286 ::encode(map, subop->get_data());
4287 subop->ops = scrub;
4288 osd->send_message_osd_cluster(subop, msg->get_connection());
4289 }
4290 }
4291
4292 /* Scrub:
4293 * PG_STATE_SCRUBBING is set when the scrub is queued
4294 *
4295 * scrub will be chunky if all OSDs in PG support chunky scrub
4296 * scrub will fail if OSDs are too old.
4297 */
4298 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4299 {
4300 if (cct->_conf->osd_scrub_sleep > 0 &&
4301 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4302 scrubber.state == PG::Scrubber::INACTIVE) &&
4303 scrubber.needs_sleep) {
4304 ceph_assert(!scrubber.sleeping);
4305 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4306
4307 // Do an async sleep so we don't block the op queue
4308 OSDService *osds = osd;
4309 spg_t pgid = get_pgid();
4310 int state = scrubber.state;
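// capture only plain values here; the callback runs after the lock is dropped
// and must re-look-up and re-lock the PG, which may have changed or gone away
// while we slept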
4311 auto scrub_requeue_callback =
4312 new FunctionContext([osds, pgid, state](int r) {
4313 PG *pg = osds->osd->lookup_lock_pg(pgid);
4314 if (pg == nullptr) {
4315 lgeneric_dout(osds->osd->cct, 20)
4316 << "scrub_requeue_callback: Could not find "
4317 << "PG " << pgid << " can't complete scrub requeue after sleep"
4318 << dendl;
4319 return;
4320 }
4321 pg->scrubber.sleeping = false;
4322 pg->scrubber.needs_sleep = false;
4323 lgeneric_dout(pg->cct, 20)
4324 << "scrub_requeue_callback: slept for "
4325 << ceph_clock_now() - pg->scrubber.sleep_start
4326 << ", re-queuing scrub with state " << state << dendl;
4327 pg->scrub_queued = false;
4328 pg->requeue_scrub();
4329 pg->scrubber.sleep_start = utime_t();
4330 pg->unlock();
4331 });
4332 Mutex::Locker l(osd->scrub_sleep_lock);
4333 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4334 scrub_requeue_callback);
4335 scrubber.sleeping = true;
4336 scrubber.sleep_start = ceph_clock_now();
4337 return;
4338 }
4339 if (pg_has_reset_since(queued)) {
4340 return;
4341 }
4342 assert(scrub_queued);
4343 scrub_queued = false;
4344 scrubber.needs_sleep = true;
4345
4346 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4347 dout(10) << "scrub -- not primary or active or not clean" << dendl;
4348 state_clear(PG_STATE_SCRUBBING);
4349 state_clear(PG_STATE_REPAIR);
4350 state_clear(PG_STATE_DEEP_SCRUB);
4351 publish_stats_to_osd();
4352 return;
4353 }
4354
4355 if (!scrubber.active) {
4356 assert(backfill_targets.empty());
4357
4358 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4359
4360 dout(10) << "starting a new chunky scrub" << dendl;
4361 }
4362
4363 chunky_scrub(handle);
4364 }
4365
4366 /*
4367 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4368 * chunk.
4369 *
4370 * The object store is partitioned into chunks which end on hash boundaries. For
4371 * each chunk, the following logic is performed:
4372 *
4373 * (1) Block writes on the chunk
4374 * (2) Request maps from replicas
4375 * (3) Wait for pushes to be applied (after recovery)
4376 * (4) Wait for writes to flush on the chunk
4377 * (5) Wait for maps from replicas
4378 * (6) Compare / repair all scrub maps
4379 * (7) Wait for digest updates to apply
4380 *
4381 * This logic is encoded in the mostly linear state machine:
4382 *
4383 * +------------------+
4384 * _________v__________ |
4385 * | | |
4386 * | INACTIVE | |
4387 * |____________________| |
4388 * | |
4389 * | +----------+ |
4390 * _________v___v______ | |
4391 * | | | |
4392 * | NEW_CHUNK | | |
4393 * |____________________| | |
4394 * | | |
4395 * _________v__________ | |
4396 * | | | |
4397 * | WAIT_PUSHES | | |
4398 * |____________________| | |
4399 * | | |
4400 * _________v__________ | |
4401 * | | | |
4402 * | WAIT_LAST_UPDATE | | |
4403 * |____________________| | |
4404 * | | |
4405 * _________v__________ | |
4406 * | | | |
4407 * | BUILD_MAP | | |
4408 * |____________________| | |
4409 * | | |
4410 * _________v__________ | |
4411 * | | | |
4412 * | WAIT_REPLICAS | | |
4413 * |____________________| | |
4414 * | | |
4415 * _________v__________ | |
4416 * | | | |
4417 * | COMPARE_MAPS | | |
4418 * |____________________| | |
4419 * | | |
4420 * | | |
4421 * _________v__________ | |
4422 * | | | |
4423 * |WAIT_DIGEST_UPDATES | | |
4424 * |____________________| | |
4425 * | | | |
4426 * | +----------+ |
4427 * _________v__________ |
4428 * | | |
4429 * | FINISH | |
4430 * |____________________| |
4431 * | |
4432 * +------------------+
4433 *
4434 * The primary determines the last update from the subset by walking the log. If
4435 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4436 * to wait until that update is applied before building a scrub map. Both the
4437 * primary and replicas will wait for any active pushes to be applied.
4438 *
4439 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4440 *
4441 * scrubber.state encodes the current state of the scrub (refer to state diagram
4442 * for details).
4443 */
4444 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4445 {
4446 // check for map changes
4447 if (scrubber.is_chunky_scrub_active()) {
4448 if (scrubber.epoch_start != info.history.same_interval_since) {
4449 dout(10) << "scrub pg changed, aborting" << dendl;
4450 scrub_clear_state();
4451 scrub_unreserve_replicas();
4452 return;
4453 }
4454 }
4455
4456 bool done = false;
4457 int ret;
4458
4459 while (!done) {
4460 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4461 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4462
4463 switch (scrubber.state) {
4464 case PG::Scrubber::INACTIVE:
4465 dout(10) << "scrub start" << dendl;
4466
4467 publish_stats_to_osd();
4468 scrubber.epoch_start = info.history.same_interval_since;
4469 scrubber.active = true;
4470
4471 osd->inc_scrubs_active(scrubber.reserved);
4472 if (scrubber.reserved) {
4473 scrubber.reserved = false;
4474 scrubber.reserved_peers.clear();
4475 }
4476
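// (re)create the persistent scrub store used to record inconsistencies found
// during this scrub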
4477 {
4478 ObjectStore::Transaction t;
4479 scrubber.cleanup_store(&t);
4480 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4481 info.pgid, coll));
4482 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4483 }
4484
4485 // Don't include temporary objects when scrubbing
4486 scrubber.start = info.pgid.pgid.get_hobj_start();
4487 scrubber.state = PG::Scrubber::NEW_CHUNK;
4488
4489 {
4490 bool repair = state_test(PG_STATE_REPAIR);
4491 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4492 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4493 stringstream oss;
4494 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4495 osd->clog->info(oss);
4496 }
4497
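// a single seed value is shared with the replicas (see _request_scrub_map)
// so the primary and replica scrub maps for a chunk are built consistently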
4498 scrubber.seed = -1;
4499
4500 break;
4501
4502 case PG::Scrubber::NEW_CHUNK:
4503 scrubber.primary_scrubmap = ScrubMap();
4504 scrubber.received_maps.clear();
4505
4506 {
4507 /* get the start and end of our scrub chunk
4508 *
4509 * Our scrub chunk has an important restriction we're going to need to
4510 * respect. We can't let head or snapdir be start or end.
4511 * Using a half-open interval means that if end == head|snapdir,
4512 * we'd scrub/lock head and the clone right next to head in different
4513 * chunks which would allow us to miss clones created between
4514 * scrubbing that chunk and scrubbing the chunk including head.
4515 * This isn't true for any of the other clones since clones can
4516 * only be created "just to the left of" head. There is one exception
4517 * to this: promotion of clones which always happens to the left of the
4518 * left-most clone, but promote_object checks the scrubber in that
4519 * case, so it should be ok. Also, it's ok to "miss" clones at the
4520 * left end of the range if we are a tier because they may legitimately
4521 * not exist (see _scrub).
4522 */
4523 int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4524 hobject_t start = scrubber.start;
4525 hobject_t candidate_end;
4526 vector<hobject_t> objects;
4527 ret = get_pgbackend()->objects_list_partial(
4528 start,
4529 min,
4530 MAX(min, cct->_conf->osd_scrub_chunk_max),
4531 &objects,
4532 &candidate_end);
4533 assert(ret >= 0);
4534
4535 if (!objects.empty()) {
4536 hobject_t back = objects.back();
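// walk candidate_end back past any objects that share its head, so a head
// and its clones never straddle a chunk boundary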
4537 while (candidate_end.has_snapset() &&
4538 candidate_end.get_head() == back.get_head()) {
4539 candidate_end = back;
4540 objects.pop_back();
4541 if (objects.empty()) {
4542 assert(0 ==
4543 "Somehow we got more than 2 objects which"
4544 "have the same head but are not clones");
4545 }
4546 back = objects.back();
4547 }
4548 if (candidate_end.has_snapset()) {
4549 assert(candidate_end.get_head() != back.get_head());
4550 candidate_end = candidate_end.get_object_boundary();
4551 }
4552 } else {
4553 assert(candidate_end.is_max());
4554 }
4555
4556 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4557 // we'll be requeued by whatever made us unavailable for scrub
4558 dout(10) << __func__ << ": scrub blocked somewhere in range "
4559 << "[" << scrubber.start << ", " << candidate_end << ")"
4560 << dendl;
4561 done = true;
4562 break;
4563 }
4564 scrubber.end = candidate_end;
4565 }
4566
4567 // walk the log to find the latest update that affects our chunk
4568 scrubber.subset_last_update = eversion_t();
4569 for (auto p = projected_log.log.rbegin();
4570 p != projected_log.log.rend();
4571 ++p) {
4572 if (p->soid >= scrubber.start &&
4573 p->soid < scrubber.end) {
4574 scrubber.subset_last_update = p->version;
4575 break;
4576 }
4577 }
4578 if (scrubber.subset_last_update == eversion_t()) {
4579 for (list<pg_log_entry_t>::const_reverse_iterator p =
4580 pg_log.get_log().log.rbegin();
4581 p != pg_log.get_log().log.rend();
4582 ++p) {
4583 if (p->soid >= scrubber.start &&
4584 p->soid < scrubber.end) {
4585 scrubber.subset_last_update = p->version;
4586 break;
4587 }
4588 }
4589 }
4590
4591 // ask replicas to wait until
4592 // last_update_applied >= scrubber.subset_last_update and then scan
4593 scrubber.waiting_on_whom.insert(pg_whoami);
4594 ++scrubber.waiting_on;
4595
4596 // request maps from replicas
4597 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4598 i != actingbackfill.end();
4599 ++i) {
4600 if (*i == pg_whoami) continue;
4601 _request_scrub_map(*i, scrubber.subset_last_update,
4602 scrubber.start, scrubber.end, scrubber.deep,
4603 scrubber.seed);
4604 scrubber.waiting_on_whom.insert(*i);
4605 ++scrubber.waiting_on;
4606 }
4607
4608 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4609
4610 break;
4611
4612 case PG::Scrubber::WAIT_PUSHES:
4613 if (active_pushes == 0) {
4614 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4615 } else {
4616 dout(15) << "wait for pushes to apply" << dendl;
4617 done = true;
4618 }
4619 break;
4620
4621 case PG::Scrubber::WAIT_LAST_UPDATE:
4622 if (last_update_applied >= scrubber.subset_last_update) {
4623 scrubber.state = PG::Scrubber::BUILD_MAP;
4624 } else {
4625 // will be requeued by op_applied
4626 dout(15) << "wait for writes to flush" << dendl;
4627 done = true;
4628 }
4629 break;
4630
4631 case PG::Scrubber::BUILD_MAP:
4632 assert(last_update_applied >= scrubber.subset_last_update);
4633
4634 // build my own scrub map
4635 ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4636 scrubber.start, scrubber.end,
4637 scrubber.deep, scrubber.seed,
4638 handle);
4639 if (ret < 0) {
4640 dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4641 scrub_clear_state();
4642 scrub_unreserve_replicas();
4643 return;
4644 }
4645
4646 --scrubber.waiting_on;
4647 scrubber.waiting_on_whom.erase(pg_whoami);
4648
4649 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4650 break;
4651
4652 case PG::Scrubber::WAIT_REPLICAS:
4653 if (scrubber.waiting_on > 0) {
4654 // will be requeued by sub_op_scrub_map
4655 dout(10) << "wait for replicas to build scrub map" << dendl;
4656 done = true;
4657 } else {
4658 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4659 }
4660 break;
4661
4662 case PG::Scrubber::COMPARE_MAPS:
4663 assert(last_update_applied >= scrubber.subset_last_update);
4664 assert(scrubber.waiting_on == 0);
4665
4666 scrub_compare_maps();
4667 scrubber.start = scrubber.end;
4668 scrubber.run_callbacks();
4669
4670 // requeue the writes from the chunk that just finished
4671 requeue_ops(waiting_for_scrub);
4672
4673 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4674
4675 // fall-thru
4676
4677 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4678 if (scrubber.num_digest_updates_pending) {
4679 dout(10) << __func__ << " waiting on "
4680 << scrubber.num_digest_updates_pending
4681 << " digest updates" << dendl;
4682 done = true;
4683 break;
4684 }
4685
4686 if (!(scrubber.end.is_max())) {
4687 scrubber.state = PG::Scrubber::NEW_CHUNK;
4688 requeue_scrub();
4689 done = true;
4690 } else {
4691 scrubber.state = PG::Scrubber::FINISH;
4692 }
4693
4694 break;
4695
4696 case PG::Scrubber::FINISH:
4697 scrub_finish();
4698 scrubber.state = PG::Scrubber::INACTIVE;
4699 done = true;
4700
4701 if (!snap_trimq.empty()) {
4702 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4703 snap_trimmer_scrub_complete();
4704 }
4705
4706 break;
4707
4708 default:
4709 ceph_abort();
4710 }
4711 }
4712 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4713 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4714 }
4715
4716 void PG::scrub_clear_state()
4717 {
4718 assert(is_locked());
4719 state_clear(PG_STATE_SCRUBBING);
4720 state_clear(PG_STATE_REPAIR);
4721 state_clear(PG_STATE_DEEP_SCRUB);
4722 publish_stats_to_osd();
4723
4724 // active -> nothing.
4725 if (scrubber.active)
4726 osd->dec_scrubs_active();
4727
4728 requeue_ops(waiting_for_scrub);
4729
4730 scrubber.reset();
4731
4732 // type-specific state clear
4733 _scrub_clear_state();
4734 }
4735
4736 void PG::scrub_compare_maps()
4737 {
4738 dout(10) << __func__ << " has maps, analyzing" << dendl;
4739
4740 // construct authoritative scrub map for type specific scrubbing
4741 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4742 map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4743
4744 if (acting.size() > 1) {
4745 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
4746
4747 stringstream ss;
4748
4749 // Map from object with errors to good peer
4750 map<hobject_t, list<pg_shard_t>> authoritative;
4751 map<pg_shard_t, ScrubMap *> maps;
4752
4753 dout(2) << __func__ << " osd." << acting[0] << " has "
4754 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4755 maps[pg_whoami] = &scrubber.primary_scrubmap;
4756
4757 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4758 i != actingbackfill.end();
4759 ++i) {
4760 if (*i == pg_whoami) continue;
4761 dout(2) << __func__ << " replica " << *i << " has "
4762 << scrubber.received_maps[*i].objects.size()
4763 << " items" << dendl;
4764 maps[*i] = &scrubber.received_maps[*i];
4765 }
4766
4767 get_pgbackend()->be_compare_scrubmaps(
4768 maps,
4769 state_test(PG_STATE_REPAIR),
4770 scrubber.missing,
4771 scrubber.inconsistent,
4772 authoritative,
4773 missing_digest,
4774 scrubber.shallow_errors,
4775 scrubber.deep_errors,
4776 scrubber.store.get(),
4777 info.pgid, acting,
4778 ss);
4779 dout(2) << ss.str() << dendl;
4780
4781 if (!ss.str().empty()) {
4782 osd->clog->error(ss);
4783 }
4784
4785 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4786 i != authoritative.end();
4787 ++i) {
4788 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4789 for (list<pg_shard_t>::const_iterator j = i->second.begin();
4790 j != i->second.end();
4791 ++j) {
4792 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4793 }
4794 scrubber.authoritative.insert(
4795 make_pair(
4796 i->first,
4797 good_peers));
4798 }
4799
4800 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4801 i != authoritative.end();
4802 ++i) {
4803 scrubber.cleaned_meta_map.objects.erase(i->first);
4804 scrubber.cleaned_meta_map.objects.insert(
4805 *(maps[i->second.back()]->objects.find(i->first))
4806 );
4807 }
4808 }
4809
4810 ScrubMap for_meta_scrub;
4811 if (scrubber.end.is_max() ||
4812 scrubber.cleaned_meta_map.objects.empty()) {
4813 scrubber.cleaned_meta_map.swap(for_meta_scrub);
4814 } else {
4815 auto iter = scrubber.cleaned_meta_map.objects.end();
4816 --iter; // not empty, see the if clause above
4817 auto begin = scrubber.cleaned_meta_map.objects.begin();
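// back up over the trailing objects that share the last object's head; they
// stay in cleaned_meta_map for the next chunk, while everything before them
// is handed to the metadata scrub now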
4818 while (iter != begin) {
4819 auto next = iter--;
4820 if (next->first.get_head() != iter->first.get_head()) {
4821 ++iter;
4822 break;
4823 }
4824 }
4825 for_meta_scrub.objects.insert(begin, iter);
4826 scrubber.cleaned_meta_map.objects.erase(begin, iter);
4827 }
4828
4829 // ok, do the pg-type specific scrubbing
4830 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4831 if (!scrubber.store->empty()) {
4832 if (state_test(PG_STATE_REPAIR)) {
4833 dout(10) << __func__ << ": discarding scrub results" << dendl;
4834 scrubber.store->flush(nullptr);
4835 } else {
4836 dout(10) << __func__ << ": updating scrub object" << dendl;
4837 ObjectStore::Transaction t;
4838 scrubber.store->flush(&t);
4839 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4840 }
4841 }
4842 }
4843
4844 bool PG::scrub_process_inconsistent()
4845 {
4846 dout(10) << __func__ << ": checking authoritative" << dendl;
4847 bool repair = state_test(PG_STATE_REPAIR);
4848 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4849 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4850
4851 // scrubber.authoritative only stores objects which are missing or inconsistent.
4852 if (!scrubber.authoritative.empty()) {
4853 stringstream ss;
4854 ss << info.pgid << " " << mode << " "
4855 << scrubber.missing.size() << " missing, "
4856 << scrubber.inconsistent.size() << " inconsistent objects";
4857 dout(2) << ss.str() << dendl;
4858 osd->clog->error(ss);
4859 if (repair) {
4860 state_clear(PG_STATE_CLEAN);
4861 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4862 scrubber.authoritative.begin();
4863 i != scrubber.authoritative.end();
4864 ++i) {
4865 set<pg_shard_t>::iterator j;
4866
4867 auto missing_entry = scrubber.missing.find(i->first);
4868 if (missing_entry != scrubber.missing.end()) {
4869 for (j = missing_entry->second.begin();
4870 j != missing_entry->second.end();
4871 ++j) {
4872 repair_object(
4873 i->first,
4874 &(i->second),
4875 *j);
4876 ++scrubber.fixed;
4877 }
4878 }
4879 if (scrubber.inconsistent.count(i->first)) {
4880 for (j = scrubber.inconsistent[i->first].begin();
4881 j != scrubber.inconsistent[i->first].end();
4882 ++j) {
4883 repair_object(i->first,
4884 &(i->second),
4885 *j);
4886 ++scrubber.fixed;
4887 }
4888 }
4889 }
4890 }
4891 }
4892 return (!scrubber.authoritative.empty() && repair);
4893 }
4894
4895 bool PG::ops_blocked_by_scrub() const {
4896 return (waiting_for_scrub.size() != 0);
4897 }
4898
4899 // the part that actually finalizes a scrub
4900 void PG::scrub_finish()
4901 {
4902 bool repair = state_test(PG_STATE_REPAIR);
4903 // if the repair request comes from auto-repair and there are a large number
4904 // of errors, we would like to cancel the auto-repair
4905 if (repair && scrubber.auto_repair
4906 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
4907 state_clear(PG_STATE_REPAIR);
4908 repair = false;
4909 }
4910 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4911 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4912
4913 // type-specific finish (can tally more errors)
4914 _scrub_finish();
4915
4916 bool has_error = scrub_process_inconsistent();
4917
4918 {
4919 stringstream oss;
4920 oss << info.pgid.pgid << " " << mode << " ";
4921 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
4922 if (total_errors)
4923 oss << total_errors << " errors";
4924 else
4925 oss << "ok";
4926 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
4927 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
4928 << " remaining deep scrub error details lost)";
4929 if (repair)
4930 oss << ", " << scrubber.fixed << " fixed";
4931 if (total_errors)
4932 osd->clog->error(oss);
4933 else
4934 osd->clog->info(oss);
4935 }
4936
4937 // finish up
4938 unreg_next_scrub();
4939 utime_t now = ceph_clock_now();
4940 info.history.last_scrub = info.last_update;
4941 info.history.last_scrub_stamp = now;
4942 if (scrubber.deep) {
4943 info.history.last_deep_scrub = info.last_update;
4944 info.history.last_deep_scrub_stamp = now;
4945 }
4946 // Since we don't know which errors were fixed, we can only clear them
4947 // when every one has been fixed.
4948 if (repair) {
4949 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
4950 assert(deep_scrub);
4951 scrubber.shallow_errors = scrubber.deep_errors = 0;
4952 } else {
4953 // Deep scrub in order to get corrected error counts
4954 scrub_after_recovery = true;
4955 }
4956 }
4957 if (deep_scrub) {
4958 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
4959 info.history.last_clean_scrub_stamp = now;
4960 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4961 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
4962 } else {
4963 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4964 // XXX: a fresh last_clean_scrub_stamp doesn't mean the pg is free of
4965 // inconsistencies caused by deep-scrub errors
4966 if (scrubber.shallow_errors == 0)
4967 info.history.last_clean_scrub_stamp = now;
4968 }
4969 info.stats.stats.sum.num_scrub_errors =
4970 info.stats.stats.sum.num_shallow_scrub_errors +
4971 info.stats.stats.sum.num_deep_scrub_errors;
4972 reg_next_scrub();
4973
4974 {
4975 ObjectStore::Transaction t;
4976 dirty_info = true;
4977 write_if_dirty(t);
4978 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4979 assert(tr == 0);
4980 }
4981
4982
4983 if (has_error) {
4984 queue_peering_event(
4985 CephPeeringEvtRef(
4986 std::make_shared<CephPeeringEvt>(
4987 get_osdmap()->get_epoch(),
4988 get_osdmap()->get_epoch(),
4989 DoRecovery())));
4990 }
4991
4992 scrub_clear_state();
4993 scrub_unreserve_replicas();
4994
4995 if (is_active() && is_primary()) {
4996 share_pg_info();
4997 }
4998 }
4999
5000 void PG::share_pg_info()
5001 {
5002 dout(10) << "share_pg_info" << dendl;
5003
5004 // share new pg_info_t with replicas
5005 assert(!actingbackfill.empty());
5006 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5007 i != actingbackfill.end();
5008 ++i) {
5009 if (*i == pg_whoami) continue;
5010 pg_shard_t peer = *i;
5011 if (peer_info.count(peer)) {
5012 peer_info[peer].last_epoch_started = info.last_epoch_started;
5013 peer_info[peer].last_interval_started = info.last_interval_started;
5014 peer_info[peer].history.merge(info.history);
5015 }
5016 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5017 m->pg_list.push_back(
5018 make_pair(
5019 pg_notify_t(
5020 peer.shard, pg_whoami.shard,
5021 get_osdmap()->get_epoch(),
5022 get_osdmap()->get_epoch(),
5023 info),
5024 PastIntervals()));
5025 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5026 }
5027 }
5028
5029 bool PG::append_log_entries_update_missing(
5030 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5031 ObjectStore::Transaction &t)
5032 {
5033 assert(!entries.empty());
5034 assert(entries.begin()->version > info.last_update);
5035
5036 PGLogEntryHandler rollbacker{this, &t};
5037 bool invalidate_stats =
5038 pg_log.append_new_log_entries(info.last_backfill,
5039 info.last_backfill_bitwise,
5040 entries,
5041 &rollbacker);
5042 info.last_update = pg_log.get_head();
5043
5044 if (pg_log.get_missing().num_missing() == 0) {
5045 // advance last_complete since nothing else is missing!
5046 info.last_complete = info.last_update;
5047 }
5048
5049 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5050 dirty_info = true;
5051 write_if_dirty(t);
5052 return invalidate_stats;
5053 }
5054
5055
5056 void PG::merge_new_log_entries(
5057 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5058 ObjectStore::Transaction &t)
5059 {
5060 dout(10) << __func__ << " " << entries << dendl;
5061 assert(is_primary());
5062
5063 bool rebuild_missing = append_log_entries_update_missing(entries, t);
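// mirror the new entries into each peer's cached missing set and info so the
// primary's view of its replicas stays consistent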
5064 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5065 i != actingbackfill.end();
5066 ++i) {
5067 pg_shard_t peer(*i);
5068 if (peer == pg_whoami) continue;
5069 assert(peer_missing.count(peer));
5070 assert(peer_info.count(peer));
5071 pg_missing_t& pmissing(peer_missing[peer]);
5072 pg_info_t& pinfo(peer_info[peer]);
5073 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5074 pinfo.last_backfill,
5075 info.last_backfill_bitwise,
5076 entries,
5077 true,
5078 NULL,
5079 pmissing,
5080 NULL,
5081 this);
5082 pinfo.last_update = info.last_update;
5083 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5084 rebuild_missing = rebuild_missing || invalidate_stats;
5085 }
5086
5087 if (!rebuild_missing) {
5088 return;
5089 }
5090
5091 for (auto &&i: entries) {
5092 missing_loc.rebuild(
5093 i.soid,
5094 pg_whoami,
5095 actingbackfill,
5096 info,
5097 pg_log.get_missing(),
5098 peer_missing,
5099 peer_info);
5100 }
5101 }
5102
5103 void PG::update_history(const pg_history_t& new_history)
5104 {
5105 unreg_next_scrub();
5106 if (info.history.merge(new_history)) {
5107 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5108 dirty_info = true;
5109 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5110 dout(20) << __func__ << " clearing past_intervals" << dendl;
5111 past_intervals.clear();
5112 dirty_big_info = true;
5113 }
5114 }
5115 reg_next_scrub();
5116 }
5117
5118 void PG::fulfill_info(
5119 pg_shard_t from, const pg_query_t &query,
5120 pair<pg_shard_t, pg_info_t> &notify_info)
5121 {
5122 assert(from == primary);
5123 assert(query.type == pg_query_t::INFO);
5124
5125 // info
5126 dout(10) << "sending info" << dendl;
5127 notify_info = make_pair(from, info);
5128 }
5129
5130 void PG::fulfill_log(
5131 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5132 {
5133 dout(10) << "log request from " << from << dendl;
5134 assert(from == primary);
5135 assert(query.type != pg_query_t::INFO);
5136 ConnectionRef con = osd->get_con_osd_cluster(
5137 from.osd, get_osdmap()->get_epoch());
5138 if (!con) return;
5139
5140 MOSDPGLog *mlog = new MOSDPGLog(
5141 from.shard, pg_whoami.shard,
5142 get_osdmap()->get_epoch(),
5143 info, query_epoch);
5144 mlog->missing = pg_log.get_missing();
5145
5146 // primary -> other, when building master log
5147 if (query.type == pg_query_t::LOG) {
5148 dout(10) << " sending info+missing+log since " << query.since
5149 << dendl;
5150 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5151 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5152 << " when my log.tail is " << pg_log.get_tail()
5153 << ", sending full log instead";
5154 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5155 } else
5156 mlog->log.copy_after(pg_log.get_log(), query.since);
5157 }
5158 else if (query.type == pg_query_t::FULLLOG) {
5159 dout(10) << " sending info+missing+full log" << dendl;
5160 mlog->log = pg_log.get_log();
5161 }
5162
5163 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5164
5165 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5166 osd->send_message_osd_cluster(mlog, con.get());
5167 }
5168
5169 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5170 {
5171 bool changed = false;
5172 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5173 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5174 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5175 changed = true;
5176 }
5177 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5178 assert(pi);
5179 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5180 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5181 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5182 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5183 changed = true;
5184 }
5185 }
5186 if (changed) {
5187 info.history.last_epoch_marked_full = osdmap->get_epoch();
5188 dirty_info = true;
5189 }
5190 }
5191
5192 bool PG::should_restart_peering(
5193 int newupprimary,
5194 int newactingprimary,
5195 const vector<int>& newup,
5196 const vector<int>& newacting,
5197 OSDMapRef lastmap,
5198 OSDMapRef osdmap)
5199 {
5200 if (PastIntervals::is_new_interval(
5201 primary.osd,
5202 newactingprimary,
5203 acting,
5204 newacting,
5205 up_primary.osd,
5206 newupprimary,
5207 up,
5208 newup,
5209 osdmap,
5210 lastmap,
5211 info.pgid.pgid)) {
5212 dout(20) << "new interval newup " << newup
5213 << " newacting " << newacting << dendl;
5214 return true;
5215 } else {
5216 return false;
5217 }
5218 }
5219
5220 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5221 {
5222 if (last_peering_reset > reply_epoch ||
5223 last_peering_reset > query_epoch) {
5224 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5225 << " last_peering_reset " << last_peering_reset
5226 << dendl;
5227 return true;
5228 }
5229 return false;
5230 }
5231
5232 void PG::set_last_peering_reset()
5233 {
5234 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5235 if (last_peering_reset != get_osdmap()->get_epoch()) {
5236 last_peering_reset = get_osdmap()->get_epoch();
5237 reset_interval_flush();
5238 }
5239 }
5240
5241 struct FlushState {
5242 PGRef pg;
5243 epoch_t epoch;
5244 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5245 ~FlushState() {
5246 pg->lock();
5247 if (!pg->pg_has_reset_since(epoch))
5248 pg->queue_flushed(epoch);
5249 pg->unlock();
5250 }
5251 };
5252 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5253
5254 void PG::start_flush(ObjectStore::Transaction *t,
5255 list<Context *> *on_applied,
5256 list<Context *> *on_safe)
5257 {
5258 // flush in progress ops
5259 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5260 this, get_osdmap()->get_epoch()));
5261 t->nop();
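// the nop keeps the transaction non-empty; once it has applied and committed
// on this PG's sequencer, everything queued before it has as well, and the
// last FlushState reference going away queues the flushed event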
5262 flushes_in_progress++;
5263 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5264 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5265 }
5266
5267 void PG::reset_interval_flush()
5268 {
5269 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5270 recovery_state.clear_blocked_outgoing();
5271
5272 Context *c = new QueuePeeringEvt<IntervalFlush>(
5273 this, get_osdmap()->get_epoch(), IntervalFlush());
5274 if (!osr->flush_commit(c)) {
5275 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5276 recovery_state.begin_block_outgoing();
5277 } else {
5278 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5279 delete c;
5280 }
5281 }
5282
5283 /* Called before initializing peering during advance_map */
5284 void PG::start_peering_interval(
5285 const OSDMapRef lastmap,
5286 const vector<int>& newup, int new_up_primary,
5287 const vector<int>& newacting, int new_acting_primary,
5288 ObjectStore::Transaction *t)
5289 {
5290 const OSDMapRef osdmap = get_osdmap();
5291
5292 set_last_peering_reset();
5293
5294 vector<int> oldacting, oldup;
5295 int oldrole = get_role();
5296
5297 unreg_next_scrub();
5298
5299 pg_shard_t old_acting_primary = get_primary();
5300 pg_shard_t old_up_primary = up_primary;
5301 bool was_old_primary = is_primary();
5302
5303 acting.swap(oldacting);
5304 up.swap(oldup);
5305 init_primary_up_acting(
5306 newup,
5307 newacting,
5308 new_up_primary,
5309 new_acting_primary);
5310
5311 if (info.stats.up != up ||
5312 info.stats.acting != acting ||
5313 info.stats.up_primary != new_up_primary ||
5314 info.stats.acting_primary != new_acting_primary) {
5315 info.stats.up = up;
5316 info.stats.up_primary = new_up_primary;
5317 info.stats.acting = acting;
5318 info.stats.acting_primary = new_acting_primary;
5319 info.stats.mapping_epoch = osdmap->get_epoch();
5320 }
5321
5322 pg_stats_publish_lock.Lock();
5323 pg_stats_publish_valid = false;
5324 pg_stats_publish_lock.Unlock();
5325
5326 // The PG will now be flagged as remapped during a backfill in cases
5327 // where it would not have been before.
5328 if (up != acting)
5329 state_set(PG_STATE_REMAPPED);
5330 else
5331 state_clear(PG_STATE_REMAPPED);
5332
5333 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5334 if (pool.info.is_replicated() || role == pg_whoami.shard)
5335 set_role(role);
5336 else
5337 set_role(-1);
5338
5339 // did acting, up, primary|acker change?
5340 if (!lastmap) {
5341 dout(10) << " no lastmap" << dendl;
5342 dirty_info = true;
5343 dirty_big_info = true;
5344 info.history.same_interval_since = osdmap->get_epoch();
5345 } else {
5346 std::stringstream debug;
5347 assert(info.history.same_interval_since != 0);
5348 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5349 get_is_recoverable_predicate());
5350 bool new_interval = PastIntervals::check_new_interval(
5351 old_acting_primary.osd,
5352 new_acting_primary,
5353 oldacting, newacting,
5354 old_up_primary.osd,
5355 new_up_primary,
5356 oldup, newup,
5357 info.history.same_interval_since,
5358 info.history.last_epoch_clean,
5359 osdmap,
5360 lastmap,
5361 info.pgid.pgid,
5362 recoverable.get(),
5363 &past_intervals,
5364 &debug);
5365 dout(10) << __func__ << ": check_new_interval output: "
5366 << debug.str() << dendl;
5367 if (new_interval) {
5368 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5369 info.history.last_epoch_clean < osdmap->get_epoch()) {
5370 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5371 // our information is incomplete and useless; if osdmaps have been trimmed,
5372 // someone else must have been clean after everything we know about.
5373 past_intervals.clear();
5374 } else {
5375 dout(10) << " noting past " << past_intervals << dendl;
5376 }
5377 dirty_info = true;
5378 dirty_big_info = true;
5379 info.history.same_interval_since = osdmap->get_epoch();
5380 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5381 osdmap->get_pg_num(info.pgid.pgid.pool()),
5382 nullptr)) {
5383 info.history.last_epoch_split = osdmap->get_epoch();
5384 }
5385 }
5386 }
5387
5388 if (old_up_primary != up_primary ||
5389 oldup != up) {
5390 info.history.same_up_since = osdmap->get_epoch();
5391 }
5392 // this comparison includes primary rank via pg_shard_t
5393 if (old_acting_primary != get_primary()) {
5394 info.history.same_primary_since = osdmap->get_epoch();
5395 }
5396
5397 on_new_interval();
5398
5399 dout(1) << __func__ << " up " << oldup << " -> " << up
5400 << ", acting " << oldacting << " -> " << acting
5401 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5402 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5403 << ", role " << oldrole << " -> " << role
5404 << ", features acting " << acting_features
5405 << " upacting " << upacting_features
5406 << dendl;
5407
5408 // deactivate.
5409 state_clear(PG_STATE_ACTIVE);
5410 state_clear(PG_STATE_PEERED);
5411 state_clear(PG_STATE_DOWN);
5412 state_clear(PG_STATE_RECOVERY_WAIT);
5413 state_clear(PG_STATE_RECOVERY_TOOFULL);
5414 state_clear(PG_STATE_RECOVERING);
5415
5416 peer_purged.clear();
5417 actingbackfill.clear();
5418 scrub_queued = false;
5419
5420 // reset primary state?
5421 if (was_old_primary || is_primary()) {
5422 osd->remove_want_pg_temp(info.pgid.pgid);
5423 }
5424 clear_primary_state();
5425
5426
5427 // pg->on_*
5428 on_change(t);
5429
5430 projected_last_update = eversion_t();
5431
5432 assert(!deleting);
5433
5434 // should we tell the primary we are here?
5435 send_notify = !is_primary();
5436
5437 if (role != oldrole ||
5438 was_old_primary != is_primary()) {
5439 // did primary change?
5440 if (was_old_primary != is_primary()) {
5441 state_clear(PG_STATE_CLEAN);
5442 clear_publish_stats();
5443 }
5444
5445 on_role_change();
5446
5447 // take active waiters
5448 requeue_ops(waiting_for_peered);
5449
5450 } else {
5451 // no role change.
5452 // did primary change?
5453 if (get_primary() != old_acting_primary) {
5454 dout(10) << *this << " " << oldacting << " -> " << acting
5455 << ", acting primary "
5456 << old_acting_primary << " -> " << get_primary()
5457 << dendl;
5458 } else {
5459 // primary is the same.
5460 if (is_primary()) {
5461 // i am (still) primary. but my replica set changed.
5462 state_clear(PG_STATE_CLEAN);
5463
5464 dout(10) << oldacting << " -> " << acting
5465 << ", replicas changed" << dendl;
5466 }
5467 }
5468 }
5469 cancel_recovery();
5470
5471 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5472 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5473 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5474 }
5475 }
5476
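// Per-interval (re)initialization: re-register the next scrub and recompute
// the feature intersections (acting_features over the acting set,
// upacting_features over both up and acting) before handing off to the
// backend-specific _on_new_interval().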
5477 void PG::on_new_interval()
5478 {
5479 const OSDMapRef osdmap = get_osdmap();
5480
5481 reg_next_scrub();
5482
5483 // initialize features
5484 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5485 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5486 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5487 if (*p == CRUSH_ITEM_NONE)
5488 continue;
5489 uint64_t f = osdmap->get_xinfo(*p).features;
5490 acting_features &= f;
5491 upacting_features &= f;
5492 }
5493 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5494 if (*p == CRUSH_ITEM_NONE)
5495 continue;
5496 upacting_features &= osdmap->get_xinfo(*p).features;
5497 }
5498
5499 assert(osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE));
5500
5501 _on_new_interval();
5502 }
5503
5504 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5505 {
5506 assert(!is_primary());
5507
5508 update_history(oinfo.history);
5509
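// Only adopt the primary's purged_snaps once our on-disk state has caught up
// with the most recent activation (last_complete_ondisk.epoch >=
// last_epoch_started); the debug block below cross-checks snap_mapper first.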
5510 if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
5511 // DEBUG: verify that the snaps are empty in snap_mapper
5512 if (cct->_conf->osd_debug_verify_snaps_on_info) {
5513 interval_set<snapid_t> p;
5514 p.union_of(oinfo.purged_snaps, info.purged_snaps);
5515 p.subtract(info.purged_snaps);
5516 if (!p.empty()) {
5517 for (interval_set<snapid_t>::iterator i = p.begin();
5518 i != p.end();
5519 ++i) {
5520 for (snapid_t snap = i.get_start();
5521 snap != i.get_len() + i.get_start();
5522 ++snap) {
5523 vector<hobject_t> hoids;
5524 int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
5525 if (r != 0 && r != -ENOENT) {
5526 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5527 << cpp_strerror(r) << dendl;
5528 ceph_abort();
5529 } else if (r != -ENOENT) {
5530 assert(!hoids.empty());
5531 derr << __func__ << ": snap_mapper get_next_objects_to_trim returned "
5532 << cpp_strerror(r) << " for object "
5533 << hoids[0] << " on snap " << snap
5534 << " which should have been fully trimmed " << dendl;
5535 ceph_abort();
5536 }
5537 }
5538 }
5539 }
5540 }
5541 info.purged_snaps = oinfo.purged_snaps;
5542 dirty_info = true;
5543 dirty_big_info = true;
5544 }
5545 }
5546
5547 ostream& operator<<(ostream& out, const PG& pg)
5548 {
5549 out << "pg[" << pg.info
5550 << " " << pg.up;
5551 if (pg.acting != pg.up)
5552 out << "/" << pg.acting;
5553 out << " r=" << pg.get_role();
5554 out << " lpr=" << pg.get_last_peering_reset();
5555
5556 if (!pg.past_intervals.empty()) {
5557 out << " pi=[" << pg.past_intervals.get_bounds()
5558 << ")/" << pg.past_intervals.size();
5559 }
5560
5561 if (pg.is_peered()) {
5562 if (pg.last_update_ondisk != pg.info.last_update)
5563 out << " luod=" << pg.last_update_ondisk;
5564 if (pg.last_update_applied != pg.info.last_update)
5565 out << " lua=" << pg.last_update_applied;
5566 }
5567
5568 if (pg.recovery_ops_active)
5569 out << " rops=" << pg.recovery_ops_active;
5570
5571 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5572 pg.pg_log.get_head() != pg.info.last_update)
5573 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5574
5575 if (!pg.pg_log.get_log().empty()) {
5576 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5577 out << " (log bound mismatch, actual=["
5578 << pg.pg_log.get_log().log.begin()->version << ","
5579 << pg.pg_log.get_log().log.rbegin()->version << "]";
5580 out << ")";
5581 }
5582 }
5583
5584 if (!pg.backfill_targets.empty())
5585 out << " bft=" << pg.backfill_targets;
5586 out << " crt=" << pg.pg_log.get_can_rollback_to();
5587
5588 if (pg.last_complete_ondisk != pg.info.last_complete)
5589 out << " lcod " << pg.last_complete_ondisk;
5590
5591 if (pg.is_primary()) {
5592 out << " mlcod " << pg.min_last_complete_ondisk;
5593 }
5594
5595 out << " " << pg_state_string(pg.get_state());
5596 if (pg.should_send_notify())
5597 out << " NOTIFY";
5598
5599 if (pg.scrubber.must_repair)
5600 out << " MUST_REPAIR";
5601 if (pg.scrubber.auto_repair)
5602 out << " AUTO_REPAIR";
5603 if (pg.scrubber.must_deep_scrub)
5604 out << " MUST_DEEP_SCRUB";
5605 if (pg.scrubber.must_scrub)
5606 out << " MUST_SCRUB";
5607
5608 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5609 if (pg.pg_log.get_missing().num_missing()) {
5610 out << " m=" << pg.pg_log.get_missing().num_missing();
5611 if (pg.is_primary()) {
5612 uint64_t unfound = pg.get_num_unfound();
5613 if (unfound)
5614 out << " u=" << unfound;
5615 }
5616 }
5617 if (pg.snap_trimq.size())
5618 out << " snaptrimq=" << pg.snap_trimq;
5619
5620 out << "]";
5621
5622
5623 return out;
5624 }
5625
5626 bool PG::can_discard_op(OpRequestRef& op)
5627 {
5628 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5629 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5630 dout(20) << " discard " << *m << dendl;
5631 return true;
5632 }
5633
5634 if (m->get_map_epoch() < info.history.same_primary_since) {
5635 dout(7) << " changed after " << m->get_map_epoch()
5636 << ", dropping " << *m << dendl;
5637 return true;
5638 }
5639
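// Clients with RESEND_ON_SPLIT resend their ops after a pg split or a
// pool-forced resend, so ops predating last_force_op_resend or
// last_epoch_split can be dropped here; older POOLRESEND-only clients are
// checked against the pre-luminous force-resend epoch instead.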
5640 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5641 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5642 dout(7) << __func__ << " sent before last_force_op_resend "
5643 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5644 return true;
5645 }
5646 if (m->get_map_epoch() < info.history.last_epoch_split) {
5647 dout(7) << __func__ << " pg split in "
5648 << info.history.last_epoch_split << ", dropping" << dendl;
5649 return true;
5650 }
5651 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5652 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5653 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5654 << pool.info.last_force_op_resend_preluminous
5655 << ", dropping" << *m << dendl;
5656 return true;
5657 }
5658 }
5659
5660 return false;
5661 }
5662
5663 template<typename T, int MSGTYPE>
5664 bool PG::can_discard_replica_op(OpRequestRef& op)
5665 {
5666 const T *m = static_cast<const T *>(op->get_req());
5667 assert(m->get_type() == MSGTYPE);
5668
5669 /* Mostly, this overlaps with the old_peering_msg
5670 * condition. An important exception is pushes
5671 * sent by replicas not in the acting set, since
5672 * if such a replica goes down it does not cause
5673 * a new interval. */
5674 int from = m->get_source().num();
5675 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5676 return true;
5677
5678 // same pg?
5679 // if pg changes _at all_, we reset and repeer!
5680 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5681 dout(10) << "can_discard_replica_op pg changed " << info.history
5682 << " after " << m->map_epoch
5683 << ", dropping" << dendl;
5684 return true;
5685 }
5686 return false;
5687 }
5688
5689 bool PG::can_discard_scan(OpRequestRef op)
5690 {
5691 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5692 assert(m->get_type() == MSG_OSD_PG_SCAN);
5693
5694 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5695 dout(10) << " got old scan, ignoring" << dendl;
5696 return true;
5697 }
5698 return false;
5699 }
5700
5701 bool PG::can_discard_backfill(OpRequestRef op)
5702 {
5703 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5704 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5705
5706 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5707 dout(10) << " got old backfill, ignoring" << dendl;
5708 return true;
5709 }
5710
5711 return false;
5712
5713 }
5714
5715 bool PG::can_discard_request(OpRequestRef& op)
5716 {
5717 switch (op->get_req()->get_type()) {
5718 case CEPH_MSG_OSD_OP:
5719 return can_discard_op(op);
5720 case CEPH_MSG_OSD_BACKOFF:
5721 return false; // never discard
5722 case MSG_OSD_SUBOP:
5723 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5724 case MSG_OSD_REPOP:
5725 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5726 case MSG_OSD_PG_PUSH:
5727 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5728 case MSG_OSD_PG_PULL:
5729 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5730 case MSG_OSD_PG_PUSH_REPLY:
5731 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5732 case MSG_OSD_SUBOPREPLY:
5733 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5734 case MSG_OSD_REPOPREPLY:
5735 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5736 case MSG_OSD_PG_RECOVERY_DELETE:
5737 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
5738
5739 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
5740 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
5741
5742 case MSG_OSD_EC_WRITE:
5743 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5744 case MSG_OSD_EC_WRITE_REPLY:
5745 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5746 case MSG_OSD_EC_READ:
5747 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5748 case MSG_OSD_EC_READ_REPLY:
5749 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5750 case MSG_OSD_REP_SCRUB:
5751 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5752 case MSG_OSD_SCRUB_RESERVE:
5753 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5754 case MSG_OSD_REP_SCRUBMAP:
5755 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5756 case MSG_OSD_PG_UPDATE_LOG_MISSING:
5757 return can_discard_replica_op<
5758 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5759 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5760 return can_discard_replica_op<
5761 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5762
5763 case MSG_OSD_PG_SCAN:
5764 return can_discard_scan(op);
5765 case MSG_OSD_PG_BACKFILL:
5766 return can_discard_backfill(op);
5767 case MSG_OSD_PG_BACKFILL_REMOVE:
5768 return can_discard_replica_op<MOSDPGBackfillRemove,
5769 MSG_OSD_PG_BACKFILL_REMOVE>(op);
5770 }
5771 return true;
5772 }
5773
5774 void PG::take_waiters()
5775 {
5776 dout(10) << "take_waiters" << dendl;
5777 requeue_map_waiters();
5778 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5779 i != peering_waiters.end();
5780 ++i) osd->queue_for_peering(this);
5781 peering_queue.splice(peering_queue.begin(), peering_waiters,
5782 peering_waiters.begin(), peering_waiters.end());
5783 }
5784
5785 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5786 {
5787 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5788 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5789 dout(10) << "deferring event " << evt->get_desc() << dendl;
5790 peering_waiters.push_back(evt);
5791 return;
5792 }
5793 if (old_peering_evt(evt))
5794 return;
5795 recovery_state.handle_event(evt, rctx);
5796 }
5797
5798 void PG::queue_peering_event(CephPeeringEvtRef evt)
5799 {
5800 if (old_peering_evt(evt))
5801 return;
5802 peering_queue.push_back(evt);
5803 osd->queue_for_peering(this);
5804 }
5805
5806 void PG::queue_null(epoch_t msg_epoch,
5807 epoch_t query_epoch)
5808 {
5809 dout(10) << "null" << dendl;
5810 queue_peering_event(
5811 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5812 NullEvt())));
5813 }
5814
5815 void PG::queue_flushed(epoch_t e)
5816 {
5817 dout(10) << "flushed" << dendl;
5818 queue_peering_event(
5819 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5820 FlushedEvt())));
5821 }
5822
5823 void PG::queue_query(epoch_t msg_epoch,
5824 epoch_t query_epoch,
5825 pg_shard_t from, const pg_query_t& q)
5826 {
5827 dout(10) << "handle_query " << q << " from replica " << from << dendl;
5828 queue_peering_event(
5829 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5830 MQuery(from, q, query_epoch))));
5831 }
5832
5833 void PG::handle_advance_map(
5834 OSDMapRef osdmap, OSDMapRef lastmap,
5835 vector<int>& newup, int up_primary,
5836 vector<int>& newacting, int acting_primary,
5837 RecoveryCtx *rctx)
5838 {
5839 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5840 assert(lastmap == osdmap_ref);
5841 dout(10) << "handle_advance_map "
5842 << newup << "/" << newacting
5843 << " -- " << up_primary << "/" << acting_primary
5844 << dendl;
5845 update_osdmap_ref(osdmap);
5846 pool.update(osdmap);
5847 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5848 if (cct->_conf->osd_debug_verify_cached_snaps) {
5849 interval_set<snapid_t> actual_removed_snaps;
5850 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5851 assert(pi);
5852 pi->build_removed_snaps(actual_removed_snaps);
5853 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5854 derr << __func__ << ": mismatch between the actual removed snaps "
5855 << actual_removed_snaps
5856 << " and pool.cached_removed_snaps " << pool.cached_removed_snaps
5857 << dendl;
5858 }
5859 assert(actual_removed_snaps == pool.cached_removed_snaps);
5860 }
5861 AdvMap evt(
5862 osdmap, lastmap, newup, up_primary,
5863 newacting, acting_primary);
5864 recovery_state.handle_event(evt, rctx);
5865 if (pool.info.last_change == osdmap_ref->get_epoch()) {
5866 on_pool_change();
5867 update_store_with_options();
5868 }
5869 }
5870
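// Called once all map advances in a batch have been processed: deliver ActMap
// to the state machine, dirty our info if the last persisted osdmap epoch has
// fallen more than osd_pg_epoch_persisted_max_stale epochs behind, and recheck
// watchers against any new blacklist entries.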
5871 void PG::handle_activate_map(RecoveryCtx *rctx)
5872 {
5873 dout(10) << "handle_activate_map " << dendl;
5874 ActMap evt;
5875 recovery_state.handle_event(evt, rctx);
5876 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
5877 cct->_conf->osd_pg_epoch_persisted_max_stale) {
5878 dout(20) << __func__ << ": Dirtying info: last_persisted is "
5879 << last_persisted_osdmap_ref->get_epoch()
5880 << " while current is " << osdmap_ref->get_epoch() << dendl;
5881 dirty_info = true;
5882 } else {
5883 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
5884 << last_persisted_osdmap_ref->get_epoch()
5885 << " while current is " << osdmap_ref->get_epoch() << dendl;
5886 }
5887 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
5888 }
5889
5890 void PG::handle_loaded(RecoveryCtx *rctx)
5891 {
5892 dout(10) << "handle_loaded" << dendl;
5893 Load evt;
5894 recovery_state.handle_event(evt, rctx);
5895 }
5896
5897 void PG::handle_create(RecoveryCtx *rctx)
5898 {
5899 dout(10) << "handle_create" << dendl;
5900 rctx->created_pgs.insert(this);
5901 Initialize evt;
5902 recovery_state.handle_event(evt, rctx);
5903 ActMap evt2;
5904 recovery_state.handle_event(evt2, rctx);
5905 }
5906
5907 void PG::handle_query_state(Formatter *f)
5908 {
5909 dout(10) << "handle_query_state" << dendl;
5910 QueryState q(f);
5911 recovery_state.handle_event(q, 0);
5912 }
5913
5914 void PG::update_store_with_options()
5915 {
5916 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
5917 if (r < 0 && r != -EOPNOTSUPP) {
5918 derr << __func__ << " set_collection_opts returned error: " << r << dendl;
5919 }
5920 }
5921
5922 void PG::update_store_on_load()
5923 {
5924 if (osd->store->get_type() == "filestore") {
5925 // legacy filestore didn't store collection bit width; fix.
5926 int bits = osd->store->collection_bits(coll);
5927 if (bits < 0) {
5928 if (coll.is_meta())
5929 bits = 0;
5930 else
5931 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
5932 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
5933 ObjectStore::Transaction t;
5934 t.collection_set_bits(coll, bits);
5935 osd->store->apply_transaction(osr.get(), std::move(t));
5936 }
5937 }
5938 }
5939
5940 /*------------ Recovery State Machine----------------*/
5941 #undef dout_prefix
5942 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5943 << "state<" << get_state_name() << ">: ")
5944
5945 /*------Crashed-------*/
5946 PG::RecoveryState::Crashed::Crashed(my_context ctx)
5947 : my_base(ctx),
5948 NamedState(context< RecoveryMachine >().pg, "Crashed")
5949 {
5950 context< RecoveryMachine >().log_enter(state_name);
5951 assert(0 == "we got a bad state machine event");
5952 }
5953
5954
5955 /*------Initial-------*/
5956 PG::RecoveryState::Initial::Initial(my_context ctx)
5957 : my_base(ctx),
5958 NamedState(context< RecoveryMachine >().pg, "Initial")
5959 {
5960 context< RecoveryMachine >().log_enter(state_name);
5961 }
5962
5963 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
5964 {
5965 PG *pg = context< RecoveryMachine >().pg;
5966
5967 // do we tell someone we're here?
5968 pg->send_notify = (!pg->is_primary());
5969 pg->update_store_with_options();
5970
5971 pg->update_store_on_load();
5972
5973 return transit< Reset >();
5974 }
5975
5976 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
5977 {
5978 PG *pg = context< RecoveryMachine >().pg;
5979 pg->proc_replica_info(
5980 notify.from, notify.notify.info, notify.notify.epoch_sent);
5981 pg->set_last_peering_reset();
5982 return transit< Primary >();
5983 }
5984
5985 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
5986 {
5987 PG *pg = context< RecoveryMachine >().pg;
5988 assert(!pg->is_primary());
5989 post_event(i);
5990 return transit< Stray >();
5991 }
5992
5993 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
5994 {
5995 PG *pg = context< RecoveryMachine >().pg;
5996 assert(!pg->is_primary());
5997 post_event(i);
5998 return transit< Stray >();
5999 }
6000
6001 void PG::RecoveryState::Initial::exit()
6002 {
6003 context< RecoveryMachine >().log_exit(state_name, enter_time);
6004 PG *pg = context< RecoveryMachine >().pg;
6005 utime_t dur = ceph_clock_now() - enter_time;
6006 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6007 }
6008
6009 /*------Started-------*/
6010 PG::RecoveryState::Started::Started(my_context ctx)
6011 : my_base(ctx),
6012 NamedState(context< RecoveryMachine >().pg, "Started")
6013 {
6014 context< RecoveryMachine >().log_enter(state_name);
6015 }
6016
6017 boost::statechart::result
6018 PG::RecoveryState::Started::react(const IntervalFlush&)
6019 {
6020 PG *pg = context< RecoveryMachine >().pg;
6021 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6022 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6023 return discard_event();
6024 }
6025
6026
6027 boost::statechart::result
6028 PG::RecoveryState::Started::react(const FlushedEvt&)
6029 {
6030 PG *pg = context< RecoveryMachine >().pg;
6031 pg->on_flushed();
6032 return discard_event();
6033 }
6034
6035
6036 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6037 {
6038 PG *pg = context< RecoveryMachine >().pg;
6039 ldout(pg->cct, 10) << "Started advmap" << dendl;
6040 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6041 if (pg->should_restart_peering(
6042 advmap.up_primary,
6043 advmap.acting_primary,
6044 advmap.newup,
6045 advmap.newacting,
6046 advmap.lastmap,
6047 advmap.osdmap)) {
6048 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6049 << dendl;
6050 post_event(advmap);
6051 return transit< Reset >();
6052 }
6053 pg->remove_down_peer_info(advmap.osdmap);
6054 return discard_event();
6055 }
6056
6057 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6058 {
6059 q.f->open_object_section("state");
6060 q.f->dump_string("name", state_name);
6061 q.f->dump_stream("enter_time") << enter_time;
6062 q.f->close_section();
6063 return discard_event();
6064 }
6065
6066 void PG::RecoveryState::Started::exit()
6067 {
6068 context< RecoveryMachine >().log_exit(state_name, enter_time);
6069 PG *pg = context< RecoveryMachine >().pg;
6070 utime_t dur = ceph_clock_now() - enter_time;
6071 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6072 }
6073
6074 /*--------Reset---------*/
6075 PG::RecoveryState::Reset::Reset(my_context ctx)
6076 : my_base(ctx),
6077 NamedState(context< RecoveryMachine >().pg, "Reset")
6078 {
6079 context< RecoveryMachine >().log_enter(state_name);
6080 PG *pg = context< RecoveryMachine >().pg;
6081
6082 pg->flushes_in_progress = 0;
6083 pg->set_last_peering_reset();
6084 }
6085
6086 boost::statechart::result
6087 PG::RecoveryState::Reset::react(const FlushedEvt&)
6088 {
6089 PG *pg = context< RecoveryMachine >().pg;
6090 pg->on_flushed();
6091 return discard_event();
6092 }
6093
6094 boost::statechart::result
6095 PG::RecoveryState::Reset::react(const IntervalFlush&)
6096 {
6097 PG *pg = context< RecoveryMachine >().pg;
6098 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6099 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6100 return discard_event();
6101 }
6102
6103 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6104 {
6105 PG *pg = context< RecoveryMachine >().pg;
6106 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6107
6108 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6109
6110 if (pg->should_restart_peering(
6111 advmap.up_primary,
6112 advmap.acting_primary,
6113 advmap.newup,
6114 advmap.newacting,
6115 advmap.lastmap,
6116 advmap.osdmap)) {
6117 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6118 << dendl;
6119 pg->start_peering_interval(
6120 advmap.lastmap,
6121 advmap.newup, advmap.up_primary,
6122 advmap.newacting, advmap.acting_primary,
6123 context< RecoveryMachine >().get_cur_transaction());
6124 }
6125 pg->remove_down_peer_info(advmap.osdmap);
6126 pg->check_past_interval_bounds();
6127 return discard_event();
6128 }
6129
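// ActMap while in Reset: send our pg_notify to the current primary (when
// send_notify is set and a primary exists), refresh heartbeat peers, requeue
// any waiters, and move on to Started.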
6130 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6131 {
6132 PG *pg = context< RecoveryMachine >().pg;
6133 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6134 context< RecoveryMachine >().send_notify(
6135 pg->get_primary(),
6136 pg_notify_t(
6137 pg->get_primary().shard, pg->pg_whoami.shard,
6138 pg->get_osdmap()->get_epoch(),
6139 pg->get_osdmap()->get_epoch(),
6140 pg->info),
6141 pg->past_intervals);
6142 }
6143
6144 pg->update_heartbeat_peers();
6145 pg->take_waiters();
6146
6147 return transit< Started >();
6148 }
6149
6150 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6151 {
6152 q.f->open_object_section("state");
6153 q.f->dump_string("name", state_name);
6154 q.f->dump_stream("enter_time") << enter_time;
6155 q.f->close_section();
6156 return discard_event();
6157 }
6158
6159 void PG::RecoveryState::Reset::exit()
6160 {
6161 context< RecoveryMachine >().log_exit(state_name, enter_time);
6162 PG *pg = context< RecoveryMachine >().pg;
6163 utime_t dur = ceph_clock_now() - enter_time;
6164 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6165 }
6166
6167 /*-------Start---------*/
6168 PG::RecoveryState::Start::Start(my_context ctx)
6169 : my_base(ctx),
6170 NamedState(context< RecoveryMachine >().pg, "Start")
6171 {
6172 context< RecoveryMachine >().log_enter(state_name);
6173
6174 PG *pg = context< RecoveryMachine >().pg;
6175 if (pg->is_primary()) {
6176 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6177 post_event(MakePrimary());
6178 } else { //is_stray
6179 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6180 post_event(MakeStray());
6181 }
6182 }
6183
6184 void PG::RecoveryState::Start::exit()
6185 {
6186 context< RecoveryMachine >().log_exit(state_name, enter_time);
6187 PG *pg = context< RecoveryMachine >().pg;
6188 utime_t dur = ceph_clock_now() - enter_time;
6189 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6190 }
6191
6192 /*---------Primary--------*/
6193 PG::RecoveryState::Primary::Primary(my_context ctx)
6194 : my_base(ctx),
6195 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6196 {
6197 context< RecoveryMachine >().log_enter(state_name);
6198 PG *pg = context< RecoveryMachine >().pg;
6199 assert(pg->want_acting.empty());
6200
6201 // set CREATING bit until we have peered for the first time.
6202 if (pg->info.history.last_epoch_started == 0) {
6203 pg->state_set(PG_STATE_CREATING);
6204 // use the history timestamp, which ultimately comes from the
6205 // monitor in the create case.
6206 utime_t t = pg->info.history.last_scrub_stamp;
6207 pg->info.stats.last_fresh = t;
6208 pg->info.stats.last_active = t;
6209 pg->info.stats.last_change = t;
6210 pg->info.stats.last_peered = t;
6211 pg->info.stats.last_clean = t;
6212 pg->info.stats.last_unstale = t;
6213 pg->info.stats.last_undegraded = t;
6214 pg->info.stats.last_fullsized = t;
6215 pg->info.stats.last_scrub_stamp = t;
6216 pg->info.stats.last_deep_scrub_stamp = t;
6217 pg->info.stats.last_clean_scrub_stamp = t;
6218 }
6219 }
6220
6221 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6222 {
6223 PG *pg = context< RecoveryMachine >().pg;
6224 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6225 pg->proc_replica_info(
6226 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6227 return discard_event();
6228 }
6229
6230 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6231 {
6232 PG *pg = context< RecoveryMachine >().pg;
6233 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6234 pg->publish_stats_to_osd();
6235 pg->take_waiters();
6236 return discard_event();
6237 }
6238
6239 void PG::RecoveryState::Primary::exit()
6240 {
6241 context< RecoveryMachine >().log_exit(state_name, enter_time);
6242 PG *pg = context< RecoveryMachine >().pg;
6243 pg->want_acting.clear();
6244 utime_t dur = ceph_clock_now() - enter_time;
6245 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6246 pg->clear_primary_state();
6247 pg->state_clear(PG_STATE_CREATING);
6248 }
6249
6250 /*---------Peering--------*/
6251 PG::RecoveryState::Peering::Peering(my_context ctx)
6252 : my_base(ctx),
6253 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6254 history_les_bound(false)
6255 {
6256 context< RecoveryMachine >().log_enter(state_name);
6257
6258 PG *pg = context< RecoveryMachine >().pg;
6259 assert(!pg->is_peered());
6260 assert(!pg->is_peering());
6261 assert(pg->is_primary());
6262 pg->state_set(PG_STATE_PEERING);
6263 }
6264
6265 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6266 {
6267 PG *pg = context< RecoveryMachine >().pg;
6268 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6269 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6270 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6271 post_event(advmap);
6272 return transit< Reset >();
6273 }
6274
6275 pg->adjust_need_up_thru(advmap.osdmap);
6276
6277 return forward_event();
6278 }
6279
6280 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6281 {
6282 PG *pg = context< RecoveryMachine >().pg;
6283
6284 q.f->open_object_section("state");
6285 q.f->dump_string("name", state_name);
6286 q.f->dump_stream("enter_time") << enter_time;
6287
6288 q.f->open_array_section("past_intervals");
6289 pg->past_intervals.dump(q.f);
6290 q.f->close_section();
6291
6292 q.f->open_array_section("probing_osds");
6293 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6294 p != prior_set.probe.end();
6295 ++p)
6296 q.f->dump_stream("osd") << *p;
6297 q.f->close_section();
6298
6299 if (prior_set.pg_down)
6300 q.f->dump_string("blocked", "peering is blocked due to down osds");
6301
6302 q.f->open_array_section("down_osds_we_would_probe");
6303 for (set<int>::iterator p = prior_set.down.begin();
6304 p != prior_set.down.end();
6305 ++p)
6306 q.f->dump_int("osd", *p);
6307 q.f->close_section();
6308
6309 q.f->open_array_section("peering_blocked_by");
6310 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6311 p != prior_set.blocked_by.end();
6312 ++p) {
6313 q.f->open_object_section("osd");
6314 q.f->dump_int("osd", p->first);
6315 q.f->dump_int("current_lost_at", p->second);
6316 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6317 q.f->close_section();
6318 }
6319 q.f->close_section();
6320
6321 if (history_les_bound) {
6322 q.f->open_array_section("peering_blocked_by_detail");
6323 q.f->open_object_section("item");
6324 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6325 q.f->close_section();
6326 q.f->close_section();
6327 }
6328
6329 q.f->close_section();
6330 return forward_event();
6331 }
6332
6333 void PG::RecoveryState::Peering::exit()
6334 {
6335 PG *pg = context< RecoveryMachine >().pg;
6336 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6337 context< RecoveryMachine >().log_exit(state_name, enter_time);
6338 pg->state_clear(PG_STATE_PEERING);
6339 pg->clear_probe_targets();
6340
6341 utime_t dur = ceph_clock_now() - enter_time;
6342 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6343 }
6344
6345
6346 /*------Backfilling-------*/
6347 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6348 : my_base(ctx),
6349 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6350 {
6351 context< RecoveryMachine >().log_enter(state_name);
6352 PG *pg = context< RecoveryMachine >().pg;
6353 pg->backfill_reserved = true;
6354 pg->queue_recovery();
6355 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6356 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6357 pg->state_set(PG_STATE_BACKFILL);
6358 pg->publish_stats_to_osd();
6359 }
6360
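// Backfill is being cancelled: drop the local reservation, tell every backfill
// target to release its remote reservation (REJECT), and schedule a retry
// before falling back to NotBackfilling.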
6361 boost::statechart::result
6362 PG::RecoveryState::Backfilling::react(const CancelBackfill &)
6363 {
6364 PG *pg = context< RecoveryMachine >().pg;
6365 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6366 // XXX: Add a new pg state so user can see why backfill isn't proceeding
6367 // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
6368 //pg->state_set(PG_STATE_BACKFILL_STALLED????);
6369
6370 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6371 it != pg->backfill_targets.end();
6372 ++it) {
6373 assert(*it != pg->pg_whoami);
6374 ConnectionRef con = pg->osd->get_con_osd_cluster(
6375 it->osd, pg->get_osdmap()->get_epoch());
6376 if (con) {
6377 pg->osd->send_message_osd_cluster(
6378 new MBackfillReserve(
6379 MBackfillReserve::REJECT,
6380 spg_t(pg->info.pgid.pgid, it->shard),
6381 pg->get_osdmap()->get_epoch()),
6382 con.get());
6383 }
6384 }
6385
6386 pg->waiting_on_backfill.clear();
6387
6388 pg->schedule_backfill_full_retry();
6389 return transit<NotBackfilling>();
6390 }
6391
6392 boost::statechart::result
6393 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6394 {
6395 PG *pg = context< RecoveryMachine >().pg;
6396 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6397 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6398
6399 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6400 it != pg->backfill_targets.end();
6401 ++it) {
6402 assert(*it != pg->pg_whoami);
6403 ConnectionRef con = pg->osd->get_con_osd_cluster(
6404 it->osd, pg->get_osdmap()->get_epoch());
6405 if (con) {
6406 pg->osd->send_message_osd_cluster(
6407 new MBackfillReserve(
6408 MBackfillReserve::REJECT,
6409 spg_t(pg->info.pgid.pgid, it->shard),
6410 pg->get_osdmap()->get_epoch()),
6411 con.get());
6412 }
6413 }
6414
6415 pg->waiting_on_backfill.clear();
6416 pg->finish_recovery_op(hobject_t::get_max());
6417
6418 pg->schedule_backfill_full_retry();
6419 return transit<NotBackfilling>();
6420 }
6421
6422 void PG::RecoveryState::Backfilling::exit()
6423 {
6424 context< RecoveryMachine >().log_exit(state_name, enter_time);
6425 PG *pg = context< RecoveryMachine >().pg;
6426 pg->backfill_reserved = false;
6427 pg->backfill_reserving = false;
6428 pg->state_clear(PG_STATE_BACKFILL);
6429 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6430 utime_t dur = ceph_clock_now() - enter_time;
6431 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6432 }
6433
6434 /*--WaitRemoteBackfillReserved--*/
6435
6436 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6437 : my_base(ctx),
6438 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6439 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6440 {
6441 context< RecoveryMachine >().log_enter(state_name);
6442 PG *pg = context< RecoveryMachine >().pg;
6443 pg->state_set(PG_STATE_BACKFILL_WAIT);
6444 pg->publish_stats_to_osd();
6445 post_event(RemoteBackfillReserved());
6446 }
6447
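// Remote backfill reservations are acquired one target at a time: each
// RemoteBackfillReserved grant sends a REQUEST to the next shard, and once the
// iterator is exhausted AllBackfillsReserved is posted.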
6448 boost::statechart::result
6449 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6450 {
6451 PG *pg = context< RecoveryMachine >().pg;
6452
6453 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6454 //The primary never backfills itself
6455 assert(*backfill_osd_it != pg->pg_whoami);
6456 ConnectionRef con = pg->osd->get_con_osd_cluster(
6457 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6458 if (con) {
6459 pg->osd->send_message_osd_cluster(
6460 new MBackfillReserve(
6461 MBackfillReserve::REQUEST,
6462 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6463 pg->get_osdmap()->get_epoch(),
6464 pg->get_backfill_priority()),
6465 con.get());
6466 }
6467 ++backfill_osd_it;
6468 } else {
6469 post_event(AllBackfillsReserved());
6470 }
6471 return discard_event();
6472 }
6473
6474 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6475 {
6476 context< RecoveryMachine >().log_exit(state_name, enter_time);
6477 PG *pg = context< RecoveryMachine >().pg;
6478 utime_t dur = ceph_clock_now() - enter_time;
6479 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6480 }
6481
6482 boost::statechart::result
6483 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6484 {
6485 PG *pg = context< RecoveryMachine >().pg;
6486 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6487
6488 // Send REJECT to all previously acquired reservations
6489 set<pg_shard_t>::const_iterator it, begin, end, next;
6490 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6491 end = context< Active >().remote_shards_to_reserve_backfill.end();
6492 assert(begin != end);
6493 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6494 //The primary never backfills itself
6495 assert(*it != pg->pg_whoami);
6496 ConnectionRef con = pg->osd->get_con_osd_cluster(
6497 it->osd, pg->get_osdmap()->get_epoch());
6498 if (con) {
6499 pg->osd->send_message_osd_cluster(
6500 new MBackfillReserve(
6501 MBackfillReserve::REJECT,
6502 spg_t(pg->info.pgid.pgid, it->shard),
6503 pg->get_osdmap()->get_epoch()),
6504 con.get());
6505 }
6506 }
6507
6508 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6509 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6510 pg->publish_stats_to_osd();
6511
6512 pg->schedule_backfill_full_retry();
6513
6514 return transit<NotBackfilling>();
6515 }
6516
6517 /*--WaitLocalBackfillReserved--*/
6518 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6519 : my_base(ctx),
6520 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6521 {
6522 context< RecoveryMachine >().log_enter(state_name);
6523 PG *pg = context< RecoveryMachine >().pg;
6524 pg->state_set(PG_STATE_BACKFILL_WAIT);
6525 pg->osd->local_reserver.request_reservation(
6526 pg->info.pgid,
6527 new QueuePeeringEvt<LocalBackfillReserved>(
6528 pg, pg->get_osdmap()->get_epoch(),
6529 LocalBackfillReserved()),
6530 pg->get_backfill_priority());
6531 pg->publish_stats_to_osd();
6532 }
6533
6534 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6535 {
6536 context< RecoveryMachine >().log_exit(state_name, enter_time);
6537 PG *pg = context< RecoveryMachine >().pg;
6538 utime_t dur = ceph_clock_now() - enter_time;
6539 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6540 }
6541
6542 /*----NotBackfilling------*/
6543 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6544 : my_base(ctx),
6545 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6546 {
6547 context< RecoveryMachine >().log_enter(state_name);
6548 PG *pg = context< RecoveryMachine >().pg;
6549 pg->publish_stats_to_osd();
6550 }
6551
6552 boost::statechart::result
6553 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6554 {
6555 return discard_event();
6556 }
6557
6558 boost::statechart::result
6559 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6560 {
6561 return discard_event();
6562 }
6563
6564 void PG::RecoveryState::NotBackfilling::exit()
6565 {
6566 context< RecoveryMachine >().log_exit(state_name, enter_time);
6567 PG *pg = context< RecoveryMachine >().pg;
6568 utime_t dur = ceph_clock_now() - enter_time;
6569 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6570 }
6571
6572 /*----NotRecovering------*/
6573 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6574 : my_base(ctx),
6575 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6576 {
6577 context< RecoveryMachine >().log_enter(state_name);
6578 PG *pg = context< RecoveryMachine >().pg;
6579 pg->publish_stats_to_osd();
6580 }
6581
6582 void PG::RecoveryState::NotRecovering::exit()
6583 {
6584 context< RecoveryMachine >().log_exit(state_name, enter_time);
6585 PG *pg = context< RecoveryMachine >().pg;
6586 utime_t dur = ceph_clock_now() - enter_time;
6587 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6588 }
6589
6590 /*---RepNotRecovering----*/
6591 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6592 : my_base(ctx),
6593 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6594 {
6595 context< RecoveryMachine >().log_enter(state_name);
6596 }
6597
6598 void PG::RecoveryState::RepNotRecovering::exit()
6599 {
6600 context< RecoveryMachine >().log_exit(state_name, enter_time);
6601 PG *pg = context< RecoveryMachine >().pg;
6602 utime_t dur = ceph_clock_now() - enter_time;
6603 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6604 }
6605
6606 /*---RepWaitRecoveryReserved--*/
6607 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6608 : my_base(ctx),
6609 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6610 {
6611 context< RecoveryMachine >().log_enter(state_name);
6612 PG *pg = context< RecoveryMachine >().pg;
6613
6614 pg->osd->remote_reserver.request_reservation(
6615 pg->info.pgid,
6616 new QueuePeeringEvt<RemoteRecoveryReserved>(
6617 pg, pg->get_osdmap()->get_epoch(),
6618 RemoteRecoveryReserved()),
6619 pg->get_recovery_priority());
6620 }
6621
6622 boost::statechart::result
6623 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6624 {
6625 PG *pg = context< RecoveryMachine >().pg;
6626 pg->osd->send_message_osd_cluster(
6627 pg->primary.osd,
6628 new MRecoveryReserve(
6629 MRecoveryReserve::GRANT,
6630 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6631 pg->get_osdmap()->get_epoch()),
6632 pg->get_osdmap()->get_epoch());
6633 return transit<RepRecovering>();
6634 }
6635
6636 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6637 {
6638 context< RecoveryMachine >().log_exit(state_name, enter_time);
6639 PG *pg = context< RecoveryMachine >().pg;
6640 utime_t dur = ceph_clock_now() - enter_time;
6641 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6642 }
6643
6644 /*-RepWaitBackfillReserved*/
6645 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6646 : my_base(ctx),
6647 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6648 {
6649 context< RecoveryMachine >().log_enter(state_name);
6650 }
6651
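// Backfill reservation request from the primary: reject immediately if failure
// injection fires or this OSD is too full for backfill, otherwise queue a
// remote reservation; either way we move to RepWaitBackfillReserved, which
// handles the granted or rejected reservation.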
6652 boost::statechart::result
6653 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6654 {
6655 PG *pg = context< RecoveryMachine >().pg;
6656 ostringstream ss;
6657
6658 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6659 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6660 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6661 << dendl;
6662 post_event(RemoteReservationRejected());
6663 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6664 pg->osd->check_backfill_full(ss)) {
6665 ldout(pg->cct, 10) << "backfill reservation rejected: "
6666 << ss.str() << dendl;
6667 post_event(RemoteReservationRejected());
6668 } else {
6669 pg->osd->remote_reserver.request_reservation(
6670 pg->info.pgid,
6671 new QueuePeeringEvt<RemoteBackfillReserved>(
6672 pg, pg->get_osdmap()->get_epoch(),
6673 RemoteBackfillReserved()), evt.priority);
6674 }
6675 return transit<RepWaitBackfillReserved>();
6676 }
6677
6678 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6679 {
6680 context< RecoveryMachine >().log_exit(state_name, enter_time);
6681 PG *pg = context< RecoveryMachine >().pg;
6682 utime_t dur = ceph_clock_now() - enter_time;
6683 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6684 }
6685
6686 boost::statechart::result
6687 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6688 {
6689 PG *pg = context< RecoveryMachine >().pg;
6690
6691 ostringstream ss;
6692 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6693 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6694 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6695 << "failure injection" << dendl;
6696 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6697 post_event(RemoteReservationRejected());
6698 return discard_event();
6699 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6700 pg->osd->check_backfill_full(ss)) {
6701 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6702 << ss.str() << dendl;
6703 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6704 post_event(RemoteReservationRejected());
6705 return discard_event();
6706 } else {
6707 pg->osd->send_message_osd_cluster(
6708 pg->primary.osd,
6709 new MBackfillReserve(
6710 MBackfillReserve::GRANT,
6711 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6712 pg->get_osdmap()->get_epoch()),
6713 pg->get_osdmap()->get_epoch());
6714 return transit<RepRecovering>();
6715 }
6716 }
6717
6718 boost::statechart::result
6719 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected &evt)
6720 {
6721 PG *pg = context< RecoveryMachine >().pg;
6722 pg->reject_reservation();
6723 return transit<RepNotRecovering>();
6724 }
6725
6726 /*---RepRecovering-------*/
6727 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6728 : my_base(ctx),
6729 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6730 {
6731 context< RecoveryMachine >().log_enter(state_name);
6732 }
6733
6734 boost::statechart::result
6735 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6736 {
6737 PG *pg = context< RecoveryMachine >().pg;
6738 pg->reject_reservation();
6739 return discard_event();
6740 }
6741
6742 void PG::RecoveryState::RepRecovering::exit()
6743 {
6744 context< RecoveryMachine >().log_exit(state_name, enter_time);
6745 PG *pg = context< RecoveryMachine >().pg;
6746 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6747 utime_t dur = ceph_clock_now() - enter_time;
6748 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6749 }
6750
6751 /*------Activating--------*/
6752 PG::RecoveryState::Activating::Activating(my_context ctx)
6753 : my_base(ctx),
6754 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6755 {
6756 context< RecoveryMachine >().log_enter(state_name);
6757 }
6758
6759 void PG::RecoveryState::Activating::exit()
6760 {
6761 context< RecoveryMachine >().log_exit(state_name, enter_time);
6762 PG *pg = context< RecoveryMachine >().pg;
6763 utime_t dur = ceph_clock_now() - enter_time;
6764 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6765 }
6766
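// Before reserving locally for recovery, bail out with RecoveryTooFull if any
// OSD in actingbackfill is full (unless the debug override is set); otherwise
// enter RECOVERY_WAIT and queue a reservation with the local reserver.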
6767 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6768 : my_base(ctx),
6769 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6770 {
6771 context< RecoveryMachine >().log_enter(state_name);
6772 PG *pg = context< RecoveryMachine >().pg;
6773
6774 // Make sure all nodes that are part of the recovery aren't full
6775 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6776 pg->osd->check_osdmap_full(pg->actingbackfill)) {
6777 post_event(RecoveryTooFull());
6778 return;
6779 }
6780
6781 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6782 pg->state_set(PG_STATE_RECOVERY_WAIT);
6783 pg->osd->local_reserver.request_reservation(
6784 pg->info.pgid,
6785 new QueuePeeringEvt<LocalRecoveryReserved>(
6786 pg, pg->get_osdmap()->get_epoch(),
6787 LocalRecoveryReserved()),
6788 pg->get_recovery_priority());
6789 pg->publish_stats_to_osd();
6790 }
6791
6792 boost::statechart::result
6793 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
6794 {
6795 PG *pg = context< RecoveryMachine >().pg;
6796 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
6797 pg->schedule_recovery_full_retry();
6798 return transit<NotRecovering>();
6799 }
6800
6801 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6802 {
6803 context< RecoveryMachine >().log_exit(state_name, enter_time);
6804 PG *pg = context< RecoveryMachine >().pg;
6805 utime_t dur = ceph_clock_now() - enter_time;
6806 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
6807 }
6808
6809 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
6810 : my_base(ctx),
6811 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6812 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
6813 {
6814 context< RecoveryMachine >().log_enter(state_name);
6815 post_event(RemoteRecoveryReserved());
6816 }
6817
6818 boost::statechart::result
6819 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
6820 PG *pg = context< RecoveryMachine >().pg;
6821
6822 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
6823 assert(*remote_recovery_reservation_it != pg->pg_whoami);
6824 ConnectionRef con = pg->osd->get_con_osd_cluster(
6825 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
6826 if (con) {
6827 pg->osd->send_message_osd_cluster(
6828 new MRecoveryReserve(
6829 MRecoveryReserve::REQUEST,
6830 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
6831 pg->get_osdmap()->get_epoch()),
6832 con.get());
6833 }
6834 ++remote_recovery_reservation_it;
6835 } else {
6836 post_event(AllRemotesReserved());
6837 }
6838 return discard_event();
6839 }
6840
6841 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6842 {
6843 context< RecoveryMachine >().log_exit(state_name, enter_time);
6844 PG *pg = context< RecoveryMachine >().pg;
6845 utime_t dur = ceph_clock_now() - enter_time;
6846 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
6847 }
6848
6849 PG::RecoveryState::Recovering::Recovering(my_context ctx)
6850 : my_base(ctx),
6851 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
6852 {
6853 context< RecoveryMachine >().log_enter(state_name);
6854
6855 PG *pg = context< RecoveryMachine >().pg;
6856 pg->state_clear(PG_STATE_RECOVERY_WAIT);
6857 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6858 pg->state_set(PG_STATE_RECOVERING);
6859 pg->publish_stats_to_osd();
6860 pg->queue_recovery();
6861 }
6862
6863 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
6864 {
6865 PG *pg = context< RecoveryMachine >().pg;
6866 assert(cancel || !pg->pg_log.get_missing().have_missing());
6867
6868 // release remote reservations
6869 for (set<pg_shard_t>::const_iterator i =
6870 context< Active >().remote_shards_to_reserve_recovery.begin();
6871 i != context< Active >().remote_shards_to_reserve_recovery.end();
6872 ++i) {
6873 if (*i == pg->pg_whoami) // skip myself
6874 continue;
6875 ConnectionRef con = pg->osd->get_con_osd_cluster(
6876 i->osd, pg->get_osdmap()->get_epoch());
6877 if (con) {
6878 pg->osd->send_message_osd_cluster(
6879 new MRecoveryReserve(
6880 MRecoveryReserve::RELEASE,
6881 spg_t(pg->info.pgid.pgid, i->shard),
6882 pg->get_osdmap()->get_epoch()),
6883 con.get());
6884 }
6885 }
6886 }
6887
6888 boost::statechart::result
6889 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
6890 {
6891 PG *pg = context< RecoveryMachine >().pg;
6892 pg->state_clear(PG_STATE_RECOVERING);
6893 pg->state_clear(PG_STATE_FORCED_RECOVERY);
6894 release_reservations();
6895 return transit<Recovered>();
6896 }
6897
6898 boost::statechart::result
6899 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
6900 {
6901 PG *pg = context< RecoveryMachine >().pg;
6902 pg->state_clear(PG_STATE_RECOVERING);
6903 pg->state_clear(PG_STATE_FORCED_RECOVERY);
6904 release_reservations();
6905 return transit<WaitRemoteBackfillReserved>();
6906 }
6907
6908 boost::statechart::result
6909 PG::RecoveryState::Recovering::react(const CancelRecovery &evt)
6910 {
6911 PG *pg = context< RecoveryMachine >().pg;
6912 pg->state_clear(PG_STATE_RECOVERING);
6913 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6914 release_reservations(true);
6915 pg->schedule_recovery_full_retry();
6916 return transit<NotRecovering>();
6917 }
6918
6919 void PG::RecoveryState::Recovering::exit()
6920 {
6921 context< RecoveryMachine >().log_exit(state_name, enter_time);
6922 PG *pg = context< RecoveryMachine >().pg;
6923 utime_t dur = ceph_clock_now() - enter_time;
6924 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
6925 }
6926
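// Recovery finished with nothing missing: drop the local reservation, clear
// DEGRADED and the forced recovery/backfill flags if the pool size is covered
// by actingbackfill, trim the pg log, possibly adjust the acting set, and post
// GoClean once all replicas have activated.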
6927 PG::RecoveryState::Recovered::Recovered(my_context ctx)
6928 : my_base(ctx),
6929 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
6930 {
6931 pg_shard_t auth_log_shard;
6932
6933 context< RecoveryMachine >().log_enter(state_name);
6934
6935 PG *pg = context< RecoveryMachine >().pg;
6936 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6937
6938 assert(!pg->needs_recovery());
6939
6940 // if we finished backfill, all acting are active; recheck if
6941 // DEGRADED | UNDERSIZED is appropriate.
6942 assert(!pg->actingbackfill.empty());
6943 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
6944 pg->actingbackfill.size()) {
6945 pg->state_clear(PG_STATE_DEGRADED);
6946 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6947 pg->publish_stats_to_osd();
6948 }
6949
6950 // trim pglog on recovered
6951 pg->trim_log();
6952
6953 // adjust acting set? (e.g. because backfill completed...)
6954 bool history_les_bound = false;
6955 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
6956 true, &history_les_bound))
6957 assert(pg->want_acting.size());
6958
6959 if (context< Active >().all_replicas_activated)
6960 post_event(GoClean());
6961 }
6962
6963 void PG::RecoveryState::Recovered::exit()
6964 {
6965 context< RecoveryMachine >().log_exit(state_name, enter_time);
6966 PG *pg = context< RecoveryMachine >().pg;
6967 utime_t dur = ceph_clock_now() - enter_time;
6968 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
6969 }
6970
6971 PG::RecoveryState::Clean::Clean(my_context ctx)
6972 : my_base(ctx),
6973 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
6974 {
6975 context< RecoveryMachine >().log_enter(state_name);
6976
6977 PG *pg = context< RecoveryMachine >().pg;
6978
6979 if (pg->info.last_complete != pg->info.last_update) {
6980 ceph_abort();
6981 }
6982 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
6983
6984 if (pg->is_active()) {
6985 pg->mark_clean();
6986 }
6987
6988 pg->share_pg_info();
6989 pg->publish_stats_to_osd();
6990 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
6991 }
6992
6993 void PG::RecoveryState::Clean::exit()
6994 {
6995 context< RecoveryMachine >().log_exit(state_name, enter_time);
6996 PG *pg = context< RecoveryMachine >().pg;
6997 pg->state_clear(PG_STATE_CLEAN);
6998 utime_t dur = ceph_clock_now() - enter_time;
6999 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7000 }
7001
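// Collapse a set of pg_shard_t to at most one shard per OSD, skipping 'skip'
// (normally ourself), so that recovery/backfill reservations are requested
// only once per remote OSD.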
7002 template <typename T>
7003 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7004 {
7005 set<int> osds_found;
7006 set<pg_shard_t> out;
7007 for (typename T::const_iterator i = in.begin();
7008 i != in.end();
7009 ++i) {
7010 if (*i != skip && !osds_found.count(i->osd)) {
7011 osds_found.insert(i->osd);
7012 out.insert(*i);
7013 }
7014 }
7015 return out;
7016 }
7017
7018 /*---------Active---------*/
7019 PG::RecoveryState::Active::Active(my_context ctx)
7020 : my_base(ctx),
7021 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7022 remote_shards_to_reserve_recovery(
7023 unique_osd_shard_set(
7024 context< RecoveryMachine >().pg->pg_whoami,
7025 context< RecoveryMachine >().pg->actingbackfill)),
7026 remote_shards_to_reserve_backfill(
7027 unique_osd_shard_set(
7028 context< RecoveryMachine >().pg->pg_whoami,
7029 context< RecoveryMachine >().pg->backfill_targets)),
7030 all_replicas_activated(false)
7031 {
7032 context< RecoveryMachine >().log_enter(state_name);
7033
7034 PG *pg = context< RecoveryMachine >().pg;
7035
7036 assert(!pg->backfill_reserving);
7037 assert(!pg->backfill_reserved);
7038 assert(pg->is_primary());
7039 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7040 pg->start_flush(
7041 context< RecoveryMachine >().get_cur_transaction(),
7042 context< RecoveryMachine >().get_on_applied_context_list(),
7043 context< RecoveryMachine >().get_on_safe_context_list());
7044 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7045 pg->get_osdmap()->get_epoch(),
7046 *context< RecoveryMachine >().get_on_safe_context_list(),
7047 *context< RecoveryMachine >().get_query_map(),
7048 context< RecoveryMachine >().get_info_map(),
7049 context< RecoveryMachine >().get_recovery_ctx());
7050
7051 // everyone has to commit/ack before we are truly active
7052 pg->blocked_by.clear();
7053 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7054 p != pg->actingbackfill.end();
7055 ++p) {
7056 if (p->shard != pg->pg_whoami.shard) {
7057 pg->blocked_by.insert(p->shard);
7058 }
7059 }
7060 pg->publish_stats_to_osd();
7061 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7062 }
7063
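// On each osdmap advance while Active: pick up newly removed snaps for
// trimming, re-evaluate UNDERSIZED/DEGRADED if the pool size changed, and
// republish stats if we have not reported for osd_pg_stat_report_interval_max
// epochs.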
7064 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7065 {
7066 PG *pg = context< RecoveryMachine >().pg;
7067 ldout(pg->cct, 10) << "Active advmap" << dendl;
7068 if (!pg->pool.newly_removed_snaps.empty()) {
7069 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7070 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7071 pg->dirty_info = true;
7072 pg->dirty_big_info = true;
7073 }
7074
7075 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7076 int osd = pg->want_acting[i];
7077 if (!advmap.osdmap->is_up(osd)) {
7078 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7079 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7080 }
7081 }
7082
7083 bool need_publish = false;
7084 /* Check for changes in pool size (if the acting set changed as a result,
7085 * this does not matter) */
7086 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7087 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7088 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7089 pg->state_clear(PG_STATE_UNDERSIZED);
7090 if (pg->needs_recovery()) {
7091 pg->state_set(PG_STATE_DEGRADED);
7092 } else {
7093 pg->state_clear(PG_STATE_DEGRADED);
7094 }
7095 } else {
7096 pg->state_set(PG_STATE_UNDERSIZED);
7097 pg->state_set(PG_STATE_DEGRADED);
7098 }
7099 need_publish = true; // degraded may have changed
7100 }
7101
7102 // if we haven't reported our PG stats in a long time, do so now.
7103 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7104 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7105 << " epochs" << dendl;
7106 need_publish = true;
7107 }
7108
7109 if (need_publish)
7110 pg->publish_stats_to_osd();
7111
7112 return forward_event();
7113 }
7114
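// ActMap tick for the primary: re-probe peers for unfound objects, log
// unfound-and-apparently-lost objects, kick snap trimming, and queue
// recovery/backfill unless the NOBACKFILL/NOREBALANCE osdmap flags forbid it.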
7115 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7116 {
7117 PG *pg = context< RecoveryMachine >().pg;
7118 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7119 assert(pg->is_primary());
7120
7121 if (pg->have_unfound()) {
7122 // object may have become unfound
7123 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7124 }
7125
7126 if (pg->cct->_conf->osd_check_for_log_corruption)
7127 pg->check_log_for_corruption(pg->osd->store);
7128
7129 uint64_t unfound = pg->missing_loc.num_unfound();
7130 if (unfound > 0 &&
7131 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7132 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7133 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7134 << " objects unfound and apparently lost, would automatically "
7135 << "mark these objects lost but this feature is not yet implemented "
7136 << "(osd_auto_mark_unfound_lost)";
7137 } else
7138 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7139 << unfound << " objects unfound and apparently lost";
7140 }
7141
7142 if (pg->is_active()) {
7143 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7144 pg->kick_snap_trim();
7145 }
7146
7147 if (pg->is_peered() &&
7148 !pg->is_clean() &&
7149 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7150 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7151 pg->queue_recovery();
7152 }
7153 return forward_event();
7154 }
7155
7156 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7157 {
7158 PG *pg = context< RecoveryMachine >().pg;
7159 assert(pg->is_primary());
7160 if (pg->peer_info.count(notevt.from)) {
7161 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7162 << ", already have info from that osd, ignoring"
7163 << dendl;
7164 } else if (pg->peer_purged.count(notevt.from)) {
7165 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7166 << ", already purged that peer, ignoring"
7167 << dendl;
7168 } else {
7169 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7170 << ", calling proc_replica_info and discover_all_missing"
7171 << dendl;
7172 pg->proc_replica_info(
7173 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7174 if (pg->have_unfound()) {
7175 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7176 }
7177 }
7178 return discard_event();
7179 }
7180
7181 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7182 {
7183 PG *pg = context< RecoveryMachine >().pg;
7184 assert(pg->is_primary());
7185
7186 assert(!pg->actingbackfill.empty());
7187 // don't update history (yet) if we are active and primary; the replica
7188 // may be telling us they have activated (and committed) but we can't
7189 // share that until _everyone_ does the same.
7190 if (pg->is_actingbackfill(infoevt.from)) {
7191 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7192 << " activated and committed" << dendl;
7193 pg->peer_activated.insert(infoevt.from);
7194 pg->blocked_by.erase(infoevt.from.shard);
7195 pg->publish_stats_to_osd();
7196 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7197 pg->all_activated_and_committed();
7198 }
7199 }
7200 return discard_event();
7201 }
7202
7203 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7204 {
7205 PG *pg = context< RecoveryMachine >().pg;
7206 ldout(pg->cct, 10) << "searching osd." << logevt.from
7207 << " log for unfound items" << dendl;
7208 pg->proc_replica_log(
7209 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7210 bool got_missing = pg->search_for_missing(
7211 pg->peer_info[logevt.from],
7212 pg->peer_missing[logevt.from],
7213 logevt.from,
7214 context< RecoveryMachine >().get_recovery_ctx());
7215 if (pg->is_peered() &&
7216 got_missing)
7217 pg->queue_recovery();
7218 return discard_event();
7219 }
7220
7221 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7222 {
7223 PG *pg = context< RecoveryMachine >().pg;
7224
7225 q.f->open_object_section("state");
7226 q.f->dump_string("name", state_name);
7227 q.f->dump_stream("enter_time") << enter_time;
7228
7229 {
7230 q.f->open_array_section("might_have_unfound");
7231 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7232 p != pg->might_have_unfound.end();
7233 ++p) {
7234 q.f->open_object_section("osd");
7235 q.f->dump_stream("osd") << *p;
7236 if (pg->peer_missing.count(*p)) {
7237 q.f->dump_string("status", "already probed");
7238 } else if (pg->peer_missing_requested.count(*p)) {
7239 q.f->dump_string("status", "querying");
7240 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7241 q.f->dump_string("status", "osd is down");
7242 } else {
7243 q.f->dump_string("status", "not queried");
7244 }
7245 q.f->close_section();
7246 }
7247 q.f->close_section();
7248 }
7249 {
7250 q.f->open_object_section("recovery_progress");
7251 pg->dump_recovery_info(q.f);
7252 q.f->close_section();
7253 }
7254
7255 {
7256 q.f->open_object_section("scrub");
7257 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7258 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7259 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7260 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7261 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7262 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7263 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7264 q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
7265 q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
7266 {
7267 q.f->open_array_section("scrubber.waiting_on_whom");
7268 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7269 p != pg->scrubber.waiting_on_whom.end();
7270 ++p) {
7271 q.f->dump_stream("shard") << *p;
7272 }
7273 q.f->close_section();
7274 }
7275 q.f->close_section();
7276 }
7277
7278 q.f->close_section();
7279 return forward_event();
7280 }
7281
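// Every acting/backfill peer has activated and committed: move the PG to
// ACTIVE (or PEERED if we are below min_size), persist last_epoch_started in
// the history, and requeue ops that were waiting for the PG to peer.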
7282 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7283 {
7284 PG *pg = context< RecoveryMachine >().pg;
7285 all_replicas_activated = true;
7286
7287 pg->state_clear(PG_STATE_ACTIVATING);
7288 pg->state_clear(PG_STATE_CREATING);
7289 if (pg->acting.size() >= pg->pool.info.min_size) {
7290 pg->state_set(PG_STATE_ACTIVE);
7291 } else {
7292 pg->state_set(PG_STATE_PEERED);
7293 }
7294
7295 // info.last_epoch_started is set during activate()
7296 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7297 pg->info.history.last_interval_started = pg->info.last_interval_started;
7298 pg->dirty_info = true;
7299
7300 pg->share_pg_info();
7301 pg->publish_stats_to_osd();
7302
7303 pg->check_local();
7304
7305 // waiters
7306 if (pg->flushes_in_progress == 0) {
7307 pg->requeue_ops(pg->waiting_for_peered);
7308 }
7309
7310 pg->on_activate();
7311
7312 return discard_event();
7313 }
7314
7315 void PG::RecoveryState::Active::exit()
7316 {
7317 context< RecoveryMachine >().log_exit(state_name, enter_time);
7318 PG *pg = context< RecoveryMachine >().pg;
7319 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7320
7321 pg->blocked_by.clear();
7322 pg->backfill_reserved = false;
7323 pg->backfill_reserving = false;
7324 pg->state_clear(PG_STATE_ACTIVATING);
7325 pg->state_clear(PG_STATE_DEGRADED);
7326 pg->state_clear(PG_STATE_UNDERSIZED);
7327 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7328 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7329 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7330 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7331 utime_t dur = ceph_clock_now() - enter_time;
7332 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7333 pg->agent_stop();
7334 }
7335
7336 /*------ReplicaActive-----*/
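// ReplicaActive: this OSD is an active, non-primary member of the PG.  It
// activates on the Activate event, applies info/log updates sent by the
// primary, and re-notifies the primary on each ActMap.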
7337 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7338 : my_base(ctx),
7339 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7340 {
7341 context< RecoveryMachine >().log_enter(state_name);
7342
7343 PG *pg = context< RecoveryMachine >().pg;
7344 pg->start_flush(
7345 context< RecoveryMachine >().get_cur_transaction(),
7346 context< RecoveryMachine >().get_on_applied_context_list(),
7347 context< RecoveryMachine >().get_on_safe_context_list());
7348 }
7349
7350
7351 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7352 const Activate& actevt) {
7353 PG *pg = context< RecoveryMachine >().pg;
7354 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7355 map<int, map<spg_t, pg_query_t> > query_map;
7356 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7357 actevt.activation_epoch,
7358 *context< RecoveryMachine >().get_on_safe_context_list(),
7359 query_map, NULL, NULL);
7360 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7361 return discard_event();
7362 }
7363
7364 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7365 {
7366 PG *pg = context< RecoveryMachine >().pg;
7367 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7368 infoevt.info);
7369 return discard_event();
7370 }
7371
7372 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7373 {
7374 PG *pg = context< RecoveryMachine >().pg;
7375 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7376 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7377 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7378 assert(pg->pg_log.get_head() == pg->info.last_update);
7379
7380 return discard_event();
7381 }
7382
7383 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7384 {
7385 PG *pg = context< RecoveryMachine >().pg;
7386 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7387 context< RecoveryMachine >().send_notify(
7388 pg->get_primary(),
7389 pg_notify_t(
7390 pg->get_primary().shard, pg->pg_whoami.shard,
7391 pg->get_osdmap()->get_epoch(),
7392 pg->get_osdmap()->get_epoch(),
7393 pg->info),
7394 pg->past_intervals);
7395 }
7396 pg->take_waiters();
7397 return discard_event();
7398 }
7399
7400 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7401 {
7402 PG *pg = context< RecoveryMachine >().pg;
7403 if (query.query.type == pg_query_t::MISSING) {
7404 pg->update_history(query.query.history);
7405 pg->fulfill_log(query.from, query.query, query.query_epoch);
7406 } // else: from prior to activation, safe to ignore
7407 return discard_event();
7408 }
7409
7410 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7411 {
7412 q.f->open_object_section("state");
7413 q.f->dump_string("name", state_name);
7414 q.f->dump_stream("enter_time") << enter_time;
7415 q.f->close_section();
7416 return forward_event();
7417 }
7418
7419 void PG::RecoveryState::ReplicaActive::exit()
7420 {
7421 context< RecoveryMachine >().log_exit(state_name, enter_time);
7422 PG *pg = context< RecoveryMachine >().pg;
7423 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7424 utime_t dur = ceph_clock_now() - enter_time;
7425 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7426 }
7427
7428 /*-------Stray---*/
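// Stray: this OSD holds the PG but is neither primary nor peered; answer the
// primary's queries and wait for an info or log that lets us activate as a
// replica.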
7429 PG::RecoveryState::Stray::Stray(my_context ctx)
7430 : my_base(ctx),
7431 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7432 {
7433 context< RecoveryMachine >().log_enter(state_name);
7434
7435 PG *pg = context< RecoveryMachine >().pg;
7436 assert(!pg->is_peered());
7437 assert(!pg->is_peering());
7438 assert(!pg->is_primary());
7439 pg->start_flush(
7440 context< RecoveryMachine >().get_cur_transaction(),
7441 context< RecoveryMachine >().get_on_applied_context_list(),
7442 context< RecoveryMachine >().get_on_safe_context_list());
7443 }
7444
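// The primary sent us an authoritative info+log.  An empty last_backfill
// means our local history is unusable and backfill will restart from scratch;
// otherwise merge the primary's log into ours.  Either way, activate and
// become ReplicaActive.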
7445 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7446 {
7447 PG *pg = context< RecoveryMachine >().pg;
7448 MOSDPGLog *msg = logevt.msg.get();
7449 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7450
7451 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7452 if (msg->info.last_backfill == hobject_t()) {
7453 // restart backfill
7454 pg->unreg_next_scrub();
7455 pg->info = msg->info;
7456 pg->reg_next_scrub();
7457 pg->dirty_info = true;
7458 pg->dirty_big_info = true; // may not be strictly needed, but be safe.
7459
7460 PGLogEntryHandler rollbacker{pg, t};
7461 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7462
7463 pg->pg_log.reset_backfill();
7464 } else {
7465 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7466 }
7467
7468 assert(pg->pg_log.get_head() == pg->info.last_update);
7469
7470 post_event(Activate(logevt.msg->info.last_epoch_started));
7471 return transit<ReplicaActive>();
7472 }
7473
7474 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7475 {
7476 PG *pg = context< RecoveryMachine >().pg;
7477 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7478
7479 if (pg->info.last_update > infoevt.info.last_update) {
7480 // rewind divergent log entries
7481 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7482 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7483 pg->info.stats = infoevt.info.stats;
7484 pg->info.hit_set = infoevt.info.hit_set;
7485 }
7486
7487 assert(infoevt.info.last_update == pg->info.last_update);
7488 assert(pg->pg_log.get_head() == pg->info.last_update);
7489
7490 post_event(Activate(infoevt.info.last_epoch_started));
7491 return transit<ReplicaActive>();
7492 }
7493
7494 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7495 {
7496 PG *pg = context< RecoveryMachine >().pg;
7497 if (query.query.type == pg_query_t::INFO) {
7498 pair<pg_shard_t, pg_info_t> notify_info;
7499 pg->update_history(query.query.history);
7500 pg->fulfill_info(query.from, query.query, notify_info);
7501 context< RecoveryMachine >().send_notify(
7502 notify_info.first,
7503 pg_notify_t(
7504 notify_info.first.shard, pg->pg_whoami.shard,
7505 query.query_epoch,
7506 pg->get_osdmap()->get_epoch(),
7507 notify_info.second),
7508 pg->past_intervals);
7509 } else {
7510 pg->fulfill_log(query.from, query.query, query.query_epoch);
7511 }
7512 return discard_event();
7513 }
7514
7515 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7516 {
7517 PG *pg = context< RecoveryMachine >().pg;
7518 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7519 context< RecoveryMachine >().send_notify(
7520 pg->get_primary(),
7521 pg_notify_t(
7522 pg->get_primary().shard, pg->pg_whoami.shard,
7523 pg->get_osdmap()->get_epoch(),
7524 pg->get_osdmap()->get_epoch(),
7525 pg->info),
7526 pg->past_intervals);
7527 }
7528 pg->take_waiters();
7529 return discard_event();
7530 }
7531
7532 void PG::RecoveryState::Stray::exit()
7533 {
7534 context< RecoveryMachine >().log_exit(state_name, enter_time);
7535 PG *pg = context< RecoveryMachine >().pg;
7536 utime_t dur = ceph_clock_now() - enter_time;
7537 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7538 }
7539
7540 /*--------GetInfo---------*/
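// First peering phase: build the prior set and ask every OSD that may hold a
// copy of this PG for its pg_info_t.  We post GotInfo once all requested
// infos arrive, or IsDown if the prior set says the PG cannot peer yet.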
7541 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7542 : my_base(ctx),
7543 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7544 {
7545 context< RecoveryMachine >().log_enter(state_name);
7546
7547 PG *pg = context< RecoveryMachine >().pg;
7548 pg->check_past_interval_bounds();
7549 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7550
7551 assert(pg->blocked_by.empty());
7552
7553 prior_set = pg->build_prior();
7554
7555 pg->reset_min_peer_features();
7556 get_infos();
7557 if (prior_set.pg_down) {
7558 post_event(IsDown());
7559 } else if (peer_info_requested.empty()) {
7560 post_event(GotInfo());
7561 }
7562 }
7563
7564 void PG::RecoveryState::GetInfo::get_infos()
7565 {
7566 PG *pg = context< RecoveryMachine >().pg;
7567 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7568
7569 pg->blocked_by.clear();
7570 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7571 it != prior_set.probe.end();
7572 ++it) {
7573 pg_shard_t peer = *it;
7574 if (peer == pg->pg_whoami) {
7575 continue;
7576 }
7577 if (pg->peer_info.count(peer)) {
7578 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7579 continue;
7580 }
7581 if (peer_info_requested.count(peer)) {
7582 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7583 pg->blocked_by.insert(peer.osd);
7584 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7585 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7586 } else {
7587 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7588 context< RecoveryMachine >().send_query(
7589 peer, pg_query_t(pg_query_t::INFO,
7590 it->shard, pg->pg_whoami.shard,
7591 pg->info.history,
7592 pg->get_osdmap()->get_epoch()));
7593 peer_info_requested.insert(peer);
7594 pg->blocked_by.insert(peer.osd);
7595 }
7596 }
7597
7598 pg->publish_stats_to_osd();
7599 }
7600
7601 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
7602 {
7603 PG *pg = context< RecoveryMachine >().pg;
7604
7605 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
7606 if (p != peer_info_requested.end()) {
7607 peer_info_requested.erase(p);
7608 pg->blocked_by.erase(infoevt.from.osd);
7609 }
7610
7611 epoch_t old_start = pg->info.history.last_epoch_started;
7612 if (pg->proc_replica_info(
7613 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
7614 // we got something new ...
7615 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7616 if (old_start < pg->info.history.last_epoch_started) {
7617 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
7618 prior_set = pg->build_prior();
7619
7620 // filter out any osds that got dropped from the probe set from
7621 // peer_info_requested. this is less expensive than restarting
7622 // peering (which would re-probe everyone).
7623 set<pg_shard_t>::iterator p = peer_info_requested.begin();
7624 while (p != peer_info_requested.end()) {
7625 if (prior_set.probe.count(*p) == 0) {
7626 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
7627 peer_info_requested.erase(p++);
7628 } else {
7629 ++p;
7630 }
7631 }
7632 get_infos();
7633 }
7634 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
7635 << hex << infoevt.features << dec << dendl;
7636 pg->apply_peer_features(infoevt.features);
7637
7638 // are we done getting everything?
7639 if (peer_info_requested.empty() && !prior_set.pg_down) {
7640 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
7641 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
7642 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
7643 post_event(GotInfo());
7644 }
7645 }
7646 return discard_event();
7647 }
7648
7649 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
7650 {
7651 PG *pg = context< RecoveryMachine >().pg;
7652 q.f->open_object_section("state");
7653 q.f->dump_string("name", state_name);
7654 q.f->dump_stream("enter_time") << enter_time;
7655
7656 q.f->open_array_section("requested_info_from");
7657 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
7658 p != peer_info_requested.end();
7659 ++p) {
7660 q.f->open_object_section("osd");
7661 q.f->dump_stream("osd") << *p;
7662 if (pg->peer_info.count(*p)) {
7663 q.f->open_object_section("got_info");
7664 pg->peer_info[*p].dump(q.f);
7665 q.f->close_section();
7666 }
7667 q.f->close_section();
7668 }
7669 q.f->close_section();
7670
7671 q.f->close_section();
7672 return forward_event();
7673 }
7674
7675 void PG::RecoveryState::GetInfo::exit()
7676 {
7677 context< RecoveryMachine >().log_exit(state_name, enter_time);
7678 PG *pg = context< RecoveryMachine >().pg;
7679 utime_t dur = ceph_clock_now() - enter_time;
7680 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
7681 pg->blocked_by.clear();
7682 pg->publish_stats_to_osd();
7683 }
7684
7685 /*------GetLog------------*/
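// Second peering phase: settle the acting set and pick the authoritative log
// shard.  If another OSD has the best log, request enough of it to cover the
// oldest last_update among our acting/backfill peers.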
7686 PG::RecoveryState::GetLog::GetLog(my_context ctx)
7687 : my_base(ctx),
7688 NamedState(
7689 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
7690 msg(0)
7691 {
7692 context< RecoveryMachine >().log_enter(state_name);
7693
7694 PG *pg = context< RecoveryMachine >().pg;
7695
7696 // adjust acting?
7697 if (!pg->choose_acting(auth_log_shard, false,
7698 &context< Peering >().history_les_bound)) {
7699 if (!pg->want_acting.empty()) {
7700 post_event(NeedActingChange());
7701 } else {
7702 post_event(IsIncomplete());
7703 }
7704 return;
7705 }
7706
7707 // am i the best?
7708 if (auth_log_shard == pg->pg_whoami) {
7709 post_event(GotLog());
7710 return;
7711 }
7712
7713 const pg_info_t& best = pg->peer_info[auth_log_shard];
7714
7715 // am i broken?
7716 if (pg->info.last_update < best.log_tail) {
7717 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
7718 post_event(IsIncomplete());
7719 return;
7720 }
7721
7722 // how much log to request?
7723 eversion_t request_log_from = pg->info.last_update;
7724 assert(!pg->actingbackfill.empty());
7725 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7726 p != pg->actingbackfill.end();
7727 ++p) {
7728 if (*p == pg->pg_whoami) continue;
7729 pg_info_t& ri = pg->peer_info[*p];
7730 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
7731 ri.last_update < request_log_from)
7732 request_log_from = ri.last_update;
7733 }
7734
7735 // request the log from the auth shard
7736 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
7737 context<RecoveryMachine>().send_query(
7738 auth_log_shard,
7739 pg_query_t(
7740 pg_query_t::LOG,
7741 auth_log_shard.shard, pg->pg_whoami.shard,
7742 request_log_from, pg->info.history,
7743 pg->get_osdmap()->get_epoch()));
7744
7745 assert(pg->blocked_by.empty());
7746 pg->blocked_by.insert(auth_log_shard.osd);
7747 pg->publish_stats_to_osd();
7748 }
7749
7750 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
7751 {
7752 PG *pg = context< RecoveryMachine >().pg;
7753 // make sure our log source didn't go down. we need to check
7754 // explicitly because it may not be part of the prior set, which
7755 // means the Peering state check won't catch it going down.
7756 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
7757 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
7758 << auth_log_shard.osd << " went down" << dendl;
7759 post_event(advmap);
7760 return transit< Reset >();
7761 }
7762
7763 // let the Peering state do its checks.
7764 return forward_event();
7765 }
7766
7767 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
7768 {
7769 PG *pg = context< RecoveryMachine >().pg;
7770 assert(!msg);
7771 if (logevt.from != auth_log_shard) {
7772 ldout(pg->cct, 10) << "GetLog: discarding log from "
7773 << "non-auth_log_shard osd." << logevt.from << dendl;
7774 return discard_event();
7775 }
7776 ldout(pg->cct, 10) << "GetLog: received master log from osd"
7777 << logevt.from << dendl;
7778 msg = logevt.msg;
7779 post_event(GotLog());
7780 return discard_event();
7781 }
7782
7783 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
7784 {
7785 PG *pg = context< RecoveryMachine >().pg;
7786 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
7787 if (msg) {
7788 ldout(pg->cct, 10) << "processing master log" << dendl;
7789 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
7790 msg->info, msg->log, msg->missing,
7791 auth_log_shard);
7792 }
7793 pg->start_flush(
7794 context< RecoveryMachine >().get_cur_transaction(),
7795 context< RecoveryMachine >().get_on_applied_context_list(),
7796 context< RecoveryMachine >().get_on_safe_context_list());
7797 return transit< GetMissing >();
7798 }
7799
7800 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
7801 {
7802 q.f->open_object_section("state");
7803 q.f->dump_string("name", state_name);
7804 q.f->dump_stream("enter_time") << enter_time;
7805 q.f->dump_stream("auth_log_shard") << auth_log_shard;
7806 q.f->close_section();
7807 return forward_event();
7808 }
7809
7810 void PG::RecoveryState::GetLog::exit()
7811 {
7812 context< RecoveryMachine >().log_exit(state_name, enter_time);
7813 PG *pg = context< RecoveryMachine >().pg;
7814 utime_t dur = ceph_clock_now() - enter_time;
7815 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
7816 pg->blocked_by.clear();
7817 pg->publish_stats_to_osd();
7818 }
7819
7820 /*------WaitActingChange--------*/
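// An acting-set change has been requested (want_acting is non-empty); wait
// for it to take effect, resetting if any wanted OSD goes down in the
// meantime and ignoring stray log/info/notify messages.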
7821 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
7822 : my_base(ctx),
7823 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
7824 {
7825 context< RecoveryMachine >().log_enter(state_name);
7826 }
7827
7828 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
7829 {
7830 PG *pg = context< RecoveryMachine >().pg;
7831 OSDMapRef osdmap = advmap.osdmap;
7832
7833 ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets didn't go down" << dendl;
7834 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
7835 if (!osdmap->is_up(*p)) {
7836 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7837 post_event(advmap);
7838 return transit< Reset >();
7839 }
7840 }
7841 return forward_event();
7842 }
7843
7844 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
7845 {
7846 PG *pg = context< RecoveryMachine >().pg;
7847 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLocRec" << dendl;
7848 return discard_event();
7849 }
7850
7851 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
7852 {
7853 PG *pg = context< RecoveryMachine >().pg;
7854 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7855 return discard_event();
7856 }
7857
7858 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
7859 {
7860 PG *pg = context< RecoveryMachine >().pg;
7861 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7862 return discard_event();
7863 }
7864
7865 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
7866 {
7867 q.f->open_object_section("state");
7868 q.f->dump_string("name", state_name);
7869 q.f->dump_stream("enter_time") << enter_time;
7870 q.f->dump_string("comment", "waiting for pg acting set to change");
7871 q.f->close_section();
7872 return forward_event();
7873 }
7874
7875 void PG::RecoveryState::WaitActingChange::exit()
7876 {
7877 context< RecoveryMachine >().log_exit(state_name, enter_time);
7878 PG *pg = context< RecoveryMachine >().pg;
7879 utime_t dur = ceph_clock_now() - enter_time;
7880 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
7881 }
7882
7883 /*------Down--------*/
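// Down: too few OSDs from a prior interval are up for peering to proceed;
// record them in blocked_by and wait for them to return.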
7884 PG::RecoveryState::Down::Down(my_context ctx)
7885 : my_base(ctx),
7886 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
7887 {
7888 context< RecoveryMachine >().log_enter(state_name);
7889 PG *pg = context< RecoveryMachine >().pg;
7890
7891 pg->state_clear(PG_STATE_PEERING);
7892 pg->state_set(PG_STATE_DOWN);
7893
7894 auto &prior_set = context< Peering >().prior_set;
7895 assert(pg->blocked_by.empty());
7896 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7897 pg->publish_stats_to_osd();
7898 }
7899
7900 void PG::RecoveryState::Down::exit()
7901 {
7902 context< RecoveryMachine >().log_exit(state_name, enter_time);
7903 PG *pg = context< RecoveryMachine >().pg;
7904
7905 pg->state_clear(PG_STATE_DOWN);
7906 utime_t dur = ceph_clock_now() - enter_time;
7907 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
7908
7909 pg->blocked_by.clear();
7910 pg->publish_stats_to_osd();
7911 }
7912
7913 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
7914 {
7915 q.f->open_object_section("state");
7916 q.f->dump_string("name", state_name);
7917 q.f->dump_stream("enter_time") << enter_time;
7918 q.f->dump_string("comment",
7919 "not enough up instances of this PG to go active");
7920 q.f->close_section();
7921 return forward_event();
7922 }
7923
7924 /*------Incomplete--------*/
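// Incomplete: peering could not find enough complete instances of this PG.
// We retry via GetLog when a new replica info arrives, or reset if a new
// osdmap lowers the pool's min_size.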
7925 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
7926 : my_base(ctx),
7927 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
7928 {
7929 context< RecoveryMachine >().log_enter(state_name);
7930 PG *pg = context< RecoveryMachine >().pg;
7931
7932 pg->state_clear(PG_STATE_PEERING);
7933 pg->state_set(PG_STATE_INCOMPLETE);
7934
7935 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7936 assert(pg->blocked_by.empty());
7937 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7938 pg->publish_stats_to_osd();
7939 }
7940
7941 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
7942 PG *pg = context< RecoveryMachine >().pg;
7943 int64_t poolnum = pg->info.pgid.pool();
7944
7945 // Reset if min_size became smaller than the previous value; the pg might now be able to go active
7946 if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
7947 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
7948 post_event(advmap);
7949 return transit< Reset >();
7950 }
7951
7952 return forward_event();
7953 }
7954
7955 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
7956 PG *pg = context< RecoveryMachine >().pg;
7957 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
7958 if (pg->proc_replica_info(
7959 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
7960 // We got something new, try again!
7961 return transit< GetLog >();
7962 } else {
7963 return discard_event();
7964 }
7965 }
7966
7967 boost::statechart::result PG::RecoveryState::Incomplete::react(
7968 const QueryState& q)
7969 {
7970 q.f->open_object_section("state");
7971 q.f->dump_string("name", state_name);
7972 q.f->dump_stream("enter_time") << enter_time;
7973 q.f->dump_string("comment", "not enough complete instances of this PG");
7974 q.f->close_section();
7975 return forward_event();
7976 }
7977
7978 void PG::RecoveryState::Incomplete::exit()
7979 {
7980 context< RecoveryMachine >().log_exit(state_name, enter_time);
7981 PG *pg = context< RecoveryMachine >().pg;
7982
7983 pg->state_clear(PG_STATE_INCOMPLETE);
7984 utime_t dur = ceph_clock_now() - enter_time;
7985 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
7986
7987 pg->blocked_by.clear();
7988 pg->publish_stats_to_osd();
7989 }
7990
7991 /*------GetMissing--------*/
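// Third peering phase: request log+missing from each acting/backfill peer
// whose missing set we cannot already infer.  Peers that are empty, will be
// fully backfilled, or are provably up to date get an implicitly empty
// missing set instead of a query.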
7992 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
7993 : my_base(ctx),
7994 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
7995 {
7996 context< RecoveryMachine >().log_enter(state_name);
7997
7998 PG *pg = context< RecoveryMachine >().pg;
7999 assert(!pg->actingbackfill.empty());
8000 eversion_t since;
8001 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8002 i != pg->actingbackfill.end();
8003 ++i) {
8004 if (*i == pg->get_primary()) continue;
8005 const pg_info_t& pi = pg->peer_info[*i];
8006
8007 if (pi.is_empty())
8008 continue; // no pg data, nothing divergent
8009
8010 if (pi.last_update < pg->pg_log.get_tail()) {
8011 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8012 pg->peer_missing[*i];
8013 continue;
8014 }
8015 if (pi.last_backfill == hobject_t()) {
8016 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8017 pg->peer_missing[*i];
8018 continue;
8019 }
8020
8021 if (pi.last_update == pi.last_complete && // peer has no missing
8022 pi.last_update == pg->info.last_update) { // peer is up to date
8023 // replica has no missing and identical log as us. no need to
8024 // pull anything.
8025 // FIXME: we can do better here. if last_update==last_complete we
8026 // can infer the rest!
8027 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8028 pg->peer_missing[*i];
8029 continue;
8030 }
8031
8032 // We pull the log from the peer's last_epoch_started to ensure we
8033 // get enough log to detect divergent updates.
8034 since.epoch = pi.last_epoch_started;
8035 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8036 if (pi.log_tail <= since) {
8037 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8038 context< RecoveryMachine >().send_query(
8039 *i,
8040 pg_query_t(
8041 pg_query_t::LOG,
8042 i->shard, pg->pg_whoami.shard,
8043 since, pg->info.history,
8044 pg->get_osdmap()->get_epoch()));
8045 } else {
8046 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8047 << " (want since " << since << " < log.tail "
8048 << pi.log_tail << ")" << dendl;
8049 context< RecoveryMachine >().send_query(
8050 *i, pg_query_t(
8051 pg_query_t::FULLLOG,
8052 i->shard, pg->pg_whoami.shard,
8053 pg->info.history, pg->get_osdmap()->get_epoch()));
8054 }
8055 peer_missing_requested.insert(*i);
8056 pg->blocked_by.insert(i->osd);
8057 }
8058
8059 if (peer_missing_requested.empty()) {
8060 if (pg->need_up_thru) {
8061 ldout(pg->cct, 10) << " still need up_thru update before going active"
8062 << dendl;
8063 post_event(NeedUpThru());
8064 return;
8065 }
8066
8067 // all good!
8068 post_event(Activate(pg->get_osdmap()->get_epoch()));
8069 } else {
8070 pg->publish_stats_to_osd();
8071 }
8072 }
8073
8074 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8075 {
8076 PG *pg = context< RecoveryMachine >().pg;
8077
8078 peer_missing_requested.erase(logevt.from);
8079 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8080
8081 if (peer_missing_requested.empty()) {
8082 if (pg->need_up_thru) {
8083 ldout(pg->cct, 10) << " still need up_thru update before going active"
8084 << dendl;
8085 post_event(NeedUpThru());
8086 } else {
8087 ldout(pg->cct, 10) << "Got last missing, don't need missing "
8088 << "posting Activate" << dendl;
8089 post_event(Activate(pg->get_osdmap()->get_epoch()));
8090 }
8091 }
8092 return discard_event();
8093 }
8094
8095 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8096 {
8097 PG *pg = context< RecoveryMachine >().pg;
8098 q.f->open_object_section("state");
8099 q.f->dump_string("name", state_name);
8100 q.f->dump_stream("enter_time") << enter_time;
8101
8102 q.f->open_array_section("peer_missing_requested");
8103 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8104 p != peer_missing_requested.end();
8105 ++p) {
8106 q.f->open_object_section("osd");
8107 q.f->dump_stream("osd") << *p;
8108 if (pg->peer_missing.count(*p)) {
8109 q.f->open_object_section("got_missing");
8110 pg->peer_missing[*p].dump(q.f);
8111 q.f->close_section();
8112 }
8113 q.f->close_section();
8114 }
8115 q.f->close_section();
8116
8117 q.f->close_section();
8118 return forward_event();
8119 }
8120
8121 void PG::RecoveryState::GetMissing::exit()
8122 {
8123 context< RecoveryMachine >().log_exit(state_name, enter_time);
8124 PG *pg = context< RecoveryMachine >().pg;
8125 utime_t dur = ceph_clock_now() - enter_time;
8126 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8127 pg->blocked_by.clear();
8128 pg->publish_stats_to_osd();
8129 }
8130
8131 /*------WaitUpThru--------*/
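// WaitUpThru: peering is otherwise done, but we cannot activate until the
// osdmap reflects a new up_thru for this OSD (see need_up_thru).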
8132 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8133 : my_base(ctx),
8134 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8135 {
8136 context< RecoveryMachine >().log_enter(state_name);
8137 }
8138
8139 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8140 {
8141 PG *pg = context< RecoveryMachine >().pg;
8142 if (!pg->need_up_thru) {
8143 post_event(Activate(pg->get_osdmap()->get_epoch()));
8144 }
8145 return forward_event();
8146 }
8147
8148 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8149 {
8150 PG *pg = context< RecoveryMachine >().pg;
8151 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8152 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8153 pg->peer_info[logevt.from] = logevt.msg->info;
8154 return discard_event();
8155 }
8156
8157 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8158 {
8159 q.f->open_object_section("state");
8160 q.f->dump_string("name", state_name);
8161 q.f->dump_stream("enter_time") << enter_time;
8162 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8163 q.f->close_section();
8164 return forward_event();
8165 }
8166
8167 void PG::RecoveryState::WaitUpThru::exit()
8168 {
8169 context< RecoveryMachine >().log_exit(state_name, enter_time);
8170 PG *pg = context< RecoveryMachine >().pg;
8171 utime_t dur = ceph_clock_now() - enter_time;
8172 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8173 }
8174
8175 /*----RecoveryState::RecoveryMachine Methods-----*/
8176 #undef dout_prefix
8177 #define dout_prefix *_dout << pg->gen_prefix()
8178
8179 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8180 {
8181 PG *pg = context< RecoveryMachine >().pg;
8182 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8183 pg->osd->pg_recovery_stats.log_enter(state_name);
8184 }
8185
8186 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8187 {
8188 utime_t dur = ceph_clock_now() - enter_time;
8189 PG *pg = context< RecoveryMachine >().pg;
8190 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8191 pg->osd->pg_recovery_stats.log_exit(state_name, dur,
8192 event_count, event_time);
8193 event_count = 0;
8194 event_time = utime_t();
8195 }
8196
8197
8198 /*---------------------------------------------------*/
8199 #undef dout_prefix
8200 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8201
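// RecoveryCtx plumbing: start_handle()/end_handle() bracket each delivered
// event.  Between begin_block_outgoing() and end_block_outgoing(), outgoing
// messages are staged in messages_pending_flush; end_block_outgoing() merges
// them back into the original ctx, while clear_blocked_outgoing() drops them.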
8202 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8203 assert(!rctx);
8204 assert(!orig_ctx);
8205 orig_ctx = new_ctx;
8206 if (new_ctx) {
8207 if (messages_pending_flush) {
8208 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8209 } else {
8210 rctx = *new_ctx;
8211 }
8212 rctx->start_time = ceph_clock_now();
8213 }
8214 }
8215
8216 void PG::RecoveryState::begin_block_outgoing() {
8217 assert(!messages_pending_flush);
8218 assert(orig_ctx);
8219 assert(rctx);
8220 messages_pending_flush = BufferedRecoveryMessages();
8221 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8222 }
8223
8224 void PG::RecoveryState::clear_blocked_outgoing() {
8225 assert(orig_ctx);
8226 assert(rctx);
8227 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8228 }
8229
8230 void PG::RecoveryState::end_block_outgoing() {
8231 assert(messages_pending_flush);
8232 assert(orig_ctx);
8233 assert(rctx);
8234
8235 rctx = RecoveryCtx(*orig_ctx);
8236 rctx->accept_buffered_messages(*messages_pending_flush);
8237 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8238 }
8239
8240 void PG::RecoveryState::end_handle() {
8241 if (rctx) {
8242 utime_t dur = ceph_clock_now() - rctx->start_time;
8243 machine.event_time += dur;
8244 }
8245
8246 machine.event_count++;
8247 rctx = boost::optional<RecoveryCtx>();
8248 orig_ctx = NULL;
8249 }
8250
8251 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8252 {
8253 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8254 << " " << bi.objects.size() << " objects";
8255 if (!bi.objects.empty())
8256 out << " " << bi.objects;
8257 out << ")";
8258 return out;
8259 }
8260
8261 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8262 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8263
8264 #ifdef PG_DEBUG_REFS
8265 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8266 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8267 #endif