1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
21 #include "common/errno.h"
22 #include "common/config.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
59 #include "common/BackTrace.h"
60 #include "common/EventTrace.h"
63 #define TRACEPOINT_DEFINE
64 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
65 #include "tracing/pg.h"
66 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #undef TRACEPOINT_DEFINE
69 #define tracepoint(...)
74 #define dout_context cct
75 #define dout_subsys ceph_subsys_osd
77 #define dout_prefix _prefix(_dout, this)
79 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
// Keys stored in the pgmeta object's omap.  Each is prefixed with '_' so
// that PGLog::read_log_and_missing() can easily skip them when scanning
// the omap for log entries.
const std::string infover_key("_infover");
const std::string info_key("_info");
const std::string biginfo_key("_biginfo");
const std::string epoch_key("_epoch");
const std::string fastinfo_key("_fastinfo");
88 static ostream
& _prefix(std::ostream
*_dout
, T
*t
)
90 return *_dout
<< t
->gen_prefix();
// Register PG::CephPeeringEvt allocations with the 'osd' mempool under the
// stats name 'pg_peering_evt', so peering-event memory use is accounted.
93 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt
, pg_peering_evt
, osd
);
95 void PGStateHistory::enter(PG
* pg
, const utime_t entime
, const char* state
)
97 // Ignore trimming state machine for now
98 if (::strstr(state
, "Trimming") != NULL
) {
100 } else if (pi
!= nullptr) {
101 pi
->enter_state(entime
, state
);
103 // Store current state since we can't reliably take the PG lock here
104 if ( tmppi
== nullptr) {
105 tmppi
= std::unique_ptr
<PGStateInstance
>(new PGStateInstance
);
109 tmppi
->enter_state(entime
, state
);
// Record exit from a PG state-machine state: move the staged instance
// ('tmppi') into the history buffer, stamp it with the current epoch and
// exit time.  Trimming states, and exits during PG destruction, are ignored.
// NOTE(review): several interior lines of this function are absent from
// this extract (gaps in the original numbering); the fragments below are
// preserved verbatim.
113 void PGStateHistory::exit(const char* state
) {
114 // Ignore trimming state machine for now
115 // Do nothing if PG is being destroyed!
116 if (::strstr(state
, "Trimming") != NULL
|| pg_in_destructor
) {
// Track whether we had to take the PG lock ourselves so it can be
// released on the matching (not visible here) path.
119 bool ilocked
= false;
120 if(!thispg
->is_locked()) {
// Transfer ownership of the staged instance into the history buffer and
// point the active-instance pointer 'pi' at it.
125 buffer
.push_back(std::unique_ptr
<PGStateInstance
>(tmppi
.release()));
126 pi
= buffer
.back().get();
127 pi
->setepoch(thispg
->get_osdmap()->get_epoch());
// Stamp the exit time; "Reset" marks an interval boundary below.
130 pi
->exit_state(ceph_clock_now());
131 if (::strcmp(state
, "Reset") == 0) {
140 void PGStateHistory::dump(Formatter
* f
) const {
141 f
->open_array_section("history");
142 for (auto pi
= buffer
.begin(); pi
!= buffer
.end(); ++pi
) {
143 f
->open_object_section("states");
144 f
->dump_stream("epoch") << (*pi
)->this_epoch
;
145 for (auto she
: (*pi
)->state_history
) {
146 f
->dump_string("state", std::get
<2>(she
));
147 f
->dump_stream("enter") << std::get
<0>(she
);
148 f
->dump_stream("exit") << std::get
<1>(she
);
// Take a reference on this PG, attributed to 'tag' for ref-leak debugging.
// NOTE(review): the interior of this function (the actual ref increment and
// the per-tag count update) is missing from this extract — confirm against
// the full source.
155 void PG::get(const char* tag
)
// Serialize updates to the debug ref-tracking state.
159 Mutex::Locker
l(_ref_id_lock
);
// Drop a reference on this PG previously taken with get(tag), updating the
// per-tag debug accounting.  NOTE(review): the trailing part of this
// function (the actual ref decrement / destruction path) is missing from
// this extract.
164 void PG::put(const char* tag
)
// Serialize updates to the debug ref-tracking state.
168 Mutex::Locker
l(_ref_id_lock
);
// Every put() must pair with a get() under the same tag, so an entry
// must exist.
169 auto tag_counts_entry
= _tag_counts
.find(tag
);
170 assert(tag_counts_entry
!= _tag_counts
.end());
171 --tag_counts_entry
->second
;
// Drop the map entry once the last reference under this tag is gone.
172 if (tag_counts_entry
->second
== 0) {
173 _tag_counts
.erase(tag_counts_entry
);
182 uint64_t PG::get_with_id()
185 Mutex::Locker
l(_ref_id_lock
);
186 uint64_t id
= ++_ref_id
;
190 dout(20) << __func__
<< ": " << info
.pgid
<< " got id " << id
<< " (new) ref==" << ref
<< dendl
;
191 assert(!_live_ids
.count(id
));
192 _live_ids
.insert(make_pair(id
, ss
.str()));
196 void PG::put_with_id(uint64_t id
)
198 dout(20) << __func__
<< ": " << info
.pgid
<< " put id " << id
<< " (current) ref==" << ref
<< dendl
;
200 Mutex::Locker
l(_ref_id_lock
);
201 assert(_live_ids
.count(id
));
208 void PG::dump_live_ids()
210 Mutex::Locker
l(_ref_id_lock
);
211 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live ids:" << dendl
;
212 for (map
<uint64_t, string
>::iterator i
= _live_ids
.begin();
213 i
!= _live_ids
.end();
215 dout(0) << "\t\tid: " << *i
<< dendl
;
217 dout(0) << "\t" << __func__
<< ": " << info
.pgid
<< " live tags:" << dendl
;
218 for (map
<string
, uint64_t>::iterator i
= _tag_counts
.begin();
219 i
!= _tag_counts
.end();
221 dout(0) << "\t\tid: " << *i
<< dendl
;
226 void PGPool::update(OSDMapRef map
)
228 const pg_pool_t
*pi
= map
->get_pg_pool(id
);
232 name
= map
->get_pool_name(id
);
233 bool updated
= false;
234 if ((map
->get_epoch() != cached_epoch
+ 1) ||
235 (pi
->get_snap_epoch() == map
->get_epoch())) {
237 pi
->build_removed_snaps(newly_removed_snaps
);
238 interval_set
<snapid_t
> intersection
;
239 intersection
.intersection_of(newly_removed_snaps
, cached_removed_snaps
);
240 if (intersection
== cached_removed_snaps
) {
241 newly_removed_snaps
.subtract(cached_removed_snaps
);
242 cached_removed_snaps
.union_of(newly_removed_snaps
);
244 lgeneric_subdout(cct
, osd
, 0) << __func__
245 << " cached_removed_snaps shrank from " << cached_removed_snaps
246 << " to " << newly_removed_snaps
<< dendl
;
247 cached_removed_snaps
= newly_removed_snaps
;
248 newly_removed_snaps
.clear();
250 snapc
= pi
->get_snap_context();
252 /* 1) map->get_epoch() == cached_epoch + 1 &&
253 * 2) pi->get_snap_epoch() != map->get_epoch()
255 * From the if branch, 1 && 2 must be true. From 2, we know that
256 * this map didn't change the set of removed snaps. From 1, we
257 * know that our cached_removed_snaps matches the previous map.
258 * Thus, from 1 && 2, cached_removed snaps matches the current
259 * set of removed snaps and all we have to do is clear
260 * newly_removed_snaps.
262 newly_removed_snaps
.clear();
264 cached_epoch
= map
->get_epoch();
265 lgeneric_subdout(cct
, osd
, 20)
266 << "PGPool::update cached_removed_snaps "
267 << cached_removed_snaps
268 << " newly_removed_snaps "
269 << newly_removed_snaps
270 << " snapc " << snapc
271 << (updated
? " (updated)":" (no change)")
275 PG::PG(OSDService
*o
, OSDMapRef curmap
,
276 const PGPool
&_pool
, spg_t p
) :
279 osdriver(osd
->store
, coll_t(), OSD::make_snapmapper_oid()),
284 p
.get_split_bits(curmap
->get_pg_num(_pool
.id
)),
287 osdmap_ref(curmap
), last_persisted_osdmap_ref(curmap
), pool(_pool
),
290 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
293 trace_endpoint("0.0.0.0", 0, "PG"),
294 dirty_info(false), dirty_big_info(false),
297 coll(p
), pg_log(cct
),
298 pgmeta_oid(p
.make_pgmeta_oid()),
301 curmap
->get_pools().at(p
.pgid
.pool()).ec_pool(),
303 stat_queue_item(this),
305 recovery_queued(false),
306 recovery_ops_active(0),
310 pg_whoami(osd
->whoami
, p
.shard
),
312 last_peering_reset(0),
313 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
314 backfill_reserved(false),
315 backfill_reserving(false),
316 flushes_in_progress(0),
317 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
318 pg_stats_publish_valid(false),
319 osr(osd
->osr_registry
.lookup_or_create(p
, (stringify(p
)))),
320 finish_sync_event(NULL
),
321 backoff_lock("PG::backoff_lock"),
322 scrub_after_recovery(false),
324 recovery_state(this),
326 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
327 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
328 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT
),
332 osd
->add_pgid(p
, this);
335 std::stringstream ss
;
336 ss
<< "PG " << info
.pgid
;
337 trace_endpoint
.copy_name(ss
.str());
344 pgstate_history
.set_pg_in_destructor();
346 osd
->remove_pgid(info
.pgid
, this);
350 void PG::lock_suspend_timeout(ThreadPool::TPHandle
&handle
)
352 handle
.suspend_tp_timeout();
354 handle
.reset_tp_timeout();
// Acquire the PG's primary mutex.  'no_lockdep' is forwarded to the mutex
// so callers with known-safe ordering can bypass lockdep checking.
357 void PG::lock(bool no_lockdep
) const
359 _lock
.Lock(no_lockdep
);
// Dirty state must have been persisted before the lock was last dropped;
360 // if we have unrecorded dirty state with the lock dropped, there is a bug
362 assert(!dirty_big_info
);
364 dout(30) << "lock" << dendl
;
367 std::string
PG::gen_prefix() const
370 OSDMapRef mapref
= osdmap_ref
;
371 if (_lock
.is_locked_by_me()) {
372 out
<< "osd." << osd
->whoami
373 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
374 << " " << *this << " ";
376 out
<< "osd." << osd
->whoami
377 << " pg_epoch: " << (mapref
? mapref
->get_epoch():0)
378 << " pg[" << info
.pgid
<< "(unlocked)] ";
383 /********* PG **********/
385 void PG::proc_master_log(
386 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
,
387 pg_log_t
&olog
, pg_missing_t
& omissing
, pg_shard_t from
)
389 dout(10) << "proc_master_log for osd." << from
<< ": "
390 << olog
<< " " << omissing
<< dendl
;
391 assert(!is_peered() && is_primary());
393 // merge log into our own log to build master log. no need to
394 // make any adjustments to their missing map; we are taking their
395 // log to be authoritative (i.e., their entries are by definitely
397 merge_log(t
, oinfo
, olog
, from
);
398 peer_info
[from
] = oinfo
;
399 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
400 might_have_unfound
.insert(from
);
402 // See doc/dev/osd_internals/last_epoch_started
403 if (oinfo
.last_epoch_started
> info
.last_epoch_started
) {
404 info
.last_epoch_started
= oinfo
.last_epoch_started
;
407 if (oinfo
.last_interval_started
> info
.last_interval_started
) {
408 info
.last_interval_started
= oinfo
.last_interval_started
;
411 update_history(oinfo
.history
);
412 assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
413 info
.last_epoch_started
>= info
.history
.last_epoch_started
);
415 peer_missing
[from
].claim(omissing
);
418 void PG::proc_replica_log(
420 const pg_log_t
&olog
,
421 pg_missing_t
& omissing
,
424 dout(10) << "proc_replica_log for osd." << from
<< ": "
425 << oinfo
<< " " << olog
<< " " << omissing
<< dendl
;
427 pg_log
.proc_replica_log(oinfo
, olog
, omissing
, from
);
429 peer_info
[from
] = oinfo
;
430 dout(10) << " peer osd." << from
<< " now " << oinfo
<< " " << omissing
<< dendl
;
431 might_have_unfound
.insert(from
);
433 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
=
434 omissing
.get_items().begin();
435 i
!= omissing
.get_items().end();
437 dout(20) << " after missing " << i
->first
<< " need " << i
->second
.need
438 << " have " << i
->second
.have
<< dendl
;
440 peer_missing
[from
].claim(omissing
);
443 bool PG::proc_replica_info(
444 pg_shard_t from
, const pg_info_t
&oinfo
, epoch_t send_epoch
)
446 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.find(from
);
447 if (p
!= peer_info
.end() && p
->second
.last_update
== oinfo
.last_update
) {
448 dout(10) << " got dup osd." << from
<< " info " << oinfo
<< ", identical to ours" << dendl
;
452 if (!get_osdmap()->has_been_up_since(from
.osd
, send_epoch
)) {
453 dout(10) << " got info " << oinfo
<< " from down osd." << from
454 << " discarding" << dendl
;
458 dout(10) << " got osd." << from
<< " " << oinfo
<< dendl
;
459 assert(is_primary());
460 peer_info
[from
] = oinfo
;
461 might_have_unfound
.insert(from
);
463 update_history(oinfo
.history
);
466 if (!is_up(from
) && !is_acting(from
)) {
467 dout(10) << " osd." << from
<< " has stray content: " << oinfo
<< dendl
;
468 stray_set
.insert(from
);
474 // was this a new info? if so, update peers!
475 if (p
== peer_info
.end())
476 update_heartbeat_peers();
481 void PG::remove_snap_mapped_object(
482 ObjectStore::Transaction
&t
, const hobject_t
&soid
)
486 ghobject_t(soid
, ghobject_t::NO_GEN
, pg_whoami
.shard
));
487 clear_object_snap_mapping(&t
, soid
);
490 void PG::clear_object_snap_mapping(
491 ObjectStore::Transaction
*t
, const hobject_t
&soid
)
493 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
494 if (soid
.snap
< CEPH_MAXSNAP
) {
495 int r
= snap_mapper
.remove_oid(
498 if (!(r
== 0 || r
== -ENOENT
)) {
499 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
505 void PG::update_object_snap_mapping(
506 ObjectStore::Transaction
*t
, const hobject_t
&soid
, const set
<snapid_t
> &snaps
)
508 OSDriver::OSTransaction
_t(osdriver
.get_transaction(t
));
509 assert(soid
.snap
< CEPH_MAXSNAP
);
510 int r
= snap_mapper
.remove_oid(
513 if (!(r
== 0 || r
== -ENOENT
)) {
514 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
) << dendl
;
524 ObjectStore::Transaction
& t
, pg_info_t
&oinfo
, pg_log_t
&olog
, pg_shard_t from
)
526 PGLogEntryHandler rollbacker
{this, &t
};
528 oinfo
, olog
, from
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
531 void PG::rewind_divergent_log(ObjectStore::Transaction
& t
, eversion_t newhead
)
533 PGLogEntryHandler rollbacker
{this, &t
};
534 pg_log
.rewind_divergent_log(
535 newhead
, info
, &rollbacker
, dirty_info
, dirty_big_info
);
539 * Process information from a replica to determine if it could have any
540 * objects that i need.
542 * TODO: if the missing set becomes very large, this could get expensive.
543 * Instead, we probably want to just iterate over our unfound set.
545 bool PG::search_for_missing(
546 const pg_info_t
&oinfo
, const pg_missing_t
&omissing
,
550 uint64_t num_unfound_before
= missing_loc
.num_unfound();
551 bool found_missing
= missing_loc
.add_source_info(
552 from
, oinfo
, omissing
, ctx
->handle
);
553 if (found_missing
&& num_unfound_before
!= missing_loc
.num_unfound())
554 publish_stats_to_osd();
556 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, NULL
) &
557 CEPH_FEATURE_OSD_ERASURE_CODES
)) {
558 pg_info_t
tinfo(oinfo
);
559 tinfo
.pgid
.shard
= pg_whoami
.shard
;
560 (*(ctx
->info_map
))[from
.osd
].push_back(
563 from
.shard
, pg_whoami
.shard
,
564 get_osdmap()->get_epoch(),
565 get_osdmap()->get_epoch(),
569 return found_missing
;
572 bool PG::MissingLoc::readable_with_acting(
573 const hobject_t
&hoid
,
574 const set
<pg_shard_t
> &acting
) const {
575 if (!needs_recovery(hoid
)) return true;
576 auto missing_loc_entry
= missing_loc
.find(hoid
);
577 if (missing_loc_entry
== missing_loc
.end()) return false;
578 const set
<pg_shard_t
> &locs
= missing_loc_entry
->second
;
579 ldout(pg
->cct
, 10) << __func__
<< ": locs:" << locs
<< dendl
;
580 set
<pg_shard_t
> have_acting
;
581 for (set
<pg_shard_t
>::const_iterator i
= locs
.begin();
584 if (acting
.count(*i
))
585 have_acting
.insert(*i
);
587 return (*is_readable
)(have_acting
);
590 void PG::MissingLoc::add_batch_sources_info(
591 const set
<pg_shard_t
> &sources
, ThreadPool::TPHandle
* handle
)
593 ldout(pg
->cct
, 10) << __func__
<< ": adding sources in batch "
594 << sources
.size() << dendl
;
596 for (map
<hobject_t
, pg_missing_item
>::const_iterator i
= needs_recovery_map
.begin();
597 i
!= needs_recovery_map
.end();
599 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
600 handle
->reset_tp_timeout();
603 missing_loc
[i
->first
].insert(sources
.begin(), sources
.end());
604 missing_loc_sources
.insert(sources
.begin(), sources
.end());
608 bool PG::MissingLoc::add_source_info(
610 const pg_info_t
&oinfo
,
611 const pg_missing_t
&omissing
,
612 ThreadPool::TPHandle
* handle
)
614 bool found_missing
= false;
617 for (map
<hobject_t
,pg_missing_item
>::const_iterator p
= needs_recovery_map
.begin();
618 p
!= needs_recovery_map
.end();
620 const hobject_t
&soid(p
->first
);
621 eversion_t need
= p
->second
.need
;
622 if (handle
&& ++loop
>= pg
->cct
->_conf
->osd_loop_before_reset_tphandle
) {
623 handle
->reset_tp_timeout();
626 if (oinfo
.last_update
< need
) {
627 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
628 << " also missing on osd." << fromosd
629 << " (last_update " << oinfo
.last_update
630 << " < needed " << need
<< ")" << dendl
;
633 if (!oinfo
.last_backfill
.is_max() &&
634 !oinfo
.last_backfill_bitwise
) {
635 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
636 << " also missing on osd." << fromosd
637 << " (last_backfill " << oinfo
.last_backfill
638 << " but with wrong sort order)"
642 if (p
->first
>= oinfo
.last_backfill
) {
643 // FIXME: this is _probably_ true, although it could conceivably
644 // be in the undefined region! Hmm!
645 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
646 << " also missing on osd." << fromosd
647 << " (past last_backfill " << oinfo
.last_backfill
651 if (oinfo
.last_complete
< need
) {
652 if (omissing
.is_missing(soid
)) {
653 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
654 << " also missing on osd." << fromosd
<< dendl
;
659 ldout(pg
->cct
, 10) << "search_for_missing " << soid
<< " " << need
660 << " is on osd." << fromosd
<< dendl
;
662 missing_loc
[soid
].insert(fromosd
);
663 missing_loc_sources
.insert(fromosd
);
664 found_missing
= true;
667 ldout(pg
->cct
, 20) << "needs_recovery_map missing " << needs_recovery_map
669 return found_missing
;
672 void PG::discover_all_missing(map
<int, map
<spg_t
,pg_query_t
> > &query_map
)
674 auto &missing
= pg_log
.get_missing();
675 uint64_t unfound
= get_num_unfound();
678 dout(10) << __func__
<< " "
679 << missing
.num_missing() << " missing, "
680 << unfound
<< " unfound"
683 std::set
<pg_shard_t
>::const_iterator m
= might_have_unfound
.begin();
684 std::set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
685 for (; m
!= mend
; ++m
) {
688 if (!get_osdmap()->is_up(peer
.osd
)) {
689 dout(20) << __func__
<< " skipping down osd." << peer
<< dendl
;
693 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(peer
);
694 if (iter
!= peer_info
.end() &&
695 (iter
->second
.is_empty() || iter
->second
.dne())) {
696 // ignore empty peers
700 // If we've requested any of this stuff, the pg_missing_t information
701 // should be on its way.
702 // TODO: coalsce requested_* into a single data structure
703 if (peer_missing
.find(peer
) != peer_missing
.end()) {
704 dout(20) << __func__
<< ": osd." << peer
705 << ": we already have pg_missing_t" << dendl
;
708 if (peer_log_requested
.find(peer
) != peer_log_requested
.end()) {
709 dout(20) << __func__
<< ": osd." << peer
710 << ": in peer_log_requested" << dendl
;
713 if (peer_missing_requested
.find(peer
) != peer_missing_requested
.end()) {
714 dout(20) << __func__
<< ": osd." << peer
715 << ": in peer_missing_requested" << dendl
;
720 dout(10) << __func__
<< ": osd." << peer
<< ": requesting pg_missing_t"
722 peer_missing_requested
.insert(peer
);
723 query_map
[peer
.osd
][spg_t(info
.pgid
.pgid
, peer
.shard
)] =
726 peer
.shard
, pg_whoami
.shard
,
727 info
.history
, get_osdmap()->get_epoch());
731 /******* PG ***********/
732 bool PG::needs_recovery() const
734 assert(is_primary());
736 auto &missing
= pg_log
.get_missing();
738 if (missing
.num_missing()) {
739 dout(10) << __func__
<< " primary has " << missing
.num_missing()
740 << " missing" << dendl
;
744 assert(!actingbackfill
.empty());
745 set
<pg_shard_t
>::const_iterator end
= actingbackfill
.end();
746 set
<pg_shard_t
>::const_iterator a
= actingbackfill
.begin();
747 for (; a
!= end
; ++a
) {
748 if (*a
== get_primary()) continue;
749 pg_shard_t peer
= *a
;
750 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
= peer_missing
.find(peer
);
751 if (pm
== peer_missing
.end()) {
752 dout(10) << __func__
<< " osd." << peer
<< " doesn't have missing set"
756 if (pm
->second
.num_missing()) {
757 dout(10) << __func__
<< " osd." << peer
<< " has "
758 << pm
->second
.num_missing() << " missing" << dendl
;
763 dout(10) << __func__
<< " is recovered" << dendl
;
767 bool PG::needs_backfill() const
769 assert(is_primary());
771 // We can assume that only possible osds that need backfill
772 // are on the backfill_targets vector nodes.
773 set
<pg_shard_t
>::const_iterator end
= backfill_targets
.end();
774 set
<pg_shard_t
>::const_iterator a
= backfill_targets
.begin();
775 for (; a
!= end
; ++a
) {
776 pg_shard_t peer
= *a
;
777 map
<pg_shard_t
, pg_info_t
>::const_iterator pi
= peer_info
.find(peer
);
778 if (!pi
->second
.last_backfill
.is_max()) {
779 dout(10) << __func__
<< " osd." << peer
<< " has last_backfill " << pi
->second
.last_backfill
<< dendl
;
784 dout(10) << __func__
<< " does not need backfill" << dendl
;
789 void PG::check_past_interval_bounds() const
791 auto rpib
= get_required_past_interval_bounds(
793 osd
->get_superblock().oldest_map
);
794 if (rpib
.first
>= rpib
.second
) {
795 if (!past_intervals
.empty()) {
796 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
797 << " empty [" << rpib
<< ") but past_intervals is not: "
799 derr
<< info
.pgid
<< " required past_interval bounds are"
800 << " empty [" << rpib
<< ") but past_intervals is not: "
801 << past_intervals
<< dendl
;
804 if (past_intervals
.empty()) {
805 osd
->clog
->error() << info
.pgid
<< " required past_interval bounds are"
806 << " not empty [" << rpib
<< ") but past_intervals "
807 << past_intervals
<< " is empty";
808 derr
<< info
.pgid
<< " required past_interval bounds are"
809 << " not empty [" << rpib
<< ") but past_intervals "
810 << past_intervals
<< " is empty" << dendl
;
811 assert(!past_intervals
.empty());
814 auto apib
= past_intervals
.get_bounds();
815 if (apib
.first
> rpib
.first
) {
816 osd
->clog
->error() << info
.pgid
<< " past_intervals [" << apib
817 << ") start interval does not contain the required"
818 << " bound [" << rpib
<< ") start";
819 derr
<< info
.pgid
<< " past_intervals [" << apib
820 << ") start interval does not contain the required"
821 << " bound [" << rpib
<< ") start" << dendl
;
822 assert(0 == "past_interval start interval mismatch");
824 if (apib
.second
!= rpib
.second
) {
825 osd
->clog
->error() << info
.pgid
<< " past_interal bound [" << apib
826 << ") end does not match required [" << rpib
828 derr
<< info
.pgid
<< " past_interal bound [" << apib
829 << ") end does not match required [" << rpib
831 assert(0 == "past_interval end mismatch");
836 bool PG::adjust_need_up_thru(const OSDMapRef osdmap
)
838 epoch_t up_thru
= osdmap
->get_up_thru(osd
->whoami
);
840 up_thru
>= info
.history
.same_interval_since
) {
841 dout(10) << "adjust_need_up_thru now " << up_thru
<< ", need_up_thru now false" << dendl
;
842 need_up_thru
= false;
848 void PG::remove_down_peer_info(const OSDMapRef osdmap
)
850 // Remove any downed osds from peer_info
851 bool removed
= false;
852 map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
853 while (p
!= peer_info
.end()) {
854 if (!osdmap
->is_up(p
->first
.osd
)) {
855 dout(10) << " dropping down osd." << p
->first
<< " info " << p
->second
<< dendl
;
856 peer_missing
.erase(p
->first
);
857 peer_log_requested
.erase(p
->first
);
858 peer_missing_requested
.erase(p
->first
);
859 peer_info
.erase(p
++);
865 // if we removed anyone, update peers (which include peer_info)
867 update_heartbeat_peers();
868 check_recovery_sources(osdmap
);
872 * Returns true unless there is a non-lost OSD in might_have_unfound.
874 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap
) const
876 assert(is_primary());
878 set
<pg_shard_t
>::const_iterator peer
= might_have_unfound
.begin();
879 set
<pg_shard_t
>::const_iterator mend
= might_have_unfound
.end();
880 for (; peer
!= mend
; ++peer
) {
881 if (peer_missing
.count(*peer
))
883 map
<pg_shard_t
, pg_info_t
>::const_iterator iter
= peer_info
.find(*peer
);
884 if (iter
!= peer_info
.end() &&
885 (iter
->second
.is_empty() || iter
->second
.dne()))
887 if (!osdmap
->exists(peer
->osd
))
889 const osd_info_t
&osd_info(osdmap
->get_info(peer
->osd
));
890 if (osd_info
.lost_at
<= osd_info
.up_from
) {
891 // If there is even one OSD in might_have_unfound that isn't lost, we
892 // still might retrieve our unfound.
896 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
897 << " have been queried or are marked lost" << dendl
;
901 PastIntervals::PriorSet
PG::build_prior()
905 for (map
<pg_shard_t
,pg_info_t
>::iterator it
= peer_info
.begin();
906 it
!= peer_info
.end();
908 assert(info
.history
.last_epoch_started
>= it
->second
.history
.last_epoch_started
);
912 const OSDMap
&osdmap
= *get_osdmap();
913 PastIntervals::PriorSet prior
= past_intervals
.get_prior_set(
915 info
.history
.last_epoch_started
,
916 get_pgbackend()->get_is_recoverable_predicate(),
917 [&](epoch_t start
, int osd
, epoch_t
*lost_at
) {
918 const osd_info_t
*pinfo
= 0;
919 if (osdmap
.exists(osd
)) {
920 pinfo
= &osdmap
.get_info(osd
);
922 *lost_at
= pinfo
->lost_at
;
925 if (osdmap
.is_up(osd
)) {
926 return PastIntervals::UP
;
928 return PastIntervals::DNE
;
929 } else if (pinfo
->lost_at
> start
) {
930 return PastIntervals::LOST
;
932 return PastIntervals::DOWN
;
940 state_set(PG_STATE_DOWN
);
943 if (get_osdmap()->get_up_thru(osd
->whoami
) < info
.history
.same_interval_since
) {
944 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
945 << " < same_since " << info
.history
.same_interval_since
946 << ", must notify monitor" << dendl
;
949 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd
->whoami
)
950 << " >= same_since " << info
.history
.same_interval_since
951 << ", all is well" << dendl
;
952 need_up_thru
= false;
954 set_probe_targets(prior
.probe
);
958 void PG::clear_primary_state()
960 dout(10) << "clear_primary_state" << dendl
;
962 // clear peering state
964 peer_log_requested
.clear();
965 peer_missing_requested
.clear();
967 peer_missing
.clear();
968 need_up_thru
= false;
969 peer_last_complete_ondisk
.clear();
970 peer_activated
.clear();
971 min_last_complete_ondisk
= eversion_t();
972 pg_trim_to
= eversion_t();
973 might_have_unfound
.clear();
974 projected_log
= PGLog::IndexedLog();
976 last_update_ondisk
= eversion_t();
980 finish_sync_event
= 0; // so that _finish_recovery doesn't go off in another thread
984 release_pg_backoffs();
986 pg_log
.reset_recovery_pointers();
988 scrubber
.reserved_peers
.clear();
989 scrub_after_recovery
= false;
994 PG::Scrubber::Scrubber()
995 : reserved(false), reserve_failed(false),
998 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
999 must_scrub(false), must_deep_scrub(false), must_repair(false),
1001 num_digest_updates_pending(0),
1007 PG::Scrubber::~Scrubber() {}
1012 * Returns an iterator to the best info in infos sorted by:
1013 * 1) Prefer newer last_update
1014 * 2) Prefer longer tail if it brings another info into contiguity
1015 * 3) Prefer current primary
1017 map
<pg_shard_t
, pg_info_t
>::const_iterator
PG::find_best_info(
1018 const map
<pg_shard_t
, pg_info_t
> &infos
,
1019 bool restrict_to_up_acting
,
1020 bool *history_les_bound
) const
1022 assert(history_les_bound
);
1023 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1024 * to make changes to this process. Also, make sure to update it
1025 * when you find bugs! */
1026 eversion_t min_last_update_acceptable
= eversion_t::max();
1027 epoch_t max_last_epoch_started_found
= 0;
1028 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1031 if (!cct
->_conf
->osd_find_best_info_ignore_history_les
&&
1032 max_last_epoch_started_found
< i
->second
.history
.last_epoch_started
) {
1033 *history_les_bound
= true;
1034 max_last_epoch_started_found
= i
->second
.history
.last_epoch_started
;
1036 if (!i
->second
.is_incomplete() &&
1037 max_last_epoch_started_found
< i
->second
.last_epoch_started
) {
1038 max_last_epoch_started_found
= i
->second
.last_epoch_started
;
1041 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= infos
.begin();
1044 if (max_last_epoch_started_found
<= i
->second
.last_epoch_started
) {
1045 if (min_last_update_acceptable
> i
->second
.last_update
)
1046 min_last_update_acceptable
= i
->second
.last_update
;
1049 if (min_last_update_acceptable
== eversion_t::max())
1052 map
<pg_shard_t
, pg_info_t
>::const_iterator best
= infos
.end();
1053 // find osd with newest last_update (oldest for ec_pool).
1054 // if there are multiples, prefer
1055 // - a longer tail, if it brings another peer into log contiguity
1056 // - the current primary
1057 for (map
<pg_shard_t
, pg_info_t
>::const_iterator p
= infos
.begin();
1060 if (restrict_to_up_acting
&& !is_up(p
->first
) &&
1061 !is_acting(p
->first
))
1063 // Only consider peers with last_update >= min_last_update_acceptable
1064 if (p
->second
.last_update
< min_last_update_acceptable
)
1066 // Disqualify anyone with a too old last_epoch_started
1067 if (p
->second
.last_epoch_started
< max_last_epoch_started_found
)
1069 // Disqualify anyone who is incomplete (not fully backfilled)
1070 if (p
->second
.is_incomplete())
1072 if (best
== infos
.end()) {
1076 // Prefer newer last_update
1077 if (pool
.info
.require_rollback()) {
1078 if (p
->second
.last_update
> best
->second
.last_update
)
1080 if (p
->second
.last_update
< best
->second
.last_update
) {
1085 if (p
->second
.last_update
< best
->second
.last_update
)
1087 if (p
->second
.last_update
> best
->second
.last_update
) {
1093 // Prefer longer tail
1094 if (p
->second
.log_tail
> best
->second
.log_tail
) {
1096 } else if (p
->second
.log_tail
< best
->second
.log_tail
) {
1101 // prefer current primary (usually the caller), all things being equal
1102 if (p
->first
== pg_whoami
) {
1103 dout(10) << "calc_acting prefer osd." << p
->first
1104 << " because it is current primary" << dendl
;
1112 void PG::calc_ec_acting(
1113 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1115 const vector
<int> &acting
,
1116 pg_shard_t acting_primary
,
1117 const vector
<int> &up
,
1118 pg_shard_t up_primary
,
1119 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1120 bool restrict_to_up_acting
,
1122 set
<pg_shard_t
> *backfill
,
1123 set
<pg_shard_t
> *acting_backfill
,
1124 pg_shard_t
*want_primary
,
1127 vector
<int> want(size
, CRUSH_ITEM_NONE
);
1128 map
<shard_id_t
, set
<pg_shard_t
> > all_info_by_shard
;
1129 unsigned usable
= 0;
1130 for (map
<pg_shard_t
, pg_info_t
>::const_iterator i
= all_info
.begin();
1131 i
!= all_info
.end();
1133 all_info_by_shard
[i
->first
.shard
].insert(i
->first
);
1135 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1136 ss
<< "For position " << (unsigned)i
<< ": ";
1137 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
&&
1138 !all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1139 all_info
.find(pg_shard_t(up
[i
], shard_id_t(i
)))->second
.last_update
>=
1140 auth_log_shard
->second
.log_tail
) {
1141 ss
<< " selecting up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
)) << std::endl
;
1146 if (up
.size() > (unsigned)i
&& up
[i
] != CRUSH_ITEM_NONE
) {
1147 ss
<< " backfilling up[i]: " << pg_shard_t(up
[i
], shard_id_t(i
))
1149 backfill
->insert(pg_shard_t(up
[i
], shard_id_t(i
)));
1152 if (acting
.size() > (unsigned)i
&& acting
[i
] != CRUSH_ITEM_NONE
&&
1153 !all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.is_incomplete() &&
1154 all_info
.find(pg_shard_t(acting
[i
], shard_id_t(i
)))->second
.last_update
>=
1155 auth_log_shard
->second
.log_tail
) {
1156 ss
<< " selecting acting[i]: " << pg_shard_t(acting
[i
], shard_id_t(i
)) << std::endl
;
1157 want
[i
] = acting
[i
];
1159 } else if (!restrict_to_up_acting
) {
1160 for (set
<pg_shard_t
>::iterator j
= all_info_by_shard
[shard_id_t(i
)].begin();
1161 j
!= all_info_by_shard
[shard_id_t(i
)].end();
1163 assert(j
->shard
== i
);
1164 if (!all_info
.find(*j
)->second
.is_incomplete() &&
1165 all_info
.find(*j
)->second
.last_update
>=
1166 auth_log_shard
->second
.log_tail
) {
1167 ss
<< " selecting stray: " << *j
<< std::endl
;
1173 if (want
[i
] == CRUSH_ITEM_NONE
)
1174 ss
<< " failed to fill position " << (int)i
<< std::endl
;
1178 bool found_primary
= false;
1179 for (uint8_t i
= 0; i
< want
.size(); ++i
) {
1180 if (want
[i
] != CRUSH_ITEM_NONE
) {
1181 acting_backfill
->insert(pg_shard_t(want
[i
], shard_id_t(i
)));
1182 if (!found_primary
) {
1183 *want_primary
= pg_shard_t(want
[i
], shard_id_t(i
));
1184 found_primary
= true;
1188 acting_backfill
->insert(backfill
->begin(), backfill
->end());
1193 * calculate the desired acting set.
1195 * Choose an appropriate acting set. Prefer up[0], unless it is
1196 * incomplete, or another osd has a longer tail that allows us to
1197 * bring other up nodes up to date.
1199 void PG::calc_replicated_acting(
1200 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
,
1202 const vector
<int> &acting
,
1203 pg_shard_t acting_primary
,
1204 const vector
<int> &up
,
1205 pg_shard_t up_primary
,
1206 const map
<pg_shard_t
, pg_info_t
> &all_info
,
1207 bool restrict_to_up_acting
,
1209 set
<pg_shard_t
> *backfill
,
1210 set
<pg_shard_t
> *acting_backfill
,
1211 pg_shard_t
*want_primary
,
1214 ss
<< "calc_acting newest update on osd." << auth_log_shard
->first
1215 << " with " << auth_log_shard
->second
1216 << (restrict_to_up_acting
? " restrict_to_up_acting" : "") << std::endl
;
1217 pg_shard_t auth_log_shard_id
= auth_log_shard
->first
;
1220 map
<pg_shard_t
,pg_info_t
>::const_iterator primary
;
1222 !all_info
.find(up_primary
)->second
.is_incomplete() &&
1223 all_info
.find(up_primary
)->second
.last_update
>=
1224 auth_log_shard
->second
.log_tail
) {
1225 ss
<< "up_primary: " << up_primary
<< ") selected as primary" << std::endl
;
1226 primary
= all_info
.find(up_primary
); // prefer up[0], all things being equal
1228 assert(!auth_log_shard
->second
.is_incomplete());
1229 ss
<< "up[0] needs backfill, osd." << auth_log_shard_id
1230 << " selected as primary instead" << std::endl
;
1231 primary
= auth_log_shard
;
1234 ss
<< "calc_acting primary is osd." << primary
->first
1235 << " with " << primary
->second
<< std::endl
;
1236 *want_primary
= primary
->first
;
1237 want
->push_back(primary
->first
.osd
);
1238 acting_backfill
->insert(primary
->first
);
1239 unsigned usable
= 1;
1241 // select replicas that have log contiguity with primary.
1242 // prefer up, then acting, then any peer_info osds
1243 for (vector
<int>::const_iterator i
= up
.begin();
1246 pg_shard_t up_cand
= pg_shard_t(*i
, shard_id_t::NO_SHARD
);
1247 if (up_cand
== primary
->first
)
1249 const pg_info_t
&cur_info
= all_info
.find(up_cand
)->second
;
1250 if (cur_info
.is_incomplete() ||
1251 cur_info
.last_update
< MIN(
1252 primary
->second
.log_tail
,
1253 auth_log_shard
->second
.log_tail
)) {
1254 /* We include auth_log_shard->second.log_tail because in GetLog,
1255 * we will request logs back to the min last_update over our
1256 * acting_backfill set, which will result in our log being extended
1257 * as far backwards as necessary to pick up any peers which can
1258 * be log recovered by auth_log_shard's log */
1259 ss
<< " shard " << up_cand
<< " (up) backfill " << cur_info
<< std::endl
;
1260 backfill
->insert(up_cand
);
1261 acting_backfill
->insert(up_cand
);
1263 want
->push_back(*i
);
1264 acting_backfill
->insert(up_cand
);
1266 ss
<< " osd." << *i
<< " (up) accepted " << cur_info
<< std::endl
;
1270 // This no longer has backfill OSDs, but they are covered above.
1271 for (vector
<int>::const_iterator i
= acting
.begin();
1274 pg_shard_t
acting_cand(*i
, shard_id_t::NO_SHARD
);
1278 // skip up osds we already considered above
1279 if (acting_cand
== primary
->first
)
1281 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), acting_cand
.osd
);
1282 if (up_it
!= up
.end())
1285 const pg_info_t
&cur_info
= all_info
.find(acting_cand
)->second
;
1286 if (cur_info
.is_incomplete() ||
1287 cur_info
.last_update
< primary
->second
.log_tail
) {
1288 ss
<< " shard " << acting_cand
<< " (stray) REJECTED "
1289 << cur_info
<< std::endl
;
1291 want
->push_back(*i
);
1292 acting_backfill
->insert(acting_cand
);
1293 ss
<< " shard " << acting_cand
<< " (stray) accepted "
1294 << cur_info
<< std::endl
;
1299 if (restrict_to_up_acting
) {
1302 for (map
<pg_shard_t
,pg_info_t
>::const_iterator i
= all_info
.begin();
1303 i
!= all_info
.end();
1308 // skip up osds we already considered above
1309 if (i
->first
== primary
->first
)
1311 vector
<int>::const_iterator up_it
= find(up
.begin(), up
.end(), i
->first
.osd
);
1312 if (up_it
!= up
.end())
1314 vector
<int>::const_iterator acting_it
= find(
1315 acting
.begin(), acting
.end(), i
->first
.osd
);
1316 if (acting_it
!= acting
.end())
1319 if (i
->second
.is_incomplete() ||
1320 i
->second
.last_update
< primary
->second
.log_tail
) {
1321 ss
<< " shard " << i
->first
<< " (stray) REJECTED "
1322 << i
->second
<< std::endl
;
1324 want
->push_back(i
->first
.osd
);
1325 acting_backfill
->insert(i
->first
);
1326 ss
<< " shard " << i
->first
<< " (stray) accepted "
1327 << i
->second
<< std::endl
;
1336 * calculate the desired acting, and request a change with the monitor
1337 * if it differs from the current acting.
1339 * if restrict_to_up_acting=true, we filter out anything that's not in
1340 * up/acting. in order to lift this restriction, we need to
1341 * 1) check whether it's worth switching the acting set any time we get
1342 * a new pg info (not just here, when recovery finishes)
1343 * 2) check whether anything in want_acting went down on each new map
1344 * (and, if so, calculate a new want_acting)
1345 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1348 bool PG::choose_acting(pg_shard_t
&auth_log_shard_id
,
1349 bool restrict_to_up_acting
,
1350 bool *history_les_bound
)
1352 map
<pg_shard_t
, pg_info_t
> all_info(peer_info
.begin(), peer_info
.end());
1353 all_info
[pg_whoami
] = info
;
1355 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= all_info
.begin();
1356 p
!= all_info
.end();
1358 dout(10) << "calc_acting osd." << p
->first
<< " " << p
->second
<< dendl
;
1361 map
<pg_shard_t
, pg_info_t
>::const_iterator auth_log_shard
=
1362 find_best_info(all_info
, restrict_to_up_acting
, history_les_bound
);
1364 if (auth_log_shard
== all_info
.end()) {
1366 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1367 << " reverting to up" << dendl
;
1370 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1372 dout(10) << "choose_acting failed" << dendl
;
1373 assert(want_acting
.empty());
1378 assert(!auth_log_shard
->second
.is_incomplete());
1379 auth_log_shard_id
= auth_log_shard
->first
;
1381 set
<pg_shard_t
> want_backfill
, want_acting_backfill
;
1383 pg_shard_t want_primary
;
1385 if (!pool
.info
.ec_pool())
1386 calc_replicated_acting(
1388 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1394 restrict_to_up_acting
,
1397 &want_acting_backfill
,
1403 get_osdmap()->get_pg_size(info
.pgid
.pgid
),
1409 restrict_to_up_acting
,
1412 &want_acting_backfill
,
1415 dout(10) << ss
.str() << dendl
;
1417 unsigned num_want_acting
= 0;
1418 set
<pg_shard_t
> have
;
1419 for (int i
= 0; i
< (int)want
.size(); ++i
) {
1420 if (want
[i
] != CRUSH_ITEM_NONE
) {
1425 pool
.info
.ec_pool() ? shard_id_t(i
) : shard_id_t::NO_SHARD
));
1429 // We go incomplete if below min_size for ec_pools since backfill
1430 // does not currently maintain rollbackability
1431 // Otherwise, we will go "peered", but not "active"
1432 if (num_want_acting
< pool
.info
.min_size
&&
1433 (pool
.info
.ec_pool() ||
1434 !cct
->_conf
->osd_allow_recovery_below_min_size
)) {
1435 want_acting
.clear();
1436 dout(10) << "choose_acting failed, below min size" << dendl
;
1440 /* Check whether we have enough acting shards to later perform recovery */
1441 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable_predicate(
1442 get_pgbackend()->get_is_recoverable_predicate());
1443 if (!(*recoverable_predicate
)(have
)) {
1444 want_acting
.clear();
1445 dout(10) << "choose_acting failed, not recoverable" << dendl
;
1449 if (want
!= acting
) {
1450 dout(10) << "choose_acting want " << want
<< " != acting " << acting
1451 << ", requesting pg_temp change" << dendl
;
1454 if (want_acting
== up
) {
1455 // There can't be any pending backfill if
1456 // want is the same as crush map up OSDs.
1457 assert(want_backfill
.empty());
1459 osd
->queue_want_pg_temp(info
.pgid
.pgid
, empty
);
1461 osd
->queue_want_pg_temp(info
.pgid
.pgid
, want
);
1464 want_acting
.clear();
1465 actingbackfill
= want_acting_backfill
;
1466 dout(10) << "actingbackfill is " << actingbackfill
<< dendl
;
1467 assert(backfill_targets
.empty() || backfill_targets
== want_backfill
);
1468 if (backfill_targets
.empty()) {
1469 // Caller is GetInfo
1470 backfill_targets
= want_backfill
;
1472 // Will not change if already set because up would have had to change
1473 // Verify that nothing in backfill is in stray_set
1474 for (set
<pg_shard_t
>::iterator i
= want_backfill
.begin();
1475 i
!= want_backfill
.end();
1477 assert(stray_set
.find(*i
) == stray_set
.end());
1479 dout(10) << "choose_acting want " << want
<< " (== acting) backfill_targets "
1480 << want_backfill
<< dendl
;
1484 /* Build the might_have_unfound set.
1486 * This is used by the primary OSD during recovery.
1488 * This set tracks the OSDs which might have unfound objects that the primary
1489 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1490 * will remove the OSD from the set.
// Build the set of shards that may hold copies of objects this (primary) PG
// still lists as unfound: everything past_intervals says might have them,
// plus every peer we currently hold a pg_info_t for.  Must only be called on
// the primary, and only when the set is empty (asserted below).
1492 void PG::build_might_have_unfound()
1494 assert(might_have_unfound
.empty());
1495 assert(is_primary());
1497 dout(10) << __func__
<< dendl
;
1499 check_past_interval_bounds();
// Seed from past intervals; the ec_pool() flag is forwarded so
// get_might_have_unfound can account for shard identity on EC pools.
1501 might_have_unfound
= past_intervals
.get_might_have_unfound(
1503 pool
.info
.ec_pool());
1505 // include any (stray) peers
1506 for (map
<pg_shard_t
, pg_info_t
>::iterator p
= peer_info
.begin();
1507 p
!= peer_info
.end();
1509 might_have_unfound
.insert(p
->first
);
1511 dout(15) << __func__
<< ": built " << might_have_unfound
<< dendl
;
// Transaction-completion callback registered by PG::activate(): once the
// activation transaction commits, hand control back to the PG.
// NOTE(review): the members that hold the PG reference and the queueing
// epoch are not visible in this extract (source lines dropped) -- confirm
// against the full file.
1514 struct C_PG_ActivateCommitted
: public Context
{
1517 epoch_t activation_epoch
;
// Capture the PG, the epoch at queueing time (e), and the epoch the PG was
// activated in (ae).
1518 C_PG_ActivateCommitted(PG
*p
, epoch_t e
, epoch_t ae
)
1519 : pg(p
), epoch(e
), activation_epoch(ae
) {}
// On commit, forward both epochs to PG::_activate_committed(), which
// ignores the event if the PG has reset since `epoch`.
1520 void finish(int r
) override
{
1521 pg
->_activate_committed(epoch
, activation_epoch
);
1525 void PG::activate(ObjectStore::Transaction
& t
,
1526 epoch_t activation_epoch
,
1527 list
<Context
*>& tfin
,
1528 map
<int, map
<spg_t
,pg_query_t
> >& query_map
,
1532 PastIntervals
> > > *activator_map
,
1535 assert(!is_peered());
1536 assert(scrubber
.callbacks
.empty());
1537 assert(callbacks_for_degraded_object
.empty());
1540 state_clear(PG_STATE_DOWN
);
1542 send_notify
= false;
1545 // only update primary last_epoch_started if we will go active
1546 if (acting
.size() >= pool
.info
.min_size
) {
1547 assert(cct
->_conf
->osd_find_best_info_ignore_history_les
||
1548 info
.last_epoch_started
<= activation_epoch
);
1549 info
.last_epoch_started
= activation_epoch
;
1550 info
.last_interval_started
= info
.history
.same_interval_since
;
1552 } else if (is_acting(pg_whoami
)) {
1553 /* update last_epoch_started on acting replica to whatever the primary sent
1554 * unless it's smaller (could happen if we are going peered rather than
1555 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1556 if (info
.last_epoch_started
< activation_epoch
) {
1557 info
.last_epoch_started
= activation_epoch
;
1558 info
.last_interval_started
= info
.history
.same_interval_since
;
1562 auto &missing
= pg_log
.get_missing();
1565 last_update_ondisk
= info
.last_update
;
1566 min_last_complete_ondisk
= eversion_t(0,0); // we don't know (yet)!
1568 last_update_applied
= info
.last_update
;
1569 last_rollback_info_trimmed_to_applied
= pg_log
.get_can_rollback_to();
1571 need_up_thru
= false;
1573 // write pg info, log
1575 dirty_big_info
= true; // maybe
1577 // find out when we commit
1578 t
.register_on_complete(
1579 new C_PG_ActivateCommitted(
1581 get_osdmap()->get_epoch(),
1584 // initialize snap_trimq
1586 dout(20) << "activate - purged_snaps " << info
.purged_snaps
1587 << " cached_removed_snaps " << pool
.cached_removed_snaps
<< dendl
;
1588 snap_trimq
= pool
.cached_removed_snaps
;
1589 interval_set
<snapid_t
> intersection
;
1590 intersection
.intersection_of(snap_trimq
, info
.purged_snaps
);
1591 if (intersection
== info
.purged_snaps
) {
1592 snap_trimq
.subtract(info
.purged_snaps
);
1594 dout(0) << "warning: info.purged_snaps (" << info
.purged_snaps
1595 << ") is not a subset of pool.cached_removed_snaps ("
1596 << pool
.cached_removed_snaps
<< ")" << dendl
;
1597 snap_trimq
.subtract(intersection
);
1601 // init complete pointer
1602 if (missing
.num_missing() == 0) {
1603 dout(10) << "activate - no missing, moving last_complete " << info
.last_complete
1604 << " -> " << info
.last_update
<< dendl
;
1605 info
.last_complete
= info
.last_update
;
1606 pg_log
.reset_recovery_pointers();
1608 dout(10) << "activate - not complete, " << missing
<< dendl
;
1609 pg_log
.activate_not_complete(info
);
1617 // start up replicas
1619 assert(!actingbackfill
.empty());
1620 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1621 i
!= actingbackfill
.end();
1623 if (*i
== pg_whoami
) continue;
1624 pg_shard_t peer
= *i
;
1625 assert(peer_info
.count(peer
));
1626 pg_info_t
& pi
= peer_info
[peer
];
1628 dout(10) << "activate peer osd." << peer
<< " " << pi
<< dendl
;
1631 pg_missing_t
& pm
= peer_missing
[peer
];
1633 bool needs_past_intervals
= pi
.dne();
1636 * cover case where peer sort order was different and
1637 * last_backfill cannot be interpreted
1639 bool force_restart_backfill
=
1640 !pi
.last_backfill
.is_max() &&
1641 !pi
.last_backfill_bitwise
;
1643 if (pi
.last_update
== info
.last_update
&& !force_restart_backfill
) {
1645 if (!pi
.last_backfill
.is_max())
1646 osd
->clog
->info() << info
.pgid
<< " continuing backfill to osd."
1648 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1649 << "] " << pi
.last_backfill
1650 << " to " << info
.last_update
;
1651 if (!pi
.is_empty() && activator_map
) {
1652 dout(10) << "activate peer osd." << peer
<< " is up to date, queueing in pending_activators" << dendl
;
1653 (*activator_map
)[peer
.osd
].push_back(
1656 peer
.shard
, pg_whoami
.shard
,
1657 get_osdmap()->get_epoch(),
1658 get_osdmap()->get_epoch(),
1662 dout(10) << "activate peer osd." << peer
<< " is up to date, but sending pg_log anyway" << dendl
;
1664 i
->shard
, pg_whoami
.shard
,
1665 get_osdmap()->get_epoch(), info
);
1668 pg_log
.get_tail() > pi
.last_update
||
1669 pi
.last_backfill
== hobject_t() ||
1670 force_restart_backfill
||
1671 (backfill_targets
.count(*i
) && pi
.last_backfill
.is_max())) {
1672 /* ^ This last case covers a situation where a replica is not contiguous
1673 * with the auth_log, but is contiguous with this replica. Reshuffling
1674 * the active set to handle this would be tricky, so instead we just go
1675 * ahead and backfill it anyway. This is probably preferable in any
1676 * case since the replica in question would have to be significantly
1680 osd
->clog
->debug() << info
.pgid
<< " starting backfill to osd." << peer
1681 << " from (" << pi
.log_tail
<< "," << pi
.last_update
1682 << "] " << pi
.last_backfill
1683 << " to " << info
.last_update
;
1685 pi
.last_update
= info
.last_update
;
1686 pi
.last_complete
= info
.last_update
;
1687 pi
.set_last_backfill(hobject_t());
1688 pi
.last_epoch_started
= info
.last_epoch_started
;
1689 pi
.last_interval_started
= info
.last_interval_started
;
1690 pi
.history
= info
.history
;
1691 pi
.hit_set
= info
.hit_set
;
1692 pi
.stats
.stats
.clear();
1694 // initialize peer with our purged_snaps.
1695 pi
.purged_snaps
= info
.purged_snaps
;
1698 i
->shard
, pg_whoami
.shard
,
1699 get_osdmap()->get_epoch(), pi
);
1701 // send some recent log, so that op dup detection works well.
1702 m
->log
.copy_up_to(pg_log
.get_log(), cct
->_conf
->osd_min_pg_log_entries
);
1703 m
->info
.log_tail
= m
->log
.tail
;
1704 pi
.log_tail
= m
->log
.tail
; // sigh...
1709 assert(pg_log
.get_tail() <= pi
.last_update
);
1711 i
->shard
, pg_whoami
.shard
,
1712 get_osdmap()->get_epoch(), info
);
1713 // send new stuff to append to replicas log
1714 m
->log
.copy_after(pg_log
.get_log(), pi
.last_update
);
1717 // share past_intervals if we are creating the pg on the replica
1718 // based on whether our info for that peer was dne() *before*
1719 // updating pi.history in the backfill block above.
1720 if (m
&& needs_past_intervals
)
1721 m
->past_intervals
= past_intervals
;
1723 // update local version of peer's missing list!
1724 if (m
&& pi
.last_backfill
!= hobject_t()) {
1725 for (list
<pg_log_entry_t
>::iterator p
= m
->log
.log
.begin();
1726 p
!= m
->log
.log
.end();
1728 if (p
->soid
<= pi
.last_backfill
&&
1730 pm
.add_next_event(*p
);
1734 dout(10) << "activate peer osd." << peer
<< " sending " << m
->log
<< dendl
;
1735 //m->log.print(cout);
1736 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
1740 pi
.last_update
= info
.last_update
;
1742 // update our missing
1743 if (pm
.num_missing() == 0) {
1744 pi
.last_complete
= pi
.last_update
;
1745 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " uptodate" << dendl
;
1747 dout(10) << "activate peer osd." << peer
<< " " << pi
<< " missing " << pm
<< dendl
;
1751 // Set up missing_loc
1752 set
<pg_shard_t
> complete_shards
;
1753 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1754 i
!= actingbackfill
.end();
1756 if (*i
== get_primary()) {
1757 missing_loc
.add_active_missing(missing
);
1758 if (!missing
.have_missing())
1759 complete_shards
.insert(*i
);
1761 auto peer_missing_entry
= peer_missing
.find(*i
);
1762 assert(peer_missing_entry
!= peer_missing
.end());
1763 missing_loc
.add_active_missing(peer_missing_entry
->second
);
1764 if (!peer_missing_entry
->second
.have_missing() &&
1765 peer_info
[*i
].last_backfill
.is_max())
1766 complete_shards
.insert(*i
);
1769 // If necessary, create might_have_unfound to help us find our unfound objects.
1770 // NOTE: It's important that we build might_have_unfound before trimming the
1772 might_have_unfound
.clear();
1773 if (needs_recovery()) {
1774 // If only one shard has missing, we do a trick to add all others as recovery
1775 // source, this is considered safe since the PGLogs have been merged locally,
1776 // and covers vast majority of the use cases, like one OSD/host is down for
1777 // a while for hardware repairing
1778 if (complete_shards
.size() + 1 == actingbackfill
.size()) {
1779 missing_loc
.add_batch_sources_info(complete_shards
, ctx
->handle
);
1781 missing_loc
.add_source_info(pg_whoami
, info
, pg_log
.get_missing(),
1783 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
1784 i
!= actingbackfill
.end();
1786 if (*i
== pg_whoami
) continue;
1787 dout(10) << __func__
<< ": adding " << *i
<< " as a source" << dendl
;
1788 assert(peer_missing
.count(*i
));
1789 assert(peer_info
.count(*i
));
1790 missing_loc
.add_source_info(
1797 for (map
<pg_shard_t
, pg_missing_t
>::iterator i
= peer_missing
.begin();
1798 i
!= peer_missing
.end();
1800 if (is_actingbackfill(i
->first
))
1802 assert(peer_info
.count(i
->first
));
1804 peer_info
[i
->first
],
1810 build_might_have_unfound();
1812 state_set(PG_STATE_DEGRADED
);
1814 discover_all_missing(query_map
);
1818 if (get_osdmap()->get_pg_size(info
.pgid
.pgid
) > actingset
.size()) {
1819 state_set(PG_STATE_DEGRADED
);
1820 state_set(PG_STATE_UNDERSIZED
);
1823 state_set(PG_STATE_ACTIVATING
);
1824 release_pg_backoffs();
1825 projected_last_update
= info
.last_update
;
1827 if (acting
.size() >= pool
.info
.min_size
) {
1828 PGLogEntryHandler handler
{this, &t
};
1829 pg_log
.roll_forward(&handler
);
// Capability check for client ops: returns whether the requesting session's
// OSDCap grants the access this MOSDOp needs against this pool / namespace /
// object key.  Message types other than CEPH_MSG_OSD_OP are not checked here.
1833 bool PG::op_has_sufficient_caps(OpRequestRef
& op
)
1835 // only check MOSDOp
1836 if (op
->get_req()->get_type() != CEPH_MSG_OSD_OP
)
1839 const MOSDOp
*req
= static_cast<const MOSDOp
*>(op
->get_req());
// NOTE(review): get_priv() conventionally returns a referenced object; the
// matching put() is not visible in this extract -- confirm in the full file.
1841 Session
*session
= static_cast<Session
*>(req
->get_connection()->get_priv());
// No session attached to the connection: log and (presumably) deny -- the
// return statement itself is not visible in this extract.
1843 dout(0) << "op_has_sufficient_caps: no session for op " << *req
<< dendl
;
1846 OSDCap
& caps
= session
->caps
;
// The cap check keys on the object's locator key when present, otherwise on
// the object name.
1849 const string
&key
= req
->get_hobj().get_key().empty() ?
1850 req
->get_oid().name
:
1851 req
->get_hobj().get_key();
1853 bool cap
= caps
.is_capable(pool
.name
, req
->get_hobj().nspace
,
1855 op
->need_read_cap(),
1856 op
->need_write_cap(),
// Trace exactly what was asked for and the verdict, for cap debugging.
1859 dout(20) << "op_has_sufficient_caps pool=" << pool
.id
<< " (" << pool
.name
1860 << " " << req
->get_hobj().nspace
1861 << ") owner=" << pool
.auid
1862 << " need_read_cap=" << op
->need_read_cap()
1863 << " need_write_cap=" << op
->need_write_cap()
1864 << " classes=" << op
->classes()
1865 << " -> " << (cap
? "yes" : "NO")
// Called (via C_PG_ActivateCommitted) once the activation transaction has
// committed to disk.  `epoch` is the map epoch at the time the callback was
// queued; `activation_epoch` is the epoch the PG went active in.
// Three cases: the interval changed since queueing (ignore), we are primary
// (record our own activation, maybe finish), or we are a replica (notify the
// primary that our activation committed).
1870 void PG::_activate_committed(epoch_t epoch
, epoch_t activation_epoch
)
1873 if (pg_has_reset_since(epoch
)) {
// Stale callback from a previous interval -- nothing to do.
1874 dout(10) << "_activate_committed " << epoch
1875 << ", that was an old interval" << dendl
;
1876 } else if (is_primary()) {
// Primary: count ourselves as activated; when every member of
// actingbackfill has reported in, the activation is cluster-wide stable.
1877 peer_activated
.insert(pg_whoami
);
1878 dout(10) << "_activate_committed " << epoch
1879 << " peer_activated now " << peer_activated
1880 << " last_interval_started " << info
.history
.last_interval_started
1881 << " last_epoch_started " << info
.history
.last_epoch_started
1882 << " same_interval_since " << info
.history
.same_interval_since
<< dendl
;
1883 assert(!actingbackfill
.empty());
1884 if (peer_activated
.size() == actingbackfill
.size())
1885 all_activated_and_committed();
// Replica: tell the primary our activation committed via MOSDPGInfo.
1887 dout(10) << "_activate_committed " << epoch
<< " telling primary" << dendl
;
1888 MOSDPGInfo
*m
= new MOSDPGInfo(epoch
);
1889 pg_notify_t i
= pg_notify_t(
1890 get_primary().shard
, pg_whoami
.shard
,
1891 get_osdmap()->get_epoch(),
1892 get_osdmap()->get_epoch(),
// Advertise the activation epoch in the notify's history so the primary
// sees our last_epoch_started/last_interval_started as of activation.
1895 i
.info
.history
.last_epoch_started
= activation_epoch
;
1896 i
.info
.history
.last_interval_started
= i
.info
.history
.same_interval_since
;
// Enough replicas for full activity -> ACTIVE; otherwise only PEERED.
1897 if (acting
.size() >= pool
.info
.min_size
) {
1898 state_set(PG_STATE_ACTIVE
);
1900 state_set(PG_STATE_PEERED
);
1903 m
->pg_list
.push_back(make_pair(i
, PastIntervals()));
1904 osd
->send_message_osd_cluster(get_primary().osd
, m
, get_osdmap()->get_epoch());
// With no flushes outstanding, ops queued while un-peered can now proceed.
1907 if (flushes_in_progress
== 0) {
1908 requeue_ops(waiting_for_peered
);
// By this point the activation state must already be persisted.
1912 assert(!dirty_info
);
1918 * update info.history.last_epoch_started ONLY after we and all
1919 * replicas have activated AND committed the activate transaction
1920 * (i.e. the peering results are stable on disk).
// Primary-only: every member of actingbackfill (including ourselves) has
// activated AND committed the activation transaction, so the peering result
// is stable on disk everywhere.  Kick the state machine with
// AllReplicasActivated via the peering queue.
1922 void PG::all_activated_and_committed()
1924 dout(10) << "all_activated_and_committed" << dendl
;
1925 assert(is_primary());
1926 assert(peer_activated
.size() == actingbackfill
.size());
1927 assert(!actingbackfill
.empty());
1928 assert(blocked_by
.empty());
1930 queue_peering_event(
1932 std::make_shared
<CephPeeringEvt
>(
1933 get_osdmap()->get_epoch(),
1934 get_osdmap()->get_epoch(),
1935 AllReplicasActivated())));
// (Re)queue this PG for scrubbing on the OSD's scrub work queue.  Caller
// must hold the PG lock.  The dout messages show the two outcomes: already
// queued (no-op) vs newly queued; the boolean result presumably reflects
// whether queueing happened -- the return lines are not visible in this
// extract.
1938 bool PG::requeue_scrub(bool high_priority
)
1940 assert(is_locked());
1942 dout(10) << __func__
<< ": already queued" << dendl
;
1945 dout(10) << __func__
<< ": queueing" << dendl
;
// Mark queued before handing off to the OSD so a concurrent requeue sees it.
1946 scrub_queued
= true;
1947 osd
->queue_for_scrub(this, high_priority
);
// Queue this PG on the OSD's recovery work queue.  Only a peered primary may
// be queued; `front` requests front-of-queue placement.  Re-queueing while
// already queued is a logged no-op.
1952 void PG::queue_recovery(bool front
)
1954 if (!is_primary() || !is_peered()) {
1955 dout(10) << "queue_recovery -- not primary or not peered " << dendl
;
// A non-primary / un-peered PG must never have been left queued.
1956 assert(!recovery_queued
);
1957 } else if (recovery_queued
) {
1958 dout(10) << "queue_recovery -- already queued" << dendl
;
1960 dout(10) << "queue_recovery -- queuing" << dendl
;
1961 recovery_queued
= true;
1962 osd
->queue_for_recovery(this, front
);
// Transition this PG into the scrubbing state and consume the one-shot
// "must" flags: priority is boosted for an operator-requested scrub, and
// DEEP_SCRUB / REPAIR state bits are set as demanded.  Caller must hold the
// PG lock; a PG already scrubbing is rejected (the early return itself is
// not visible in this extract).
1966 bool PG::queue_scrub()
1968 assert(is_locked());
1969 if (is_scrubbing()) {
// Operator-requested scrubs run at osd_requested_scrub_priority; regular
// scrubs use the pool/config-derived priority.
1972 scrubber
.priority
= scrubber
.must_scrub
?
1973 cct
->_conf
->osd_requested_scrub_priority
: get_scrub_priority();
// must_* flags are one-shot: clear each as it is honored.
1974 scrubber
.must_scrub
= false;
1975 state_set(PG_STATE_SCRUBBING
);
1976 if (scrubber
.must_deep_scrub
) {
1977 state_set(PG_STATE_DEEP_SCRUB
);
1978 scrubber
.must_deep_scrub
= false;
1980 if (scrubber
.must_repair
|| scrubber
.auto_repair
) {
1981 state_set(PG_STATE_REPAIR
);
1982 scrubber
.must_repair
= false;
// Scrub priority for this PG: the per-pool SCRUB_PRIORITY option wins when
// it is set to a positive value, otherwise fall back to the global
// osd_scrub_priority config value.
1988 unsigned PG::get_scrub_priority()
1990 // a higher value -> a higher priority
1991 int pool_scrub_priority
= 0;
1992 pool
.info
.opts
.get(pool_opts_t::SCRUB_PRIORITY
, &pool_scrub_priority
);
1993 return pool_scrub_priority
> 0 ? pool_scrub_priority
: cct
->_conf
->osd_scrub_priority
;
// Completion callback queued by PG::finish_recovery(): when the final sync
// completes, call back into PG::_finish_recovery(), passing itself so the PG
// can verify this is still the current finish_sync_event.
// NOTE(review): the member holding the PG reference is not visible in this
// extract (source line dropped) -- confirm against the full file.
1996 struct C_PG_FinishRecovery
: public Context
{
1998 explicit C_PG_FinishRecovery(PG
*p
) : pg(p
) {}
1999 void finish(int r
) override
{
2000 pg
->_finish_recovery(this);
// Mark the PG clean, but only when the acting set is at the pool's full
// target size for this osdmap: record the clean epoch/interval in history,
// and drop past_intervals (no longer needed once clean).
2004 void PG::mark_clean()
2006 if (actingset
.size() == get_osdmap()->get_pg_size(info
.pgid
.pgid
)) {
2007 state_set(PG_STATE_CLEAN
);
2008 info
.history
.last_epoch_clean
= get_osdmap()->get_epoch();
2009 info
.history
.last_interval_clean
= info
.history
.same_interval_since
;
2010 past_intervals
.clear();
// past_intervals lives in the "big" info payload, so flag it dirty.
2011 dirty_big_info
= true;
// Recovery priority for this PG: the base recovery priority adjusted by the
// pool's RECOVERY_PRIORITY option, clamped into
// [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX].
2018 unsigned PG::get_recovery_priority()
2020 // a higher value -> a higher priority
2022 int pool_recovery_priority
= 0;
2023 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2025 int ret
= OSD_RECOVERY_PRIORITY_BASE
+ pool_recovery_priority
;
2027 // Clamp to valid range
2028 if (ret
> OSD_RECOVERY_PRIORITY_MAX
) {
2029 ret
= OSD_RECOVERY_PRIORITY_MAX
;
2030 } else if (ret
< OSD_RECOVERY_PRIORITY_MIN
) {
2031 ret
= OSD_RECOVERY_PRIORITY_MIN
;
// Compile-time guards: the clamp range must be sane, and MIN must be
// non-negative since the result is returned as unsigned.
2034 static_assert(OSD_RECOVERY_PRIORITY_MIN
< OSD_RECOVERY_PRIORITY_MAX
, "Invalid priority range");
2035 static_assert(OSD_RECOVERY_PRIORITY_MIN
>= 0, "Priority range must match unsigned type");
2037 return static_cast<unsigned>(ret
);
// Backfill priority for this PG.  Severity picks the base: below min_size
// (IO is blocked) is most urgent, then undersized, then merely degraded,
// then the plain backfill base.  The pool's RECOVERY_PRIORITY option is
// added on top, and the result is clamped to the valid priority range.
2040 unsigned PG::get_backfill_priority()
2042 // a higher value -> a higher priority
2044 int ret
= OSD_BACKFILL_PRIORITY_BASE
;
2045 if (acting
.size() < pool
.info
.min_size
) {
2046 // inactive: no. of replicas < min_size, highest priority since it blocks IO
// The further below min_size we are, the higher the boost.
2047 ret
= OSD_BACKFILL_INACTIVE_PRIORITY_BASE
+ (pool
.info
.min_size
- acting
.size());
2049 } else if (is_undersized()) {
2050 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2051 assert(pool
.info
.size
> actingset
.size());
2052 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
+ (pool
.info
.size
- actingset
.size());
2054 } else if (is_degraded()) {
2055 // degraded: baseline degraded
2056 ret
= OSD_BACKFILL_DEGRADED_PRIORITY_BASE
;
2059 // Adjust with pool's recovery priority
2060 int pool_recovery_priority
= 0;
2061 pool
.info
.opts
.get(pool_opts_t::RECOVERY_PRIORITY
, &pool_recovery_priority
);
2062 ret
+= pool_recovery_priority
;
2064 // Clamp to valid range
2065 if (ret
> OSD_RECOVERY_PRIORITY_MAX
) {
2066 ret
= OSD_RECOVERY_PRIORITY_MAX
;
2067 } else if (ret
< OSD_RECOVERY_PRIORITY_MIN
) {
2068 ret
= OSD_RECOVERY_PRIORITY_MIN
;
2071 return static_cast<unsigned>(ret
);
// Recovery is complete (last_complete has caught up to last_update): tear
// down recovery state and queue a C_PG_FinishRecovery on `tfin` so the
// final cleanup (_finish_recovery) runs after everything is synced, without
// blocking here.
2074 void PG::finish_recovery(list
<Context
*>& tfin
)
2076 dout(10) << "finish_recovery" << dendl
;
2077 assert(info
.last_complete
== info
.last_update
);
2079 clear_recovery_state();
2082 * sync all this before purging strays. but don't block!
// finish_sync_event is remembered so _finish_recovery can tell whether the
// callback that eventually fires is still the current one.
2084 finish_sync_event
= new C_PG_FinishRecovery(this);
2085 tfin
.push_back(finish_sync_event
);
// Final recovery cleanup, invoked by C_PG_FinishRecovery.  Only acts if `c`
// is still the pending finish_sync_event; otherwise the event is stale
// (recovery was restarted or the interval changed) and is just logged.
2088 void PG::_finish_recovery(Context
*c
)
2095 if (c
== finish_sync_event
) {
2096 dout(10) << "_finish_recovery" << dendl
;
// Consume the event marker so later stale callbacks are recognized.
2097 finish_sync_event
= 0;
2100 publish_stats_to_osd();
// A scrub deferred until after recovery is now due; it is promoted to a
// deep scrub.
2102 if (scrub_after_recovery
) {
2103 dout(10) << "_finish_recovery requeueing for scrub" << dendl
;
2104 scrub_after_recovery
= false;
2105 scrubber
.must_deep_scrub
= true;
2109 dout(10) << "_finish_recovery -- stale" << dendl
;
// Account the start of one recovery operation on object `soid`: bump the
// active-op counter and inform the OSD (which tracks per-OSD recovery
// limits).  With DEBUG_RECOVERY_OIDS, also track the exact object set and
// assert against double-starting the same object.
2114 void PG::start_recovery_op(const hobject_t
& soid
)
2116 dout(10) << "start_recovery_op " << soid
2117 #ifdef DEBUG_RECOVERY_OIDS
2118 << " (" << recovering_oids
<< ")"
2121 assert(recovery_ops_active
>= 0);
2122 recovery_ops_active
++;
2123 #ifdef DEBUG_RECOVERY_OIDS
2124 assert(recovering_oids
.count(soid
) == 0);
2125 recovering_oids
.insert(soid
);
2127 osd
->start_recovery_op(this, soid
);
// Counterpart of start_recovery_op(): account the completion of one recovery
// operation on `soid`, decrement the active-op counter, and notify the OSD
// (`dequeue` is forwarded to the OSD-side bookkeeping).  With
// DEBUG_RECOVERY_OIDS, assert the object really was being recovered.
2130 void PG::finish_recovery_op(const hobject_t
& soid
, bool dequeue
)
2132 dout(10) << "finish_recovery_op " << soid
2133 #ifdef DEBUG_RECOVERY_OIDS
2134 << " (" << recovering_oids
<< ")"
2137 assert(recovery_ops_active
> 0);
2138 recovery_ops_active
--;
2139 #ifdef DEBUG_RECOVERY_OIDS
2140 assert(recovering_oids
.count(soid
));
2141 recovering_oids
.erase(soid
);
2143 osd
->finish_recovery_op(this, soid
, dequeue
);
2150 void PG::split_into(pg_t child_pgid
, PG
*child
, unsigned split_bits
)
2152 child
->update_snap_mapper_bits(split_bits
);
2153 child
->update_osdmap_ref(get_osdmap());
2158 pg_log
.split_into(child_pgid
, split_bits
, &(child
->pg_log
));
2159 child
->info
.last_complete
= info
.last_complete
;
2161 info
.last_update
= pg_log
.get_head();
2162 child
->info
.last_update
= child
->pg_log
.get_head();
2164 child
->info
.last_user_version
= info
.last_user_version
;
2166 info
.log_tail
= pg_log
.get_tail();
2167 child
->info
.log_tail
= child
->pg_log
.get_tail();
2169 if (info
.last_complete
< pg_log
.get_tail())
2170 info
.last_complete
= pg_log
.get_tail();
2171 if (child
->info
.last_complete
< child
->pg_log
.get_tail())
2172 child
->info
.last_complete
= child
->pg_log
.get_tail();
2175 child
->info
.history
= info
.history
;
2176 child
->info
.history
.epoch_created
= get_osdmap()->get_epoch();
2177 child
->info
.purged_snaps
= info
.purged_snaps
;
2179 if (info
.last_backfill
.is_max()) {
2180 child
->info
.set_last_backfill(hobject_t::get_max());
2182 // restart backfill on parent and child to be safe. we could
2183 // probably do better in the bitwise sort case, but it's more
2184 // fragile (there may be special work to do on backfill completion
2186 info
.set_last_backfill(hobject_t());
2187 child
->info
.set_last_backfill(hobject_t());
2190 child
->info
.stats
= info
.stats
;
2191 child
->info
.stats
.parent_split_bits
= split_bits
;
2192 info
.stats
.stats_invalid
= true;
2193 child
->info
.stats
.stats_invalid
= true;
2194 child
->info
.last_epoch_started
= info
.last_epoch_started
;
2195 child
->info
.last_interval_started
= info
.last_interval_started
;
2197 child
->snap_trimq
= snap_trimq
;
2199 // There can't be recovery/backfill going on now
2200 int primary
, up_primary
;
2201 vector
<int> newup
, newacting
;
2202 get_osdmap()->pg_to_up_acting_osds(
2203 child
->info
.pgid
.pgid
, &newup
, &up_primary
, &newacting
, &primary
);
2204 child
->init_primary_up_acting(
2209 child
->role
= OSDMap::calc_pg_role(osd
->whoami
, child
->acting
);
2211 // this comparison includes primary rank via pg_shard_t
2212 if (get_primary() != child
->get_primary())
2213 child
->info
.history
.same_primary_since
= get_osdmap()->get_epoch();
2215 child
->info
.stats
.up
= up
;
2216 child
->info
.stats
.up_primary
= up_primary
;
2217 child
->info
.stats
.acting
= acting
;
2218 child
->info
.stats
.acting_primary
= primary
;
2219 child
->info
.stats
.mapping_epoch
= get_osdmap()->get_epoch();
2222 child
->past_intervals
= past_intervals
;
2224 _split_into(child_pgid
, child
, split_bits
);
2226 // release all backoffs for simplicity
2227 release_backoffs(hobject_t(), hobject_t::get_max());
2229 child
->on_new_interval();
2231 child
->dirty_info
= true;
2232 child
->dirty_big_info
= true;
2234 dirty_big_info
= true;
2237 void PG::add_backoff(SessionRef s
, const hobject_t
& begin
, const hobject_t
& end
)
2239 ConnectionRef con
= s
->con
;
2240 if (!con
) // OSD::ms_handle_reset clears s->con without a lock
2242 BackoffRef
b(s
->have_backoff(info
.pgid
, begin
));
2244 derr
<< __func__
<< " already have backoff for " << s
<< " begin " << begin
2245 << " " << *b
<< dendl
;
2248 Mutex::Locker
l(backoff_lock
);
2250 b
= new Backoff(info
.pgid
, this, s
, ++s
->backoff_seq
, begin
, end
);
2251 backoffs
[begin
].insert(b
);
2253 dout(10) << __func__
<< " session " << s
<< " added " << *b
<< dendl
;
2258 get_osdmap()->get_epoch(),
2259 CEPH_OSD_BACKOFF_OP_BLOCK
,
2265 void PG::release_backoffs(const hobject_t
& begin
, const hobject_t
& end
)
2267 dout(10) << __func__
<< " [" << begin
<< "," << end
<< ")" << dendl
;
2268 vector
<BackoffRef
> bv
;
2270 Mutex::Locker
l(backoff_lock
);
2271 auto p
= backoffs
.lower_bound(begin
);
2272 while (p
!= backoffs
.end()) {
2273 int r
= cmp(p
->first
, end
);
2274 dout(20) << __func__
<< " ? " << r
<< " " << p
->first
2275 << " " << p
->second
<< dendl
;
2276 // note: must still examine begin=end=p->first case
2277 if (r
> 0 || (r
== 0 && begin
< end
)) {
2280 dout(20) << __func__
<< " checking " << p
->first
2281 << " " << p
->second
<< dendl
;
2282 auto q
= p
->second
.begin();
2283 while (q
!= p
->second
.end()) {
2284 dout(20) << __func__
<< " checking " << *q
<< dendl
;
2285 int r
= cmp((*q
)->begin
, begin
);
2286 if (r
== 0 || (r
> 0 && (*q
)->end
< end
)) {
2288 q
= p
->second
.erase(q
);
2293 if (p
->second
.empty()) {
2294 p
= backoffs
.erase(p
);
2301 Mutex::Locker
l(b
->lock
);
2302 dout(10) << __func__
<< " " << *b
<< dendl
;
2304 assert(b
->pg
== this);
2305 ConnectionRef con
= b
->session
->con
;
2306 if (con
) { // OSD::ms_handle_reset clears s->con without a lock
2310 get_osdmap()->get_epoch(),
2311 CEPH_OSD_BACKOFF_OP_UNBLOCK
,
2317 b
->state
= Backoff::STATE_DELETING
;
2319 b
->session
->rm_backoff(b
);
2327 void PG::clear_backoffs()
2329 dout(10) << __func__
<< " " << dendl
;
2330 map
<hobject_t
,set
<BackoffRef
>> ls
;
2332 Mutex::Locker
l(backoff_lock
);
2335 for (auto& p
: ls
) {
2336 for (auto& b
: p
.second
) {
2337 Mutex::Locker
l(b
->lock
);
2338 dout(10) << __func__
<< " " << *b
<< dendl
;
2340 assert(b
->pg
== this);
2342 b
->state
= Backoff::STATE_DELETING
;
2344 b
->session
->rm_backoff(b
);
2353 // called by Session::clear_backoffs()
2354 void PG::rm_backoff(BackoffRef b
)
2356 dout(10) << __func__
<< " " << *b
<< dendl
;
2357 Mutex::Locker
l(backoff_lock
);
2358 assert(b
->lock
.is_locked_by_me());
2359 assert(b
->pg
== this);
2360 auto p
= backoffs
.find(b
->begin
);
2361 // may race with release_backoffs()
2362 if (p
!= backoffs
.end()) {
2363 auto q
= p
->second
.find(b
);
2364 if (q
!= p
->second
.end()) {
2366 if (p
->second
.empty()) {
2373 void PG::clear_recovery_state()
2375 dout(10) << "clear_recovery_state" << dendl
;
2377 pg_log
.reset_recovery_pointers();
2378 finish_sync_event
= 0;
2381 while (recovery_ops_active
> 0) {
2382 #ifdef DEBUG_RECOVERY_OIDS
2383 soid
= *recovering_oids
.begin();
2385 finish_recovery_op(soid
, true);
2388 backfill_targets
.clear();
2389 backfill_info
.clear();
2390 peer_backfill_info
.clear();
2391 waiting_on_backfill
.clear();
2392 _clear_recovery_state(); // pg impl specific hook
2395 void PG::cancel_recovery()
2397 dout(10) << "cancel_recovery" << dendl
;
2398 clear_recovery_state();
2402 void PG::purge_strays()
2404 dout(10) << "purge_strays " << stray_set
<< dendl
;
2406 bool removed
= false;
2407 for (set
<pg_shard_t
>::iterator p
= stray_set
.begin();
2408 p
!= stray_set
.end();
2410 assert(!is_actingbackfill(*p
));
2411 if (get_osdmap()->is_up(p
->osd
)) {
2412 dout(10) << "sending PGRemove to osd." << *p
<< dendl
;
2413 vector
<spg_t
> to_remove
;
2414 to_remove
.push_back(spg_t(info
.pgid
.pgid
, p
->shard
));
2415 MOSDPGRemove
*m
= new MOSDPGRemove(
2416 get_osdmap()->get_epoch(),
2418 osd
->send_message_osd_cluster(p
->osd
, m
, get_osdmap()->get_epoch());
2420 dout(10) << "not sending PGRemove to down osd." << *p
<< dendl
;
2422 peer_missing
.erase(*p
);
2423 peer_info
.erase(*p
);
2424 peer_purged
.insert(*p
);
2428 // if we removed anyone, update peers (which include peer_info)
2430 update_heartbeat_peers();
2434 // clear _requested maps; we may have to peer() again if we discover
2435 // (more) stray content
2436 peer_log_requested
.clear();
2437 peer_missing_requested
.clear();
2440 void PG::set_probe_targets(const set
<pg_shard_t
> &probe_set
)
2442 Mutex::Locker
l(heartbeat_peer_lock
);
2443 probe_targets
.clear();
2444 for (set
<pg_shard_t
>::iterator i
= probe_set
.begin();
2445 i
!= probe_set
.end();
2447 probe_targets
.insert(i
->osd
);
2451 void PG::clear_probe_targets()
2453 Mutex::Locker
l(heartbeat_peer_lock
);
2454 probe_targets
.clear();
2457 void PG::update_heartbeat_peers()
2459 assert(is_locked());
2465 for (unsigned i
=0; i
<acting
.size(); i
++) {
2466 if (acting
[i
] != CRUSH_ITEM_NONE
)
2467 new_peers
.insert(acting
[i
]);
2469 for (unsigned i
=0; i
<up
.size(); i
++) {
2470 if (up
[i
] != CRUSH_ITEM_NONE
)
2471 new_peers
.insert(up
[i
]);
2473 for (map
<pg_shard_t
,pg_info_t
>::iterator p
= peer_info
.begin();
2474 p
!= peer_info
.end();
2476 new_peers
.insert(p
->first
.osd
);
2478 bool need_update
= false;
2479 heartbeat_peer_lock
.Lock();
2480 if (new_peers
== heartbeat_peers
) {
2481 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " unchanged" << dendl
;
2483 dout(10) << "update_heartbeat_peers " << heartbeat_peers
<< " -> " << new_peers
<< dendl
;
2484 heartbeat_peers
.swap(new_peers
);
2487 heartbeat_peer_lock
.Unlock();
2490 osd
->need_heartbeat_peer_update();
2494 bool PG::check_in_progress_op(
2495 const osd_reqid_t
&r
,
2496 eversion_t
*version
,
2497 version_t
*user_version
,
2498 int *return_code
) const
2501 projected_log
.get_request(r
, version
, user_version
, return_code
) ||
2502 pg_log
.get_log().get_request(r
, version
, user_version
, return_code
));
2505 void PG::_update_calc_stats()
2507 info
.stats
.version
= info
.last_update
;
2508 info
.stats
.created
= info
.history
.epoch_created
;
2509 info
.stats
.last_scrub
= info
.history
.last_scrub
;
2510 info
.stats
.last_scrub_stamp
= info
.history
.last_scrub_stamp
;
2511 info
.stats
.last_deep_scrub
= info
.history
.last_deep_scrub
;
2512 info
.stats
.last_deep_scrub_stamp
= info
.history
.last_deep_scrub_stamp
;
2513 info
.stats
.last_clean_scrub_stamp
= info
.history
.last_clean_scrub_stamp
;
2514 info
.stats
.last_epoch_clean
= info
.history
.last_epoch_clean
;
2516 info
.stats
.log_size
= pg_log
.get_head().version
- pg_log
.get_tail().version
;
2517 info
.stats
.ondisk_log_size
= info
.stats
.log_size
;
2518 info
.stats
.log_start
= pg_log
.get_tail();
2519 info
.stats
.ondisk_log_start
= pg_log
.get_tail();
2521 // If actingset is larger then upset we will have misplaced,
2522 // so we will report based on actingset size.
2524 // If upset is larger then we will have degraded,
2525 // so we will report based on upset size.
2527 // If target is the largest of them all, it will contribute to
2528 // the degraded count because num_object_copies is
2529 // computed using target and eventual used to get degraded total.
2531 unsigned target
= get_osdmap()->get_pg_size(info
.pgid
.pgid
);
2532 unsigned nrep
= MAX(actingset
.size(), upset
.size());
2533 // calc num_object_copies
2534 info
.stats
.stats
.calc_copies(MAX(target
, nrep
));
2535 info
.stats
.stats
.sum
.num_objects_degraded
= 0;
2536 info
.stats
.stats
.sum
.num_objects_unfound
= 0;
2537 info
.stats
.stats
.sum
.num_objects_misplaced
= 0;
2538 if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2539 // NOTE: we only generate copies, degraded, misplaced and unfound
2540 // values for the summation, not individual stat categories.
2541 int64_t num_objects
= info
.stats
.stats
.sum
.num_objects
;
2543 // Total sum of all missing
2544 int64_t missing
= 0;
2545 // Objects that have arrived backfilled to up OSDs (not in acting)
2546 int64_t backfilled
= 0;
2547 // A misplaced object is not stored on the correct OSD
2548 int64_t misplaced
= 0;
2549 // Total of object copies/shards found
2550 int64_t object_copies
= 0;
2552 // num_objects_missing on each peer
2553 for (map
<pg_shard_t
, pg_info_t
>::iterator pi
=
2555 pi
!= peer_info
.end();
2557 map
<pg_shard_t
, pg_missing_t
>::const_iterator pm
=
2558 peer_missing
.find(pi
->first
);
2559 if (pm
!= peer_missing
.end()) {
2560 pi
->second
.stats
.stats
.sum
.num_objects_missing
=
2561 pm
->second
.num_missing();
2565 assert(!actingbackfill
.empty());
2566 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
2567 i
!= actingbackfill
.end();
2569 const pg_shard_t
&p
= *i
;
2571 bool in_up
= (upset
.find(p
) != upset
.end());
2572 bool in_acting
= (actingset
.find(p
) != actingset
.end());
2573 assert(in_up
|| in_acting
);
2575 // in acting Compute total objects excluding num_missing
2576 // in acting and not in up Compute misplaced objects excluding num_missing
2577 // in up and not in acting Compute total objects already backfilled
2579 unsigned osd_missing
;
2581 if (p
== pg_whoami
) {
2582 osd_missing
= pg_log
.get_missing().num_missing();
2583 info
.stats
.stats
.sum
.num_objects_missing_on_primary
=
2585 object_copies
+= num_objects
; // My local (primary) count
2587 assert(peer_missing
.count(p
));
2588 osd_missing
= peer_missing
[p
].num_missing();
2589 object_copies
+= peer_info
[p
].stats
.stats
.sum
.num_objects
;
2591 missing
+= osd_missing
;
2592 // Count non-missing objects not in up as misplaced
2593 if (!in_up
&& num_objects
> osd_missing
)
2594 misplaced
+= num_objects
- osd_missing
;
2596 assert(in_up
&& !in_acting
);
2598 // If this peer has more objects then it should, ignore them
2599 backfilled
+= MIN(num_objects
, peer_info
[p
].stats
.stats
.sum
.num_objects
);
2603 // Any objects that have been backfilled to up OSDs can deducted from misplaced
2604 misplaced
= MAX(0, misplaced
- backfilled
);
2606 // Deduct computed total missing on acting nodes
2607 object_copies
-= missing
;
2608 // Include computed backfilled objects on up nodes
2609 object_copies
+= backfilled
;
2610 // a degraded objects has fewer replicas or EC shards than the
2611 // pool specifies. num_object_copies will never be smaller than target * num_copies.
2612 int64_t degraded
= MAX(0, info
.stats
.stats
.sum
.num_object_copies
- object_copies
);
2614 info
.stats
.stats
.sum
.num_objects_degraded
= degraded
;
2615 info
.stats
.stats
.sum
.num_objects_unfound
= get_num_unfound();
2616 info
.stats
.stats
.sum
.num_objects_misplaced
= misplaced
;
2620 void PG::_update_blocked_by()
2622 // set a max on the number of blocking peers we report. if we go
2623 // over, report a random subset. keep the result sorted.
2624 unsigned keep
= MIN(blocked_by
.size(), cct
->_conf
->osd_max_pg_blocked_by
);
2625 unsigned skip
= blocked_by
.size() - keep
;
2626 info
.stats
.blocked_by
.clear();
2627 info
.stats
.blocked_by
.resize(keep
);
2629 for (set
<int>::iterator p
= blocked_by
.begin();
2630 p
!= blocked_by
.end() && keep
> 0;
2632 if (skip
> 0 && (rand() % (skip
+ keep
) < skip
)) {
2635 info
.stats
.blocked_by
[pos
++] = *p
;
2641 void PG::publish_stats_to_osd()
2646 pg_stats_publish_lock
.Lock();
2648 if (info
.stats
.stats
.sum
.num_scrub_errors
)
2649 state_set(PG_STATE_INCONSISTENT
);
2651 state_clear(PG_STATE_INCONSISTENT
);
2653 utime_t now
= ceph_clock_now();
2654 if (info
.stats
.state
!= state
) {
2655 info
.stats
.last_change
= now
;
2656 // Optimistic estimation, if we just find out an inactive PG,
2657 // assumt it is active till now.
2658 if (!(state
& PG_STATE_ACTIVE
) &&
2659 (info
.stats
.state
& PG_STATE_ACTIVE
))
2660 info
.stats
.last_active
= now
;
2662 if ((state
& PG_STATE_ACTIVE
) &&
2663 !(info
.stats
.state
& PG_STATE_ACTIVE
))
2664 info
.stats
.last_became_active
= now
;
2665 if ((state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)) &&
2666 !(info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
)))
2667 info
.stats
.last_became_peered
= now
;
2668 if (!(state
& PG_STATE_CREATING
) &&
2669 (info
.stats
.state
& PG_STATE_CREATING
)) {
2670 osd
->send_pg_created(get_pgid().pgid
);
2672 info
.stats
.state
= state
;
2675 _update_calc_stats();
2676 _update_blocked_by();
2678 bool publish
= false;
2679 pg_stat_t pre_publish
= info
.stats
;
2680 pre_publish
.stats
.add(unstable_stats
);
2681 utime_t cutoff
= now
;
2682 cutoff
-= cct
->_conf
->osd_pg_stat_report_interval_max
;
2683 if (pg_stats_publish_valid
&& pre_publish
== pg_stats_publish
&&
2684 info
.stats
.last_fresh
> cutoff
) {
2685 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
2686 << ": no change since " << info
.stats
.last_fresh
<< dendl
;
2688 // update our stat summary and timestamps
2689 info
.stats
.reported_epoch
= get_osdmap()->get_epoch();
2690 ++info
.stats
.reported_seq
;
2692 info
.stats
.last_fresh
= now
;
2694 if (info
.stats
.state
& PG_STATE_CLEAN
)
2695 info
.stats
.last_clean
= now
;
2696 if (info
.stats
.state
& PG_STATE_ACTIVE
)
2697 info
.stats
.last_active
= now
;
2698 if (info
.stats
.state
& (PG_STATE_ACTIVE
|PG_STATE_PEERED
))
2699 info
.stats
.last_peered
= now
;
2700 info
.stats
.last_unstale
= now
;
2701 if ((info
.stats
.state
& PG_STATE_DEGRADED
) == 0)
2702 info
.stats
.last_undegraded
= now
;
2703 if ((info
.stats
.state
& PG_STATE_UNDERSIZED
) == 0)
2704 info
.stats
.last_fullsized
= now
;
2706 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2707 // care of this by sending MMonMgrReport to mon.
2709 osd
->osd
->get_osdmap()->require_osd_release
< CEPH_RELEASE_LUMINOUS
;
2710 pg_stats_publish_valid
= true;
2711 pg_stats_publish
= pre_publish
;
2713 dout(15) << "publish_stats_to_osd " << pg_stats_publish
.reported_epoch
2714 << ":" << pg_stats_publish
.reported_seq
<< dendl
;
2716 pg_stats_publish_lock
.Unlock();
2719 osd
->pg_stat_queue_enqueue(this);
2722 void PG::clear_publish_stats()
2724 dout(15) << "clear_stats" << dendl
;
2725 pg_stats_publish_lock
.Lock();
2726 pg_stats_publish_valid
= false;
2727 pg_stats_publish_lock
.Unlock();
2729 osd
->pg_stat_queue_dequeue(this);
2733 * initialize a newly instantiated pg
2735 * Initialize PG state, as when a PG is initially created, or when it
2736 * is first instantiated on the current node.
2738 * @param role our role/rank
2739 * @param newup up set
2740 * @param newacting acting set
2741 * @param history pg history
2742 * @param pi past_intervals
2743 * @param backfill true if info should be marked as backfill
2744 * @param t transaction to write out our new state in
2748 const vector
<int>& newup
, int new_up_primary
,
2749 const vector
<int>& newacting
, int new_acting_primary
,
2750 const pg_history_t
& history
,
2751 const PastIntervals
& pi
,
2753 ObjectStore::Transaction
*t
)
2755 dout(10) << "init role " << role
<< " up " << newup
<< " acting " << newacting
2756 << " history " << history
2757 << " past_intervals " << pi
2763 init_primary_up_acting(
2767 new_acting_primary
);
2769 info
.history
= history
;
2770 past_intervals
= pi
;
2773 info
.stats
.up_primary
= new_up_primary
;
2774 info
.stats
.acting
= acting
;
2775 info
.stats
.acting_primary
= new_acting_primary
;
2776 info
.stats
.mapping_epoch
= info
.history
.same_interval_since
;
2779 dout(10) << __func__
<< ": Setting backfill" << dendl
;
2780 info
.set_last_backfill(hobject_t());
2781 info
.last_complete
= info
.last_update
;
2782 pg_log
.mark_log_for_rewrite();
2788 dirty_big_info
= true;
2792 #pragma GCC diagnostic ignored "-Wpragmas"
2793 #pragma GCC diagnostic push
2794 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2796 void PG::upgrade(ObjectStore
*store
)
2798 assert(info_struct_v
<= 10);
2799 ObjectStore::Transaction t
;
2801 assert(info_struct_v
>= 7);
2804 if (info_struct_v
<= 7) {
2805 pg_log
.mark_log_for_rewrite();
2806 ghobject_t
log_oid(OSD::make_pg_log_oid(pg_id
));
2807 ghobject_t
biginfo_oid(OSD::make_pg_biginfo_oid(pg_id
));
2808 t
.remove(coll_t::meta(), log_oid
);
2809 t
.remove(coll_t::meta(), biginfo_oid
);
2810 t
.touch(coll
, pgmeta_oid
);
2814 if (info_struct_v
<= 8) {
2815 // no special action needed.
2819 if (info_struct_v
<= 9) {
2820 // previous versions weren't (as) aggressively clearing past_intervals
2821 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
2822 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
2823 past_intervals
.clear();
2827 // update infover_key
2828 if (info_struct_v
< cur_struct_v
) {
2829 map
<string
,bufferlist
> v
;
2830 __u8 ver
= cur_struct_v
;
2831 ::encode(ver
, v
[infover_key
]);
2832 t
.omap_setkeys(coll
, pgmeta_oid
, v
);
2836 dirty_big_info
= true;
2839 ceph::shared_ptr
<ObjectStore::Sequencer
> osr (std::make_shared
<
2840 ObjectStore::Sequencer
>("upgrade"));
2841 int r
= store
->apply_transaction(osr
.get(), std::move(t
));
2843 derr
<< __func__
<< ": apply_transaction returned "
2844 << cpp_strerror(r
) << dendl
;
2850 if (!osr
->flush_commit(&waiter
)) {
2855 #pragma GCC diagnostic pop
2856 #pragma GCC diagnostic warning "-Wpragmas"
2858 int PG::_prepare_write_info(CephContext
* cct
,
2859 map
<string
,bufferlist
> *km
,
2861 pg_info_t
&info
, pg_info_t
&last_written_info
,
2862 PastIntervals
&past_intervals
,
2863 bool dirty_big_info
,
2866 PerfCounters
*logger
)
2869 ::encode(epoch
, (*km
)[epoch_key
]);
2873 logger
->inc(l_osd_pg_info
);
2875 // try to do info efficiently?
2876 if (!dirty_big_info
&& try_fast_info
&&
2877 info
.last_update
> last_written_info
.last_update
) {
2878 pg_fast_info_t fast
;
2879 fast
.populate_from(info
);
2880 bool did
= fast
.try_apply_to(&last_written_info
);
2881 assert(did
); // we verified last_update increased above
2882 if (info
== last_written_info
) {
2883 ::encode(fast
, (*km
)[fastinfo_key
]);
2885 logger
->inc(l_osd_pg_fastinfo
);
2888 generic_dout(30) << __func__
<< " fastinfo failed, info:\n";
2890 JSONFormatter
jf(true);
2891 jf
.dump_object("info", info
);
2895 *_dout
<< "\nlast_written_info:\n";
2896 JSONFormatter
jf(true);
2897 jf
.dump_object("last_written_info", last_written_info
);
2902 last_written_info
= info
;
2904 // info. store purged_snaps separately.
2905 interval_set
<snapid_t
> purged_snaps
;
2906 purged_snaps
.swap(info
.purged_snaps
);
2907 ::encode(info
, (*km
)[info_key
]);
2908 purged_snaps
.swap(info
.purged_snaps
);
2910 if (dirty_big_info
) {
2911 // potentially big stuff
2912 bufferlist
& bigbl
= (*km
)[biginfo_key
];
2913 ::encode(past_intervals
, bigbl
);
2914 ::encode(info
.purged_snaps
, bigbl
);
2915 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2917 logger
->inc(l_osd_pg_biginfo
);
2923 void PG::_create(ObjectStore::Transaction
& t
, spg_t pgid
, int bits
)
2926 t
.create_collection(coll
, bits
);
2929 void PG::_init(ObjectStore::Transaction
& t
, spg_t pgid
, const pg_pool_t
*pool
)
2934 // Give a hint to the PG collection
2936 uint32_t pg_num
= pool
->get_pg_num();
2937 uint64_t expected_num_objects_pg
= pool
->expected_num_objects
/ pg_num
;
2938 ::encode(pg_num
, hint
);
2939 ::encode(expected_num_objects_pg
, hint
);
2940 uint32_t hint_type
= ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS
;
2941 t
.collection_hint(coll
, hint_type
, hint
);
2944 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
2945 t
.touch(coll
, pgmeta_oid
);
2946 map
<string
,bufferlist
> values
;
2947 __u8 struct_v
= cur_struct_v
;
2948 ::encode(struct_v
, values
[infover_key
]);
2949 t
.omap_setkeys(coll
, pgmeta_oid
, values
);
2952 void PG::prepare_write_info(map
<string
,bufferlist
> *km
)
2954 info
.stats
.stats
.add(unstable_stats
);
2955 unstable_stats
.clear();
2957 bool need_update_epoch
= last_epoch
< get_osdmap()->get_epoch();
2958 int ret
= _prepare_write_info(cct
, km
, get_osdmap()->get_epoch(),
2962 dirty_big_info
, need_update_epoch
,
2963 cct
->_conf
->osd_fast_info
,
2966 if (need_update_epoch
)
2967 last_epoch
= get_osdmap()->get_epoch();
2968 last_persisted_osdmap_ref
= osdmap_ref
;
2971 dirty_big_info
= false;
2974 #pragma GCC diagnostic ignored "-Wpragmas"
2975 #pragma GCC diagnostic push
2976 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2978 bool PG::_has_removal_flag(ObjectStore
*store
,
2982 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
2984 // first try new way
2986 keys
.insert("_remove");
2987 map
<string
,bufferlist
> values
;
2988 if (store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
) == 0 &&
2995 int PG::peek_map_epoch(ObjectStore
*store
,
3001 ghobject_t
legacy_infos_oid(OSD::make_infos_oid());
3002 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3003 epoch_t cur_epoch
= 0;
3007 // validate collection name
3008 assert(coll
.is_pg());
3013 keys
.insert(infover_key
);
3014 keys
.insert(epoch_key
);
3015 map
<string
,bufferlist
> values
;
3016 int r
= store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
);
3018 assert(values
.size() == 2);
3020 // sanity check version
3021 bufferlist::iterator bp
= values
[infover_key
].begin();
3023 ::decode(struct_v
, bp
);
3024 assert(struct_v
>= 8);
3027 bp
= values
[epoch_key
].begin();
3028 ::decode(cur_epoch
, bp
);
3030 // probably bug 10617; see OSD::load_pgs()
3034 *pepoch
= cur_epoch
;
3038 #pragma GCC diagnostic pop
3039 #pragma GCC diagnostic warning "-Wpragmas"
3041 void PG::write_if_dirty(ObjectStore::Transaction
& t
)
3043 map
<string
,bufferlist
> km
;
3044 if (dirty_big_info
|| dirty_info
)
3045 prepare_write_info(&km
);
3046 pg_log
.write_log_and_missing(t
, &km
, coll
, pgmeta_oid
, pool
.info
.require_rollback());
3048 t
.omap_setkeys(coll
, pgmeta_oid
, km
);
3053 assert(is_primary());
3055 dout(10) << __func__
<< " to " << pg_trim_to
<< dendl
;
3056 if (pg_trim_to
!= eversion_t()) {
3057 // inform peers to trim log
3058 assert(!actingbackfill
.empty());
3059 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
3060 i
!= actingbackfill
.end();
3062 if (*i
== pg_whoami
) continue;
3063 osd
->send_message_osd_cluster(
3066 get_osdmap()->get_epoch(),
3067 spg_t(info
.pgid
.pgid
, i
->shard
),
3069 get_osdmap()->get_epoch());
3072 // trim primary as well
3073 pg_log
.trim(pg_trim_to
, info
);
3078 void PG::add_log_entry(const pg_log_entry_t
& e
, bool applied
)
3080 // raise last_complete only if we were previously up to date
3081 if (info
.last_complete
== info
.last_update
)
3082 info
.last_complete
= e
.version
;
3084 // raise last_update.
3085 assert(e
.version
> info
.last_update
);
3086 info
.last_update
= e
.version
;
3088 // raise user_version, if it increased (it may have not get bumped
3089 // by all logged updates)
3090 if (e
.user_version
> info
.last_user_version
)
3091 info
.last_user_version
= e
.user_version
;
3094 pg_log
.add(e
, applied
);
3095 dout(10) << "add_log_entry " << e
<< dendl
;
3099 void PG::append_log(
3100 const vector
<pg_log_entry_t
>& logv
,
3102 eversion_t roll_forward_to
,
3103 ObjectStore::Transaction
&t
,
3104 bool transaction_applied
)
3106 if (transaction_applied
)
3107 update_snap_map(logv
, t
);
3109 /* The primary has sent an info updating the history, but it may not
3110 * have arrived yet. We want to make sure that we cannot remember this
3111 * write without remembering that it happened in an interval which went
3112 * active in epoch history.last_epoch_started.
3114 if (info
.last_epoch_started
!= info
.history
.last_epoch_started
) {
3115 info
.history
.last_epoch_started
= info
.last_epoch_started
;
3117 if (info
.last_interval_started
!= info
.history
.last_interval_started
) {
3118 info
.history
.last_interval_started
= info
.last_interval_started
;
3120 dout(10) << "append_log " << pg_log
.get_log() << " " << logv
<< dendl
;
3122 PGLogEntryHandler handler
{this, &t
};
3123 if (!transaction_applied
) {
3124 /* We must be a backfill peer, so it's ok if we apply
3125 * out-of-turn since we won't be considered when
3126 * determining a min possible last_update.
3128 pg_log
.roll_forward(&handler
);
3131 for (vector
<pg_log_entry_t
>::const_iterator p
= logv
.begin();
3134 add_log_entry(*p
, transaction_applied
);
3136 /* We don't want to leave the rollforward artifacts around
3137 * here past last_backfill. It's ok for the same reason as
3139 if (transaction_applied
&&
3140 p
->soid
> info
.last_backfill
) {
3141 pg_log
.roll_forward(&handler
);
3144 auto last
= logv
.rbegin();
3145 if (is_primary() && last
!= logv
.rend()) {
3146 projected_log
.skip_can_rollback_to_to_head();
3147 projected_log
.trim(cct
, last
->version
, nullptr);
3150 if (transaction_applied
&& roll_forward_to
> pg_log
.get_can_rollback_to()) {
3151 pg_log
.roll_forward_to(
3154 t
.register_on_applied(
3155 new C_UpdateLastRollbackInfoTrimmedToApplied(
3157 get_osdmap()->get_epoch(),
3161 pg_log
.trim(trim_to
, info
);
3163 // update the local pg, pg log
3168 bool PG::check_log_for_corruption(ObjectStore
*store
)
3170 /// TODO: this method needs to work with the omap log
3174 //! Get the name we're going to save our corrupt page log as
3175 std::string
PG::get_corrupt_pg_log_name() const
3177 const int MAX_BUF
= 512;
3180 time_t my_time(time(NULL
));
3181 const struct tm
*t
= localtime_r(&my_time
, &tm_buf
);
3182 int ret
= strftime(buf
, sizeof(buf
), "corrupt_log_%Y-%m-%d_%k:%M_", t
);
3184 dout(0) << "strftime failed" << dendl
;
3185 return "corrupt_log_unknown_time";
3188 out
+= stringify(info
.pgid
);
3193 ObjectStore
*store
, spg_t pgid
, const coll_t
&coll
, bufferlist
&bl
,
3194 pg_info_t
&info
, PastIntervals
&past_intervals
,
3197 // try for v8 or later
3199 keys
.insert(infover_key
);
3200 keys
.insert(info_key
);
3201 keys
.insert(biginfo_key
);
3202 keys
.insert(fastinfo_key
);
3203 ghobject_t
pgmeta_oid(pgid
.make_pgmeta_oid());
3204 map
<string
,bufferlist
> values
;
3205 int r
= store
->omap_get_values(coll
, pgmeta_oid
, keys
, &values
);
3207 assert(values
.size() == 3 ||
3208 values
.size() == 4);
3210 bufferlist::iterator p
= values
[infover_key
].begin();
3211 ::decode(struct_v
, p
);
3212 assert(struct_v
>= 8);
3214 p
= values
[info_key
].begin();
3217 p
= values
[biginfo_key
].begin();
3218 if (struct_v
>= 10) {
3219 ::decode(past_intervals
, p
);
3221 past_intervals
.decode_classic(p
);
3223 ::decode(info
.purged_snaps
, p
);
3225 p
= values
[fastinfo_key
].begin();
3227 pg_fast_info_t fast
;
3229 fast
.try_apply_to(&info
);
3235 ghobject_t
infos_oid(OSD::make_infos_oid());
3236 bufferlist::iterator p
= bl
.begin();
3237 ::decode(struct_v
, p
);
3238 assert(struct_v
== 7);
3240 // get info out of leveldb
3241 string k
= get_info_key(info
.pgid
);
3242 string bk
= get_biginfo_key(info
.pgid
);
3247 store
->omap_get_values(coll_t::meta(), ghobject_t(infos_oid
), keys
, &values
);
3248 assert(values
.size() == 2);
3250 p
= values
[k
].begin();
3253 p
= values
[bk
].begin();
3254 ::decode(past_intervals
, p
);
3255 interval_set
<snapid_t
> snap_collections
; // obsolete
3256 ::decode(snap_collections
, p
);
3257 ::decode(info
.purged_snaps
, p
);
3261 void PG::read_state(ObjectStore
*store
, bufferlist
&bl
)
3263 int r
= read_info(store
, pg_id
, coll
, bl
, info
, past_intervals
,
3267 last_written_info
= info
;
3270 pg_log
.read_log_and_missing(
3273 info_struct_v
< 8 ? coll_t::meta() : coll
,
3274 ghobject_t(info_struct_v
< 8 ? OSD::make_pg_log_oid(pg_id
) : pgmeta_oid
),
3277 cct
->_conf
->osd_ignore_stale_divergent_priors
,
3278 cct
->_conf
->osd_debug_verify_missing_on_start
);
3280 osd
->clog
->error() << oss
.rdbuf();
3282 // log any weirdness
3286 void PG::log_weirdness()
3288 if (pg_log
.get_tail() != info
.log_tail
)
3289 osd
->clog
->error() << info
.pgid
3290 << " info mismatch, log.tail " << pg_log
.get_tail()
3291 << " != info.log_tail " << info
.log_tail
;
3292 if (pg_log
.get_head() != info
.last_update
)
3293 osd
->clog
->error() << info
.pgid
3294 << " info mismatch, log.head " << pg_log
.get_head()
3295 << " != info.last_update " << info
.last_update
;
3297 if (!pg_log
.get_log().empty()) {
3299 if ((pg_log
.get_log().log
.begin()->version
<= pg_log
.get_tail()))
3300 osd
->clog
->error() << info
.pgid
3301 << " log bound mismatch, info (" << pg_log
.get_tail() << ","
3302 << pg_log
.get_head() << "]"
3304 << pg_log
.get_log().log
.begin()->version
<< ","
3305 << pg_log
.get_log().log
.rbegin()->version
<< "]";
3308 if (pg_log
.get_log().caller_ops
.size() > pg_log
.get_log().log
.size()) {
3309 osd
->clog
->error() << info
.pgid
3310 << " caller_ops.size " << pg_log
.get_log().caller_ops
.size()
3311 << " > log size " << pg_log
.get_log().log
.size();
3315 void PG::update_snap_map(
3316 const vector
<pg_log_entry_t
> &log_entries
,
3317 ObjectStore::Transaction
&t
)
3319 for (vector
<pg_log_entry_t
>::const_iterator i
= log_entries
.begin();
3320 i
!= log_entries
.end();
3322 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
3323 if (i
->soid
.snap
< CEPH_MAXSNAP
) {
3324 if (i
->is_delete()) {
3325 int r
= snap_mapper
.remove_oid(
3329 } else if (i
->is_update()) {
3330 assert(i
->snaps
.length() > 0);
3331 vector
<snapid_t
> snaps
;
3332 bufferlist snapbl
= i
->snaps
;
3333 bufferlist::iterator p
= snapbl
.begin();
3339 set
<snapid_t
> _snaps(snaps
.begin(), snaps
.end());
3341 if (i
->is_clone() || i
->is_promote()) {
3342 snap_mapper
.add_oid(
3346 } else if (i
->is_modify()) {
3347 assert(i
->is_modify());
3348 int r
= snap_mapper
.update_snaps(
3355 assert(i
->is_clean());
3363 * filter trimming|trimmed snaps out of snapcontext
3365 void PG::filter_snapc(vector
<snapid_t
> &snaps
)
3367 //nothing needs to trim, we can return immediately
3368 if(snap_trimq
.empty() && info
.purged_snaps
.empty())
3371 bool filtering
= false;
3372 vector
<snapid_t
> newsnaps
;
3373 for (vector
<snapid_t
>::iterator p
= snaps
.begin();
3376 if (snap_trimq
.contains(*p
) || info
.purged_snaps
.contains(*p
)) {
3378 // start building a new vector with what we've seen so far
3379 dout(10) << "filter_snapc filtering " << snaps
<< dendl
;
3380 newsnaps
.insert(newsnaps
.begin(), snaps
.begin(), p
);
3383 dout(20) << "filter_snapc removing trimq|purged snap " << *p
<< dendl
;
3386 newsnaps
.push_back(*p
); // continue building new vector
3390 snaps
.swap(newsnaps
);
3391 dout(10) << "filter_snapc result " << snaps
<< dendl
;
3395 void PG::requeue_object_waiters(map
<hobject_t
, list
<OpRequestRef
>>& m
)
3397 for (map
<hobject_t
, list
<OpRequestRef
>>::iterator it
= m
.begin();
3400 requeue_ops(it
->second
);
3404 void PG::requeue_op(OpRequestRef op
)
3406 auto p
= waiting_for_map
.find(op
->get_source());
3407 if (p
!= waiting_for_map
.end()) {
3408 dout(20) << __func__
<< " " << op
<< " (waiting_for_map " << p
->first
<< ")"
3410 p
->second
.push_front(op
);
3412 dout(20) << __func__
<< " " << op
<< dendl
;
3413 osd
->enqueue_front(info
.pgid
, PGQueueable(op
, get_osdmap()->get_epoch()));
3417 void PG::requeue_ops(list
<OpRequestRef
> &ls
)
3419 for (list
<OpRequestRef
>::reverse_iterator i
= ls
.rbegin();
3422 auto p
= waiting_for_map
.find((*i
)->get_source());
3423 if (p
!= waiting_for_map
.end()) {
3424 dout(20) << __func__
<< " " << *i
<< " (waiting_for_map " << p
->first
3426 p
->second
.push_front(*i
);
3428 dout(20) << __func__
<< " " << *i
<< dendl
;
3429 osd
->enqueue_front(info
.pgid
, PGQueueable(*i
, get_osdmap()->get_epoch()));
3435 void PG::requeue_map_waiters()
3437 epoch_t epoch
= get_osdmap()->get_epoch();
3438 auto p
= waiting_for_map
.begin();
3439 while (p
!= waiting_for_map
.end()) {
3440 if (epoch
< p
->second
.front()->min_epoch
) {
3441 dout(20) << __func__
<< " " << p
->first
<< " front op "
3442 << p
->second
.front() << " must still wait, doing nothing"
3446 dout(20) << __func__
<< " " << p
->first
<< " " << p
->second
<< dendl
;
3447 for (auto q
= p
->second
.rbegin(); q
!= p
->second
.rend(); ++q
) {
3448 osd
->enqueue_front(info
.pgid
, PGQueueable(*q
, epoch
));
3450 p
= waiting_for_map
.erase(p
);
3456 // ==========================================================================================
3460 * when holding pg and sched_scrub_lock, then the states are:
3462 * scrubber.reserved = true
3463 * scrubber.reserved_peers includes whoami
3464 * osd->scrub_pending++
3465 * scheduling, replica declined:
3466 * scrubber.reserved = true
3467 * scrubber.reserved_peers includes -1
3468 * osd->scrub_pending++
3470 * scrubber.reserved = true
3471 * scrubber.reserved_peers.size() == acting.size();
3473 * osd->scrub_pending++
3475 * scrubber.reserved = false;
3476 * scrubber.reserved_peers empty
3477 * osd->scrubber.active++
3480 // returns true if a scrub has been newly kicked off
3481 bool PG::sched_scrub()
3483 bool nodeep_scrub
= false;
3484 assert(is_locked());
3485 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3489 double deep_scrub_interval
= 0;
3490 pool
.info
.opts
.get(pool_opts_t::DEEP_SCRUB_INTERVAL
, &deep_scrub_interval
);
3491 if (deep_scrub_interval
<= 0) {
3492 deep_scrub_interval
= cct
->_conf
->osd_deep_scrub_interval
;
3494 bool time_for_deep
= ceph_clock_now() >=
3495 info
.history
.last_deep_scrub_stamp
+ deep_scrub_interval
;
3497 bool deep_coin_flip
= false;
3498 // Only add random deep scrubs when NOT user initiated scrub
3499 if (!scrubber
.must_scrub
)
3500 deep_coin_flip
= (rand() % 100) < cct
->_conf
->osd_deep_scrub_randomize_ratio
* 100;
3501 dout(20) << __func__
<< ": time_for_deep=" << time_for_deep
<< " deep_coin_flip=" << deep_coin_flip
<< dendl
;
3503 time_for_deep
= (time_for_deep
|| deep_coin_flip
);
3505 //NODEEP_SCRUB so ignore time initiated deep-scrub
3506 if (osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB
) ||
3507 pool
.info
.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB
)) {
3508 time_for_deep
= false;
3509 nodeep_scrub
= true;
3512 if (!scrubber
.must_scrub
) {
3513 assert(!scrubber
.must_deep_scrub
);
3515 //NOSCRUB so skip regular scrubs
3516 if ((osd
->osd
->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB
) ||
3517 pool
.info
.has_flag(pg_pool_t::FLAG_NOSCRUB
)) && !time_for_deep
) {
3518 if (scrubber
.reserved
) {
3519 // cancel scrub if it is still in scheduling,
3520 // so pgs from other pools where scrub are still legal
3521 // have a chance to go ahead with scrubbing.
3522 clear_scrub_reserved();
3523 scrub_unreserve_replicas();
3529 if (cct
->_conf
->osd_scrub_auto_repair
3530 && get_pgbackend()->auto_repair_supported()
3532 // respect the command from user, and not do auto-repair
3533 && !scrubber
.must_repair
3534 && !scrubber
.must_scrub
3535 && !scrubber
.must_deep_scrub
) {
3536 dout(20) << __func__
<< ": auto repair with deep scrubbing" << dendl
;
3537 scrubber
.auto_repair
= true;
3539 // this happens when user issue the scrub/repair command during
3540 // the scheduling of the scrub/repair (e.g. request reservation)
3541 scrubber
.auto_repair
= false;
3545 if (!scrubber
.reserved
) {
3546 assert(scrubber
.reserved_peers
.empty());
3547 if (osd
->inc_scrubs_pending()) {
3548 dout(20) << "sched_scrub: reserved locally, reserving replicas" << dendl
;
3549 scrubber
.reserved
= true;
3550 scrubber
.reserved_peers
.insert(pg_whoami
);
3551 scrub_reserve_replicas();
3553 dout(20) << "sched_scrub: failed to reserve locally" << dendl
;
3557 if (scrubber
.reserved
) {
3558 if (scrubber
.reserve_failed
) {
3559 dout(20) << "sched_scrub: failed, a peer declined" << dendl
;
3560 clear_scrub_reserved();
3561 scrub_unreserve_replicas();
3563 } else if (scrubber
.reserved_peers
.size() == acting
.size()) {
3564 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl
;
3565 if (time_for_deep
) {
3566 dout(10) << "sched_scrub: scrub will be deep" << dendl
;
3567 state_set(PG_STATE_DEEP_SCRUB
);
3568 } else if (!scrubber
.must_deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
) {
3569 if (!nodeep_scrub
) {
3570 osd
->clog
->info() << "osd." << osd
->whoami
3571 << " pg " << info
.pgid
3572 << " Deep scrub errors, upgrading scrub to deep-scrub";
3573 state_set(PG_STATE_DEEP_SCRUB
);
3574 } else if (!scrubber
.must_scrub
) {
3575 osd
->clog
->error() << "osd." << osd
->whoami
3576 << " pg " << info
.pgid
3577 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3578 clear_scrub_reserved();
3579 scrub_unreserve_replicas();
3582 osd
->clog
->error() << "osd." << osd
->whoami
3583 << " pg " << info
.pgid
3584 << " Regular scrub request, losing deep-scrub details";
3589 // none declined, since scrubber.reserved is set
3590 dout(20) << "sched_scrub: reserved " << scrubber
.reserved_peers
<< ", waiting for replicas" << dendl
;
3597 void PG::reg_next_scrub()
3603 if (scrubber
.must_scrub
||
3604 (info
.stats
.stats_invalid
&& cct
->_conf
->osd_scrub_invalid_stats
)) {
3605 reg_stamp
= ceph_clock_now();
3607 reg_stamp
= info
.history
.last_scrub_stamp
;
3609 // note down the sched_time, so we can locate this scrub, and remove it
3611 double scrub_min_interval
= 0, scrub_max_interval
= 0;
3612 pool
.info
.opts
.get(pool_opts_t::SCRUB_MIN_INTERVAL
, &scrub_min_interval
);
3613 pool
.info
.opts
.get(pool_opts_t::SCRUB_MAX_INTERVAL
, &scrub_max_interval
);
3614 assert(scrubber
.scrub_reg_stamp
== utime_t());
3615 scrubber
.scrub_reg_stamp
= osd
->reg_pg_scrub(info
.pgid
,
3619 scrubber
.must_scrub
);
3622 void PG::unreg_next_scrub()
3625 osd
->unreg_pg_scrub(info
.pgid
, scrubber
.scrub_reg_stamp
);
3626 scrubber
.scrub_reg_stamp
= utime_t();
3630 void PG::do_replica_scrub_map(OpRequestRef op
)
3632 const MOSDRepScrubMap
*m
= static_cast<const MOSDRepScrubMap
*>(op
->get_req());
3633 dout(7) << __func__
<< " " << *m
<< dendl
;
3634 if (m
->map_epoch
< info
.history
.same_interval_since
) {
3635 dout(10) << __func__
<< " discarding old from "
3636 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
3640 if (!scrubber
.is_chunky_scrub_active()) {
3641 dout(10) << __func__
<< " scrub isn't active" << dendl
;
3647 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3648 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
3649 dout(10) << "map version is "
3650 << scrubber
.received_maps
[m
->from
].valid_through
3653 --scrubber
.waiting_on
;
3654 scrubber
.waiting_on_whom
.erase(m
->from
);
3655 if (scrubber
.waiting_on
== 0) {
3656 if (ops_blocked_by_scrub()) {
3657 requeue_scrub(true);
3659 requeue_scrub(false);
3664 void PG::sub_op_scrub_map(OpRequestRef op
)
3666 // for legacy jewel compatibility only
3667 const MOSDSubOp
*m
= static_cast<const MOSDSubOp
*>(op
->get_req());
3668 assert(m
->get_type() == MSG_OSD_SUBOP
);
3669 dout(7) << "sub_op_scrub_map" << dendl
;
3671 if (m
->map_epoch
< info
.history
.same_interval_since
) {
3672 dout(10) << "sub_op_scrub discarding old sub_op from "
3673 << m
->map_epoch
<< " < " << info
.history
.same_interval_since
<< dendl
;
3677 if (!scrubber
.is_chunky_scrub_active()) {
3678 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl
;
3684 dout(10) << " got " << m
->from
<< " scrub map" << dendl
;
3685 bufferlist::iterator p
= const_cast<bufferlist
&>(m
->get_data()).begin();
3687 scrubber
.received_maps
[m
->from
].decode(p
, info
.pgid
.pool());
3688 dout(10) << "map version is "
3689 << scrubber
.received_maps
[m
->from
].valid_through
3692 --scrubber
.waiting_on
;
3693 scrubber
.waiting_on_whom
.erase(m
->from
);
3695 if (scrubber
.waiting_on
== 0) {
3696 if (ops_blocked_by_scrub()) {
3697 requeue_scrub(true);
3699 requeue_scrub(false);
3704 // send scrub v3 messages (chunky scrub)
3705 void PG::_request_scrub_map(
3706 pg_shard_t replica
, eversion_t version
,
3707 hobject_t start
, hobject_t end
,
3708 bool deep
, uint32_t seed
)
3710 assert(replica
!= pg_whoami
);
3711 dout(10) << "scrub requesting scrubmap from osd." << replica
3712 << " deep " << (int)deep
<< " seed " << seed
<< dendl
;
3713 MOSDRepScrub
*repscrubop
= new MOSDRepScrub(
3714 spg_t(info
.pgid
.pgid
, replica
.shard
), version
,
3715 get_osdmap()->get_epoch(),
3716 get_last_peering_reset(),
3717 start
, end
, deep
, seed
);
3718 // default priority, we want the rep scrub processed prior to any recovery
3719 // or client io messages (we are holding a lock!)
3720 osd
->send_message_osd_cluster(
3721 replica
.osd
, repscrubop
, get_osdmap()->get_epoch());
3724 void PG::handle_scrub_reserve_request(OpRequestRef op
)
3726 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3728 if (scrubber
.reserved
) {
3729 dout(10) << __func__
<< " ignoring reserve request: Already reserved"
3733 scrubber
.reserved
= osd
->inc_scrubs_pending();
3734 if (op
->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE
) {
3735 const MOSDScrubReserve
*m
=
3736 static_cast<const MOSDScrubReserve
*>(op
->get_req());
3737 Message
*reply
= new MOSDScrubReserve(
3738 spg_t(info
.pgid
.pgid
, primary
.shard
),
3740 scrubber
.reserved
? MOSDScrubReserve::GRANT
: MOSDScrubReserve::REJECT
,
3742 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
3744 // for jewel compat only
3745 const MOSDSubOp
*req
= static_cast<const MOSDSubOp
*>(op
->get_req());
3746 assert(req
->get_type() == MSG_OSD_SUBOP
);
3747 MOSDSubOpReply
*reply
= new MOSDSubOpReply(
3748 req
, pg_whoami
, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK
);
3749 ::encode(scrubber
.reserved
, reply
->get_data());
3750 osd
->send_message_osd_cluster(reply
, op
->get_req()->get_connection());
3754 void PG::handle_scrub_reserve_grant(OpRequestRef op
, pg_shard_t from
)
3756 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3758 if (!scrubber
.reserved
) {
3759 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
3762 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
3763 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
3765 dout(10) << " osd." << from
<< " scrub reserve = success" << dendl
;
3766 scrubber
.reserved_peers
.insert(from
);
3771 void PG::handle_scrub_reserve_reject(OpRequestRef op
, pg_shard_t from
)
3773 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3775 if (!scrubber
.reserved
) {
3776 dout(10) << "ignoring obsolete scrub reserve reply" << dendl
;
3779 if (scrubber
.reserved_peers
.find(from
) != scrubber
.reserved_peers
.end()) {
3780 dout(10) << " already had osd." << from
<< " reserved" << dendl
;
3782 /* One decline stops this pg from being scheduled for scrubbing. */
3783 dout(10) << " osd." << from
<< " scrub reserve = fail" << dendl
;
3784 scrubber
.reserve_failed
= true;
3789 void PG::handle_scrub_reserve_release(OpRequestRef op
)
3791 dout(7) << __func__
<< " " << *op
->get_req() << dendl
;
3793 clear_scrub_reserved();
3796 void PG::reject_reservation()
3798 osd
->send_message_osd_cluster(
3800 new MBackfillReserve(
3801 MBackfillReserve::REJECT
,
3802 spg_t(info
.pgid
.pgid
, primary
.shard
),
3803 get_osdmap()->get_epoch()),
3804 get_osdmap()->get_epoch());
3807 void PG::schedule_backfill_full_retry()
3809 Mutex::Locker
lock(osd
->recovery_request_lock
);
3810 osd
->recovery_request_timer
.add_event_after(
3811 cct
->_conf
->osd_backfill_retry_interval
,
3812 new QueuePeeringEvt
<RequestBackfill
>(
3813 this, get_osdmap()->get_epoch(),
3814 RequestBackfill()));
3817 void PG::schedule_recovery_full_retry()
3819 Mutex::Locker
lock(osd
->recovery_request_lock
);
3820 osd
->recovery_request_timer
.add_event_after(
3821 cct
->_conf
->osd_recovery_retry_interval
,
3822 new QueuePeeringEvt
<DoRecovery
>(
3823 this, get_osdmap()->get_epoch(),
3827 void PG::clear_scrub_reserved()
3829 scrubber
.reserved_peers
.clear();
3830 scrubber
.reserve_failed
= false;
3832 if (scrubber
.reserved
) {
3833 scrubber
.reserved
= false;
3834 osd
->dec_scrubs_pending();
3838 void PG::scrub_reserve_replicas()
3840 assert(backfill_targets
.empty());
3841 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
3842 i
!= actingbackfill
.end();
3844 if (*i
== pg_whoami
) continue;
3845 dout(10) << "scrub requesting reserve from osd." << *i
<< dendl
;
3846 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
3847 osd
->send_message_osd_cluster(
3849 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
3850 get_osdmap()->get_epoch(),
3851 MOSDScrubReserve::REQUEST
, pg_whoami
),
3852 get_osdmap()->get_epoch());
3854 // for jewel compat only
3855 vector
<OSDOp
> scrub(1);
3856 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_RESERVE
;
3860 MOSDSubOp
*subop
= new MOSDSubOp(
3861 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
3862 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
3864 osd
->send_message_osd_cluster(
3865 i
->osd
, subop
, get_osdmap()->get_epoch());
3870 void PG::scrub_unreserve_replicas()
3872 assert(backfill_targets
.empty());
3873 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
3874 i
!= actingbackfill
.end();
3876 if (*i
== pg_whoami
) continue;
3877 dout(10) << "scrub requesting unreserve from osd." << *i
<< dendl
;
3878 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS
)) {
3879 osd
->send_message_osd_cluster(
3881 new MOSDScrubReserve(spg_t(info
.pgid
.pgid
, i
->shard
),
3882 get_osdmap()->get_epoch(),
3883 MOSDScrubReserve::RELEASE
, pg_whoami
),
3884 get_osdmap()->get_epoch());
3886 // for jewel compat only
3887 vector
<OSDOp
> scrub(1);
3888 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_UNRESERVE
;
3892 MOSDSubOp
*subop
= new MOSDSubOp(
3893 reqid
, pg_whoami
, spg_t(info
.pgid
.pgid
, i
->shard
), poid
, 0,
3894 get_osdmap()->get_epoch(), osd
->get_tid(), v
);
3896 osd
->send_message_osd_cluster(i
->osd
, subop
, get_osdmap()->get_epoch());
3901 void PG::_scan_rollback_obs(
3902 const vector
<ghobject_t
> &rollback_obs
,
3903 ThreadPool::TPHandle
&handle
)
3905 ObjectStore::Transaction t
;
3906 eversion_t trimmed_to
= last_rollback_info_trimmed_to_applied
;
3907 for (vector
<ghobject_t
>::const_iterator i
= rollback_obs
.begin();
3908 i
!= rollback_obs
.end();
3910 if (i
->generation
< trimmed_to
.version
) {
3911 osd
->clog
->error() << "osd." << osd
->whoami
3912 << " pg " << info
.pgid
3913 << " found obsolete rollback obj "
3914 << *i
<< " generation < trimmed_to "
3921 derr
<< __func__
<< ": queueing trans to clean up obsolete rollback objs"
3923 osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
3927 void PG::_scan_snaps(ScrubMap
&smap
)
3931 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
3932 i
!= smap
.objects
.rend();
3934 const hobject_t
&hoid
= i
->first
;
3935 ScrubMap::object
&o
= i
->second
;
3937 if (hoid
.is_head() || hoid
.is_snapdir()) {
3938 // parse the SnapSet
3940 if (o
.attrs
.find(SS_ATTR
) == o
.attrs
.end()) {
3943 bl
.push_back(o
.attrs
[SS_ATTR
]);
3944 auto p
= bl
.begin();
3946 ::decode(snapset
, p
);
3950 head
= hoid
.get_head();
3953 if (hoid
.snap
< CEPH_MAXSNAP
) {
3954 // check and if necessary fix snap_mapper
3955 if (hoid
.get_head() != head
) {
3956 derr
<< __func__
<< " no head for " << hoid
<< " (have " << head
<< ")"
3960 set
<snapid_t
> obj_snaps
;
3961 if (!snapset
.is_legacy()) {
3962 auto p
= snapset
.clone_snaps
.find(hoid
.snap
);
3963 if (p
== snapset
.clone_snaps
.end()) {
3964 derr
<< __func__
<< " no clone_snaps for " << hoid
<< " in " << snapset
3968 obj_snaps
.insert(p
->second
.begin(), p
->second
.end());
3971 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
3974 bl
.push_back(o
.attrs
[OI_ATTR
]);
3981 obj_snaps
.insert(oi
.legacy_snaps
.begin(), oi
.legacy_snaps
.end());
3983 set
<snapid_t
> cur_snaps
;
3984 int r
= snap_mapper
.get_snaps(hoid
, &cur_snaps
);
3985 if (r
!= 0 && r
!= -ENOENT
) {
3986 derr
<< __func__
<< ": get_snaps returned " << cpp_strerror(r
) << dendl
;
3989 if (r
== -ENOENT
|| cur_snaps
!= obj_snaps
) {
3990 ObjectStore::Transaction t
;
3991 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
3993 r
= snap_mapper
.remove_oid(hoid
, &_t
);
3995 derr
<< __func__
<< ": remove_oid returned " << cpp_strerror(r
)
3999 osd
->clog
->error() << "osd." << osd
->whoami
4000 << " found snap mapper error on pg "
4002 << " oid " << hoid
<< " snaps in mapper: "
4003 << cur_snaps
<< ", oi: "
4007 osd
->clog
->error() << "osd." << osd
->whoami
4008 << " found snap mapper error on pg "
4010 << " oid " << hoid
<< " snaps missing in mapper"
4015 snap_mapper
.add_oid(hoid
, obj_snaps
, &_t
);
4016 r
= osd
->store
->apply_transaction(osr
.get(), std::move(t
));
4018 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
4026 void PG::_repair_oinfo_oid(ScrubMap
&smap
)
4028 for (map
<hobject_t
, ScrubMap::object
>::reverse_iterator i
= smap
.objects
.rbegin();
4029 i
!= smap
.objects
.rend();
4031 const hobject_t
&hoid
= i
->first
;
4032 ScrubMap::object
&o
= i
->second
;
4035 if (o
.attrs
.find(OI_ATTR
) == o
.attrs
.end()) {
4038 bl
.push_back(o
.attrs
[OI_ATTR
]);
4045 if (oi
.soid
!= hoid
) {
4046 ObjectStore::Transaction t
;
4047 OSDriver::OSTransaction
_t(osdriver
.get_transaction(&t
));
4048 osd
->clog
->error() << "osd." << osd
->whoami
4049 << " found object info error on pg "
4051 << " oid " << hoid
<< " oid in object info: "
4057 ::encode(oi
, bl
, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD
, nullptr));
4059 bufferptr
bp(bl
.c_str(), bl
.length());
4060 o
.attrs
[OI_ATTR
] = bp
;
4062 t
.setattr(coll
, ghobject_t(hoid
), OI_ATTR
, bl
);
4063 int r
= osd
->store
->apply_transaction(osr
.get(), std::move(t
));
4065 derr
<< __func__
<< ": apply_transaction got " << cpp_strerror(r
)
4073 * build a scrub map over a chunk without releasing the lock
4074 * only used by chunky scrub
4076 int PG::build_scrub_map_chunk(
4078 hobject_t start
, hobject_t end
, bool deep
, uint32_t seed
,
4079 ThreadPool::TPHandle
&handle
)
4081 dout(10) << __func__
<< " [" << start
<< "," << end
<< ") "
4082 << " seed " << seed
<< dendl
;
4084 map
.valid_through
= info
.last_update
;
4087 vector
<hobject_t
> ls
;
4088 vector
<ghobject_t
> rollback_obs
;
4089 int ret
= get_pgbackend()->objects_list_range(
4096 dout(5) << "objects_list_range error: " << ret
<< dendl
;
4101 get_pgbackend()->be_scan_list(map
, ls
, deep
, seed
, handle
);
4102 _scan_rollback_obs(rollback_obs
, handle
);
4104 _repair_oinfo_oid(map
);
4106 dout(20) << __func__
<< " done" << dendl
;
4110 void PG::Scrubber::cleanup_store(ObjectStore::Transaction
*t
) {
4113 struct OnComplete
: Context
{
4114 std::unique_ptr
<Scrub::Store
> store
;
4116 std::unique_ptr
<Scrub::Store
> &&store
)
4117 : store(std::move(store
)) {}
4118 void finish(int) override
{}
4121 t
->register_on_complete(new OnComplete(std::move(store
)));
4125 void PG::repair_object(
4126 const hobject_t
& soid
, list
<pair
<ScrubMap::object
, pg_shard_t
> > *ok_peers
,
4127 pg_shard_t bad_peer
)
4129 list
<pg_shard_t
> op_shards
;
4130 for (auto i
: *ok_peers
) {
4131 op_shards
.push_back(i
.second
);
4133 dout(10) << "repair_object " << soid
<< " bad_peer osd."
4134 << bad_peer
<< " ok_peers osd.{" << op_shards
<< "}" << dendl
;
4135 ScrubMap::object
&po
= ok_peers
->back().first
;
4138 bv
.push_back(po
.attrs
[OI_ATTR
]);
4141 bufferlist::iterator bliter
= bv
.begin();
4142 ::decode(oi
, bliter
);
4144 dout(0) << __func__
<< ": Need version of replica, bad object_info_t: " << soid
<< dendl
;
4147 if (bad_peer
!= primary
) {
4148 peer_missing
[bad_peer
].add(soid
, oi
.version
, eversion_t());
4150 // We should only be scrubbing if the PG is clean.
4151 assert(waiting_for_unreadable_object
.empty());
4153 pg_log
.missing_add(soid
, oi
.version
, eversion_t());
4155 pg_log
.set_last_requested(0);
4156 dout(10) << __func__
<< ": primary = " << primary
<< dendl
;
4159 if (is_ec_pg() || bad_peer
== primary
) {
4160 // we'd better collect all shard for EC pg, and prepare good peers as the
4161 // source of pull in the case of replicated pg.
4162 missing_loc
.add_missing(soid
, oi
.version
, eversion_t());
4163 list
<pair
<ScrubMap::object
, pg_shard_t
> >::iterator i
;
4164 for (i
= ok_peers
->begin();
4165 i
!= ok_peers
->end();
4167 missing_loc
.add_location(soid
, i
->second
);
4173 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4174 * for pushes to complete in case of recent recovery. Build a single
4175 * scrubmap of objects that are in the range [msg->start, msg->end).
4177 void PG::replica_scrub(
4179 ThreadPool::TPHandle
&handle
)
4181 const MOSDRepScrub
*msg
= static_cast<const MOSDRepScrub
*>(op
->get_req());
4182 assert(!scrubber
.active_rep_scrub
);
4183 dout(7) << "replica_scrub" << dendl
;
4185 if (msg
->map_epoch
< info
.history
.same_interval_since
) {
4186 dout(10) << "replica_scrub discarding old replica_scrub from "
4187 << msg
->map_epoch
<< " < " << info
.history
.same_interval_since
4194 assert(msg
->chunky
);
4195 if (last_update_applied
< msg
->scrub_to
) {
4196 dout(10) << "waiting for last_update_applied to catch up" << dendl
;
4197 scrubber
.active_rep_scrub
= op
;
4201 if (active_pushes
> 0) {
4202 dout(10) << "waiting for active pushes to finish" << dendl
;
4203 scrubber
.active_rep_scrub
= op
;
4207 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4208 hobject_t start
= msg
->start
;
4209 hobject_t end
= msg
->end
;
4210 if (!start
.is_max())
4211 start
.pool
= info
.pgid
.pool();
4213 end
.pool
= info
.pgid
.pool();
4215 build_scrub_map_chunk(
4216 map
, start
, end
, msg
->deep
, msg
->seed
,
4219 if (HAVE_FEATURE(acting_features
, SERVER_LUMINOUS
)) {
4220 MOSDRepScrubMap
*reply
= new MOSDRepScrubMap(
4221 spg_t(info
.pgid
.pgid
, get_primary().shard
),
4224 ::encode(map
, reply
->get_data());
4225 osd
->send_message_osd_cluster(reply
, msg
->get_connection());
4227 // for jewel compatibility
4228 vector
<OSDOp
> scrub(1);
4229 scrub
[0].op
.op
= CEPH_OSD_OP_SCRUB_MAP
;
4233 MOSDSubOp
*subop
= new MOSDSubOp(
4236 spg_t(info
.pgid
.pgid
, get_primary().shard
),
4242 ::encode(map
, subop
->get_data());
4244 osd
->send_message_osd_cluster(subop
, msg
->get_connection());
4249 * PG_STATE_SCRUBBING is set when the scrub is queued
4251 * scrub will be chunky if all OSDs in PG support chunky scrub
4252 * scrub will fail if OSDs are too old.
4254 void PG::scrub(epoch_t queued
, ThreadPool::TPHandle
&handle
)
4256 if (cct
->_conf
->osd_scrub_sleep
> 0 &&
4257 (scrubber
.state
== PG::Scrubber::NEW_CHUNK
||
4258 scrubber
.state
== PG::Scrubber::INACTIVE
) &&
4259 scrubber
.needs_sleep
) {
4260 ceph_assert(!scrubber
.sleeping
);
4261 dout(20) << __func__
<< " state is INACTIVE|NEW_CHUNK, sleeping" << dendl
;
4263 // Do an async sleep so we don't block the op queue
4264 OSDService
*osds
= osd
;
4265 spg_t pgid
= get_pgid();
4266 int state
= scrubber
.state
;
4267 auto scrub_requeue_callback
=
4268 new FunctionContext([osds
, pgid
, state
](int r
) {
4269 PG
*pg
= osds
->osd
->lookup_lock_pg(pgid
);
4270 if (pg
== nullptr) {
4271 lgeneric_dout(osds
->osd
->cct
, 20)
4272 << "scrub_requeue_callback: Could not find "
4273 << "PG " << pgid
<< " can't complete scrub requeue after sleep"
4277 pg
->scrubber
.sleeping
= false;
4278 pg
->scrubber
.needs_sleep
= false;
4279 lgeneric_dout(pg
->cct
, 20)
4280 << "scrub_requeue_callback: slept for "
4281 << ceph_clock_now() - pg
->scrubber
.sleep_start
4282 << ", re-queuing scrub with state " << state
<< dendl
;
4283 pg
->scrub_queued
= false;
4284 pg
->requeue_scrub();
4285 pg
->scrubber
.sleep_start
= utime_t();
4288 Mutex::Locker
l(osd
->scrub_sleep_lock
);
4289 osd
->scrub_sleep_timer
.add_event_after(cct
->_conf
->osd_scrub_sleep
,
4290 scrub_requeue_callback
);
4291 scrubber
.sleeping
= true;
4292 scrubber
.sleep_start
= ceph_clock_now();
4295 if (pg_has_reset_since(queued
)) {
4298 assert(scrub_queued
);
4299 scrub_queued
= false;
4300 scrubber
.needs_sleep
= true;
4302 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4303 dout(10) << "scrub -- not primary or active or not clean" << dendl
;
4304 state_clear(PG_STATE_SCRUBBING
);
4305 state_clear(PG_STATE_REPAIR
);
4306 state_clear(PG_STATE_DEEP_SCRUB
);
4307 publish_stats_to_osd();
4311 if (!scrubber
.active
) {
4312 assert(backfill_targets
.empty());
4314 scrubber
.deep
= state_test(PG_STATE_DEEP_SCRUB
);
4316 dout(10) << "starting a new chunky scrub" << dendl
;
4319 chunky_scrub(handle
);
4323 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4326 * The object store is partitioned into chunks which end on hash boundaries. For
4327 * each chunk, the following logic is performed:
4329 * (1) Block writes on the chunk
4330 * (2) Request maps from replicas
4331 * (3) Wait for pushes to be applied (after recovery)
4332 * (4) Wait for writes to flush on the chunk
4333 * (5) Wait for maps from replicas
4334 * (6) Compare / repair all scrub maps
4335 * (7) Wait for digest updates to apply
4337 * This logic is encoded in the mostly linear state machine:
4339 * +------------------+
4340 * _________v__________ |
4343 * |____________________| |
4346 * _________v___v______ | |
4349 * |____________________| | |
4351 * _________v__________ | |
4353 * | WAIT_PUSHES | | |
4354 * |____________________| | |
4356 * _________v__________ | |
4358 * | WAIT_LAST_UPDATE | | |
4359 * |____________________| | |
4361 * _________v__________ | |
4364 * |____________________| | |
4366 * _________v__________ | |
4368 * | WAIT_REPLICAS | | |
4369 * |____________________| | |
4371 * _________v__________ | |
4373 * | COMPARE_MAPS | | |
4374 * |____________________| | |
4377 * _________v__________ | |
4379 * |WAIT_DIGEST_UPDATES | | |
4380 * |____________________| | |
4383 * _________v__________ |
4386 * |____________________| |
4388 * +------------------+
4390 * The primary determines the last update from the subset by walking the log. If
4391 * it sees a log entry pertaining to a file in the chunk, it tells the replicas
4392 * to wait until that update is applied before building a scrub map. Both the
4393 * primary and replicas will wait for any active pushes to be applied.
4395 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4397 * scrubber.state encodes the current state of the scrub (refer to state diagram
4400 void PG::chunky_scrub(ThreadPool::TPHandle
&handle
)
4402 // check for map changes
4403 if (scrubber
.is_chunky_scrub_active()) {
4404 if (scrubber
.epoch_start
!= info
.history
.same_interval_since
) {
4405 dout(10) << "scrub pg changed, aborting" << dendl
;
4406 scrub_clear_state();
4407 scrub_unreserve_replicas();
4416 dout(20) << "scrub state " << Scrubber::state_string(scrubber
.state
)
4417 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")" << dendl
;
4419 switch (scrubber
.state
) {
4420 case PG::Scrubber::INACTIVE
:
4421 dout(10) << "scrub start" << dendl
;
4423 publish_stats_to_osd();
4424 scrubber
.epoch_start
= info
.history
.same_interval_since
;
4425 scrubber
.active
= true;
4427 osd
->inc_scrubs_active(scrubber
.reserved
);
4428 if (scrubber
.reserved
) {
4429 scrubber
.reserved
= false;
4430 scrubber
.reserved_peers
.clear();
4434 ObjectStore::Transaction t
;
4435 scrubber
.cleanup_store(&t
);
4436 scrubber
.store
.reset(Scrub::Store::create(osd
->store
, &t
,
4438 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
4441 // Don't include temporary objects when scrubbing
4442 scrubber
.start
= info
.pgid
.pgid
.get_hobj_start();
4443 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4446 bool repair
= state_test(PG_STATE_REPAIR
);
4447 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4448 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4450 oss
<< info
.pgid
.pgid
<< " " << mode
<< " starts" << std::endl
;
4451 osd
->clog
->info(oss
);
4458 case PG::Scrubber::NEW_CHUNK
:
4459 scrubber
.primary_scrubmap
= ScrubMap();
4460 scrubber
.received_maps
.clear();
4463 /* get the start and end of our scrub chunk
4465 * Our scrub chunk has an important restriction we're going to need to
4466 * respect. We can't let head or snapdir be start or end.
4467 * Using a half-open interval means that if end == head|snapdir,
4468 * we'd scrub/lock head and the clone right next to head in different
4469 * chunks which would allow us to miss clones created between
4470 * scrubbing that chunk and scrubbing the chunk including head.
4471 * This isn't true for any of the other clones since clones can
4472 * only be created "just to the left of" head. There is one exception
4473 * to this: promotion of clones which always happens to the left of the
4474 * left-most clone, but promote_object checks the scrubber in that
4475 * case, so it should be ok. Also, it's ok to "miss" clones at the
4476 * left end of the range if we are a tier because they may legitimately
4477 * not exist (see _scrub).
4479 int min
= MAX(3, cct
->_conf
->osd_scrub_chunk_min
);
4480 hobject_t start
= scrubber
.start
;
4481 hobject_t candidate_end
;
4482 vector
<hobject_t
> objects
;
4483 ret
= get_pgbackend()->objects_list_partial(
4486 MAX(min
, cct
->_conf
->osd_scrub_chunk_max
),
4491 if (!objects
.empty()) {
4492 hobject_t back
= objects
.back();
4493 while (candidate_end
.has_snapset() &&
4494 candidate_end
.get_head() == back
.get_head()) {
4495 candidate_end
= back
;
4497 if (objects
.empty()) {
4499 "Somehow we got more than 2 objects which"
4500 "have the same head but are not clones");
4502 back
= objects
.back();
4504 if (candidate_end
.has_snapset()) {
4505 assert(candidate_end
.get_head() != back
.get_head());
4506 candidate_end
= candidate_end
.get_object_boundary();
4509 assert(candidate_end
.is_max());
4512 if (!_range_available_for_scrub(scrubber
.start
, candidate_end
)) {
4513 // we'll be requeued by whatever made us unavailable for scrub
4514 dout(10) << __func__
<< ": scrub blocked somewhere in range "
4515 << "[" << scrubber
.start
<< ", " << candidate_end
<< ")"
4520 scrubber
.end
= candidate_end
;
4523 // walk the log to find the latest update that affects our chunk
4524 scrubber
.subset_last_update
= eversion_t();
4525 for (auto p
= projected_log
.log
.rbegin();
4526 p
!= projected_log
.log
.rend();
4528 if (p
->soid
>= scrubber
.start
&&
4529 p
->soid
< scrubber
.end
) {
4530 scrubber
.subset_last_update
= p
->version
;
4534 if (scrubber
.subset_last_update
== eversion_t()) {
4535 for (list
<pg_log_entry_t
>::const_reverse_iterator p
=
4536 pg_log
.get_log().log
.rbegin();
4537 p
!= pg_log
.get_log().log
.rend();
4539 if (p
->soid
>= scrubber
.start
&&
4540 p
->soid
< scrubber
.end
) {
4541 scrubber
.subset_last_update
= p
->version
;
4547 // ask replicas to wait until
4548 // last_update_applied >= scrubber.subset_last_update and then scan
4549 scrubber
.waiting_on_whom
.insert(pg_whoami
);
4550 ++scrubber
.waiting_on
;
4552 // request maps from replicas
4553 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4554 i
!= actingbackfill
.end();
4556 if (*i
== pg_whoami
) continue;
4557 _request_scrub_map(*i
, scrubber
.subset_last_update
,
4558 scrubber
.start
, scrubber
.end
, scrubber
.deep
,
4560 scrubber
.waiting_on_whom
.insert(*i
);
4561 ++scrubber
.waiting_on
;
4564 scrubber
.state
= PG::Scrubber::WAIT_PUSHES
;
4568 case PG::Scrubber::WAIT_PUSHES
:
4569 if (active_pushes
== 0) {
4570 scrubber
.state
= PG::Scrubber::WAIT_LAST_UPDATE
;
4572 dout(15) << "wait for pushes to apply" << dendl
;
4577 case PG::Scrubber::WAIT_LAST_UPDATE
:
4578 if (last_update_applied
>= scrubber
.subset_last_update
) {
4579 scrubber
.state
= PG::Scrubber::BUILD_MAP
;
4581 // will be requeued by op_applied
4582 dout(15) << "wait for writes to flush" << dendl
;
4587 case PG::Scrubber::BUILD_MAP
:
4588 assert(last_update_applied
>= scrubber
.subset_last_update
);
4590 // build my own scrub map
4591 ret
= build_scrub_map_chunk(scrubber
.primary_scrubmap
,
4592 scrubber
.start
, scrubber
.end
,
4593 scrubber
.deep
, scrubber
.seed
,
4596 dout(5) << "error building scrub map: " << ret
<< ", aborting" << dendl
;
4597 scrub_clear_state();
4598 scrub_unreserve_replicas();
4602 --scrubber
.waiting_on
;
4603 scrubber
.waiting_on_whom
.erase(pg_whoami
);
4605 scrubber
.state
= PG::Scrubber::WAIT_REPLICAS
;
4608 case PG::Scrubber::WAIT_REPLICAS
:
4609 if (scrubber
.waiting_on
> 0) {
4610 // will be requeued by sub_op_scrub_map
4611 dout(10) << "wait for replicas to build scrub map" << dendl
;
4614 scrubber
.state
= PG::Scrubber::COMPARE_MAPS
;
4618 case PG::Scrubber::COMPARE_MAPS
:
4619 assert(last_update_applied
>= scrubber
.subset_last_update
);
4620 assert(scrubber
.waiting_on
== 0);
4622 scrub_compare_maps();
4623 scrubber
.start
= scrubber
.end
;
4624 scrubber
.run_callbacks();
4626 // requeue the writes from the chunk that just finished
4627 requeue_ops(waiting_for_scrub
);
4629 scrubber
.state
= PG::Scrubber::WAIT_DIGEST_UPDATES
;
4633 case PG::Scrubber::WAIT_DIGEST_UPDATES
:
4634 if (scrubber
.num_digest_updates_pending
) {
4635 dout(10) << __func__
<< " waiting on "
4636 << scrubber
.num_digest_updates_pending
4637 << " digest updates" << dendl
;
4642 if (!(scrubber
.end
.is_max())) {
4643 scrubber
.state
= PG::Scrubber::NEW_CHUNK
;
4647 scrubber
.state
= PG::Scrubber::FINISH
;
4652 case PG::Scrubber::FINISH
:
4654 scrubber
.state
= PG::Scrubber::INACTIVE
;
4657 if (!snap_trimq
.empty()) {
4658 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl
;
4659 snap_trimmer_scrub_complete();
4668 dout(20) << "scrub final state " << Scrubber::state_string(scrubber
.state
)
4669 << " [" << scrubber
.start
<< "," << scrubber
.end
<< ")" << dendl
;
4672 void PG::scrub_clear_state()
4674 assert(is_locked());
4675 state_clear(PG_STATE_SCRUBBING
);
4676 state_clear(PG_STATE_REPAIR
);
4677 state_clear(PG_STATE_DEEP_SCRUB
);
4678 publish_stats_to_osd();
4680 // active -> nothing.
4681 if (scrubber
.active
)
4682 osd
->dec_scrubs_active();
4684 requeue_ops(waiting_for_scrub
);
4688 // type-specific state clear
4689 _scrub_clear_state();
4692 void PG::scrub_compare_maps()
4694 dout(10) << __func__
<< " has maps, analyzing" << dendl
;
4696 // construct authoritative scrub map for type specific scrubbing
4697 scrubber
.cleaned_meta_map
.insert(scrubber
.primary_scrubmap
);
4698 map
<hobject_t
, pair
<uint32_t, uint32_t>> missing_digest
;
4700 if (acting
.size() > 1) {
4701 dout(10) << __func__
<< " comparing replica scrub maps" << dendl
;
4705 // Map from object with errors to good peer
4706 map
<hobject_t
, list
<pg_shard_t
>> authoritative
;
4707 map
<pg_shard_t
, ScrubMap
*> maps
;
4709 dout(2) << __func__
<< " osd." << acting
[0] << " has "
4710 << scrubber
.primary_scrubmap
.objects
.size() << " items" << dendl
;
4711 maps
[pg_whoami
] = &scrubber
.primary_scrubmap
;
4713 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4714 i
!= actingbackfill
.end();
4716 if (*i
== pg_whoami
) continue;
4717 dout(2) << __func__
<< " replica " << *i
<< " has "
4718 << scrubber
.received_maps
[*i
].objects
.size()
4719 << " items" << dendl
;
4720 maps
[*i
] = &scrubber
.received_maps
[*i
];
4723 get_pgbackend()->be_compare_scrubmaps(
4725 state_test(PG_STATE_REPAIR
),
4727 scrubber
.inconsistent
,
4730 scrubber
.shallow_errors
,
4731 scrubber
.deep_errors
,
4732 scrubber
.store
.get(),
4735 dout(2) << ss
.str() << dendl
;
4737 if (!ss
.str().empty()) {
4738 osd
->clog
->error(ss
);
4741 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
4742 i
!= authoritative
.end();
4744 list
<pair
<ScrubMap::object
, pg_shard_t
> > good_peers
;
4745 for (list
<pg_shard_t
>::const_iterator j
= i
->second
.begin();
4746 j
!= i
->second
.end();
4748 good_peers
.push_back(make_pair(maps
[*j
]->objects
[i
->first
], *j
));
4750 scrubber
.authoritative
.insert(
4756 for (map
<hobject_t
, list
<pg_shard_t
>>::iterator i
= authoritative
.begin();
4757 i
!= authoritative
.end();
4759 scrubber
.cleaned_meta_map
.objects
.erase(i
->first
);
4760 scrubber
.cleaned_meta_map
.objects
.insert(
4761 *(maps
[i
->second
.back()]->objects
.find(i
->first
))
4766 ScrubMap for_meta_scrub
;
4767 if (scrubber
.end
.is_max() ||
4768 scrubber
.cleaned_meta_map
.objects
.empty()) {
4769 scrubber
.cleaned_meta_map
.swap(for_meta_scrub
);
4771 auto iter
= scrubber
.cleaned_meta_map
.objects
.end();
4772 --iter
; // not empty, see if clause
4773 auto begin
= scrubber
.cleaned_meta_map
.objects
.begin();
4774 while (iter
!= begin
) {
4776 if (next
->first
.get_head() != iter
->first
.get_head()) {
4781 for_meta_scrub
.objects
.insert(begin
, iter
);
4782 scrubber
.cleaned_meta_map
.objects
.erase(begin
, iter
);
4785 // ok, do the pg-type specific scrubbing
4786 scrub_snapshot_metadata(for_meta_scrub
, missing_digest
);
4787 if (!scrubber
.store
->empty()) {
4788 if (state_test(PG_STATE_REPAIR
)) {
4789 dout(10) << __func__
<< ": discarding scrub results" << dendl
;
4790 scrubber
.store
->flush(nullptr);
4792 dout(10) << __func__
<< ": updating scrub object" << dendl
;
4793 ObjectStore::Transaction t
;
4794 scrubber
.store
->flush(&t
);
4795 osd
->store
->queue_transaction(osr
.get(), std::move(t
), nullptr);
4800 bool PG::scrub_process_inconsistent()
4802 dout(10) << __func__
<< ": checking authoritative" << dendl
;
4803 bool repair
= state_test(PG_STATE_REPAIR
);
4804 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4805 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4807 // authoriative only store objects which missing or inconsistent.
4808 if (!scrubber
.authoritative
.empty()) {
4810 ss
<< info
.pgid
<< " " << mode
<< " "
4811 << scrubber
.missing
.size() << " missing, "
4812 << scrubber
.inconsistent
.size() << " inconsistent objects";
4813 dout(2) << ss
.str() << dendl
;
4814 osd
->clog
->error(ss
);
4816 state_clear(PG_STATE_CLEAN
);
4817 for (map
<hobject_t
, list
<pair
<ScrubMap::object
, pg_shard_t
> >>::iterator i
=
4818 scrubber
.authoritative
.begin();
4819 i
!= scrubber
.authoritative
.end();
4821 set
<pg_shard_t
>::iterator j
;
4823 auto missing_entry
= scrubber
.missing
.find(i
->first
);
4824 if (missing_entry
!= scrubber
.missing
.end()) {
4825 for (j
= missing_entry
->second
.begin();
4826 j
!= missing_entry
->second
.end();
4835 if (scrubber
.inconsistent
.count(i
->first
)) {
4836 for (j
= scrubber
.inconsistent
[i
->first
].begin();
4837 j
!= scrubber
.inconsistent
[i
->first
].end();
4839 repair_object(i
->first
,
4848 return (!scrubber
.authoritative
.empty() && repair
);
4851 bool PG::ops_blocked_by_scrub() const {
4852 return (waiting_for_scrub
.size() != 0);
4855 // the part that actually finalizes a scrub
4856 void PG::scrub_finish()
4858 bool repair
= state_test(PG_STATE_REPAIR
);
4859 // if the repair request comes from auto-repair and large number of errors,
4860 // we would like to cancel auto-repair
4861 if (repair
&& scrubber
.auto_repair
4862 && scrubber
.authoritative
.size() > cct
->_conf
->osd_scrub_auto_repair_num_errors
) {
4863 state_clear(PG_STATE_REPAIR
);
4866 bool deep_scrub
= state_test(PG_STATE_DEEP_SCRUB
);
4867 const char *mode
= (repair
? "repair": (deep_scrub
? "deep-scrub" : "scrub"));
4869 // type-specific finish (can tally more errors)
4872 bool has_error
= scrub_process_inconsistent();
4876 oss
<< info
.pgid
.pgid
<< " " << mode
<< " ";
4877 int total_errors
= scrubber
.shallow_errors
+ scrubber
.deep_errors
;
4879 oss
<< total_errors
<< " errors";
4882 if (!deep_scrub
&& info
.stats
.stats
.sum
.num_deep_scrub_errors
)
4883 oss
<< " ( " << info
.stats
.stats
.sum
.num_deep_scrub_errors
4884 << " remaining deep scrub error details lost)";
4886 oss
<< ", " << scrubber
.fixed
<< " fixed";
4888 osd
->clog
->error(oss
);
4890 osd
->clog
->info(oss
);
4895 utime_t now
= ceph_clock_now();
4896 info
.history
.last_scrub
= info
.last_update
;
4897 info
.history
.last_scrub_stamp
= now
;
4898 if (scrubber
.deep
) {
4899 info
.history
.last_deep_scrub
= info
.last_update
;
4900 info
.history
.last_deep_scrub_stamp
= now
;
4902 // Since we don't know which errors were fixed, we can only clear them
4903 // when every one has been fixed.
4905 if (scrubber
.fixed
== scrubber
.shallow_errors
+ scrubber
.deep_errors
) {
4907 scrubber
.shallow_errors
= scrubber
.deep_errors
= 0;
4909 // Deep scrub in order to get corrected error counts
4910 scrub_after_recovery
= true;
4914 if ((scrubber
.shallow_errors
== 0) && (scrubber
.deep_errors
== 0))
4915 info
.history
.last_clean_scrub_stamp
= now
;
4916 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
4917 info
.stats
.stats
.sum
.num_deep_scrub_errors
= scrubber
.deep_errors
;
4919 info
.stats
.stats
.sum
.num_shallow_scrub_errors
= scrubber
.shallow_errors
;
4920 // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
4921 // because of deep-scrub errors
4922 if (scrubber
.shallow_errors
== 0)
4923 info
.history
.last_clean_scrub_stamp
= now
;
4925 info
.stats
.stats
.sum
.num_scrub_errors
=
4926 info
.stats
.stats
.sum
.num_shallow_scrub_errors
+
4927 info
.stats
.stats
.sum
.num_deep_scrub_errors
;
4931 ObjectStore::Transaction t
;
4934 int tr
= osd
->store
->queue_transaction(osr
.get(), std::move(t
), NULL
);
4940 queue_peering_event(
4942 std::make_shared
<CephPeeringEvt
>(
4943 get_osdmap()->get_epoch(),
4944 get_osdmap()->get_epoch(),
4948 scrub_clear_state();
4949 scrub_unreserve_replicas();
4951 if (is_active() && is_primary()) {
4956 void PG::share_pg_info()
4958 dout(10) << "share_pg_info" << dendl
;
4960 // share new pg_info_t with replicas
4961 assert(!actingbackfill
.empty());
4962 for (set
<pg_shard_t
>::iterator i
= actingbackfill
.begin();
4963 i
!= actingbackfill
.end();
4965 if (*i
== pg_whoami
) continue;
4966 pg_shard_t peer
= *i
;
4967 if (peer_info
.count(peer
)) {
4968 peer_info
[peer
].last_epoch_started
= info
.last_epoch_started
;
4969 peer_info
[peer
].last_interval_started
= info
.last_interval_started
;
4970 peer_info
[peer
].history
.merge(info
.history
);
4972 MOSDPGInfo
*m
= new MOSDPGInfo(get_osdmap()->get_epoch());
4973 m
->pg_list
.push_back(
4976 peer
.shard
, pg_whoami
.shard
,
4977 get_osdmap()->get_epoch(),
4978 get_osdmap()->get_epoch(),
4981 osd
->send_message_osd_cluster(peer
.osd
, m
, get_osdmap()->get_epoch());
4985 bool PG::append_log_entries_update_missing(
4986 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
4987 ObjectStore::Transaction
&t
)
4989 assert(!entries
.empty());
4990 assert(entries
.begin()->version
> info
.last_update
);
4992 PGLogEntryHandler rollbacker
{this, &t
};
4993 bool invalidate_stats
=
4994 pg_log
.append_new_log_entries(info
.last_backfill
,
4995 info
.last_backfill_bitwise
,
4998 info
.last_update
= pg_log
.get_head();
5000 if (pg_log
.get_missing().num_missing() == 0) {
5001 // advance last_complete since nothing else is missing!
5002 info
.last_complete
= info
.last_update
;
5005 info
.stats
.stats_invalid
= info
.stats
.stats_invalid
|| invalidate_stats
;
5008 return invalidate_stats
;
5012 void PG::merge_new_log_entries(
5013 const mempool::osd_pglog::list
<pg_log_entry_t
> &entries
,
5014 ObjectStore::Transaction
&t
)
5016 dout(10) << __func__
<< " " << entries
<< dendl
;
5017 assert(is_primary());
5019 bool rebuild_missing
= append_log_entries_update_missing(entries
, t
);
5020 for (set
<pg_shard_t
>::const_iterator i
= actingbackfill
.begin();
5021 i
!= actingbackfill
.end();
5023 pg_shard_t
peer(*i
);
5024 if (peer
== pg_whoami
) continue;
5025 assert(peer_missing
.count(peer
));
5026 assert(peer_info
.count(peer
));
5027 pg_missing_t
& pmissing(peer_missing
[peer
]);
5028 pg_info_t
& pinfo(peer_info
[peer
]);
5029 bool invalidate_stats
= PGLog::append_log_entries_update_missing(
5030 pinfo
.last_backfill
,
5031 info
.last_backfill_bitwise
,
5038 pinfo
.last_update
= info
.last_update
;
5039 pinfo
.stats
.stats_invalid
= pinfo
.stats
.stats_invalid
|| invalidate_stats
;
5040 rebuild_missing
= rebuild_missing
|| invalidate_stats
;
5043 if (!rebuild_missing
) {
5047 for (auto &&i
: entries
) {
5048 missing_loc
.rebuild(
5053 pg_log
.get_missing(),
5059 void PG::update_history(const pg_history_t
& new_history
)
5062 if (info
.history
.merge(new_history
)) {
5063 dout(20) << __func__
<< " advanced history from " << new_history
<< dendl
;
5065 if (info
.history
.last_epoch_clean
>= info
.history
.same_interval_since
) {
5066 dout(20) << __func__
<< " clearing past_intervals" << dendl
;
5067 past_intervals
.clear();
5068 dirty_big_info
= true;
5074 void PG::fulfill_info(
5075 pg_shard_t from
, const pg_query_t
&query
,
5076 pair
<pg_shard_t
, pg_info_t
> ¬ify_info
)
5078 assert(from
== primary
);
5079 assert(query
.type
== pg_query_t::INFO
);
5082 dout(10) << "sending info" << dendl
;
5083 notify_info
= make_pair(from
, info
);
5086 void PG::fulfill_log(
5087 pg_shard_t from
, const pg_query_t
&query
, epoch_t query_epoch
)
5089 dout(10) << "log request from " << from
<< dendl
;
5090 assert(from
== primary
);
5091 assert(query
.type
!= pg_query_t::INFO
);
5092 ConnectionRef con
= osd
->get_con_osd_cluster(
5093 from
.osd
, get_osdmap()->get_epoch());
5096 MOSDPGLog
*mlog
= new MOSDPGLog(
5097 from
.shard
, pg_whoami
.shard
,
5098 get_osdmap()->get_epoch(),
5100 mlog
->missing
= pg_log
.get_missing();
5102 // primary -> other, when building master log
5103 if (query
.type
== pg_query_t::LOG
) {
5104 dout(10) << " sending info+missing+log since " << query
.since
5106 if (query
.since
!= eversion_t() && query
.since
< pg_log
.get_tail()) {
5107 osd
->clog
->error() << info
.pgid
<< " got broken pg_query_t::LOG since " << query
.since
5108 << " when my log.tail is " << pg_log
.get_tail()
5109 << ", sending full log instead";
5110 mlog
->log
= pg_log
.get_log(); // primary should not have requested this!!
5112 mlog
->log
.copy_after(pg_log
.get_log(), query
.since
);
5114 else if (query
.type
== pg_query_t::FULLLOG
) {
5115 dout(10) << " sending info+missing+full log" << dendl
;
5116 mlog
->log
= pg_log
.get_log();
5119 dout(10) << " sending " << mlog
->log
<< " " << mlog
->missing
<< dendl
;
5121 osd
->share_map_peer(from
.osd
, con
.get(), get_osdmap());
5122 osd
->send_message_osd_cluster(mlog
, con
.get());
5125 void PG::check_full_transition(OSDMapRef lastmap
, OSDMapRef osdmap
)
5127 bool changed
= false;
5128 if (osdmap
->test_flag(CEPH_OSDMAP_FULL
) &&
5129 !lastmap
->test_flag(CEPH_OSDMAP_FULL
)) {
5130 dout(10) << " cluster was marked full in " << osdmap
->get_epoch() << dendl
;
5133 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
5135 if (pi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5136 const pg_pool_t
*opi
= lastmap
->get_pg_pool(info
.pgid
.pool());
5137 if (!opi
|| !opi
->has_flag(pg_pool_t::FLAG_FULL
)) {
5138 dout(10) << " pool was marked full in " << osdmap
->get_epoch() << dendl
;
5143 info
.history
.last_epoch_marked_full
= osdmap
->get_epoch();
5148 bool PG::should_restart_peering(
5150 int newactingprimary
,
5151 const vector
<int>& newup
,
5152 const vector
<int>& newacting
,
5156 if (PastIntervals::is_new_interval(
5168 dout(20) << "new interval newup " << newup
5169 << " newacting " << newacting
<< dendl
;
5176 bool PG::old_peering_msg(epoch_t reply_epoch
, epoch_t query_epoch
)
5178 if (last_peering_reset
> reply_epoch
||
5179 last_peering_reset
> query_epoch
) {
5180 dout(10) << "old_peering_msg reply_epoch " << reply_epoch
<< " query_epoch " << query_epoch
5181 << " last_peering_reset " << last_peering_reset
5188 void PG::set_last_peering_reset()
5190 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl
;
5191 if (last_peering_reset
!= get_osdmap()->get_epoch()) {
5192 last_peering_reset
= get_osdmap()->get_epoch();
5193 reset_interval_flush();
5200 FlushState(PG
*pg
, epoch_t epoch
) : pg(pg
), epoch(epoch
) {}
5203 if (!pg
->pg_has_reset_since(epoch
))
5204 pg
->queue_flushed(epoch
);
5208 typedef ceph::shared_ptr
<FlushState
> FlushStateRef
;
5210 void PG::start_flush(ObjectStore::Transaction
*t
,
5211 list
<Context
*> *on_applied
,
5212 list
<Context
*> *on_safe
)
5214 // flush in progress ops
5215 FlushStateRef
flush_trigger (std::make_shared
<FlushState
>(
5216 this, get_osdmap()->get_epoch()));
5218 flushes_in_progress
++;
5219 on_applied
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
5220 on_safe
->push_back(new ContainerContext
<FlushStateRef
>(flush_trigger
));
5223 void PG::reset_interval_flush()
5225 dout(10) << "Clearing blocked outgoing recovery messages" << dendl
;
5226 recovery_state
.clear_blocked_outgoing();
5228 Context
*c
= new QueuePeeringEvt
<IntervalFlush
>(
5229 this, get_osdmap()->get_epoch(), IntervalFlush());
5230 if (!osr
->flush_commit(c
)) {
5231 dout(10) << "Beginning to block outgoing recovery messages" << dendl
;
5232 recovery_state
.begin_block_outgoing();
5234 dout(10) << "Not blocking outgoing recovery messages" << dendl
;
5239 /* Called before initializing peering during advance_map */
5240 void PG::start_peering_interval(
5241 const OSDMapRef lastmap
,
5242 const vector
<int>& newup
, int new_up_primary
,
5243 const vector
<int>& newacting
, int new_acting_primary
,
5244 ObjectStore::Transaction
*t
)
5246 const OSDMapRef osdmap
= get_osdmap();
5248 set_last_peering_reset();
5250 vector
<int> oldacting
, oldup
;
5251 int oldrole
= get_role();
5255 pg_shard_t old_acting_primary
= get_primary();
5256 pg_shard_t old_up_primary
= up_primary
;
5257 bool was_old_primary
= is_primary();
5259 acting
.swap(oldacting
);
5261 init_primary_up_acting(
5265 new_acting_primary
);
5267 if (info
.stats
.up
!= up
||
5268 info
.stats
.acting
!= acting
||
5269 info
.stats
.up_primary
!= new_up_primary
||
5270 info
.stats
.acting_primary
!= new_acting_primary
) {
5272 info
.stats
.up_primary
= new_up_primary
;
5273 info
.stats
.acting
= acting
;
5274 info
.stats
.acting_primary
= new_acting_primary
;
5275 info
.stats
.mapping_epoch
= osdmap
->get_epoch();
5278 pg_stats_publish_lock
.Lock();
5279 pg_stats_publish_valid
= false;
5280 pg_stats_publish_lock
.Unlock();
5282 // This will now be remapped during a backfill in cases
5283 // that it would not have been before.
5285 state_set(PG_STATE_REMAPPED
);
5287 state_clear(PG_STATE_REMAPPED
);
5289 int role
= osdmap
->calc_pg_role(osd
->whoami
, acting
, acting
.size());
5290 if (pool
.info
.is_replicated() || role
== pg_whoami
.shard
)
5295 // did acting, up, primary|acker change?
5297 dout(10) << " no lastmap" << dendl
;
5299 dirty_big_info
= true;
5300 info
.history
.same_interval_since
= osdmap
->get_epoch();
5302 std::stringstream debug
;
5303 assert(info
.history
.same_interval_since
!= 0);
5304 boost::scoped_ptr
<IsPGRecoverablePredicate
> recoverable(
5305 get_is_recoverable_predicate());
5306 bool new_interval
= PastIntervals::check_new_interval(
5307 old_acting_primary
.osd
,
5309 oldacting
, newacting
,
5313 info
.history
.same_interval_since
,
5314 info
.history
.last_epoch_clean
,
5321 dout(10) << __func__
<< ": check_new_interval output: "
5322 << debug
.str() << dendl
;
5324 if (osdmap
->get_epoch() == osd
->get_superblock().oldest_map
&&
5325 info
.history
.last_epoch_clean
< osdmap
->get_epoch()) {
5326 dout(10) << " map gap, clearing past_intervals and faking" << dendl
;
5327 // our information is incomplete and useless; someone else was clean
5328 // after everything we know if osdmaps were trimmed.
5329 past_intervals
.clear();
5331 dout(10) << " noting past " << past_intervals
<< dendl
;
5334 dirty_big_info
= true;
5335 info
.history
.same_interval_since
= osdmap
->get_epoch();
5336 if (info
.pgid
.pgid
.is_split(lastmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5337 osdmap
->get_pg_num(info
.pgid
.pgid
.pool()),
5339 info
.history
.last_epoch_split
= osdmap
->get_epoch();
5344 if (old_up_primary
!= up_primary
||
5346 info
.history
.same_up_since
= osdmap
->get_epoch();
5348 // this comparison includes primary rank via pg_shard_t
5349 if (old_acting_primary
!= get_primary()) {
5350 info
.history
.same_primary_since
= osdmap
->get_epoch();
5355 dout(1) << __func__
<< " up " << oldup
<< " -> " << up
5356 << ", acting " << oldacting
<< " -> " << acting
5357 << ", acting_primary " << old_acting_primary
<< " -> " << new_acting_primary
5358 << ", up_primary " << old_up_primary
<< " -> " << new_up_primary
5359 << ", role " << oldrole
<< " -> " << role
5360 << ", features acting " << acting_features
5361 << " upacting " << upacting_features
5365 state_clear(PG_STATE_ACTIVE
);
5366 state_clear(PG_STATE_PEERED
);
5367 state_clear(PG_STATE_DOWN
);
5368 state_clear(PG_STATE_RECOVERY_WAIT
);
5369 state_clear(PG_STATE_RECOVERY_TOOFULL
);
5370 state_clear(PG_STATE_RECOVERING
);
5372 peer_purged
.clear();
5373 actingbackfill
.clear();
5374 scrub_queued
= false;
5376 // reset primary state?
5377 if (was_old_primary
|| is_primary()) {
5378 osd
->remove_want_pg_temp(info
.pgid
.pgid
);
5380 clear_primary_state();
5386 projected_last_update
= eversion_t();
5390 // should we tell the primary we are here?
5391 send_notify
= !is_primary();
5393 if (role
!= oldrole
||
5394 was_old_primary
!= is_primary()) {
5395 // did primary change?
5396 if (was_old_primary
!= is_primary()) {
5397 state_clear(PG_STATE_CLEAN
);
5398 clear_publish_stats();
5403 // take active waiters
5404 requeue_ops(waiting_for_peered
);
5408 // did primary change?
5409 if (get_primary() != old_acting_primary
) {
5410 dout(10) << *this << " " << oldacting
<< " -> " << acting
5411 << ", acting primary "
5412 << old_acting_primary
<< " -> " << get_primary()
5415 // primary is the same.
5417 // i am (still) primary. but my replica set changed.
5418 state_clear(PG_STATE_CLEAN
);
5420 dout(10) << oldacting
<< " -> " << acting
5421 << ", replicas changed" << dendl
;
5427 if (acting
.empty() && !up
.empty() && up_primary
== pg_whoami
) {
5428 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl
;
5429 osd
->queue_want_pg_temp(info
.pgid
.pgid
, acting
);
5433 void PG::on_new_interval()
5435 const OSDMapRef osdmap
= get_osdmap();
5439 // initialize features
5440 acting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
5441 upacting_features
= CEPH_FEATURES_SUPPORTED_DEFAULT
;
5442 for (vector
<int>::iterator p
= acting
.begin(); p
!= acting
.end(); ++p
) {
5443 if (*p
== CRUSH_ITEM_NONE
)
5445 uint64_t f
= osdmap
->get_xinfo(*p
).features
;
5446 acting_features
&= f
;
5447 upacting_features
&= f
;
5449 for (vector
<int>::iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
5450 if (*p
== CRUSH_ITEM_NONE
)
5452 upacting_features
&= osdmap
->get_xinfo(*p
).features
;
5455 assert(osdmap
->test_flag(CEPH_OSDMAP_SORTBITWISE
));
5460 void PG::proc_primary_info(ObjectStore::Transaction
&t
, const pg_info_t
&oinfo
)
5462 assert(!is_primary());
5464 update_history(oinfo
.history
);
5466 if (last_complete_ondisk
.epoch
>= info
.history
.last_epoch_started
) {
5467 // DEBUG: verify that the snaps are empty in snap_mapper
5468 if (cct
->_conf
->osd_debug_verify_snaps_on_info
) {
5469 interval_set
<snapid_t
> p
;
5470 p
.union_of(oinfo
.purged_snaps
, info
.purged_snaps
);
5471 p
.subtract(info
.purged_snaps
);
5473 for (interval_set
<snapid_t
>::iterator i
= p
.begin();
5476 for (snapid_t snap
= i
.get_start();
5477 snap
!= i
.get_len() + i
.get_start();
5479 vector
<hobject_t
> hoids
;
5480 int r
= snap_mapper
.get_next_objects_to_trim(snap
, 1, &hoids
);
5481 if (r
!= 0 && r
!= -ENOENT
) {
5482 derr
<< __func__
<< ": snap_mapper get_next_object_to_trim returned "
5483 << cpp_strerror(r
) << dendl
;
5485 } else if (r
!= -ENOENT
) {
5486 assert(!hoids
.empty());
5487 derr
<< __func__
<< ": snap_mapper get_next_object_to_trim returned "
5488 << cpp_strerror(r
) << " for object "
5489 << hoids
[0] << " on snap " << snap
5490 << " which should have been fully trimmed " << dendl
;
5497 info
.purged_snaps
= oinfo
.purged_snaps
;
5499 dirty_big_info
= true;
5503 ostream
& operator<<(ostream
& out
, const PG
& pg
)
5505 out
<< "pg[" << pg
.info
5507 if (pg
.acting
!= pg
.up
)
5508 out
<< "/" << pg
.acting
;
5509 out
<< " r=" << pg
.get_role();
5510 out
<< " lpr=" << pg
.get_last_peering_reset();
5512 if (!pg
.past_intervals
.empty()) {
5513 out
<< " pi=[" << pg
.past_intervals
.get_bounds()
5514 << ")/" << pg
.past_intervals
.size();
5517 if (pg
.is_peered()) {
5518 if (pg
.last_update_ondisk
!= pg
.info
.last_update
)
5519 out
<< " luod=" << pg
.last_update_ondisk
;
5520 if (pg
.last_update_applied
!= pg
.info
.last_update
)
5521 out
<< " lua=" << pg
.last_update_applied
;
5524 if (pg
.recovery_ops_active
)
5525 out
<< " rops=" << pg
.recovery_ops_active
;
5527 if (pg
.pg_log
.get_tail() != pg
.info
.log_tail
||
5528 pg
.pg_log
.get_head() != pg
.info
.last_update
)
5529 out
<< " (info mismatch, " << pg
.pg_log
.get_log() << ")";
5531 if (!pg
.pg_log
.get_log().empty()) {
5532 if ((pg
.pg_log
.get_log().log
.begin()->version
<= pg
.pg_log
.get_tail())) {
5533 out
<< " (log bound mismatch, actual=["
5534 << pg
.pg_log
.get_log().log
.begin()->version
<< ","
5535 << pg
.pg_log
.get_log().log
.rbegin()->version
<< "]";
5540 if (!pg
.backfill_targets
.empty())
5541 out
<< " bft=" << pg
.backfill_targets
;
5542 out
<< " crt=" << pg
.pg_log
.get_can_rollback_to();
5544 if (pg
.last_complete_ondisk
!= pg
.info
.last_complete
)
5545 out
<< " lcod " << pg
.last_complete_ondisk
;
5547 if (pg
.is_primary()) {
5548 out
<< " mlcod " << pg
.min_last_complete_ondisk
;
5551 out
<< " " << pg_state_string(pg
.get_state());
5552 if (pg
.should_send_notify())
5555 if (pg
.scrubber
.must_repair
)
5556 out
<< " MUST_REPAIR";
5557 if (pg
.scrubber
.auto_repair
)
5558 out
<< " AUTO_REPAIR";
5559 if (pg
.scrubber
.must_deep_scrub
)
5560 out
<< " MUST_DEEP_SCRUB";
5561 if (pg
.scrubber
.must_scrub
)
5562 out
<< " MUST_SCRUB";
5564 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5565 if (pg
.pg_log
.get_missing().num_missing()) {
5566 out
<< " m=" << pg
.pg_log
.get_missing().num_missing();
5567 if (pg
.is_primary()) {
5568 uint64_t unfound
= pg
.get_num_unfound();
5570 out
<< " u=" << unfound
;
5573 if (pg
.snap_trimq
.size())
5574 out
<< " snaptrimq=" << pg
.snap_trimq
;
5582 bool PG::can_discard_op(OpRequestRef
& op
)
5584 const MOSDOp
*m
= static_cast<const MOSDOp
*>(op
->get_req());
5585 if (cct
->_conf
->osd_discard_disconnected_ops
&& OSD::op_is_discardable(m
)) {
5586 dout(20) << " discard " << *m
<< dendl
;
5590 if (m
->get_map_epoch() < info
.history
.same_primary_since
) {
5591 dout(7) << " changed after " << m
->get_map_epoch()
5592 << ", dropping " << *m
<< dendl
;
5596 if (m
->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT
)) {
5597 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend()) {
5598 dout(7) << __func__
<< " sent before last_force_op_resend "
5599 << pool
.info
.last_force_op_resend
<< ", dropping" << *m
<< dendl
;
5602 if (m
->get_map_epoch() < info
.history
.last_epoch_split
) {
5603 dout(7) << __func__
<< " pg split in "
5604 << info
.history
.last_epoch_split
<< ", dropping" << dendl
;
5607 } else if (m
->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND
)) {
5608 if (m
->get_map_epoch() < pool
.info
.get_last_force_op_resend_preluminous()) {
5609 dout(7) << __func__
<< " sent before last_force_op_resend_preluminous "
5610 << pool
.info
.last_force_op_resend_preluminous
5611 << ", dropping" << *m
<< dendl
;
5619 template<typename T
, int MSGTYPE
>
5620 bool PG::can_discard_replica_op(OpRequestRef
& op
)
5622 const T
*m
= static_cast<const T
*>(op
->get_req());
5623 assert(m
->get_type() == MSGTYPE
);
5625 /* Mostly, this overlaps with the old_peering_msg
5626 * condition. An important exception is pushes
5627 * sent by replicas not in the acting set, since
5628 * if such a replica goes down it does not cause
5629 * a new interval. */
5630 int from
= m
->get_source().num();
5631 if (get_osdmap()->get_down_at(from
) >= m
->map_epoch
)
5635 // if pg changes _at all_, we reset and repeer!
5636 if (old_peering_msg(m
->map_epoch
, m
->map_epoch
)) {
5637 dout(10) << "can_discard_replica_op pg changed " << info
.history
5638 << " after " << m
->map_epoch
5639 << ", dropping" << dendl
;
5645 bool PG::can_discard_scan(OpRequestRef op
)
5647 const MOSDPGScan
*m
= static_cast<const MOSDPGScan
*>(op
->get_req());
5648 assert(m
->get_type() == MSG_OSD_PG_SCAN
);
5650 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
5651 dout(10) << " got old scan, ignoring" << dendl
;
5657 bool PG::can_discard_backfill(OpRequestRef op
)
5659 const MOSDPGBackfill
*m
= static_cast<const MOSDPGBackfill
*>(op
->get_req());
5660 assert(m
->get_type() == MSG_OSD_PG_BACKFILL
);
5662 if (old_peering_msg(m
->map_epoch
, m
->query_epoch
)) {
5663 dout(10) << " got old backfill, ignoring" << dendl
;
5671 bool PG::can_discard_request(OpRequestRef
& op
)
5673 switch (op
->get_req()->get_type()) {
5674 case CEPH_MSG_OSD_OP
:
5675 return can_discard_op(op
);
5676 case CEPH_MSG_OSD_BACKOFF
:
5677 return false; // never discard
5679 return can_discard_replica_op
<MOSDSubOp
, MSG_OSD_SUBOP
>(op
);
5681 return can_discard_replica_op
<MOSDRepOp
, MSG_OSD_REPOP
>(op
);
5682 case MSG_OSD_PG_PUSH
:
5683 return can_discard_replica_op
<MOSDPGPush
, MSG_OSD_PG_PUSH
>(op
);
5684 case MSG_OSD_PG_PULL
:
5685 return can_discard_replica_op
<MOSDPGPull
, MSG_OSD_PG_PULL
>(op
);
5686 case MSG_OSD_PG_PUSH_REPLY
:
5687 return can_discard_replica_op
<MOSDPGPushReply
, MSG_OSD_PG_PUSH_REPLY
>(op
);
5688 case MSG_OSD_SUBOPREPLY
:
5689 return can_discard_replica_op
<MOSDSubOpReply
, MSG_OSD_SUBOPREPLY
>(op
);
5690 case MSG_OSD_REPOPREPLY
:
5691 return can_discard_replica_op
<MOSDRepOpReply
, MSG_OSD_REPOPREPLY
>(op
);
5693 case MSG_OSD_EC_WRITE
:
5694 return can_discard_replica_op
<MOSDECSubOpWrite
, MSG_OSD_EC_WRITE
>(op
);
5695 case MSG_OSD_EC_WRITE_REPLY
:
5696 return can_discard_replica_op
<MOSDECSubOpWriteReply
, MSG_OSD_EC_WRITE_REPLY
>(op
);
5697 case MSG_OSD_EC_READ
:
5698 return can_discard_replica_op
<MOSDECSubOpRead
, MSG_OSD_EC_READ
>(op
);
5699 case MSG_OSD_EC_READ_REPLY
:
5700 return can_discard_replica_op
<MOSDECSubOpReadReply
, MSG_OSD_EC_READ_REPLY
>(op
);
5701 case MSG_OSD_REP_SCRUB
:
5702 return can_discard_replica_op
<MOSDRepScrub
, MSG_OSD_REP_SCRUB
>(op
);
5703 case MSG_OSD_SCRUB_RESERVE
:
5704 return can_discard_replica_op
<MOSDScrubReserve
, MSG_OSD_SCRUB_RESERVE
>(op
);
5705 case MSG_OSD_REP_SCRUBMAP
:
5706 return can_discard_replica_op
<MOSDRepScrubMap
, MSG_OSD_REP_SCRUBMAP
>(op
);
5707 case MSG_OSD_PG_UPDATE_LOG_MISSING
:
5708 return can_discard_replica_op
<
5709 MOSDPGUpdateLogMissing
, MSG_OSD_PG_UPDATE_LOG_MISSING
>(op
);
5710 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
:
5711 return can_discard_replica_op
<
5712 MOSDPGUpdateLogMissingReply
, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY
>(op
);
5714 case MSG_OSD_PG_SCAN
:
5715 return can_discard_scan(op
);
5716 case MSG_OSD_PG_BACKFILL
:
5717 return can_discard_backfill(op
);
5718 case MSG_OSD_PG_BACKFILL_REMOVE
:
5719 return can_discard_replica_op
<MOSDPGBackfillRemove
,
5720 MSG_OSD_PG_BACKFILL_REMOVE
>(op
);
5725 void PG::take_waiters()
5727 dout(10) << "take_waiters" << dendl
;
5728 requeue_map_waiters();
5729 for (list
<CephPeeringEvtRef
>::iterator i
= peering_waiters
.begin();
5730 i
!= peering_waiters
.end();
5731 ++i
) osd
->queue_for_peering(this);
5732 peering_queue
.splice(peering_queue
.begin(), peering_waiters
,
5733 peering_waiters
.begin(), peering_waiters
.end());
5736 void PG::handle_peering_event(CephPeeringEvtRef evt
, RecoveryCtx
*rctx
)
5738 dout(10) << "handle_peering_event: " << evt
->get_desc() << dendl
;
5739 if (!have_same_or_newer_map(evt
->get_epoch_sent())) {
5740 dout(10) << "deferring event " << evt
->get_desc() << dendl
;
5741 peering_waiters
.push_back(evt
);
5744 if (old_peering_evt(evt
))
5746 recovery_state
.handle_event(evt
, rctx
);
5749 void PG::queue_peering_event(CephPeeringEvtRef evt
)
5751 if (old_peering_evt(evt
))
5753 peering_queue
.push_back(evt
);
5754 osd
->queue_for_peering(this);
5757 void PG::queue_null(epoch_t msg_epoch
,
5758 epoch_t query_epoch
)
5760 dout(10) << "null" << dendl
;
5761 queue_peering_event(
5762 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
5766 void PG::queue_flushed(epoch_t e
)
5768 dout(10) << "flushed" << dendl
;
5769 queue_peering_event(
5770 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(e
, e
,
5774 void PG::queue_query(epoch_t msg_epoch
,
5775 epoch_t query_epoch
,
5776 pg_shard_t from
, const pg_query_t
& q
)
5778 dout(10) << "handle_query " << q
<< " from replica " << from
<< dendl
;
5779 queue_peering_event(
5780 CephPeeringEvtRef(std::make_shared
<CephPeeringEvt
>(msg_epoch
, query_epoch
,
5781 MQuery(from
, q
, query_epoch
))));
5784 void PG::handle_advance_map(
5785 OSDMapRef osdmap
, OSDMapRef lastmap
,
5786 vector
<int>& newup
, int up_primary
,
5787 vector
<int>& newacting
, int acting_primary
,
5790 assert(lastmap
->get_epoch() == osdmap_ref
->get_epoch());
5791 assert(lastmap
== osdmap_ref
);
5792 dout(10) << "handle_advance_map "
5793 << newup
<< "/" << newacting
5794 << " -- " << up_primary
<< "/" << acting_primary
5796 update_osdmap_ref(osdmap
);
5797 pool
.update(osdmap
);
5798 past_intervals
.update_type_from_map(pool
.info
.ec_pool(), *osdmap
);
5799 if (cct
->_conf
->osd_debug_verify_cached_snaps
) {
5800 interval_set
<snapid_t
> actual_removed_snaps
;
5801 const pg_pool_t
*pi
= osdmap
->get_pg_pool(info
.pgid
.pool());
5803 pi
->build_removed_snaps(actual_removed_snaps
);
5804 if (!(actual_removed_snaps
== pool
.cached_removed_snaps
)) {
5805 derr
<< __func__
<< ": mismatch between the actual removed snaps "
5806 << actual_removed_snaps
<< " and pool.cached_removed_snaps "
5807 << " pool.cached_removed_snaps " << pool
.cached_removed_snaps
5810 assert(actual_removed_snaps
== pool
.cached_removed_snaps
);
5813 osdmap
, lastmap
, newup
, up_primary
,
5814 newacting
, acting_primary
);
5815 recovery_state
.handle_event(evt
, rctx
);
5816 if (pool
.info
.last_change
== osdmap_ref
->get_epoch()) {
5818 update_store_with_options();
5822 void PG::handle_activate_map(RecoveryCtx
*rctx
)
5824 dout(10) << "handle_activate_map " << dendl
;
5826 recovery_state
.handle_event(evt
, rctx
);
5827 if (osdmap_ref
->get_epoch() - last_persisted_osdmap_ref
->get_epoch() >
5828 cct
->_conf
->osd_pg_epoch_persisted_max_stale
) {
5829 dout(20) << __func__
<< ": Dirtying info: last_persisted is "
5830 << last_persisted_osdmap_ref
->get_epoch()
5831 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
5834 dout(20) << __func__
<< ": Not dirtying info: last_persisted is "
5835 << last_persisted_osdmap_ref
->get_epoch()
5836 << " while current is " << osdmap_ref
->get_epoch() << dendl
;
5838 if (osdmap_ref
->check_new_blacklist_entries()) check_blacklisted_watchers();
5841 void PG::handle_loaded(RecoveryCtx
*rctx
)
5843 dout(10) << "handle_loaded" << dendl
;
5845 recovery_state
.handle_event(evt
, rctx
);
5848 void PG::handle_create(RecoveryCtx
*rctx
)
5850 dout(10) << "handle_create" << dendl
;
5851 rctx
->created_pgs
.insert(this);
5853 recovery_state
.handle_event(evt
, rctx
);
5855 recovery_state
.handle_event(evt2
, rctx
);
5858 void PG::handle_query_state(Formatter
*f
)
5860 dout(10) << "handle_query_state" << dendl
;
5862 recovery_state
.handle_event(q
, 0);
5865 void PG::update_store_with_options()
5867 auto r
= osd
->store
->set_collection_opts(coll
, pool
.info
.opts
);
5868 if(r
< 0 && r
!= -EOPNOTSUPP
) {
5869 derr
<< __func__
<< "set_collection_opts returns error:" << r
<< dendl
;
5873 void PG::update_store_on_load()
5875 if (osd
->store
->get_type() == "filestore") {
5876 // legacy filestore didn't store collection bit width; fix.
5877 int bits
= osd
->store
->collection_bits(coll
);
5882 bits
= info
.pgid
.get_split_bits(pool
.info
.get_pg_num());
5883 lderr(cct
) << __func__
<< " setting bit width to " << bits
<< dendl
;
5884 ObjectStore::Transaction t
;
5885 t
.collection_set_bits(coll
, bits
);
5886 osd
->store
->apply_transaction(osr
.get(), std::move(t
));
5891 /*------------ Recovery State Machine----------------*/
5893 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5894 << "state<" << get_state_name() << ">: ")
5896 /*------Crashed-------*/
5897 PG::RecoveryState::Crashed::Crashed(my_context ctx
)
5899 NamedState(context
< RecoveryMachine
>().pg
, "Crashed")
5901 context
< RecoveryMachine
>().log_enter(state_name
);
5902 assert(0 == "we got a bad state machine event");
5906 /*------Initial-------*/
5907 PG::RecoveryState::Initial::Initial(my_context ctx
)
5909 NamedState(context
< RecoveryMachine
>().pg
, "Initial")
5911 context
< RecoveryMachine
>().log_enter(state_name
);
5914 boost::statechart::result
PG::RecoveryState::Initial::react(const Load
& l
)
5916 PG
*pg
= context
< RecoveryMachine
>().pg
;
5918 // do we tell someone we're here?
5919 pg
->send_notify
= (!pg
->is_primary());
5920 pg
->update_store_with_options();
5922 pg
->update_store_on_load();
5924 return transit
< Reset
>();
5927 boost::statechart::result
PG::RecoveryState::Initial::react(const MNotifyRec
& notify
)
5929 PG
*pg
= context
< RecoveryMachine
>().pg
;
5930 pg
->proc_replica_info(
5931 notify
.from
, notify
.notify
.info
, notify
.notify
.epoch_sent
);
5932 pg
->set_last_peering_reset();
5933 return transit
< Primary
>();
5936 boost::statechart::result
PG::RecoveryState::Initial::react(const MInfoRec
& i
)
5938 PG
*pg
= context
< RecoveryMachine
>().pg
;
5939 assert(!pg
->is_primary());
5941 return transit
< Stray
>();
5944 boost::statechart::result
PG::RecoveryState::Initial::react(const MLogRec
& i
)
5946 PG
*pg
= context
< RecoveryMachine
>().pg
;
5947 assert(!pg
->is_primary());
5949 return transit
< Stray
>();
5952 void PG::RecoveryState::Initial::exit()
5954 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
5955 PG
*pg
= context
< RecoveryMachine
>().pg
;
5956 utime_t dur
= ceph_clock_now() - enter_time
;
5957 pg
->osd
->recoverystate_perf
->tinc(rs_initial_latency
, dur
);
5960 /*------Started-------*/
5961 PG::RecoveryState::Started::Started(my_context ctx
)
5963 NamedState(context
< RecoveryMachine
>().pg
, "Started")
5965 context
< RecoveryMachine
>().log_enter(state_name
);
5968 boost::statechart::result
5969 PG::RecoveryState::Started::react(const IntervalFlush
&)
5971 PG
*pg
= context
< RecoveryMachine
>().pg
;
5972 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
5973 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
5974 return discard_event();
5978 boost::statechart::result
5979 PG::RecoveryState::Started::react(const FlushedEvt
&)
5981 PG
*pg
= context
< RecoveryMachine
>().pg
;
5983 return discard_event();
5987 boost::statechart::result
PG::RecoveryState::Started::react(const AdvMap
& advmap
)
5989 PG
*pg
= context
< RecoveryMachine
>().pg
;
5990 ldout(pg
->cct
, 10) << "Started advmap" << dendl
;
5991 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
5992 if (pg
->should_restart_peering(
5994 advmap
.acting_primary
,
5999 ldout(pg
->cct
, 10) << "should_restart_peering, transitioning to Reset"
6002 return transit
< Reset
>();
6004 pg
->remove_down_peer_info(advmap
.osdmap
);
6005 return discard_event();
6008 boost::statechart::result
PG::RecoveryState::Started::react(const QueryState
& q
)
6010 q
.f
->open_object_section("state");
6011 q
.f
->dump_string("name", state_name
);
6012 q
.f
->dump_stream("enter_time") << enter_time
;
6013 q
.f
->close_section();
6014 return discard_event();
6017 void PG::RecoveryState::Started::exit()
6019 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6020 PG
*pg
= context
< RecoveryMachine
>().pg
;
6021 utime_t dur
= ceph_clock_now() - enter_time
;
6022 pg
->osd
->recoverystate_perf
->tinc(rs_started_latency
, dur
);
6025 /*--------Reset---------*/
6026 PG::RecoveryState::Reset::Reset(my_context ctx
)
6028 NamedState(context
< RecoveryMachine
>().pg
, "Reset")
6030 context
< RecoveryMachine
>().log_enter(state_name
);
6031 PG
*pg
= context
< RecoveryMachine
>().pg
;
6033 pg
->flushes_in_progress
= 0;
6034 pg
->set_last_peering_reset();
6037 boost::statechart::result
6038 PG::RecoveryState::Reset::react(const FlushedEvt
&)
6040 PG
*pg
= context
< RecoveryMachine
>().pg
;
6042 return discard_event();
6045 boost::statechart::result
6046 PG::RecoveryState::Reset::react(const IntervalFlush
&)
6048 PG
*pg
= context
< RecoveryMachine
>().pg
;
6049 ldout(pg
->cct
, 10) << "Ending blocked outgoing recovery messages" << dendl
;
6050 context
< RecoveryMachine
>().pg
->recovery_state
.end_block_outgoing();
6051 return discard_event();
6054 boost::statechart::result
PG::RecoveryState::Reset::react(const AdvMap
& advmap
)
6056 PG
*pg
= context
< RecoveryMachine
>().pg
;
6057 ldout(pg
->cct
, 10) << "Reset advmap" << dendl
;
6059 pg
->check_full_transition(advmap
.lastmap
, advmap
.osdmap
);
6061 if (pg
->should_restart_peering(
6063 advmap
.acting_primary
,
6068 ldout(pg
->cct
, 10) << "should restart peering, calling start_peering_interval again"
6070 pg
->start_peering_interval(
6072 advmap
.newup
, advmap
.up_primary
,
6073 advmap
.newacting
, advmap
.acting_primary
,
6074 context
< RecoveryMachine
>().get_cur_transaction());
6076 pg
->remove_down_peer_info(advmap
.osdmap
);
6077 pg
->check_past_interval_bounds();
6078 return discard_event();
6081 boost::statechart::result
PG::RecoveryState::Reset::react(const ActMap
&)
6083 PG
*pg
= context
< RecoveryMachine
>().pg
;
6084 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
6085 context
< RecoveryMachine
>().send_notify(
6088 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
6089 pg
->get_osdmap()->get_epoch(),
6090 pg
->get_osdmap()->get_epoch(),
6092 pg
->past_intervals
);
6095 pg
->update_heartbeat_peers();
6098 return transit
< Started
>();
6101 boost::statechart::result
PG::RecoveryState::Reset::react(const QueryState
& q
)
6103 q
.f
->open_object_section("state");
6104 q
.f
->dump_string("name", state_name
);
6105 q
.f
->dump_stream("enter_time") << enter_time
;
6106 q
.f
->close_section();
6107 return discard_event();
6110 void PG::RecoveryState::Reset::exit()
6112 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6113 PG
*pg
= context
< RecoveryMachine
>().pg
;
6114 utime_t dur
= ceph_clock_now() - enter_time
;
6115 pg
->osd
->recoverystate_perf
->tinc(rs_reset_latency
, dur
);
6118 /*-------Start---------*/
6119 PG::RecoveryState::Start::Start(my_context ctx
)
6121 NamedState(context
< RecoveryMachine
>().pg
, "Start")
6123 context
< RecoveryMachine
>().log_enter(state_name
);
6125 PG
*pg
= context
< RecoveryMachine
>().pg
;
6126 if (pg
->is_primary()) {
6127 ldout(pg
->cct
, 1) << "transitioning to Primary" << dendl
;
6128 post_event(MakePrimary());
6130 ldout(pg
->cct
, 1) << "transitioning to Stray" << dendl
;
6131 post_event(MakeStray());
6135 void PG::RecoveryState::Start::exit()
6137 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6138 PG
*pg
= context
< RecoveryMachine
>().pg
;
6139 utime_t dur
= ceph_clock_now() - enter_time
;
6140 pg
->osd
->recoverystate_perf
->tinc(rs_start_latency
, dur
);
6143 /*---------Primary--------*/
6144 PG::RecoveryState::Primary::Primary(my_context ctx
)
6146 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary")
6148 context
< RecoveryMachine
>().log_enter(state_name
);
6149 PG
*pg
= context
< RecoveryMachine
>().pg
;
6150 assert(pg
->want_acting
.empty());
6152 // set CREATING bit until we have peered for the first time.
6153 if (pg
->info
.history
.last_epoch_started
== 0) {
6154 pg
->state_set(PG_STATE_CREATING
);
6155 // use the history timestamp, which ultimately comes from the
6156 // monitor in the create case.
6157 utime_t t
= pg
->info
.history
.last_scrub_stamp
;
6158 pg
->info
.stats
.last_fresh
= t
;
6159 pg
->info
.stats
.last_active
= t
;
6160 pg
->info
.stats
.last_change
= t
;
6161 pg
->info
.stats
.last_peered
= t
;
6162 pg
->info
.stats
.last_clean
= t
;
6163 pg
->info
.stats
.last_unstale
= t
;
6164 pg
->info
.stats
.last_undegraded
= t
;
6165 pg
->info
.stats
.last_fullsized
= t
;
6166 pg
->info
.stats
.last_scrub_stamp
= t
;
6167 pg
->info
.stats
.last_deep_scrub_stamp
= t
;
6168 pg
->info
.stats
.last_clean_scrub_stamp
= t
;
6172 boost::statechart::result
PG::RecoveryState::Primary::react(const MNotifyRec
& notevt
)
6174 PG
*pg
= context
< RecoveryMachine
>().pg
;
6175 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
6176 pg
->proc_replica_info(
6177 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
6178 return discard_event();
6181 boost::statechart::result
PG::RecoveryState::Primary::react(const ActMap
&)
6183 PG
*pg
= context
< RecoveryMachine
>().pg
;
6184 ldout(pg
->cct
, 7) << "handle ActMap primary" << dendl
;
6185 pg
->publish_stats_to_osd();
6187 return discard_event();
6190 void PG::RecoveryState::Primary::exit()
6192 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6193 PG
*pg
= context
< RecoveryMachine
>().pg
;
6194 pg
->want_acting
.clear();
6195 utime_t dur
= ceph_clock_now() - enter_time
;
6196 pg
->osd
->recoverystate_perf
->tinc(rs_primary_latency
, dur
);
6197 pg
->clear_primary_state();
6198 pg
->state_clear(PG_STATE_CREATING
);
6201 /*---------Peering--------*/
6202 PG::RecoveryState::Peering::Peering(my_context ctx
)
6204 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering"),
6205 history_les_bound(false)
6207 context
< RecoveryMachine
>().log_enter(state_name
);
6209 PG
*pg
= context
< RecoveryMachine
>().pg
;
6210 assert(!pg
->is_peered());
6211 assert(!pg
->is_peering());
6212 assert(pg
->is_primary());
6213 pg
->state_set(PG_STATE_PEERING
);
6216 boost::statechart::result
PG::RecoveryState::Peering::react(const AdvMap
& advmap
)
6218 PG
*pg
= context
< RecoveryMachine
>().pg
;
6219 ldout(pg
->cct
, 10) << "Peering advmap" << dendl
;
6220 if (prior_set
.affected_by_map(*(advmap
.osdmap
), pg
)) {
6221 ldout(pg
->cct
, 1) << "Peering, affected_by_map, going to Reset" << dendl
;
6223 return transit
< Reset
>();
6226 pg
->adjust_need_up_thru(advmap
.osdmap
);
6228 return forward_event();
6231 boost::statechart::result
PG::RecoveryState::Peering::react(const QueryState
& q
)
6233 PG
*pg
= context
< RecoveryMachine
>().pg
;
6235 q
.f
->open_object_section("state");
6236 q
.f
->dump_string("name", state_name
);
6237 q
.f
->dump_stream("enter_time") << enter_time
;
6239 q
.f
->open_array_section("past_intervals");
6240 pg
->past_intervals
.dump(q
.f
);
6241 q
.f
->close_section();
6243 q
.f
->open_array_section("probing_osds");
6244 for (set
<pg_shard_t
>::iterator p
= prior_set
.probe
.begin();
6245 p
!= prior_set
.probe
.end();
6247 q
.f
->dump_stream("osd") << *p
;
6248 q
.f
->close_section();
6250 if (prior_set
.pg_down
)
6251 q
.f
->dump_string("blocked", "peering is blocked due to down osds");
6253 q
.f
->open_array_section("down_osds_we_would_probe");
6254 for (set
<int>::iterator p
= prior_set
.down
.begin();
6255 p
!= prior_set
.down
.end();
6257 q
.f
->dump_int("osd", *p
);
6258 q
.f
->close_section();
6260 q
.f
->open_array_section("peering_blocked_by");
6261 for (map
<int,epoch_t
>::iterator p
= prior_set
.blocked_by
.begin();
6262 p
!= prior_set
.blocked_by
.end();
6264 q
.f
->open_object_section("osd");
6265 q
.f
->dump_int("osd", p
->first
);
6266 q
.f
->dump_int("current_lost_at", p
->second
);
6267 q
.f
->dump_string("comment", "starting or marking this osd lost may let us proceed");
6268 q
.f
->close_section();
6270 q
.f
->close_section();
6272 if (history_les_bound
) {
6273 q
.f
->open_array_section("peering_blocked_by_detail");
6274 q
.f
->open_object_section("item");
6275 q
.f
->dump_string("detail","peering_blocked_by_history_les_bound");
6276 q
.f
->close_section();
6277 q
.f
->close_section();
6280 q
.f
->close_section();
6281 return forward_event();
6284 void PG::RecoveryState::Peering::exit()
6286 PG
*pg
= context
< RecoveryMachine
>().pg
;
6287 ldout(pg
->cct
, 10) << "Leaving Peering" << dendl
;
6288 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6289 pg
->state_clear(PG_STATE_PEERING
);
6290 pg
->clear_probe_targets();
6292 utime_t dur
= ceph_clock_now() - enter_time
;
6293 pg
->osd
->recoverystate_perf
->tinc(rs_peering_latency
, dur
);
6297 /*------Backfilling-------*/
6298 PG::RecoveryState::Backfilling::Backfilling(my_context ctx
)
6300 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Backfilling")
6302 context
< RecoveryMachine
>().log_enter(state_name
);
6303 PG
*pg
= context
< RecoveryMachine
>().pg
;
6304 pg
->backfill_reserved
= true;
6305 pg
->queue_recovery();
6306 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
6307 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
6308 pg
->state_set(PG_STATE_BACKFILL
);
6309 pg
->publish_stats_to_osd();
6312 boost::statechart::result
6313 PG::RecoveryState::Backfilling::react(const CancelBackfill
&)
6315 PG
*pg
= context
< RecoveryMachine
>().pg
;
6316 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6317 // XXX: Add a new pg state so user can see why backfill isn't proceeding
6318 // Can't use PG_STATE_BACKFILL_WAIT since it means waiting for reservations
6319 //pg->state_set(PG_STATE_BACKFILL_STALLED????);
6321 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6322 it
!= pg
->backfill_targets
.end();
6324 assert(*it
!= pg
->pg_whoami
);
6325 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6326 it
->osd
, pg
->get_osdmap()->get_epoch());
6328 pg
->osd
->send_message_osd_cluster(
6329 new MBackfillReserve(
6330 MBackfillReserve::REJECT
,
6331 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6332 pg
->get_osdmap()->get_epoch()),
6337 pg
->waiting_on_backfill
.clear();
6339 pg
->schedule_backfill_full_retry();
6340 return transit
<NotBackfilling
>();
6343 boost::statechart::result
6344 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected
&)
6346 PG
*pg
= context
< RecoveryMachine
>().pg
;
6347 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6348 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
6350 for (set
<pg_shard_t
>::iterator it
= pg
->backfill_targets
.begin();
6351 it
!= pg
->backfill_targets
.end();
6353 assert(*it
!= pg
->pg_whoami
);
6354 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6355 it
->osd
, pg
->get_osdmap()->get_epoch());
6357 pg
->osd
->send_message_osd_cluster(
6358 new MBackfillReserve(
6359 MBackfillReserve::REJECT
,
6360 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6361 pg
->get_osdmap()->get_epoch()),
6366 pg
->waiting_on_backfill
.clear();
6367 pg
->finish_recovery_op(hobject_t::get_max());
6369 pg
->schedule_backfill_full_retry();
6370 return transit
<NotBackfilling
>();
6373 void PG::RecoveryState::Backfilling::exit()
6375 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6376 PG
*pg
= context
< RecoveryMachine
>().pg
;
6377 pg
->backfill_reserved
= false;
6378 pg
->backfill_reserving
= false;
6379 pg
->state_clear(PG_STATE_BACKFILL
);
6380 utime_t dur
= ceph_clock_now() - enter_time
;
6381 pg
->osd
->recoverystate_perf
->tinc(rs_backfilling_latency
, dur
);
6384 /*--WaitRemoteBackfillReserved--*/
6386 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx
)
6388 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6389 backfill_osd_it(context
< Active
>().remote_shards_to_reserve_backfill
.begin())
6391 context
< RecoveryMachine
>().log_enter(state_name
);
6392 PG
*pg
= context
< RecoveryMachine
>().pg
;
6393 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6394 pg
->publish_stats_to_osd();
6395 post_event(RemoteBackfillReserved());
6398 boost::statechart::result
6399 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved
&evt
)
6401 PG
*pg
= context
< RecoveryMachine
>().pg
;
6403 if (backfill_osd_it
!= context
< Active
>().remote_shards_to_reserve_backfill
.end()) {
6404 //The primary never backfills itself
6405 assert(*backfill_osd_it
!= pg
->pg_whoami
);
6406 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6407 backfill_osd_it
->osd
, pg
->get_osdmap()->get_epoch());
6409 pg
->osd
->send_message_osd_cluster(
6410 new MBackfillReserve(
6411 MBackfillReserve::REQUEST
,
6412 spg_t(pg
->info
.pgid
.pgid
, backfill_osd_it
->shard
),
6413 pg
->get_osdmap()->get_epoch(),
6414 pg
->get_backfill_priority()),
6419 post_event(AllBackfillsReserved());
6421 return discard_event();
6424 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6426 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6427 PG
*pg
= context
< RecoveryMachine
>().pg
;
6428 utime_t dur
= ceph_clock_now() - enter_time
;
6429 pg
->osd
->recoverystate_perf
->tinc(rs_waitremotebackfillreserved_latency
, dur
);
6432 boost::statechart::result
6433 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected
&evt
)
6435 PG
*pg
= context
< RecoveryMachine
>().pg
;
6436 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6438 // Send REJECT to all previously acquired reservations
6439 set
<pg_shard_t
>::const_iterator it
, begin
, end
, next
;
6440 begin
= context
< Active
>().remote_shards_to_reserve_backfill
.begin();
6441 end
= context
< Active
>().remote_shards_to_reserve_backfill
.end();
6442 assert(begin
!= end
);
6443 for (next
= it
= begin
, ++next
; next
!= backfill_osd_it
; ++it
, ++next
) {
6444 //The primary never backfills itself
6445 assert(*it
!= pg
->pg_whoami
);
6446 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6447 it
->osd
, pg
->get_osdmap()->get_epoch());
6449 pg
->osd
->send_message_osd_cluster(
6450 new MBackfillReserve(
6451 MBackfillReserve::REJECT
,
6452 spg_t(pg
->info
.pgid
.pgid
, it
->shard
),
6453 pg
->get_osdmap()->get_epoch()),
6458 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
6459 pg
->state_set(PG_STATE_BACKFILL_TOOFULL
);
6460 pg
->publish_stats_to_osd();
6462 pg
->schedule_backfill_full_retry();
6464 return transit
<NotBackfilling
>();
6467 /*--WaitLocalBackfillReserved--*/
6468 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx
)
6470 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalBackfillReserved")
6472 context
< RecoveryMachine
>().log_enter(state_name
);
6473 PG
*pg
= context
< RecoveryMachine
>().pg
;
6474 pg
->state_set(PG_STATE_BACKFILL_WAIT
);
6475 pg
->osd
->local_reserver
.request_reservation(
6477 new QueuePeeringEvt
<LocalBackfillReserved
>(
6478 pg
, pg
->get_osdmap()->get_epoch(),
6479 LocalBackfillReserved()),
6480 pg
->get_backfill_priority());
6481 pg
->publish_stats_to_osd();
6484 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6486 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6487 PG
*pg
= context
< RecoveryMachine
>().pg
;
6488 utime_t dur
= ceph_clock_now() - enter_time
;
6489 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalbackfillreserved_latency
, dur
);
6492 /*----NotBackfilling------*/
6493 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx
)
6495 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotBackfilling")
6497 context
< RecoveryMachine
>().log_enter(state_name
);
6498 PG
*pg
= context
< RecoveryMachine
>().pg
;
6499 pg
->publish_stats_to_osd();
6502 boost::statechart::result
6503 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved
&evt
)
6505 return discard_event();
6508 boost::statechart::result
6509 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected
&evt
)
6511 return discard_event();
6514 void PG::RecoveryState::NotBackfilling::exit()
6516 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6517 PG
*pg
= context
< RecoveryMachine
>().pg
;
6518 utime_t dur
= ceph_clock_now() - enter_time
;
6519 pg
->osd
->recoverystate_perf
->tinc(rs_notbackfilling_latency
, dur
);
6522 /*----NotRecovering------*/
6523 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx
)
6525 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/NotRecovering")
6527 context
< RecoveryMachine
>().log_enter(state_name
);
6528 PG
*pg
= context
< RecoveryMachine
>().pg
;
6529 pg
->publish_stats_to_osd();
6532 void PG::RecoveryState::NotRecovering::exit()
6534 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6535 PG
*pg
= context
< RecoveryMachine
>().pg
;
6536 utime_t dur
= ceph_clock_now() - enter_time
;
6537 pg
->osd
->recoverystate_perf
->tinc(rs_notrecovering_latency
, dur
);
6540 /*---RepNotRecovering----*/
6541 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx
)
6543 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepNotRecovering")
6545 context
< RecoveryMachine
>().log_enter(state_name
);
6548 void PG::RecoveryState::RepNotRecovering::exit()
6550 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6551 PG
*pg
= context
< RecoveryMachine
>().pg
;
6552 utime_t dur
= ceph_clock_now() - enter_time
;
6553 pg
->osd
->recoverystate_perf
->tinc(rs_repnotrecovering_latency
, dur
);
6556 /*---RepWaitRecoveryReserved--*/
6557 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx
)
6559 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitRecoveryReserved")
6561 context
< RecoveryMachine
>().log_enter(state_name
);
6562 PG
*pg
= context
< RecoveryMachine
>().pg
;
6564 pg
->osd
->remote_reserver
.request_reservation(
6566 new QueuePeeringEvt
<RemoteRecoveryReserved
>(
6567 pg
, pg
->get_osdmap()->get_epoch(),
6568 RemoteRecoveryReserved()),
6569 pg
->get_recovery_priority());
6572 boost::statechart::result
6573 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved
&evt
)
6575 PG
*pg
= context
< RecoveryMachine
>().pg
;
6576 pg
->osd
->send_message_osd_cluster(
6578 new MRecoveryReserve(
6579 MRecoveryReserve::GRANT
,
6580 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
6581 pg
->get_osdmap()->get_epoch()),
6582 pg
->get_osdmap()->get_epoch());
6583 return transit
<RepRecovering
>();
6586 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6588 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6589 PG
*pg
= context
< RecoveryMachine
>().pg
;
6590 utime_t dur
= ceph_clock_now() - enter_time
;
6591 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitrecoveryreserved_latency
, dur
);
6594 /*-RepWaitBackfillReserved*/
6595 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx
)
6597 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepWaitBackfillReserved")
6599 context
< RecoveryMachine
>().log_enter(state_name
);
6602 boost::statechart::result
6603 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio
&evt
)
6605 PG
*pg
= context
< RecoveryMachine
>().pg
;
6608 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
6609 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
6610 ldout(pg
->cct
, 10) << "backfill reservation rejected: failure injection"
6612 post_event(RemoteReservationRejected());
6613 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
6614 pg
->osd
->check_backfill_full(ss
)) {
6615 ldout(pg
->cct
, 10) << "backfill reservation rejected: "
6616 << ss
.str() << dendl
;
6617 post_event(RemoteReservationRejected());
6619 pg
->osd
->remote_reserver
.request_reservation(
6621 new QueuePeeringEvt
<RemoteBackfillReserved
>(
6622 pg
, pg
->get_osdmap()->get_epoch(),
6623 RemoteBackfillReserved()), evt
.priority
);
6625 return transit
<RepWaitBackfillReserved
>();
6628 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6630 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6631 PG
*pg
= context
< RecoveryMachine
>().pg
;
6632 utime_t dur
= ceph_clock_now() - enter_time
;
6633 pg
->osd
->recoverystate_perf
->tinc(rs_repwaitbackfillreserved_latency
, dur
);
6636 boost::statechart::result
6637 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved
&evt
)
6639 PG
*pg
= context
< RecoveryMachine
>().pg
;
6642 if (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
> 0 &&
6643 (rand()%1000 < (pg
->cct
->_conf
->osd_debug_reject_backfill_probability
*1000.0))) {
6644 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
6645 << "failure injection" << dendl
;
6646 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
6647 post_event(RemoteReservationRejected());
6648 return discard_event();
6649 } else if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_backfill_reservation
&&
6650 pg
->osd
->check_backfill_full(ss
)) {
6651 ldout(pg
->cct
, 10) << "backfill reservation rejected after reservation: "
6652 << ss
.str() << dendl
;
6653 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
6654 post_event(RemoteReservationRejected());
6655 return discard_event();
6657 pg
->osd
->send_message_osd_cluster(
6659 new MBackfillReserve(
6660 MBackfillReserve::GRANT
,
6661 spg_t(pg
->info
.pgid
.pgid
, pg
->primary
.shard
),
6662 pg
->get_osdmap()->get_epoch()),
6663 pg
->get_osdmap()->get_epoch());
6664 return transit
<RepRecovering
>();
6668 boost::statechart::result
6669 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteReservationRejected
&evt
)
6671 PG
*pg
= context
< RecoveryMachine
>().pg
;
6672 pg
->reject_reservation();
6673 return transit
<RepNotRecovering
>();
6676 /*---RepRecovering-------*/
6677 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx
)
6679 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive/RepRecovering")
6681 context
< RecoveryMachine
>().log_enter(state_name
);
6684 boost::statechart::result
6685 PG::RecoveryState::RepRecovering::react(const BackfillTooFull
&)
6687 PG
*pg
= context
< RecoveryMachine
>().pg
;
6688 pg
->reject_reservation();
6689 return discard_event();
6692 void PG::RecoveryState::RepRecovering::exit()
6694 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6695 PG
*pg
= context
< RecoveryMachine
>().pg
;
6696 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
6697 utime_t dur
= ceph_clock_now() - enter_time
;
6698 pg
->osd
->recoverystate_perf
->tinc(rs_reprecovering_latency
, dur
);
6701 /*------Activating--------*/
6702 PG::RecoveryState::Activating::Activating(my_context ctx
)
6704 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Activating")
6706 context
< RecoveryMachine
>().log_enter(state_name
);
6709 void PG::RecoveryState::Activating::exit()
6711 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6712 PG
*pg
= context
< RecoveryMachine
>().pg
;
6713 utime_t dur
= ceph_clock_now() - enter_time
;
6714 pg
->osd
->recoverystate_perf
->tinc(rs_activating_latency
, dur
);
6717 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx
)
6719 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitLocalRecoveryReserved")
6721 context
< RecoveryMachine
>().log_enter(state_name
);
6722 PG
*pg
= context
< RecoveryMachine
>().pg
;
6724 // Make sure all nodes that part of the recovery aren't full
6725 if (!pg
->cct
->_conf
->osd_debug_skip_full_check_in_recovery
&&
6726 pg
->osd
->check_osdmap_full(pg
->actingbackfill
)) {
6727 post_event(RecoveryTooFull());
6731 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
6732 pg
->state_set(PG_STATE_RECOVERY_WAIT
);
6733 pg
->osd
->local_reserver
.request_reservation(
6735 new QueuePeeringEvt
<LocalRecoveryReserved
>(
6736 pg
, pg
->get_osdmap()->get_epoch(),
6737 LocalRecoveryReserved()),
6738 pg
->get_recovery_priority());
6739 pg
->publish_stats_to_osd();
6742 boost::statechart::result
6743 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull
&evt
)
6745 PG
*pg
= context
< RecoveryMachine
>().pg
;
6746 pg
->state_set(PG_STATE_RECOVERY_TOOFULL
);
6747 pg
->schedule_recovery_full_retry();
6748 return transit
<NotRecovering
>();
6751 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6753 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6754 PG
*pg
= context
< RecoveryMachine
>().pg
;
6755 utime_t dur
= ceph_clock_now() - enter_time
;
6756 pg
->osd
->recoverystate_perf
->tinc(rs_waitlocalrecoveryreserved_latency
, dur
);
6759 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx
)
6761 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6762 remote_recovery_reservation_it(context
< Active
>().remote_shards_to_reserve_recovery
.begin())
6764 context
< RecoveryMachine
>().log_enter(state_name
);
6765 post_event(RemoteRecoveryReserved());
6768 boost::statechart::result
6769 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved
&evt
) {
6770 PG
*pg
= context
< RecoveryMachine
>().pg
;
6772 if (remote_recovery_reservation_it
!= context
< Active
>().remote_shards_to_reserve_recovery
.end()) {
6773 assert(*remote_recovery_reservation_it
!= pg
->pg_whoami
);
6774 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6775 remote_recovery_reservation_it
->osd
, pg
->get_osdmap()->get_epoch());
6777 pg
->osd
->send_message_osd_cluster(
6778 new MRecoveryReserve(
6779 MRecoveryReserve::REQUEST
,
6780 spg_t(pg
->info
.pgid
.pgid
, remote_recovery_reservation_it
->shard
),
6781 pg
->get_osdmap()->get_epoch()),
6784 ++remote_recovery_reservation_it
;
6786 post_event(AllRemotesReserved());
6788 return discard_event();
6791 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6793 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6794 PG
*pg
= context
< RecoveryMachine
>().pg
;
6795 utime_t dur
= ceph_clock_now() - enter_time
;
6796 pg
->osd
->recoverystate_perf
->tinc(rs_waitremoterecoveryreserved_latency
, dur
);
6799 PG::RecoveryState::Recovering::Recovering(my_context ctx
)
6801 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovering")
6803 context
< RecoveryMachine
>().log_enter(state_name
);
6805 PG
*pg
= context
< RecoveryMachine
>().pg
;
6806 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
6807 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
6808 pg
->state_set(PG_STATE_RECOVERING
);
6809 pg
->publish_stats_to_osd();
6810 pg
->queue_recovery();
6813 void PG::RecoveryState::Recovering::release_reservations(bool cancel
)
6815 PG
*pg
= context
< RecoveryMachine
>().pg
;
6816 assert(cancel
|| !pg
->pg_log
.get_missing().have_missing());
6818 // release remote reservations
6819 for (set
<pg_shard_t
>::const_iterator i
=
6820 context
< Active
>().remote_shards_to_reserve_recovery
.begin();
6821 i
!= context
< Active
>().remote_shards_to_reserve_recovery
.end();
6823 if (*i
== pg
->pg_whoami
) // skip myself
6825 ConnectionRef con
= pg
->osd
->get_con_osd_cluster(
6826 i
->osd
, pg
->get_osdmap()->get_epoch());
6828 pg
->osd
->send_message_osd_cluster(
6829 new MRecoveryReserve(
6830 MRecoveryReserve::RELEASE
,
6831 spg_t(pg
->info
.pgid
.pgid
, i
->shard
),
6832 pg
->get_osdmap()->get_epoch()),
6838 boost::statechart::result
6839 PG::RecoveryState::Recovering::react(const AllReplicasRecovered
&evt
)
6841 PG
*pg
= context
< RecoveryMachine
>().pg
;
6842 pg
->state_clear(PG_STATE_RECOVERING
);
6843 release_reservations();
6844 return transit
<Recovered
>();
6847 boost::statechart::result
6848 PG::RecoveryState::Recovering::react(const RequestBackfill
&evt
)
6850 PG
*pg
= context
< RecoveryMachine
>().pg
;
6851 pg
->state_clear(PG_STATE_RECOVERING
);
6852 release_reservations();
6853 return transit
<WaitRemoteBackfillReserved
>();
6856 boost::statechart::result
6857 PG::RecoveryState::Recovering::react(const CancelRecovery
&evt
)
6859 PG
*pg
= context
< RecoveryMachine
>().pg
;
6860 pg
->state_clear(PG_STATE_RECOVERING
);
6861 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
6862 release_reservations(true);
6863 pg
->schedule_recovery_full_retry();
6864 return transit
<NotRecovering
>();
6867 void PG::RecoveryState::Recovering::exit()
6869 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6870 PG
*pg
= context
< RecoveryMachine
>().pg
;
6871 utime_t dur
= ceph_clock_now() - enter_time
;
6872 pg
->osd
->recoverystate_perf
->tinc(rs_recovering_latency
, dur
);
// Recovered state entry: recovery is complete on all shards.  Cancels
// the local reservation, re-evaluates DEGRADED, possibly re-chooses the
// acting set, and posts GoClean once all replicas have activated.
// NOTE(review): this block was mangled by extraction -- scaffolding
// lines (initializer list, braces, and the statement(s) following the
// "trim pglog on recovered" comment) are missing from this view, so
// the annotations below cover only the visible logic.
6875 PG::RecoveryState::Recovered::Recovered(my_context ctx
)
6877 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Recovered")
6879 pg_shard_t auth_log_shard
;
6881 context
< RecoveryMachine
>().log_enter(state_name
);
6883 PG
*pg
= context
< RecoveryMachine
>().pg
;
// We no longer need the local recovery reservation slot.
6884 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
// Sanity: reaching Recovered implies nothing is left to recover.
6886 assert(!pg
->needs_recovery());
6888 // if we finished backfill, all acting are active; recheck if
6889 // DEGRADED | UNDERSIZED is appropriate.
6890 assert(!pg
->actingbackfill
.empty());
6891 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <=
6892 pg
->actingbackfill
.size()) {
6893 pg
->state_clear(PG_STATE_DEGRADED
);
6894 pg
->publish_stats_to_osd();
6897 // trim pglog on recovered
// NOTE(review): the statement(s) implementing the log trim are not
// visible in this extraction -- presumably a pg log trim call; confirm
// against the upstream file.
6900 // adjust acting set? (e.g. because backfill completed...)
6901 bool history_les_bound
= false;
// If choose_acting fails here it must be because it wants a different
// acting set (want_acting non-empty), not a hard error.
6902 if (pg
->acting
!= pg
->up
&& !pg
->choose_acting(auth_log_shard
,
6903 true, &history_les_bound
))
6904 assert(pg
->want_acting
.size());
// Only go clean once every replica has activated and committed.
6906 if (context
< Active
>().all_replicas_activated
)
6907 post_event(GoClean());
6910 void PG::RecoveryState::Recovered::exit()
6912 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6913 PG
*pg
= context
< RecoveryMachine
>().pg
;
6914 utime_t dur
= ceph_clock_now() - enter_time
;
6915 pg
->osd
->recoverystate_perf
->tinc(rs_recovered_latency
, dur
);
// Clean state entry: the PG is fully recovered and consistent; finish
// recovery bookkeeping, share/publish info, and requeue held ops.
// NOTE(review): extraction dropped the bodies of both conditionals
// below (the branch taken when last_complete != last_update, and the
// statement(s) guarded by is_active()); annotations cover only what is
// visible -- confirm the missing branches against the upstream file.
6918 PG::RecoveryState::Clean::Clean(my_context ctx
)
6920 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active/Clean")
6922 context
< RecoveryMachine
>().log_enter(state_name
);
6924 PG
*pg
= context
< RecoveryMachine
>().pg
;
// A clean PG must have last_complete == last_update; the (elided)
// branch body handles the violation.
6926 if (pg
->info
.last_complete
!= pg
->info
.last_update
) {
// Flush recovery completion callbacks once the transaction is safe.
6929 pg
->finish_recovery(*context
< RecoveryMachine
>().get_on_safe_context_list());
6931 if (pg
->is_active()) {
// Let peers and the mon know we are clean, then release any ops that
// were waiting for the PG to become clean.
6935 pg
->share_pg_info();
6936 pg
->publish_stats_to_osd();
6937 pg
->requeue_ops(pg
->waiting_for_clean_to_primary_repair
);
6940 void PG::RecoveryState::Clean::exit()
6942 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
6943 PG
*pg
= context
< RecoveryMachine
>().pg
;
6944 pg
->state_clear(PG_STATE_CLEAN
);
6945 utime_t dur
= ceph_clock_now() - enter_time
;
6946 pg
->osd
->recoverystate_perf
->tinc(rs_clean_latency
, dur
);
6949 template <typename T
>
6950 set
<pg_shard_t
> unique_osd_shard_set(const pg_shard_t
& skip
, const T
&in
)
6952 set
<int> osds_found
;
6953 set
<pg_shard_t
> out
;
6954 for (typename
T::const_iterator i
= in
.begin();
6957 if (*i
!= skip
&& !osds_found
.count(i
->osd
)) {
6958 osds_found
.insert(i
->osd
);
6965 /*---------Active---------*/
6966 PG::RecoveryState::Active::Active(my_context ctx
)
6968 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Active"),
6969 remote_shards_to_reserve_recovery(
6970 unique_osd_shard_set(
6971 context
< RecoveryMachine
>().pg
->pg_whoami
,
6972 context
< RecoveryMachine
>().pg
->actingbackfill
)),
6973 remote_shards_to_reserve_backfill(
6974 unique_osd_shard_set(
6975 context
< RecoveryMachine
>().pg
->pg_whoami
,
6976 context
< RecoveryMachine
>().pg
->backfill_targets
)),
6977 all_replicas_activated(false)
6979 context
< RecoveryMachine
>().log_enter(state_name
);
6981 PG
*pg
= context
< RecoveryMachine
>().pg
;
6983 assert(!pg
->backfill_reserving
);
6984 assert(!pg
->backfill_reserved
);
6985 assert(pg
->is_primary());
6986 ldout(pg
->cct
, 10) << "In Active, about to call activate" << dendl
;
6988 context
< RecoveryMachine
>().get_cur_transaction(),
6989 context
< RecoveryMachine
>().get_on_applied_context_list(),
6990 context
< RecoveryMachine
>().get_on_safe_context_list());
6991 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
6992 pg
->get_osdmap()->get_epoch(),
6993 *context
< RecoveryMachine
>().get_on_safe_context_list(),
6994 *context
< RecoveryMachine
>().get_query_map(),
6995 context
< RecoveryMachine
>().get_info_map(),
6996 context
< RecoveryMachine
>().get_recovery_ctx());
6998 // everyone has to commit/ack before we are truly active
6999 pg
->blocked_by
.clear();
7000 for (set
<pg_shard_t
>::iterator p
= pg
->actingbackfill
.begin();
7001 p
!= pg
->actingbackfill
.end();
7003 if (p
->shard
!= pg
->pg_whoami
.shard
) {
7004 pg
->blocked_by
.insert(p
->shard
);
7007 pg
->publish_stats_to_osd();
7008 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
7011 boost::statechart::result
PG::RecoveryState::Active::react(const AdvMap
& advmap
)
7013 PG
*pg
= context
< RecoveryMachine
>().pg
;
7014 ldout(pg
->cct
, 10) << "Active advmap" << dendl
;
7015 if (!pg
->pool
.newly_removed_snaps
.empty()) {
7016 pg
->snap_trimq
.union_of(pg
->pool
.newly_removed_snaps
);
7017 ldout(pg
->cct
, 10) << *pg
<< " snap_trimq now " << pg
->snap_trimq
<< dendl
;
7018 pg
->dirty_info
= true;
7019 pg
->dirty_big_info
= true;
7022 for (size_t i
= 0; i
< pg
->want_acting
.size(); i
++) {
7023 int osd
= pg
->want_acting
[i
];
7024 if (!advmap
.osdmap
->is_up(osd
)) {
7025 pg_shard_t
osd_with_shard(osd
, shard_id_t(i
));
7026 assert(pg
->is_acting(osd_with_shard
) || pg
->is_up(osd_with_shard
));
7030 bool need_publish
= false;
7031 /* Check for changes in pool size (if the acting set changed as a result,
7032 * this does not matter) */
7033 if (advmap
.lastmap
->get_pg_size(pg
->info
.pgid
.pgid
) !=
7034 pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
)) {
7035 if (pg
->get_osdmap()->get_pg_size(pg
->info
.pgid
.pgid
) <= pg
->actingset
.size()) {
7036 pg
->state_clear(PG_STATE_UNDERSIZED
);
7037 if (pg
->needs_recovery()) {
7038 pg
->state_set(PG_STATE_DEGRADED
);
7040 pg
->state_clear(PG_STATE_DEGRADED
);
7043 pg
->state_set(PG_STATE_UNDERSIZED
);
7044 pg
->state_set(PG_STATE_DEGRADED
);
7046 need_publish
= true; // degraded may have changed
7049 // if we haven't reported our PG stats in a long time, do so now.
7050 if (pg
->info
.stats
.reported_epoch
+ pg
->cct
->_conf
->osd_pg_stat_report_interval_max
< advmap
.osdmap
->get_epoch()) {
7051 ldout(pg
->cct
, 20) << "reporting stats to osd after " << (advmap
.osdmap
->get_epoch() - pg
->info
.stats
.reported_epoch
)
7052 << " epochs" << dendl
;
7053 need_publish
= true;
7057 pg
->publish_stats_to_osd();
7059 return forward_event();
7062 boost::statechart::result
PG::RecoveryState::Active::react(const ActMap
&)
7064 PG
*pg
= context
< RecoveryMachine
>().pg
;
7065 ldout(pg
->cct
, 10) << "Active: handling ActMap" << dendl
;
7066 assert(pg
->is_primary());
7068 if (pg
->have_unfound()) {
7069 // object may have become unfound
7070 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7073 if (pg
->cct
->_conf
->osd_check_for_log_corruption
)
7074 pg
->check_log_for_corruption(pg
->osd
->store
);
7076 uint64_t unfound
= pg
->missing_loc
.num_unfound();
7078 pg
->all_unfound_are_queried_or_lost(pg
->get_osdmap())) {
7079 if (pg
->cct
->_conf
->osd_auto_mark_unfound_lost
) {
7080 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has " << unfound
7081 << " objects unfound and apparently lost, would automatically marking lost but NOT IMPLEMENTED";
7083 pg
->osd
->clog
->error() << pg
->info
.pgid
.pgid
<< " has " << unfound
<< " objects unfound and apparently lost";
7086 if (pg
->is_active()) {
7087 ldout(pg
->cct
, 10) << "Active: kicking snap trim" << dendl
;
7088 pg
->kick_snap_trim();
7091 if (pg
->is_peered() &&
7093 !pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL
) &&
7094 (!pg
->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE
) || pg
->is_degraded())) {
7095 pg
->queue_recovery();
7097 return forward_event();
7100 boost::statechart::result
PG::RecoveryState::Active::react(const MNotifyRec
& notevt
)
7102 PG
*pg
= context
< RecoveryMachine
>().pg
;
7103 assert(pg
->is_primary());
7104 if (pg
->peer_info
.count(notevt
.from
)) {
7105 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7106 << ", already have info from that osd, ignoring"
7108 } else if (pg
->peer_purged
.count(notevt
.from
)) {
7109 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7110 << ", already purged that peer, ignoring"
7113 ldout(pg
->cct
, 10) << "Active: got notify from " << notevt
.from
7114 << ", calling proc_replica_info and discover_all_missing"
7116 pg
->proc_replica_info(
7117 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
);
7118 if (pg
->have_unfound()) {
7119 pg
->discover_all_missing(*context
< RecoveryMachine
>().get_query_map());
7122 return discard_event();
7125 boost::statechart::result
PG::RecoveryState::Active::react(const MInfoRec
& infoevt
)
7127 PG
*pg
= context
< RecoveryMachine
>().pg
;
7128 assert(pg
->is_primary());
7130 assert(!pg
->actingbackfill
.empty());
7131 // don't update history (yet) if we are active and primary; the replica
7132 // may be telling us they have activated (and committed) but we can't
7133 // share that until _everyone_ does the same.
7134 if (pg
->is_actingbackfill(infoevt
.from
)) {
7135 ldout(pg
->cct
, 10) << " peer osd." << infoevt
.from
7136 << " activated and committed" << dendl
;
7137 pg
->peer_activated
.insert(infoevt
.from
);
7138 pg
->blocked_by
.erase(infoevt
.from
.shard
);
7139 pg
->publish_stats_to_osd();
7140 if (pg
->peer_activated
.size() == pg
->actingbackfill
.size()) {
7141 pg
->all_activated_and_committed();
7144 return discard_event();
7147 boost::statechart::result
PG::RecoveryState::Active::react(const MLogRec
& logevt
)
7149 PG
*pg
= context
< RecoveryMachine
>().pg
;
7150 ldout(pg
->cct
, 10) << "searching osd." << logevt
.from
7151 << " log for unfound items" << dendl
;
7152 pg
->proc_replica_log(
7153 logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
7154 bool got_missing
= pg
->search_for_missing(
7155 pg
->peer_info
[logevt
.from
],
7156 pg
->peer_missing
[logevt
.from
],
7158 context
< RecoveryMachine
>().get_recovery_ctx());
7159 if (pg
->is_peered() &&
7161 pg
->queue_recovery();
7162 return discard_event();
7165 boost::statechart::result
PG::RecoveryState::Active::react(const QueryState
& q
)
7167 PG
*pg
= context
< RecoveryMachine
>().pg
;
7169 q
.f
->open_object_section("state");
7170 q
.f
->dump_string("name", state_name
);
7171 q
.f
->dump_stream("enter_time") << enter_time
;
7174 q
.f
->open_array_section("might_have_unfound");
7175 for (set
<pg_shard_t
>::iterator p
= pg
->might_have_unfound
.begin();
7176 p
!= pg
->might_have_unfound
.end();
7178 q
.f
->open_object_section("osd");
7179 q
.f
->dump_stream("osd") << *p
;
7180 if (pg
->peer_missing
.count(*p
)) {
7181 q
.f
->dump_string("status", "already probed");
7182 } else if (pg
->peer_missing_requested
.count(*p
)) {
7183 q
.f
->dump_string("status", "querying");
7184 } else if (!pg
->get_osdmap()->is_up(p
->osd
)) {
7185 q
.f
->dump_string("status", "osd is down");
7187 q
.f
->dump_string("status", "not queried");
7189 q
.f
->close_section();
7191 q
.f
->close_section();
7194 q
.f
->open_object_section("recovery_progress");
7195 pg
->dump_recovery_info(q
.f
);
7196 q
.f
->close_section();
7200 q
.f
->open_object_section("scrub");
7201 q
.f
->dump_stream("scrubber.epoch_start") << pg
->scrubber
.epoch_start
;
7202 q
.f
->dump_bool("scrubber.active", pg
->scrubber
.active
);
7203 q
.f
->dump_string("scrubber.state", Scrubber::state_string(pg
->scrubber
.state
));
7204 q
.f
->dump_stream("scrubber.start") << pg
->scrubber
.start
;
7205 q
.f
->dump_stream("scrubber.end") << pg
->scrubber
.end
;
7206 q
.f
->dump_stream("scrubber.subset_last_update") << pg
->scrubber
.subset_last_update
;
7207 q
.f
->dump_bool("scrubber.deep", pg
->scrubber
.deep
);
7208 q
.f
->dump_unsigned("scrubber.seed", pg
->scrubber
.seed
);
7209 q
.f
->dump_int("scrubber.waiting_on", pg
->scrubber
.waiting_on
);
7211 q
.f
->open_array_section("scrubber.waiting_on_whom");
7212 for (set
<pg_shard_t
>::iterator p
= pg
->scrubber
.waiting_on_whom
.begin();
7213 p
!= pg
->scrubber
.waiting_on_whom
.end();
7215 q
.f
->dump_stream("shard") << *p
;
7217 q
.f
->close_section();
7219 q
.f
->close_section();
7222 q
.f
->close_section();
7223 return forward_event();
// All actingbackfill peers have activated and committed: the PG can be
// declared ACTIVE (or merely PEERED if the acting set is below the
// pool's min_size), history epochs are advanced, and waiting ops are
// requeued.
// NOTE(review): extraction dropped several lines of this handler
// (original lines ~7246-7249 and ~7252-7255); annotations below cover
// only the visible statements -- confirm the elided ones upstream.
7226 boost::statechart::result
PG::RecoveryState::Active::react(const AllReplicasActivated
&evt
)
7228 PG
*pg
= context
< RecoveryMachine
>().pg
;
7229 all_replicas_activated
= true;
7231 pg
->state_clear(PG_STATE_ACTIVATING
);
7232 pg
->state_clear(PG_STATE_CREATING
);
// ACTIVE only when we have at least min_size acting shards; otherwise
// the PG is peered but cannot serve I/O.
7233 if (pg
->acting
.size() >= pg
->pool
.info
.min_size
) {
7234 pg
->state_set(PG_STATE_ACTIVE
);
7236 pg
->state_set(PG_STATE_PEERED
);
7239 // info.last_epoch_started is set during activate()
7240 pg
->info
.history
.last_epoch_started
= pg
->info
.last_epoch_started
;
7241 pg
->info
.history
.last_interval_started
= pg
->info
.last_interval_started
;
7242 pg
->dirty_info
= true;
// Propagate the updated info/history to peers and the monitor.
7244 pg
->share_pg_info();
7245 pg
->publish_stats_to_osd();
// Requeue ops held for peering, but only once no flushes are pending.
7250 if (pg
->flushes_in_progress
== 0) {
7251 pg
->requeue_ops(pg
->waiting_for_peered
);
7256 return discard_event();
7259 void PG::RecoveryState::Active::exit()
7261 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7262 PG
*pg
= context
< RecoveryMachine
>().pg
;
7263 pg
->osd
->local_reserver
.cancel_reservation(pg
->info
.pgid
);
7265 pg
->blocked_by
.clear();
7266 pg
->backfill_reserved
= false;
7267 pg
->backfill_reserving
= false;
7268 pg
->state_clear(PG_STATE_ACTIVATING
);
7269 pg
->state_clear(PG_STATE_DEGRADED
);
7270 pg
->state_clear(PG_STATE_UNDERSIZED
);
7271 pg
->state_clear(PG_STATE_BACKFILL_TOOFULL
);
7272 pg
->state_clear(PG_STATE_BACKFILL_WAIT
);
7273 pg
->state_clear(PG_STATE_RECOVERY_WAIT
);
7274 pg
->state_clear(PG_STATE_RECOVERY_TOOFULL
);
7275 utime_t dur
= ceph_clock_now() - enter_time
;
7276 pg
->osd
->recoverystate_perf
->tinc(rs_active_latency
, dur
);
7280 /*------ReplicaActive-----*/
7281 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx
)
7283 NamedState(context
< RecoveryMachine
>().pg
, "Started/ReplicaActive")
7285 context
< RecoveryMachine
>().log_enter(state_name
);
7287 PG
*pg
= context
< RecoveryMachine
>().pg
;
7289 context
< RecoveryMachine
>().get_cur_transaction(),
7290 context
< RecoveryMachine
>().get_on_applied_context_list(),
7291 context
< RecoveryMachine
>().get_on_safe_context_list());
7295 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(
7296 const Activate
& actevt
) {
7297 PG
*pg
= context
< RecoveryMachine
>().pg
;
7298 ldout(pg
->cct
, 10) << "In ReplicaActive, about to call activate" << dendl
;
7299 map
<int, map
<spg_t
, pg_query_t
> > query_map
;
7300 pg
->activate(*context
< RecoveryMachine
>().get_cur_transaction(),
7301 actevt
.activation_epoch
,
7302 *context
< RecoveryMachine
>().get_on_safe_context_list(),
7303 query_map
, NULL
, NULL
);
7304 ldout(pg
->cct
, 10) << "Activate Finished" << dendl
;
7305 return discard_event();
7308 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MInfoRec
& infoevt
)
7310 PG
*pg
= context
< RecoveryMachine
>().pg
;
7311 pg
->proc_primary_info(*context
<RecoveryMachine
>().get_cur_transaction(),
7313 return discard_event();
7316 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MLogRec
& logevt
)
7318 PG
*pg
= context
< RecoveryMachine
>().pg
;
7319 ldout(pg
->cct
, 10) << "received log from " << logevt
.from
<< dendl
;
7320 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
7321 pg
->merge_log(*t
, logevt
.msg
->info
, logevt
.msg
->log
, logevt
.from
);
7322 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
7324 return discard_event();
7327 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const ActMap
&)
7329 PG
*pg
= context
< RecoveryMachine
>().pg
;
7330 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
7331 context
< RecoveryMachine
>().send_notify(
7334 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
7335 pg
->get_osdmap()->get_epoch(),
7336 pg
->get_osdmap()->get_epoch(),
7338 pg
->past_intervals
);
7341 return discard_event();
7344 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const MQuery
& query
)
7346 PG
*pg
= context
< RecoveryMachine
>().pg
;
7347 if (query
.query
.type
== pg_query_t::MISSING
) {
7348 pg
->update_history(query
.query
.history
);
7349 pg
->fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
7350 } // else: from prior to activation, safe to ignore
7351 return discard_event();
7354 boost::statechart::result
PG::RecoveryState::ReplicaActive::react(const QueryState
& q
)
7356 q
.f
->open_object_section("state");
7357 q
.f
->dump_string("name", state_name
);
7358 q
.f
->dump_stream("enter_time") << enter_time
;
7359 q
.f
->close_section();
7360 return forward_event();
7363 void PG::RecoveryState::ReplicaActive::exit()
7365 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7366 PG
*pg
= context
< RecoveryMachine
>().pg
;
7367 pg
->osd
->remote_reserver
.cancel_reservation(pg
->info
.pgid
);
7368 utime_t dur
= ceph_clock_now() - enter_time
;
7369 pg
->osd
->recoverystate_perf
->tinc(rs_replicaactive_latency
, dur
);
7373 PG::RecoveryState::Stray::Stray(my_context ctx
)
7375 NamedState(context
< RecoveryMachine
>().pg
, "Started/Stray")
7377 context
< RecoveryMachine
>().log_enter(state_name
);
7379 PG
*pg
= context
< RecoveryMachine
>().pg
;
7380 assert(!pg
->is_peered());
7381 assert(!pg
->is_peering());
7382 assert(!pg
->is_primary());
7384 context
< RecoveryMachine
>().get_cur_transaction(),
7385 context
< RecoveryMachine
>().get_on_applied_context_list(),
7386 context
< RecoveryMachine
>().get_on_safe_context_list());
7389 boost::statechart::result
PG::RecoveryState::Stray::react(const MLogRec
& logevt
)
7391 PG
*pg
= context
< RecoveryMachine
>().pg
;
7392 MOSDPGLog
*msg
= logevt
.msg
.get();
7393 ldout(pg
->cct
, 10) << "got info+log from osd." << logevt
.from
<< " " << msg
->info
<< " " << msg
->log
<< dendl
;
7395 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
7396 if (msg
->info
.last_backfill
== hobject_t()) {
7398 pg
->unreg_next_scrub();
7399 pg
->info
= msg
->info
;
7400 pg
->reg_next_scrub();
7401 pg
->dirty_info
= true;
7402 pg
->dirty_big_info
= true; // maybe.
7404 PGLogEntryHandler rollbacker
{pg
, t
};
7405 pg
->pg_log
.reset_backfill_claim_log(msg
->log
, &rollbacker
);
7407 pg
->pg_log
.reset_backfill();
7409 pg
->merge_log(*t
, msg
->info
, msg
->log
, logevt
.from
);
7412 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
7414 post_event(Activate(logevt
.msg
->info
.last_epoch_started
));
7415 return transit
<ReplicaActive
>();
7418 boost::statechart::result
PG::RecoveryState::Stray::react(const MInfoRec
& infoevt
)
7420 PG
*pg
= context
< RecoveryMachine
>().pg
;
7421 ldout(pg
->cct
, 10) << "got info from osd." << infoevt
.from
<< " " << infoevt
.info
<< dendl
;
7423 if (pg
->info
.last_update
> infoevt
.info
.last_update
) {
7424 // rewind divergent log entries
7425 ObjectStore::Transaction
* t
= context
<RecoveryMachine
>().get_cur_transaction();
7426 pg
->rewind_divergent_log(*t
, infoevt
.info
.last_update
);
7427 pg
->info
.stats
= infoevt
.info
.stats
;
7428 pg
->info
.hit_set
= infoevt
.info
.hit_set
;
7431 assert(infoevt
.info
.last_update
== pg
->info
.last_update
);
7432 assert(pg
->pg_log
.get_head() == pg
->info
.last_update
);
7434 post_event(Activate(infoevt
.info
.last_epoch_started
));
7435 return transit
<ReplicaActive
>();
7438 boost::statechart::result
PG::RecoveryState::Stray::react(const MQuery
& query
)
7440 PG
*pg
= context
< RecoveryMachine
>().pg
;
7441 if (query
.query
.type
== pg_query_t::INFO
) {
7442 pair
<pg_shard_t
, pg_info_t
> notify_info
;
7443 pg
->update_history(query
.query
.history
);
7444 pg
->fulfill_info(query
.from
, query
.query
, notify_info
);
7445 context
< RecoveryMachine
>().send_notify(
7448 notify_info
.first
.shard
, pg
->pg_whoami
.shard
,
7450 pg
->get_osdmap()->get_epoch(),
7451 notify_info
.second
),
7452 pg
->past_intervals
);
7454 pg
->fulfill_log(query
.from
, query
.query
, query
.query_epoch
);
7456 return discard_event();
7459 boost::statechart::result
PG::RecoveryState::Stray::react(const ActMap
&)
7461 PG
*pg
= context
< RecoveryMachine
>().pg
;
7462 if (pg
->should_send_notify() && pg
->get_primary().osd
>= 0) {
7463 context
< RecoveryMachine
>().send_notify(
7466 pg
->get_primary().shard
, pg
->pg_whoami
.shard
,
7467 pg
->get_osdmap()->get_epoch(),
7468 pg
->get_osdmap()->get_epoch(),
7470 pg
->past_intervals
);
7473 return discard_event();
7476 void PG::RecoveryState::Stray::exit()
7478 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7479 PG
*pg
= context
< RecoveryMachine
>().pg
;
7480 utime_t dur
= ceph_clock_now() - enter_time
;
7481 pg
->osd
->recoverystate_perf
->tinc(rs_stray_latency
, dur
);
7484 /*--------GetInfo---------*/
7485 PG::RecoveryState::GetInfo::GetInfo(my_context ctx
)
7487 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetInfo")
7489 context
< RecoveryMachine
>().log_enter(state_name
);
7491 PG
*pg
= context
< RecoveryMachine
>().pg
;
7492 pg
->check_past_interval_bounds();
7493 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7495 assert(pg
->blocked_by
.empty());
7497 prior_set
= pg
->build_prior();
7499 pg
->reset_min_peer_features();
7501 if (prior_set
.pg_down
) {
7502 post_event(IsDown());
7503 } else if (peer_info_requested
.empty()) {
7504 post_event(GotInfo());
7508 void PG::RecoveryState::GetInfo::get_infos()
7510 PG
*pg
= context
< RecoveryMachine
>().pg
;
7511 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7513 pg
->blocked_by
.clear();
7514 for (set
<pg_shard_t
>::const_iterator it
= prior_set
.probe
.begin();
7515 it
!= prior_set
.probe
.end();
7517 pg_shard_t peer
= *it
;
7518 if (peer
== pg
->pg_whoami
) {
7521 if (pg
->peer_info
.count(peer
)) {
7522 ldout(pg
->cct
, 10) << " have osd." << peer
<< " info " << pg
->peer_info
[peer
] << dendl
;
7525 if (peer_info_requested
.count(peer
)) {
7526 ldout(pg
->cct
, 10) << " already requested info from osd." << peer
<< dendl
;
7527 pg
->blocked_by
.insert(peer
.osd
);
7528 } else if (!pg
->get_osdmap()->is_up(peer
.osd
)) {
7529 ldout(pg
->cct
, 10) << " not querying info from down osd." << peer
<< dendl
;
7531 ldout(pg
->cct
, 10) << " querying info from osd." << peer
<< dendl
;
7532 context
< RecoveryMachine
>().send_query(
7533 peer
, pg_query_t(pg_query_t::INFO
,
7534 it
->shard
, pg
->pg_whoami
.shard
,
7536 pg
->get_osdmap()->get_epoch()));
7537 peer_info_requested
.insert(peer
);
7538 pg
->blocked_by
.insert(peer
.osd
);
7542 pg
->publish_stats_to_osd();
7545 boost::statechart::result
PG::RecoveryState::GetInfo::react(const MNotifyRec
& infoevt
)
7547 PG
*pg
= context
< RecoveryMachine
>().pg
;
7549 set
<pg_shard_t
>::iterator p
= peer_info_requested
.find(infoevt
.from
);
7550 if (p
!= peer_info_requested
.end()) {
7551 peer_info_requested
.erase(p
);
7552 pg
->blocked_by
.erase(infoevt
.from
.osd
);
7555 epoch_t old_start
= pg
->info
.history
.last_epoch_started
;
7556 if (pg
->proc_replica_info(
7557 infoevt
.from
, infoevt
.notify
.info
, infoevt
.notify
.epoch_sent
)) {
7558 // we got something new ...
7559 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7560 if (old_start
< pg
->info
.history
.last_epoch_started
) {
7561 ldout(pg
->cct
, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl
;
7562 prior_set
= pg
->build_prior();
7564 // filter out any osds that got dropped from the probe set from
7565 // peer_info_requested. this is less expensive than restarting
7566 // peering (which would re-probe everyone).
7567 set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
7568 while (p
!= peer_info_requested
.end()) {
7569 if (prior_set
.probe
.count(*p
) == 0) {
7570 ldout(pg
->cct
, 20) << " dropping osd." << *p
<< " from info_requested, no longer in probe set" << dendl
;
7571 peer_info_requested
.erase(p
++);
7578 ldout(pg
->cct
, 20) << "Adding osd: " << infoevt
.from
.osd
<< " peer features: "
7579 << hex
<< infoevt
.features
<< dec
<< dendl
;
7580 pg
->apply_peer_features(infoevt
.features
);
7582 // are we done getting everything?
7583 if (peer_info_requested
.empty() && !prior_set
.pg_down
) {
7584 ldout(pg
->cct
, 20) << "Common peer features: " << hex
<< pg
->get_min_peer_features() << dec
<< dendl
;
7585 ldout(pg
->cct
, 20) << "Common acting features: " << hex
<< pg
->get_min_acting_features() << dec
<< dendl
;
7586 ldout(pg
->cct
, 20) << "Common upacting features: " << hex
<< pg
->get_min_upacting_features() << dec
<< dendl
;
7587 post_event(GotInfo());
7590 return discard_event();
7593 boost::statechart::result
PG::RecoveryState::GetInfo::react(const QueryState
& q
)
7595 PG
*pg
= context
< RecoveryMachine
>().pg
;
7596 q
.f
->open_object_section("state");
7597 q
.f
->dump_string("name", state_name
);
7598 q
.f
->dump_stream("enter_time") << enter_time
;
7600 q
.f
->open_array_section("requested_info_from");
7601 for (set
<pg_shard_t
>::iterator p
= peer_info_requested
.begin();
7602 p
!= peer_info_requested
.end();
7604 q
.f
->open_object_section("osd");
7605 q
.f
->dump_stream("osd") << *p
;
7606 if (pg
->peer_info
.count(*p
)) {
7607 q
.f
->open_object_section("got_info");
7608 pg
->peer_info
[*p
].dump(q
.f
);
7609 q
.f
->close_section();
7611 q
.f
->close_section();
7613 q
.f
->close_section();
7615 q
.f
->close_section();
7616 return forward_event();
7619 void PG::RecoveryState::GetInfo::exit()
7621 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7622 PG
*pg
= context
< RecoveryMachine
>().pg
;
7623 utime_t dur
= ceph_clock_now() - enter_time
;
7624 pg
->osd
->recoverystate_perf
->tinc(rs_getinfo_latency
, dur
);
7625 pg
->blocked_by
.clear();
7626 pg
->publish_stats_to_osd();
7629 /*------GetLog------------*/
7630 PG::RecoveryState::GetLog::GetLog(my_context ctx
)
7633 context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetLog"),
7636 context
< RecoveryMachine
>().log_enter(state_name
);
7638 PG
*pg
= context
< RecoveryMachine
>().pg
;
7641 if (!pg
->choose_acting(auth_log_shard
, false,
7642 &context
< Peering
>().history_les_bound
)) {
7643 if (!pg
->want_acting
.empty()) {
7644 post_event(NeedActingChange());
7646 post_event(IsIncomplete());
7652 if (auth_log_shard
== pg
->pg_whoami
) {
7653 post_event(GotLog());
7657 const pg_info_t
& best
= pg
->peer_info
[auth_log_shard
];
7660 if (pg
->info
.last_update
< best
.log_tail
) {
7661 ldout(pg
->cct
, 10) << " not contiguous with osd." << auth_log_shard
<< ", down" << dendl
;
7662 post_event(IsIncomplete());
7666 // how much log to request?
7667 eversion_t request_log_from
= pg
->info
.last_update
;
7668 assert(!pg
->actingbackfill
.empty());
7669 for (set
<pg_shard_t
>::iterator p
= pg
->actingbackfill
.begin();
7670 p
!= pg
->actingbackfill
.end();
7672 if (*p
== pg
->pg_whoami
) continue;
7673 pg_info_t
& ri
= pg
->peer_info
[*p
];
7674 if (ri
.last_update
< pg
->info
.log_tail
&& ri
.last_update
>= best
.log_tail
&&
7675 ri
.last_update
< request_log_from
)
7676 request_log_from
= ri
.last_update
;
7680 ldout(pg
->cct
, 10) << " requesting log from osd." << auth_log_shard
<< dendl
;
7681 context
<RecoveryMachine
>().send_query(
7685 auth_log_shard
.shard
, pg
->pg_whoami
.shard
,
7686 request_log_from
, pg
->info
.history
,
7687 pg
->get_osdmap()->get_epoch()));
7689 assert(pg
->blocked_by
.empty());
7690 pg
->blocked_by
.insert(auth_log_shard
.osd
);
7691 pg
->publish_stats_to_osd();
7694 boost::statechart::result
PG::RecoveryState::GetLog::react(const AdvMap
& advmap
)
7696 PG
*pg
= context
< RecoveryMachine
>().pg
;
7697 // make sure our log source didn't go down. we need to check
7698 // explicitly because it may not be part of the prior set, which
7699 // means the Peering state check won't catch it going down.
7700 if (!advmap
.osdmap
->is_up(auth_log_shard
.osd
)) {
7701 ldout(pg
->cct
, 10) << "GetLog: auth_log_shard osd."
7702 << auth_log_shard
.osd
<< " went down" << dendl
;
7704 return transit
< Reset
>();
7707 // let the Peering state do its checks.
7708 return forward_event();
7711 boost::statechart::result
PG::RecoveryState::GetLog::react(const MLogRec
& logevt
)
7713 PG
*pg
= context
< RecoveryMachine
>().pg
;
7715 if (logevt
.from
!= auth_log_shard
) {
7716 ldout(pg
->cct
, 10) << "GetLog: discarding log from "
7717 << "non-auth_log_shard osd." << logevt
.from
<< dendl
;
7718 return discard_event();
7720 ldout(pg
->cct
, 10) << "GetLog: received master log from osd"
7721 << logevt
.from
<< dendl
;
7723 post_event(GotLog());
7724 return discard_event();
7727 boost::statechart::result
PG::RecoveryState::GetLog::react(const GotLog
&)
7729 PG
*pg
= context
< RecoveryMachine
>().pg
;
7730 ldout(pg
->cct
, 10) << "leaving GetLog" << dendl
;
7732 ldout(pg
->cct
, 10) << "processing master log" << dendl
;
7733 pg
->proc_master_log(*context
<RecoveryMachine
>().get_cur_transaction(),
7734 msg
->info
, msg
->log
, msg
->missing
,
7738 context
< RecoveryMachine
>().get_cur_transaction(),
7739 context
< RecoveryMachine
>().get_on_applied_context_list(),
7740 context
< RecoveryMachine
>().get_on_safe_context_list());
7741 return transit
< GetMissing
>();
7744 boost::statechart::result
PG::RecoveryState::GetLog::react(const QueryState
& q
)
7746 q
.f
->open_object_section("state");
7747 q
.f
->dump_string("name", state_name
);
7748 q
.f
->dump_stream("enter_time") << enter_time
;
7749 q
.f
->dump_stream("auth_log_shard") << auth_log_shard
;
7750 q
.f
->close_section();
7751 return forward_event();
7754 void PG::RecoveryState::GetLog::exit()
7756 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7757 PG
*pg
= context
< RecoveryMachine
>().pg
;
7758 utime_t dur
= ceph_clock_now() - enter_time
;
7759 pg
->osd
->recoverystate_perf
->tinc(rs_getlog_latency
, dur
);
7760 pg
->blocked_by
.clear();
7761 pg
->publish_stats_to_osd();
7764 /*------WaitActingChange--------*/
7765 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx
)
7767 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/WaitActingChange")
7769 context
< RecoveryMachine
>().log_enter(state_name
);
7772 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const AdvMap
& advmap
)
7774 PG
*pg
= context
< RecoveryMachine
>().pg
;
7775 OSDMapRef osdmap
= advmap
.osdmap
;
7777 ldout(pg
->cct
, 10) << "verifying no want_acting " << pg
->want_acting
<< " targets didn't go down" << dendl
;
7778 for (vector
<int>::iterator p
= pg
->want_acting
.begin(); p
!= pg
->want_acting
.end(); ++p
) {
7779 if (!osdmap
->is_up(*p
)) {
7780 ldout(pg
->cct
, 10) << " want_acting target osd." << *p
<< " went down, resetting" << dendl
;
7782 return transit
< Reset
>();
7785 return forward_event();
7788 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MLogRec
& logevt
)
7790 PG
*pg
= context
< RecoveryMachine
>().pg
;
7791 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MLocRec" << dendl
;
7792 return discard_event();
7795 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MInfoRec
& evt
)
7797 PG
*pg
= context
< RecoveryMachine
>().pg
;
7798 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl
;
7799 return discard_event();
7802 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const MNotifyRec
& evt
)
7804 PG
*pg
= context
< RecoveryMachine
>().pg
;
7805 ldout(pg
->cct
, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl
;
7806 return discard_event();
7809 boost::statechart::result
PG::RecoveryState::WaitActingChange::react(const QueryState
& q
)
7811 q
.f
->open_object_section("state");
7812 q
.f
->dump_string("name", state_name
);
7813 q
.f
->dump_stream("enter_time") << enter_time
;
7814 q
.f
->dump_string("comment", "waiting for pg acting set to change");
7815 q
.f
->close_section();
7816 return forward_event();
7819 void PG::RecoveryState::WaitActingChange::exit()
7821 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7822 PG
*pg
= context
< RecoveryMachine
>().pg
;
7823 utime_t dur
= ceph_clock_now() - enter_time
;
7824 pg
->osd
->recoverystate_perf
->tinc(rs_waitactingchange_latency
, dur
);
7827 /*------Down--------*/
7828 PG::RecoveryState::Down::Down(my_context ctx
)
7830 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Down")
7832 context
< RecoveryMachine
>().log_enter(state_name
);
7833 PG
*pg
= context
< RecoveryMachine
>().pg
;
7835 pg
->state_clear(PG_STATE_PEERING
);
7836 pg
->state_set(PG_STATE_DOWN
);
7838 auto &prior_set
= context
< Peering
>().prior_set
;
7839 assert(pg
->blocked_by
.empty());
7840 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
7841 pg
->publish_stats_to_osd();
7844 void PG::RecoveryState::Down::exit()
7846 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7847 PG
*pg
= context
< RecoveryMachine
>().pg
;
7849 pg
->state_clear(PG_STATE_DOWN
);
7850 utime_t dur
= ceph_clock_now() - enter_time
;
7851 pg
->osd
->recoverystate_perf
->tinc(rs_down_latency
, dur
);
7853 pg
->blocked_by
.clear();
7854 pg
->publish_stats_to_osd();
7857 boost::statechart::result
PG::RecoveryState::Down::react(const QueryState
& q
)
7859 q
.f
->open_object_section("state");
7860 q
.f
->dump_string("name", state_name
);
7861 q
.f
->dump_stream("enter_time") << enter_time
;
7862 q
.f
->dump_string("comment",
7863 "not enough up instances of this PG to go active");
7864 q
.f
->close_section();
7865 return forward_event();
7868 /*------Incomplete--------*/
7869 PG::RecoveryState::Incomplete::Incomplete(my_context ctx
)
7871 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/Incomplete")
7873 context
< RecoveryMachine
>().log_enter(state_name
);
7874 PG
*pg
= context
< RecoveryMachine
>().pg
;
7876 pg
->state_clear(PG_STATE_PEERING
);
7877 pg
->state_set(PG_STATE_INCOMPLETE
);
7879 PastIntervals::PriorSet
&prior_set
= context
< Peering
>().prior_set
;
7880 assert(pg
->blocked_by
.empty());
7881 pg
->blocked_by
.insert(prior_set
.down
.begin(), prior_set
.down
.end());
7882 pg
->publish_stats_to_osd();
7885 boost::statechart::result
PG::RecoveryState::Incomplete::react(const AdvMap
&advmap
) {
7886 PG
*pg
= context
< RecoveryMachine
>().pg
;
7887 int64_t poolnum
= pg
->info
.pgid
.pool();
7889 // Reset if min_size turn smaller than previous value, pg might now be able to go active
7890 if (advmap
.lastmap
->get_pools().find(poolnum
)->second
.min_size
>
7891 advmap
.osdmap
->get_pools().find(poolnum
)->second
.min_size
) {
7893 return transit
< Reset
>();
7896 return forward_event();
7899 boost::statechart::result
PG::RecoveryState::Incomplete::react(const MNotifyRec
& notevt
) {
7900 PG
*pg
= context
< RecoveryMachine
>().pg
;
7901 ldout(pg
->cct
, 7) << "handle_pg_notify from osd." << notevt
.from
<< dendl
;
7902 if (pg
->proc_replica_info(
7903 notevt
.from
, notevt
.notify
.info
, notevt
.notify
.epoch_sent
)) {
7904 // We got something new, try again!
7905 return transit
< GetLog
>();
7907 return discard_event();
7911 boost::statechart::result
PG::RecoveryState::Incomplete::react(
7912 const QueryState
& q
)
7914 q
.f
->open_object_section("state");
7915 q
.f
->dump_string("name", state_name
);
7916 q
.f
->dump_stream("enter_time") << enter_time
;
7917 q
.f
->dump_string("comment", "not enough complete instances of this PG");
7918 q
.f
->close_section();
7919 return forward_event();
7922 void PG::RecoveryState::Incomplete::exit()
7924 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
7925 PG
*pg
= context
< RecoveryMachine
>().pg
;
7927 pg
->state_clear(PG_STATE_INCOMPLETE
);
7928 utime_t dur
= ceph_clock_now() - enter_time
;
7929 pg
->osd
->recoverystate_perf
->tinc(rs_incomplete_latency
, dur
);
7931 pg
->blocked_by
.clear();
7932 pg
->publish_stats_to_osd();
7935 /*------GetMissing--------*/
7936 PG::RecoveryState::GetMissing::GetMissing(my_context ctx
)
7938 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/GetMissing")
7940 context
< RecoveryMachine
>().log_enter(state_name
);
7942 PG
*pg
= context
< RecoveryMachine
>().pg
;
7943 assert(!pg
->actingbackfill
.empty());
7945 for (set
<pg_shard_t
>::iterator i
= pg
->actingbackfill
.begin();
7946 i
!= pg
->actingbackfill
.end();
7948 if (*i
== pg
->get_primary()) continue;
7949 const pg_info_t
& pi
= pg
->peer_info
[*i
];
7952 continue; // no pg data, nothing divergent
7954 if (pi
.last_update
< pg
->pg_log
.get_tail()) {
7955 ldout(pg
->cct
, 10) << " osd." << *i
<< " is not contiguous, will restart backfill" << dendl
;
7956 pg
->peer_missing
[*i
];
7959 if (pi
.last_backfill
== hobject_t()) {
7960 ldout(pg
->cct
, 10) << " osd." << *i
<< " will fully backfill; can infer empty missing set" << dendl
;
7961 pg
->peer_missing
[*i
];
7965 if (pi
.last_update
== pi
.last_complete
&& // peer has no missing
7966 pi
.last_update
== pg
->info
.last_update
) { // peer is up to date
7967 // replica has no missing and identical log as us. no need to
7969 // FIXME: we can do better here. if last_update==last_complete we
7970 // can infer the rest!
7971 ldout(pg
->cct
, 10) << " osd." << *i
<< " has no missing, identical log" << dendl
;
7972 pg
->peer_missing
[*i
];
7976 // We pull the log from the peer's last_epoch_started to ensure we
7977 // get enough log to detect divergent updates.
7978 since
.epoch
= pi
.last_epoch_started
;
7979 assert(pi
.last_update
>= pg
->info
.log_tail
); // or else choose_acting() did a bad thing
7980 if (pi
.log_tail
<= since
) {
7981 ldout(pg
->cct
, 10) << " requesting log+missing since " << since
<< " from osd." << *i
<< dendl
;
7982 context
< RecoveryMachine
>().send_query(
7986 i
->shard
, pg
->pg_whoami
.shard
,
7987 since
, pg
->info
.history
,
7988 pg
->get_osdmap()->get_epoch()));
7990 ldout(pg
->cct
, 10) << " requesting fulllog+missing from osd." << *i
7991 << " (want since " << since
<< " < log.tail "
7992 << pi
.log_tail
<< ")" << dendl
;
7993 context
< RecoveryMachine
>().send_query(
7995 pg_query_t::FULLLOG
,
7996 i
->shard
, pg
->pg_whoami
.shard
,
7997 pg
->info
.history
, pg
->get_osdmap()->get_epoch()));
7999 peer_missing_requested
.insert(*i
);
8000 pg
->blocked_by
.insert(i
->osd
);
8003 if (peer_missing_requested
.empty()) {
8004 if (pg
->need_up_thru
) {
8005 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
8007 post_event(NeedUpThru());
8012 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8014 pg
->publish_stats_to_osd();
8018 boost::statechart::result
PG::RecoveryState::GetMissing::react(const MLogRec
& logevt
)
8020 PG
*pg
= context
< RecoveryMachine
>().pg
;
8022 peer_missing_requested
.erase(logevt
.from
);
8023 pg
->proc_replica_log(logevt
.msg
->info
, logevt
.msg
->log
, logevt
.msg
->missing
, logevt
.from
);
8025 if (peer_missing_requested
.empty()) {
8026 if (pg
->need_up_thru
) {
8027 ldout(pg
->cct
, 10) << " still need up_thru update before going active"
8029 post_event(NeedUpThru());
8031 ldout(pg
->cct
, 10) << "Got last missing, don't need missing "
8032 << "posting Activate" << dendl
;
8033 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8036 return discard_event();
8039 boost::statechart::result
PG::RecoveryState::GetMissing::react(const QueryState
& q
)
8041 PG
*pg
= context
< RecoveryMachine
>().pg
;
8042 q
.f
->open_object_section("state");
8043 q
.f
->dump_string("name", state_name
);
8044 q
.f
->dump_stream("enter_time") << enter_time
;
8046 q
.f
->open_array_section("peer_missing_requested");
8047 for (set
<pg_shard_t
>::iterator p
= peer_missing_requested
.begin();
8048 p
!= peer_missing_requested
.end();
8050 q
.f
->open_object_section("osd");
8051 q
.f
->dump_stream("osd") << *p
;
8052 if (pg
->peer_missing
.count(*p
)) {
8053 q
.f
->open_object_section("got_missing");
8054 pg
->peer_missing
[*p
].dump(q
.f
);
8055 q
.f
->close_section();
8057 q
.f
->close_section();
8059 q
.f
->close_section();
8061 q
.f
->close_section();
8062 return forward_event();
8065 void PG::RecoveryState::GetMissing::exit()
8067 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8068 PG
*pg
= context
< RecoveryMachine
>().pg
;
8069 utime_t dur
= ceph_clock_now() - enter_time
;
8070 pg
->osd
->recoverystate_perf
->tinc(rs_getmissing_latency
, dur
);
8071 pg
->blocked_by
.clear();
8072 pg
->publish_stats_to_osd();
8075 /*------WaitUpThru--------*/
8076 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx
)
8078 NamedState(context
< RecoveryMachine
>().pg
, "Started/Primary/Peering/WaitUpThru")
8080 context
< RecoveryMachine
>().log_enter(state_name
);
8083 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const ActMap
& am
)
8085 PG
*pg
= context
< RecoveryMachine
>().pg
;
8086 if (!pg
->need_up_thru
) {
8087 post_event(Activate(pg
->get_osdmap()->get_epoch()));
8089 return forward_event();
8092 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const MLogRec
& logevt
)
8094 PG
*pg
= context
< RecoveryMachine
>().pg
;
8095 ldout(pg
->cct
, 10) << "Noting missing from osd." << logevt
.from
<< dendl
;
8096 pg
->peer_missing
[logevt
.from
].claim(logevt
.msg
->missing
);
8097 pg
->peer_info
[logevt
.from
] = logevt
.msg
->info
;
8098 return discard_event();
8101 boost::statechart::result
PG::RecoveryState::WaitUpThru::react(const QueryState
& q
)
8103 q
.f
->open_object_section("state");
8104 q
.f
->dump_string("name", state_name
);
8105 q
.f
->dump_stream("enter_time") << enter_time
;
8106 q
.f
->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8107 q
.f
->close_section();
8108 return forward_event();
8111 void PG::RecoveryState::WaitUpThru::exit()
8113 context
< RecoveryMachine
>().log_exit(state_name
, enter_time
);
8114 PG
*pg
= context
< RecoveryMachine
>().pg
;
8115 utime_t dur
= ceph_clock_now() - enter_time
;
8116 pg
->osd
->recoverystate_perf
->tinc(rs_waitupthru_latency
, dur
);
8119 /*----RecoveryState::RecoveryMachine Methods-----*/
8121 #define dout_prefix *_dout << pg->gen_prefix()
8123 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name
)
8125 PG
*pg
= context
< RecoveryMachine
>().pg
;
8126 ldout(pg
->cct
, 5) << "enter " << state_name
<< dendl
;
8127 pg
->osd
->pg_recovery_stats
.log_enter(state_name
);
8130 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name
, utime_t enter_time
)
8132 utime_t dur
= ceph_clock_now() - enter_time
;
8133 PG
*pg
= context
< RecoveryMachine
>().pg
;
8134 ldout(pg
->cct
, 5) << "exit " << state_name
<< " " << dur
<< " " << event_count
<< " " << event_time
<< dendl
;
8135 pg
->osd
->pg_recovery_stats
.log_exit(state_name
, ceph_clock_now() - enter_time
,
8136 event_count
, event_time
);
8138 event_time
= utime_t();
8142 /*---------------------------------------------------*/
8144 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8146 void PG::RecoveryState::start_handle(RecoveryCtx
*new_ctx
) {
8151 if (messages_pending_flush
) {
8152 rctx
= RecoveryCtx(*messages_pending_flush
, *new_ctx
);
8156 rctx
->start_time
= ceph_clock_now();
8160 void PG::RecoveryState::begin_block_outgoing() {
8161 assert(!messages_pending_flush
);
8164 messages_pending_flush
= BufferedRecoveryMessages();
8165 rctx
= RecoveryCtx(*messages_pending_flush
, *orig_ctx
);
8168 void PG::RecoveryState::clear_blocked_outgoing() {
8171 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
8174 void PG::RecoveryState::end_block_outgoing() {
8175 assert(messages_pending_flush
);
8179 rctx
= RecoveryCtx(*orig_ctx
);
8180 rctx
->accept_buffered_messages(*messages_pending_flush
);
8181 messages_pending_flush
= boost::optional
<BufferedRecoveryMessages
>();
8184 void PG::RecoveryState::end_handle() {
8186 utime_t dur
= ceph_clock_now() - rctx
->start_time
;
8187 machine
.event_time
+= dur
;
8190 machine
.event_count
++;
8191 rctx
= boost::optional
<RecoveryCtx
>();
8195 ostream
& operator<<(ostream
& out
, const PG::BackfillInterval
& bi
)
8197 out
<< "BackfillInfo(" << bi
.begin
<< "-" << bi
.end
8198 << " " << bi
.objects
.size() << " objects";
8199 if (!bi
.objects
.empty())
8200 out
<< " " << bi
.objects
;
8205 void intrusive_ptr_add_ref(PG
*pg
) { pg
->get("intptr"); }
8206 void intrusive_ptr_release(PG
*pg
) { pg
->put("intptr"); }
8208 #ifdef PG_DEBUG_REFS
8209 uint64_t get_with_id(PG
*pg
) { return pg
->get_with_id(); }
8210 void put_with_id(PG
*pg
, uint64_t id
) { return pg
->put_with_id(id
); }