1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92 return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
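// Record entry into a peering state-machine state.  If there is no active
// PGStateInstance yet (the PG lock may not be safely takeable here), stage the
// entry in tmppi; exit() attaches it to the history once the lock is held.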
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99 // Ignore trimming state machine for now
100 if (::strstr(state, "Trimming") != NULL) {
101 return;
102 } else if (pi != nullptr) {
103 pi->enter_state(entime, state);
104 } else {
105 // Store current state since we can't reliably take the PG lock here
106 if (tmppi == nullptr) {
107 tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108 }
109
110 thispg = pg;
111 tmppi->enter_state(entime, state);
112 }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116 // Ignore trimming state machine for now
117 // Do nothing if PG is being destroyed!
118 if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119 return;
120 } else {
121 bool ilocked = false;
122 if (!thispg->is_locked()) {
123 thispg->lock();
124 ilocked = true;
125 }
126 if (pi == nullptr) {
127 buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128 pi = buffer.back().get();
129 pi->setepoch(thispg->get_osdmap()->get_epoch());
130 }
131
132 pi->exit_state(ceph_clock_now());
133 if (::strcmp(state, "Reset") == 0) {
134 this->reset();
135 }
136 if (ilocked) {
137 thispg->unlock();
138 }
139 }
140 }
141
142 void PGStateHistory::dump(Formatter* f) const {
143 f->open_array_section("history");
144 for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145 f->open_object_section("states");
146 f->dump_stream("epoch") << (*pi)->this_epoch;
147 for (auto she : (*pi)->state_history) {
148 f->dump_string("state", std::get<2>(she));
149 f->dump_stream("enter") << std::get<0>(she);
150 f->dump_stream("exit") << std::get<1>(she);
151 }
152 f->close_section();
153 }
154 f->close_section();
155 }
156
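// Manual reference counting for PG objects; the PG deletes itself when the
// count drops to zero.  With PG_DEBUG_REFS, per-tag and per-id accounting is
// also kept so leaked references can be tracked down (see dump_live_ids).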
157 void PG::get(const char* tag)
158 {
159 ref++;
160 #ifdef PG_DEBUG_REFS
161 Mutex::Locker l(_ref_id_lock);
162 _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169 {
170 Mutex::Locker l(_ref_id_lock);
171 auto tag_counts_entry = _tag_counts.find(tag);
172 assert(tag_counts_entry != _tag_counts.end());
173 --tag_counts_entry->second;
174 if (tag_counts_entry->second == 0) {
175 _tag_counts.erase(tag_counts_entry);
176 }
177 }
178 #endif
179 if (--ref == 0)
180 delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186 ref++;
187 Mutex::Locker l(_ref_id_lock);
188 uint64_t id = ++_ref_id;
189 BackTrace bt(0);
190 stringstream ss;
191 bt.print(ss);
192 dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193 assert(!_live_ids.count(id));
194 _live_ids.insert(make_pair(id, ss.str()));
195 return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200 dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201 {
202 Mutex::Locker l(_ref_id_lock);
203 assert(_live_ids.count(id));
204 _live_ids.erase(id);
205 }
206 if (--ref == 0)
207 delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212 Mutex::Locker l(_ref_id_lock);
213 dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214 for (map<uint64_t, string>::iterator i = _live_ids.begin();
215 i != _live_ids.end();
216 ++i) {
217 dout(0) << "\t\tid: " << *i << dendl;
218 }
219 dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220 for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221 i != _tag_counts.end();
222 ++i) {
223 dout(0) << "\t\ttag: " << *i << dendl;
224 }
225 }
226 #endif
227
228
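// Refresh cached pool metadata from a new OSDMap.  If this map is not the
// direct successor of the cached epoch, or it changed the pool's snaps, the
// removed-snaps set is rebuilt and newly_removed_snaps is derived as the delta
// against cached_removed_snaps (falling back to a full reset if the cache is
// not a subset of the new set).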
229 void PGPool::update(OSDMapRef map)
230 {
231 const pg_pool_t *pi = map->get_pg_pool(id);
232 assert(pi);
233 info = *pi;
234 auid = pi->auid;
235 name = map->get_pool_name(id);
236 bool updated = false;
237 if ((map->get_epoch() != cached_epoch + 1) ||
238 (pi->get_snap_epoch() == map->get_epoch())) {
239 updated = true;
240 pi->build_removed_snaps(newly_removed_snaps);
241 interval_set<snapid_t> intersection;
242 intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
243 if (intersection == cached_removed_snaps) {
244 newly_removed_snaps.subtract(cached_removed_snaps);
245 cached_removed_snaps.union_of(newly_removed_snaps);
246 } else {
247 lgeneric_subdout(cct, osd, 0) << __func__
248 << " cached_removed_snaps shrank from " << cached_removed_snaps
249 << " to " << newly_removed_snaps << dendl;
250 cached_removed_snaps = newly_removed_snaps;
251 newly_removed_snaps.clear();
252 }
253 snapc = pi->get_snap_context();
254 } else {
255 /* 1) map->get_epoch() == cached_epoch + 1 &&
256 * 2) pi->get_snap_epoch() != map->get_epoch()
257 *
258 * Since the if branch was not taken, 1 && 2 must both hold. From 2, we
259 * know that this map didn't change the set of removed snaps. From 1, we
260 * know that our cached_removed_snaps matches the previous map.
261 * Thus, from 1 && 2, cached_removed_snaps matches the current
262 * set of removed snaps and all we have to do is clear
263 * newly_removed_snaps.
264 */
265 newly_removed_snaps.clear();
266 }
267 cached_epoch = map->get_epoch();
268 lgeneric_subdout(cct, osd, 20)
269 << "PGPool::update cached_removed_snaps "
270 << cached_removed_snaps
271 << " newly_removed_snaps "
272 << newly_removed_snaps
273 << " snapc " << snapc
274 << (updated ? " (updated)":" (no change)")
275 << dendl;
276 }
277
278 PG::PG(OSDService *o, OSDMapRef curmap,
279 const PGPool &_pool, spg_t p) :
280 osd(o),
281 cct(o->cct),
282 osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
283 snap_mapper(
284 cct,
285 &osdriver,
286 p.ps(),
287 p.get_split_bits(curmap->get_pg_num(_pool.id)),
288 _pool.id,
289 p.shard),
290 osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
291 _lock("PG::_lock"),
292 #ifdef PG_DEBUG_REFS
293 _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
294 #endif
295 deleting(false),
296 trace_endpoint("0.0.0.0", 0, "PG"),
297 dirty_info(false), dirty_big_info(false),
298 info(p),
299 info_struct_v(0),
300 coll(p),
301 pg_log(cct),
302 pgmeta_oid(p.make_pgmeta_oid()),
303 missing_loc(this),
304 past_intervals(
305 curmap->get_pools().at(p.pgid.pool()).ec_pool(),
306 *curmap),
307 stat_queue_item(this),
308 scrub_queued(false),
309 recovery_queued(false),
310 recovery_ops_active(0),
311 role(-1),
312 state(0),
313 send_notify(false),
314 pg_whoami(osd->whoami, p.shard),
315 need_up_thru(false),
316 last_peering_reset(0),
317 heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318 backfill_reserved(false),
319 backfill_reserving(false),
320 flushes_in_progress(0),
321 pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322 pg_stats_publish_valid(false),
323 osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
324 finish_sync_event(NULL),
325 backoff_lock("PG::backoff_lock"),
326 scrub_after_recovery(false),
327 active_pushes(0),
328 recovery_state(this),
329 pg_id(p),
330 peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
331 acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
332 upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333 last_epoch(0)
334 {
335 #ifdef PG_DEBUG_REFS
336 osd->add_pgid(p, this);
337 #endif
338 #ifdef WITH_BLKIN
339 std::stringstream ss;
340 ss << "PG " << info.pgid;
341 trace_endpoint.copy_name(ss.str());
342 #endif
343 osr->shard_hint = p;
344 }
345
346 PG::~PG()
347 {
348 pgstate_history.set_pg_in_destructor();
349 #ifdef PG_DEBUG_REFS
350 osd->remove_pgid(info.pgid, this);
351 #endif
352 }
353
354 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
355 {
356 handle.suspend_tp_timeout();
357 lock();
358 handle.reset_tp_timeout();
359 }
360
361 void PG::lock(bool no_lockdep) const
362 {
363 _lock.Lock(no_lockdep);
364 // if we have unrecorded dirty state with the lock dropped, there is a bug
365 assert(!dirty_info);
366 assert(!dirty_big_info);
367
368 dout(30) << "lock" << dendl;
369 }
370
371 std::string PG::gen_prefix() const
372 {
373 stringstream out;
374 OSDMapRef mapref = osdmap_ref;
375 if (_lock.is_locked_by_me()) {
376 out << "osd." << osd->whoami
377 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378 << " " << *this << " ";
379 } else {
380 out << "osd." << osd->whoami
381 << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
382 << " pg[" << info.pgid << "(unlocked)] ";
383 }
384 return out.str();
385 }
386
387 /********* PG **********/
388
389 void PG::proc_master_log(
390 ObjectStore::Transaction& t, pg_info_t &oinfo,
391 pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
392 {
393 dout(10) << "proc_master_log for osd." << from << ": "
394 << olog << " " << omissing << dendl;
395 assert(!is_peered() && is_primary());
396
397 // merge log into our own log to build master log. no need to
398 // make any adjustments to their missing map; we are taking their
399 // log to be authoritative (i.e., their entries are, by definition,
400 // non-divergent).
401 merge_log(t, oinfo, olog, from);
402 peer_info[from] = oinfo;
403 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
404 might_have_unfound.insert(from);
405
406 // See doc/dev/osd_internals/last_epoch_started
407 if (oinfo.last_epoch_started > info.last_epoch_started) {
408 info.last_epoch_started = oinfo.last_epoch_started;
409 dirty_info = true;
410 }
411 if (oinfo.last_interval_started > info.last_interval_started) {
412 info.last_interval_started = oinfo.last_interval_started;
413 dirty_info = true;
414 }
415 update_history(oinfo.history);
416 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
417 info.last_epoch_started >= info.history.last_epoch_started);
418
419 peer_missing[from].claim(omissing);
420 }
421
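// Record a replica's info, log, and missing set.  Unlike proc_master_log, the
// replica's log is not merged into ours; PGLog::proc_replica_log only adjusts
// the replica's missing set for entries that diverge from our log.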
422 void PG::proc_replica_log(
423 pg_info_t &oinfo,
424 const pg_log_t &olog,
425 pg_missing_t& omissing,
426 pg_shard_t from)
427 {
428 dout(10) << "proc_replica_log for osd." << from << ": "
429 << oinfo << " " << olog << " " << omissing << dendl;
430
431 pg_log.proc_replica_log(oinfo, olog, omissing, from);
432
433 peer_info[from] = oinfo;
434 dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
435 might_have_unfound.insert(from);
436
437 for (map<hobject_t, pg_missing_item>::const_iterator i =
438 omissing.get_items().begin();
439 i != omissing.get_items().end();
440 ++i) {
441 dout(20) << " after missing " << i->first << " need " << i->second.need
442 << " have " << i->second.have << dendl;
443 }
444 peer_missing[from].claim(omissing);
445 }
446
447 bool PG::proc_replica_info(
448 pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
449 {
450 map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
451 if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
452 dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
453 return false;
454 }
455
456 if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
457 dout(10) << " got info " << oinfo << " from down osd." << from
458 << " discarding" << dendl;
459 return false;
460 }
461
462 dout(10) << " got osd." << from << " " << oinfo << dendl;
463 assert(is_primary());
464 peer_info[from] = oinfo;
465 might_have_unfound.insert(from);
466
467 update_history(oinfo.history);
468
469 // stray?
470 if (!is_up(from) && !is_acting(from)) {
471 dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
472 stray_set.insert(from);
473 if (is_clean()) {
474 purge_strays();
475 }
476 }
477
478 // was this a new info? if so, update peers!
479 if (p == peer_info.end())
480 update_heartbeat_peers();
481
482 return true;
483 }
484
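// Remove an object and, for clone objects, its snap mapper entry in a single
// transaction.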
485 void PG::remove_snap_mapped_object(
486 ObjectStore::Transaction &t, const hobject_t &soid)
487 {
488 t.remove(
489 coll,
490 ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
491 clear_object_snap_mapping(&t, soid);
492 }
493
494 void PG::clear_object_snap_mapping(
495 ObjectStore::Transaction *t, const hobject_t &soid)
496 {
497 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
498 if (soid.snap < CEPH_MAXSNAP) {
499 int r = snap_mapper.remove_oid(
500 soid,
501 &_t);
502 if (!(r == 0 || r == -ENOENT)) {
503 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
504 ceph_abort();
505 }
506 }
507 }
508
509 void PG::update_object_snap_mapping(
510 ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
511 {
512 OSDriver::OSTransaction _t(osdriver.get_transaction(t));
513 assert(soid.snap < CEPH_MAXSNAP);
514 int r = snap_mapper.remove_oid(
515 soid,
516 &_t);
517 if (!(r == 0 || r == -ENOENT)) {
518 derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
519 ceph_abort();
520 }
521 snap_mapper.add_oid(
522 soid,
523 snaps,
524 &_t);
525 }
526
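// Thin wrappers around PGLog: merge an authoritative peer log into ours, or
// rewind our log to a new head; divergent entries are rolled back through
// PGLogEntryHandler as part of the transaction.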
527 void PG::merge_log(
528 ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
529 {
530 PGLogEntryHandler rollbacker{this, &t};
531 pg_log.merge_log(
532 oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
533 }
534
535 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
536 {
537 PGLogEntryHandler rollbacker{this, &t};
538 pg_log.rewind_divergent_log(
539 newhead, info, &rollbacker, dirty_info, dirty_big_info);
540 }
541
542 /*
543 * Process information from a replica to determine if it could have any
544 * objects that I need.
545 *
546 * TODO: if the missing set becomes very large, this could get expensive.
547 * Instead, we probably want to just iterate over our unfound set.
548 */
549 bool PG::search_for_missing(
550 const pg_info_t &oinfo, const pg_missing_t &omissing,
551 pg_shard_t from,
552 RecoveryCtx *ctx)
553 {
554 uint64_t num_unfound_before = missing_loc.num_unfound();
555 bool found_missing = missing_loc.add_source_info(
556 from, oinfo, omissing, ctx->handle);
557 if (found_missing && num_unfound_before != missing_loc.num_unfound())
558 publish_stats_to_osd();
559 if (found_missing &&
560 (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
561 CEPH_FEATURE_OSD_ERASURE_CODES)) {
562 pg_info_t tinfo(oinfo);
563 tinfo.pgid.shard = pg_whoami.shard;
564 (*(ctx->info_map))[from.osd].push_back(
565 make_pair(
566 pg_notify_t(
567 from.shard, pg_whoami.shard,
568 get_osdmap()->get_epoch(),
569 get_osdmap()->get_epoch(),
570 tinfo),
571 past_intervals));
572 }
573 return found_missing;
574 }
575
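// An object is readable with the given acting set if it needs no recovery, or
// if the shards known to hold it that are also in the acting set satisfy the
// is_readable predicate.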
576 bool PG::MissingLoc::readable_with_acting(
577 const hobject_t &hoid,
578 const set<pg_shard_t> &acting) const {
579 if (!needs_recovery(hoid))
580 return true;
581 if (is_deleted(hoid))
582 return false;
583 auto missing_loc_entry = missing_loc.find(hoid);
584 if (missing_loc_entry == missing_loc.end())
585 return false;
586 const set<pg_shard_t> &locs = missing_loc_entry->second;
587 ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
588 set<pg_shard_t> have_acting;
589 for (set<pg_shard_t>::const_iterator i = locs.begin();
590 i != locs.end();
591 ++i) {
592 if (acting.count(*i))
593 have_acting.insert(*i);
594 }
595 return (*is_readable)(have_acting);
596 }
597
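// Register every shard in 'sources' as a location for every object in
// needs_recovery_map (delete entries excluded), periodically resetting the
// thread-pool timeout while iterating.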
598 void PG::MissingLoc::add_batch_sources_info(
599 const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
600 {
601 ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
602 << sources.size() << dendl;
603 unsigned loop = 0;
604 for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
605 i != needs_recovery_map.end();
606 ++i) {
607 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
608 handle->reset_tp_timeout();
609 loop = 0;
610 }
611 if (i->second.is_delete())
612 continue;
613 missing_loc[i->first].insert(sources.begin(), sources.end());
614 missing_loc_sources.insert(sources.begin(), sources.end());
615 }
616 }
617
618 bool PG::MissingLoc::add_source_info(
619 pg_shard_t fromosd,
620 const pg_info_t &oinfo,
621 const pg_missing_t &omissing,
622 ThreadPool::TPHandle* handle)
623 {
624 bool found_missing = false;
625 unsigned loop = 0;
626 // found items?
627 for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
628 p != needs_recovery_map.end();
629 ++p) {
630 const hobject_t &soid(p->first);
631 eversion_t need = p->second.need;
632 if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
633 handle->reset_tp_timeout();
634 loop = 0;
635 }
636 if (p->second.is_delete()) {
637 ldout(pg->cct, 10) << __func__ << " " << soid
638 << " delete, ignoring source" << dendl;
639 found_missing = true;
640 continue;
641 }
642 if (oinfo.last_update < need) {
643 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
644 << " also missing on osd." << fromosd
645 << " (last_update " << oinfo.last_update
646 << " < needed " << need << ")" << dendl;
647 continue;
648 }
649 if (!oinfo.last_backfill.is_max() &&
650 !oinfo.last_backfill_bitwise) {
651 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
652 << " also missing on osd." << fromosd
653 << " (last_backfill " << oinfo.last_backfill
654 << " but with wrong sort order)"
655 << dendl;
656 continue;
657 }
658 if (p->first >= oinfo.last_backfill) {
659 // FIXME: this is _probably_ true, although it could conceivably
660 // be in the undefined region! Hmm!
661 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662 << " also missing on osd." << fromosd
663 << " (past last_backfill " << oinfo.last_backfill
664 << ")" << dendl;
665 continue;
666 }
667 if (oinfo.last_complete < need) {
668 if (omissing.is_missing(soid)) {
669 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670 << " also missing on osd." << fromosd << dendl;
671 continue;
672 }
673 }
674
675 ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
676 << " is on osd." << fromosd << dendl;
677
678 missing_loc[soid].insert(fromosd);
679 missing_loc_sources.insert(fromosd);
680 found_missing = true;
681 }
682
683 ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
684 << dendl;
685 return found_missing;
686 }
687
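// Send FULLLOG queries to every up peer in might_have_unfound that is not
// known to be empty and has not already been asked for its log or missing set,
// so unfound objects can be located.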
688 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
689 {
690 auto &missing = pg_log.get_missing();
691 uint64_t unfound = get_num_unfound();
692 assert(unfound > 0);
693
694 dout(10) << __func__ << " "
695 << missing.num_missing() << " missing, "
696 << unfound << " unfound"
697 << dendl;
698
699 std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
700 std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
701 for (; m != mend; ++m) {
702 pg_shard_t peer(*m);
703
704 if (!get_osdmap()->is_up(peer.osd)) {
705 dout(20) << __func__ << " skipping down osd." << peer << dendl;
706 continue;
707 }
708
709 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
710 if (iter != peer_info.end() &&
711 (iter->second.is_empty() || iter->second.dne())) {
712 // ignore empty peers
713 continue;
714 }
715
716 // If we've requested any of this stuff, the pg_missing_t information
717 // should be on its way.
718 // TODO: coalesce requested_* into a single data structure
719 if (peer_missing.find(peer) != peer_missing.end()) {
720 dout(20) << __func__ << ": osd." << peer
721 << ": we already have pg_missing_t" << dendl;
722 continue;
723 }
724 if (peer_log_requested.find(peer) != peer_log_requested.end()) {
725 dout(20) << __func__ << ": osd." << peer
726 << ": in peer_log_requested" << dendl;
727 continue;
728 }
729 if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
730 dout(20) << __func__ << ": osd." << peer
731 << ": in peer_missing_requested" << dendl;
732 continue;
733 }
734
735 // Request missing
736 dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
737 << dendl;
738 peer_missing_requested.insert(peer);
739 query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
740 pg_query_t(
741 pg_query_t::FULLLOG,
742 peer.shard, pg_whoami.shard,
743 info.history, get_osdmap()->get_epoch());
744 }
745 }
746
747 /******* PG ***********/
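// True if the primary or any other shard in actingbackfill still has entries
// in its missing set.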
748 bool PG::needs_recovery() const
749 {
750 assert(is_primary());
751
752 auto &missing = pg_log.get_missing();
753
754 if (missing.num_missing()) {
755 dout(10) << __func__ << " primary has " << missing.num_missing()
756 << " missing" << dendl;
757 return true;
758 }
759
760 assert(!actingbackfill.empty());
761 set<pg_shard_t>::const_iterator end = actingbackfill.end();
762 set<pg_shard_t>::const_iterator a = actingbackfill.begin();
763 for (; a != end; ++a) {
764 if (*a == get_primary()) continue;
765 pg_shard_t peer = *a;
766 map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
767 if (pm == peer_missing.end()) {
768 dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
769 << dendl;
770 continue;
771 }
772 if (pm->second.num_missing()) {
773 dout(10) << __func__ << " osd." << peer << " has "
774 << pm->second.num_missing() << " missing" << dendl;
775 return true;
776 }
777 }
778
779 dout(10) << __func__ << " is recovered" << dendl;
780 return false;
781 }
782
783 bool PG::needs_backfill() const
784 {
785 assert(is_primary());
786
787 // The only OSDs that can possibly need backfill are those listed in
788 // backfill_targets.
789 set<pg_shard_t>::const_iterator end = backfill_targets.end();
790 set<pg_shard_t>::const_iterator a = backfill_targets.begin();
791 for (; a != end; ++a) {
792 pg_shard_t peer = *a;
793 map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
794 if (!pi->second.last_backfill.is_max()) {
795 dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
796 return true;
797 }
798 }
799
800 dout(10) << __func__ << " does not need backfill" << dendl;
801 return false;
802 }
803
804
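// Sanity-check past_intervals against the bounds required by
// get_required_past_interval_bounds(): either both are empty or they must
// cover the same range; mismatches are logged and (when fatal) assert.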
805 void PG::check_past_interval_bounds() const
806 {
807 auto rpib = get_required_past_interval_bounds(
808 info,
809 osd->get_superblock().oldest_map);
810 if (rpib.first >= rpib.second) {
811 if (!past_intervals.empty()) {
812 osd->clog->error() << info.pgid << " required past_interval bounds are"
813 << " empty [" << rpib << ") but past_intervals is not: "
814 << past_intervals;
815 derr << info.pgid << " required past_interval bounds are"
816 << " empty [" << rpib << ") but past_intervals is not: "
817 << past_intervals << dendl;
818 }
819 } else {
820 if (past_intervals.empty()) {
821 osd->clog->error() << info.pgid << " required past_interval bounds are"
822 << " not empty [" << rpib << ") but past_intervals "
823 << past_intervals << " is empty";
824 derr << info.pgid << " required past_interval bounds are"
825 << " not empty [" << rpib << ") but past_intervals "
826 << past_intervals << " is empty" << dendl;
827 assert(!past_intervals.empty());
828 }
829
830 auto apib = past_intervals.get_bounds();
831 if (apib.first > rpib.first) {
832 osd->clog->error() << info.pgid << " past_intervals [" << apib
833 << ") start interval does not contain the required"
834 << " bound [" << rpib << ") start";
835 derr << info.pgid << " past_intervals [" << apib
836 << ") start interval does not contain the required"
837 << " bound [" << rpib << ") start" << dendl;
838 assert(0 == "past_interval start interval mismatch");
839 }
840 if (apib.second != rpib.second) {
841 osd->clog->error() << info.pgid << " past_interval bound [" << apib
842 << ") end does not match required [" << rpib
843 << ") end";
844 derr << info.pgid << " past_interval bound [" << apib
845 << ") end does not match required [" << rpib
846 << ") end" << dendl;
847 assert(0 == "past_interval end mismatch");
848 }
849 }
850 }
851
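// Clear need_up_thru once the map records an up_thru for this OSD at or
// beyond the start of the current interval; returns true if the flag was
// cleared.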
852 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
853 {
854 epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
855 if (need_up_thru &&
856 up_thru >= info.history.same_interval_since) {
857 dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
858 need_up_thru = false;
859 return true;
860 }
861 return false;
862 }
863
864 void PG::remove_down_peer_info(const OSDMapRef osdmap)
865 {
866 // Remove any downed osds from peer_info
867 bool removed = false;
868 map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
869 while (p != peer_info.end()) {
870 if (!osdmap->is_up(p->first.osd)) {
871 dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
872 peer_missing.erase(p->first);
873 peer_log_requested.erase(p->first);
874 peer_missing_requested.erase(p->first);
875 peer_info.erase(p++);
876 removed = true;
877 } else
878 ++p;
879 }
880
881 // if we removed anyone, update peers (which include peer_info)
882 if (removed)
883 update_heartbeat_peers();
884 check_recovery_sources(osdmap);
885 }
886
887 /*
888 * Returns false if any non-lost OSD in might_have_unfound has yet to be queried; otherwise true.
889 */
890 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
891 {
892 assert(is_primary());
893
894 set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
895 set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
896 for (; peer != mend; ++peer) {
897 if (peer_missing.count(*peer))
898 continue;
899 map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
900 if (iter != peer_info.end() &&
901 (iter->second.is_empty() || iter->second.dne()))
902 continue;
903 if (!osdmap->exists(peer->osd))
904 continue;
905 const osd_info_t &osd_info(osdmap->get_info(peer->osd));
906 if (osd_info.lost_at <= osd_info.up_from) {
907 // If there is even one OSD in might_have_unfound that isn't lost, we
908 // still might retrieve our unfound.
909 return false;
910 }
911 }
912 dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound
913 << " have been queried or are marked lost" << dendl;
914 return true;
915 }
916
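// Build the prior set: the OSDs from past intervals that must be probed (or
// are known to be up, down, lost, or nonexistent) before peering can proceed,
// and decide whether the monitor must first record a new up_thru for us.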
917 PastIntervals::PriorSet PG::build_prior()
918 {
919 if (1) {
920 // sanity check
921 for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
922 it != peer_info.end();
923 ++it) {
924 assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
925 }
926 }
927
928 const OSDMap &osdmap = *get_osdmap();
929 PastIntervals::PriorSet prior = past_intervals.get_prior_set(
930 pool.info.ec_pool(),
931 info.history.last_epoch_started,
932 get_pgbackend()->get_is_recoverable_predicate(),
933 [&](epoch_t start, int osd, epoch_t *lost_at) {
934 const osd_info_t *pinfo = 0;
935 if (osdmap.exists(osd)) {
936 pinfo = &osdmap.get_info(osd);
937 if (lost_at)
938 *lost_at = pinfo->lost_at;
939 }
940
941 if (osdmap.is_up(osd)) {
942 return PastIntervals::UP;
943 } else if (!pinfo) {
944 return PastIntervals::DNE;
945 } else if (pinfo->lost_at > start) {
946 return PastIntervals::LOST;
947 } else {
948 return PastIntervals::DOWN;
949 }
950 },
951 up,
952 acting,
953 this);
954
955 if (prior.pg_down) {
956 state_set(PG_STATE_DOWN);
957 }
958
959 if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
960 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
961 << " < same_since " << info.history.same_interval_since
962 << ", must notify monitor" << dendl;
963 need_up_thru = true;
964 } else {
965 dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
966 << " >= same_since " << info.history.same_interval_since
967 << ", all is well" << dendl;
968 need_up_thru = false;
969 }
970 set_probe_targets(prior.probe);
971 return prior;
972 }
973
974 void PG::clear_primary_state()
975 {
976 dout(10) << "clear_primary_state" << dendl;
977
978 // clear peering state
979 stray_set.clear();
980 peer_log_requested.clear();
981 peer_missing_requested.clear();
982 peer_info.clear();
983 peer_missing.clear();
984 need_up_thru = false;
985 peer_last_complete_ondisk.clear();
986 peer_activated.clear();
987 min_last_complete_ondisk = eversion_t();
988 pg_trim_to = eversion_t();
989 might_have_unfound.clear();
990 projected_log = PGLog::IndexedLog();
991
992 last_update_ondisk = eversion_t();
993
994 snap_trimq.clear();
995
996 finish_sync_event = 0; // so that _finish_recovery doesn't go off in another thread
997
998 missing_loc.clear();
999
1000 release_pg_backoffs();
1001
1002 pg_log.reset_recovery_pointers();
1003
1004 scrubber.reserved_peers.clear();
1005 scrub_after_recovery = false;
1006
1007 agent_clear();
1008 }
1009
1010 PG::Scrubber::Scrubber()
1011 : reserved(false), reserve_failed(false),
1012 epoch_start(0),
1013 active(false),
1014 waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
1015 must_scrub(false), must_deep_scrub(false), must_repair(false),
1016 auto_repair(false),
1017 num_digest_updates_pending(0),
1018 state(INACTIVE),
1019 deep(false),
1020 seed(0)
1021 {}
1022
1023 PG::Scrubber::~Scrubber() {}
1024
1025 /**
1026 * find_best_info
1027 *
1028 * Returns an iterator to the best info in infos sorted by:
1029 * 1) Prefer newer last_update
1030 * 2) Prefer longer tail if it brings another info into contiguity
1031 * 3) Prefer current primary
1032 */
1033 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1034 const map<pg_shard_t, pg_info_t> &infos,
1035 bool restrict_to_up_acting,
1036 bool *history_les_bound) const
1037 {
1038 assert(history_les_bound);
1039 /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1040 * to make changes to this process. Also, make sure to update it
1041 * when you find bugs! */
1042 eversion_t min_last_update_acceptable = eversion_t::max();
1043 epoch_t max_last_epoch_started_found = 0;
1044 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1045 i != infos.end();
1046 ++i) {
1047 if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1048 max_last_epoch_started_found < i->second.history.last_epoch_started) {
1049 *history_les_bound = true;
1050 max_last_epoch_started_found = i->second.history.last_epoch_started;
1051 }
1052 if (!i->second.is_incomplete() &&
1053 max_last_epoch_started_found < i->second.last_epoch_started) {
1054 max_last_epoch_started_found = i->second.last_epoch_started;
1055 }
1056 }
1057 for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1058 i != infos.end();
1059 ++i) {
1060 if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1061 if (min_last_update_acceptable > i->second.last_update)
1062 min_last_update_acceptable = i->second.last_update;
1063 }
1064 }
1065 if (min_last_update_acceptable == eversion_t::max())
1066 return infos.end();
1067
1068 map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1069 // find osd with newest last_update (oldest for ec_pool).
1070 // if there are multiples, prefer
1071 // - a longer tail, if it brings another peer into log contiguity
1072 // - the current primary
1073 for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1074 p != infos.end();
1075 ++p) {
1076 if (restrict_to_up_acting && !is_up(p->first) &&
1077 !is_acting(p->first))
1078 continue;
1079 // Only consider peers with last_update >= min_last_update_acceptable
1080 if (p->second.last_update < min_last_update_acceptable)
1081 continue;
1082 // Disqualify anyone with a too old last_epoch_started
1083 if (p->second.last_epoch_started < max_last_epoch_started_found)
1084 continue;
1085 // Disqualify anyone who is incomplete (not fully backfilled)
1086 if (p->second.is_incomplete())
1087 continue;
1088 if (best == infos.end()) {
1089 best = p;
1090 continue;
1091 }
1092 // Prefer newer last_update
1093 if (pool.info.require_rollback()) {
1094 if (p->second.last_update > best->second.last_update)
1095 continue;
1096 if (p->second.last_update < best->second.last_update) {
1097 best = p;
1098 continue;
1099 }
1100 } else {
1101 if (p->second.last_update < best->second.last_update)
1102 continue;
1103 if (p->second.last_update > best->second.last_update) {
1104 best = p;
1105 continue;
1106 }
1107 }
1108
1109 // Prefer longer tail
1110 if (p->second.log_tail > best->second.log_tail) {
1111 continue;
1112 } else if (p->second.log_tail < best->second.log_tail) {
1113 best = p;
1114 continue;
1115 }
1116
1117 // prefer current primary (usually the caller), all things being equal
1118 if (p->first == pg_whoami) {
1119 dout(10) << "calc_acting prefer osd." << p->first
1120 << " because it is current primary" << dendl;
1121 best = p;
1122 continue;
1123 }
1124 }
1125 return best;
1126 }
1127
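/**
 * calculate the desired acting set for an erasure-coded pool.
 *
 * Each position i is filled by up[i] if it is complete and its log reaches the
 * auth shard's tail, otherwise by acting[i], otherwise (unless
 * restrict_to_up_acting) by any suitable stray shard for that position; an
 * up[i] that cannot be used directly is queued for backfill.
 */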
1128 void PG::calc_ec_acting(
1129 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1130 unsigned size,
1131 const vector<int> &acting,
1132 pg_shard_t acting_primary,
1133 const vector<int> &up,
1134 pg_shard_t up_primary,
1135 const map<pg_shard_t, pg_info_t> &all_info,
1136 bool restrict_to_up_acting,
1137 vector<int> *_want,
1138 set<pg_shard_t> *backfill,
1139 set<pg_shard_t> *acting_backfill,
1140 pg_shard_t *want_primary,
1141 ostream &ss)
1142 {
1143 vector<int> want(size, CRUSH_ITEM_NONE);
1144 map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1145 unsigned usable = 0;
1146 for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1147 i != all_info.end();
1148 ++i) {
1149 all_info_by_shard[i->first.shard].insert(i->first);
1150 }
1151 for (uint8_t i = 0; i < want.size(); ++i) {
1152 ss << "For position " << (unsigned)i << ": ";
1153 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1154 !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1155 all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1156 auth_log_shard->second.log_tail) {
1157 ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1158 want[i] = up[i];
1159 ++usable;
1160 continue;
1161 }
1162 if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1163 ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1164 << " and ";
1165 backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1166 }
1167
1168 if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1169 !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1170 all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1171 auth_log_shard->second.log_tail) {
1172 ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1173 want[i] = acting[i];
1174 ++usable;
1175 } else if (!restrict_to_up_acting) {
1176 for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1177 j != all_info_by_shard[shard_id_t(i)].end();
1178 ++j) {
1179 assert(j->shard == i);
1180 if (!all_info.find(*j)->second.is_incomplete() &&
1181 all_info.find(*j)->second.last_update >=
1182 auth_log_shard->second.log_tail) {
1183 ss << " selecting stray: " << *j << std::endl;
1184 want[i] = j->osd;
1185 ++usable;
1186 break;
1187 }
1188 }
1189 if (want[i] == CRUSH_ITEM_NONE)
1190 ss << " failed to fill position " << (int)i << std::endl;
1191 }
1192 }
1193
1194 bool found_primary = false;
1195 for (uint8_t i = 0; i < want.size(); ++i) {
1196 if (want[i] != CRUSH_ITEM_NONE) {
1197 acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1198 if (!found_primary) {
1199 *want_primary = pg_shard_t(want[i], shard_id_t(i));
1200 found_primary = true;
1201 }
1202 }
1203 }
1204 acting_backfill->insert(backfill->begin(), backfill->end());
1205 _want->swap(want);
1206 }
1207
1208 /**
1209 * calculate the desired acting set.
1210 *
1211 * Choose an appropriate acting set. Prefer up[0], unless it is
1212 * incomplete, or another osd has a longer tail that allows us to
1213 * bring other up nodes up to date.
1214 */
1215 void PG::calc_replicated_acting(
1216 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1217 unsigned size,
1218 const vector<int> &acting,
1219 pg_shard_t acting_primary,
1220 const vector<int> &up,
1221 pg_shard_t up_primary,
1222 const map<pg_shard_t, pg_info_t> &all_info,
1223 bool restrict_to_up_acting,
1224 vector<int> *want,
1225 set<pg_shard_t> *backfill,
1226 set<pg_shard_t> *acting_backfill,
1227 pg_shard_t *want_primary,
1228 ostream &ss)
1229 {
1230 ss << "calc_acting newest update on osd." << auth_log_shard->first
1231 << " with " << auth_log_shard->second
1232 << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1233 pg_shard_t auth_log_shard_id = auth_log_shard->first;
1234
1235 // select primary
1236 map<pg_shard_t,pg_info_t>::const_iterator primary;
1237 if (up.size() &&
1238 !all_info.find(up_primary)->second.is_incomplete() &&
1239 all_info.find(up_primary)->second.last_update >=
1240 auth_log_shard->second.log_tail) {
1241 ss << "up_primary: " << up_primary << ") selected as primary" << std::endl;
1242 primary = all_info.find(up_primary); // prefer up[0], all thing being equal
1243 } else {
1244 assert(!auth_log_shard->second.is_incomplete());
1245 ss << "up[0] needs backfill, osd." << auth_log_shard_id
1246 << " selected as primary instead" << std::endl;
1247 primary = auth_log_shard;
1248 }
1249
1250 ss << "calc_acting primary is osd." << primary->first
1251 << " with " << primary->second << std::endl;
1252 *want_primary = primary->first;
1253 want->push_back(primary->first.osd);
1254 acting_backfill->insert(primary->first);
1255 unsigned usable = 1;
1256
1257 // select replicas that have log contiguity with primary.
1258 // prefer up, then acting, then any peer_info osds
1259 for (vector<int>::const_iterator i = up.begin();
1260 i != up.end();
1261 ++i) {
1262 pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1263 if (up_cand == primary->first)
1264 continue;
1265 const pg_info_t &cur_info = all_info.find(up_cand)->second;
1266 if (cur_info.is_incomplete() ||
1267 cur_info.last_update < MIN(
1268 primary->second.log_tail,
1269 auth_log_shard->second.log_tail)) {
1270 /* We include auth_log_shard->second.log_tail because in GetLog,
1271 * we will request logs back to the min last_update over our
1272 * acting_backfill set, which will result in our log being extended
1273 * as far backwards as necessary to pick up any peers which can
1274 * be log recovered by auth_log_shard's log */
1275 ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1276 backfill->insert(up_cand);
1277 acting_backfill->insert(up_cand);
1278 } else {
1279 want->push_back(*i);
1280 acting_backfill->insert(up_cand);
1281 usable++;
1282 ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1283 }
1284 }
1285
1286 // This no longer has backfill OSDs, but they are covered above.
1287 for (vector<int>::const_iterator i = acting.begin();
1288 i != acting.end();
1289 ++i) {
1290 pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1291 if (usable >= size)
1292 break;
1293
1294 // skip up osds we already considered above
1295 if (acting_cand == primary->first)
1296 continue;
1297 vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1298 if (up_it != up.end())
1299 continue;
1300
1301 const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1302 if (cur_info.is_incomplete() ||
1303 cur_info.last_update < primary->second.log_tail) {
1304 ss << " shard " << acting_cand << " (stray) REJECTED "
1305 << cur_info << std::endl;
1306 } else {
1307 want->push_back(*i);
1308 acting_backfill->insert(acting_cand);
1309 ss << " shard " << acting_cand << " (stray) accepted "
1310 << cur_info << std::endl;
1311 usable++;
1312 }
1313 }
1314
1315 if (restrict_to_up_acting) {
1316 return;
1317 }
1318 for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1319 i != all_info.end();
1320 ++i) {
1321 if (usable >= size)
1322 break;
1323
1324 // skip up osds we already considered above
1325 if (i->first == primary->first)
1326 continue;
1327 vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1328 if (up_it != up.end())
1329 continue;
1330 vector<int>::const_iterator acting_it = find(
1331 acting.begin(), acting.end(), i->first.osd);
1332 if (acting_it != acting.end())
1333 continue;
1334
1335 if (i->second.is_incomplete() ||
1336 i->second.last_update < primary->second.log_tail) {
1337 ss << " shard " << i->first << " (stray) REJECTED "
1338 << i->second << std::endl;
1339 } else {
1340 want->push_back(i->first.osd);
1341 acting_backfill->insert(i->first);
1342 ss << " shard " << i->first << " (stray) accepted "
1343 << i->second << std::endl;
1344 usable++;
1345 }
1346 }
1347 }
1348
1349 /**
1350 * choose acting
1351 *
1352 * calculate the desired acting, and request a change with the monitor
1353 * if it differs from the current acting.
1354 *
1355 * if restrict_to_up_acting=true, we filter out anything that's not in
1356 * up/acting. in order to lift this restriction, we need to
1357 * 1) check whether it's worth switching the acting set any time we get
1358 * a new pg info (not just here, when recovery finishes)
1359 * 2) check whether anything in want_acting went down on each new map
1360 * (and, if so, calculate a new want_acting)
1361 * 3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1362 * TODO!
1363 */
1364 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1365 bool restrict_to_up_acting,
1366 bool *history_les_bound)
1367 {
1368 map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1369 all_info[pg_whoami] = info;
1370
1371 for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1372 p != all_info.end();
1373 ++p) {
1374 dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1375 }
1376
1377 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1378 find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1379
1380 if (auth_log_shard == all_info.end()) {
1381 if (up != acting) {
1382 dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1383 << " reverting to up" << dendl;
1384 want_acting = up;
1385 vector<int> empty;
1386 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1387 } else {
1388 dout(10) << "choose_acting failed" << dendl;
1389 assert(want_acting.empty());
1390 }
1391 return false;
1392 }
1393
1394 assert(!auth_log_shard->second.is_incomplete());
1395 auth_log_shard_id = auth_log_shard->first;
1396
1397 set<pg_shard_t> want_backfill, want_acting_backfill;
1398 vector<int> want;
1399 pg_shard_t want_primary;
1400 stringstream ss;
1401 if (!pool.info.ec_pool())
1402 calc_replicated_acting(
1403 auth_log_shard,
1404 get_osdmap()->get_pg_size(info.pgid.pgid),
1405 acting,
1406 primary,
1407 up,
1408 up_primary,
1409 all_info,
1410 restrict_to_up_acting,
1411 &want,
1412 &want_backfill,
1413 &want_acting_backfill,
1414 &want_primary,
1415 ss);
1416 else
1417 calc_ec_acting(
1418 auth_log_shard,
1419 get_osdmap()->get_pg_size(info.pgid.pgid),
1420 acting,
1421 primary,
1422 up,
1423 up_primary,
1424 all_info,
1425 restrict_to_up_acting,
1426 &want,
1427 &want_backfill,
1428 &want_acting_backfill,
1429 &want_primary,
1430 ss);
1431 dout(10) << ss.str() << dendl;
1432
1433 unsigned num_want_acting = 0;
1434 set<pg_shard_t> have;
1435 for (int i = 0; i < (int)want.size(); ++i) {
1436 if (want[i] != CRUSH_ITEM_NONE) {
1437 ++num_want_acting;
1438 have.insert(
1439 pg_shard_t(
1440 want[i],
1441 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1442 }
1443 }
1444
1445 // We go incomplete if below min_size for ec_pools since backfill
1446 // does not currently maintain rollbackability
1447 // Otherwise, we will go "peered", but not "active"
1448 if (num_want_acting < pool.info.min_size &&
1449 (pool.info.ec_pool() ||
1450 !cct->_conf->osd_allow_recovery_below_min_size)) {
1451 want_acting.clear();
1452 dout(10) << "choose_acting failed, below min size" << dendl;
1453 return false;
1454 }
1455
1456 /* Check whether we have enough acting shards to later perform recovery */
1457 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1458 get_pgbackend()->get_is_recoverable_predicate());
1459 if (!(*recoverable_predicate)(have)) {
1460 want_acting.clear();
1461 dout(10) << "choose_acting failed, not recoverable" << dendl;
1462 return false;
1463 }
1464
1465 if (want != acting) {
1466 dout(10) << "choose_acting want " << want << " != acting " << acting
1467 << ", requesting pg_temp change" << dendl;
1468 want_acting = want;
1469
1470 if (want_acting == up) {
1471 // There can't be any pending backfill if
1472 // want is the same as crush map up OSDs.
1473 assert(want_backfill.empty());
1474 vector<int> empty;
1475 osd->queue_want_pg_temp(info.pgid.pgid, empty);
1476 } else
1477 osd->queue_want_pg_temp(info.pgid.pgid, want);
1478 return false;
1479 }
1480 want_acting.clear();
1481 actingbackfill = want_acting_backfill;
1482 dout(10) << "actingbackfill is " << actingbackfill << dendl;
1483 assert(backfill_targets.empty() || backfill_targets == want_backfill);
1484 if (backfill_targets.empty()) {
1485 // Caller is GetInfo
1486 backfill_targets = want_backfill;
1487 }
1488 // Will not change if already set because up would have had to change
1489 // Verify that nothing in backfill is in stray_set
1490 for (set<pg_shard_t>::iterator i = want_backfill.begin();
1491 i != want_backfill.end();
1492 ++i) {
1493 assert(stray_set.find(*i) == stray_set.end());
1494 }
1495 dout(10) << "choose_acting want " << want << " (== acting) backfill_targets "
1496 << want_backfill << dendl;
1497 return true;
1498 }
1499
1500 /* Build the might_have_unfound set.
1501 *
1502 * This is used by the primary OSD during recovery.
1503 *
1504 * This set tracks the OSDs which might have unfound objects that the primary
1505 * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1506 * will remove the OSD from the set.
1507 */
1508 void PG::build_might_have_unfound()
1509 {
1510 assert(might_have_unfound.empty());
1511 assert(is_primary());
1512
1513 dout(10) << __func__ << dendl;
1514
1515 check_past_interval_bounds();
1516
1517 might_have_unfound = past_intervals.get_might_have_unfound(
1518 pg_whoami,
1519 pool.info.ec_pool());
1520
1521 // include any (stray) peers
1522 for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1523 p != peer_info.end();
1524 ++p)
1525 might_have_unfound.insert(p->first);
1526
1527 dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1528 }
1529
1530 struct C_PG_ActivateCommitted : public Context {
1531 PGRef pg;
1532 epoch_t epoch;
1533 epoch_t activation_epoch;
1534 C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1535 : pg(p), epoch(e), activation_epoch(ae) {}
1536 void finish(int r) override {
1537 pg->_activate_committed(epoch, activation_epoch);
1538 }
1539 };
1540
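/**
 * activate
 *
 * Leave peering: update last_epoch_started/last_interval_started, register the
 * on-commit callback, initialize snap_trimq and the last_complete pointer, and,
 * on the primary, bring each actingbackfill peer up to date with an MOSDPGLog
 * (or reset it for backfill) and populate missing_loc for recovery.
 */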
1541 void PG::activate(ObjectStore::Transaction& t,
1542 epoch_t activation_epoch,
1543 list<Context*>& tfin,
1544 map<int, map<spg_t,pg_query_t> >& query_map,
1545 map<int,
1546 vector<
1547 pair<pg_notify_t,
1548 PastIntervals> > > *activator_map,
1549 RecoveryCtx *ctx)
1550 {
1551 assert(!is_peered());
1552 assert(scrubber.callbacks.empty());
1553 assert(callbacks_for_degraded_object.empty());
1554
1555 // twiddle pg state
1556 state_clear(PG_STATE_DOWN);
1557
1558 send_notify = false;
1559
1560 if (is_primary()) {
1561 // only update primary last_epoch_started if we will go active
1562 if (acting.size() >= pool.info.min_size) {
1563 assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1564 info.last_epoch_started <= activation_epoch);
1565 info.last_epoch_started = activation_epoch;
1566 info.last_interval_started = info.history.same_interval_since;
1567 }
1568 } else if (is_acting(pg_whoami)) {
1569 /* update last_epoch_started on acting replica to whatever the primary sent
1570 * unless it's smaller (could happen if we are going peered rather than
1571 * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1572 if (info.last_epoch_started < activation_epoch) {
1573 info.last_epoch_started = activation_epoch;
1574 info.last_interval_started = info.history.same_interval_since;
1575 }
1576 }
1577
1578 auto &missing = pg_log.get_missing();
1579
1580 if (is_primary()) {
1581 last_update_ondisk = info.last_update;
1582 min_last_complete_ondisk = eversion_t(0,0); // we don't know (yet)!
1583 }
1584 last_update_applied = info.last_update;
1585 last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1586
1587 need_up_thru = false;
1588
1589 // write pg info, log
1590 dirty_info = true;
1591 dirty_big_info = true; // maybe
1592
1593 // find out when we commit
1594 t.register_on_complete(
1595 new C_PG_ActivateCommitted(
1596 this,
1597 get_osdmap()->get_epoch(),
1598 activation_epoch));
1599
1600 // initialize snap_trimq
1601 if (is_primary()) {
1602 dout(20) << "activate - purged_snaps " << info.purged_snaps
1603 << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1604 snap_trimq = pool.cached_removed_snaps;
1605 interval_set<snapid_t> intersection;
1606 intersection.intersection_of(snap_trimq, info.purged_snaps);
1607 if (intersection == info.purged_snaps) {
1608 snap_trimq.subtract(info.purged_snaps);
1609 } else {
1610 dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1611 << ") is not a subset of pool.cached_removed_snaps ("
1612 << pool.cached_removed_snaps << ")" << dendl;
1613 snap_trimq.subtract(intersection);
1614 }
1615 }
1616
1617 // init complete pointer
1618 if (missing.num_missing() == 0) {
1619 dout(10) << "activate - no missing, moving last_complete " << info.last_complete
1620 << " -> " << info.last_update << dendl;
1621 info.last_complete = info.last_update;
1622 pg_log.reset_recovery_pointers();
1623 } else {
1624 dout(10) << "activate - not complete, " << missing << dendl;
1625 pg_log.activate_not_complete(info);
1626 }
1627
1628 log_weirdness();
1629
1630 // if primary..
1631 if (is_primary()) {
1632 assert(ctx);
1633 // start up replicas
1634
1635 assert(!actingbackfill.empty());
1636 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1637 i != actingbackfill.end();
1638 ++i) {
1639 if (*i == pg_whoami) continue;
1640 pg_shard_t peer = *i;
1641 assert(peer_info.count(peer));
1642 pg_info_t& pi = peer_info[peer];
1643
1644 dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1645
1646 MOSDPGLog *m = 0;
1647 assert(peer_missing.count(peer));
1648 pg_missing_t& pm = peer_missing[peer];
1649
1650 bool needs_past_intervals = pi.dne();
1651
1652 /*
1653 * cover case where peer sort order was different and
1654 * last_backfill cannot be interpreted
1655 */
1656 bool force_restart_backfill =
1657 !pi.last_backfill.is_max() &&
1658 !pi.last_backfill_bitwise;
1659
1660 if (pi.last_update == info.last_update && !force_restart_backfill) {
1661 // empty log
1662 if (!pi.last_backfill.is_max())
1663 osd->clog->info() << info.pgid << " continuing backfill to osd."
1664 << peer
1665 << " from (" << pi.log_tail << "," << pi.last_update
1666 << "] " << pi.last_backfill
1667 << " to " << info.last_update;
1668 if (!pi.is_empty() && activator_map) {
1669 dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1670 (*activator_map)[peer.osd].push_back(
1671 make_pair(
1672 pg_notify_t(
1673 peer.shard, pg_whoami.shard,
1674 get_osdmap()->get_epoch(),
1675 get_osdmap()->get_epoch(),
1676 info),
1677 past_intervals));
1678 } else {
1679 dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1680 m = new MOSDPGLog(
1681 i->shard, pg_whoami.shard,
1682 get_osdmap()->get_epoch(), info);
1683 }
1684 } else if (
1685 pg_log.get_tail() > pi.last_update ||
1686 pi.last_backfill == hobject_t() ||
1687 force_restart_backfill ||
1688 (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1689 /* ^ This last case covers a situation where a replica is not contiguous
1690 * with the auth_log, but is contiguous with this replica. Reshuffling
1691 * the active set to handle this would be tricky, so instead we just go
1692 * ahead and backfill it anyway. This is probably preferable in any
1693 * case since the replica in question would have to be significantly
1694 * behind.
1695 */
1696 // backfill
1697 osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1698 << " from (" << pi.log_tail << "," << pi.last_update
1699 << "] " << pi.last_backfill
1700 << " to " << info.last_update;
1701
1702 pi.last_update = info.last_update;
1703 pi.last_complete = info.last_update;
1704 pi.set_last_backfill(hobject_t());
1705 pi.last_epoch_started = info.last_epoch_started;
1706 pi.last_interval_started = info.last_interval_started;
1707 pi.history = info.history;
1708 pi.hit_set = info.hit_set;
1709 pi.stats.stats.clear();
1710
1711 // initialize peer with our purged_snaps.
1712 pi.purged_snaps = info.purged_snaps;
1713
1714 m = new MOSDPGLog(
1715 i->shard, pg_whoami.shard,
1716 get_osdmap()->get_epoch(), pi);
1717
1718 // send some recent log, so that op dup detection works well.
1719 m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1720 m->info.log_tail = m->log.tail;
1721 pi.log_tail = m->log.tail; // sigh...
1722
1723 pm.clear();
1724 } else {
1725 // catch up
1726 assert(pg_log.get_tail() <= pi.last_update);
1727 m = new MOSDPGLog(
1728 i->shard, pg_whoami.shard,
1729 get_osdmap()->get_epoch(), info);
1730 // send new stuff to append to replicas log
1731 m->log.copy_after(pg_log.get_log(), pi.last_update);
1732 }
1733
1734 // share past_intervals if we are creating the pg on the replica
1735 // based on whether our info for that peer was dne() *before*
1736 // updating pi.history in the backfill block above.
1737 if (m && needs_past_intervals)
1738 m->past_intervals = past_intervals;
1739
1740 // update local version of peer's missing list!
1741 if (m && pi.last_backfill != hobject_t()) {
1742 for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1743 p != m->log.log.end();
1744 ++p) {
1745 if (p->soid <= pi.last_backfill &&
1746 !p->is_error()) {
1747 if (perform_deletes_during_peering() && p->is_delete()) {
1748 pm.rm(p->soid, p->version);
1749 } else {
1750 pm.add_next_event(*p);
1751 }
1752 }
1753 }
1754 }
1755
1756 if (m) {
1757 dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1758 //m->log.print(cout);
1759 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1760 }
1761
1762 // peer now has
1763 pi.last_update = info.last_update;
1764
1765 // update our missing
1766 if (pm.num_missing() == 0) {
1767 pi.last_complete = pi.last_update;
1768 dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1769 } else {
1770 dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1771 }
1772 }
1773
1774 // Set up missing_loc
1775 set<pg_shard_t> complete_shards;
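// a shard is "complete" if it has nothing missing and (for non-primary
// shards) is fully backfilled; if all but one shard turns out to be
// complete, they can be batch-added as recovery sources below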
1776 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1777 i != actingbackfill.end();
1778 ++i) {
1779 dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1780 if (*i == get_primary()) {
1781 missing_loc.add_active_missing(missing);
1782 if (!missing.have_missing())
1783 complete_shards.insert(*i);
1784 } else {
1785 auto peer_missing_entry = peer_missing.find(*i);
1786 assert(peer_missing_entry != peer_missing.end());
1787 missing_loc.add_active_missing(peer_missing_entry->second);
1788 if (!peer_missing_entry->second.have_missing() &&
1789 peer_info[*i].last_backfill.is_max())
1790 complete_shards.insert(*i);
1791 }
1792 }
1793 // If necessary, create might_have_unfound to help us find our unfound objects.
1794 // NOTE: It's important that we build might_have_unfound before trimming the
1795 // past intervals.
1796 might_have_unfound.clear();
1797 if (needs_recovery()) {
1798 // If only one shard has missing objects, we add all other shards as recovery
1799 // sources. This is considered safe since the PGLogs have been merged locally,
1800 // and it covers the vast majority of use cases, e.g. one OSD/host being down
1801 // for a while for hardware repair.
1802 if (complete_shards.size() + 1 == actingbackfill.size()) {
1803 missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1804 } else {
1805 missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1806 ctx->handle);
1807 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1808 i != actingbackfill.end();
1809 ++i) {
1810 if (*i == pg_whoami) continue;
1811 dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1812 assert(peer_missing.count(*i));
1813 assert(peer_info.count(*i));
1814 missing_loc.add_source_info(
1815 *i,
1816 peer_info[*i],
1817 peer_missing[*i],
1818 ctx->handle);
1819 }
1820 }
1821 for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1822 i != peer_missing.end();
1823 ++i) {
1824 if (is_actingbackfill(i->first))
1825 continue;
1826 assert(peer_info.count(i->first));
1827 search_for_missing(
1828 peer_info[i->first],
1829 i->second,
1830 i->first,
1831 ctx);
1832 }
1833
1834 build_might_have_unfound();
1835
1836 if (have_unfound())
1837 discover_all_missing(query_map);
1838 }
1839
1840 // num_objects_degraded, if calculated, should reflect this too, unless
1841 // nothing is missing and we are about to go clean.
1842 if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1843 state_set(PG_STATE_UNDERSIZED);
1844 }
1845
1846 state_set(PG_STATE_ACTIVATING);
1847 release_pg_backoffs();
1848 projected_last_update = info.last_update;
1849 }
1850 if (acting.size() >= pool.info.min_size) {
1851 PGLogEntryHandler handler{this, &t};
1852 pg_log.roll_forward(&handler);
1853 }
1854 }
1855
1856 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1857 {
1858 // only check MOSDOp
1859 if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1860 return true;
1861
1862 const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1863
1864 Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1865 if (!session) {
1866 dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1867 return false;
1868 }
1869 OSDCap& caps = session->caps;
1870 session->put();
1871
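// use the object locator key for the cap check when one is set,
// otherwise fall back to the object name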
1872 const string &key = req->get_hobj().get_key().empty() ?
1873 req->get_oid().name :
1874 req->get_hobj().get_key();
1875
1876 bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1877 pool.auid, key,
1878 op->need_read_cap(),
1879 op->need_write_cap(),
1880 op->classes());
1881
1882 dout(20) << "op_has_sufficient_caps "
1883 << "session=" << session
1884 << " pool=" << pool.id << " (" << pool.name
1885 << " " << req->get_hobj().nspace
1886 << ") owner=" << pool.auid
1887 << " need_read_cap=" << op->need_read_cap()
1888 << " need_write_cap=" << op->need_write_cap()
1889 << " classes=" << op->classes()
1890 << " -> " << (cap ? "yes" : "NO")
1891 << dendl;
1892 return cap;
1893 }
1894
1895 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1896 {
1897 lock();
1898 if (pg_has_reset_since(epoch)) {
1899 dout(10) << "_activate_committed " << epoch
1900 << ", that was an old interval" << dendl;
1901 } else if (is_primary()) {
1902 peer_activated.insert(pg_whoami);
1903 dout(10) << "_activate_committed " << epoch
1904 << " peer_activated now " << peer_activated
1905 << " last_interval_started " << info.history.last_interval_started
1906 << " last_epoch_started " << info.history.last_epoch_started
1907 << " same_interval_since " << info.history.same_interval_since << dendl;
1908 assert(!actingbackfill.empty());
1909 if (peer_activated.size() == actingbackfill.size())
1910 all_activated_and_committed();
1911 } else {
1912 dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1913 MOSDPGInfo *m = new MOSDPGInfo(epoch);
1914 pg_notify_t i = pg_notify_t(
1915 get_primary().shard, pg_whoami.shard,
1916 get_osdmap()->get_epoch(),
1917 get_osdmap()->get_epoch(),
1918 info);
1919
1920 i.info.history.last_epoch_started = activation_epoch;
1921 i.info.history.last_interval_started = i.info.history.same_interval_since;
1922 if (acting.size() >= pool.info.min_size) {
1923 state_set(PG_STATE_ACTIVE);
1924 } else {
1925 state_set(PG_STATE_PEERED);
1926 }
1927
1928 m->pg_list.push_back(make_pair(i, PastIntervals()));
1929 osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1930
1931 // waiters
1932 if (flushes_in_progress == 0) {
1933 requeue_ops(waiting_for_peered);
1934 } else if (!waiting_for_peered.empty()) {
1935 dout(10) << __func__ << " flushes in progress, moving "
1936 << waiting_for_peered.size() << " items to waiting_for_flush"
1937 << dendl;
1938 assert(waiting_for_flush.empty());
1939 waiting_for_flush.swap(waiting_for_peered);
1940 }
1941 }
1942
1943 assert(!dirty_info);
1944
1945 unlock();
1946 }
1947
1948 /*
1949 * update info.history.last_epoch_started ONLY after we and all
1950 * replicas have activated AND committed the activate transaction
1951 * (i.e. the peering results are stable on disk).
1952 */
1953 void PG::all_activated_and_committed()
1954 {
1955 dout(10) << "all_activated_and_committed" << dendl;
1956 assert(is_primary());
1957 assert(peer_activated.size() == actingbackfill.size());
1958 assert(!actingbackfill.empty());
1959 assert(blocked_by.empty());
1960
1961 // Degraded?
1962 _update_calc_stats();
1963 if (info.stats.stats.sum.num_objects_degraded) {
1964 state_set(PG_STATE_DEGRADED);
1965 } else {
1966 state_clear(PG_STATE_DEGRADED);
1967 }
1968
1969 queue_peering_event(
1970 CephPeeringEvtRef(
1971 std::make_shared<CephPeeringEvt>(
1972 get_osdmap()->get_epoch(),
1973 get_osdmap()->get_epoch(),
1974 AllReplicasActivated())));
1975 }
1976
1977 bool PG::requeue_scrub(bool high_priority)
1978 {
1979 assert(is_locked());
1980 if (scrub_queued) {
1981 dout(10) << __func__ << ": already queued" << dendl;
1982 return false;
1983 } else {
1984 dout(10) << __func__ << ": queueing" << dendl;
1985 scrub_queued = true;
1986 osd->queue_for_scrub(this, high_priority);
1987 return true;
1988 }
1989 }
1990
1991 void PG::queue_recovery()
1992 {
1993 if (!is_primary() || !is_peered()) {
1994 dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1995 assert(!recovery_queued);
1996 } else if (recovery_queued) {
1997 dout(10) << "queue_recovery -- already queued" << dendl;
1998 } else {
1999 dout(10) << "queue_recovery -- queuing" << dendl;
2000 recovery_queued = true;
2001 osd->queue_for_recovery(this);
2002 }
2003 }
2004
2005 bool PG::queue_scrub()
2006 {
2007 assert(is_locked());
2008 if (is_scrubbing()) {
2009 return false;
2010 }
2011 scrubber.priority = scrubber.must_scrub ?
2012 cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2013 scrubber.must_scrub = false;
2014 state_set(PG_STATE_SCRUBBING);
2015 if (scrubber.must_deep_scrub) {
2016 state_set(PG_STATE_DEEP_SCRUB);
2017 scrubber.must_deep_scrub = false;
2018 }
2019 if (scrubber.must_repair || scrubber.auto_repair) {
2020 state_set(PG_STATE_REPAIR);
2021 scrubber.must_repair = false;
2022 }
2023 requeue_scrub();
2024 return true;
2025 }
2026
2027 unsigned PG::get_scrub_priority()
2028 {
2029 // a higher value -> a higher priority
2030 int pool_scrub_priority = 0;
2031 pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2032 return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2033 }
2034
2035 struct C_PG_FinishRecovery : public Context {
2036 PGRef pg;
2037 explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2038 void finish(int r) override {
2039 pg->_finish_recovery(this);
2040 }
2041 };
2042
2043 void PG::mark_clean()
2044 {
2045 if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2046 state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2047 state_set(PG_STATE_CLEAN);
2048 info.history.last_epoch_clean = get_osdmap()->get_epoch();
2049 info.history.last_interval_clean = info.history.same_interval_since;
2050 past_intervals.clear();
2051 dirty_big_info = true;
2052 dirty_info = true;
2053 }
2054
2055 kick_snap_trim();
2056 }
2057
2058 void PG::_change_recovery_force_mode(int new_mode, bool clear)
2059 {
2060 if (!deleting) {
2061 // we can't and shouldn't do anything if the PG is being deleted locally
2062 if (clear) {
2063 state_clear(new_mode);
2064 } else {
2065 state_set(new_mode);
2066 }
2067 publish_stats_to_osd();
2068 }
2069 }
2070
2071 inline int PG::clamp_recovery_priority(int priority)
2072 {
2073 static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2074 static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2075
2076 // Clamp to valid range
2077 if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2078 return OSD_RECOVERY_PRIORITY_MAX;
2079 } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2080 return OSD_RECOVERY_PRIORITY_MIN;
2081 } else {
2082 return priority;
2083 }
2084 }
2085
2086 unsigned PG::get_recovery_priority()
2087 {
2088 // a higher value -> a higher priority
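// forced recovery uses a fixed high priority; otherwise the pool's
// recovery_priority option (if any) is added to the base value and
// clamped to the valid range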
2089 int ret = 0;
2090
2091 if (state & PG_STATE_FORCED_RECOVERY) {
2092 ret = OSD_RECOVERY_PRIORITY_FORCED;
2093 } else {
2094 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2095 ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2096 }
2097 dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2098 return static_cast<unsigned>(ret);
2099 }
2100
2101 unsigned PG::get_backfill_priority()
2102 {
2103 // a higher value -> a higher priority
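// priority ladder, highest first: forced backfill, inactive (acting below
// min_size), undersized/degraded, then the plain base; non-forced values
// are offset by the pool's recovery_priority option and clamped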
2104 int ret = OSD_BACKFILL_PRIORITY_BASE;
2105 if (state & PG_STATE_FORCED_BACKFILL) {
2106 ret = OSD_RECOVERY_PRIORITY_FORCED;
2107 } else {
2108 if (acting.size() < pool.info.min_size) {
2109 // inactive: no. of replicas < min_size, highest priority since it blocks IO
2110 ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2111
2112 } else if (is_undersized()) {
2113 // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2114 assert(pool.info.size > actingset.size());
2115 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2116
2117 } else if (is_degraded()) {
2118 // degraded: baseline degraded
2119 ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2120 }
2121
2122 // Adjust with pool's recovery priority
2123 int pool_recovery_priority = 0;
2124 pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2125
2126 ret = clamp_recovery_priority(pool_recovery_priority + ret);
2127 }
2128
2129 return static_cast<unsigned>(ret);
2130 }
2131
2132 void PG::finish_recovery(list<Context*>& tfin)
2133 {
2134 dout(10) << "finish_recovery" << dendl;
2135 assert(info.last_complete == info.last_update);
2136
2137 clear_recovery_state();
2138
2139 /*
2140 * sync all this before purging strays. but don't block!
2141 */
2142 finish_sync_event = new C_PG_FinishRecovery(this);
2143 tfin.push_back(finish_sync_event);
2144 }
2145
2146 void PG::_finish_recovery(Context *c)
2147 {
2148 lock();
2149 if (deleting) {
2150 unlock();
2151 return;
2152 }
2153 if (c == finish_sync_event) {
2154 dout(10) << "_finish_recovery" << dendl;
2155 finish_sync_event = 0;
2156 purge_strays();
2157
2158 publish_stats_to_osd();
2159
2160 if (scrub_after_recovery) {
2161 dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2162 scrub_after_recovery = false;
2163 scrubber.must_deep_scrub = true;
2164 queue_scrub();
2165 }
2166 } else {
2167 dout(10) << "_finish_recovery -- stale" << dendl;
2168 }
2169 unlock();
2170 }
2171
2172 void PG::start_recovery_op(const hobject_t& soid)
2173 {
2174 dout(10) << "start_recovery_op " << soid
2175 #ifdef DEBUG_RECOVERY_OIDS
2176 << " (" << recovering_oids << ")"
2177 #endif
2178 << dendl;
2179 assert(recovery_ops_active >= 0);
2180 recovery_ops_active++;
2181 #ifdef DEBUG_RECOVERY_OIDS
2182 assert(recovering_oids.count(soid) == 0);
2183 recovering_oids.insert(soid);
2184 #endif
2185 osd->start_recovery_op(this, soid);
2186 }
2187
2188 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2189 {
2190 dout(10) << "finish_recovery_op " << soid
2191 #ifdef DEBUG_RECOVERY_OIDS
2192 << " (" << recovering_oids << ")"
2193 #endif
2194 << dendl;
2195 assert(recovery_ops_active > 0);
2196 recovery_ops_active--;
2197 #ifdef DEBUG_RECOVERY_OIDS
2198 assert(recovering_oids.count(soid));
2199 recovering_oids.erase(soid);
2200 #endif
2201 osd->finish_recovery_op(this, soid, dequeue);
2202
2203 if (!dequeue) {
2204 queue_recovery();
2205 }
2206 }
2207
2208 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2209 {
2210 child->update_snap_mapper_bits(split_bits);
2211 child->update_osdmap_ref(get_osdmap());
2212
2213 child->pool = pool;
2214
2215 // Log
2216 pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2217 child->info.last_complete = info.last_complete;
2218
2219 info.last_update = pg_log.get_head();
2220 child->info.last_update = child->pg_log.get_head();
2221
2222 child->info.last_user_version = info.last_user_version;
2223
2224 info.log_tail = pg_log.get_tail();
2225 child->info.log_tail = child->pg_log.get_tail();
2226
2227 if (info.last_complete < pg_log.get_tail())
2228 info.last_complete = pg_log.get_tail();
2229 if (child->info.last_complete < child->pg_log.get_tail())
2230 child->info.last_complete = child->pg_log.get_tail();
2231
2232 // Info
2233 child->info.history = info.history;
2234 child->info.history.epoch_created = get_osdmap()->get_epoch();
2235 child->info.purged_snaps = info.purged_snaps;
2236
2237 if (info.last_backfill.is_max()) {
2238 child->info.set_last_backfill(hobject_t::get_max());
2239 } else {
2240 // restart backfill on parent and child to be safe. we could
2241 // probably do better in the bitwise sort case, but it's more
2242 // fragile (there may be special work to do on backfill completion
2243 // in the future).
2244 info.set_last_backfill(hobject_t());
2245 child->info.set_last_backfill(hobject_t());
2246 // restarting backfill implies that the missing set is empty,
2247 // since it is only used for objects prior to last_backfill
2248 pg_log.reset_backfill();
2249 child->pg_log.reset_backfill();
2250 }
2251
2252 child->info.stats = info.stats;
2253 child->info.stats.parent_split_bits = split_bits;
2254 info.stats.stats_invalid = true;
2255 child->info.stats.stats_invalid = true;
2256 child->info.last_epoch_started = info.last_epoch_started;
2257 child->info.last_interval_started = info.last_interval_started;
2258
2259 child->snap_trimq = snap_trimq;
2260
2261 // There can't be recovery/backfill going on now
2262 int primary, up_primary;
2263 vector<int> newup, newacting;
2264 get_osdmap()->pg_to_up_acting_osds(
2265 child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2266 child->init_primary_up_acting(
2267 newup,
2268 newacting,
2269 up_primary,
2270 primary);
2271 child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2272
2273 // this comparison includes primary rank via pg_shard_t
2274 if (get_primary() != child->get_primary())
2275 child->info.history.same_primary_since = get_osdmap()->get_epoch();
2276
2277 child->info.stats.up = up;
2278 child->info.stats.up_primary = up_primary;
2279 child->info.stats.acting = acting;
2280 child->info.stats.acting_primary = primary;
2281 child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2282
2283 // History
2284 child->past_intervals = past_intervals;
2285
2286 _split_into(child_pgid, child, split_bits);
2287
2288 // release all backoffs for simplicity
2289 release_backoffs(hobject_t(), hobject_t::get_max());
2290
2291 child->on_new_interval();
2292
2293 child->dirty_info = true;
2294 child->dirty_big_info = true;
2295 dirty_info = true;
2296 dirty_big_info = true;
2297 }
2298
2299 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2300 {
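// register a new backoff for [begin, end) on this session and tell the
// client to block matching requests until it is released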
2301 ConnectionRef con = s->con;
2302 if (!con) // OSD::ms_handle_reset clears s->con without a lock
2303 return;
2304 BackoffRef b(s->have_backoff(info.pgid, begin));
2305 if (b) {
2306 derr << __func__ << " already have backoff for " << s << " begin " << begin
2307 << " " << *b << dendl;
2308 ceph_abort();
2309 }
2310 Mutex::Locker l(backoff_lock);
2311 {
2312 b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2313 backoffs[begin].insert(b);
2314 s->add_backoff(b);
2315 dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2316 }
2317 con->send_message(
2318 new MOSDBackoff(
2319 info.pgid,
2320 get_osdmap()->get_epoch(),
2321 CEPH_OSD_BACKOFF_OP_BLOCK,
2322 b->id,
2323 begin,
2324 end));
2325 }
2326
2327 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2328 {
2329 dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2330 vector<BackoffRef> bv;
2331 {
2332 Mutex::Locker l(backoff_lock);
2333 auto p = backoffs.lower_bound(begin);
2334 while (p != backoffs.end()) {
2335 int r = cmp(p->first, end);
2336 dout(20) << __func__ << " ? " << r << " " << p->first
2337 << " " << p->second << dendl;
2338 // note: must still examine begin=end=p->first case
2339 if (r > 0 || (r == 0 && begin < end)) {
2340 break;
2341 }
2342 dout(20) << __func__ << " checking " << p->first
2343 << " " << p->second << dendl;
2344 auto q = p->second.begin();
2345 while (q != p->second.end()) {
2346 dout(20) << __func__ << " checking " << *q << dendl;
2347 int r = cmp((*q)->begin, begin);
2348 if (r == 0 || (r > 0 && (*q)->end < end)) {
2349 bv.push_back(*q);
2350 q = p->second.erase(q);
2351 } else {
2352 ++q;
2353 }
2354 }
2355 if (p->second.empty()) {
2356 p = backoffs.erase(p);
2357 } else {
2358 ++p;
2359 }
2360 }
2361 }
2362 for (auto b : bv) {
2363 Mutex::Locker l(b->lock);
2364 dout(10) << __func__ << " " << *b << dendl;
2365 if (b->session) {
2366 assert(b->pg == this);
2367 ConnectionRef con = b->session->con;
2368 if (con) { // OSD::ms_handle_reset clears s->con without a lock
2369 con->send_message(
2370 new MOSDBackoff(
2371 info.pgid,
2372 get_osdmap()->get_epoch(),
2373 CEPH_OSD_BACKOFF_OP_UNBLOCK,
2374 b->id,
2375 b->begin,
2376 b->end));
2377 }
2378 if (b->is_new()) {
2379 b->state = Backoff::STATE_DELETING;
2380 } else {
2381 b->session->rm_backoff(b);
2382 b->session.reset();
2383 }
2384 b->pg.reset();
2385 }
2386 }
2387 }
2388
2389 void PG::clear_backoffs()
2390 {
2391 dout(10) << __func__ << " " << dendl;
2392 map<hobject_t,set<BackoffRef>> ls;
2393 {
2394 Mutex::Locker l(backoff_lock);
2395 ls.swap(backoffs);
2396 }
2397 for (auto& p : ls) {
2398 for (auto& b : p.second) {
2399 Mutex::Locker l(b->lock);
2400 dout(10) << __func__ << " " << *b << dendl;
2401 if (b->session) {
2402 assert(b->pg == this);
2403 if (b->is_new()) {
2404 b->state = Backoff::STATE_DELETING;
2405 } else {
2406 b->session->rm_backoff(b);
2407 b->session.reset();
2408 }
2409 b->pg.reset();
2410 }
2411 }
2412 }
2413 }
2414
2415 // called by Session::clear_backoffs()
2416 void PG::rm_backoff(BackoffRef b)
2417 {
2418 dout(10) << __func__ << " " << *b << dendl;
2419 Mutex::Locker l(backoff_lock);
2420 assert(b->lock.is_locked_by_me());
2421 assert(b->pg == this);
2422 auto p = backoffs.find(b->begin);
2423 // may race with release_backoffs()
2424 if (p != backoffs.end()) {
2425 auto q = p->second.find(b);
2426 if (q != p->second.end()) {
2427 p->second.erase(q);
2428 if (p->second.empty()) {
2429 backoffs.erase(p);
2430 }
2431 }
2432 }
2433 }
2434
2435 void PG::clear_recovery_state()
2436 {
2437 dout(10) << "clear_recovery_state" << dendl;
2438
2439 pg_log.reset_recovery_pointers();
2440 finish_sync_event = 0;
2441
2442 hobject_t soid;
2443 while (recovery_ops_active > 0) {
2444 #ifdef DEBUG_RECOVERY_OIDS
2445 soid = *recovering_oids.begin();
2446 #endif
2447 finish_recovery_op(soid, true);
2448 }
2449
2450 backfill_targets.clear();
2451 backfill_info.clear();
2452 peer_backfill_info.clear();
2453 waiting_on_backfill.clear();
2454 _clear_recovery_state(); // pg impl specific hook
2455 }
2456
2457 void PG::cancel_recovery()
2458 {
2459 dout(10) << "cancel_recovery" << dendl;
2460 clear_recovery_state();
2461 }
2462
2463
2464 void PG::purge_strays()
2465 {
2466 dout(10) << "purge_strays " << stray_set << dendl;
2467
2468 bool removed = false;
2469 for (set<pg_shard_t>::iterator p = stray_set.begin();
2470 p != stray_set.end();
2471 ++p) {
2472 assert(!is_actingbackfill(*p));
2473 if (get_osdmap()->is_up(p->osd)) {
2474 dout(10) << "sending PGRemove to osd." << *p << dendl;
2475 vector<spg_t> to_remove;
2476 to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2477 MOSDPGRemove *m = new MOSDPGRemove(
2478 get_osdmap()->get_epoch(),
2479 to_remove);
2480 osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2481 } else {
2482 dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2483 }
2484 peer_missing.erase(*p);
2485 peer_info.erase(*p);
2486 peer_purged.insert(*p);
2487 removed = true;
2488 }
2489
2490 // if we removed anyone, update peers (which includes peer_info)
2491 if (removed)
2492 update_heartbeat_peers();
2493
2494 stray_set.clear();
2495
2496 // clear _requested maps; we may have to peer() again if we discover
2497 // (more) stray content
2498 peer_log_requested.clear();
2499 peer_missing_requested.clear();
2500 }
2501
2502 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2503 {
2504 Mutex::Locker l(heartbeat_peer_lock);
2505 probe_targets.clear();
2506 for (set<pg_shard_t>::iterator i = probe_set.begin();
2507 i != probe_set.end();
2508 ++i) {
2509 probe_targets.insert(i->osd);
2510 }
2511 }
2512
2513 void PG::clear_probe_targets()
2514 {
2515 Mutex::Locker l(heartbeat_peer_lock);
2516 probe_targets.clear();
2517 }
2518
2519 void PG::update_heartbeat_peers()
2520 {
2521 assert(is_locked());
2522
2523 if (!is_primary())
2524 return;
2525
2526 set<int> new_peers;
2527 for (unsigned i=0; i<acting.size(); i++) {
2528 if (acting[i] != CRUSH_ITEM_NONE)
2529 new_peers.insert(acting[i]);
2530 }
2531 for (unsigned i=0; i<up.size(); i++) {
2532 if (up[i] != CRUSH_ITEM_NONE)
2533 new_peers.insert(up[i]);
2534 }
2535 for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2536 p != peer_info.end();
2537 ++p)
2538 new_peers.insert(p->first.osd);
2539
2540 bool need_update = false;
2541 heartbeat_peer_lock.Lock();
2542 if (new_peers == heartbeat_peers) {
2543 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2544 } else {
2545 dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2546 heartbeat_peers.swap(new_peers);
2547 need_update = true;
2548 }
2549 heartbeat_peer_lock.Unlock();
2550
2551 if (need_update)
2552 osd->need_heartbeat_peer_update();
2553 }
2554
2555
2556 bool PG::check_in_progress_op(
2557 const osd_reqid_t &r,
2558 eversion_t *version,
2559 version_t *user_version,
2560 int *return_code) const
2561 {
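// a request counts as in progress if it can be found either in the
// projected (not yet persisted) log or in the persisted pg log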
2562 return (
2563 projected_log.get_request(r, version, user_version, return_code) ||
2564 pg_log.get_log().get_request(r, version, user_version, return_code));
2565 }
2566
2567 void PG::_update_calc_stats()
2568 {
2569 info.stats.version = info.last_update;
2570 info.stats.created = info.history.epoch_created;
2571 info.stats.last_scrub = info.history.last_scrub;
2572 info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2573 info.stats.last_deep_scrub = info.history.last_deep_scrub;
2574 info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2575 info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2576 info.stats.last_epoch_clean = info.history.last_epoch_clean;
2577
2578 info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2579 info.stats.ondisk_log_size = info.stats.log_size;
2580 info.stats.log_start = pg_log.get_tail();
2581 info.stats.ondisk_log_start = pg_log.get_tail();
2582 info.stats.snaptrimq_len = snap_trimq.size();
2583
2584 unsigned num_shards = get_osdmap()->get_pg_size(info.pgid.pgid);
2585
2586 // In the rare case that upset is too large (usually transient), use it as
2587 // the target for calculations below.
2588 unsigned target = std::max(num_shards, (unsigned)upset.size());
2589 // Not sure this could ever happen (actingset > upset), and it only
2590 // matters if actingset > num_shards.
2591 unsigned nrep = std::max(actingset.size(), upset.size());
2592 // calc num_object_copies
2593 info.stats.stats.calc_copies(MAX(target, nrep));
2594 info.stats.stats.sum.num_objects_degraded = 0;
2595 info.stats.stats.sum.num_objects_unfound = 0;
2596 info.stats.stats.sum.num_objects_misplaced = 0;
2597 if ((is_remapped() || is_undersized() || !is_clean()) && (is_peered() || is_activating())) {
2598 dout(20) << __func__ << " actingset " << actingset << " upset "
2599 << upset << " actingbackfill " << actingbackfill << dendl;
2600 dout(20) << __func__ << " acting " << acting << " up " << up << dendl;
2601
2602 assert(!actingbackfill.empty());
2603
2604 // NOTE: we only generate degraded, misplaced and unfound
2605 // values for the summation, not individual stat categories.
2606 int64_t num_objects = info.stats.stats.sum.num_objects;
2607
2608 // Objects missing from up nodes, sorted by # objects.
2609 boost::container::flat_set<pair<int64_t,pg_shard_t>> missing_target_objects;
2610 // Objects missing from nodes not in up, sorted by # objects
2611 boost::container::flat_set<pair<int64_t,pg_shard_t>> acting_source_objects;
2612
2613 int64_t missing;
2614
2615 // Primary first
2616 missing = pg_log.get_missing().num_missing();
2617 assert(actingbackfill.count(pg_whoami));
2618 if (upset.count(pg_whoami)) {
2619 missing_target_objects.insert(make_pair(missing, pg_whoami));
2620 } else {
2621 acting_source_objects.insert(make_pair(missing, pg_whoami));
2622 }
2623 info.stats.stats.sum.num_objects_missing_on_primary = missing;
2624
2625 // All other peers
2626 for (auto& peer : peer_info) {
2627 // Ignore other peers until we add code to look at detailed missing
2628 // information. (recovery)
2629 if (!actingbackfill.count(peer.first)) {
2630 continue;
2631 }
2632 missing = 0;
2633 // Backfill targets always track num_objects accurately;
2634 // all other peers track missing accurately.
2635 if (is_backfill_targets(peer.first)) {
2636 missing = std::max((int64_t)0, num_objects - peer.second.stats.stats.sum.num_objects);
2637 } else {
2638 if (peer_missing.count(peer.first)) {
2639 missing = peer_missing[peer.first].num_missing();
2640 } else {
2641 dout(20) << __func__ << " no peer_missing found for " << peer.first << dendl;
2642 }
2643 }
2644 if (upset.count(peer.first)) {
2645 missing_target_objects.insert(make_pair(missing, peer.first));
2646 } else {
2647 acting_source_objects.insert(make_pair(missing, peer.first));
2648 }
2649 peer.second.stats.stats.sum.num_objects_missing = missing;
2650 }
2651
2652 if (pool.info.is_replicated()) {
2653 // Add to missing_target_objects up to target elements (num_objects missing)
2654 assert(target >= missing_target_objects.size());
2655 unsigned needed = target - missing_target_objects.size();
2656 for (; needed; --needed)
2657 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD)));
2658 } else {
2659 for (unsigned i = 0 ; i < num_shards; ++i) {
2660 shard_id_t shard(i);
2661 bool found = false;
2662 for (const auto& t : missing_target_objects) {
2663 if (std::get<1>(t).shard == shard) {
2664 found = true;
2665 break;
2666 }
2667 }
2668 if (!found)
2669 missing_target_objects.insert(make_pair(num_objects, pg_shard_t(pg_shard_t::NO_OSD,shard)));
2670 }
2671 }
2672
2673 for (const auto& item : missing_target_objects)
2674 dout(20) << __func__ << " missing shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2675 for (const auto& item : acting_source_objects)
2676 dout(20) << __func__ << " acting shard " << std::get<1>(item) << " missing= " << std::get<0>(item) << dendl;
2677
2678 // A misplaced object is not stored on the correct OSD
2679 int64_t misplaced = 0;
2680 // A degraded object has fewer replicas or EC shards than the pool specifies.
2681 int64_t degraded = 0;
2682
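// pair each target shard (most missing first) with an acting source shard:
// the fewest-missing remaining source for replicated pools, the matching
// shard for EC pools. Objects the source is also missing count as degraded;
// objects present on the source but missing from the target count as
// misplaced.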
2683 for (auto m = missing_target_objects.rbegin();
2684 m != missing_target_objects.rend(); ++m) {
2685
2686 int64_t extra_missing = -1;
2687
2688 if (pool.info.is_replicated()) {
2689 if (!acting_source_objects.empty()) {
2690 auto extra_copy = acting_source_objects.begin();
2691 extra_missing = std::get<0>(*extra_copy);
2692 acting_source_objects.erase(extra_copy);
2693 }
2694 } else { // Erasure coded
2695 // Use corresponding shard
2696 for (const auto& a : acting_source_objects) {
2697 if (std::get<1>(a).shard == std::get<1>(*m).shard) {
2698 extra_missing = std::get<0>(a);
2699 acting_source_objects.erase(a);
2700 break;
2701 }
2702 }
2703 }
2704
2705 if (extra_missing >= 0 && std::get<0>(*m) >= extra_missing) {
2706 // We don't know which of the objects on the target
2707 // are part of extra_missing, so assume they are all degraded.
2708 misplaced += std::get<0>(*m) - extra_missing;
2709 degraded += extra_missing;
2710 } else {
2711 // 1. extra_missing == -1, more targets than sources so degraded
2712 // 2. extra_missing > std::get<0>(*m), so we know that some of the
2713 // previously degraded (extra_missing) objects are now present on the target.
2714 degraded += std::get<0>(*m);
2715 }
2716 }
2717 // If there are still acting shards that haven't been accounted for,
2718 // their objects are misplaced
2719 for (const auto& a : acting_source_objects) {
2720 int64_t extra_misplaced = std::max((int64_t)0, num_objects - std::get<0>(a));
2721 dout(20) << __func__ << " extra acting misplaced " << extra_misplaced << dendl;
2722 misplaced += extra_misplaced;
2723 }
2724 dout(20) << __func__ << " degraded " << degraded << dendl;
2725 dout(20) << __func__ << " misplaced " << misplaced << dendl;
2726
2727 info.stats.stats.sum.num_objects_degraded = degraded;
2728 info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2729 info.stats.stats.sum.num_objects_misplaced = misplaced;
2730 }
2731 }
2732
2733 void PG::_update_blocked_by()
2734 {
2735 // set a max on the number of blocking peers we report. if we go
2736 // over, report a random subset. keep the result sorted.
2737 unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2738 unsigned skip = blocked_by.size() - keep;
2739 info.stats.blocked_by.clear();
2740 info.stats.blocked_by.resize(keep);
2741 unsigned pos = 0;
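// keep each remaining candidate with probability keep/(skip+keep)
// (selection sampling), which yields a uniform random subset while
// preserving the sorted order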
2742 for (set<int>::iterator p = blocked_by.begin();
2743 p != blocked_by.end() && keep > 0;
2744 ++p) {
2745 if (skip > 0 && (rand() % (skip + keep) < skip)) {
2746 --skip;
2747 } else {
2748 info.stats.blocked_by[pos++] = *p;
2749 --keep;
2750 }
2751 }
2752 }
2753
2754 void PG::publish_stats_to_osd()
2755 {
2756 if (!is_primary())
2757 return;
2758
2759 pg_stats_publish_lock.Lock();
2760
2761 if (info.stats.stats.sum.num_scrub_errors)
2762 state_set(PG_STATE_INCONSISTENT);
2763 else
2764 state_clear(PG_STATE_INCONSISTENT);
2765
2766 utime_t now = ceph_clock_now();
2767 if (info.stats.state != state) {
2768 info.stats.last_change = now;
2769 // Optimistic estimation: if we just found out a PG is inactive,
2770 // assume it was active until now.
2771 if (!(state & PG_STATE_ACTIVE) &&
2772 (info.stats.state & PG_STATE_ACTIVE))
2773 info.stats.last_active = now;
2774
2775 if ((state & PG_STATE_ACTIVE) &&
2776 !(info.stats.state & PG_STATE_ACTIVE))
2777 info.stats.last_became_active = now;
2778 if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2779 !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2780 info.stats.last_became_peered = now;
2781 if (!(state & PG_STATE_CREATING) &&
2782 (info.stats.state & PG_STATE_CREATING)) {
2783 osd->send_pg_created(get_pgid().pgid);
2784 }
2785 info.stats.state = state;
2786 }
2787
2788 _update_calc_stats();
2789 if (info.stats.stats.sum.num_objects_degraded) {
2790 state_set(PG_STATE_DEGRADED);
2791 } else {
2792 state_clear(PG_STATE_DEGRADED);
2793 }
2794 _update_blocked_by();
2795
2796 bool publish = false;
2797 pg_stat_t pre_publish = info.stats;
2798 pre_publish.stats.add(unstable_stats);
2799 utime_t cutoff = now;
2800 cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2801 if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2802 info.stats.last_fresh > cutoff) {
2803 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2804 << ": no change since " << info.stats.last_fresh << dendl;
2805 } else {
2806 // update our stat summary and timestamps
2807 info.stats.reported_epoch = get_osdmap()->get_epoch();
2808 ++info.stats.reported_seq;
2809
2810 info.stats.last_fresh = now;
2811
2812 if (info.stats.state & PG_STATE_CLEAN)
2813 info.stats.last_clean = now;
2814 if (info.stats.state & PG_STATE_ACTIVE)
2815 info.stats.last_active = now;
2816 if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2817 info.stats.last_peered = now;
2818 info.stats.last_unstale = now;
2819 if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2820 info.stats.last_undegraded = now;
2821 if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2822 info.stats.last_fullsized = now;
2823
2824 // do not send pgstat to mon anymore once we are luminous, since mgr takes
2825 // care of this by sending MMonMgrReport to mon.
2826 publish =
2827 osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2828 pg_stats_publish_valid = true;
2829 pg_stats_publish = pre_publish;
2830
2831 dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2832 << ":" << pg_stats_publish.reported_seq << dendl;
2833 }
2834 pg_stats_publish_lock.Unlock();
2835
2836 if (publish)
2837 osd->pg_stat_queue_enqueue(this);
2838 }
2839
2840 void PG::clear_publish_stats()
2841 {
2842 dout(15) << "clear_stats" << dendl;
2843 pg_stats_publish_lock.Lock();
2844 pg_stats_publish_valid = false;
2845 pg_stats_publish_lock.Unlock();
2846
2847 osd->pg_stat_queue_dequeue(this);
2848 }
2849
2850 /**
2851 * initialize a newly instantiated pg
2852 *
2853 * Initialize PG state, as when a PG is initially created, or when it
2854 * is first instantiated on the current node.
2855 *
2856 * @param role our role/rank
2857 * @param newup up set
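 * @param new_up_primary up primary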
2858 * @param newacting acting set
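 * @param new_acting_primary acting primary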
2859 * @param history pg history
2860 * @param pi past_intervals
2861 * @param backfill true if info should be marked as backfill
2862 * @param t transaction to write out our new state in
2863 */
2864 void PG::init(
2865 int role,
2866 const vector<int>& newup, int new_up_primary,
2867 const vector<int>& newacting, int new_acting_primary,
2868 const pg_history_t& history,
2869 const PastIntervals& pi,
2870 bool backfill,
2871 ObjectStore::Transaction *t)
2872 {
2873 dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2874 << " history " << history
2875 << " past_intervals " << pi
2876 << dendl;
2877
2878 set_role(role);
2879 acting = newacting;
2880 up = newup;
2881 init_primary_up_acting(
2882 newup,
2883 newacting,
2884 new_up_primary,
2885 new_acting_primary);
2886
2887 info.history = history;
2888 past_intervals = pi;
2889
2890 info.stats.up = up;
2891 info.stats.up_primary = new_up_primary;
2892 info.stats.acting = acting;
2893 info.stats.acting_primary = new_acting_primary;
2894 info.stats.mapping_epoch = info.history.same_interval_since;
2895
2896 if (backfill) {
2897 dout(10) << __func__ << ": Setting backfill" << dendl;
2898 info.set_last_backfill(hobject_t());
2899 info.last_complete = info.last_update;
2900 pg_log.mark_log_for_rewrite();
2901 }
2902
2903 on_new_interval();
2904
2905 dirty_info = true;
2906 dirty_big_info = true;
2907 write_if_dirty(*t);
2908 }
2909
2910 #pragma GCC diagnostic ignored "-Wpragmas"
2911 #pragma GCC diagnostic push
2912 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2913
2914 void PG::upgrade(ObjectStore *store)
2915 {
2916 assert(info_struct_v <= 10);
2917 ObjectStore::Transaction t;
2918
2919 assert(info_struct_v >= 7);
2920
2921 // 7 -> 8
2922 if (info_struct_v <= 7) {
2923 pg_log.mark_log_for_rewrite();
2924 ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2925 ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2926 t.remove(coll_t::meta(), log_oid);
2927 t.remove(coll_t::meta(), biginfo_oid);
2928 t.touch(coll, pgmeta_oid);
2929 }
2930
2931 // 8 -> 9
2932 if (info_struct_v <= 8) {
2933 // no special action needed.
2934 }
2935
2936 // 9 -> 10
2937 if (info_struct_v <= 9) {
2938 // previous versions weren't (as) aggressively clearing past_intervals
2939 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2940 dout(20) << __func__ << " clearing past_intervals" << dendl;
2941 past_intervals.clear();
2942 }
2943 }
2944
2945 // update infover_key
2946 if (info_struct_v < cur_struct_v) {
2947 map<string,bufferlist> v;
2948 __u8 ver = cur_struct_v;
2949 ::encode(ver, v[infover_key]);
2950 t.omap_setkeys(coll, pgmeta_oid, v);
2951 }
2952
2953 dirty_info = true;
2954 dirty_big_info = true;
2955 write_if_dirty(t);
2956
2957 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2958 ObjectStore::Sequencer>("upgrade"));
2959 int r = store->apply_transaction(osr.get(), std::move(t));
2960 if (r != 0) {
2961 derr << __func__ << ": apply_transaction returned "
2962 << cpp_strerror(r) << dendl;
2963 ceph_abort();
2964 }
2965 assert(r == 0);
2966
2967 C_SaferCond waiter;
2968 if (!osr->flush_commit(&waiter)) {
2969 waiter.wait();
2970 }
2971 }
2972
2973 #pragma GCC diagnostic pop
2974 #pragma GCC diagnostic warning "-Wpragmas"
2975
2976 int PG::_prepare_write_info(CephContext* cct,
2977 map<string,bufferlist> *km,
2978 epoch_t epoch,
2979 pg_info_t &info, pg_info_t &last_written_info,
2980 PastIntervals &past_intervals,
2981 bool dirty_big_info,
2982 bool dirty_epoch,
2983 bool try_fast_info,
2984 PerfCounters *logger)
2985 {
2986 if (dirty_epoch) {
2987 ::encode(epoch, (*km)[epoch_key]);
2988 }
2989
2990 if (logger)
2991 logger->inc(l_osd_pg_info);
2992
2993 // try to do info efficiently?
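// if nothing "big" changed and last_update moved forward, persist only the
// small pg_fast_info_t delta instead of re-encoding the full pg_info_t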
2994 if (!dirty_big_info && try_fast_info &&
2995 info.last_update > last_written_info.last_update) {
2996 pg_fast_info_t fast;
2997 fast.populate_from(info);
2998 bool did = fast.try_apply_to(&last_written_info);
2999 assert(did); // we verified last_update increased above
3000 if (info == last_written_info) {
3001 ::encode(fast, (*km)[fastinfo_key]);
3002 if (logger)
3003 logger->inc(l_osd_pg_fastinfo);
3004 return 0;
3005 }
3006 generic_dout(30) << __func__ << " fastinfo failed, info:\n";
3007 {
3008 JSONFormatter jf(true);
3009 jf.dump_object("info", info);
3010 jf.flush(*_dout);
3011 }
3012 {
3013 *_dout << "\nlast_written_info:\n";
3014 JSONFormatter jf(true);
3015 jf.dump_object("last_written_info", last_written_info);
3016 jf.flush(*_dout);
3017 }
3018 *_dout << dendl;
3019 }
3020 last_written_info = info;
3021
3022 // info. store purged_snaps separately.
3023 interval_set<snapid_t> purged_snaps;
3024 purged_snaps.swap(info.purged_snaps);
3025 ::encode(info, (*km)[info_key]);
3026 purged_snaps.swap(info.purged_snaps);
3027
3028 if (dirty_big_info) {
3029 // potentially big stuff
3030 bufferlist& bigbl = (*km)[biginfo_key];
3031 ::encode(past_intervals, bigbl);
3032 ::encode(info.purged_snaps, bigbl);
3033 //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
3034 if (logger)
3035 logger->inc(l_osd_pg_biginfo);
3036 }
3037
3038 return 0;
3039 }
3040
3041 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
3042 {
3043 coll_t coll(pgid);
3044 t.create_collection(coll, bits);
3045 }
3046
3047 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
3048 {
3049 coll_t coll(pgid);
3050
3051 if (pool) {
3052 // Give a hint to the PG collection
3053 bufferlist hint;
3054 uint32_t pg_num = pool->get_pg_num();
3055 uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
3056 ::encode(pg_num, hint);
3057 ::encode(expected_num_objects_pg, hint);
3058 uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
3059 t.collection_hint(coll, hint_type, hint);
3060 }
3061
3062 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3063 t.touch(coll, pgmeta_oid);
3064 map<string,bufferlist> values;
3065 __u8 struct_v = cur_struct_v;
3066 ::encode(struct_v, values[infover_key]);
3067 t.omap_setkeys(coll, pgmeta_oid, values);
3068 }
3069
3070 void PG::prepare_write_info(map<string,bufferlist> *km)
3071 {
3072 info.stats.stats.add(unstable_stats);
3073 unstable_stats.clear();
3074
3075 bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3076 int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3077 info,
3078 last_written_info,
3079 past_intervals,
3080 dirty_big_info, need_update_epoch,
3081 cct->_conf->osd_fast_info,
3082 osd->logger);
3083 assert(ret == 0);
3084 if (need_update_epoch)
3085 last_epoch = get_osdmap()->get_epoch();
3086 last_persisted_osdmap_ref = osdmap_ref;
3087
3088 dirty_info = false;
3089 dirty_big_info = false;
3090 }
3091
3092 #pragma GCC diagnostic ignored "-Wpragmas"
3093 #pragma GCC diagnostic push
3094 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3095
3096 bool PG::_has_removal_flag(ObjectStore *store,
3097 spg_t pgid)
3098 {
3099 coll_t coll(pgid);
3100 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3101
3102 // first try new way
3103 set<string> keys;
3104 keys.insert("_remove");
3105 map<string,bufferlist> values;
3106 if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3107 values.size() == 1)
3108 return true;
3109
3110 return false;
3111 }
3112
3113 int PG::peek_map_epoch(ObjectStore *store,
3114 spg_t pgid,
3115 epoch_t *pepoch,
3116 bufferlist *bl)
3117 {
3118 coll_t coll(pgid);
3119 ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3120 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3121 epoch_t cur_epoch = 0;
3122
3123 assert(bl);
3124 {
3125 // validate collection name
3126 assert(coll.is_pg());
3127 }
3128
3129 // try for v8
3130 set<string> keys;
3131 keys.insert(infover_key);
3132 keys.insert(epoch_key);
3133 map<string,bufferlist> values;
3134 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3135 if (r == 0) {
3136 assert(values.size() == 2);
3137
3138 // sanity check version
3139 bufferlist::iterator bp = values[infover_key].begin();
3140 __u8 struct_v = 0;
3141 ::decode(struct_v, bp);
3142 assert(struct_v >= 8);
3143
3144 // get epoch
3145 bp = values[epoch_key].begin();
3146 ::decode(cur_epoch, bp);
3147 } else {
3148 // probably bug 10617; see OSD::load_pgs()
3149 return -1;
3150 }
3151
3152 *pepoch = cur_epoch;
3153 return 0;
3154 }
3155
3156 #pragma GCC diagnostic pop
3157 #pragma GCC diagnostic warning "-Wpragmas"
3158
3159 void PG::write_if_dirty(ObjectStore::Transaction& t)
3160 {
3161 map<string,bufferlist> km;
3162 if (dirty_big_info || dirty_info)
3163 prepare_write_info(&km);
3164 pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3165 if (!km.empty())
3166 t.omap_setkeys(coll, pgmeta_oid, km);
3167 }
3168
3169 void PG::trim_log()
3170 {
3171 assert(is_primary());
3172 calc_trim_to();
3173 dout(10) << __func__ << " to " << pg_trim_to << dendl;
3174 if (pg_trim_to != eversion_t()) {
3175 // inform peers to trim log
3176 assert(!actingbackfill.empty());
3177 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3178 i != actingbackfill.end();
3179 ++i) {
3180 if (*i == pg_whoami) continue;
3181 osd->send_message_osd_cluster(
3182 i->osd,
3183 new MOSDPGTrim(
3184 get_osdmap()->get_epoch(),
3185 spg_t(info.pgid.pgid, i->shard),
3186 pg_trim_to),
3187 get_osdmap()->get_epoch());
3188 }
3189
3190 // trim primary as well
3191 pg_log.trim(pg_trim_to, info);
3192 dirty_info = true;
3193 }
3194 }
3195
3196 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3197 {
3198 // raise last_complete only if we were previously up to date
3199 if (info.last_complete == info.last_update)
3200 info.last_complete = e.version;
3201
3202 // raise last_update.
3203 assert(e.version > info.last_update);
3204 info.last_update = e.version;
3205
3206 // raise user_version, if it increased (it may not have been bumped
3207 // by all logged updates)
3208 if (e.user_version > info.last_user_version)
3209 info.last_user_version = e.user_version;
3210
3211 // log mutation
3212 pg_log.add(e, applied);
3213 dout(10) << "add_log_entry " << e << dendl;
3214 }
3215
3216
3217 void PG::append_log(
3218 const vector<pg_log_entry_t>& logv,
3219 eversion_t trim_to,
3220 eversion_t roll_forward_to,
3221 ObjectStore::Transaction &t,
3222 bool transaction_applied)
3223 {
3224 if (transaction_applied)
3225 update_snap_map(logv, t);
3226
3227 /* The primary has sent an info updating the history, but it may not
3228 * have arrived yet. We want to make sure that we cannot remember this
3229 * write without remembering that it happened in an interval which went
3230 * active in epoch history.last_epoch_started.
3231 */
3232 if (info.last_epoch_started != info.history.last_epoch_started) {
3233 info.history.last_epoch_started = info.last_epoch_started;
3234 }
3235 if (info.last_interval_started != info.history.last_interval_started) {
3236 info.history.last_interval_started = info.last_interval_started;
3237 }
3238 dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3239
3240 PGLogEntryHandler handler{this, &t};
3241 if (!transaction_applied) {
3242 /* We must be a backfill peer, so it's ok if we apply
3243 * out-of-turn since we won't be considered when
3244 * determining a min possible last_update.
3245 */
3246 pg_log.roll_forward(&handler);
3247 }
3248
3249 for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3250 p != logv.end();
3251 ++p) {
3252 add_log_entry(*p, transaction_applied);
3253
3254 /* We don't want to leave the rollforward artifacts around
3255 * here past last_backfill. It's ok for the same reason as
3256 * above */
3257 if (transaction_applied &&
3258 p->soid > info.last_backfill) {
3259 pg_log.roll_forward(&handler);
3260 }
3261 }
3262 auto last = logv.rbegin();
3263 if (is_primary() && last != logv.rend()) {
3264 projected_log.skip_can_rollback_to_to_head();
3265 projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3266 }
3267
3268 if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3269 pg_log.roll_forward_to(
3270 roll_forward_to,
3271 &handler);
3272 t.register_on_applied(
3273 new C_UpdateLastRollbackInfoTrimmedToApplied(
3274 this,
3275 get_osdmap()->get_epoch(),
3276 roll_forward_to));
3277 }
3278
3279 pg_log.trim(trim_to, info);
3280
3281 // update the local pg, pg log
3282 dirty_info = true;
3283 write_if_dirty(t);
3284 }
3285
3286 bool PG::check_log_for_corruption(ObjectStore *store)
3287 {
3288 /// TODO: this method needs to work with the omap log
3289 return true;
3290 }
3291
3292 //! Get the name we're going to save our corrupt pg log as
3293 std::string PG::get_corrupt_pg_log_name() const
3294 {
3295 const int MAX_BUF = 512;
3296 char buf[MAX_BUF];
3297 struct tm tm_buf;
3298 time_t my_time(time(NULL));
3299 const struct tm *t = localtime_r(&my_time, &tm_buf);
3300 int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3301 if (ret == 0) {
3302 dout(0) << "strftime failed" << dendl;
3303 return "corrupt_log_unknown_time";
3304 }
3305 string out(buf);
3306 out += stringify(info.pgid);
3307 return out;
3308 }
3309
3310 int PG::read_info(
3311 ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3312 pg_info_t &info, PastIntervals &past_intervals,
3313 __u8 &struct_v)
3314 {
3315 // try for v8 or later
3316 set<string> keys;
3317 keys.insert(infover_key);
3318 keys.insert(info_key);
3319 keys.insert(biginfo_key);
3320 keys.insert(fastinfo_key);
3321 ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3322 map<string,bufferlist> values;
3323 int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3324 if (r == 0) {
3325 assert(values.size() == 3 ||
3326 values.size() == 4);
3327
3328 bufferlist::iterator p = values[infover_key].begin();
3329 ::decode(struct_v, p);
3330 assert(struct_v >= 8);
3331
3332 p = values[info_key].begin();
3333 ::decode(info, p);
3334
3335 p = values[biginfo_key].begin();
3336 if (struct_v >= 10) {
3337 ::decode(past_intervals, p);
3338 } else {
3339 past_intervals.decode_classic(p);
3340 }
3341 ::decode(info.purged_snaps, p);
3342
3343 p = values[fastinfo_key].begin();
3344 if (!p.end()) {
3345 pg_fast_info_t fast;
3346 ::decode(fast, p);
3347 fast.try_apply_to(&info);
3348 }
3349 return 0;
3350 }
3351
3352 // legacy (ver < 8)
3353 ghobject_t infos_oid(OSD::make_infos_oid());
3354 bufferlist::iterator p = bl.begin();
3355 ::decode(struct_v, p);
3356 assert(struct_v == 7);
3357
3358 // get info out of leveldb
3359 string k = get_info_key(info.pgid);
3360 string bk = get_biginfo_key(info.pgid);
3361 keys.clear();
3362 keys.insert(k);
3363 keys.insert(bk);
3364 values.clear();
3365 store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3366 assert(values.size() == 2);
3367
3368 p = values[k].begin();
3369 ::decode(info, p);
3370
3371 p = values[bk].begin();
3372 ::decode(past_intervals, p);
3373 interval_set<snapid_t> snap_collections; // obsolete
3374 ::decode(snap_collections, p);
3375 ::decode(info.purged_snaps, p);
3376 return 0;
3377 }
3378
3379 void PG::read_state(ObjectStore *store, bufferlist &bl)
3380 {
3381 int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3382 info_struct_v);
3383 assert(r >= 0);
3384
3385 last_written_info = info;
3386
3387 // if we are upgrading from jewel, we need to force rebuild of
3388 // missing set. v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3389 // (before kraken). persisted missing set was circa
3390 // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3391 // v8 was pre-jewel (per-pg meta object).
3392 bool force_rebuild_missing = info_struct_v < 9;
3393 if (force_rebuild_missing) {
3394 dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3395 << dendl;
3396 }
3397
3398 ostringstream oss;
3399 pg_log.read_log_and_missing(
3400 store,
3401 coll,
3402 info_struct_v < 8 ? coll_t::meta() : coll,
3403 ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3404 info,
3405 force_rebuild_missing,
3406 oss,
3407 cct->_conf->osd_ignore_stale_divergent_priors,
3408 cct->_conf->osd_debug_verify_missing_on_start);
3409 if (oss.tellp())
3410 osd->clog->error() << oss.str();
3411
3412 if (force_rebuild_missing) {
3413 dout(10) << __func__ << " forced rebuild of missing got "
3414 << pg_log.get_missing()
3415 << dendl;
3416 }
3417
3418 // log any weirdness
3419 log_weirdness();
3420 }
3421
3422 void PG::log_weirdness()
3423 {
3424 if (pg_log.get_tail() != info.log_tail)
3425 osd->clog->error() << info.pgid
3426 << " info mismatch, log.tail " << pg_log.get_tail()
3427 << " != info.log_tail " << info.log_tail;
3428 if (pg_log.get_head() != info.last_update)
3429 osd->clog->error() << info.pgid
3430 << " info mismatch, log.head " << pg_log.get_head()
3431 << " != info.last_update " << info.last_update;
3432
3433 if (!pg_log.get_log().empty()) {
3434 // sloppy check
3435 if ((pg_log.get_log().log.begin()->version <= pg_log.get_tail()))
3436 osd->clog->error() << info.pgid
3437 << " log bound mismatch, info (tail,head] ("
3438 << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3439 << " actual ["
3440 << pg_log.get_log().log.begin()->version << ","
3441 << pg_log.get_log().log.rbegin()->version << "]";
3442 }
3443
3444 if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3445 osd->clog->error() << info.pgid
3446 << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3447 << " > log size " << pg_log.get_log().log.size();
3448 }
3449 }
3450
3451 void PG::update_snap_map(
3452 const vector<pg_log_entry_t> &log_entries,
3453 ObjectStore::Transaction &t)
3454 {
3455 for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3456 i != log_entries.end();
3457 ++i) {
3458 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3459 if (i->soid.snap < CEPH_MAXSNAP) {
3460 if (i->is_delete()) {
3461 int r = snap_mapper.remove_oid(
3462 i->soid,
3463 &_t);
3464 assert(r == 0);
3465 } else if (i->is_update()) {
3466 assert(i->snaps.length() > 0);
3467 vector<snapid_t> snaps;
3468 bufferlist snapbl = i->snaps;
3469 bufferlist::iterator p = snapbl.begin();
3470 try {
3471 ::decode(snaps, p);
3472 } catch (...) {
3473 derr << __func__ << " decode snaps failure on " << *i << dendl;
3474 snaps.clear();
3475 }
3476 set<snapid_t> _snaps(snaps.begin(), snaps.end());
3477
3478 if (i->is_clone() || i->is_promote()) {
3479 snap_mapper.add_oid(
3480 i->soid,
3481 _snaps,
3482 &_t);
3483 } else if (i->is_modify()) {
3484 assert(i->is_modify());
3485 int r = snap_mapper.update_snaps(
3486 i->soid,
3487 _snaps,
3488 0,
3489 &_t);
3490 assert(r == 0);
3491 } else {
3492 assert(i->is_clean());
3493 }
3494 }
3495 }
3496 }
3497 }
3498
3499 /**
3500 * filter trimming|trimmed snaps out of snapcontext
3501 */
3502 void PG::filter_snapc(vector<snapid_t> &snaps)
3503 {
3504 // nothing needs trimming; we can return immediately
3505 if(snap_trimq.empty() && info.purged_snaps.empty())
3506 return;
3507
3508 bool filtering = false;
3509 vector<snapid_t> newsnaps;
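// copy lazily: newsnaps is only built once the first snap that must be
// filtered out is encountered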
3510 for (vector<snapid_t>::iterator p = snaps.begin();
3511 p != snaps.end();
3512 ++p) {
3513 if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3514 if (!filtering) {
3515 // start building a new vector with what we've seen so far
3516 dout(10) << "filter_snapc filtering " << snaps << dendl;
3517 newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3518 filtering = true;
3519 }
3520 dout(20) << "filter_snapc removing trimq|purged snap " << *p << dendl;
3521 } else {
3522 if (filtering)
3523 newsnaps.push_back(*p); // continue building new vector
3524 }
3525 }
3526 if (filtering) {
3527 snaps.swap(newsnaps);
3528 dout(10) << "filter_snapc result " << snaps << dendl;
3529 }
3530 }
3531
3532 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3533 {
3534 for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3535 it != m.end();
3536 ++it)
3537 requeue_ops(it->second);
3538 m.clear();
3539 }
3540
3541 void PG::requeue_op(OpRequestRef op)
3542 {
3543 auto p = waiting_for_map.find(op->get_source());
3544 if (p != waiting_for_map.end()) {
3545 dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3546 << dendl;
3547 p->second.push_front(op);
3548 } else {
3549 dout(20) << __func__ << " " << op << dendl;
3550 osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3551 }
3552 }
3553
3554 void PG::requeue_ops(list<OpRequestRef> &ls)
3555 {
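// walk the list in reverse so that repeated push_front/enqueue_front
// restores the original op ordering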
3556 for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3557 i != ls.rend();
3558 ++i) {
3559 auto p = waiting_for_map.find((*i)->get_source());
3560 if (p != waiting_for_map.end()) {
3561 dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3562 << ")" << dendl;
3563 p->second.push_front(*i);
3564 } else {
3565 dout(20) << __func__ << " " << *i << dendl;
3566 osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3567 }
3568 }
3569 ls.clear();
3570 }
3571
3572 void PG::requeue_map_waiters()
3573 {
3574 epoch_t epoch = get_osdmap()->get_epoch();
3575 auto p = waiting_for_map.begin();
3576 while (p != waiting_for_map.end()) {
3577 if (epoch < p->second.front()->min_epoch) {
3578 dout(20) << __func__ << " " << p->first << " front op "
3579 << p->second.front() << " must still wait, doing nothing"
3580 << dendl;
3581 ++p;
3582 } else {
3583 dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3584 for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3585 osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3586 }
3587 p = waiting_for_map.erase(p);
3588 }
3589 }
3590 }
3591
3592
3593 // ==========================================================================================
3594 // SCRUB
3595
3596 /*
3597 * when holding the pg lock and sched_scrub_lock, the states are:
3598 * scheduling:
3599 * scrubber.reserved = true
3600 * scrubber.reserved_peers includes whoami
3601 * osd->scrub_pending++
3602 * scheduling, replica declined:
3603 * scrubber.reserved = true
3604 * scrubber.reserved_peers includes -1
3605 * osd->scrub_pending++
3606 * pending:
3607 * scrubber.reserved = true
3608 * scrubber.reserved_peers.size() == acting.size();
3609 * pg on scrub_wq
3610 * osd->scrub_pending++
3611 * scrubbing:
3612 * scrubber.reserved = false;
3613 * scrubber.reserved_peers empty
3614 * osd->scrubber.active++
3615 */
3616
3617 // returns true if a scrub has been newly kicked off
3618 bool PG::sched_scrub()
3619 {
3620 bool nodeep_scrub = false;
3621 assert(is_locked());
3622 if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3623 return false;
3624 }
3625
3626 double deep_scrub_interval = 0;
3627 pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3628 if (deep_scrub_interval <= 0) {
3629 deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3630 }
3631 bool time_for_deep = ceph_clock_now() >=
3632 info.history.last_deep_scrub_stamp + deep_scrub_interval;
3633
3634 bool deep_coin_flip = false;
3635 // Only add random deep scrubs when NOT a user-initiated scrub
3636 if (!scrubber.must_scrub)
3637 deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3638 dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3639
3640 time_for_deep = (time_for_deep || deep_coin_flip);
3641
3642 // NODEEP_SCRUB is set, so ignore time-initiated deep scrubs
3643 if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3644 pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3645 time_for_deep = false;
3646 nodeep_scrub = true;
3647 }
3648
3649 if (!scrubber.must_scrub) {
3650 assert(!scrubber.must_deep_scrub);
3651
3652 // NOSCRUB is set, so skip regular scrubs
3653 if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3654 pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3655 if (scrubber.reserved) {
3656 // cancel the scrub if it is still being scheduled, so that pgs from
3657 // other pools where scrubs are still legal have a chance to go ahead
3658 // with scrubbing.
3659 clear_scrub_reserved();
3660 scrub_unreserve_replicas();
3661 }
3662 return false;
3663 }
3664 }
3665
3666 if (cct->_conf->osd_scrub_auto_repair
3667 && get_pgbackend()->auto_repair_supported()
3668 && time_for_deep
3669 // respect the user's command and do not auto-repair
3670 && !scrubber.must_repair
3671 && !scrubber.must_scrub
3672 && !scrubber.must_deep_scrub) {
3673 dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3674 scrubber.auto_repair = true;
3675 } else {
3676 // this happens when the user issues a scrub/repair command while the
3677 // scrub/repair is still being scheduled (e.g. requesting reservations)
3678 scrubber.auto_repair = false;
3679 }
3680
3681 bool ret = true;
3682 if (!scrubber.reserved) {
3683 assert(scrubber.reserved_peers.empty());
3684 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3685 osd->inc_scrubs_pending()) {
3686 dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3687 scrubber.reserved = true;
3688 scrubber.reserved_peers.insert(pg_whoami);
3689 scrub_reserve_replicas();
3690 } else {
3691 dout(20) << __func__ << ": failed to reserve locally" << dendl;
3692 ret = false;
3693 }
3694 }
3695 if (scrubber.reserved) {
3696 if (scrubber.reserve_failed) {
3697 dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3698 clear_scrub_reserved();
3699 scrub_unreserve_replicas();
3700 ret = false;
3701 } else if (scrubber.reserved_peers.size() == acting.size()) {
3702 dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3703 if (time_for_deep) {
3704 dout(10) << "sched_scrub: scrub will be deep" << dendl;
3705 state_set(PG_STATE_DEEP_SCRUB);
3706 } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3707 if (!nodeep_scrub) {
3708 osd->clog->info() << "osd." << osd->whoami
3709 << " pg " << info.pgid
3710 << " Deep scrub errors, upgrading scrub to deep-scrub";
3711 state_set(PG_STATE_DEEP_SCRUB);
3712 } else if (!scrubber.must_scrub) {
3713 osd->clog->error() << "osd." << osd->whoami
3714 << " pg " << info.pgid
3715 << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3716 clear_scrub_reserved();
3717 scrub_unreserve_replicas();
3718 return false;
3719 } else {
3720 osd->clog->error() << "osd." << osd->whoami
3721 << " pg " << info.pgid
3722 << " Regular scrub request, deep-scrub details will be lost";
3723 }
3724 }
3725 queue_scrub();
3726 } else {
3727 // none declined, since scrubber.reserved is set
3728 dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3729 }
3730 }
3731
3732 return ret;
3733 }
3734
3735 void PG::reg_next_scrub()
3736 {
3737 if (!is_primary())
3738 return;
3739
3740 utime_t reg_stamp;
3741 if (scrubber.must_scrub ||
3742 (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3743 reg_stamp = ceph_clock_now();
3744 } else {
3745 reg_stamp = info.history.last_scrub_stamp;
3746 }
3747 // note down the sched_time, so we can locate this scrub, and remove it
3748 // later on.
3749 double scrub_min_interval = 0, scrub_max_interval = 0;
3750 pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3751 pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3752 assert(scrubber.scrub_reg_stamp == utime_t());
3753 scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3754 reg_stamp,
3755 scrub_min_interval,
3756 scrub_max_interval,
3757 scrubber.must_scrub);
3758 }
3759
3760 void PG::unreg_next_scrub()
3761 {
3762 if (is_primary()) {
3763 osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3764 scrubber.scrub_reg_stamp = utime_t();
3765 }
3766 }
3767
3768 void PG::do_replica_scrub_map(OpRequestRef op)
3769 {
3770 const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3771 dout(7) << __func__ << " " << *m << dendl;
3772 if (m->map_epoch < info.history.same_interval_since) {
3773 dout(10) << __func__ << " discarding old from "
3774 << m->map_epoch << " < " << info.history.same_interval_since
3775 << dendl;
3776 return;
3777 }
3778 if (!scrubber.is_chunky_scrub_active()) {
3779 dout(10) << __func__ << " scrub isn't active" << dendl;
3780 return;
3781 }
3782
3783 op->mark_started();
3784
3785 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3786 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3787 dout(10) << "map version is "
3788 << scrubber.received_maps[m->from].valid_through
3789 << dendl;
3790
3791 --scrubber.waiting_on;
3792 scrubber.waiting_on_whom.erase(m->from);
3793 if (scrubber.waiting_on == 0) {
3794 if (ops_blocked_by_scrub()) {
3795 requeue_scrub(true);
3796 } else {
3797 requeue_scrub(false);
3798 }
3799 }
3800 }
3801
3802 void PG::sub_op_scrub_map(OpRequestRef op)
3803 {
3804 // for legacy jewel compatibility only
3805 const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3806 assert(m->get_type() == MSG_OSD_SUBOP);
3807 dout(7) << "sub_op_scrub_map" << dendl;
3808
3809 if (m->map_epoch < info.history.same_interval_since) {
3810 dout(10) << "sub_op_scrub discarding old sub_op from "
3811 << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3812 return;
3813 }
3814
3815 if (!scrubber.is_chunky_scrub_active()) {
3816 dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3817 return;
3818 }
3819
3820 op->mark_started();
3821
3822 dout(10) << " got " << m->from << " scrub map" << dendl;
3823 bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3824
3825 scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3826 dout(10) << "map version is "
3827 << scrubber.received_maps[m->from].valid_through
3828 << dendl;
3829
3830 --scrubber.waiting_on;
3831 scrubber.waiting_on_whom.erase(m->from);
3832
3833 if (scrubber.waiting_on == 0) {
3834 if (ops_blocked_by_scrub()) {
3835 requeue_scrub(true);
3836 } else {
3837 requeue_scrub(false);
3838 }
3839 }
3840 }
3841
3842 // send scrub v3 messages (chunky scrub)
3843 void PG::_request_scrub_map(
3844 pg_shard_t replica, eversion_t version,
3845 hobject_t start, hobject_t end,
3846 bool deep, uint32_t seed)
3847 {
3848 assert(replica != pg_whoami);
3849 dout(10) << "scrub requesting scrubmap from osd." << replica
3850 << " deep " << (int)deep << " seed " << seed << dendl;
3851 MOSDRepScrub *repscrubop = new MOSDRepScrub(
3852 spg_t(info.pgid.pgid, replica.shard), version,
3853 get_osdmap()->get_epoch(),
3854 get_last_peering_reset(),
3855 start, end, deep, seed);
3856 // default priority, we want the rep scrub processed prior to any recovery
3857 // or client io messages (we are holding a lock!)
3858 osd->send_message_osd_cluster(
3859 replica.osd, repscrubop, get_osdmap()->get_epoch());
3860 }
3861
3862 void PG::handle_scrub_reserve_request(OpRequestRef op)
3863 {
3864 dout(7) << __func__ << " " << *op->get_req() << dendl;
3865 op->mark_started();
3866 if (scrubber.reserved) {
3867 dout(10) << __func__ << " ignoring reserve request: Already reserved"
3868 << dendl;
3869 return;
3870 }
3871 if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3872 osd->inc_scrubs_pending()) {
3873 scrubber.reserved = true;
3874 } else {
3875 dout(20) << __func__ << ": failed to reserve remotely" << dendl;
3876 scrubber.reserved = false;
3877 }
3878 if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3879 const MOSDScrubReserve *m =
3880 static_cast<const MOSDScrubReserve*>(op->get_req());
3881 Message *reply = new MOSDScrubReserve(
3882 spg_t(info.pgid.pgid, primary.shard),
3883 m->map_epoch,
3884 scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3885 pg_whoami);
3886 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3887 } else {
3888 // for jewel compat only
3889 const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3890 assert(req->get_type() == MSG_OSD_SUBOP);
3891 MOSDSubOpReply *reply = new MOSDSubOpReply(
3892 req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3893 ::encode(scrubber.reserved, reply->get_data());
3894 osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3895 }
3896 }
3897
3898 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3899 {
3900 dout(7) << __func__ << " " << *op->get_req() << dendl;
3901 op->mark_started();
3902 if (!scrubber.reserved) {
3903 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3904 return;
3905 }
3906 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3907 dout(10) << " already had osd." << from << " reserved" << dendl;
3908 } else {
3909 dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3910 scrubber.reserved_peers.insert(from);
3911 sched_scrub();
3912 }
3913 }
3914
3915 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3916 {
3917 dout(7) << __func__ << " " << *op->get_req() << dendl;
3918 op->mark_started();
3919 if (!scrubber.reserved) {
3920 dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3921 return;
3922 }
3923 if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3924 dout(10) << " already had osd." << from << " reserved" << dendl;
3925 } else {
3926 /* One decline stops this pg from being scheduled for scrubbing. */
3927 dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3928 scrubber.reserve_failed = true;
3929 sched_scrub();
3930 }
3931 }
3932
3933 void PG::handle_scrub_reserve_release(OpRequestRef op)
3934 {
3935 dout(7) << __func__ << " " << *op->get_req() << dendl;
3936 op->mark_started();
3937 clear_scrub_reserved();
3938 }
3939
3940 void PG::reject_reservation()
3941 {
3942 osd->send_message_osd_cluster(
3943 primary.osd,
3944 new MBackfillReserve(
3945 MBackfillReserve::REJECT,
3946 spg_t(info.pgid.pgid, primary.shard),
3947 get_osdmap()->get_epoch()),
3948 get_osdmap()->get_epoch());
3949 }
3950
3951 void PG::schedule_backfill_retry(float delay)
3952 {
3953 Mutex::Locker lock(osd->recovery_request_lock);
3954 osd->recovery_request_timer.add_event_after(
3955 delay,
3956 new QueuePeeringEvt<RequestBackfill>(
3957 this, get_osdmap()->get_epoch(),
3958 RequestBackfill()));
3959 }
3960
3961 void PG::schedule_recovery_retry(float delay)
3962 {
3963 Mutex::Locker lock(osd->recovery_request_lock);
3964 osd->recovery_request_timer.add_event_after(
3965 delay,
3966 new QueuePeeringEvt<DoRecovery>(
3967 this, get_osdmap()->get_epoch(),
3968 DoRecovery()));
3969 }
3970
3971 void PG::clear_scrub_reserved()
3972 {
3973 scrubber.reserved_peers.clear();
3974 scrubber.reserve_failed = false;
3975
3976 if (scrubber.reserved) {
3977 scrubber.reserved = false;
3978 osd->dec_scrubs_pending();
3979 }
3980 }
3981
3982 void PG::scrub_reserve_replicas()
3983 {
3984 assert(backfill_targets.empty());
3985 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3986 i != actingbackfill.end();
3987 ++i) {
3988 if (*i == pg_whoami) continue;
3989 dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3990 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3991 osd->send_message_osd_cluster(
3992 i->osd,
3993 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3994 get_osdmap()->get_epoch(),
3995 MOSDScrubReserve::REQUEST, pg_whoami),
3996 get_osdmap()->get_epoch());
3997 } else {
3998 // for jewel compat only
3999 vector<OSDOp> scrub(1);
4000 scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
4001 hobject_t poid;
4002 eversion_t v;
4003 osd_reqid_t reqid;
4004 MOSDSubOp *subop = new MOSDSubOp(
4005 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4006 get_osdmap()->get_epoch(), osd->get_tid(), v);
4007 subop->ops = scrub;
4008 osd->send_message_osd_cluster(
4009 i->osd, subop, get_osdmap()->get_epoch());
4010 }
4011 }
4012 }
4013
4014 void PG::scrub_unreserve_replicas()
4015 {
4016 assert(backfill_targets.empty());
4017 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4018 i != actingbackfill.end();
4019 ++i) {
4020 if (*i == pg_whoami) continue;
4021 dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
4022 if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
4023 osd->send_message_osd_cluster(
4024 i->osd,
4025 new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
4026 get_osdmap()->get_epoch(),
4027 MOSDScrubReserve::RELEASE, pg_whoami),
4028 get_osdmap()->get_epoch());
4029 } else {
4030 // for jewel compat only
4031 vector<OSDOp> scrub(1);
4032 scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
4033 hobject_t poid;
4034 eversion_t v;
4035 osd_reqid_t reqid;
4036 MOSDSubOp *subop = new MOSDSubOp(
4037 reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
4038 get_osdmap()->get_epoch(), osd->get_tid(), v);
4039 subop->ops = scrub;
4040 osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
4041 }
4042 }
4043 }
4044
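// Remove rollback objects whose generation is older than the last trimmed-to
// version; they can no longer be needed for a rollback, so treat them as
// garbage and delete them in a background transaction.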
4045 void PG::_scan_rollback_obs(
4046 const vector<ghobject_t> &rollback_obs,
4047 ThreadPool::TPHandle &handle)
4048 {
4049 ObjectStore::Transaction t;
4050 eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
4051 for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
4052 i != rollback_obs.end();
4053 ++i) {
4054 if (i->generation < trimmed_to.version) {
4055 osd->clog->error() << "osd." << osd->whoami
4056 << " pg " << info.pgid
4057 << " found obsolete rollback obj "
4058 << *i << " generation < trimmed_to "
4059 << trimmed_to
4060 << "...repaired";
4061 t.remove(coll, *i);
4062 }
4063 }
4064 if (!t.empty()) {
4065 derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
4066 << dendl;
4067 osd->store->queue_transaction(osr.get(), std::move(t), NULL);
4068 }
4069 }
4070
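// Walk the scrub map from highest to lowest object so each head/snapdir's
// SnapSet is decoded before its clones. For every clone, compare the snaps
// recorded in the SnapSet (or in legacy object_info) against the snap
// mapper, and rewrite the mapper entry if they disagree.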
4071 void PG::_scan_snaps(ScrubMap &smap)
4072 {
4073 hobject_t head;
4074 SnapSet snapset;
4075 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4076 i != smap.objects.rend();
4077 ++i) {
4078 const hobject_t &hoid = i->first;
4079 ScrubMap::object &o = i->second;
4080
4081 if (hoid.is_head() || hoid.is_snapdir()) {
4082 // parse the SnapSet
4083 bufferlist bl;
4084 if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4085 continue;
4086 }
4087 bl.push_back(o.attrs[SS_ATTR]);
4088 auto p = bl.begin();
4089 try {
4090 ::decode(snapset, p);
4091 } catch(...) {
4092 continue;
4093 }
4094 head = hoid.get_head();
4095 // Make sure head_exists is correct for is_legacy() check
4096 if (hoid.is_head())
4097 snapset.head_exists = true;
4098 continue;
4099 }
4100 if (hoid.snap < CEPH_MAXSNAP) {
4101 // check and if necessary fix snap_mapper
4102 if (hoid.get_head() != head) {
4103 derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4104 << dendl;
4105 continue;
4106 }
4107 set<snapid_t> obj_snaps;
4108 if (!snapset.is_legacy()) {
4109 auto p = snapset.clone_snaps.find(hoid.snap);
4110 if (p == snapset.clone_snaps.end()) {
4111 derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4112 << dendl;
4113 continue;
4114 }
4115 obj_snaps.insert(p->second.begin(), p->second.end());
4116 } else {
4117 bufferlist bl;
4118 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4119 continue;
4120 }
4121 bl.push_back(o.attrs[OI_ATTR]);
4122 object_info_t oi;
4123 try {
4124 oi.decode(bl);
4125 } catch(...) {
4126 continue;
4127 }
4128 obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4129 }
4130 set<snapid_t> cur_snaps;
4131 int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4132 if (r != 0 && r != -ENOENT) {
4133 derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4134 ceph_abort();
4135 }
4136 if (r == -ENOENT || cur_snaps != obj_snaps) {
4137 ObjectStore::Transaction t;
4138 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4139 if (r == 0) {
4140 r = snap_mapper.remove_oid(hoid, &_t);
4141 if (r != 0) {
4142 derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4143 << dendl;
4144 ceph_abort();
4145 }
4146 osd->clog->error() << "osd." << osd->whoami
4147 << " found snap mapper error on pg "
4148 << info.pgid
4149 << " oid " << hoid << " snaps in mapper: "
4150 << cur_snaps << ", oi: "
4151 << obj_snaps
4152 << "...repaired";
4153 } else {
4154 osd->clog->error() << "osd." << osd->whoami
4155 << " found snap mapper error on pg "
4156 << info.pgid
4157 << " oid " << hoid << " snaps missing in mapper"
4158 << ", should be: "
4159 << obj_snaps
4160 << "...repaired";
4161 }
4162 snap_mapper.add_oid(hoid, obj_snaps, &_t);
4163
4164 // wait for repair to apply to avoid confusing other bits of the system.
4165 {
4166 Cond my_cond;
4167 Mutex my_lock("PG::_scan_snaps my_lock");
4168 int r = 0;
4169 bool done;
4170 t.register_on_applied_sync(
4171 new C_SafeCond(&my_lock, &my_cond, &done, &r));
4172 r = osd->store->apply_transaction(osr.get(), std::move(t));
4173 if (r != 0) {
4174 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4175 << dendl;
4176 } else {
4177 my_lock.Lock();
4178 while (!done)
4179 my_cond.Wait(my_lock);
4180 my_lock.Unlock();
4181 }
4182 }
4183 }
4184 }
4185 }
4186 }
4187
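// Repair object_info_t entries whose recorded soid does not match the
// object's actual oid: rewrite OI_ATTR both in the in-memory scrub map and
// on disk, logging the repair to the cluster log.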
4188 void PG::_repair_oinfo_oid(ScrubMap &smap)
4189 {
4190 for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4191 i != smap.objects.rend();
4192 ++i) {
4193 const hobject_t &hoid = i->first;
4194 ScrubMap::object &o = i->second;
4195
4196 bufferlist bl;
4197 if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4198 continue;
4199 }
4200 bl.push_back(o.attrs[OI_ATTR]);
4201 object_info_t oi;
4202 try {
4203 oi.decode(bl);
4204 } catch(...) {
4205 continue;
4206 }
4207 if (oi.soid != hoid) {
4208 ObjectStore::Transaction t;
4209 OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4210 osd->clog->error() << "osd." << osd->whoami
4211 << " found object info error on pg "
4212 << info.pgid
4213 << " oid " << hoid << " oid in object info: "
4214 << oi.soid
4215 << "...repaired";
4216 // Fix object info
4217 oi.soid = hoid;
4218 bl.clear();
4219 ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4220
4221 bufferptr bp(bl.c_str(), bl.length());
4222 o.attrs[OI_ATTR] = bp;
4223
4224 t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4225 int r = osd->store->apply_transaction(osr.get(), std::move(t));
4226 if (r != 0) {
4227 derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4228 << dendl;
4229 }
4230 }
4231 }
4232 }
4233
4234 /*
4235 * build a scrub map over a chunk without releasing the lock
4236 * only used by chunky scrub
4237 */
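// The chunk is built in stages: list the objects in [start, end), scan them
// (shallow or deep) into the map, drop obsolete rollback objects, verify and
// repair the snap mapper, and fix any object_info oid mismatches.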
4238 int PG::build_scrub_map_chunk(
4239 ScrubMap &map,
4240 hobject_t start, hobject_t end, bool deep, uint32_t seed,
4241 ThreadPool::TPHandle &handle)
4242 {
4243 dout(10) << __func__ << " [" << start << "," << end << ") "
4244 << " seed " << seed << dendl;
4245
4246 map.valid_through = info.last_update;
4247
4248 // objects
4249 vector<hobject_t> ls;
4250 vector<ghobject_t> rollback_obs;
4251 int ret = get_pgbackend()->objects_list_range(
4252 start,
4253 end,
4254 0,
4255 &ls,
4256 &rollback_obs);
4257 if (ret < 0) {
4258 dout(5) << "objects_list_range error: " << ret << dendl;
4259 return ret;
4260 }
4261
4262
4263 get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4264 _scan_rollback_obs(rollback_obs, handle);
4265 _scan_snaps(map);
4266 _repair_oinfo_oid(map);
4267
4268 dout(20) << __func__ << " done" << dendl;
4269 return 0;
4270 }
4271
4272 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4273 if (!store)
4274 return;
4275 struct OnComplete : Context {
4276 std::unique_ptr<Scrub::Store> store;
4277 OnComplete(
4278 std::unique_ptr<Scrub::Store> &&store)
4279 : store(std::move(store)) {}
4280 void finish(int) override {}
4281 };
4282 store->cleanup(t);
4283 t->register_on_complete(new OnComplete(std::move(store)));
4284 assert(!store);
4285 }
4286
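// Record a bad copy of soid for later recovery: if the bad peer is a
// replica, add the object to that peer's missing set; if it is the primary
// itself, add it to our own missing set. For EC pools (or a bad primary),
// also register the good peers as recovery sources in missing_loc.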
4287 void PG::repair_object(
4288 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4289 pg_shard_t bad_peer)
4290 {
4291 list<pg_shard_t> op_shards;
4292 for (auto i : *ok_peers) {
4293 op_shards.push_back(i.second);
4294 }
4295 dout(10) << "repair_object " << soid << " bad_peer osd."
4296 << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4297 ScrubMap::object &po = ok_peers->back().first;
4298 eversion_t v;
4299 bufferlist bv;
4300 bv.push_back(po.attrs[OI_ATTR]);
4301 object_info_t oi;
4302 try {
4303 bufferlist::iterator bliter = bv.begin();
4304 ::decode(oi, bliter);
4305 } catch (...) {
4306 dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4307 assert(0);
4308 }
4309 if (bad_peer != primary) {
4310 peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4311 } else {
4312 // We should only be scrubbing if the PG is clean.
4313 assert(waiting_for_unreadable_object.empty());
4314
4315 pg_log.missing_add(soid, oi.version, eversion_t());
4316
4317 pg_log.set_last_requested(0);
4318 dout(10) << __func__ << ": primary = " << primary << dendl;
4319 }
4320
4321 if (is_ec_pg() || bad_peer == primary) {
4322 // we'd better collect all shards for an EC pg, and prepare good peers
4323 // as the source of the pull in the case of a replicated pg.
4324 missing_loc.add_missing(soid, oi.version, eversion_t());
4325 list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4326 for (i = ok_peers->begin();
4327 i != ok_peers->end();
4328 ++i)
4329 missing_loc.add_location(soid, i->second);
4330 }
4331 }
4332
4333 /* replica_scrub
4334 *
4335 * Wait for last_update_applied to match msg->scrub_to as above. Wait
4336 * for pushes to complete in case of recent recovery. Build a single
4337 * scrubmap of objects that are in the range [msg->start, msg->end).
4338 */
4339 void PG::replica_scrub(
4340 OpRequestRef op,
4341 ThreadPool::TPHandle &handle)
4342 {
4343 const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4344 assert(!scrubber.active_rep_scrub);
4345 dout(7) << "replica_scrub" << dendl;
4346
4347 if (msg->map_epoch < info.history.same_interval_since) {
4348 dout(10) << "replica_scrub discarding old replica_scrub from "
4349 << msg->map_epoch << " < " << info.history.same_interval_since
4350 << dendl;
4351 return;
4352 }
4353
4354 ScrubMap map;
4355
4356 assert(msg->chunky);
4357 if (last_update_applied < msg->scrub_to) {
4358 dout(10) << "waiting for last_update_applied to catch up" << dendl;
4359 scrubber.active_rep_scrub = op;
4360 return;
4361 }
4362
4363 if (active_pushes > 0) {
4364 dout(10) << "waiting for active pushes to finish" << dendl;
4365 scrubber.active_rep_scrub = op;
4366 return;
4367 }
4368
4369 // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4370 hobject_t start = msg->start;
4371 hobject_t end = msg->end;
4372 if (!start.is_max())
4373 start.pool = info.pgid.pool();
4374 if (!end.is_max())
4375 end.pool = info.pgid.pool();
4376
4377 build_scrub_map_chunk(
4378 map, start, end, msg->deep, msg->seed,
4379 handle);
4380
4381 if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4382 MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4383 spg_t(info.pgid.pgid, get_primary().shard),
4384 msg->map_epoch,
4385 pg_whoami);
4386 ::encode(map, reply->get_data());
4387 osd->send_message_osd_cluster(reply, msg->get_connection());
4388 } else {
4389 // for jewel compatibility
4390 vector<OSDOp> scrub(1);
4391 scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4392 hobject_t poid;
4393 eversion_t v;
4394 osd_reqid_t reqid;
4395 MOSDSubOp *subop = new MOSDSubOp(
4396 reqid,
4397 pg_whoami,
4398 spg_t(info.pgid.pgid, get_primary().shard),
4399 poid,
4400 0,
4401 msg->map_epoch,
4402 osd->get_tid(),
4403 v);
4404 ::encode(map, subop->get_data());
4405 subop->ops = scrub;
4406 osd->send_message_osd_cluster(subop, msg->get_connection());
4407 }
4408 }
4409
4410 /* Scrub:
4411 * PG_STATE_SCRUBBING is set when the scrub is queued
4412 *
4413 * scrub will be chunky if all OSDs in the PG support chunky scrub
4414 * scrub will fail if OSDs are too old.
4415 */
4416 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4417 {
4418 if (cct->_conf->osd_scrub_sleep > 0 &&
4419 (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4420 scrubber.state == PG::Scrubber::INACTIVE) &&
4421 scrubber.needs_sleep) {
4422 ceph_assert(!scrubber.sleeping);
4423 dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4424
4425 // Do an async sleep so we don't block the op queue
4426 OSDService *osds = osd;
4427 spg_t pgid = get_pgid();
4428 int state = scrubber.state;
4429 auto scrub_requeue_callback =
4430 new FunctionContext([osds, pgid, state](int r) {
4431 PG *pg = osds->osd->lookup_lock_pg(pgid);
4432 if (pg == nullptr) {
4433 lgeneric_dout(osds->osd->cct, 20)
4434 << "scrub_requeue_callback: Could not find "
4435 << "PG " << pgid << " can't complete scrub requeue after sleep"
4436 << dendl;
4437 return;
4438 }
4439 pg->scrubber.sleeping = false;
4440 pg->scrubber.needs_sleep = false;
4441 lgeneric_dout(pg->cct, 20)
4442 << "scrub_requeue_callback: slept for "
4443 << ceph_clock_now() - pg->scrubber.sleep_start
4444 << ", re-queuing scrub with state " << state << dendl;
4445 pg->scrub_queued = false;
4446 pg->requeue_scrub();
4447 pg->scrubber.sleep_start = utime_t();
4448 pg->unlock();
4449 });
4450 Mutex::Locker l(osd->scrub_sleep_lock);
4451 osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4452 scrub_requeue_callback);
4453 scrubber.sleeping = true;
4454 scrubber.sleep_start = ceph_clock_now();
4455 return;
4456 }
4457 if (pg_has_reset_since(queued)) {
4458 return;
4459 }
4460 assert(scrub_queued);
4461 scrub_queued = false;
4462 scrubber.needs_sleep = true;
4463
4464 if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4465 dout(10) << "scrub -- not primary, not active, not clean, or not scrubbing" << dendl;
4466 state_clear(PG_STATE_SCRUBBING);
4467 state_clear(PG_STATE_REPAIR);
4468 state_clear(PG_STATE_DEEP_SCRUB);
4469 publish_stats_to_osd();
4470 return;
4471 }
4472
4473 if (!scrubber.active) {
4474 assert(backfill_targets.empty());
4475
4476 scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4477
4478 dout(10) << "starting a new chunky scrub" << dendl;
4479 }
4480
4481 chunky_scrub(handle);
4482 }
4483
4484 /*
4485 * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4486 * chunk.
4487 *
4488 * The object store is partitioned into chunks which end on hash boundaries. For
4489 * each chunk, the following logic is performed:
4490 *
4491 * (1) Block writes on the chunk
4492 * (2) Request maps from replicas
4493 * (3) Wait for pushes to be applied (after recovery)
4494 * (4) Wait for writes to flush on the chunk
4495 * (5) Wait for maps from replicas
4496 * (6) Compare / repair all scrub maps
4497 * (7) Wait for digest updates to apply
4498 *
4499 * This logic is encoded in the mostly linear state machine:
4500 *
4501 * +------------------+
4502 * _________v__________ |
4503 * | | |
4504 * | INACTIVE | |
4505 * |____________________| |
4506 * | |
4507 * | +----------+ |
4508 * _________v___v______ | |
4509 * | | | |
4510 * | NEW_CHUNK | | |
4511 * |____________________| | |
4512 * | | |
4513 * _________v__________ | |
4514 * | | | |
4515 * | WAIT_PUSHES | | |
4516 * |____________________| | |
4517 * | | |
4518 * _________v__________ | |
4519 * | | | |
4520 * | WAIT_LAST_UPDATE | | |
4521 * |____________________| | |
4522 * | | |
4523 * _________v__________ | |
4524 * | | | |
4525 * | BUILD_MAP | | |
4526 * |____________________| | |
4527 * | | |
4528 * _________v__________ | |
4529 * | | | |
4530 * | WAIT_REPLICAS | | |
4531 * |____________________| | |
4532 * | | |
4533 * _________v__________ | |
4534 * | | | |
4535 * | COMPARE_MAPS | | |
4536 * |____________________| | |
4537 * | | |
4538 * | | |
4539 * _________v__________ | |
4540 * | | | |
4541 * |WAIT_DIGEST_UPDATES | | |
4542 * |____________________| | |
4543 * | | | |
4544 * | +----------+ |
4545 * _________v__________ |
4546 * | | |
4547 * | FINISH | |
4548 * |____________________| |
4549 * | |
4550 * +------------------+
4551 *
4552 * The primary determines the last update that affects the chunk by walking the log. If
4553 * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4554 * to wait until that update is applied before building a scrub map. Both the
4555 * primary and replicas will wait for any active pushes to be applied.
4556 *
4557 * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4558 *
4559 * scrubber.state encodes the current state of the scrub (refer to state diagram
4560 * for details).
4561 */
4562 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4563 {
4564 // check for map changes
4565 if (scrubber.is_chunky_scrub_active()) {
4566 if (scrubber.epoch_start != info.history.same_interval_since) {
4567 dout(10) << "scrub pg changed, aborting" << dendl;
4568 scrub_clear_state();
4569 scrub_unreserve_replicas();
4570 return;
4571 }
4572 }
4573
4574 bool done = false;
4575 int ret;
4576
4577 while (!done) {
4578 dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4579 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4580
4581 switch (scrubber.state) {
4582 case PG::Scrubber::INACTIVE:
4583 dout(10) << "scrub start" << dendl;
4584
4585 publish_stats_to_osd();
4586 scrubber.epoch_start = info.history.same_interval_since;
4587 scrubber.active = true;
4588
4589 osd->inc_scrubs_active(scrubber.reserved);
4590 if (scrubber.reserved) {
4591 scrubber.reserved = false;
4592 scrubber.reserved_peers.clear();
4593 }
4594
4595 {
4596 ObjectStore::Transaction t;
4597 scrubber.cleanup_store(&t);
4598 scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4599 info.pgid, coll));
4600 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4601 }
4602
4603 // Don't include temporary objects when scrubbing
4604 scrubber.start = info.pgid.pgid.get_hobj_start();
4605 scrubber.state = PG::Scrubber::NEW_CHUNK;
4606
4607 {
4608 bool repair = state_test(PG_STATE_REPAIR);
4609 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4610 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4611 stringstream oss;
4612 oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4613 osd->clog->debug(oss);
4614 }
4615
4616 scrubber.seed = -1;
4617
4618 break;
4619
4620 case PG::Scrubber::NEW_CHUNK:
4621 scrubber.primary_scrubmap = ScrubMap();
4622 scrubber.received_maps.clear();
4623
4624 {
4625 /* get the start and end of our scrub chunk
4626 *
4627 * Our scrub chunk has an important restriction we're going to need to
4628 * respect. We can't let head or snapdir be start or end.
4629 * Using a half-open interval means that if end == head|snapdir,
4630 * we'd scrub/lock head and the clone right next to head in different
4631 * chunks which would allow us to miss clones created between
4632 * scrubbing that chunk and scrubbing the chunk including head.
4633 * This isn't true for any of the other clones since clones can
4634 * only be created "just to the left of" head. There is one exception
4635 * to this: promotion of clones which always happens to the left of the
4636 * left-most clone, but promote_object checks the scrubber in that
4637 * case, so it should be ok. Also, it's ok to "miss" clones at the
4638 * left end of the range if we are a tier because they may legitimately
4639 * not exist (see _scrub).
4640 */
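// Example: if objects_list_partial ends with candidate_end on a head or
// snapdir, the loop below walks candidate_end back onto the newest listed
// clone sharing that head (or to the object's boundary), so a head is never
// scrubbed in a different chunk from the clone adjacent to it.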
4641 int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4642 hobject_t start = scrubber.start;
4643 hobject_t candidate_end;
4644 vector<hobject_t> objects;
4645 ret = get_pgbackend()->objects_list_partial(
4646 start,
4647 min,
4648 MAX(min, cct->_conf->osd_scrub_chunk_max),
4649 &objects,
4650 &candidate_end);
4651 assert(ret >= 0);
4652
4653 if (!objects.empty()) {
4654 hobject_t back = objects.back();
4655 while (candidate_end.has_snapset() &&
4656 candidate_end.get_head() == back.get_head()) {
4657 candidate_end = back;
4658 objects.pop_back();
4659 if (objects.empty()) {
4660 assert(0 ==
4661 "Somehow we got more than 2 objects which"
4662 "have the same head but are not clones");
4663 }
4664 back = objects.back();
4665 }
4666 if (candidate_end.has_snapset()) {
4667 assert(candidate_end.get_head() != back.get_head());
4668 candidate_end = candidate_end.get_object_boundary();
4669 }
4670 } else {
4671 assert(candidate_end.is_max());
4672 }
4673
4674 if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4675 // we'll be requeued by whatever made us unavailable for scrub
4676 dout(10) << __func__ << ": scrub blocked somewhere in range "
4677 << "[" << scrubber.start << ", " << candidate_end << ")"
4678 << dendl;
4679 done = true;
4680 break;
4681 }
4682 scrubber.end = candidate_end;
4683 }
4684
4685 // walk the log to find the latest update that affects our chunk
4686 scrubber.subset_last_update = eversion_t();
4687 for (auto p = projected_log.log.rbegin();
4688 p != projected_log.log.rend();
4689 ++p) {
4690 if (p->soid >= scrubber.start &&
4691 p->soid < scrubber.end) {
4692 scrubber.subset_last_update = p->version;
4693 break;
4694 }
4695 }
4696 if (scrubber.subset_last_update == eversion_t()) {
4697 for (list<pg_log_entry_t>::const_reverse_iterator p =
4698 pg_log.get_log().log.rbegin();
4699 p != pg_log.get_log().log.rend();
4700 ++p) {
4701 if (p->soid >= scrubber.start &&
4702 p->soid < scrubber.end) {
4703 scrubber.subset_last_update = p->version;
4704 break;
4705 }
4706 }
4707 }
4708
4709 // ask replicas to wait until
4710 // last_update_applied >= scrubber.subset_last_update and then scan
4711 scrubber.waiting_on_whom.insert(pg_whoami);
4712 ++scrubber.waiting_on;
4713
4714 // request maps from replicas
4715 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4716 i != actingbackfill.end();
4717 ++i) {
4718 if (*i == pg_whoami) continue;
4719 _request_scrub_map(*i, scrubber.subset_last_update,
4720 scrubber.start, scrubber.end, scrubber.deep,
4721 scrubber.seed);
4722 scrubber.waiting_on_whom.insert(*i);
4723 ++scrubber.waiting_on;
4724 }
4725
4726 scrubber.state = PG::Scrubber::WAIT_PUSHES;
4727
4728 break;
4729
4730 case PG::Scrubber::WAIT_PUSHES:
4731 if (active_pushes == 0) {
4732 scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4733 } else {
4734 dout(15) << "wait for pushes to apply" << dendl;
4735 done = true;
4736 }
4737 break;
4738
4739 case PG::Scrubber::WAIT_LAST_UPDATE:
4740 if (last_update_applied >= scrubber.subset_last_update) {
4741 scrubber.state = PG::Scrubber::BUILD_MAP;
4742 } else {
4743 // will be requeued by op_applied
4744 dout(15) << "wait for writes to flush" << dendl;
4745 done = true;
4746 }
4747 break;
4748
4749 case PG::Scrubber::BUILD_MAP:
4750 assert(last_update_applied >= scrubber.subset_last_update);
4751
4752 // build my own scrub map
4753 ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4754 scrubber.start, scrubber.end,
4755 scrubber.deep, scrubber.seed,
4756 handle);
4757 if (ret < 0) {
4758 dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4759 scrub_clear_state();
4760 scrub_unreserve_replicas();
4761 return;
4762 }
4763
4764 --scrubber.waiting_on;
4765 scrubber.waiting_on_whom.erase(pg_whoami);
4766
4767 scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4768 break;
4769
4770 case PG::Scrubber::WAIT_REPLICAS:
4771 if (scrubber.waiting_on > 0) {
4772 // will be requeued by sub_op_scrub_map
4773 dout(10) << "wait for replicas to build scrub map" << dendl;
4774 done = true;
4775 } else {
4776 scrubber.state = PG::Scrubber::COMPARE_MAPS;
4777 }
4778 break;
4779
4780 case PG::Scrubber::COMPARE_MAPS:
4781 assert(last_update_applied >= scrubber.subset_last_update);
4782 assert(scrubber.waiting_on == 0);
4783
4784 scrub_compare_maps();
4785 scrubber.start = scrubber.end;
4786 scrubber.run_callbacks();
4787
4788 // requeue the writes from the chunk that just finished
4789 requeue_ops(waiting_for_scrub);
4790
4791 scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4792
4793 // fall-thru
4794
4795 case PG::Scrubber::WAIT_DIGEST_UPDATES:
4796 if (scrubber.num_digest_updates_pending) {
4797 dout(10) << __func__ << " waiting on "
4798 << scrubber.num_digest_updates_pending
4799 << " digest updates" << dendl;
4800 done = true;
4801 break;
4802 }
4803
4804 if (!(scrubber.end.is_max())) {
4805 scrubber.state = PG::Scrubber::NEW_CHUNK;
4806 requeue_scrub();
4807 done = true;
4808 } else {
4809 scrubber.state = PG::Scrubber::FINISH;
4810 }
4811
4812 break;
4813
4814 case PG::Scrubber::FINISH:
4815 scrub_finish();
4816 scrubber.state = PG::Scrubber::INACTIVE;
4817 done = true;
4818
4819 if (!snap_trimq.empty()) {
4820 dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4821 snap_trimmer_scrub_complete();
4822 }
4823
4824 break;
4825
4826 default:
4827 ceph_abort();
4828 }
4829 }
4830 dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4831 << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4832 }
4833
4834 void PG::scrub_clear_state()
4835 {
4836 assert(is_locked());
4837 state_clear(PG_STATE_SCRUBBING);
4838 state_clear(PG_STATE_REPAIR);
4839 state_clear(PG_STATE_DEEP_SCRUB);
4840 publish_stats_to_osd();
4841
4842 // active -> nothing.
4843 if (scrubber.active)
4844 osd->dec_scrubs_active();
4845
4846 requeue_ops(waiting_for_scrub);
4847
4848 scrubber.reset();
4849
4850 // type-specific state clear
4851 _scrub_clear_state();
4852 }
4853
4854 void PG::scrub_compare_maps()
4855 {
4856 dout(10) << __func__ << " has maps, analyzing" << dendl;
4857
4858 // construct authoritative scrub map for type specific scrubbing
4859 scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4860 map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4861
4862 if (acting.size() > 1) {
4863 dout(10) << __func__ << " comparing replica scrub maps" << dendl;
4864
4865 stringstream ss;
4866
4867 // Map from object with errors to good peer
4868 map<hobject_t, list<pg_shard_t>> authoritative;
4869 map<pg_shard_t, ScrubMap *> maps;
4870
4871 dout(2) << __func__ << " osd." << acting[0] << " has "
4872 << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4873 maps[pg_whoami] = &scrubber.primary_scrubmap;
4874
4875 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4876 i != actingbackfill.end();
4877 ++i) {
4878 if (*i == pg_whoami) continue;
4879 dout(2) << __func__ << " replica " << *i << " has "
4880 << scrubber.received_maps[*i].objects.size()
4881 << " items" << dendl;
4882 maps[*i] = &scrubber.received_maps[*i];
4883 }
4884
4885 get_pgbackend()->be_compare_scrubmaps(
4886 maps,
4887 state_test(PG_STATE_REPAIR),
4888 scrubber.missing,
4889 scrubber.inconsistent,
4890 authoritative,
4891 missing_digest,
4892 scrubber.shallow_errors,
4893 scrubber.deep_errors,
4894 scrubber.store.get(),
4895 info.pgid, acting,
4896 ss);
4897 dout(2) << ss.str() << dendl;
4898
4899 if (!ss.str().empty()) {
4900 osd->clog->error(ss);
4901 }
4902
4903 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4904 i != authoritative.end();
4905 ++i) {
4906 list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4907 for (list<pg_shard_t>::const_iterator j = i->second.begin();
4908 j != i->second.end();
4909 ++j) {
4910 good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4911 }
4912 scrubber.authoritative.insert(
4913 make_pair(
4914 i->first,
4915 good_peers));
4916 }
4917
4918 for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4919 i != authoritative.end();
4920 ++i) {
4921 scrubber.cleaned_meta_map.objects.erase(i->first);
4922 scrubber.cleaned_meta_map.objects.insert(
4923 *(maps[i->second.back()]->objects.find(i->first))
4924 );
4925 }
4926 }
4927
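// Carve off only complete head+clone groups for metadata scrubbing: the
// trailing run of objects that share the final head stays in
// cleaned_meta_map, since the rest of that group may arrive with a later
// chunk; everything before it is handed to scrub_snapshot_metadata below.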
4928 ScrubMap for_meta_scrub;
4929 if (scrubber.end.is_max() ||
4930 scrubber.cleaned_meta_map.objects.empty()) {
4931 scrubber.cleaned_meta_map.swap(for_meta_scrub);
4932 } else {
4933 auto iter = scrubber.cleaned_meta_map.objects.end();
4934 --iter; // not empty, see the if clause above
4935 auto begin = scrubber.cleaned_meta_map.objects.begin();
4936 while (iter != begin) {
4937 auto next = iter--;
4938 if (next->first.get_head() != iter->first.get_head()) {
4939 ++iter;
4940 break;
4941 }
4942 }
4943 for_meta_scrub.objects.insert(begin, iter);
4944 scrubber.cleaned_meta_map.objects.erase(begin, iter);
4945 }
4946
4947 // ok, do the pg-type specific scrubbing
4948 scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4949 if (!scrubber.store->empty()) {
4950 if (state_test(PG_STATE_REPAIR)) {
4951 dout(10) << __func__ << ": discarding scrub results" << dendl;
4952 scrubber.store->flush(nullptr);
4953 } else {
4954 dout(10) << __func__ << ": updating scrub object" << dendl;
4955 ObjectStore::Transaction t;
4956 scrubber.store->flush(&t);
4957 osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4958 }
4959 }
4960 }
4961
4962 bool PG::scrub_process_inconsistent()
4963 {
4964 dout(10) << __func__ << ": checking authoritative" << dendl;
4965 bool repair = state_test(PG_STATE_REPAIR);
4966 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4967 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4968
4969 // scrubber.authoritative only holds objects that are missing or inconsistent.
4970 if (!scrubber.authoritative.empty()) {
4971 stringstream ss;
4972 ss << info.pgid << " " << mode << " "
4973 << scrubber.missing.size() << " missing, "
4974 << scrubber.inconsistent.size() << " inconsistent objects";
4975 dout(2) << ss.str() << dendl;
4976 osd->clog->error(ss);
4977 if (repair) {
4978 state_clear(PG_STATE_CLEAN);
4979 for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4980 scrubber.authoritative.begin();
4981 i != scrubber.authoritative.end();
4982 ++i) {
4983 set<pg_shard_t>::iterator j;
4984
4985 auto missing_entry = scrubber.missing.find(i->first);
4986 if (missing_entry != scrubber.missing.end()) {
4987 for (j = missing_entry->second.begin();
4988 j != missing_entry->second.end();
4989 ++j) {
4990 repair_object(
4991 i->first,
4992 &(i->second),
4993 *j);
4994 ++scrubber.fixed;
4995 }
4996 }
4997 if (scrubber.inconsistent.count(i->first)) {
4998 for (j = scrubber.inconsistent[i->first].begin();
4999 j != scrubber.inconsistent[i->first].end();
5000 ++j) {
5001 repair_object(i->first,
5002 &(i->second),
5003 *j);
5004 ++scrubber.fixed;
5005 }
5006 }
5007 }
5008 }
5009 }
5010 return (!scrubber.authoritative.empty() && repair);
5011 }
5012
5013 bool PG::ops_blocked_by_scrub() const {
5014 return (waiting_for_scrub.size() != 0);
5015 }
5016
5017 // the part that actually finalizes a scrub
5018 void PG::scrub_finish()
5019 {
5020 bool repair = state_test(PG_STATE_REPAIR);
5021 // if the repair request comes from auto-repair and a large number of
5022 // errors were found, cancel the auto-repair
5023 if (repair && scrubber.auto_repair
5024 && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
5025 state_clear(PG_STATE_REPAIR);
5026 repair = false;
5027 }
5028 bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
5029 const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
5030
5031 // type-specific finish (can tally more errors)
5032 _scrub_finish();
5033
5034 bool has_error = scrub_process_inconsistent();
5035
5036 {
5037 stringstream oss;
5038 oss << info.pgid.pgid << " " << mode << " ";
5039 int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
5040 if (total_errors)
5041 oss << total_errors << " errors";
5042 else
5043 oss << "ok";
5044 if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
5045 oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
5046 << " remaining deep scrub error details lost)";
5047 if (repair)
5048 oss << ", " << scrubber.fixed << " fixed";
5049 if (total_errors)
5050 osd->clog->error(oss);
5051 else
5052 osd->clog->debug(oss);
5053 }
5054
5055 // finish up
5056 unreg_next_scrub();
5057 utime_t now = ceph_clock_now();
5058 info.history.last_scrub = info.last_update;
5059 info.history.last_scrub_stamp = now;
5060 if (scrubber.deep) {
5061 info.history.last_deep_scrub = info.last_update;
5062 info.history.last_deep_scrub_stamp = now;
5063 }
5064 // Since we don't know which errors were fixed, we can only clear them
5065 // when every one has been fixed.
5066 if (repair) {
5067 if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
5068 assert(deep_scrub);
5069 scrubber.shallow_errors = scrubber.deep_errors = 0;
5070 } else {
5071 // Deep scrub in order to get corrected error counts
5072 scrub_after_recovery = true;
5073 }
5074 }
5075 if (deep_scrub) {
5076 if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
5077 info.history.last_clean_scrub_stamp = now;
5078 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5079 info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
5080 } else {
5081 info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
5082 // XXX: last_clean_scrub_stamp does not guarantee the pg is free of
5083 // inconsistencies caused by deep-scrub errors
5084 if (scrubber.shallow_errors == 0)
5085 info.history.last_clean_scrub_stamp = now;
5086 }
5087 info.stats.stats.sum.num_scrub_errors =
5088 info.stats.stats.sum.num_shallow_scrub_errors +
5089 info.stats.stats.sum.num_deep_scrub_errors;
5090 reg_next_scrub();
5091
5092 {
5093 ObjectStore::Transaction t;
5094 dirty_info = true;
5095 write_if_dirty(t);
5096 int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5097 assert(tr == 0);
5098 }
5099
5100
5101 if (has_error) {
5102 queue_peering_event(
5103 CephPeeringEvtRef(
5104 std::make_shared<CephPeeringEvt>(
5105 get_osdmap()->get_epoch(),
5106 get_osdmap()->get_epoch(),
5107 DoRecovery())));
5108 }
5109
5110 scrub_clear_state();
5111 scrub_unreserve_replicas();
5112
5113 if (is_active() && is_primary()) {
5114 share_pg_info();
5115 }
5116 }
5117
5118 void PG::share_pg_info()
5119 {
5120 dout(10) << "share_pg_info" << dendl;
5121
5122 // share new pg_info_t with replicas
5123 assert(!actingbackfill.empty());
5124 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5125 i != actingbackfill.end();
5126 ++i) {
5127 if (*i == pg_whoami) continue;
5128 pg_shard_t peer = *i;
5129 if (peer_info.count(peer)) {
5130 peer_info[peer].last_epoch_started = info.last_epoch_started;
5131 peer_info[peer].last_interval_started = info.last_interval_started;
5132 peer_info[peer].history.merge(info.history);
5133 }
5134 MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5135 m->pg_list.push_back(
5136 make_pair(
5137 pg_notify_t(
5138 peer.shard, pg_whoami.shard,
5139 get_osdmap()->get_epoch(),
5140 get_osdmap()->get_epoch(),
5141 info),
5142 PastIntervals()));
5143 osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5144 }
5145 }
5146
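// Append the given log entries to the PG log and update the local missing
// set (respecting last_backfill), optionally rolling the log forward and
// trimming to trim_to. Advances info.last_update/last_complete and returns
// whether the PG stats were invalidated.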
5147 bool PG::append_log_entries_update_missing(
5148 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5149 ObjectStore::Transaction &t, boost::optional<eversion_t> trim_to,
5150 boost::optional<eversion_t> roll_forward_to)
5151 {
5152 assert(!entries.empty());
5153 assert(entries.begin()->version > info.last_update);
5154
5155 PGLogEntryHandler rollbacker{this, &t};
5156 if (roll_forward_to) {
5157 pg_log.roll_forward(&rollbacker);
5158 }
5159 bool invalidate_stats =
5160 pg_log.append_new_log_entries(info.last_backfill,
5161 info.last_backfill_bitwise,
5162 entries,
5163 &rollbacker);
5164
5165 if (roll_forward_to && entries.rbegin()->soid > info.last_backfill) {
5166 pg_log.roll_forward(&rollbacker);
5167 }
5168 if (roll_forward_to && *roll_forward_to > pg_log.get_can_rollback_to()) {
5169 pg_log.roll_forward_to(*roll_forward_to, &rollbacker);
5170 last_rollback_info_trimmed_to_applied = *roll_forward_to;
5171 }
5172
5173 info.last_update = pg_log.get_head();
5174
5175 if (pg_log.get_missing().num_missing() == 0) {
5176 // advance last_complete since nothing else is missing!
5177 info.last_complete = info.last_update;
5178 }
5179 info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5180
5181 dout(20) << __func__ << " trim_to bool = " << bool(trim_to) << " trim_to = " << (trim_to ? *trim_to : eversion_t()) << dendl;
5182 if (trim_to)
5183 pg_log.trim(*trim_to, info);
5184 dirty_info = true;
5185 write_if_dirty(t);
5186 return invalidate_stats;
5187 }
5188
5189
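// Primary-side wrapper: apply the entries locally via
// append_log_entries_update_missing, mirror the update into each
// actingbackfill peer's peer_missing/peer_info, and, if any of the updates
// invalidated stats, rebuild missing_loc for the affected objects.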
5190 void PG::merge_new_log_entries(
5191 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5192 ObjectStore::Transaction &t,
5193 boost::optional<eversion_t> trim_to,
5194 boost::optional<eversion_t> roll_forward_to)
5195 {
5196 dout(10) << __func__ << " " << entries << dendl;
5197 assert(is_primary());
5198
5199 bool rebuild_missing = append_log_entries_update_missing(entries, t, trim_to, roll_forward_to);
5200 for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5201 i != actingbackfill.end();
5202 ++i) {
5203 pg_shard_t peer(*i);
5204 if (peer == pg_whoami) continue;
5205 assert(peer_missing.count(peer));
5206 assert(peer_info.count(peer));
5207 pg_missing_t& pmissing(peer_missing[peer]);
5208 dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5209 pg_info_t& pinfo(peer_info[peer]);
5210 bool invalidate_stats = PGLog::append_log_entries_update_missing(
5211 pinfo.last_backfill,
5212 info.last_backfill_bitwise,
5213 entries,
5214 true,
5215 NULL,
5216 pmissing,
5217 NULL,
5218 this);
5219 pinfo.last_update = info.last_update;
5220 pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5221 rebuild_missing = rebuild_missing || invalidate_stats;
5222 }
5223
5224 if (!rebuild_missing) {
5225 return;
5226 }
5227
5228 for (auto &&i: entries) {
5229 missing_loc.rebuild(
5230 i.soid,
5231 pg_whoami,
5232 actingbackfill,
5233 info,
5234 pg_log.get_missing(),
5235 peer_missing,
5236 peer_info);
5237 }
5238 }
5239
5240 void PG::update_history(const pg_history_t& new_history)
5241 {
5242 unreg_next_scrub();
5243 if (info.history.merge(new_history)) {
5244 dout(20) << __func__ << " advanced history from " << new_history << dendl;
5245 dirty_info = true;
5246 if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5247 dout(20) << __func__ << " clearing past_intervals" << dendl;
5248 past_intervals.clear();
5249 dirty_big_info = true;
5250 }
5251 }
5252 reg_next_scrub();
5253 }
5254
5255 void PG::fulfill_info(
5256 pg_shard_t from, const pg_query_t &query,
5257 pair<pg_shard_t, pg_info_t> &notify_info)
5258 {
5259 assert(from == primary);
5260 assert(query.type == pg_query_t::INFO);
5261
5262 // info
5263 dout(10) << "sending info" << dendl;
5264 notify_info = make_pair(from, info);
5265 }
5266
5267 void PG::fulfill_log(
5268 pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5269 {
5270 dout(10) << "log request from " << from << dendl;
5271 assert(from == primary);
5272 assert(query.type != pg_query_t::INFO);
5273 ConnectionRef con = osd->get_con_osd_cluster(
5274 from.osd, get_osdmap()->get_epoch());
5275 if (!con) return;
5276
5277 MOSDPGLog *mlog = new MOSDPGLog(
5278 from.shard, pg_whoami.shard,
5279 get_osdmap()->get_epoch(),
5280 info, query_epoch);
5281 mlog->missing = pg_log.get_missing();
5282
5283 // primary -> other, when building master log
5284 if (query.type == pg_query_t::LOG) {
5285 dout(10) << " sending info+missing+log since " << query.since
5286 << dendl;
5287 if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5288 osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5289 << " when my log.tail is " << pg_log.get_tail()
5290 << ", sending full log instead";
5291 mlog->log = pg_log.get_log(); // primary should not have requested this!!
5292 } else
5293 mlog->log.copy_after(pg_log.get_log(), query.since);
5294 }
5295 else if (query.type == pg_query_t::FULLLOG) {
5296 dout(10) << " sending info+missing+full log" << dendl;
5297 mlog->log = pg_log.get_log();
5298 }
5299
5300 dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5301
5302 osd->share_map_peer(from.osd, con.get(), get_osdmap());
5303 osd->send_message_osd_cluster(mlog, con.get());
5304 }
5305
5306 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5307 {
5308 bool changed = false;
5309 if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5310 !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5311 dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5312 changed = true;
5313 }
5314 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5315 assert(pi);
5316 if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5317 const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5318 if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5319 dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5320 changed = true;
5321 }
5322 }
5323 if (changed) {
5324 info.history.last_epoch_marked_full = osdmap->get_epoch();
5325 dirty_info = true;
5326 }
5327 }
5328
5329 bool PG::should_restart_peering(
5330 int newupprimary,
5331 int newactingprimary,
5332 const vector<int>& newup,
5333 const vector<int>& newacting,
5334 OSDMapRef lastmap,
5335 OSDMapRef osdmap)
5336 {
5337 if (PastIntervals::is_new_interval(
5338 primary.osd,
5339 newactingprimary,
5340 acting,
5341 newacting,
5342 up_primary.osd,
5343 newupprimary,
5344 up,
5345 newup,
5346 osdmap,
5347 lastmap,
5348 info.pgid.pgid)) {
5349 dout(20) << "new interval newup " << newup
5350 << " newacting " << newacting << dendl;
5351 return true;
5352 } else {
5353 return false;
5354 }
5355 }
5356
5357 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5358 {
5359 if (last_peering_reset > reply_epoch ||
5360 last_peering_reset > query_epoch) {
5361 dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5362 << " last_peering_reset " << last_peering_reset
5363 << dendl;
5364 return true;
5365 }
5366 return false;
5367 }
5368
5369 void PG::set_last_peering_reset()
5370 {
5371 dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5372 if (last_peering_reset != get_osdmap()->get_epoch()) {
5373 last_peering_reset = get_osdmap()->get_epoch();
5374 reset_interval_flush();
5375 }
5376 }
5377
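// RAII flush trigger: start_flush() hands one shared FlushState to both the
// on_applied and on_safe callback lists of a transaction. When the last
// reference is dropped (the transaction has both applied and committed), the
// destructor queues a flushed event for the PG -- unless the PG has been reset
// since the epoch captured here, in which case the event would be stale.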
5378 struct FlushState {
5379 PGRef pg;
5380 epoch_t epoch;
5381 FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5382 ~FlushState() {
5383 pg->lock();
5384 if (!pg->pg_has_reset_since(epoch))
5385 pg->queue_flushed(epoch);
5386 pg->unlock();
5387 }
5388 };
5389 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5390
5391 void PG::start_flush(ObjectStore::Transaction *t,
5392 list<Context *> *on_applied,
5393 list<Context *> *on_safe)
5394 {
5395 // flush in progress ops
5396 FlushStateRef flush_trigger (std::make_shared<FlushState>(
5397 this, get_osdmap()->get_epoch()));
5398 t->nop();
5399 flushes_in_progress++;
5400 on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5401 on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5402 }
5403
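// Called from set_last_peering_reset(): ask the objectstore sequencer to queue
// an IntervalFlush peering event once everything from the prior interval has
// committed; until then, outgoing recovery messages are blocked by
// recovery_state. If the sequencer is already flushed, nothing is blocked.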
5404 void PG::reset_interval_flush()
5405 {
5406 dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5407 recovery_state.clear_blocked_outgoing();
5408
5409 Context *c = new QueuePeeringEvt<IntervalFlush>(
5410 this, get_osdmap()->get_epoch(), IntervalFlush());
5411 if (!osr->flush_commit(c)) {
5412 dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5413 recovery_state.begin_block_outgoing();
5414 } else {
5415 dout(10) << "Not blocking outgoing recovery messages" << dendl;
5416 delete c;
5417 }
5418 }
5419
5420 /* Called before initializing peering during advance_map */
5421 void PG::start_peering_interval(
5422 const OSDMapRef lastmap,
5423 const vector<int>& newup, int new_up_primary,
5424 const vector<int>& newacting, int new_acting_primary,
5425 ObjectStore::Transaction *t)
5426 {
5427 const OSDMapRef osdmap = get_osdmap();
5428
5429 set_last_peering_reset();
5430
5431 vector<int> oldacting, oldup;
5432 int oldrole = get_role();
5433
5434 unreg_next_scrub();
5435
5436 pg_shard_t old_acting_primary = get_primary();
5437 pg_shard_t old_up_primary = up_primary;
5438 bool was_old_primary = is_primary();
5439 bool was_old_replica = is_replica();
5440
5441 acting.swap(oldacting);
5442 up.swap(oldup);
5443 init_primary_up_acting(
5444 newup,
5445 newacting,
5446 new_up_primary,
5447 new_acting_primary);
5448
5449 if (info.stats.up != up ||
5450 info.stats.acting != acting ||
5451 info.stats.up_primary != new_up_primary ||
5452 info.stats.acting_primary != new_acting_primary) {
5453 info.stats.up = up;
5454 info.stats.up_primary = new_up_primary;
5455 info.stats.acting = acting;
5456 info.stats.acting_primary = new_acting_primary;
5457 info.stats.mapping_epoch = osdmap->get_epoch();
5458 }
5459
5460 pg_stats_publish_lock.Lock();
5461 pg_stats_publish_valid = false;
5462 pg_stats_publish_lock.Unlock();
5463
5464 // This will now be remapped during a backfill in cases
5465 // that it would not have been before.
5466 if (up != acting)
5467 state_set(PG_STATE_REMAPPED);
5468 else
5469 state_clear(PG_STATE_REMAPPED);
5470
5471 int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5472 if (pool.info.is_replicated() || role == pg_whoami.shard)
5473 set_role(role);
5474 else
5475 set_role(-1);
5476
5477 // did acting, up, primary|acker change?
5478 if (!lastmap) {
5479 dout(10) << " no lastmap" << dendl;
5480 dirty_info = true;
5481 dirty_big_info = true;
5482 info.history.same_interval_since = osdmap->get_epoch();
5483 } else {
5484 std::stringstream debug;
5485 assert(info.history.same_interval_since != 0);
5486 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5487 get_is_recoverable_predicate());
5488 bool new_interval = PastIntervals::check_new_interval(
5489 old_acting_primary.osd,
5490 new_acting_primary,
5491 oldacting, newacting,
5492 old_up_primary.osd,
5493 new_up_primary,
5494 oldup, newup,
5495 info.history.same_interval_since,
5496 info.history.last_epoch_clean,
5497 osdmap,
5498 lastmap,
5499 info.pgid.pgid,
5500 recoverable.get(),
5501 &past_intervals,
5502 &debug);
5503 dout(10) << __func__ << ": check_new_interval output: "
5504 << debug.str() << dendl;
5505 if (new_interval) {
5506 if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5507 info.history.last_epoch_clean < osdmap->get_epoch()) {
5508 dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5509 // our information is incomplete and useless; if osdmaps have been
5510 // trimmed, someone else was clean after everything we know about.
5511 past_intervals.clear();
5512 } else {
5513 dout(10) << " noting past " << past_intervals << dendl;
5514 }
5515 dirty_info = true;
5516 dirty_big_info = true;
5517 info.history.same_interval_since = osdmap->get_epoch();
5518 if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5519 osdmap->get_pg_num(info.pgid.pgid.pool()),
5520 nullptr)) {
5521 info.history.last_epoch_split = osdmap->get_epoch();
5522 }
5523 }
5524 }
5525
5526 if (old_up_primary != up_primary ||
5527 oldup != up) {
5528 info.history.same_up_since = osdmap->get_epoch();
5529 }
5530 // this comparison includes primary rank via pg_shard_t
5531 if (old_acting_primary != get_primary()) {
5532 info.history.same_primary_since = osdmap->get_epoch();
5533 }
5534
5535 on_new_interval();
5536
5537 dout(1) << __func__ << " up " << oldup << " -> " << up
5538 << ", acting " << oldacting << " -> " << acting
5539 << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5540 << ", up_primary " << old_up_primary << " -> " << new_up_primary
5541 << ", role " << oldrole << " -> " << role
5542 << ", features acting " << acting_features
5543 << " upacting " << upacting_features
5544 << dendl;
5545
5546 // deactivate.
5547 state_clear(PG_STATE_ACTIVE);
5548 state_clear(PG_STATE_PEERED);
5549 state_clear(PG_STATE_DOWN);
5550 state_clear(PG_STATE_RECOVERY_WAIT);
5551 state_clear(PG_STATE_RECOVERY_TOOFULL);
5552 state_clear(PG_STATE_RECOVERING);
5553
5554 peer_purged.clear();
5555 actingbackfill.clear();
5556 scrub_queued = false;
5557
5558 // reset primary/replica state?
5559 if (was_old_primary || is_primary()) {
5560 osd->remove_want_pg_temp(info.pgid.pgid);
5561 } else if (was_old_replica || is_replica()) {
5562 osd->remove_want_pg_temp(info.pgid.pgid);
5563 }
5564 clear_primary_state();
5565
5566
5567 // pg->on_*
5568 on_change(t);
5569
5570 projected_last_update = eversion_t();
5571
5572 assert(!deleting);
5573
5574 // should we tell the primary we are here?
5575 send_notify = !is_primary();
5576
5577 if (role != oldrole ||
5578 was_old_primary != is_primary()) {
5579 // did primary change?
5580 if (was_old_primary != is_primary()) {
5581 state_clear(PG_STATE_CLEAN);
5582 clear_publish_stats();
5583 }
5584
5585 on_role_change();
5586
5587 // take active waiters
5588 requeue_ops(waiting_for_peered);
5589
5590 } else {
5591 // no role change.
5592 // did primary change?
5593 if (get_primary() != old_acting_primary) {
5594 dout(10) << *this << " " << oldacting << " -> " << acting
5595 << ", acting primary "
5596 << old_acting_primary << " -> " << get_primary()
5597 << dendl;
5598 } else {
5599 // primary is the same.
5600 if (is_primary()) {
5601 // i am (still) primary. but my replica set changed.
5602 state_clear(PG_STATE_CLEAN);
5603
5604 dout(10) << oldacting << " -> " << acting
5605 << ", replicas changed" << dendl;
5606 }
5607 }
5608 }
5609 cancel_recovery();
5610
5611 if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5612 dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5613 osd->queue_want_pg_temp(info.pgid.pgid, acting);
5614 }
5615 }
5616
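// Per-interval (re)initialization that does not depend on the old interval:
// re-register the next scrub and recompute feature masks -- acting_features is
// the intersection of features over the acting set, upacting_features over
// both up and acting -- then let the backend react via _on_new_interval().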
5617 void PG::on_new_interval()
5618 {
5619 const OSDMapRef osdmap = get_osdmap();
5620
5621 reg_next_scrub();
5622
5623 // initialize features
5624 acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5625 upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5626 for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5627 if (*p == CRUSH_ITEM_NONE)
5628 continue;
5629 uint64_t f = osdmap->get_xinfo(*p).features;
5630 acting_features &= f;
5631 upacting_features &= f;
5632 }
5633 for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5634 if (*p == CRUSH_ITEM_NONE)
5635 continue;
5636 upacting_features &= osdmap->get_xinfo(*p).features;
5637 }
5638
5639 _on_new_interval();
5640 }
5641
5642 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5643 {
5644 assert(!is_primary());
5645
5646 update_history(oinfo.history);
5647 if (!info.stats.stats_invalid && info.stats.stats.sum.num_scrub_errors) {
5648 info.stats.stats.sum.num_scrub_errors = 0;
5649 info.stats.stats.sum.num_shallow_scrub_errors = 0;
5650 info.stats.stats.sum.num_deep_scrub_errors = 0;
5651 dirty_info = true;
5652 }
5653
5654 if (!(info.purged_snaps == oinfo.purged_snaps)) {
5655 dout(10) << __func__ << " updating purged_snaps to " << oinfo.purged_snaps
5656 << dendl;
5657 info.purged_snaps = oinfo.purged_snaps;
5658 dirty_info = true;
5659 dirty_big_info = true;
5660 }
5661 }
5662
5663 ostream& operator<<(ostream& out, const PG& pg)
5664 {
5665 out << "pg[" << pg.info
5666 << " " << pg.up;
5667 if (pg.acting != pg.up)
5668 out << "/" << pg.acting;
5669 if (pg.is_ec_pg())
5670 out << "p" << pg.get_primary();
5671 out << " r=" << pg.get_role();
5672 out << " lpr=" << pg.get_last_peering_reset();
5673
5674 if (!pg.past_intervals.empty()) {
5675 out << " pi=[" << pg.past_intervals.get_bounds()
5676 << ")/" << pg.past_intervals.size();
5677 }
5678
5679 if (pg.is_peered()) {
5680 if (pg.last_update_ondisk != pg.info.last_update)
5681 out << " luod=" << pg.last_update_ondisk;
5682 if (pg.last_update_applied != pg.info.last_update)
5683 out << " lua=" << pg.last_update_applied;
5684 }
5685
5686 if (pg.recovery_ops_active)
5687 out << " rops=" << pg.recovery_ops_active;
5688
5689 if (pg.pg_log.get_tail() != pg.info.log_tail ||
5690 pg.pg_log.get_head() != pg.info.last_update)
5691 out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5692
5693 if (!pg.pg_log.get_log().empty()) {
5694 if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5695 out << " (log bound mismatch, actual=["
5696 << pg.pg_log.get_log().log.begin()->version << ","
5697 << pg.pg_log.get_log().log.rbegin()->version << "]";
5698 out << ")";
5699 }
5700 }
5701
5702 if (!pg.backfill_targets.empty())
5703 out << " bft=" << pg.backfill_targets;
5704 out << " crt=" << pg.pg_log.get_can_rollback_to();
5705
5706 if (pg.last_complete_ondisk != pg.info.last_complete)
5707 out << " lcod " << pg.last_complete_ondisk;
5708
5709 if (pg.is_primary()) {
5710 out << " mlcod " << pg.min_last_complete_ondisk;
5711 }
5712
5713 out << " " << pg_state_string(pg.get_state());
5714 if (pg.should_send_notify())
5715 out << " NOTIFY";
5716
5717 if (pg.scrubber.must_repair)
5718 out << " MUST_REPAIR";
5719 if (pg.scrubber.auto_repair)
5720 out << " AUTO_REPAIR";
5721 if (pg.scrubber.must_deep_scrub)
5722 out << " MUST_DEEP_SCRUB";
5723 if (pg.scrubber.must_scrub)
5724 out << " MUST_SCRUB";
5725
5726 //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5727 if (pg.pg_log.get_missing().num_missing()) {
5728 out << " m=" << pg.pg_log.get_missing().num_missing();
5729 if (pg.is_primary()) {
5730 uint64_t unfound = pg.get_num_unfound();
5731 if (unfound)
5732 out << " u=" << unfound;
5733 }
5734 }
5735 if (pg.snap_trimq.size())
5736 out << " snaptrimq=" << pg.snap_trimq;
5737
5738 out << "]";
5739
5740
5741 return out;
5742 }
5743
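// Decide whether a client op can be dropped outright: the client has
// disconnected (and osd_discard_disconnected_ops is set), the op predates the
// current primary interval, or it predates a forced-resend epoch / the last
// split -- which of those checks apply depends on the client's
// RESEND_ON_SPLIT / OSD_POOLRESEND feature bits.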
5744 bool PG::can_discard_op(OpRequestRef& op)
5745 {
5746 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5747 if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5748 dout(20) << " discard " << *m << dendl;
5749 return true;
5750 }
5751
5752 if (m->get_map_epoch() < info.history.same_primary_since) {
5753 dout(7) << " changed after " << m->get_map_epoch()
5754 << ", dropping " << *m << dendl;
5755 return true;
5756 }
5757
5758 if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5759 if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5760 dout(7) << __func__ << " sent before last_force_op_resend "
5761 << pool.info.last_force_op_resend << ", dropping " << *m << dendl;
5762 return true;
5763 }
5764 if (m->get_map_epoch() < info.history.last_epoch_split) {
5765 dout(7) << __func__ << " pg split in "
5766 << info.history.last_epoch_split << ", dropping" << dendl;
5767 return true;
5768 }
5769 } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5770 if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5771 dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5772 << pool.info.last_force_op_resend_preluminous
5773 << ", dropping" << *m << dendl;
5774 return true;
5775 }
5776 }
5777
5778 return false;
5779 }
5780
5781 template<typename T, int MSGTYPE>
5782 bool PG::can_discard_replica_op(OpRequestRef& op)
5783 {
5784 const T *m = static_cast<const T *>(op->get_req());
5785 assert(m->get_type() == MSGTYPE);
5786
5787 int from = m->get_source().num();
5788
5789 // if a repop is replied after a replica goes down in a new osdmap, and
5790 // before the pg advances to this new osdmap, the repop replies before this
5791 // repop can be discarded by that replica OSD, because the primary resets the
5792 // connection to it when handling the new osdmap marking it down, and also
5793 // resets the messenger session when the replica reconnects. to avoid the
5794 // out-of-order replies, the messages from that replica should be discarded.
5795 if (osd->get_osdmap()->is_down(from))
5796 return true;
5797 /* Mostly, this overlaps with the old_peering_msg
5798 * condition. An important exception is pushes
5799 * sent by replicas not in the acting set, since
5800 * if such a replica goes down it does not cause
5801 * a new interval. */
5802 if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5803 return true;
5804
5805 // same pg?
5806 // if pg changes _at all_, we reset and repeer!
5807 if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5808 dout(10) << "can_discard_replica_op pg changed " << info.history
5809 << " after " << m->map_epoch
5810 << ", dropping" << dendl;
5811 return true;
5812 }
5813 return false;
5814 }
5815
5816 bool PG::can_discard_scan(OpRequestRef op)
5817 {
5818 const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5819 assert(m->get_type() == MSG_OSD_PG_SCAN);
5820
5821 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5822 dout(10) << " got old scan, ignoring" << dendl;
5823 return true;
5824 }
5825 return false;
5826 }
5827
5828 bool PG::can_discard_backfill(OpRequestRef op)
5829 {
5830 const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5831 assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5832
5833 if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5834 dout(10) << " got old backfill, ignoring" << dendl;
5835 return true;
5836 }
5837
5838 return false;
5839
5840 }
5841
5842 bool PG::can_discard_request(OpRequestRef& op)
5843 {
5844 switch (op->get_req()->get_type()) {
5845 case CEPH_MSG_OSD_OP:
5846 return can_discard_op(op);
5847 case CEPH_MSG_OSD_BACKOFF:
5848 return false; // never discard
5849 case MSG_OSD_SUBOP:
5850 return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5851 case MSG_OSD_REPOP:
5852 return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5853 case MSG_OSD_PG_PUSH:
5854 return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5855 case MSG_OSD_PG_PULL:
5856 return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5857 case MSG_OSD_PG_PUSH_REPLY:
5858 return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5859 case MSG_OSD_SUBOPREPLY:
5860 return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5861 case MSG_OSD_REPOPREPLY:
5862 return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5863 case MSG_OSD_PG_RECOVERY_DELETE:
5864 return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
5865
5866 case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
5867 return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
5868
5869 case MSG_OSD_EC_WRITE:
5870 return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5871 case MSG_OSD_EC_WRITE_REPLY:
5872 return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5873 case MSG_OSD_EC_READ:
5874 return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5875 case MSG_OSD_EC_READ_REPLY:
5876 return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5877 case MSG_OSD_REP_SCRUB:
5878 return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5879 case MSG_OSD_SCRUB_RESERVE:
5880 return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5881 case MSG_OSD_REP_SCRUBMAP:
5882 return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5883 case MSG_OSD_PG_UPDATE_LOG_MISSING:
5884 return can_discard_replica_op<
5885 MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5886 case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5887 return can_discard_replica_op<
5888 MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5889
5890 case MSG_OSD_PG_SCAN:
5891 return can_discard_scan(op);
5892 case MSG_OSD_PG_BACKFILL:
5893 return can_discard_backfill(op);
5894 case MSG_OSD_PG_BACKFILL_REMOVE:
5895 return can_discard_replica_op<MOSDPGBackfillRemove,
5896 MSG_OSD_PG_BACKFILL_REMOVE>(op);
5897 }
5898 return true;
5899 }
5900
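// Requeue ops that were waiting for a newer osdmap and move deferred peering
// events from peering_waiters back onto peering_queue, scheduling one peering
// run per requeued event.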
5901 void PG::take_waiters()
5902 {
5903 dout(10) << "take_waiters" << dendl;
5904 requeue_map_waiters();
5905 for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5906 i != peering_waiters.end();
5907 ++i) osd->queue_for_peering(this);
5908 peering_queue.splice(peering_queue.begin(), peering_waiters,
5909 peering_waiters.begin(), peering_waiters.end());
5910 }
5911
5912 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5913 {
5914 dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5915 if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5916 dout(10) << "deferring event " << evt->get_desc() << dendl;
5917 peering_waiters.push_back(evt);
5918 return;
5919 }
5920 if (old_peering_evt(evt))
5921 return;
5922 recovery_state.handle_event(evt, rctx);
5923 }
5924
5925 void PG::queue_peering_event(CephPeeringEvtRef evt)
5926 {
5927 if (old_peering_evt(evt))
5928 return;
5929 peering_queue.push_back(evt);
5930 osd->queue_for_peering(this);
5931 }
5932
5933 void PG::queue_null(epoch_t msg_epoch,
5934 epoch_t query_epoch)
5935 {
5936 dout(10) << "null" << dendl;
5937 queue_peering_event(
5938 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5939 NullEvt())));
5940 }
5941
5942 void PG::queue_flushed(epoch_t e)
5943 {
5944 dout(10) << "flushed" << dendl;
5945 queue_peering_event(
5946 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5947 FlushedEvt())));
5948 }
5949
5950 void PG::queue_query(epoch_t msg_epoch,
5951 epoch_t query_epoch,
5952 pg_shard_t from, const pg_query_t& q)
5953 {
5954 dout(10) << "handle_query " << q << " from replica " << from << dendl;
5955 queue_peering_event(
5956 CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5957 MQuery(from, q, query_epoch))));
5958 }
5959
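// Advance this PG from its current cached osdmap (asserted to equal lastmap)
// to osdmap: refresh the cached pool info, optionally cross-check the pool's
// removed-snaps cache when osd_debug_verify_cached_snaps is set, then drive
// the recovery state machine with an AdvMap event. If the pool itself changed
// in this epoch, notify the backend and refresh per-collection store options.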
5960 void PG::handle_advance_map(
5961 OSDMapRef osdmap, OSDMapRef lastmap,
5962 vector<int>& newup, int up_primary,
5963 vector<int>& newacting, int acting_primary,
5964 RecoveryCtx *rctx)
5965 {
5966 assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5967 assert(lastmap == osdmap_ref);
5968 dout(10) << "handle_advance_map "
5969 << newup << "/" << newacting
5970 << " -- " << up_primary << "/" << acting_primary
5971 << dendl;
5972 update_osdmap_ref(osdmap);
5973 pool.update(osdmap);
5974 past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5975 if (cct->_conf->osd_debug_verify_cached_snaps) {
5976 interval_set<snapid_t> actual_removed_snaps;
5977 const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5978 assert(pi);
5979 pi->build_removed_snaps(actual_removed_snaps);
5980 if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5981 derr << __func__ << ": mismatch between the actual removed snaps "
5982 << actual_removed_snaps
5983 << " and pool.cached_removed_snaps " << pool.cached_removed_snaps
5984 << dendl;
5985 }
5986 assert(actual_removed_snaps == pool.cached_removed_snaps);
5987 }
5988 AdvMap evt(
5989 osdmap, lastmap, newup, up_primary,
5990 newacting, acting_primary);
5991 recovery_state.handle_event(evt, rctx);
5992 if (pool.info.last_change == osdmap_ref->get_epoch()) {
5993 on_pool_change();
5994 update_store_with_options();
5995 }
5996 }
5997
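// After the map advance, drive the state machine with an ActMap event. If too
// many epochs have passed since the pg info was last persisted
// (osd_pg_epoch_persisted_max_stale), dirty it so it is written again, and
// re-check registered watchers against any new blacklist entries.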
5998 void PG::handle_activate_map(RecoveryCtx *rctx)
5999 {
6000 dout(10) << "handle_activate_map " << dendl;
6001 ActMap evt;
6002 recovery_state.handle_event(evt, rctx);
6003 if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
6004 cct->_conf->osd_pg_epoch_persisted_max_stale) {
6005 dout(20) << __func__ << ": Dirtying info: last_persisted is "
6006 << last_persisted_osdmap_ref->get_epoch()
6007 << " while current is " << osdmap_ref->get_epoch() << dendl;
6008 dirty_info = true;
6009 } else {
6010 dout(20) << __func__ << ": Not dirtying info: last_persisted is "
6011 << last_persisted_osdmap_ref->get_epoch()
6012 << " while current is " << osdmap_ref->get_epoch() << dendl;
6013 }
6014 if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
6015 }
6016
6017 void PG::handle_loaded(RecoveryCtx *rctx)
6018 {
6019 dout(10) << "handle_loaded" << dendl;
6020 Load evt;
6021 recovery_state.handle_event(evt, rctx);
6022 }
6023
6024 void PG::handle_create(RecoveryCtx *rctx)
6025 {
6026 dout(10) << "handle_create" << dendl;
6027 rctx->created_pgs.insert(this);
6028 Initialize evt;
6029 recovery_state.handle_event(evt, rctx);
6030 ActMap evt2;
6031 recovery_state.handle_event(evt2, rctx);
6032
6033 rctx->on_applied->add(make_lambda_context([this]() {
6034 update_store_with_options();
6035 }));
6036 }
6037
6038 void PG::handle_query_state(Formatter *f)
6039 {
6040 dout(10) << "handle_query_state" << dendl;
6041 QueryState q(f);
6042 recovery_state.handle_event(q, 0);
6043 }
6044
6045 void PG::update_store_with_options()
6046 {
6047 auto r = osd->store->set_collection_opts(coll, pool.info.opts);
6048 if (r < 0 && r != -EOPNOTSUPP) {
6049 derr << __func__ << " set_collection_opts returns error: " << r << dendl;
6050 }
6051 }
6052
6053 void PG::update_store_on_load()
6054 {
6055 if (osd->store->get_type() == "filestore") {
6056 // legacy filestore didn't store collection bit width; fix.
6057 int bits = osd->store->collection_bits(coll);
6058 if (bits < 0) {
6059 assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
6060 bits = info.pgid.get_split_bits(pool.info.get_pg_num());
6061 lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
6062 ObjectStore::Transaction t;
6063 t.collection_set_bits(coll, bits);
6064 osd->store->apply_transaction(osr.get(), std::move(t));
6065 }
6066 }
6067 }
6068
6069 /*------------ Recovery State Machine----------------*/
6070 #undef dout_prefix
6071 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
6072 << "state<" << get_state_name() << ">: ")
6073
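// The states below are boost::statechart reactions driven through
// RecoveryState::handle_event(). Each state logs entry/exit via
// RecoveryMachine::log_enter()/log_exit() and, on exit, records its residency
// time in a per-state recoverystate_perf latency counter. Constructors may
// post_event() to chain straight into the next state (e.g. Start posts
// MakePrimary or MakeStray).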
6074 /*------Crashed-------*/
6075 PG::RecoveryState::Crashed::Crashed(my_context ctx)
6076 : my_base(ctx),
6077 NamedState(context< RecoveryMachine >().pg, "Crashed")
6078 {
6079 context< RecoveryMachine >().log_enter(state_name);
6080 assert(0 == "we got a bad state machine event");
6081 }
6082
6083
6084 /*------Initial-------*/
6085 PG::RecoveryState::Initial::Initial(my_context ctx)
6086 : my_base(ctx),
6087 NamedState(context< RecoveryMachine >().pg, "Initial")
6088 {
6089 context< RecoveryMachine >().log_enter(state_name);
6090 }
6091
6092 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6093 {
6094 PG *pg = context< RecoveryMachine >().pg;
6095
6096 // do we tell someone we're here?
6097 pg->send_notify = (!pg->is_primary());
6098 pg->update_store_with_options();
6099
6100 pg->update_store_on_load();
6101
6102 return transit< Reset >();
6103 }
6104
6105 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6106 {
6107 PG *pg = context< RecoveryMachine >().pg;
6108 pg->proc_replica_info(
6109 notify.from, notify.notify.info, notify.notify.epoch_sent);
6110 pg->set_last_peering_reset();
6111 return transit< Primary >();
6112 }
6113
6114 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6115 {
6116 PG *pg = context< RecoveryMachine >().pg;
6117 assert(!pg->is_primary());
6118 post_event(i);
6119 return transit< Stray >();
6120 }
6121
6122 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6123 {
6124 PG *pg = context< RecoveryMachine >().pg;
6125 assert(!pg->is_primary());
6126 post_event(i);
6127 return transit< Stray >();
6128 }
6129
6130 void PG::RecoveryState::Initial::exit()
6131 {
6132 context< RecoveryMachine >().log_exit(state_name, enter_time);
6133 PG *pg = context< RecoveryMachine >().pg;
6134 utime_t dur = ceph_clock_now() - enter_time;
6135 pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6136 }
6137
6138 /*------Started-------*/
6139 PG::RecoveryState::Started::Started(my_context ctx)
6140 : my_base(ctx),
6141 NamedState(context< RecoveryMachine >().pg, "Started")
6142 {
6143 context< RecoveryMachine >().log_enter(state_name);
6144 }
6145
6146 boost::statechart::result
6147 PG::RecoveryState::Started::react(const IntervalFlush&)
6148 {
6149 PG *pg = context< RecoveryMachine >().pg;
6150 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6151 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6152 return discard_event();
6153 }
6154
6155
6156 boost::statechart::result
6157 PG::RecoveryState::Started::react(const FlushedEvt&)
6158 {
6159 PG *pg = context< RecoveryMachine >().pg;
6160 pg->on_flushed();
6161 return discard_event();
6162 }
6163
6164
6165 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6166 {
6167 PG *pg = context< RecoveryMachine >().pg;
6168 ldout(pg->cct, 10) << "Started advmap" << dendl;
6169 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6170 if (pg->should_restart_peering(
6171 advmap.up_primary,
6172 advmap.acting_primary,
6173 advmap.newup,
6174 advmap.newacting,
6175 advmap.lastmap,
6176 advmap.osdmap)) {
6177 ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6178 << dendl;
6179 post_event(advmap);
6180 return transit< Reset >();
6181 }
6182 pg->remove_down_peer_info(advmap.osdmap);
6183 return discard_event();
6184 }
6185
6186 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6187 {
6188 q.f->open_object_section("state");
6189 q.f->dump_string("name", state_name);
6190 q.f->dump_stream("enter_time") << enter_time;
6191 q.f->close_section();
6192 return discard_event();
6193 }
6194
6195 void PG::RecoveryState::Started::exit()
6196 {
6197 context< RecoveryMachine >().log_exit(state_name, enter_time);
6198 PG *pg = context< RecoveryMachine >().pg;
6199 utime_t dur = ceph_clock_now() - enter_time;
6200 pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6201 }
6202
6203 /*--------Reset---------*/
6204 PG::RecoveryState::Reset::Reset(my_context ctx)
6205 : my_base(ctx),
6206 NamedState(context< RecoveryMachine >().pg, "Reset")
6207 {
6208 context< RecoveryMachine >().log_enter(state_name);
6209 PG *pg = context< RecoveryMachine >().pg;
6210
6211 pg->flushes_in_progress = 0;
6212 pg->set_last_peering_reset();
6213 }
6214
6215 boost::statechart::result
6216 PG::RecoveryState::Reset::react(const FlushedEvt&)
6217 {
6218 PG *pg = context< RecoveryMachine >().pg;
6219 pg->on_flushed();
6220 return discard_event();
6221 }
6222
6223 boost::statechart::result
6224 PG::RecoveryState::Reset::react(const IntervalFlush&)
6225 {
6226 PG *pg = context< RecoveryMachine >().pg;
6227 ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6228 context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6229 return discard_event();
6230 }
6231
6232 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6233 {
6234 PG *pg = context< RecoveryMachine >().pg;
6235 ldout(pg->cct, 10) << "Reset advmap" << dendl;
6236
6237 pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6238
6239 if (pg->should_restart_peering(
6240 advmap.up_primary,
6241 advmap.acting_primary,
6242 advmap.newup,
6243 advmap.newacting,
6244 advmap.lastmap,
6245 advmap.osdmap)) {
6246 ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6247 << dendl;
6248 pg->start_peering_interval(
6249 advmap.lastmap,
6250 advmap.newup, advmap.up_primary,
6251 advmap.newacting, advmap.acting_primary,
6252 context< RecoveryMachine >().get_cur_transaction());
6253 }
6254 pg->remove_down_peer_info(advmap.osdmap);
6255 pg->check_past_interval_bounds();
6256 return discard_event();
6257 }
6258
6259 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6260 {
6261 PG *pg = context< RecoveryMachine >().pg;
6262 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6263 context< RecoveryMachine >().send_notify(
6264 pg->get_primary(),
6265 pg_notify_t(
6266 pg->get_primary().shard, pg->pg_whoami.shard,
6267 pg->get_osdmap()->get_epoch(),
6268 pg->get_osdmap()->get_epoch(),
6269 pg->info),
6270 pg->past_intervals);
6271 }
6272
6273 pg->update_heartbeat_peers();
6274 pg->take_waiters();
6275
6276 return transit< Started >();
6277 }
6278
6279 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6280 {
6281 q.f->open_object_section("state");
6282 q.f->dump_string("name", state_name);
6283 q.f->dump_stream("enter_time") << enter_time;
6284 q.f->close_section();
6285 return discard_event();
6286 }
6287
6288 void PG::RecoveryState::Reset::exit()
6289 {
6290 context< RecoveryMachine >().log_exit(state_name, enter_time);
6291 PG *pg = context< RecoveryMachine >().pg;
6292 utime_t dur = ceph_clock_now() - enter_time;
6293 pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6294 }
6295
6296 /*-------Start---------*/
6297 PG::RecoveryState::Start::Start(my_context ctx)
6298 : my_base(ctx),
6299 NamedState(context< RecoveryMachine >().pg, "Start")
6300 {
6301 context< RecoveryMachine >().log_enter(state_name);
6302
6303 PG *pg = context< RecoveryMachine >().pg;
6304 if (pg->is_primary()) {
6305 ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6306 post_event(MakePrimary());
6307 } else { //is_stray
6308 ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6309 post_event(MakeStray());
6310 }
6311 }
6312
6313 void PG::RecoveryState::Start::exit()
6314 {
6315 context< RecoveryMachine >().log_exit(state_name, enter_time);
6316 PG *pg = context< RecoveryMachine >().pg;
6317 utime_t dur = ceph_clock_now() - enter_time;
6318 pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6319 }
6320
6321 /*---------Primary--------*/
6322 PG::RecoveryState::Primary::Primary(my_context ctx)
6323 : my_base(ctx),
6324 NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6325 {
6326 context< RecoveryMachine >().log_enter(state_name);
6327 PG *pg = context< RecoveryMachine >().pg;
6328 assert(pg->want_acting.empty());
6329
6330 // set CREATING bit until we have peered for the first time.
6331 if (pg->info.history.last_epoch_started == 0) {
6332 pg->state_set(PG_STATE_CREATING);
6333 // use the history timestamp, which ultimately comes from the
6334 // monitor in the create case.
6335 utime_t t = pg->info.history.last_scrub_stamp;
6336 pg->info.stats.last_fresh = t;
6337 pg->info.stats.last_active = t;
6338 pg->info.stats.last_change = t;
6339 pg->info.stats.last_peered = t;
6340 pg->info.stats.last_clean = t;
6341 pg->info.stats.last_unstale = t;
6342 pg->info.stats.last_undegraded = t;
6343 pg->info.stats.last_fullsized = t;
6344 pg->info.stats.last_scrub_stamp = t;
6345 pg->info.stats.last_deep_scrub_stamp = t;
6346 pg->info.stats.last_clean_scrub_stamp = t;
6347 }
6348 }
6349
6350 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6351 {
6352 PG *pg = context< RecoveryMachine >().pg;
6353 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6354 pg->proc_replica_info(
6355 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6356 return discard_event();
6357 }
6358
6359 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6360 {
6361 PG *pg = context< RecoveryMachine >().pg;
6362 ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6363 pg->publish_stats_to_osd();
6364 pg->take_waiters();
6365 return discard_event();
6366 }
6367
6368 void PG::RecoveryState::Primary::exit()
6369 {
6370 context< RecoveryMachine >().log_exit(state_name, enter_time);
6371 PG *pg = context< RecoveryMachine >().pg;
6372 pg->want_acting.clear();
6373 utime_t dur = ceph_clock_now() - enter_time;
6374 pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6375 pg->clear_primary_state();
6376 pg->state_clear(PG_STATE_CREATING);
6377 }
6378
6379 /*---------Peering--------*/
6380 PG::RecoveryState::Peering::Peering(my_context ctx)
6381 : my_base(ctx),
6382 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6383 history_les_bound(false)
6384 {
6385 context< RecoveryMachine >().log_enter(state_name);
6386
6387 PG *pg = context< RecoveryMachine >().pg;
6388 assert(!pg->is_peered());
6389 assert(!pg->is_peering());
6390 assert(pg->is_primary());
6391 pg->state_set(PG_STATE_PEERING);
6392 }
6393
6394 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap)
6395 {
6396 PG *pg = context< RecoveryMachine >().pg;
6397 ldout(pg->cct, 10) << "Peering advmap" << dendl;
6398 if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6399 ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6400 post_event(advmap);
6401 return transit< Reset >();
6402 }
6403
6404 pg->adjust_need_up_thru(advmap.osdmap);
6405
6406 return forward_event();
6407 }
6408
6409 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6410 {
6411 PG *pg = context< RecoveryMachine >().pg;
6412
6413 q.f->open_object_section("state");
6414 q.f->dump_string("name", state_name);
6415 q.f->dump_stream("enter_time") << enter_time;
6416
6417 q.f->open_array_section("past_intervals");
6418 pg->past_intervals.dump(q.f);
6419 q.f->close_section();
6420
6421 q.f->open_array_section("probing_osds");
6422 for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6423 p != prior_set.probe.end();
6424 ++p)
6425 q.f->dump_stream("osd") << *p;
6426 q.f->close_section();
6427
6428 if (prior_set.pg_down)
6429 q.f->dump_string("blocked", "peering is blocked due to down osds");
6430
6431 q.f->open_array_section("down_osds_we_would_probe");
6432 for (set<int>::iterator p = prior_set.down.begin();
6433 p != prior_set.down.end();
6434 ++p)
6435 q.f->dump_int("osd", *p);
6436 q.f->close_section();
6437
6438 q.f->open_array_section("peering_blocked_by");
6439 for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6440 p != prior_set.blocked_by.end();
6441 ++p) {
6442 q.f->open_object_section("osd");
6443 q.f->dump_int("osd", p->first);
6444 q.f->dump_int("current_lost_at", p->second);
6445 q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6446 q.f->close_section();
6447 }
6448 q.f->close_section();
6449
6450 if (history_les_bound) {
6451 q.f->open_array_section("peering_blocked_by_detail");
6452 q.f->open_object_section("item");
6453 q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6454 q.f->close_section();
6455 q.f->close_section();
6456 }
6457
6458 q.f->close_section();
6459 return forward_event();
6460 }
6461
6462 void PG::RecoveryState::Peering::exit()
6463 {
6464 PG *pg = context< RecoveryMachine >().pg;
6465 ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6466 context< RecoveryMachine >().log_exit(state_name, enter_time);
6467 pg->state_clear(PG_STATE_PEERING);
6468 pg->clear_probe_targets();
6469
6470 utime_t dur = ceph_clock_now() - enter_time;
6471 pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6472 }
6473
6474
6475 /*------Backfilling-------*/
6476 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6477 : my_base(ctx),
6478 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6479 {
6480 context< RecoveryMachine >().log_enter(state_name);
6481 PG *pg = context< RecoveryMachine >().pg;
6482 pg->backfill_reserved = true;
6483 pg->queue_recovery();
6484 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6485 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6486 pg->state_set(PG_STATE_BACKFILLING);
6487 pg->publish_stats_to_osd();
6488 }
6489
6490 boost::statechart::result
6491 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
6492 {
6493 PG *pg = context< RecoveryMachine >().pg;
6494 ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
6495 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6496
6497 pg->state_set(PG_STATE_BACKFILL_WAIT);
6498 pg->state_clear(PG_STATE_BACKFILLING);
6499
6500 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6501 it != pg->backfill_targets.end();
6502 ++it) {
6503 assert(*it != pg->pg_whoami);
6504 ConnectionRef con = pg->osd->get_con_osd_cluster(
6505 it->osd, pg->get_osdmap()->get_epoch());
6506 if (con) {
6507 pg->osd->send_message_osd_cluster(
6508 new MBackfillReserve(
6509 MBackfillReserve::REJECT,
6510 spg_t(pg->info.pgid.pgid, it->shard),
6511 pg->get_osdmap()->get_epoch()),
6512 con.get());
6513 }
6514 }
6515
6516
6517 if (!pg->waiting_on_backfill.empty()) {
6518 pg->waiting_on_backfill.clear();
6519 pg->finish_recovery_op(hobject_t::get_max());
6520 }
6521
6522 pg->schedule_backfill_retry(c.delay);
6523 return transit<NotBackfilling>();
6524 }
6525
6526 boost::statechart::result
6527 PG::RecoveryState::Backfilling::react(const UnfoundBackfill &c)
6528 {
6529 PG *pg = context< RecoveryMachine >().pg;
6530 ldout(pg->cct, 10) << "backfill has unfound, can't continue" << dendl;
6531 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6532
6533 pg->state_set(PG_STATE_BACKFILL_UNFOUND);
6534 pg->state_clear(PG_STATE_BACKFILLING);
6535
6536 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6537 it != pg->backfill_targets.end();
6538 ++it) {
6539 assert(*it != pg->pg_whoami);
6540 ConnectionRef con = pg->osd->get_con_osd_cluster(
6541 it->osd, pg->get_osdmap()->get_epoch());
6542 if (con) {
6543 pg->osd->send_message_osd_cluster(
6544 new MBackfillReserve(
6545 MBackfillReserve::REJECT,
6546 spg_t(pg->info.pgid.pgid, it->shard),
6547 pg->get_osdmap()->get_epoch()),
6548 con.get());
6549 }
6550 }
6551
6552 pg->waiting_on_backfill.clear();
6553
6554 return transit<NotBackfilling>();
6555 }
6556
6557 boost::statechart::result
6558 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6559 {
6560 PG *pg = context< RecoveryMachine >().pg;
6561 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6562 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6563
6564 for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6565 it != pg->backfill_targets.end();
6566 ++it) {
6567 assert(*it != pg->pg_whoami);
6568 ConnectionRef con = pg->osd->get_con_osd_cluster(
6569 it->osd, pg->get_osdmap()->get_epoch());
6570 if (con) {
6571 pg->osd->send_message_osd_cluster(
6572 new MBackfillReserve(
6573 MBackfillReserve::REJECT,
6574 spg_t(pg->info.pgid.pgid, it->shard),
6575 pg->get_osdmap()->get_epoch()),
6576 con.get());
6577 }
6578 }
6579
6580 if (!pg->waiting_on_backfill.empty()) {
6581 pg->waiting_on_backfill.clear();
6582 pg->finish_recovery_op(hobject_t::get_max());
6583 }
6584
6585 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6586 return transit<NotBackfilling>();
6587 }
6588
6589 void PG::RecoveryState::Backfilling::exit()
6590 {
6591 context< RecoveryMachine >().log_exit(state_name, enter_time);
6592 PG *pg = context< RecoveryMachine >().pg;
6593 pg->backfill_reserved = false;
6594 pg->backfill_reserving = false;
6595 pg->state_clear(PG_STATE_BACKFILLING);
6596 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6597 utime_t dur = ceph_clock_now() - enter_time;
6598 pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6599 }
6600
6601 /*--WaitRemoteBackfillReserved--*/
6602
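// Reservation fan-out: the constructor posts a RemoteBackfillReserved event to
// prime the loop; each reaction sends a reservation REQUEST to the next shard
// in remote_shards_to_reserve_backfill and advances the iterator (remote
// grants come back as further RemoteBackfillReserved events). Once the
// iterator is exhausted, AllBackfillsReserved is posted.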
6603 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6604 : my_base(ctx),
6605 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6606 backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6607 {
6608 context< RecoveryMachine >().log_enter(state_name);
6609 PG *pg = context< RecoveryMachine >().pg;
6610 pg->state_set(PG_STATE_BACKFILL_WAIT);
6611 pg->publish_stats_to_osd();
6612 post_event(RemoteBackfillReserved());
6613 }
6614
6615 boost::statechart::result
6616 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6617 {
6618 PG *pg = context< RecoveryMachine >().pg;
6619
6620 if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6621 //The primary never backfills itself
6622 assert(*backfill_osd_it != pg->pg_whoami);
6623 ConnectionRef con = pg->osd->get_con_osd_cluster(
6624 backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6625 if (con) {
6626 pg->osd->send_message_osd_cluster(
6627 new MBackfillReserve(
6628 MBackfillReserve::REQUEST,
6629 spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6630 pg->get_osdmap()->get_epoch(),
6631 pg->get_backfill_priority()),
6632 con.get());
6633 }
6634 ++backfill_osd_it;
6635 } else {
6636 post_event(AllBackfillsReserved());
6637 }
6638 return discard_event();
6639 }
6640
6641 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6642 {
6643 context< RecoveryMachine >().log_exit(state_name, enter_time);
6644 PG *pg = context< RecoveryMachine >().pg;
6645 utime_t dur = ceph_clock_now() - enter_time;
6646 pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6647 }
6648
6649 boost::statechart::result
6650 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6651 {
6652 PG *pg = context< RecoveryMachine >().pg;
6653 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6654
6655 // Send REJECT to all previously acquired reservations
6656 set<pg_shard_t>::const_iterator it, begin, end, next;
6657 begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6658 end = context< Active >().remote_shards_to_reserve_backfill.end();
6659 assert(begin != end);
6660 for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6661 //The primary never backfills itself
6662 assert(*it != pg->pg_whoami);
6663 ConnectionRef con = pg->osd->get_con_osd_cluster(
6664 it->osd, pg->get_osdmap()->get_epoch());
6665 if (con) {
6666 pg->osd->send_message_osd_cluster(
6667 new MBackfillReserve(
6668 MBackfillReserve::REJECT,
6669 spg_t(pg->info.pgid.pgid, it->shard),
6670 pg->get_osdmap()->get_epoch()),
6671 con.get());
6672 }
6673 }
6674
6675 pg->state_clear(PG_STATE_BACKFILL_WAIT);
6676 pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6677 pg->publish_stats_to_osd();
6678
6679 pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6680
6681 return transit<NotBackfilling>();
6682 }
6683
6684 /*--WaitLocalBackfillReserved--*/
6685 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6686 : my_base(ctx),
6687 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6688 {
6689 context< RecoveryMachine >().log_enter(state_name);
6690 PG *pg = context< RecoveryMachine >().pg;
6691 pg->state_set(PG_STATE_BACKFILL_WAIT);
6692 pg->osd->local_reserver.request_reservation(
6693 pg->info.pgid,
6694 new QueuePeeringEvt<LocalBackfillReserved>(
6695 pg, pg->get_osdmap()->get_epoch(),
6696 LocalBackfillReserved()),
6697 pg->get_backfill_priority(),
6698 new QueuePeeringEvt<DeferBackfill>(
6699 pg, pg->get_osdmap()->get_epoch(),
6700 DeferBackfill(0.0)));
6701 pg->publish_stats_to_osd();
6702 }
6703
6704 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6705 {
6706 context< RecoveryMachine >().log_exit(state_name, enter_time);
6707 PG *pg = context< RecoveryMachine >().pg;
6708 utime_t dur = ceph_clock_now() - enter_time;
6709 pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6710 }
6711
6712 /*----NotBackfilling------*/
6713 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6714 : my_base(ctx),
6715 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6716 {
6717 context< RecoveryMachine >().log_enter(state_name);
6718 PG *pg = context< RecoveryMachine >().pg;
6719 pg->publish_stats_to_osd();
6720 }
6721
6722 boost::statechart::result
6723 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6724 {
6725 return discard_event();
6726 }
6727
6728 boost::statechart::result
6729 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6730 {
6731 return discard_event();
6732 }
6733
6734 void PG::RecoveryState::NotBackfilling::exit()
6735 {
6736 context< RecoveryMachine >().log_exit(state_name, enter_time);
6737 PG *pg = context< RecoveryMachine >().pg;
6738 pg->state_clear(PG_STATE_BACKFILL_UNFOUND);
6739 utime_t dur = ceph_clock_now() - enter_time;
6740 pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6741 }
6742
6743 /*----NotRecovering------*/
6744 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6745 : my_base(ctx),
6746 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6747 {
6748 context< RecoveryMachine >().log_enter(state_name);
6749 PG *pg = context< RecoveryMachine >().pg;
6750 pg->publish_stats_to_osd();
6751 }
6752
6753 void PG::RecoveryState::NotRecovering::exit()
6754 {
6755 context< RecoveryMachine >().log_exit(state_name, enter_time);
6756 PG *pg = context< RecoveryMachine >().pg;
6757 pg->state_clear(PG_STATE_RECOVERY_UNFOUND);
6758 utime_t dur = ceph_clock_now() - enter_time;
6759 pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6760 }
6761
6762 /*---RepNotRecovering----*/
6763 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6764 : my_base(ctx),
6765 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6766 {
6767 context< RecoveryMachine >().log_enter(state_name);
6768 }
6769
6770 boost::statechart::result
6771 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
6772 {
6773 PG *pg = context< RecoveryMachine >().pg;
6774 pg->reject_reservation();
6775 post_event(RemoteReservationRejected());
6776 return discard_event();
6777 }
6778
6779 void PG::RecoveryState::RepNotRecovering::exit()
6780 {
6781 context< RecoveryMachine >().log_exit(state_name, enter_time);
6782 PG *pg = context< RecoveryMachine >().pg;
6783 utime_t dur = ceph_clock_now() - enter_time;
6784 pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6785 }
6786
6787 /*---RepWaitRecoveryReserved--*/
6788 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6789 : my_base(ctx),
6790 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6791 {
6792 context< RecoveryMachine >().log_enter(state_name);
6793 PG *pg = context< RecoveryMachine >().pg;
6794
6795 pg->osd->remote_reserver.request_reservation(
6796 pg->info.pgid,
6797 new QueuePeeringEvt<RemoteRecoveryReserved>(
6798 pg, pg->get_osdmap()->get_epoch(),
6799 RemoteRecoveryReserved()),
6800 pg->get_recovery_priority());
6801 }
6802
6803 boost::statechart::result
6804 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6805 {
6806 PG *pg = context< RecoveryMachine >().pg;
6807 pg->osd->send_message_osd_cluster(
6808 pg->primary.osd,
6809 new MRecoveryReserve(
6810 MRecoveryReserve::GRANT,
6811 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6812 pg->get_osdmap()->get_epoch()),
6813 pg->get_osdmap()->get_epoch());
6814 return transit<RepRecovering>();
6815 }
6816
6817 boost::statechart::result
6818 PG::RecoveryState::RepWaitRecoveryReserved::react(
6819 const RemoteReservationCanceled &evt)
6820 {
6821 PG *pg = context< RecoveryMachine >().pg;
6822 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6823 return transit<RepNotRecovering>();
6824 }
6825
6826 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6827 {
6828 context< RecoveryMachine >().log_exit(state_name, enter_time);
6829 PG *pg = context< RecoveryMachine >().pg;
6830 utime_t dur = ceph_clock_now() - enter_time;
6831 pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6832 }
6833
6834 /*-RepWaitBackfillReserved*/
6835 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6836 : my_base(ctx),
6837 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6838 {
6839 context< RecoveryMachine >().log_enter(state_name);
6840 }
6841
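// Replica-side handling of the primary's backfill reservation request: reject
// immediately if failure injection fires or this OSD looks too full to accept
// backfill, otherwise ask remote_reserver for a slot. Either way we move to
// RepWaitBackfillReserved, which reports the grant or rejection back to the
// primary.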
6842 boost::statechart::result
6843 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6844 {
6845 PG *pg = context< RecoveryMachine >().pg;
6846 ostringstream ss;
6847
6848 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6849 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6850 ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6851 << dendl;
6852 post_event(RejectRemoteReservation());
6853 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6854 pg->osd->check_backfill_full(ss)) {
6855 ldout(pg->cct, 10) << "backfill reservation rejected: "
6856 << ss.str() << dendl;
6857 post_event(RejectRemoteReservation());
6858 } else {
6859 pg->osd->remote_reserver.request_reservation(
6860 pg->info.pgid,
6861 new QueuePeeringEvt<RemoteBackfillReserved>(
6862 pg, pg->get_osdmap()->get_epoch(),
6863 RemoteBackfillReserved()), evt.priority);
6864 }
6865 return transit<RepWaitBackfillReserved>();
6866 }
6867
6868 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6869 {
6870 context< RecoveryMachine >().log_exit(state_name, enter_time);
6871 PG *pg = context< RecoveryMachine >().pg;
6872 utime_t dur = ceph_clock_now() - enter_time;
6873 pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6874 }
6875
6876 boost::statechart::result
6877 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6878 {
6879 PG *pg = context< RecoveryMachine >().pg;
6880
6881 ostringstream ss;
6882 if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6883 (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6884 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6885 << "failure injection" << dendl;
6886 post_event(RejectRemoteReservation());
6887 return discard_event();
6888 } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6889 pg->osd->check_backfill_full(ss)) {
6890 ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6891 << ss.str() << dendl;
6892 post_event(RejectRemoteReservation());
6893 return discard_event();
6894 } else {
6895 pg->osd->send_message_osd_cluster(
6896 pg->primary.osd,
6897 new MBackfillReserve(
6898 MBackfillReserve::GRANT,
6899 spg_t(pg->info.pgid.pgid, pg->primary.shard),
6900 pg->get_osdmap()->get_epoch()),
6901 pg->get_osdmap()->get_epoch());
6902 return transit<RepRecovering>();
6903 }
6904 }
6905
6906 boost::statechart::result
6907 PG::RecoveryState::RepWaitBackfillReserved::react(
6908 const RejectRemoteReservation &evt)
6909 {
6910 PG *pg = context< RecoveryMachine >().pg;
6911 pg->reject_reservation();
6912 post_event(RemoteReservationRejected());
6913 return discard_event();
6914 }
6915
6916 boost::statechart::result
6917 PG::RecoveryState::RepWaitBackfillReserved::react(
6918 const RemoteReservationRejected &evt)
6919 {
6920 PG *pg = context< RecoveryMachine >().pg;
6921 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6922 return transit<RepNotRecovering>();
6923 }
6924
6925 boost::statechart::result
6926 PG::RecoveryState::RepWaitBackfillReserved::react(
6927 const RemoteReservationCanceled &evt)
6928 {
6929 PG *pg = context< RecoveryMachine >().pg;
6930 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6931 return transit<RepNotRecovering>();
6932 }
6933
6934 /*---RepRecovering-------*/
6935 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6936 : my_base(ctx),
6937 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6938 {
6939 context< RecoveryMachine >().log_enter(state_name);
6940 }
6941
6942 boost::statechart::result
6943 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6944 {
6945 PG *pg = context< RecoveryMachine >().pg;
6946 pg->reject_reservation();
6947 return discard_event();
6948 }
6949
6950 void PG::RecoveryState::RepRecovering::exit()
6951 {
6952 context< RecoveryMachine >().log_exit(state_name, enter_time);
6953 PG *pg = context< RecoveryMachine >().pg;
6954 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6955 utime_t dur = ceph_clock_now() - enter_time;
6956 pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6957 }
6958
6959 /*------Activating--------*/
6960 PG::RecoveryState::Activating::Activating(my_context ctx)
6961 : my_base(ctx),
6962 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6963 {
6964 context< RecoveryMachine >().log_enter(state_name);
6965 }
6966
6967 void PG::RecoveryState::Activating::exit()
6968 {
6969 context< RecoveryMachine >().log_exit(state_name, enter_time);
6970 PG *pg = context< RecoveryMachine >().pg;
6971 utime_t dur = ceph_clock_now() - enter_time;
6972 pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6973 }
6974
6975 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6976 : my_base(ctx),
6977 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6978 {
6979 context< RecoveryMachine >().log_enter(state_name);
6980 PG *pg = context< RecoveryMachine >().pg;
6981
6982 // Make sure all nodes that are part of the recovery aren't full
6983 if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6984 pg->osd->check_osdmap_full(pg->actingbackfill)) {
6985 post_event(RecoveryTooFull());
6986 return;
6987 }
6988
6989 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6990 pg->state_set(PG_STATE_RECOVERY_WAIT);
6991 pg->osd->local_reserver.request_reservation(
6992 pg->info.pgid,
6993 new QueuePeeringEvt<LocalRecoveryReserved>(
6994 pg, pg->get_osdmap()->get_epoch(),
6995 LocalRecoveryReserved()),
6996 pg->get_recovery_priority(),
6997 new QueuePeeringEvt<DeferRecovery>(
6998 pg, pg->get_osdmap()->get_epoch(),
6999 DeferRecovery(0.0)));
7000 pg->publish_stats_to_osd();
7001 }
7002
7003 boost::statechart::result
7004 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
7005 {
7006 PG *pg = context< RecoveryMachine >().pg;
7007 pg->state_set(PG_STATE_RECOVERY_TOOFULL);
7008 pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
7009 return transit<NotRecovering>();
7010 }
7011
7012 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
7013 {
7014 context< RecoveryMachine >().log_exit(state_name, enter_time);
7015 PG *pg = context< RecoveryMachine >().pg;
7016 utime_t dur = ceph_clock_now() - enter_time;
7017 pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
7018 }
7019
7020 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
7021 : my_base(ctx),
7022 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
7023 remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
7024 {
7025 context< RecoveryMachine >().log_enter(state_name);
7026 post_event(RemoteRecoveryReserved());
7027 }
7028
7029 boost::statechart::result
7030 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
7031 PG *pg = context< RecoveryMachine >().pg;
7032
7033 if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
7034 assert(*remote_recovery_reservation_it != pg->pg_whoami);
7035 ConnectionRef con = pg->osd->get_con_osd_cluster(
7036 remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
7037 if (con) {
7038 pg->osd->send_message_osd_cluster(
7039 new MRecoveryReserve(
7040 MRecoveryReserve::REQUEST,
7041 spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
7042 pg->get_osdmap()->get_epoch()),
7043 con.get());
7044 }
7045 ++remote_recovery_reservation_it;
7046 } else {
7047 post_event(AllRemotesReserved());
7048 }
7049 return discard_event();
7050 }
7051
7052 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
7053 {
7054 context< RecoveryMachine >().log_exit(state_name, enter_time);
7055 PG *pg = context< RecoveryMachine >().pg;
7056 utime_t dur = ceph_clock_now() - enter_time;
7057 pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
7058 }
7059
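// Recovering: local and remote reservations are held; mark the PG as
// recovering and queue the actual recovery work. We leave via
// AllReplicasRecovered, RequestBackfill, DeferRecovery or UnfoundRecovery,
// releasing reservations on the way out.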
7060 PG::RecoveryState::Recovering::Recovering(my_context ctx)
7061 : my_base(ctx),
7062 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
7063 {
7064 context< RecoveryMachine >().log_enter(state_name);
7065
7066 PG *pg = context< RecoveryMachine >().pg;
7067 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7068 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7069 pg->state_set(PG_STATE_RECOVERING);
7070 assert(!pg->state_test(PG_STATE_ACTIVATING));
7071 pg->publish_stats_to_osd();
7072 pg->queue_recovery();
7073 }
7074
7075 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
7076 {
7077 PG *pg = context< RecoveryMachine >().pg;
7078 assert(cancel || !pg->pg_log.get_missing().have_missing());
7079
7080 // release remote reservations
7081 for (set<pg_shard_t>::const_iterator i =
7082 context< Active >().remote_shards_to_reserve_recovery.begin();
7083 i != context< Active >().remote_shards_to_reserve_recovery.end();
7084 ++i) {
7085 if (*i == pg->pg_whoami) // skip myself
7086 continue;
7087 ConnectionRef con = pg->osd->get_con_osd_cluster(
7088 i->osd, pg->get_osdmap()->get_epoch());
7089 if (con) {
7090 pg->osd->send_message_osd_cluster(
7091 new MRecoveryReserve(
7092 MRecoveryReserve::RELEASE,
7093 spg_t(pg->info.pgid.pgid, i->shard),
7094 pg->get_osdmap()->get_epoch()),
7095 con.get());
7096 }
7097 }
7098 }
7099
7100 boost::statechart::result
7101 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
7102 {
7103 PG *pg = context< RecoveryMachine >().pg;
7104 pg->state_clear(PG_STATE_RECOVERING);
7105 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7106 release_reservations();
7107 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7108 return transit<Recovered>();
7109 }
7110
7111 boost::statechart::result
7112 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
7113 {
7114 PG *pg = context< RecoveryMachine >().pg;
7115 pg->state_clear(PG_STATE_RECOVERING);
7116 pg->state_clear(PG_STATE_FORCED_RECOVERY);
7117 release_reservations();
7118 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7119 // XXX: Is this needed?
7120 pg->publish_stats_to_osd();
7121 return transit<WaitLocalBackfillReserved>();
7122 }
7123
7124 boost::statechart::result
7125 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
7126 {
7127 PG *pg = context< RecoveryMachine >().pg;
7128 ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
7129 pg->state_clear(PG_STATE_RECOVERING);
7130 pg->state_set(PG_STATE_RECOVERY_WAIT);
7131 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7132 release_reservations(true);
7133 pg->schedule_recovery_retry(evt.delay);
7134 return transit<NotRecovering>();
7135 }
7136
7137 boost::statechart::result
7138 PG::RecoveryState::Recovering::react(const UnfoundRecovery &evt)
7139 {
7140 PG *pg = context< RecoveryMachine >().pg;
7141 ldout(pg->cct, 10) << "recovery has unfound, can't continue" << dendl;
7142 pg->state_set(PG_STATE_RECOVERY_UNFOUND);
7143 pg->state_clear(PG_STATE_RECOVERING);
7144 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7145 release_reservations(true);
7146 return transit<NotRecovering>();
7147 }
7148
7149 void PG::RecoveryState::Recovering::exit()
7150 {
7151 context< RecoveryMachine >().log_exit(state_name, enter_time);
7152 PG *pg = context< RecoveryMachine >().pg;
7153 utime_t dur = ceph_clock_now() - enter_time;
7154 pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
7155 }
7156
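// Recovered: nothing is missing any more. Trim the pg log, re-evaluate the
// acting set now that recovery/backfill is done, and post GoClean as soon as
// all replicas have activated.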
7157 PG::RecoveryState::Recovered::Recovered(my_context ctx)
7158 : my_base(ctx),
7159 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
7160 {
7161 pg_shard_t auth_log_shard;
7162
7163 context< RecoveryMachine >().log_enter(state_name);
7164
7165 PG *pg = context< RecoveryMachine >().pg;
7166
7167 assert(!pg->needs_recovery());
7168
7169 // if we finished backfill, all acting are active; recheck if
7170 // DEGRADED | UNDERSIZED is appropriate.
7171 assert(!pg->actingbackfill.empty());
7172 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
7173 pg->actingbackfill.size()) {
7174 pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7175 pg->publish_stats_to_osd();
7176 }
7177
7178 // trim pglog on recovered
7179 pg->trim_log();
7180
7181 // adjust acting set? (e.g. because backfill completed...)
7182 bool history_les_bound = false;
7183 if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
7184 true, &history_les_bound))
7185 assert(pg->want_acting.size());
7186
7187 if (context< Active >().all_replicas_activated)
7188 post_event(GoClean());
7189 }
7190
7191 void PG::RecoveryState::Recovered::exit()
7192 {
7193 context< RecoveryMachine >().log_exit(state_name, enter_time);
7194 PG *pg = context< RecoveryMachine >().pg;
7195 utime_t dur = ceph_clock_now() - enter_time;
7196 pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7197 }
7198
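// Clean: the terminal happy state. last_complete must equal last_update on
// entry; we finish recovery, mark the PG clean, share the updated info, and
// requeue any ops that were waiting for the PG to become clean.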
7199 PG::RecoveryState::Clean::Clean(my_context ctx)
7200 : my_base(ctx),
7201 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
7202 {
7203 context< RecoveryMachine >().log_enter(state_name);
7204
7205 PG *pg = context< RecoveryMachine >().pg;
7206
7207 if (pg->info.last_complete != pg->info.last_update) {
7208 ceph_abort();
7209 }
7210 pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
7211
7212 if (pg->is_active()) {
7213 pg->mark_clean();
7214 }
7215
7216 pg->share_pg_info();
7217 pg->publish_stats_to_osd();
7218 pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7219 }
7220
7221 void PG::RecoveryState::Clean::exit()
7222 {
7223 context< RecoveryMachine >().log_exit(state_name, enter_time);
7224 PG *pg = context< RecoveryMachine >().pg;
7225 pg->state_clear(PG_STATE_CLEAN);
7226 utime_t dur = ceph_clock_now() - enter_time;
7227 pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7228 }
7229
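// Reduce a collection of pg_shard_t to at most one entry per OSD, skipping
// 'skip' (normally ourselves). Used below to build the remote recovery and
// backfill reservation sets.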
7230 template <typename T>
7231 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7232 {
7233 set<int> osds_found;
7234 set<pg_shard_t> out;
7235 for (typename T::const_iterator i = in.begin();
7236 i != in.end();
7237 ++i) {
7238 if (*i != skip && !osds_found.count(i->osd)) {
7239 osds_found.insert(i->osd);
7240 out.insert(*i);
7241 }
7242 }
7243 return out;
7244 }
7245
7246 /*---------Active---------*/
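// Active: the primary has an authoritative log and calls activate() on the
// current transaction. Every other actingbackfill shard is recorded in
// blocked_by until it confirms (via MInfoRec) that it has activated and
// committed.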
7247 PG::RecoveryState::Active::Active(my_context ctx)
7248 : my_base(ctx),
7249 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7250 remote_shards_to_reserve_recovery(
7251 unique_osd_shard_set(
7252 context< RecoveryMachine >().pg->pg_whoami,
7253 context< RecoveryMachine >().pg->actingbackfill)),
7254 remote_shards_to_reserve_backfill(
7255 unique_osd_shard_set(
7256 context< RecoveryMachine >().pg->pg_whoami,
7257 context< RecoveryMachine >().pg->backfill_targets)),
7258 all_replicas_activated(false)
7259 {
7260 context< RecoveryMachine >().log_enter(state_name);
7261
7262 PG *pg = context< RecoveryMachine >().pg;
7263
7264 assert(!pg->backfill_reserving);
7265 assert(!pg->backfill_reserved);
7266 assert(pg->is_primary());
7267 ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7268 pg->start_flush(
7269 context< RecoveryMachine >().get_cur_transaction(),
7270 context< RecoveryMachine >().get_on_applied_context_list(),
7271 context< RecoveryMachine >().get_on_safe_context_list());
7272 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7273 pg->get_osdmap()->get_epoch(),
7274 *context< RecoveryMachine >().get_on_safe_context_list(),
7275 *context< RecoveryMachine >().get_query_map(),
7276 context< RecoveryMachine >().get_info_map(),
7277 context< RecoveryMachine >().get_recovery_ctx());
7278
7279 // everyone has to commit/ack before we are truly active
7280 pg->blocked_by.clear();
7281 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7282 p != pg->actingbackfill.end();
7283 ++p) {
7284 if (p->shard != pg->pg_whoami.shard) {
7285 pg->blocked_by.insert(p->shard);
7286 }
7287 }
7288 pg->publish_stats_to_osd();
7289 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7290 }
7291
7292 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7293 {
7294 PG *pg = context< RecoveryMachine >().pg;
7295 ldout(pg->cct, 10) << "Active advmap" << dendl;
7296 if (!pg->pool.newly_removed_snaps.empty()) {
7297 pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7298 ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7299 pg->dirty_info = true;
7300 pg->dirty_big_info = true;
7301 }
7302
7303 for (size_t i = 0; i < pg->want_acting.size(); i++) {
7304 int osd = pg->want_acting[i];
7305 if (!advmap.osdmap->is_up(osd)) {
7306 pg_shard_t osd_with_shard(osd, shard_id_t(i));
7307 assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7308 }
7309 }
7310
7311 bool need_publish = false;
7312 /* Check for changes in pool size (if the acting set changed as a result,
7313 * this does not matter) */
7314 if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7315 pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7316 if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7317 pg->state_clear(PG_STATE_UNDERSIZED);
7318 } else {
7319 pg->state_set(PG_STATE_UNDERSIZED);
7320 }
7321 // degraded changes will be detected by the call to publish_stats_to_osd()
7322 need_publish = true;
7323 }
7324
7325 // if we haven't reported our PG stats in a long time, do so now.
7326 if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7327 ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7328 << " epochs" << dendl;
7329 need_publish = true;
7330 }
7331
7332 if (need_publish)
7333 pg->publish_stats_to_osd();
7334
7335 return forward_event();
7336 }
7337
7338 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7339 {
7340 PG *pg = context< RecoveryMachine >().pg;
7341 ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7342 assert(pg->is_primary());
7343
7344 if (pg->have_unfound()) {
7345 // objects may have become unfound
7346 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7347 }
7348
7349 if (pg->cct->_conf->osd_check_for_log_corruption)
7350 pg->check_log_for_corruption(pg->osd->store);
7351
7352 uint64_t unfound = pg->missing_loc.num_unfound();
7353 if (unfound > 0 &&
7354 pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7355 if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7356 pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7357 << " objects unfound and apparently lost, would automatically "
7358 << "mark these objects lost but this feature is not yet implemented "
7359 << "(osd_auto_mark_unfound_lost)";
7360 } else
7361 pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7362 << unfound << " objects unfound and apparently lost";
7363 }
7364
7365 if (pg->is_active()) {
7366 ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7367 pg->kick_snap_trim();
7368 }
7369
7370 if (pg->is_peered() &&
7371 !pg->is_clean() &&
7372 !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7373 (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7374 pg->queue_recovery();
7375 }
7376 return forward_event();
7377 }
7378
7379 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7380 {
7381 PG *pg = context< RecoveryMachine >().pg;
7382 assert(pg->is_primary());
7383 if (pg->peer_info.count(notevt.from)) {
7384 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7385 << ", already have info from that osd, ignoring"
7386 << dendl;
7387 } else if (pg->peer_purged.count(notevt.from)) {
7388 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7389 << ", already purged that peer, ignoring"
7390 << dendl;
7391 } else {
7392 ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7393 << ", calling proc_replica_info and discover_all_missing"
7394 << dendl;
7395 pg->proc_replica_info(
7396 notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7397 if (pg->have_unfound()) {
7398 pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7399 }
7400 }
7401 return discard_event();
7402 }
7403
7404 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7405 {
7406 PG *pg = context< RecoveryMachine >().pg;
7407 assert(pg->is_primary());
7408
7409 assert(!pg->actingbackfill.empty());
7410 // don't update history (yet) if we are active and primary; the replica
7411 // may be telling us it has activated (and committed), but we can't
7412 // share that until _everyone_ does the same.
7413 if (pg->is_actingbackfill(infoevt.from)) {
7414 ldout(pg->cct, 10) << " peer osd." << infoevt.from
7415 << " activated and committed" << dendl;
7416 pg->peer_activated.insert(infoevt.from);
7417 pg->blocked_by.erase(infoevt.from.shard);
7418 pg->publish_stats_to_osd();
7419 if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7420 pg->all_activated_and_committed();
7421 }
7422 }
7423 return discard_event();
7424 }
7425
7426 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7427 {
7428 PG *pg = context< RecoveryMachine >().pg;
7429 ldout(pg->cct, 10) << "searching osd." << logevt.from
7430 << " log for unfound items" << dendl;
7431 pg->proc_replica_log(
7432 logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7433 bool got_missing = pg->search_for_missing(
7434 pg->peer_info[logevt.from],
7435 pg->peer_missing[logevt.from],
7436 logevt.from,
7437 context< RecoveryMachine >().get_recovery_ctx());
7438 // If there are missing objects AND we are "fully" active then start recovery now
7439 if (got_missing && pg->state_test(PG_STATE_ACTIVE)) {
7440 post_event(DoRecovery());
7441 }
7442 return discard_event();
7443 }
7444
7445 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7446 {
7447 PG *pg = context< RecoveryMachine >().pg;
7448
7449 q.f->open_object_section("state");
7450 q.f->dump_string("name", state_name);
7451 q.f->dump_stream("enter_time") << enter_time;
7452
7453 {
7454 q.f->open_array_section("might_have_unfound");
7455 for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7456 p != pg->might_have_unfound.end();
7457 ++p) {
7458 q.f->open_object_section("osd");
7459 q.f->dump_stream("osd") << *p;
7460 if (pg->peer_missing.count(*p)) {
7461 q.f->dump_string("status", "already probed");
7462 } else if (pg->peer_missing_requested.count(*p)) {
7463 q.f->dump_string("status", "querying");
7464 } else if (!pg->get_osdmap()->is_up(p->osd)) {
7465 q.f->dump_string("status", "osd is down");
7466 } else {
7467 q.f->dump_string("status", "not queried");
7468 }
7469 q.f->close_section();
7470 }
7471 q.f->close_section();
7472 }
7473 {
7474 q.f->open_object_section("recovery_progress");
7475 pg->dump_recovery_info(q.f);
7476 q.f->close_section();
7477 }
7478
7479 {
7480 q.f->open_object_section("scrub");
7481 q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7482 q.f->dump_bool("scrubber.active", pg->scrubber.active);
7483 q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7484 q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7485 q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7486 q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7487 q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7488 q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
7489 q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
7490 {
7491 q.f->open_array_section("scrubber.waiting_on_whom");
7492 for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7493 p != pg->scrubber.waiting_on_whom.end();
7494 ++p) {
7495 q.f->dump_stream("shard") << *p;
7496 }
7497 q.f->close_section();
7498 }
7499 q.f->close_section();
7500 }
7501
7502 q.f->close_section();
7503 return forward_event();
7504 }
7505
7506 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7507 {
7508 PG *pg = context< RecoveryMachine >().pg;
7509 all_replicas_activated = true;
7510
7511 pg->state_clear(PG_STATE_ACTIVATING);
7512 pg->state_clear(PG_STATE_CREATING);
7513 if (pg->acting.size() >= pg->pool.info.min_size) {
7514 pg->state_set(PG_STATE_ACTIVE);
7515 } else {
7516 pg->state_set(PG_STATE_PEERED);
7517 }
7518
7519 // info.last_epoch_started is set during activate()
7520 pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7521 pg->info.history.last_interval_started = pg->info.last_interval_started;
7522 pg->dirty_info = true;
7523
7524 pg->share_pg_info();
7525 pg->publish_stats_to_osd();
7526
7527 pg->check_local();
7528
7529 // waiters
7530 if (pg->flushes_in_progress == 0) {
7531 pg->requeue_ops(pg->waiting_for_peered);
7532 } else if (!pg->waiting_for_peered.empty()) {
7533 ldout(pg->cct, 10) << __func__ << " flushes in progress, moving "
7534 << pg->waiting_for_peered.size()
7535 << " items to waiting_for_flush"
7536 << dendl;
7537 assert(pg->waiting_for_flush.empty());
7538 pg->waiting_for_flush.swap(pg->waiting_for_peered);
7539 }
7540
7541 pg->on_activate();
7542
7543 return discard_event();
7544 }
7545
7546 void PG::RecoveryState::Active::exit()
7547 {
7548 context< RecoveryMachine >().log_exit(state_name, enter_time);
7549 PG *pg = context< RecoveryMachine >().pg;
7550 pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7551
7552 pg->blocked_by.clear();
7553 pg->backfill_reserved = false;
7554 pg->backfill_reserving = false;
7555 pg->state_clear(PG_STATE_ACTIVATING);
7556 pg->state_clear(PG_STATE_DEGRADED);
7557 pg->state_clear(PG_STATE_UNDERSIZED);
7558 pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7559 pg->state_clear(PG_STATE_BACKFILL_WAIT);
7560 pg->state_clear(PG_STATE_RECOVERY_WAIT);
7561 pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7562 utime_t dur = ceph_clock_now() - enter_time;
7563 pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7564 pg->agent_stop();
7565 }
7566
7567 /*------ReplicaActive-----*/
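// ReplicaActive: this OSD holds an active replica (or backfill target) of
// the PG; it activates on the primary's request and applies incoming info
// and log updates, but makes no peering decisions of its own.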
7568 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx)
7569 : my_base(ctx),
7570 NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7571 {
7572 context< RecoveryMachine >().log_enter(state_name);
7573
7574 PG *pg = context< RecoveryMachine >().pg;
7575 pg->start_flush(
7576 context< RecoveryMachine >().get_cur_transaction(),
7577 context< RecoveryMachine >().get_on_applied_context_list(),
7578 context< RecoveryMachine >().get_on_safe_context_list());
7579 }
7580
7581
7582 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7583 const Activate& actevt) {
7584 PG *pg = context< RecoveryMachine >().pg;
7585 ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7586 map<int, map<spg_t, pg_query_t> > query_map;
7587 pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7588 actevt.activation_epoch,
7589 *context< RecoveryMachine >().get_on_safe_context_list(),
7590 query_map, NULL, NULL);
7591 ldout(pg->cct, 10) << "Activate Finished" << dendl;
7592 return discard_event();
7593 }
7594
7595 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7596 {
7597 PG *pg = context< RecoveryMachine >().pg;
7598 pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7599 infoevt.info);
7600 return discard_event();
7601 }
7602
7603 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7604 {
7605 PG *pg = context< RecoveryMachine >().pg;
7606 ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7607 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7608 pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7609 assert(pg->pg_log.get_head() == pg->info.last_update);
7610
7611 return discard_event();
7612 }
7613
7614 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7615 {
7616 PG *pg = context< RecoveryMachine >().pg;
7617 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7618 context< RecoveryMachine >().send_notify(
7619 pg->get_primary(),
7620 pg_notify_t(
7621 pg->get_primary().shard, pg->pg_whoami.shard,
7622 pg->get_osdmap()->get_epoch(),
7623 pg->get_osdmap()->get_epoch(),
7624 pg->info),
7625 pg->past_intervals);
7626 }
7627 pg->take_waiters();
7628 return discard_event();
7629 }
7630
7631 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7632 {
7633 PG *pg = context< RecoveryMachine >().pg;
7634 if (query.query.type == pg_query_t::MISSING) {
7635 pg->update_history(query.query.history);
7636 pg->fulfill_log(query.from, query.query, query.query_epoch);
7637 } // else: from prior to activation, safe to ignore
7638 return discard_event();
7639 }
7640
7641 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7642 {
7643 q.f->open_object_section("state");
7644 q.f->dump_string("name", state_name);
7645 q.f->dump_stream("enter_time") << enter_time;
7646 q.f->close_section();
7647 return forward_event();
7648 }
7649
7650 void PG::RecoveryState::ReplicaActive::exit()
7651 {
7652 context< RecoveryMachine >().log_exit(state_name, enter_time);
7653 PG *pg = context< RecoveryMachine >().pg;
7654 pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7655 utime_t dur = ceph_clock_now() - enter_time;
7656 pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7657 }
7658
7659 /*-------Stray---*/
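// Stray: we hold data for this PG but are not (yet) part of its active set.
// The primary will either send an info (MInfoRec) or info+log (MLogRec);
// either lets us post Activate and transition to ReplicaActive.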
7660 PG::RecoveryState::Stray::Stray(my_context ctx)
7661 : my_base(ctx),
7662 NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7663 {
7664 context< RecoveryMachine >().log_enter(state_name);
7665
7666 PG *pg = context< RecoveryMachine >().pg;
7667 assert(!pg->is_peered());
7668 assert(!pg->is_peering());
7669 assert(!pg->is_primary());
7670 pg->start_flush(
7671 context< RecoveryMachine >().get_cur_transaction(),
7672 context< RecoveryMachine >().get_on_applied_context_list(),
7673 context< RecoveryMachine >().get_on_safe_context_list());
7674 }
7675
7676 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7677 {
7678 PG *pg = context< RecoveryMachine >().pg;
7679 MOSDPGLog *msg = logevt.msg.get();
7680 ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7681
7682 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7683 if (msg->info.last_backfill == hobject_t()) {
7684 // restart backfill
7685 pg->unreg_next_scrub();
7686 pg->info = msg->info;
7687 pg->reg_next_scrub();
7688 pg->dirty_info = true;
7689 pg->dirty_big_info = true; // maybe.
7690
7691 PGLogEntryHandler rollbacker{pg, t};
7692 pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7693
7694 pg->pg_log.reset_backfill();
7695 } else {
7696 pg->merge_log(*t, msg->info, msg->log, logevt.from);
7697 }
7698
7699 assert(pg->pg_log.get_head() == pg->info.last_update);
7700
7701 post_event(Activate(logevt.msg->info.last_epoch_started));
7702 return transit<ReplicaActive>();
7703 }
7704
7705 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7706 {
7707 PG *pg = context< RecoveryMachine >().pg;
7708 ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7709
7710 if (pg->info.last_update > infoevt.info.last_update) {
7711 // rewind divergent log entries
7712 ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7713 pg->rewind_divergent_log(*t, infoevt.info.last_update);
7714 pg->info.stats = infoevt.info.stats;
7715 pg->info.hit_set = infoevt.info.hit_set;
7716 }
7717
7718 assert(infoevt.info.last_update == pg->info.last_update);
7719 assert(pg->pg_log.get_head() == pg->info.last_update);
7720
7721 post_event(Activate(infoevt.info.last_epoch_started));
7722 return transit<ReplicaActive>();
7723 }
7724
7725 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7726 {
7727 PG *pg = context< RecoveryMachine >().pg;
7728 if (query.query.type == pg_query_t::INFO) {
7729 pair<pg_shard_t, pg_info_t> notify_info;
7730 pg->update_history(query.query.history);
7731 pg->fulfill_info(query.from, query.query, notify_info);
7732 context< RecoveryMachine >().send_notify(
7733 notify_info.first,
7734 pg_notify_t(
7735 notify_info.first.shard, pg->pg_whoami.shard,
7736 query.query_epoch,
7737 pg->get_osdmap()->get_epoch(),
7738 notify_info.second),
7739 pg->past_intervals);
7740 } else {
7741 pg->fulfill_log(query.from, query.query, query.query_epoch);
7742 }
7743 return discard_event();
7744 }
7745
7746 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7747 {
7748 PG *pg = context< RecoveryMachine >().pg;
7749 if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7750 context< RecoveryMachine >().send_notify(
7751 pg->get_primary(),
7752 pg_notify_t(
7753 pg->get_primary().shard, pg->pg_whoami.shard,
7754 pg->get_osdmap()->get_epoch(),
7755 pg->get_osdmap()->get_epoch(),
7756 pg->info),
7757 pg->past_intervals);
7758 }
7759 pg->take_waiters();
7760 return discard_event();
7761 }
7762
7763 void PG::RecoveryState::Stray::exit()
7764 {
7765 context< RecoveryMachine >().log_exit(state_name, enter_time);
7766 PG *pg = context< RecoveryMachine >().pg;
7767 utime_t dur = ceph_clock_now() - enter_time;
7768 pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7769 }
7770
7771 /*--------GetInfo---------*/
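// GetInfo: first peering stage on the primary. Build the prior set, query
// pg_info_t from every OSD we might need to hear from, and post GotInfo once
// all requested infos have arrived (or IsDown if the prior set says pg_down).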
7772 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7773 : my_base(ctx),
7774 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7775 {
7776 context< RecoveryMachine >().log_enter(state_name);
7777
7778 PG *pg = context< RecoveryMachine >().pg;
7779 pg->check_past_interval_bounds();
7780 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7781
7782 assert(pg->blocked_by.empty());
7783
7784 prior_set = pg->build_prior();
7785
7786 pg->reset_min_peer_features();
7787 get_infos();
7788 if (prior_set.pg_down) {
7789 post_event(IsDown());
7790 } else if (peer_info_requested.empty()) {
7791 post_event(GotInfo());
7792 }
7793 }
7794
7795 void PG::RecoveryState::GetInfo::get_infos()
7796 {
7797 PG *pg = context< RecoveryMachine >().pg;
7798 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7799
7800 pg->blocked_by.clear();
7801 for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7802 it != prior_set.probe.end();
7803 ++it) {
7804 pg_shard_t peer = *it;
7805 if (peer == pg->pg_whoami) {
7806 continue;
7807 }
7808 if (pg->peer_info.count(peer)) {
7809 ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7810 continue;
7811 }
7812 if (peer_info_requested.count(peer)) {
7813 ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7814 pg->blocked_by.insert(peer.osd);
7815 } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7816 ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7817 } else {
7818 ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7819 context< RecoveryMachine >().send_query(
7820 peer, pg_query_t(pg_query_t::INFO,
7821 it->shard, pg->pg_whoami.shard,
7822 pg->info.history,
7823 pg->get_osdmap()->get_epoch()));
7824 peer_info_requested.insert(peer);
7825 pg->blocked_by.insert(peer.osd);
7826 }
7827 }
7828
7829 pg->publish_stats_to_osd();
7830 }
7831
7832 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt)
7833 {
7834 PG *pg = context< RecoveryMachine >().pg;
7835
7836 set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
7837 if (p != peer_info_requested.end()) {
7838 peer_info_requested.erase(p);
7839 pg->blocked_by.erase(infoevt.from.osd);
7840 }
7841
7842 epoch_t old_start = pg->info.history.last_epoch_started;
7843 if (pg->proc_replica_info(
7844 infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
7845 // we got something new ...
7846 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7847 if (old_start < pg->info.history.last_epoch_started) {
7848 ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
7849 prior_set = pg->build_prior();
7850
7851 // filter out any osds that got dropped from the probe set from
7852 // peer_info_requested. this is less expensive than restarting
7853 // peering (which would re-probe everyone).
7854 set<pg_shard_t>::iterator p = peer_info_requested.begin();
7855 while (p != peer_info_requested.end()) {
7856 if (prior_set.probe.count(*p) == 0) {
7857 ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
7858 peer_info_requested.erase(p++);
7859 } else {
7860 ++p;
7861 }
7862 }
7863 get_infos();
7864 }
7865 ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
7866 << hex << infoevt.features << dec << dendl;
7867 pg->apply_peer_features(infoevt.features);
7868
7869 // are we done getting everything?
7870 if (peer_info_requested.empty() && !prior_set.pg_down) {
7871 ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
7872 ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
7873 ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
7874 post_event(GotInfo());
7875 }
7876 }
7877 return discard_event();
7878 }
7879
7880 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
7881 {
7882 PG *pg = context< RecoveryMachine >().pg;
7883 q.f->open_object_section("state");
7884 q.f->dump_string("name", state_name);
7885 q.f->dump_stream("enter_time") << enter_time;
7886
7887 q.f->open_array_section("requested_info_from");
7888 for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
7889 p != peer_info_requested.end();
7890 ++p) {
7891 q.f->open_object_section("osd");
7892 q.f->dump_stream("osd") << *p;
7893 if (pg->peer_info.count(*p)) {
7894 q.f->open_object_section("got_info");
7895 pg->peer_info[*p].dump(q.f);
7896 q.f->close_section();
7897 }
7898 q.f->close_section();
7899 }
7900 q.f->close_section();
7901
7902 q.f->close_section();
7903 return forward_event();
7904 }
7905
7906 void PG::RecoveryState::GetInfo::exit()
7907 {
7908 context< RecoveryMachine >().log_exit(state_name, enter_time);
7909 PG *pg = context< RecoveryMachine >().pg;
7910 utime_t dur = ceph_clock_now() - enter_time;
7911 pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
7912 pg->blocked_by.clear();
7913 pg->publish_stats_to_osd();
7914 }
7915
7916 /*------GetLog------------*/
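// GetLog: choose_acting() has picked the shard with the authoritative log.
// If that is not us, request its log back to the oldest last_update among
// the peers we must bring up to date; proc_master_log() merges it once
// GotLog fires and we move on to GetMissing.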
7917 PG::RecoveryState::GetLog::GetLog(my_context ctx)
7918 : my_base(ctx),
7919 NamedState(
7920 context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
7921 msg(0)
7922 {
7923 context< RecoveryMachine >().log_enter(state_name);
7924
7925 PG *pg = context< RecoveryMachine >().pg;
7926
7927 // adjust acting?
7928 if (!pg->choose_acting(auth_log_shard, false,
7929 &context< Peering >().history_les_bound)) {
7930 if (!pg->want_acting.empty()) {
7931 post_event(NeedActingChange());
7932 } else {
7933 post_event(IsIncomplete());
7934 }
7935 return;
7936 }
7937
7938 // am i the best?
7939 if (auth_log_shard == pg->pg_whoami) {
7940 post_event(GotLog());
7941 return;
7942 }
7943
7944 const pg_info_t& best = pg->peer_info[auth_log_shard];
7945
7946 // am i broken?
7947 if (pg->info.last_update < best.log_tail) {
7948 ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
7949 post_event(IsIncomplete());
7950 return;
7951 }
7952
7953 // how much log to request?
7954 eversion_t request_log_from = pg->info.last_update;
7955 assert(!pg->actingbackfill.empty());
7956 for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7957 p != pg->actingbackfill.end();
7958 ++p) {
7959 if (*p == pg->pg_whoami) continue;
7960 pg_info_t& ri = pg->peer_info[*p];
7961 if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
7962 ri.last_update < request_log_from)
7963 request_log_from = ri.last_update;
7964 }
7965
7966 // request the log from the auth shard
7967 ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
7968 context<RecoveryMachine>().send_query(
7969 auth_log_shard,
7970 pg_query_t(
7971 pg_query_t::LOG,
7972 auth_log_shard.shard, pg->pg_whoami.shard,
7973 request_log_from, pg->info.history,
7974 pg->get_osdmap()->get_epoch()));
7975
7976 assert(pg->blocked_by.empty());
7977 pg->blocked_by.insert(auth_log_shard.osd);
7978 pg->publish_stats_to_osd();
7979 }
7980
7981 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
7982 {
7983 PG *pg = context< RecoveryMachine >().pg;
7984 // make sure our log source didn't go down. we need to check
7985 // explicitly because it may not be part of the prior set, which
7986 // means the Peering state check won't catch it going down.
7987 if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
7988 ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
7989 << auth_log_shard.osd << " went down" << dendl;
7990 post_event(advmap);
7991 return transit< Reset >();
7992 }
7993
7994 // let the Peering state do its checks.
7995 return forward_event();
7996 }
7997
7998 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
7999 {
8000 PG *pg = context< RecoveryMachine >().pg;
8001 assert(!msg);
8002 if (logevt.from != auth_log_shard) {
8003 ldout(pg->cct, 10) << "GetLog: discarding log from "
8004 << "non-auth_log_shard osd." << logevt.from << dendl;
8005 return discard_event();
8006 }
8007 ldout(pg->cct, 10) << "GetLog: received master log from osd."
8008 << logevt.from << dendl;
8009 msg = logevt.msg;
8010 post_event(GotLog());
8011 return discard_event();
8012 }
8013
8014 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
8015 {
8016 PG *pg = context< RecoveryMachine >().pg;
8017 ldout(pg->cct, 10) << "leaving GetLog" << dendl;
8018 if (msg) {
8019 ldout(pg->cct, 10) << "processing master log" << dendl;
8020 pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
8021 msg->info, msg->log, msg->missing,
8022 auth_log_shard);
8023 }
8024 pg->start_flush(
8025 context< RecoveryMachine >().get_cur_transaction(),
8026 context< RecoveryMachine >().get_on_applied_context_list(),
8027 context< RecoveryMachine >().get_on_safe_context_list());
8028 return transit< GetMissing >();
8029 }
8030
8031 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
8032 {
8033 q.f->open_object_section("state");
8034 q.f->dump_string("name", state_name);
8035 q.f->dump_stream("enter_time") << enter_time;
8036 q.f->dump_stream("auth_log_shard") << auth_log_shard;
8037 q.f->close_section();
8038 return forward_event();
8039 }
8040
8041 void PG::RecoveryState::GetLog::exit()
8042 {
8043 context< RecoveryMachine >().log_exit(state_name, enter_time);
8044 PG *pg = context< RecoveryMachine >().pg;
8045 utime_t dur = ceph_clock_now() - enter_time;
8046 pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
8047 pg->blocked_by.clear();
8048 pg->publish_stats_to_osd();
8049 }
8050
8051 /*------WaitActingChange--------*/
8052 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
8053 : my_base(ctx),
8054 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
8055 {
8056 context< RecoveryMachine >().log_enter(state_name);
8057 }
8058
8059 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
8060 {
8061 PG *pg = context< RecoveryMachine >().pg;
8062 OSDMapRef osdmap = advmap.osdmap;
8063
8064 ldout(pg->cct, 10) << "verifying want_acting " << pg->want_acting << " targets are still up" << dendl;
8065 for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
8066 if (!osdmap->is_up(*p)) {
8067 ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
8068 post_event(advmap);
8069 return transit< Reset >();
8070 }
8071 }
8072 return forward_event();
8073 }
8074
8075 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
8076 {
8077 PG *pg = context< RecoveryMachine >().pg;
8078 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
8079 return discard_event();
8080 }
8081
8082 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
8083 {
8084 PG *pg = context< RecoveryMachine >().pg;
8085 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
8086 return discard_event();
8087 }
8088
8089 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
8090 {
8091 PG *pg = context< RecoveryMachine >().pg;
8092 ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
8093 return discard_event();
8094 }
8095
8096 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
8097 {
8098 q.f->open_object_section("state");
8099 q.f->dump_string("name", state_name);
8100 q.f->dump_stream("enter_time") << enter_time;
8101 q.f->dump_string("comment", "waiting for pg acting set to change");
8102 q.f->close_section();
8103 return forward_event();
8104 }
8105
8106 void PG::RecoveryState::WaitActingChange::exit()
8107 {
8108 context< RecoveryMachine >().log_exit(state_name, enter_time);
8109 PG *pg = context< RecoveryMachine >().pg;
8110 utime_t dur = ceph_clock_now() - enter_time;
8111 pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
8112 }
8113
8114 /*------Down--------*/
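// Down: peering cannot make progress because too few OSDs from the prior
// intervals are up. Record them in blocked_by so the stats/pg-query output
// shows who we are waiting for.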
8115 PG::RecoveryState::Down::Down(my_context ctx)
8116 : my_base(ctx),
8117 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
8118 {
8119 context< RecoveryMachine >().log_enter(state_name);
8120 PG *pg = context< RecoveryMachine >().pg;
8121
8122 pg->state_clear(PG_STATE_PEERING);
8123 pg->state_set(PG_STATE_DOWN);
8124
8125 auto &prior_set = context< Peering >().prior_set;
8126 assert(pg->blocked_by.empty());
8127 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8128 pg->publish_stats_to_osd();
8129 }
8130
8131 void PG::RecoveryState::Down::exit()
8132 {
8133 context< RecoveryMachine >().log_exit(state_name, enter_time);
8134 PG *pg = context< RecoveryMachine >().pg;
8135
8136 pg->state_clear(PG_STATE_DOWN);
8137 utime_t dur = ceph_clock_now() - enter_time;
8138 pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
8139
8140 pg->blocked_by.clear();
8141 pg->publish_stats_to_osd();
8142 }
8143
8144 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
8145 {
8146 q.f->open_object_section("state");
8147 q.f->dump_string("name", state_name);
8148 q.f->dump_stream("enter_time") << enter_time;
8149 q.f->dump_string("comment",
8150 "not enough up instances of this PG to go active");
8151 q.f->close_section();
8152 return forward_event();
8153 }
8154
8155 /*------Incomplete--------*/
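// Incomplete: the OSDs that are up cannot provide a complete, contiguous
// history for this PG. New info from a peer (MNotifyRec) sends us back to
// GetLog; a lowered pool min_size triggers a Reset.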
8156 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
8157 : my_base(ctx),
8158 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
8159 {
8160 context< RecoveryMachine >().log_enter(state_name);
8161 PG *pg = context< RecoveryMachine >().pg;
8162
8163 pg->state_clear(PG_STATE_PEERING);
8164 pg->state_set(PG_STATE_INCOMPLETE);
8165
8166 PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8167 assert(pg->blocked_by.empty());
8168 pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8169 pg->publish_stats_to_osd();
8170 }
8171
8172 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
8173 PG *pg = context< RecoveryMachine >().pg;
8174 int64_t poolnum = pg->info.pgid.pool();
8175
8176 // Reset if min_size has decreased from the previous value; the pg might now be able to go active
8177 if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
8178 advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
8179 post_event(advmap);
8180 return transit< Reset >();
8181 }
8182
8183 return forward_event();
8184 }
8185
8186 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
8187 PG *pg = context< RecoveryMachine >().pg;
8188 ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
8189 if (pg->proc_replica_info(
8190 notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
8191 // We got something new, try again!
8192 return transit< GetLog >();
8193 } else {
8194 return discard_event();
8195 }
8196 }
8197
8198 boost::statechart::result PG::RecoveryState::Incomplete::react(
8199 const QueryState& q)
8200 {
8201 q.f->open_object_section("state");
8202 q.f->dump_string("name", state_name);
8203 q.f->dump_stream("enter_time") << enter_time;
8204 q.f->dump_string("comment", "not enough complete instances of this PG");
8205 q.f->close_section();
8206 return forward_event();
8207 }
8208
8209 void PG::RecoveryState::Incomplete::exit()
8210 {
8211 context< RecoveryMachine >().log_exit(state_name, enter_time);
8212 PG *pg = context< RecoveryMachine >().pg;
8213
8214 pg->state_clear(PG_STATE_INCOMPLETE);
8215 utime_t dur = ceph_clock_now() - enter_time;
8216 pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
8217
8218 pg->blocked_by.clear();
8219 pg->publish_stats_to_osd();
8220 }
8221
8222 /*------GetMissing--------*/
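// GetMissing: for every actingbackfill peer whose log may have diverged,
// request log+missing (from its last_epoch_started onward, or the full log
// if its log does not reach back that far). When all replies are in we
// either wait for up_thru or post Activate.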
8223 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
8224 : my_base(ctx),
8225 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
8226 {
8227 context< RecoveryMachine >().log_enter(state_name);
8228
8229 PG *pg = context< RecoveryMachine >().pg;
8230 assert(!pg->actingbackfill.empty());
8231 eversion_t since;
8232 for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8233 i != pg->actingbackfill.end();
8234 ++i) {
8235 if (*i == pg->get_primary()) continue;
8236 const pg_info_t& pi = pg->peer_info[*i];
8237 // reset this to make sure the pg_missing_t is initialized and
8238 // has the correct semantics even if we don't need to get a
8239 // missing set from a shard. This way later additions due to
8240 // lost+unfound delete work properly.
8241 pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
8242
8243 if (pi.is_empty())
8244 continue; // no pg data, nothing divergent
8245
8246 if (pi.last_update < pg->pg_log.get_tail()) {
8247 ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8248 pg->peer_missing[*i].clear();
8249 continue;
8250 }
8251 if (pi.last_backfill == hobject_t()) {
8252 ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8253 pg->peer_missing[*i].clear();
8254 continue;
8255 }
8256
8257 if (pi.last_update == pi.last_complete && // peer has no missing
8258 pi.last_update == pg->info.last_update) { // peer is up to date
8259 // replica has no missing and an identical log to ours. no need to
8260 // pull anything.
8261 // FIXME: we can do better here. if last_update==last_complete we
8262 // can infer the rest!
8263 ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8264 pg->peer_missing[*i].clear();
8265 continue;
8266 }
8267
8268 // We pull the log from the peer's last_epoch_started to ensure we
8269 // get enough log to detect divergent updates.
8270 since.epoch = pi.last_epoch_started;
8271 assert(pi.last_update >= pg->info.log_tail); // or else choose_acting() did a bad thing
8272 if (pi.log_tail <= since) {
8273 ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8274 context< RecoveryMachine >().send_query(
8275 *i,
8276 pg_query_t(
8277 pg_query_t::LOG,
8278 i->shard, pg->pg_whoami.shard,
8279 since, pg->info.history,
8280 pg->get_osdmap()->get_epoch()));
8281 } else {
8282 ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8283 << " (want since " << since << " < log.tail "
8284 << pi.log_tail << ")" << dendl;
8285 context< RecoveryMachine >().send_query(
8286 *i, pg_query_t(
8287 pg_query_t::FULLLOG,
8288 i->shard, pg->pg_whoami.shard,
8289 pg->info.history, pg->get_osdmap()->get_epoch()));
8290 }
8291 peer_missing_requested.insert(*i);
8292 pg->blocked_by.insert(i->osd);
8293 }
8294
8295 if (peer_missing_requested.empty()) {
8296 if (pg->need_up_thru) {
8297 ldout(pg->cct, 10) << " still need up_thru update before going active"
8298 << dendl;
8299 post_event(NeedUpThru());
8300 return;
8301 }
8302
8303 // all good!
8304 post_event(Activate(pg->get_osdmap()->get_epoch()));
8305 } else {
8306 pg->publish_stats_to_osd();
8307 }
8308 }
8309
8310 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8311 {
8312 PG *pg = context< RecoveryMachine >().pg;
8313
8314 peer_missing_requested.erase(logevt.from);
8315 pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8316
8317 if (peer_missing_requested.empty()) {
8318 if (pg->need_up_thru) {
8319 ldout(pg->cct, 10) << " still need up_thru update before going active"
8320 << dendl;
8321 post_event(NeedUpThru());
8322 } else {
8323 ldout(pg->cct, 10) << "Got last missing, don't need more; "
8324 << "posting Activate" << dendl;
8325 post_event(Activate(pg->get_osdmap()->get_epoch()));
8326 }
8327 }
8328 return discard_event();
8329 }
8330
8331 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8332 {
8333 PG *pg = context< RecoveryMachine >().pg;
8334 q.f->open_object_section("state");
8335 q.f->dump_string("name", state_name);
8336 q.f->dump_stream("enter_time") << enter_time;
8337
8338 q.f->open_array_section("peer_missing_requested");
8339 for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8340 p != peer_missing_requested.end();
8341 ++p) {
8342 q.f->open_object_section("osd");
8343 q.f->dump_stream("osd") << *p;
8344 if (pg->peer_missing.count(*p)) {
8345 q.f->open_object_section("got_missing");
8346 pg->peer_missing[*p].dump(q.f);
8347 q.f->close_section();
8348 }
8349 q.f->close_section();
8350 }
8351 q.f->close_section();
8352
8353 q.f->close_section();
8354 return forward_event();
8355 }
8356
8357 void PG::RecoveryState::GetMissing::exit()
8358 {
8359 context< RecoveryMachine >().log_exit(state_name, enter_time);
8360 PG *pg = context< RecoveryMachine >().pg;
8361 utime_t dur = ceph_clock_now() - enter_time;
8362 pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8363 pg->blocked_by.clear();
8364 pg->publish_stats_to_osd();
8365 }
8366
8367 /*------WaitUpThru--------*/
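// WaitUpThru: peering is otherwise complete, but we cannot activate until an
// osdmap reflecting our new up_thru arrives (see the QueryState comment
// below); ActMap re-checks need_up_thru and posts Activate when it clears.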
8368 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8369 : my_base(ctx),
8370 NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8371 {
8372 context< RecoveryMachine >().log_enter(state_name);
8373 }
8374
8375 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8376 {
8377 PG *pg = context< RecoveryMachine >().pg;
8378 if (!pg->need_up_thru) {
8379 post_event(Activate(pg->get_osdmap()->get_epoch()));
8380 }
8381 return forward_event();
8382 }
8383
8384 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8385 {
8386 PG *pg = context< RecoveryMachine >().pg;
8387 ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8388 pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8389 pg->peer_info[logevt.from] = logevt.msg->info;
8390 return discard_event();
8391 }
8392
8393 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8394 {
8395 q.f->open_object_section("state");
8396 q.f->dump_string("name", state_name);
8397 q.f->dump_stream("enter_time") << enter_time;
8398 q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8399 q.f->close_section();
8400 return forward_event();
8401 }
8402
8403 void PG::RecoveryState::WaitUpThru::exit()
8404 {
8405 context< RecoveryMachine >().log_exit(state_name, enter_time);
8406 PG *pg = context< RecoveryMachine >().pg;
8407 utime_t dur = ceph_clock_now() - enter_time;
8408 pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8409 }
8410
8411 /*----RecoveryState::RecoveryMachine Methods-----*/
8412 #undef dout_prefix
8413 #define dout_prefix *_dout << pg->gen_prefix()
8414
8415 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8416 {
8417 PG *pg = context< RecoveryMachine >().pg;
8418 ldout(pg->cct, 5) << "enter " << state_name << dendl;
8419 pg->osd->pg_recovery_stats.log_enter(state_name);
8420 }
8421
8422 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8423 {
8424 utime_t dur = ceph_clock_now() - enter_time;
8425 PG *pg = context< RecoveryMachine >().pg;
8426 ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8427 pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8428 event_count, event_time);
8429 event_count = 0;
8430 event_time = utime_t();
8431 }
8432
8433
8434 /*---------------------------------------------------*/
8435 #undef dout_prefix
8436 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8437
8438 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8439 assert(!rctx);
8440 assert(!orig_ctx);
8441 orig_ctx = new_ctx;
8442 if (new_ctx) {
8443 if (messages_pending_flush) {
8444 rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8445 } else {
8446 rctx = *new_ctx;
8447 }
8448 rctx->start_time = ceph_clock_now();
8449 }
8450 }
8451
8452 void PG::RecoveryState::begin_block_outgoing() {
8453 assert(!messages_pending_flush);
8454 assert(orig_ctx);
8455 assert(rctx);
8456 messages_pending_flush = BufferedRecoveryMessages();
8457 rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8458 }
8459
8460 void PG::RecoveryState::clear_blocked_outgoing() {
8461 assert(orig_ctx);
8462 assert(rctx);
8463 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8464 }
8465
8466 void PG::RecoveryState::end_block_outgoing() {
8467 assert(messages_pending_flush);
8468 assert(orig_ctx);
8469 assert(rctx);
8470
8471 rctx = RecoveryCtx(*orig_ctx);
8472 rctx->accept_buffered_messages(*messages_pending_flush);
8473 messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8474 }
8475
8476 void PG::RecoveryState::end_handle() {
8477 if (rctx) {
8478 utime_t dur = ceph_clock_now() - rctx->start_time;
8479 machine.event_time += dur;
8480 }
8481
8482 machine.event_count++;
8483 rctx = boost::optional<RecoveryCtx>();
8484 orig_ctx = NULL;
8485 }
8486
8487 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8488 {
8489 out << "BackfillInfo(" << bi.begin << "-" << bi.end
8490 << " " << bi.objects.size() << " objects";
8491 if (!bi.objects.empty())
8492 out << " " << bi.objects;
8493 out << ")";
8494 return out;
8495 }
8496
8497 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8498 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8499
8500 #ifdef PG_DEBUG_REFS
8501 uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8502 void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8503 #endif